# Tweet Summarization:

This is a basic implementation of the Phrase Reinforcement Algorithm proposed by Sharifi et al. for Twitter

In [3]:
import pandas as pd
import numpy as np
import operator
import nltk
import string
from collections import Counter
import math
import re

#### import data

In [4]:
event_name = input('Enter Event Name: ')
event_filename = re.sub("\W+", "", event_name.strip())

Enter Event Name: Monaco Grand Prix


In [5]:
df = pd.read_csv('data/final/event_%s_data.txt' % event_filename, sep='\t', encoding='utf-8', header=0, parse_dates=['created_at'], dtype={'twitter_id' : 'str'})
df.set_index('twitter_id', inplace=True)
df.shape

(64483, 35)

In [63]:
dfsubevents = pd.read_csv('data/final/event_%s_subevents.txt' % event_filename, sep='\t', encoding='utf-8', header=0, index_col=0)
dfsubevents.shape

(4315, 5)

### Get tweets from top k

In [72]:
k = 10

In [118]:
dftemp = df.reset_index()[['twitter_id', 'created_at']]
dftemp['created_at'] = dftemp['created_at'].apply(lambda x: x.replace(second=0))
dftemp.set_index(['created_at'], inplace=True)

data = pd.merge(dftemp, dfsubevents, how='inner', left_index=True, right_index=True)
data.set_index('twitter_id', inplace=True)

data = pd.merge(df, data, how='left', left_index=True, right_index=True)

data = data[data['rank'] <= k][['created_at', 'text', 'rank']]

In [119]:
data = data.reset_index().set_index('created_at')

### Clean Text

In [125]:
#make sure to handle text as string
data['text'] = data['text'].astype('str')
#remove links
data['text_clean'] = data['text'].apply(lambda text: ' '.join([(w[:w.find('http')] if 'http' in w else w) for w in text.split()]))
#remove hashtag symbol
data['text_clean'] = data['text_clean'].apply(lambda text: text.replace('#', ''))
#remove mention symbol
data['text_clean'] = data['text_clean'].apply(lambda text: text.replace('@', ''))
#initiate, apply tokenization
tknzr = nltk.TweetTokenizer()
punct = list(string.punctuation)
punct.extend(['...', '..', '…', '”', '“', 'the', '.@', 'RT'])
data['text_clean'] = data['text_clean'].apply(lambda s: ' '.join([w.lower() for w in tknzr.tokenize(str(s)) if w not in punct]))

### Iterate Subevents

In [130]:
for i in range(k)[:1]:
    subevent = data[data['rank'] == i+1]

In [139]:
subevent.shape

(855, 4)

#### get token frequencies
- save only those > 1

In [143]:
alltokens = [token for tweet in subevent['text_clean'] for token in tweet.split()]
counts = Counter(alltokens)
print(counts.most_common(10))

[('monacogp', 856), ('red', 259), ('bull', 254), ('f1', 251), ('what', 165), ('a', 143), ('that', 132), ('for', 127), ('redbullracing', 122), ('you', 115)]


In [144]:
counts = dict((token,count) for token,count in counts.items() if count > 1)

#### define base term
- get most frequent term, not occurring in 100% of docs
- this excludes the query term, being in all tweets

In [146]:
d = dict((token,count) for token,count in counts.items() if count < subevent.shape[0])
base = sorted(d.items(), key=operator.itemgetter(1))[-1][0]
print(base)

red


#### get term per tweet index
- only considering those with frequency > 1

In [154]:
subevent['dict_indexes'] = subevent['text_clean'].apply(lambda text: dict((token,index) for index,token in enumerate(text.split()) if token in counts))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


#### normalize token indexes to base

In [155]:
subevent['dict_indexes'] = subevent['dict_indexes'].apply(lambda d: dict((token, token_index - d.get(base)) for token,token_index in d.items() if base in d.keys()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


#### get term weights

$weight(node) = count(node) - distance(base) * \ln(count(node))$

In [156]:
subevent['dict_weights'] = subevent['dict_indexes'].apply(lambda d: dict((token, (counts.get(token) - token_index * math.log1p(counts.get(token)))) for token,token_index in d.items()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


#### aggregate term weights to text

In [157]:
subevent['text_weight'] = subevent['dict_weights'].apply(lambda d: sum(d.values()))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


#### print summary

In [158]:
subevent.sort_values(by='text_weight', ascending=False).iloc[0]['text']

'#F1 #MonacoGP Ricciardo got a super late in pits, what happened to Red Bull???'

In [160]:
subevent.sort_values(by='text_weight')

Unnamed: 0_level_0,twitter_id,text,rank,text_clean,dict_indexes,dict_weights,text_weight
created_at,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2016-05-29 12:57:00,736904174215016448,Everyone changing onto dry tires now. #MonacoG...,1.0,everyone changing onto dry tires now monacogp ...,{},{},0.0
2016-05-29 12:57:00,736904173485064193,That was a @Tiametmarduk pit stop right there....,1.0,that was a tiametmarduk pit stop right there f...,{},{},0.0
2016-05-29 12:57:00,736904172683939840,You had one job.... #RedBull Tire guys... #Mon...,1.0,you had one job .... redbull tire guys monacog...,{},{},0.0
2016-05-29 12:57:00,736904174957256705,WTF Red BUll !!! #ABCF1 #monacogp,1.0,wtf red bull abcf 1 monacogp,"{'wtf': -1, 'red': 0, 'abcf': 2, 'bull': 1, '1...","{'wtf': 32.40119738166216, 'red': 259.0, 'abcf...",1417.267554
2016-05-29 12:57:00,736904171425783808,Red Bull with a costly error in the pits there...,1.0,red bull with a costly error in pits there mon...,"{'f1': 10, 'pits': 7, 'a': 3, 'costly': 4, 'in...","{'f1': 195.70570912488577, 'pits': 8.315476327...",1716.425365
2016-05-29 12:57:00,736904171094343680,Disastrous pit stop #MonacoGP #Blunder,1.0,disastrous pit stop monacogp blunder,{},{},0.0
2016-05-29 12:57:00,736904170926575617,@redbullracing what the hell have you done!! #...,1.0,redbullracing what hell have you done worstpit...,{},{},0.0
2016-05-29 12:57:00,736904172621160448,Red Bull gives you a fast asleep pit crew #Mon...,1.0,red bull gives you a fast asleep pit crew mona...,"{'gives': 2, 'you': 3, 'a': 4, 'bull': 1, 'pit...","{'gives': 4.394829814011908, 'monacogp': 795.2...",1603.037015
2016-05-29 12:57:01,736904177989877760,What does Ricciardo have to do? #MonacoGP,1.0,what does ricciardo have to do monacogp,{},{},0.0
2016-05-29 12:57:01,736904179076214785,Who fucked that at Redbull 😠😠 poor Riccardo #M...,1.0,who fucked that at redbull 😠 😠 poor riccardo m...,{},{},0.0
