In [5]:
import pandas as pd
import numpy as np
from collections import Counter
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
import nltk

In [2]:
#create elongation regular expression match
elongated = re.compile('([a-zA-Z])\\1{3,}')

In [3]:
#load data
df = pd.read_table('data/final/event_panama_papers_data.txt', sep='\t', encoding='utf-8', header=0)
df.shape

  interactivity=interactivity, compiler=compiler, result=result)


(2295892, 37)

In [32]:
#fill NaN in some features
df['user_id_verified'] = df['user_id_verified'].astype(object).replace(np.nan, 0)
#encode some features
df['user_id_verified'] = df['user_id_verified'].apply(lambda d: [0,1][d==True]).value_counts()

In [33]:
#make sure to handle text as string
df['text'] = df['text'].astype('str')
df['text_clean'] = df['text_clean'].astype('str')

In [34]:
#add twitter features
df['count_words'] = df['text_clean'].apply(lambda text: len([w for w in text.split()]))
df['count_stops'] = df.apply(lambda doc: len(doc['text'].split()) - len(doc['text_clean'].split()), axis=1)
df['count_characters'] = df['text'].apply(lambda text: len(str(text)))
df['count_non_characters'] = df['text'].apply(lambda text: len(re.sub('[\w+!@#$%&;:,.?\/\-“”’`"\'()|]', '', text).strip()))
df['count_upper'] = df['text'].apply(lambda text: len([l for l in ' '.join([w for w in text.split() if not w.startswith(('#', '@'))]) if l.isupper()]))
df['bool_question'] = df['text'].apply(lambda text: 1 if '?' in text else 0)
df['bool_elongation'] = df['text'].apply(lambda text: 1 if bool(elongated.search(text)) else 0)
df['bool_ellipsis'] = df['text'].apply(lambda text: 1 if any(x in text for x in ('...', '…')) else 0)
df['lexical_diversity'] = df['text'].apply(lambda text: len(set(text.split())) / len(text.split()))

In [35]:
#add event features

In [36]:
#get query grams
query_terms = df['query'].unique()

In [37]:
#compress series of tweet texts to list
tweets = [ str(d) for d in df[df['is_retweet'] == False]['text_clean']]
#extract tokens as list
tokens = [ w for t in tweets for w in t.split()]
#construct term counter
for w in [tokens]:
    termcounts = Counter(w)
#save top k = 100 most frequent terms
topk_terms = termcounts.most_common(100)

In [38]:
df['query_grams_coverage'] = df['text_clean'].apply(lambda text: len([token for token in text.split() if token.lower() in query_terms]) / len(query_terms))
df['topk_terms_coverage'] = df['text_clean'].apply(lambda text: len([token for token in text.split() if token.lower() in topk_terms]) / len(topk_terms))

In [39]:
#use only non retweets for bag of words statistics
df_nonrt = df[df['is_retweet'] == False][['master_id', 'text_clean']]
df_nonrt = df_nonrt.reset_index(drop=True)
df_nonrt.shape

(907440, 2)

In [40]:
#initialize sklearn vectorizer
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=20, max_features=1000, stop_words='english')
#create matrix of tfidf counts
#not considering retweets, due to redundancy skew
Xtfidf = tfidf_vectorizer.fit_transform(df_nonrt['text_clean'])

In [41]:
#get mean tfidf for each doc
Xtfidf_means = Xtfidf.mean(axis=1)
df_tfidf_means = pd.DataFrame(Xtfidf_means, columns=['tfidf_mean'])
#get sum tfidf for each doc
Xtfidf_sums = Xtfidf.sum(axis=1)
df_tfidf_sums = pd.DataFrame(Xtfidf_sums, columns=['tfidf_sum'])

In [42]:
df_tfidf_stats = pd.concat([df_tfidf_means, df_tfidf_sums], axis=1)

In [43]:
#add tfidf sum, mean as features
df_nonrt = pd.merge(df_nonrt, df_tfidf_stats, how='inner', left_index=True, right_index=True)
#join back to full datatset
df = pd.merge(df, df_nonrt[['master_id', 'tfidf_sum', 'tfidf_mean']], how='left', on='master_id')
#retweet feature values are set to NaN
#because retweets will exempt from modeling

In [44]:
#calculate event centroid using tfidf mean of all columns (1000 top terms)
Xtfidf_centroid = Xtfidf.mean(axis=0)

In [45]:
#compute pairwise distance for each doc to centroid using cosine similarity equation
Xtfidf_centroid_cosdistance = pairwise_distances(X=Xtfidf, Y=Xtfidf_centroid, metric='cosine')

In [46]:
df_centroid_distance = pd.DataFrame(Xtfidf_centroid_cosdistance, columns=['event_centroid_distance'])

In [47]:
#add centroid distance as feature
df_nonrt = pd.merge(df_nonrt, df_centroid_distance, how='inner', left_index=True, right_index=True)
#join back to full datatset
df = pd.merge(df, df_nonrt[['master_id', 'event_centroid_distance']], how='left', on='master_id')
#retweet centroid values are set to NaN
#because retweets will exempt from modeling

In [None]:
#classify parts of speech, named entities using nltk classifier
#aggregate pos, ne counts, add counts and dictionary of parts/entities to database
#this is a slow process, should be redesigned
df_nespos = pd.DataFrame()

#define function for nltk tree mining
def getnes(tree):
    ne = []
    for node in tree:
        if type(node) is nltk.Tree:
            label = node.label()
            s = ''
            for node in node:
                s = (s + ' ' + node[0].lower()).lstrip()
            ne.append([label, s])
    return ne

for i,doc in df.iterrows():
    
    if i % 50 == 0:
        complete = round((i/df.shape[0])*100, 2)
        print('%s%% complete' % complete)
    
    tokens = nltk.word_tokenize(str(doc['text_clean']))
    
    pos = nltk.pos_tag(tokens)
    pos_cntr = Counter(list(dict(pos).values()))
    pos_data = dict(pos_cntr)
    pos_cnt = sum(pos_data.values())
    
    tree = nltk.ne_chunk(pos)
    nes = getnes(tree)
    nes_cntr = Counter(list(dict(nes).keys()))
    nes_data = dict(nes_cntr)
    nes_cnt = sum(nes_data.values())
    
    row = [pos_cnt, nes_cnt, pos_data, nes_data]
    row = pd.Series(row, index=['pos_cnt', 'nes_cnt', 'pos_data', 'nes_data'])
    row = pd.DataFrame(row).T
    df_nespos = df_nespos.append(row, ignore_index=True)
    
df = pd.merge(df, df_nespos, left_index=True, right_index=True)
print('100%% complete')

In [None]:
#get all parts of speach counts, add as features
all_pos = []
for i,doc in df.iterrows():
    all_pos.extend(list(doc['pos_data'].keys()))

#get unique pos types
all_pos = list(set(all_pos))

#create pos feature df
df_pos = pd.DataFrame()
pos_cols = ['pos_cnt_'+pos for pos in all_pos] 

#update pos counts
for i,doc in df.iterrows():    
    
    if i % 50 == 0:
        complete = round((i/df.shape[0])*100, 2)
        print('%s%% complete' % complete)
    
    #create empty dictionary with keys
    pos_dict = dict.fromkeys(all_pos)
    
    for pos in doc['pos_data'].keys():
        pos_dict[pos] = doc['pos_data'].get(pos)
    
    row = list(pos_dict.values())
    row = pd.Series(row, index=pos_cols)
    row = pd.DataFrame(row).T
    df_pos = df_pos.append(row, ignore_index=True)
    
df = pd.merge(df, df_pos, left_index=True, right_index=True)
print('100%% complete')

In [None]:
#get all named entities counts, add as features
all_nes = []
for i,doc in df.iterrows():
    all_nes.extend(list(doc['nes_data'].keys()))

#get unique
all_nes = list(set(all_nes))

#create nes feature df
df_nes = pd.DataFrame()
ne_cols = ['ne_cnt_'+ne for ne in all_nes] 
    
#update pos counts
for i,doc in df.iterrows():    
    
    if i % 50 == 0:
        complete = round((i/df.shape[0])*100, 2)
        print('%s%% complete' % complete)
    
    #create empty dictionary with keys
    nes_dict = dict.fromkeys(all_nes)
    
    for ne in doc['nes_data'].keys():
        nes_dict[ne] = doc['nes_data'].get(ne)
    
    row = list(nes_dict.values())
    row = pd.Series(row, index=ne_cols)
    row = pd.DataFrame(row).T
    df_nes = df_nes.append(row, ignore_index=True)
    
df = pd.merge(df, df_nes, left_index=True, right_index=True)
print('100%% complete')

In [48]:
df.shape

(2295892, 37)

In [49]:
df.head(1)

Unnamed: 0,master_id,twitter_id,created_at,coordinates,text,tweet_type,is_retweet,favorite_count,retweet_count,entities_count_hashtags,...,count_upper,bool_question,bool_elongation,bool_ellipsis,lexical_diversity,query_grams_coverage,topk_terms_coverage,tfidf_sum,tfidf_mean,event_centroid_distance
0,0,716414944355287040,2016-04-03 00:00:07,,Environmentalists Call For No New Offshore Dri...,text,False,0,0,0,...,13,0,0,0,1.0,0.2,0.0,1.608957,0.001609,0.877434


In [50]:
#save event data with features
#set nulls to 0 (in case of pos/ne counts)
df.to_csv('data/final/event_panama_papers_data.txt', sep='\t', encoding='utf-8', header=True, index=False, na_rep=0)