In [5]:
#TERM FREQUENCY - tf sub(t,d) = sum of occurences term (t) in doc (d)
#DOCUMENT FREQUENCY - df sub(t) = sum of docs (d) in collection containing term (t)
#INVERSE DOCUMENT FREQUENCY - idf sub(t) = log of total docs in collection (N) over document frequency
#WEIGHTED TF-IDF - tf-idf sub(t,d) - sum of occurences term (t) in doc (d) times inverse document frequency

In [1]:
import pandas as pd
import nltk
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
elongated = re.compile('([a-zA-Z])\\1{2,}')

In [17]:
event_name = '[TEDxNations]'

In [18]:
event_data = pd.read_table('%s_data_clean.txt' % event_name, sep='\t', header=0, encoding='utf-8')

In [6]:
#add basic count features
event_data['count_links'] = event_data['text'].apply(lambda text: len([w for w in text.split() if w.startswith(('http://', 'https://'))]))
event_data['count_hashtags'] = event_data['text'].apply(lambda text: len([w for w in text.split() if w.startswith('#')]))
event_data['count_mentions'] = event_data['text'].apply(lambda text: len([w for w in text.split() if w.startswith('@')]))
event_data['count_words'] = event_data['text'].apply(lambda text: len([w for w in text.split() if not w.startswith(('RT', '@', '#'))]))
event_data['count_characters'] = event_data['text'].apply(lambda text: len(str(text)))
event_data['count_non_characters'] = event_data['text_nolink'].apply(lambda text: len(re.sub('[\w+!@#$%&;:,.?\/\-“”’`"\'()|]', '', text).strip()))
event_data['count_upper'] = event_data['text_nolink'].apply(lambda text: len([l for l in ' '.join([w for w in text.split() if not w.startswith(('#', '@'))]) if l.isupper()]))
event_data['bool_question'] = event_data['text_clean'].apply(lambda text: 1 if '?' in text else 0)
event_data['bool_elongation'] = event_data['text_clean'].apply(lambda text: 1 if bool(elongated.search(text)) else 0)
event_data['bool_ellipsis'] = event_data['text_clean'].apply(lambda text: 1 if any(x in text for x in ('...', '…')) else 0)

In [7]:
#compute top tokens
def gather_tokens(data):
    all_tokens = []
    for doc in data:
        tokens = doc.split()
        all_tokens.extend(tokens)
    return all_tokens

#gather tokens from all docs 
all_tokens = gather_tokens(event_data['text_clean_tokens'])
#create counter object
tokens_cntr = Counter(all_tokens)

In [8]:
tokens_cntr.most_common(10)

[('tedxnations', 1752),
 ('ungeneva', 283),
 ('tedxpdnations', 198),
 ('live', 167),
 ('11', 144),
 ('rt', 142),
 ('watch', 127),
 ('lagosviewingparty', 126),
 ('lives', 122),
 ('people', 106)]

In [19]:
#initialize a vectorizer, require minimum freq. of terms at 2
count_vect = CountVectorizer(min_df=2)

In [20]:
#Learn the vocabulary dictionary and return term-document matrix
train_matrix_cnt = count_vect.fit_transform(event_data['text_clean_tokens'])

In [21]:
#Fit and Transform count sparse matrix to normalized tf-idf sparse matrix
#first fit transformer which computes idf values
tfidf_transformer = TfidfTransformer().fit(train_matrix_cnt)
#second transform back to sparse matrix with tfidf values
train_matrix_tfidf = tfidf_transformer.transform(train_matrix_cnt)

In [22]:
#explore sparse matrix
print('sparse matrix shape:', train_matrix_cnt.shape)
print('size:', (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))
print('non-zeros:', train_matrix_cnt.getnnz())
print('sparsity: %.2f%%' % (100.0 * (((train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]) - train_matrix_cnt.getnnz()) / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))))
print('density: %.2f%%' % (100.0 * train_matrix_cnt.getnnz() / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1])))

sparse matrix shape: (1778, 1550)
size: 2755900
non-zeros: 15666
sparsity: 99.43%
density: 0.57%


In [180]:
#TEST EXAMPLE term frequencies
data_index = 0

data = pd.DataFrame(columns=['term', 'tf', 'df', 'idf', 'tfidf'])

for feature_index in train_matrix_cnt[data_index].nonzero()[1]:
    
    term = count_vect.get_feature_names()[feature_index]
    tf = train_matrix_cnt[data_index, feature_index]
    df = tokens_cntr.get(term)
    idf = round(tfidf_transformer.idf_[feature_index], 4)
    tfidf = round(train_matrix_tfidf[data_index, feature_index], 4)
    
    row = [term, tf, df, idf, tfidf]
    row = pd.Series(row, index=['term', 'tf', 'df', 'idf', 'tfidf'])
    row = pd.DataFrame(row).T
    data = data.append(row, ignore_index=True)

data

Unnamed: 0,term,tf,df,idf,tfidf
0,how,1,31,5.0181,0.3031
1,icrc,1,29,5.0498,0.305
2,work,1,15,5.7112,0.345
3,globally,1,5,6.692,0.4042
4,address,1,12,5.9189,0.3575
5,sexualviolence,1,66,4.2791,0.2585
6,conflict,1,20,5.4393,0.3285
7,learn,1,15,5.6506,0.3413
8,tedxnations,1,1752,1.0142,0.0613
9,via,1,19,5.4881,0.3315


In [181]:
#add aggregate tfidf to data
event_data_tfidf = pd.DataFrame()

for i,doc in event_data.iterrows():
    
    if i % 50 == 0:
        complete = round((i/event_data.shape[0])*100, 2)
        print('%s%% complete' % complete)
    
    tfs = train_matrix_cnt[i].data
    tfidfs = train_matrix_tfidf[i].data
    t_distinct = len(tfs)
    t_sum = tfs.sum()
    tfidf_sum = tfidfs.sum()
    tfidf_mean = (0 if tfidf_sum == 0 else tfidfs.mean())

    row = [t_distinct, t_sum, tfidf_sum, tfidf_mean]
    row = pd.Series(row, index=['t_distinct', 't_sum', 'tfidf_sum', 'tfidf_mean'])
    row = pd.DataFrame(row).T
    event_data_tfidf = event_data_tfidf.append(row, ignore_index=True)
    
event_data = pd.merge(event_data, event_data_tfidf, left_index=True, right_index=True)
print('100%% complete')

0.0% complete
2.81% complete
5.62% complete
8.44% complete
11.25% complete
14.06% complete
16.87% complete
19.69% complete
22.5% complete
25.31% complete
28.12% complete
30.93% complete
33.75% complete
36.56% complete
39.37% complete
42.18% complete
44.99% complete
47.81% complete
50.62% complete
53.43% complete
56.24% complete
59.06% complete
61.87% complete
64.68% complete
67.49% complete
70.3% complete
73.12% complete
75.93% complete
78.74% complete
81.55% complete
84.36% complete
87.18% complete
89.99% complete
92.8% complete
95.61% complete
98.43% complete
100%% complete


In [182]:
#calculate and add parts of speech, named entities info to data
event_data_nespos = pd.DataFrame()

#define function for nltk tree mining
def getnes(tree):
    ne = []
    for node in tree:
        if type(node) is nltk.Tree:
            label = node.label()
            s = ''
            for node in node:
                s = (s + ' ' + node[0].lower()).lstrip()
            ne.append([label, s])
    return ne

for i,doc in event_data.iterrows():
    
    if i % 50 == 0:
        complete = round((i/event_data.shape[0])*100, 2)
        print('%s%% complete' % complete)
    
    tokens = nltk.word_tokenize(str(doc['text_clean']))
    
    pos = nltk.pos_tag(tokens)
    pos_cntr = Counter(list(dict(pos).values()))
    pos_data = dict(pos_cntr)
    pos_cnt = sum(pos_data.values())
    
    tree = nltk.ne_chunk(pos)
    nes = getnes(tree)
    nes_cntr = Counter(list(dict(nes).keys()))
    nes_data = dict(nes_cntr)
    nes_cnt = sum(nes_data.values())
    
    row = [pos_cnt, nes_cnt, pos_data, nes_data]
    row = pd.Series(row, index=['pos_cnt', 'nes_cnt', 'pos_data', 'nes_data'])
    row = pd.DataFrame(row).T
    event_data_nespos = event_data_nespos.append(row, ignore_index=True)
    
event_data = pd.merge(event_data, event_data_nespos, left_index=True, right_index=True)
print('100%% complete')

0.0% complete
2.81% complete
5.62% complete
8.44% complete
11.25% complete
14.06% complete
16.87% complete
19.69% complete
22.5% complete
25.31% complete
28.12% complete
30.93% complete
33.75% complete
36.56% complete
39.37% complete
42.18% complete
44.99% complete
47.81% complete
50.62% complete
53.43% complete
56.24% complete
59.06% complete
61.87% complete
64.68% complete
67.49% complete
70.3% complete
73.12% complete
75.93% complete
78.74% complete
81.55% complete
84.36% complete
87.18% complete
89.99% complete
92.8% complete
95.61% complete
98.43% complete
100%% complete


In [185]:
#get all parts of speach
all_pos = []
for i,doc in event_data.iterrows():
    all_pos.extend(list(doc['pos_data'].keys()))

#get unique
all_pos = list(set(all_pos))

#create pos feature df
event_data_pos = pd.DataFrame()
pos_cols = ['pos_cnt_'+pos for pos in all_pos] 

#update pos counts
for i,doc in event_data.iterrows():    
    
    if i % 50 == 0:
        complete = round((i/event_data.shape[0])*100, 2)
        print('%s%% complete' % complete)
    
    #create empty dictionary with keys
    pos_dict = dict.fromkeys(all_pos)
    
    for pos in doc['pos_data'].keys():
        pos_dict[pos] = doc['pos_data'].get(pos)
    
    row = list(pos_dict.values())
    row = pd.Series(row, index=pos_cols)
    row = pd.DataFrame(row).T
    event_data_pos = event_data_pos.append(row, ignore_index=True)
    
event_data = pd.merge(event_data, event_data_pos, left_index=True, right_index=True)
print('100%% complete')

0.0% complete
2.81% complete
5.62% complete
8.44% complete
11.25% complete
14.06% complete
16.87% complete
19.69% complete
22.5% complete
25.31% complete
28.12% complete
30.93% complete
33.75% complete
36.56% complete
39.37% complete
42.18% complete
44.99% complete
47.81% complete
50.62% complete
53.43% complete
56.24% complete
59.06% complete
61.87% complete
64.68% complete
67.49% complete
70.3% complete
73.12% complete
75.93% complete
78.74% complete
81.55% complete
84.36% complete
87.18% complete
89.99% complete
92.8% complete
95.61% complete
98.43% complete
100%% complete


In [186]:
#get all named entities
all_nes = []
for i,doc in event_data.iterrows():
    all_nes.extend(list(doc['nes_data'].keys()))

#get unique
all_nes = list(set(all_nes))

#create nes feature df
event_data_nes = pd.DataFrame()
ne_cols = ['ne_cnt_'+ne for ne in all_nes] 
    
#update pos counts
for i,doc in event_data.iterrows():    
    
    if i % 50 == 0:
        complete = round((i/event_data.shape[0])*100, 2)
        print('%s%% complete' % complete)
    
    #create empty dictionary with keys
    nes_dict = dict.fromkeys(all_nes)
    
    for ne in doc['nes_data'].keys():
        nes_dict[ne] = doc['nes_data'].get(ne)
    
    row = list(nes_dict.values())
    row = pd.Series(row, index=ne_cols)
    row = pd.DataFrame(row).T
    event_data_nes = event_data_nes.append(row, ignore_index=True)
    
event_data = pd.merge(event_data, event_data_nes, left_index=True, right_index=True)
print('100%% complete')

0.0% complete
2.81% complete
5.62% complete
8.44% complete
11.25% complete
14.06% complete
16.87% complete
19.69% complete
22.5% complete
25.31% complete
28.12% complete
30.93% complete
33.75% complete
36.56% complete
39.37% complete
42.18% complete
44.99% complete
47.81% complete
50.62% complete
53.43% complete
56.24% complete
59.06% complete
61.87% complete
64.68% complete
67.49% complete
70.3% complete
73.12% complete
75.93% complete
78.74% complete
81.55% complete
84.36% complete
87.18% complete
89.99% complete
92.8% complete
95.61% complete
98.43% complete
100%% complete


In [190]:
#na as 0
event_data.to_csv('%s_data_features.txt' % event_name, sep='\t', encoding='utf-8', header=True, index=False, na_rep=0)