In [5]:
#TERM FREQUENCY - tf sub(t,d) = sum of occurences term (t) in doc (d)
#DOCUMENT FREQUENCY - df sub(t) = sum of docs (d) in collection containing term (t)
#INVERSE DOCUMENT FREQUENCY - idf sub(t) = log of total docs in collection (N) over document frequency
#WEIGHTED TF-IDF - tf-idf sub(t,d) - sum of occurences term (t) in doc (d) times inverse document frequency

In [1]:
import pandas as pd
import nltk
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
elongated = re.compile('([a-zA-Z])\\1{2,}')

In [3]:
event_name = '[egyptair]'

In [4]:
event_data = pd.read_table('data/%s_data_clean.txt' % event_name, sep='\t', header=0, encoding='utf-8')

In [5]:
list(event_data.columns)

['id',
 'created_at',
 'text',
 'text_nolink',
 'text_clean',
 'text_clean_tokens',
 'text_clean_stems']

In [6]:
#add basic count features
event_data['count_links'] = event_data['text'].apply(lambda text: len([w for w in text.split() if w.startswith(('http://', 'https://'))]))
event_data['count_hashtags'] = event_data['text'].apply(lambda text: len([w for w in text.split() if w.startswith('#')]))
event_data['count_mentions'] = event_data['text'].apply(lambda text: len([w for w in text.split() if w.startswith('@')]))
event_data['count_words'] = event_data['text'].apply(lambda text: len([w for w in text.split() if not w.startswith(('RT', '@', '#'))]))
event_data['count_stopwords'] = event_data.apply(lambda doc: len(doc['text_clean'].split()) - len(doc['text_clean_tokens'].split()), axis=1)
event_data['count_characters'] = event_data['text'].apply(lambda text: len(str(text)))
event_data['count_non_characters'] = event_data['text_nolink'].apply(lambda text: len(re.sub('[\w+!@#$%&;:,.?\/\-“”’`"\'()|]', '', text).strip()))
event_data['count_upper'] = event_data['text_nolink'].apply(lambda text: len([l for l in ' '.join([w for w in text.split() if not w.startswith(('#', '@'))]) if l.isupper()]))
event_data['bool_question'] = event_data['text_clean'].apply(lambda text: 1 if '?' in text else 0)
event_data['bool_elongation'] = event_data['text_clean'].apply(lambda text: 1 if bool(elongated.search(text)) else 0)
event_data['bool_ellipsis'] = event_data['text_clean'].apply(lambda text: 1 if any(x in text for x in ('...', '…')) else 0)

In [7]:
#compute top tokens
def gather_tokens(data):
    all_tokens = []
    for doc in data:
        tokens = doc.split()
        all_tokens.extend(tokens)
    return all_tokens

#gather tokens from all docs 
all_tokens = gather_tokens(event_data['text_clean_tokens'])
#create counter object
tokens_cntr = Counter(all_tokens)

In [8]:
tokens_cntr.most_common(10)

[('egyptair', 5183),
 ('plane', 1550),
 ('hijacked', 1531),
 ('hijacker', 1253),
 ('cyprus', 1148),
 ('passengers', 779),
 ('flight', 618),
 ('hijacking', 599),
 ('hijack', 571),
 ('love', 535)]

In [9]:
#initialize a vectorizer, require minimum freq. of terms at 2
count_vect = CountVectorizer(min_df=2)

In [10]:
#Learn the vocabulary dictionary and return term-document matrix
train_matrix_cnt = count_vect.fit_transform(event_data['text_clean_tokens'])

In [11]:
#Fit and Transform count sparse matrix to normalized tf-idf sparse matrix
#first fit transformer which computes idf values
tfidf_transformer = TfidfTransformer().fit(train_matrix_cnt)
#second transform back to sparse matrix with tfidf values
train_matrix_tfidf = tfidf_transformer.transform(train_matrix_cnt)

In [12]:
#explore sparse matrix
print('sparse matrix shape:', train_matrix_cnt.shape)
print('size:', (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))
print('non-zeros:', train_matrix_cnt.getnnz())
print('sparsity: %.2f%%' % (100.0 * (((train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]) - train_matrix_cnt.getnnz()) / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))))
print('density: %.2f%%' % (100.0 * train_matrix_cnt.getnnz() / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1])))

sparse matrix shape: (5297, 2371)
size: 12559187
non-zeros: 46425
sparsity: 99.63%
density: 0.37%


In [13]:
#TEST EXAMPLE term frequencies
data_index = 0

data = pd.DataFrame(columns=['term', 'tf', 'df', 'idf', 'tfidf'])

for feature_index in train_matrix_cnt[data_index].nonzero()[1]:
    
    term = count_vect.get_feature_names()[feature_index]
    tf = train_matrix_cnt[data_index, feature_index]
    df = tokens_cntr.get(term)
    idf = round(tfidf_transformer.idf_[feature_index], 4)
    tfidf = round(train_matrix_tfidf[data_index, feature_index], 4)
    
    row = [term, tf, df, idf, tfidf]
    row = pd.Series(row, index=['term', 'tf', 'df', 'idf', 'tfidf'])
    row = pd.DataFrame(row).T
    data = data.append(row, ignore_index=True)

data

Unnamed: 0,term,tf,df,idf,tfidf
0,foreign,1,44,5.7684,0.2694
1,ministry,1,85,5.1324,0.2397
2,denies,1,7,7.4956,0.35
3,told,1,7,7.4956,0.35
4,guardian,2,16,6.8025,0.6353
5,egyptair,1,5183,1.0434,0.0487
6,hijacker,1,1253,2.4734,0.1155
7,idiot,1,130,4.7388,0.2213
8,spoke,1,2,8.4765,0.3958


In [14]:
#add aggregate tfidf to data
event_data_tfidf = pd.DataFrame()

for i,doc in event_data.iterrows():
    
    if i % 50 == 0:
        complete = round((i/event_data.shape[0])*100, 2)
        print('%s%% complete' % complete)
    
    tfs = train_matrix_cnt[i].data
    tfidfs = train_matrix_tfidf[i].data
    t_distinct = len(tfs)
    t_sum = tfs.sum()
    tfidf_sum = tfidfs.sum()
    tfidf_mean = (0 if tfidf_sum == 0 else tfidfs.mean())

    row = [t_distinct, t_sum, tfidf_sum, tfidf_mean]
    row = pd.Series(row, index=['t_distinct', 't_sum', 'tfidf_sum', 'tfidf_mean'])
    row = pd.DataFrame(row).T
    event_data_tfidf = event_data_tfidf.append(row, ignore_index=True)
    
event_data = pd.merge(event_data, event_data_tfidf, left_index=True, right_index=True)
print('100%% complete')

0.0% complete
0.94% complete
1.89% complete
2.83% complete
3.78% complete
4.72% complete
5.66% complete
6.61% complete
7.55% complete
8.5% complete
9.44% complete
10.38% complete
11.33% complete
12.27% complete
13.22% complete
14.16% complete
15.1% complete
16.05% complete
16.99% complete
17.93% complete
18.88% complete
19.82% complete
20.77% complete
21.71% complete
22.65% complete
23.6% complete
24.54% complete
25.49% complete
26.43% complete
27.37% complete
28.32% complete
29.26% complete
30.21% complete
31.15% complete
32.09% complete
33.04% complete
33.98% complete
34.93% complete
35.87% complete
36.81% complete
37.76% complete
38.7% complete
39.65% complete
40.59% complete
41.53% complete
42.48% complete
43.42% complete
44.36% complete
45.31% complete
46.25% complete
47.2% complete
48.14% complete
49.08% complete
50.03% complete
50.97% complete
51.92% complete
52.86% complete
53.8% complete
54.75% complete
55.69% complete
56.64% complete
57.58% complete
58.52% complete
59.47% com

In [15]:
#calculate and add parts of speech, named entities info to data
event_data_nespos = pd.DataFrame()

#define function for nltk tree mining
def getnes(tree):
    ne = []
    for node in tree:
        if type(node) is nltk.Tree:
            label = node.label()
            s = ''
            for node in node:
                s = (s + ' ' + node[0].lower()).lstrip()
            ne.append([label, s])
    return ne

for i,doc in event_data.iterrows():
    
    if i % 50 == 0:
        complete = round((i/event_data.shape[0])*100, 2)
        print('%s%% complete' % complete)
    
    tokens = nltk.word_tokenize(str(doc['text_clean']))
    
    pos = nltk.pos_tag(tokens)
    pos_cntr = Counter(list(dict(pos).values()))
    pos_data = dict(pos_cntr)
    pos_cnt = sum(pos_data.values())
    
    tree = nltk.ne_chunk(pos)
    nes = getnes(tree)
    nes_cntr = Counter(list(dict(nes).keys()))
    nes_data = dict(nes_cntr)
    nes_cnt = sum(nes_data.values())
    
    row = [pos_cnt, nes_cnt, pos_data, nes_data]
    row = pd.Series(row, index=['pos_cnt', 'nes_cnt', 'pos_data', 'nes_data'])
    row = pd.DataFrame(row).T
    event_data_nespos = event_data_nespos.append(row, ignore_index=True)
    
event_data = pd.merge(event_data, event_data_nespos, left_index=True, right_index=True)
print('100%% complete')

0.0% complete
0.94% complete
1.89% complete
2.83% complete
3.78% complete
4.72% complete
5.66% complete
6.61% complete
7.55% complete
8.5% complete
9.44% complete
10.38% complete
11.33% complete
12.27% complete
13.22% complete
14.16% complete
15.1% complete
16.05% complete
16.99% complete
17.93% complete
18.88% complete
19.82% complete
20.77% complete
21.71% complete
22.65% complete
23.6% complete
24.54% complete
25.49% complete
26.43% complete
27.37% complete
28.32% complete
29.26% complete
30.21% complete
31.15% complete
32.09% complete
33.04% complete
33.98% complete
34.93% complete
35.87% complete
36.81% complete
37.76% complete
38.7% complete
39.65% complete
40.59% complete
41.53% complete
42.48% complete
43.42% complete
44.36% complete
45.31% complete
46.25% complete
47.2% complete
48.14% complete
49.08% complete
50.03% complete
50.97% complete
51.92% complete
52.86% complete
53.8% complete
54.75% complete
55.69% complete
56.64% complete
57.58% complete
58.52% complete
59.47% com

In [16]:
#get all parts of speach
all_pos = []
for i,doc in event_data.iterrows():
    all_pos.extend(list(doc['pos_data'].keys()))

#get unique
all_pos = list(set(all_pos))

#create pos feature df
event_data_pos = pd.DataFrame()
pos_cols = ['pos_cnt_'+pos for pos in all_pos] 

#update pos counts
for i,doc in event_data.iterrows():    
    
    if i % 50 == 0:
        complete = round((i/event_data.shape[0])*100, 2)
        print('%s%% complete' % complete)
    
    #create empty dictionary with keys
    pos_dict = dict.fromkeys(all_pos)
    
    for pos in doc['pos_data'].keys():
        pos_dict[pos] = doc['pos_data'].get(pos)
    
    row = list(pos_dict.values())
    row = pd.Series(row, index=pos_cols)
    row = pd.DataFrame(row).T
    event_data_pos = event_data_pos.append(row, ignore_index=True)
    
event_data = pd.merge(event_data, event_data_pos, left_index=True, right_index=True)
print('100%% complete')

0.0% complete
0.94% complete
1.89% complete
2.83% complete
3.78% complete
4.72% complete
5.66% complete
6.61% complete
7.55% complete
8.5% complete
9.44% complete
10.38% complete
11.33% complete
12.27% complete
13.22% complete
14.16% complete
15.1% complete
16.05% complete
16.99% complete
17.93% complete
18.88% complete
19.82% complete
20.77% complete
21.71% complete
22.65% complete
23.6% complete
24.54% complete
25.49% complete
26.43% complete
27.37% complete
28.32% complete
29.26% complete
30.21% complete
31.15% complete
32.09% complete
33.04% complete
33.98% complete
34.93% complete
35.87% complete
36.81% complete
37.76% complete
38.7% complete
39.65% complete
40.59% complete
41.53% complete
42.48% complete
43.42% complete
44.36% complete
45.31% complete
46.25% complete
47.2% complete
48.14% complete
49.08% complete
50.03% complete
50.97% complete
51.92% complete
52.86% complete
53.8% complete
54.75% complete
55.69% complete
56.64% complete
57.58% complete
58.52% complete
59.47% com

In [17]:
#get all named entities
all_nes = []
for i,doc in event_data.iterrows():
    all_nes.extend(list(doc['nes_data'].keys()))

#get unique
all_nes = list(set(all_nes))

#create nes feature df
event_data_nes = pd.DataFrame()
ne_cols = ['ne_cnt_'+ne for ne in all_nes] 
    
#update pos counts
for i,doc in event_data.iterrows():    
    
    if i % 50 == 0:
        complete = round((i/event_data.shape[0])*100, 2)
        print('%s%% complete' % complete)
    
    #create empty dictionary with keys
    nes_dict = dict.fromkeys(all_nes)
    
    for ne in doc['nes_data'].keys():
        nes_dict[ne] = doc['nes_data'].get(ne)
    
    row = list(nes_dict.values())
    row = pd.Series(row, index=ne_cols)
    row = pd.DataFrame(row).T
    event_data_nes = event_data_nes.append(row, ignore_index=True)
    
event_data = pd.merge(event_data, event_data_nes, left_index=True, right_index=True)
print('100%% complete')

0.0% complete
0.94% complete
1.89% complete
2.83% complete
3.78% complete
4.72% complete
5.66% complete
6.61% complete
7.55% complete
8.5% complete
9.44% complete
10.38% complete
11.33% complete
12.27% complete
13.22% complete
14.16% complete
15.1% complete
16.05% complete
16.99% complete
17.93% complete
18.88% complete
19.82% complete
20.77% complete
21.71% complete
22.65% complete
23.6% complete
24.54% complete
25.49% complete
26.43% complete
27.37% complete
28.32% complete
29.26% complete
30.21% complete
31.15% complete
32.09% complete
33.04% complete
33.98% complete
34.93% complete
35.87% complete
36.81% complete
37.76% complete
38.7% complete
39.65% complete
40.59% complete
41.53% complete
42.48% complete
43.42% complete
44.36% complete
45.31% complete
46.25% complete
47.2% complete
48.14% complete
49.08% complete
50.03% complete
50.97% complete
51.92% complete
52.86% complete
53.8% complete
54.75% complete
55.69% complete
56.64% complete
57.58% complete
58.52% complete
59.47% com

In [18]:
#na as 0
event_data.to_csv('data/%s_data_clean_features.txt' % event_name, sep='\t', encoding='utf-8', header=True, index=False, na_rep=0)