In [1]:
import pandas as pd
import scipy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel

In [10]:
event_name = '[TEDxNations]'

In [11]:
event_data = pd.read_table('data/%s_data_clean.txt' % event_name, sep='\t', header=0, encoding='utf-8')

In [12]:
#initialize a vectorizer, require minimum freq. of terms at 2
count_vect = CountVectorizer(min_df=2)

In [13]:
#Learn the vocabulary dictionary and return term-document matrix
train_matrix_cnt = count_vect.fit_transform(event_data['text_clean_tokens'])

In [14]:
#Fit and Transform count sparse matrix to normalized tf-idf sparse matrix
#first fit transformer which computes idf values
tfidf_transformer = TfidfTransformer().fit(train_matrix_cnt)
#second transform back to sparse matrix with tfidf values
train_matrix_tfidf = tfidf_transformer.transform(train_matrix_cnt)

In [15]:
#explore sparse matrix
print('sparse matrix shape:', train_matrix_cnt.shape)
print('size:', (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))
print('non-zeros:', train_matrix_cnt.getnnz())
print('sparsity: %.2f%%' % (100.0 * (((train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]) - train_matrix_cnt.getnnz()) / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))))
print('density: %.2f%%' % (100.0 * train_matrix_cnt.getnnz() / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1])))

sparse matrix shape: (1659, 1497)
size: 2483523
non-zeros: 14463
sparsity: 99.42%
density: 0.58%


In [16]:
#construct square doc matrix, of cosine-similarity values, using pandas dataframe
X = pd.DataFrame()

for i,doc in event_data.iterrows():
    
    if i % 50 == 0:
        complete = round((i/event_data.shape[0])*100, 2)
        print('%s%% complete' % complete)
    
    #compute dot product cosine similarity of indexed document with all others
    cosine_similarities = linear_kernel(train_matrix_tfidf[i], train_matrix_tfidf).flatten()
    s = pd.Series(cosine_similarities)
    df = pd.DataFrame(s, index=list(s.index)).T
    X = X.append(df, ignore_index=True)

print('100%% complete')

0.0% complete
3.01% complete
6.03% complete
9.04% complete
12.06% complete
15.07% complete
18.08% complete
21.1% complete
24.11% complete
27.12% complete
30.14% complete
33.15% complete
36.17% complete
39.18% complete
42.19% complete
45.21% complete
48.22% complete
51.24% complete
54.25% complete
57.26% complete
60.28% complete
63.29% complete
66.31% complete
69.32% complete
72.33% complete
75.35% complete
78.36% complete
81.37% complete
84.39% complete
87.4% complete
90.42% complete
93.43% complete
96.44% complete
99.46% complete
100%% complete


In [20]:
#save similarity matrix (WARNING: LARGE FILE)
#X.to_csv('data/%s_data_clean_cosinesimX.txt' % event_name, sep='\t', encoding='utf-8', header=True, index=True)

In [18]:
#get top n similiar docs from matrix, from indexed doc of interest
#first doc of cosine-similarity=1 will be the doc itself
doc_index = 48
n = 10

similar = X.sort(doc_index, ascending=False).head(n)

#print doc of interest
print(event_data.iloc[doc_index]['text_nolink'])
print()

for i in list(similar.index):
    print('index:', i, '\tcosine-sim:', X.iloc[i][doc_index])
    print('  text:', event_data.iloc[i]['text_nolink'])
    print()

Retweeted UNIC Lagos (@UNICLagos):Audience @TEDxPdNations #TEDxNations lagosViewingParty  

index: 48 	cosine-sim: 1.0
  text: Retweeted UNIC Lagos (@UNICLagos):Audience @TEDxPdNations #TEDxNations lagosViewingParty  

index: 354 	cosine-sim: 1.0
  text: Retweeted UNIC Lagos (@UNICLagos):Audience LagosViewingParty @TEDxPdNations #TEDxNations  

index: 304 	cosine-sim: 0.843656655461
  text: Retweeted UNIC Lagos (@UNICLagos):Audience discussion time #TEDxNations lagosViewingParty @TEDxPdNations... 

index: 757 	cosine-sim: 0.826817409408
  text: Retweeted UNIC Lagos (@UNICLagos):Audience at the #TEDxNations LagosViewingParty organised by @UNICLagos and... 

index: 81 	cosine-sim: 0.768650499179
  text: Retweeted UNIC Lagos (@UNICLagos):Closing from #TEDxNations lagosViewingParty @TEDxPdNations... 

index: 305 	cosine-sim: 0.694769920614
  text: Retweeted UNIC Lagos (@UNICLagos):Discussion ongoing @TEDxPdNations #TEDxNations lagosViewingParty... 

index: 88 	cosine-sim: 0.693226888507
  