In [20]:
import pandas as pd
import scipy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel

In [2]:
event_name = '[TEDxNations]'

In [4]:
event_data = pd.read_table('data/%s_data_clean.txt' % event_name, sep='\t', header=0, encoding='utf-8')

In [5]:
#initialize a vectorizer, require minimum freq. of terms at 2
count_vect = CountVectorizer(min_df=2)

In [6]:
#Learn the vocabulary dictionary and return term-document matrix
train_matrix_cnt = count_vect.fit_transform(event_data['text_clean_tokens'])

In [7]:
#Fit and Transform count sparse matrix to normalized tf-idf sparse matrix
#first fit transformer which computes idf values
tfidf_transformer = TfidfTransformer().fit(train_matrix_cnt)
#second transform back to sparse matrix with tfidf values
train_matrix_tfidf = tfidf_transformer.transform(train_matrix_cnt)

In [8]:
#explore sparse matrix
print('sparse matrix shape:', train_matrix_cnt.shape)
print('size:', (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))
print('non-zeros:', train_matrix_cnt.getnnz())
print('sparsity: %.2f%%' % (100.0 * (((train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]) - train_matrix_cnt.getnnz()) / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))))
print('density: %.2f%%' % (100.0 * train_matrix_cnt.getnnz() / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1])))

sparse matrix shape: (1778, 1550)
size: 2755900
non-zeros: 15666
sparsity: 99.43%
density: 0.57%


In [185]:
#construct square doc matrix, of cosine-similarity values, using pandas dataframe
X = pd.DataFrame()

for i,doc in event_data.iterrows():
    
    if i % 50 == 0:
        complete = round((i/event_data.shape[0])*100, 2)
        print('%s%% complete' % complete)
    
    #compute dot product cosine similarity of indexed document with all others
    cosine_similarities = linear_kernel(train_matrix_tfidf[i], train_matrix_tfidf).flatten()
    s = pd.Series(cosine_similarities)
    df = pd.DataFrame(s, index=list(s.index)).T
    X = X.append(df, ignore_index=True)

print('100%% complete')

0.0% complete
2.81% complete
5.62% complete
8.44% complete
11.25% complete
14.06% complete
16.87% complete
19.69% complete
22.5% complete
25.31% complete
28.12% complete
30.93% complete
33.75% complete
36.56% complete
39.37% complete
42.18% complete
44.99% complete
47.81% complete
50.62% complete
53.43% complete
56.24% complete
59.06% complete
61.87% complete
64.68% complete
67.49% complete
70.3% complete
73.12% complete
75.93% complete
78.74% complete
81.55% complete
84.36% complete
87.18% complete
89.99% complete
92.8% complete
95.61% complete
98.43% complete
100%% complete


In [225]:
#get top n similiar docs from matrix, from indexed doc of interest
doc_index = 100
n = 10

similar = X.sort(doc_index, ascending=False).head(n)

#print doc of interest
print(event_data.iloc[doc_index]['text_nolink'])
print()

for i in list(similar.index):
    print('index:', i, '\tcosine-sim:', X.iloc[i][doc_index])
    print('  text:', event_data.iloc[i]['text_nolink'])
    print()

Retweeted UNIC Lagos (@UNICLagos):Energy is the backbone of all the SDGS - FOSTER @TEDxPdNations #TEDxNations... 

index: 100 	cosine-sim: 1.0
  text: Retweeted UNIC Lagos (@UNICLagos):Energy is the backbone of all the SDGS - FOSTER @TEDxPdNations #TEDxNations... 

index: 54 	cosine-sim: 1.0
  text: Retweeted UNIC Lagos (@UNICLagos):Energy is the backbone of all the SDGS - FOSTER @TEDxPdNations #TEDxNations... 

index: 262 	cosine-sim: 0.74836141561
  text: Energy is the backbone of all the SDGS - FOSTER @TEDxPdNations #TEDxNations lagosViewingParty 

index: 55 	cosine-sim: 0.556083484541
  text: Retweeted UNIC Lagos (@UNICLagos):Audience @TEDxPdNations #TEDxNations lagosViewingParty  

index: 386 	cosine-sim: 0.556083484541
  text: Retweeted UNIC Lagos (@UNICLagos):Audience LagosViewingParty @TEDxPdNations #TEDxNations  

index: 94 	cosine-sim: 0.529599358726
  text: Retweeted UNIC Lagos (@UNICLagos):Closing from #TEDxNations lagosViewingParty @TEDxPdNations... 

index: 104 	cosine-si