In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import linear_kernel

In [14]:
event_name = '[TEDxNations]'

In [15]:
event_data = pd.read_table('%s_data_clean.txt' % event_name, sep='\t', header=0, encoding='utf-8')

In [24]:
#initialize a vectorizer, require minimum freq. of terms at 2
count_vect = CountVectorizer(min_df=2)

In [25]:
#Learn the vocabulary dictionary and return term-document matrix
train_matrix_cnt = count_vect.fit_transform(event_data['text_clean_tokens'])

In [26]:
#Fit and Transform count sparse matrix to normalized tf-idf sparse matrix
#first fit transformer which computes idf values
tfidf_transformer = TfidfTransformer().fit(train_matrix_cnt)
#second transform back to sparse matrix with tfidf values
train_matrix_tfidf = tfidf_transformer.transform(train_matrix_cnt)

In [27]:
#explore sparse matrix
print('sparse matrix shape:', train_matrix_cnt.shape)
print('size:', (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))
print('non-zeros:', train_matrix_cnt.getnnz())
print('sparsity: %.2f%%' % (100.0 * (((train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]) - train_matrix_cnt.getnnz()) / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1]))))
print('density: %.2f%%' % (100.0 * train_matrix_cnt.getnnz() / (train_matrix_cnt.shape[0] * train_matrix_cnt.shape[1])))

sparse matrix shape: (1778, 1550)
size: 2755900
non-zeros: 15666
sparsity: 99.43%
density: 0.57%


In [53]:
doc_index = 0

#compute dot product cosine similarity of indexed document with all others
cosine_similarities = linear_kernel(train_matrix_tfidf[0:doc_index+1], train_matrix_tfidf).flatten()

In [66]:
#top 10 similar
similar = cosine_similarities.argsort()[:-10:-1]

In [59]:
#doc of interest
print(event_data.iloc[doc_index]['text_nolink'])

How does @ICRC work globally to address #sexualviolence in conflict? Learn more here:  #TEDxNations via @PMeigeICRC


In [67]:
#show top similar docs
for i in similar:
    if i != doc_index:
        print('index:',i, '\tmeasure:', round(cosine_similarities[i],2), '\n text:', event_data.iloc[i]['text_nolink'])
        print()

index: 757 	measure: 0.94 
 text: How does @ICRC work globally to address #sexualviolence in conflict? Learn more here:  #TEDxNations

index: 808 	measure: 0.94 
 text: How does @ICRC work globally to address #sexualviolence in conflict? Learn more here:  #TEDxNations

index: 47 	measure: 0.34 
 text: What can you do to end #sexualviolence in conflict areas? Coline Rapneau answers at #TEDxNations:  via @TEDx #VAW

index: 796 	measure: 0.33 
 text: Break the taboo on #SexualViolence in #conflict by shifting the #blame - Coline Rapneau @ICRC #TEDxNations 

index: 1523 	measure: 0.3 
 text: How can we better address #sexualviolence in conflict? Watch Coline Rapneau talk at #TEDxNations - 11th Feb at 4pm: 

index: 793 	measure: 0.29 
 text: @ICRC Coline Rapneau underlines today at #TEDxNations the steps we can take to defeat #sexualviolence in conflict: 

index: 754 	measure: 0.28 
 text: We ALL have a role to play. #SexualViolence is not unspeakable. We must openly address the issue -C. R