In [7]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [30]:
wiki = pd.read_csv(r"D:\dasci\github\dataSci_datasets\coursera_ML_UW\4_Clustering_&_Retrieval\people_wiki.csv")

In [9]:
def load_sparse_csr(filename):
    loader = np.load(filename)
    data = loader['data']
    indices = loader['indices']
    indptr = loader['indptr']
    shape = loader['shape']
    
    return csr_matrix( (data, indices, indptr), shape)

In [12]:
word_count = load_sparse_csr(r"D:\dasci\github\dataSci_datasets\coursera_ML_UW\4_Clustering_&_Retrieval\people_wiki_word_count.npz")

In [25]:
import json

with open(r"D:\dasci\github\dataSci_datasets\coursera_ML_UW\4_Clustering_&_Retrieval\people_wiki_map_index_to_word.json") as jsn:
    map_index_to_word = json.load(jsn)

## Question 1
Among the words that appear in both Barack Obama and Francisco Barrio, take the 5 that appear most frequently in Obama. How many of the articles in the Wikipedia dataset contain all of those 5 words?

In [13]:
from sklearn.neighbors import NearestNeighbors

In [28]:
model = NearestNeighbors(metric='euclidean', algorithm='brute')
model.fit(word_count)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='euclidean',
         metric_params=None, n_jobs=1, n_neighbors=5, p=2, radius=1.0)

In [29]:
word_count

<59071x547979 sparse matrix of type '<class 'numpy.int64'>'
	with 10379283 stored elements in Compressed Sparse Row format>

In [32]:
wiki[wiki['name'] == 'Barack Obama']

Unnamed: 0,URI,name,text
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...


In [35]:
obama_index = wiki[wiki['name'] == 'Barack Obama'].index[0]

In [37]:
distances, indices = model.kneighbors(word_count[obama_index], n_neighbors=10)

In [43]:
# displaying the nearest neighbours, whose indices we have in indices[0]
wiki[wiki.index.isin(indices[0])]

Unnamed: 0,URI,name,text
9210,<http://dbpedia.org/resource/Andy_Anstett>,Andy Anstett,andrue john andy anstett born june 25 1946 is ...
13229,<http://dbpedia.org/resource/Francisco_Barrio>,Francisco Barrio,francisco javier barrio terrazas born november...
14754,<http://dbpedia.org/resource/Mitt_Romney>,Mitt Romney,willard mitt romney born march 12 1947 is an a...
22745,<http://dbpedia.org/resource/Wynn_Normington_H...,Wynn Normington Hugh-Jones,sir wynn normington hughjones kb sometimes kno...
24478,<http://dbpedia.org/resource/Joe_Biden>,Joe Biden,joseph robinette joe biden jr dosf rbnt badn b...
28447,<http://dbpedia.org/resource/George_W._Bush>,George W. Bush,george walker bush born july 6 1946 is an amer...
31423,<http://dbpedia.org/resource/Walter_Mondale>,Walter Mondale,walter frederick fritz mondale born january 5 ...
35357,<http://dbpedia.org/resource/Lawrence_Summers>,Lawrence Summers,lawrence henry larry summers born november 30 ...
35817,<http://dbpedia.org/resource/Barack_Obama>,Barack Obama,barack hussein obama ii brk husen bm born augu...
36364,<http://dbpedia.org/resource/Don_Bonker>,Don Bonker,don leroy bonker born march 7 1937 in denver c...


## Beauty!, we get mostly political people as the nearest neighbours!

In [46]:
def unpack_dict(matrix, map_index_to_word):
    table = sorted(map_index_to_word, key=map_index_to_word.get)
    
    data = matrix.data
    indices = matrix.indices
    indptr = matrix.indptr
    
    num_doc = matrix.shape[0]

    return [{k:v for k,v in zip([table[word_id] for word_id in indices[indptr[i]:indptr[i+1]] ],
                                 data[indptr[i]:indptr[i+1]].tolist())} \
               for i in range(num_doc) ]

wiki['word_count'] = unpack_dict(word_count, map_index_to_word)

In [97]:
def top_words(name):
    """
    Get a table of the most frequent words in the given person's wikipedia page.
    """
    row = wiki[wiki['name'] == name]
    word_count_table = pd.DataFrame(list(row.word_count.values[0].items()), columns=['word','count'])
    return word_count_table.sort_values('count', ascending=False)

obama_words = top_words('Barack Obama')
print (obama_words)

barrio_words = top_words('Francisco Barrio')
print (barrio_words)

               word  count
272             the     40
270              in     30
271             and     21
269              of     18
266              to     14
258             his     11
71            obama      9
138             act      8
260              he      7
268               a      7
191             law      6
263              as      6
212              us      6
264             was      5
265             for      4
97             iraq      4
166      democratic      4
129         control      4
155        military      4
248           after      4
217       president      4
254             has      4
252           first      3
256            from      3
216          states      3
221            2011      3
222            2004      3
214         january      3
165            term      3
261            with      3
..              ...    ...
83            filed      1
66             13th      1
67           combat      1
68      initiatives      1
70   unsuccessfully      1
7

In [79]:
row = wiki[wiki['name'] == 'Barack Obama']

In [102]:
combined_words = obama_words.merge(barrio_words, on='word', suffixes=('_obama','_barrio'))

## IMP - the merge operation doesnot enforce any particular ordering as such

In [106]:
top_5 = combined_words.sort_values('count_obama', ascending=False).word.values[:5]

In [114]:
top_5_set = set(top_5)

In [118]:
wiki.text.str.split().apply(lambda x: len(set(x).intersection(top_5_set))==5).sum()

56066

## Answer 1
56066

## Question 2
Measure the pairwise distance between the Wikipedia pages of Barack Obama, George W. Bush, and Joe Biden. Which of the three pairs has the smallest distance?

In [120]:
from sklearn.metrics.pairwise import euclidean_distances

In [137]:
obama_index = wiki[wiki.name=='Barack Obama'].index[0]
biden_index = wiki[wiki.name=='Joe Biden'].index[0]
bush_index = wiki[wiki.name=='George W. Bush'].index[0]

In [139]:
print (euclidean_distances(word_count[obama_index],word_count[biden_index]))
print (euclidean_distances(word_count[obama_index],word_count[bush_index]))
print (euclidean_distances(word_count[biden_index],word_count[bush_index]))

[[ 33.07567082]]
[[ 34.39476704]]
[[ 32.75667871]]


## Answer 2

Biden and bush seem to have the loswest distance!, seems counter intutive given that obama and biden we President and Vice president

## Question 3
Collect all words that appear both in Barack Obama and George W. Bush pages. Out of those words, find the 10 words that show up most often in Obama's page. Which of the following is NOT one of the 10 words?

In [147]:
obama_words = list(wiki[wiki.name=='Barack Obama'].word_count.values[0].keys())
bush_words = list(wiki[wiki.name=='Barack Obama'].word_count.values[0].keys())

In [None]:
obama_top10 = sorted(wiki[wiki.name=='Barack Obama'].word_count.values[0].items(), key=lambda x:x[1],reverse=True)[:10]

obama_top10 = [i[0] for i in obama_top10]

In [159]:
list(set(bush_words).intersection(set(obama_top10)))

['he', 'the', 'to', 'of', 'act', 'his', 'and', 'obama', 'a', 'in']

## Answer 3
As we can see above, 'presedential is not in the above list!'

## Question 4
Among the words that appear in both Barack Obama and Phil Schiliro, take the 5 that have largest weights in Obama. How many of the articles in the Wikipedia dataset contain all of those 5 words?

In [165]:
tf_idf = load_sparse_csr(r"D:\dasci\github\dataSci_datasets\coursera_ML_UW\4_Clustering_&_Retrieval\people_wiki_tf_idf.npz")

In [166]:
wiki['tf_idf'] = unpack_dict(tf_idf, map_index_to_word)

In [168]:
obama_words = list(wiki[wiki.name=='Barack Obama'].word_count.values[0].keys())
schiliro_words = list(wiki[wiki.name=='Phil Schiliro'].word_count.values[0].keys())
common_os = list(set(schiliro_words).intersection(set(obama_words)))

In [182]:
obama_top = sorted(wiki[wiki.name=='Barack Obama'].tf_idf.values[0].items(), key=lambda x:x[1],reverse=True)
os_top5 = set([i[0] for i in obama_top if i[0] in common_os][:5])

In [183]:
wiki.text.str.split().apply(lambda x: len(set(x).intersection(os_top5))==5).sum()

14

## Answer 4
14

## Question 5
Compute the Euclidean distance between TF-IDF features of Obama and Biden. Round your answer to 3 decimal places. Use American-style decimals (e.g. 110.921).

In [188]:
round(euclidean_distances(tf_idf[obama_index], tf_idf[biden_index])[0][0],3)

123.297

## Answer 5 
123.297