# Use scikit-learn to build a sparse word-count matrix

In [2]:
import pandas as pd
import numpy as np

In [3]:
# Load the Wikipedia people articles from CSV into a pandas DataFrame.
wiki = pd.read_csv("people_wiki.csv")

In [4]:
# Preview the first two rows to check which columns were loaded.
wiki.head(2)

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Digby_Morrell>,Digby Morrell,digby morrell born 10 october 1979 is a former...
1,<http://dbpedia.org/resource/Alfred_J._Lewy>,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from un...


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

# Tokenise on word boundaries. This pattern also keeps single-character
# tokens, which CountVectorizer's default pattern (two or more word
# characters) would drop.
vectorizer = CountVectorizer(
    token_pattern=r'\b\w+\b',
)

# Learn the vocabulary from every article's text and build the
# document-term count matrix in one pass.
word_count_matrix = vectorizer.fit_transform(
    wiki['text']
)

In [6]:
# (rows, columns) of the count matrix: one row per article text passed to
# fit_transform, one column per vocabulary token.
word_count_matrix.shape

(59071, 548465)

In [7]:
# Index label(s) of the row(s) whose name is exactly 'Elton John'.
wiki.index[wiki['name'] == 'Elton John'].values

array([19923])

In [9]:
# Look up Elton John's row position by name instead of hard-coding 19923:
# a literal copied from an earlier output silently points at the wrong
# article if the dataset is re-ordered or refreshed.
# flatnonzero gives positional offsets, which is what the sparse matrix
# needs regardless of the DataFrame's index labels. Indexing with [0]
# raises IndexError if no article has that name.
elton_john_row = np.flatnonzero((wiki['name'] == 'Elton John').values)[0]

# Sparse 1 x vocabulary row of word counts for that article.
word_count_matrix[elton_john_row]

<1x548465 sparse matrix of type '<class 'numpy.int64'>'
	with 255 stored elements in Compressed Sparse Row format>

In [10]:
# Show the container type of the count matrix and the shape of a single
# row slice, one per line.
print(type(word_count_matrix), word_count_matrix[0].shape, sep='\n')

<class 'scipy.sparse.csr.csr_matrix'>
(1, 548465)


# Use Turi Create to build word-count vectors

Two representations are computed for each article:

1. Raw word counts
2. TF-IDF weights

In [5]:
import turicreate

In [6]:
# Load the Wikipedia people articles from the on-disk SFrame.
people = turicreate.SFrame('people_wiki.sframe')

In [7]:
# Display the SFrame to inspect its columns (URI, name, text).
people

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...


## Apply raw word count

In [8]:
# Add a 'word_count' column: for each article, a dict mapping every word in
# its text to the number of times it occurs.
people['word_count'] = turicreate.text_analytics.count_words(people['text'])

In [9]:
# Select the row(s) whose name is exactly 'Elton John' ...
Elton_John = people[people['name'] == 'Elton John']
# ... and show the word-count dict for that selection.
Elton_John['word_count']

dtype: dict
Rows: ?
[{'social': 1.0, 'champion': 1.0, 'be': 1.0, '2014': 1.0, 'legal': 1.0, 'became': 1.0, '2005': 1.0, 'december': 2.0, '21': 2.0, 'furnish': 2.0, 'david': 1.0, 'civil': 1.0, 'gay': 2.0, 'who': 1.0, '200': 1.0, 'raised': 1.0, 'industry': 1.0, 'film': 1.0, 'parties': 1.0, 'oscar': 1.0, 'partnership': 1.0, 'highestprofile': 1.0, 'which': 1.0, 'hosting': 1.0, 'later': 1.0, 'foundation': 2.0, 'established': 1.0, '1988': 1.0, '1992': 1.0, '1980s': 1.0, 'against': 1.0, 'fight': 1.0, 'heavily': 1.0, 'marriage': 2.0, '2012he': 1.0, 'buckingham': 1.0, 'outside': 1.0, 'and': 15.0, 'queens': 1.0, 'at': 4.0, '10': 1.0, '2002': 1.0, 'palace': 2.0, 'abbey': 1.0, 'hall': 2.0, 'royal': 1.0, 'services': 2.0, '1998': 1.0, 'charitable': 1.0, 'ii': 1.0, 'elizabeth': 1.0, 'empire': 1.0, 'year': 1.0, 'commander': 1.0, 'westminster': 1.0, '1996': 1.0, 'single': 2.0, 'named': 1.0, 'been': 3.0, 'songwriters': 2.0, '100': 3.0, '1994': 1.0, 'into': 3.0, 'overallelton': 1.0, 'wed': 1.0, 'male': 1

In [10]:
# Unpack Elton John's word-count dict into one (word, count) row per word,
# order by count from highest to lowest, and keep the first three rows.
(
    Elton_John[['word_count']]
    .stack('word_count', new_column_name=['word', 'count'])
    .sort('count', ascending=False)
    [:3]
)

word,count
the,27.0
in,18.0
and,15.0


## Apply TF-IDF

In [11]:
# Add a 'tfidf' column: for each article, a dict mapping every word in its
# text to its TF-IDF weight.
people['tfidf'] = turicreate.text_analytics.tf_idf(people['text'])

In [12]:
# Display the SFrame again to confirm the 'word_count' and 'tfidf' columns
# are now present.
people

URI,name,text,word_count
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'melbourne': 1.0, 'college': 1.0, 'para ..."
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'time': 1.0, 'each': 1.0, 'rhythms': 1.0, ..."
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...,"{'time': 1.0, 'honored': 1.0, 'maple': 1.0, ..."
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...,"{'kurdlawitzpreis': 1.0, 'this': 1.0, 'occasion': ..."
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'curtis': 1.0, 'promo': 1.0, '2007': 1.0, 'ce ..."
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...,"{'journal': 1.0, 'niblit': 1.0, ..."
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...,"{'including': 1.0, 'artists': 1.0, 'local': ..."
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...,"{'concordia': 1.0, 'creative': 1.0, ..."
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...,"{'knuckles': 1.0, 'simply': 1.0, 'brand': ..."
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...,"{'n3': 1.0, '2002': 1.0, 'harvard': 1.0, 'tria ..."

tfidf
"{'melbourne': 3.8914310119380633, ..."
"{'time': 1.3253342074200498, ..."
"{'time': 1.3253342074200498, ..."
"{'kurdlawitzpreis': 10.986495389225194, ..."
"{'curtis': 5.299520032885375, ..."
"{'journal': 3.025473923341824, ..."
"{'including': 1.2272824458461182, ..."
"{'concordia': 6.250296940830698, ..."
"{'knuckles': 8.042056410058754, ..."
"{'n3': 10.293348208665249, ..."


In [13]:
# Re-select Elton John's row now that `people` has the 'tfidf' column. The
# earlier selection was made before that column was added.
# NOTE(review): presumably the earlier selection does not carry 'tfidf' - confirm.
Elton_John = people[people['name'] == 'Elton John']

# Unpack the TF-IDF dict into one (word, tfidf) row per word, order by
# weight from highest to lowest, and keep the first three rows.
(
    Elton_John[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
    [:3]
)

word,tfidf
furnish,18.38947183999428
elton,17.482320270031995
billboard,17.30368095754203


## Measure the distance between articles with cosine distance

In [14]:
# Select the rows for the two people Elton John will be compared against.
# Both selections are taken after the 'word_count' and 'tfidf' columns were
# added to `people`.
Victoria_Beckham = people[people['name'] == 'Victoria Beckham']
Paul_McCartney = people[people['name'] == 'Paul McCartney']

In [15]:
# [0] takes the first selected row's value, i.e. the plain word -> count
# dict for Victoria Beckham's article.
Victoria_Beckham['word_count'][0]

{'new': 1.0,
 'biannual': 1.0,
 'ticket': 1.0,
 'clamour': 1.0,
 'scathing': 1.0,
 'celebrity': 1.0,
 'significant': 1.0,
 'saying': 1.0,
 'most': 1.0,
 'show': 1.0,
 'successful': 1.0,
 'wag': 1.0,
 'that': 1.0,
 'belinda': 1.0,
 'won': 1.0,
 'business': 1.0,
 'familys': 1.0,
 'performer': 1.0,
 'daily': 1.0,
 'star': 1.0,
 'assessed': 1.0,
 '2012': 1.0,
 'year': 1.0,
 'brand': 2.0,
 'named': 1.0,
 '2011': 3.0,
 'diffusion': 1.0,
 'label': 3.0,
 'eponymous': 1.0,
 'other': 1.0,
 'collaborations': 1.0,
 'highprofile': 1.0,
 'following': 2.0,
 'interests': 1.0,
 'icon': 1.0,
 'style': 1.0,
 'become': 1.0,
 'noted': 1.0,
 'decade': 1.0,
 'past': 1.0,
 'millionin': 1.0,
 'estimated': 1.0,
 'writing': 1.0,
 'wealth': 1.0,
 'joint': 1.0,
 'children': 1.0,
 'they': 1.0,
 'launched': 1.0,
 'recognised': 1.0,
 'american': 1.0,
 'transition': 1.0,
 'participated': 1.0,
 'mind': 1.0,
 'betty': 1.0,
 'topmodel': 1.0,
 'predicted': 1.0,
 'guest': 1.0,
 'magazine': 1.0,
 'germanys': 1.0,
 'judge': 

In [16]:
# Cosine distance between the TF-IDF vectors of Elton John and Victoria
# Beckham. Indexing each column with [0] pulls the plain dict out of the
# selection, so two dicts are passed to the distance function.
turicreate.distances.cosine(Elton_John['tfidf'][0], Victoria_Beckham['tfidf'][0])

0.9567006376655429

In [17]:
# Cosine distance is 1 - cosine similarity, so a larger value means the two
# articles are less alike and a smaller value means they are more alike.
# In the recorded run this distance (~0.825) is smaller than the one to
# Victoria Beckham (~0.957), i.e. Elton John's article is closer to
# Paul McCartney's.
turicreate.distances.cosine(Elton_John['tfidf'][0], Paul_McCartney['tfidf'][0])

0.8250310029221779

## Apply nearest neighbors for retrieval of Wikipedia articles

In [18]:
# Nearest-neighbours model over the raw word-count dicts, compared with
# cosine distance; results are labelled with each article's name.
knn_model_wordcount = turicreate.nearest_neighbors.create(
    people,
    features=['word_count'],
    label='name',
    distance='cosine',
)

In [19]:
# Nearest-neighbours model over the TF-IDF dicts, compared with cosine
# distance; results are labelled with each article's name.
knn_model_tfidf = turicreate.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine',
)

In [20]:
# Closest articles to Elton John by raw word counts. Rank 1 is the article
# itself, with a distance of zero up to floating-point error.
knn_model_wordcount.query(Elton_John)

query_label,reference_label,distance,rank
0,Elton John,2.220446049250313e-16,1
0,Cliff Richard,0.1614241525896703,2
0,Sandro Petrone,0.1682254275104111,3
0,Rod Stewart,0.168327165587061,4
0,Malachi O'Doherty,0.177315545978884,5


In [21]:
# Closest articles to Elton John by TF-IDF. Rank 1 is the article itself,
# with a distance of zero up to floating-point error (hence the tiny
# negative value).
knn_model_tfidf.query(Elton_John)

query_label,reference_label,distance,rank
0,Elton John,-2.220446049250313e-16,1
0,Rod Stewart,0.7172196678927374,2
0,George Michael,0.7476009989692847,3
0,Sting (musician),0.7476719544306141,4
0,Phil Collins,0.7511932487904706,5


In [22]:
# Closest articles to Victoria Beckham by raw word counts. Rank 1 is the
# article itself, with a distance of zero up to floating-point error.
knn_model_wordcount.query(Victoria_Beckham)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.220446049250313e-16,1
0,Mary Fitzgerald (artist),0.2073070361150499,2
0,Adrienne Corri,0.2145097827875479,3
0,Beverly Jane Fry,0.2174664687407927,4
0,Raman Mundair,0.2176954749915048,5


In [23]:
# Closest articles to Victoria Beckham by TF-IDF. Rank 1 is the article
# itself, with a distance of zero up to floating-point error.
knn_model_tfidf.query(Victoria_Beckham)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.1102230246251563e-16,1
0,David Beckham,0.5481696102632145,2
0,Stephen Dow Beckham,0.7849867068283364,3
0,Mel B,0.8095855234085036,4
0,Caroline Rush,0.81982642291868,5
