In [10]:
import graphlab as gl

In [11]:
# Load the people dataset from the saved SFrame at 'people_wiki.gl/' (path is
# relative to the notebook's working directory). The previews further down
# show it carries at least the columns URI, name and text.
people = gl.SFrame('people_wiki.gl/')

## Task 1: Top 3 words by word count and by TF-IDF for Elton John

In [19]:
# Add two derived columns to every row of `people` (mutates the SFrame in place):
#   word_count - dict mapping each word in `text` to its number of occurrences
#   tfidf      - dict mapping each word to its TF-IDF weight, computed from
#                the word_count column, so the assignment order below matters.

people['word_count'] = gl.text_analytics.count_words(people['text'])
people['tfidf'] = gl.text_analytics.tf_idf(people['word_count'])


In [20]:
# Filter `people` down to the row(s) whose name is exactly 'Elton John'.
# Later cells read the first row of this selection via `[0]`.
elton = people[people['name'] == 'Elton John']

In [21]:
# Unpack Elton John's word_count dict into one (word, count) row per word
# and list the most frequent words first.
(
    elton[['word_count']]
    .stack('word_count', new_column_name=['word', 'count'])
    .sort('count', ascending=False)
)

word,count
the,27
in,18
and,15
of,13
a,10
has,9
john,7
he,7
on,6
award,5


In [22]:
# Unpack Elton John's tfidf dict into one (word, tfidf) row per word and
# list the highest-weighted words first.
(
    elton[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False)
)

word,tfidf
furnish,18.38947184
elton,17.48232027
billboard,17.3036809575
john,13.9393127924
songwriters,11.250406447
tonightcandle,10.9864953892
overallelton,10.9864953892
19702000,10.2933482087
fivedecade,10.2933482087
aids,10.262846934


## Task 2: How close are Victoria Beckham and Paul McCartney to Elton John?

In [23]:
# Select the rows for the two people that will be compared with Elton John.
# A generator expression is used so the loop variable does not leak into the
# notebook namespace.
victoria, paul = (
    people[people['name'] == person]
    for person in ('Victoria Beckham', 'Paul McCartney')
)

# Preview the Victoria Beckham selection, including the derived columns.
victoria.head()

URI,name,text,word_count
<http://dbpedia.org/resou rce/Victoria_Beckham> ...,Victoria Beckham,victoria caroline beckham ne adams born 17 april ...,"{'millionin': 1, 'saying': 1, 'cameo': 1, ..."

tfidf
"{'millionin': 7.728398851203712, ..."


In [24]:
# Cosine distance between Elton John's TF-IDF dict and each comparison
# person's TF-IDF dict. A smaller distance means the two articles are more
# alike; in the recorded output Paul McCartney is closer than Victoria Beckham.
#
# The parenthesised single-argument print form gives the same output under the
# Python 2 print statement and the Python 3 print function.
elton_tfidf = elton['tfidf'][0]  # first (matching) row's word -> weight dict
print(gl.distances.cosine(elton_tfidf, victoria['tfidf'][0]))
print(gl.distances.cosine(elton_tfidf, paul['tfidf'][0]))

0.956700637666
0.825031002922


## Task 3 : Building nearest neighbors models with different input features and setting the distance metric

In [25]:
# Nearest-neighbours model over the raw word_count dicts, labelled by person
# name and compared with cosine distance.
knn_model_word_count = gl.nearest_neighbors.create(
    people,
    features=['word_count'],
    label='name',
    distance='cosine'
)

In [26]:
# Nearest-neighbours model over the tfidf dicts, labelled by person name and
# compared with cosine distance.
knn_model_tfidf = gl.nearest_neighbors.create(
    people,
    features=['tfidf'],
    label='name',
    distance='cosine'
)

In [27]:
# Closest people to Elton John by raw word counts. The query row itself comes
# back at rank 1 with a distance of ~0 (floating-point noise), so the real
# neighbours start at rank 2.
knn_model_word_count.query(elton)

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


In [28]:
# Closest people to Elton John by TF-IDF weights. Rank 1 is again the query
# row itself (distance ~0); compare ranks 2+ with the word-count model's result.
knn_model_tfidf.query(elton)

query_label,reference_label,distance,rank
0,Elton John,-2.22044604925e-16,1
0,Rod Stewart,0.717219667893,2
0,George Michael,0.747600998969,3
0,Sting (musician),0.747671954431,4
0,Phil Collins,0.75119324879,5


In [29]:
# Closest people to Victoria Beckham by raw word counts. Rank 1 is the query
# row itself (distance ~0).
knn_model_word_count.query(victoria)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.22044604925e-16,1
0,Mary Fitzgerald (artist),0.207307036115,2
0,Adrienne Corri,0.214509782788,3
0,Beverly Jane Fry,0.217466468741,4
0,Raman Mundair,0.217695474992,5


In [30]:
# Closest people to Victoria Beckham by TF-IDF weights. Rank 1 is the query
# row itself (distance ~0).
knn_model_tfidf.query(victoria)


query_label,reference_label,distance,rank
0,Victoria Beckham,1.11022302463e-16,1
0,David Beckham,0.548169610263,2
0,Stephen Dow Beckham,0.784986706828,3
0,Mel B,0.809585523409,4
0,Caroline Rush,0.819826422919,5
