In [2]:
import graphlab

# Load some text data: Wikipedia pages about people

In [3]:
# Load the 'people_wiki.gl/' dataset into an SFrame. The path is relative, so
# it is resolved against the notebook's current working directory.
people = graphlab.SFrame('people_wiki.gl/')

This non-commercial license of GraphLab Create is assigned to stphnliang@gmail.com and will expire on June 12, 2017. For commercial licensing options, visit https://dato.com/buy/.


[INFO] graphlab.cython.cy_server: GraphLab Create v1.10.1 started. Logging: /tmp/graphlab_server_1467212727.log


In [4]:
people.head()

URI,name,text
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...


In [5]:
len(people)

59071

# Explore the dataset and check out the text it contains

In [6]:
# Boolean-mask selection: keep only the rows whose 'name' is exactly 'Barack Obama'.
obama = people[people['name'] == 'Barack Obama']

In [7]:
obama

URI,name,text
<http://dbpedia.org/resou rce/Barack_Obama> ...,Barack Obama,barack hussein obama ii brk husen bm born august ...


# Get the word counts for the Obama article

In [10]:
# Add a 'word_count' column to the Obama selection only; the full `people`
# frame gets its own 'word_count' column later in the notebook.
obama['word_count'] = graphlab.text_analytics.count_words(obama['text'])

## Sort the word counts for the Obama article

In [12]:
# Unpack the word_count dictionary into one row per word: the key goes into
# the 'word' column and its value into the 'count' column.
obama_word_count_table = obama[['word_count']].stack(
    'word_count',
    new_column_name=['word', 'count'],
)

In [13]:
obama_word_count_table.head()

word,count
normalize,1
sought,1
combat,1
continued,1
unconstitutional,1
8,1
californias,1
1996,1
marriage,1
defense,1


In [14]:
# Show the words ordered by descending count. The sorted result is only
# displayed here; it is not assigned back to a variable.
obama_word_count_table.sort('count', ascending=False)

word,count
the,40
in,30
and,21
of,18
to,14
his,11
obama,9
act,8
a,7
he,7


# Compute TF-IDF for the corpus

In [15]:
# Count the words of every article and store the result as a new 'word_count'
# column on `people` (the frame loaded at the top is modified in place).
people['word_count'] = graphlab.text_analytics.count_words(people['text'])

In [16]:
people.head()

URI,name,text,word_count
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'since': 1, 'carltons': 1, 'being': 1, '2005' ..."
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'precise': 1, 'thomas': 1, 'closely': 1, ..."
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...,"{'just': 1, 'issued': 1, 'mainly': 1, 'nominat ..."
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...,"{'all': 1, 'bauforschung': 1, ..."
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'legendary': 1, 'gangstergenka': 1, ..."
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...,"{'now': 1, 'currently': 1, 'less': 1, 'being' ..."
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...,"{'exclusive': 2, 'producer': 1, 'tribe': ..."
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...,"{'taxi': 1, 'salon': 1, 'gangs': 1, 'being': 1, ..."
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...,"{'houston': 1, 'frankie': 1, 'labels': 1, ..."
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...,"{'phenomenon': 1, 'deborash': 1, ..."


In [17]:
# TF-IDF is computed from the per-article 'word_count' column built above,
# not from the raw text.
tfidf = graphlab.text_analytics.tf_idf(people['word_count'])

In [19]:
people['tfidf'] = tfidf

## Examine the TF-IDF for the Obama article

In [20]:
# Re-select the Obama row: the earlier `obama` was filtered out of `people`
# before the 'tfidf' column was added, so it does not carry that column.
obama = people[people['name'] == 'Barack Obama']

In [21]:
# One row per word of the Obama article, highest TF-IDF weight first.
(obama[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False))

word,tfidf
obama,43.2956530721
act,27.678222623
iraq,17.747378588
control,14.8870608452
law,14.7229357618
ordered,14.5333739509
military,13.1159327785
involvement,12.7843852412
response,12.7843852412
democratic,12.4106886973


### Manually compute distances between a few people

In [22]:
clinton = people[people['name'] == 'Bill Clinton']

In [23]:
beckham = people[people['name'] == 'David Beckham']

# Is Obama closer to Clinton than to Beckham?

In [24]:
# Cosine distance between the TF-IDF values of the Obama and Bill Clinton
# articles; [0] takes the first row of each selection. Smaller means closer.
graphlab.distances.cosine(obama['tfidf'][0], clinton['tfidf'][0])

0.8339854936884276

In [25]:
# Cosine distance between the TF-IDF values of the Obama and David Beckham
# articles; [0] takes the first row of each selection. Smaller means closer.
graphlab.distances.cosine(obama['tfidf'][0], beckham['tfidf'][0])

0.9791305844747478

# Build a nearest neighbor model for document retrieval

In [26]:
# Build a nearest-neighbour model over the 'tfidf' column, labelling rows by 'name'.
# NOTE(review): no `distance` is passed, so the library's default metric is used.
# The recorded query output for this model (Bill Clinton at 0.8139) differs from
# the manually computed cosine distance (0.8340), so that default is evidently
# not cosine. The knn_model_wc / knn_model_tfidf models built later in this
# notebook pass distance='cosine' explicitly -- confirm which metric is intended.
knn_model = graphlab.nearest_neighbors.create(people, features=['tfidf'],
                                             label='name')

# Applying the nearest-neighbors model for retrieval

## Who is closest to Obama?

In [27]:
# Rank 1 in the result is the Obama article itself (distance 0.0), so the
# closest *other* article is the rank-2 row.
knn_model.query(obama)

query_label,reference_label,distance,rank
0,Barack Obama,0.0,1
0,Joe Biden,0.794117647059,2
0,Joe Lieberman,0.794685990338,3
0,Kelly Ayotte,0.811989100817,4
0,Bill Clinton,0.813852813853,5


# Other examples of document retrieval

In [28]:
swift = people[people['name'] == 'Taylor Swift']

In [29]:
knn_model.query(swift)

query_label,reference_label,distance,rank
0,Taylor Swift,0.0,1
0,Carrie Underwood,0.76231884058,2
0,Alicia Keys,0.764705882353,3
0,Jordin Sparks,0.769633507853,4
0,Leona Lewis,0.776119402985,5


In [30]:
jolie = people[people['name'] == 'Angelina Jolie']

In [31]:
knn_model.query(jolie)

query_label,reference_label,distance,rank
0,Angelina Jolie,0.0,1
0,Brad Pitt,0.784023668639,2
0,Julianne Moore,0.795857988166,3
0,Billy Bob Thornton,0.803069053708,4
0,George Clooney,0.8046875,5


In [32]:
arnold = people[people['name'] == 'Arnold Schwarzenegger']

In [33]:
knn_model.query(arnold)

query_label,reference_label,distance,rank
0,Arnold Schwarzenegger,0.0,1
0,Jesse Ventura,0.818918918919,2
0,John Kitzhaber,0.824615384615,3
0,Lincoln Chafee,0.833876221498,4
0,Anthony Foxx,0.833910034602,5


# Elton John

In [34]:
elton = people[people['name'] == 'Elton John']

In [38]:
# One row per word of the Elton John article, highest TF-IDF weight first.
(elton[['tfidf']]
    .stack('tfidf', new_column_name=['word', 'tfidf'])
    .sort('tfidf', ascending=False))

word,tfidf
furnish,18.38947184
elton,17.48232027
billboard,17.3036809575
john,13.9393127924
songwriters,11.250406447
overallelton,10.9864953892
tonightcandle,10.9864953892
19702000,10.2933482087
fivedecade,10.2933482087
aids,10.262846934


In [39]:
# One row per word of the Elton John article, most frequent word first.
(elton[['word_count']]
    .stack('word_count', new_column_name=['word', 'count'])
    .sort('count', ascending=False))

word,count
the,27
in,18
and,15
of,13
a,10
has,9
he,7
john,7
on,6
since,5


In [40]:
vbeckham = people[people['name'] == 'Victoria Beckham']

In [41]:
pmccartney = people[people['name'] == 'Paul McCartney']

In [42]:
graphlab.distances.cosine(elton['tfidf'][0], vbeckham['tfidf'][0])

0.9567006376655429

In [43]:
graphlab.distances.cosine(elton['tfidf'][0], pmccartney['tfidf'][0])

0.8250310029221779

In [45]:
# Display `people` to confirm it now carries the 'word_count' and 'tfidf' columns.
people

URI,name,text,word_count
<http://dbpedia.org/resou rce/Digby_Morrell> ...,Digby Morrell,digby morrell born 10 october 1979 is a former ...,"{'since': 1, 'carltons': 1, 'being': 1, '2005' ..."
<http://dbpedia.org/resou rce/Alfred_J._Lewy> ...,Alfred J. Lewy,alfred j lewy aka sandy lewy graduated from ...,"{'precise': 1, 'thomas': 1, 'closely': 1, ..."
<http://dbpedia.org/resou rce/Harpdog_Brown> ...,Harpdog Brown,harpdog brown is a singer and harmonica player who ...,"{'just': 1, 'issued': 1, 'mainly': 1, 'nominat ..."
<http://dbpedia.org/resou rce/Franz_Rottensteiner> ...,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lower ...,"{'all': 1, 'bauforschung': 1, ..."
<http://dbpedia.org/resou rce/G-Enka> ...,G-Enka,henry krvits born 30 december 1974 in tallinn ...,"{'legendary': 1, 'gangstergenka': 1, ..."
<http://dbpedia.org/resou rce/Sam_Henderson> ...,Sam Henderson,sam henderson born october 18 1969 is an ...,"{'now': 1, 'currently': 1, 'less': 1, 'being' ..."
<http://dbpedia.org/resou rce/Aaron_LaCrate> ...,Aaron LaCrate,aaron lacrate is an american music producer ...,"{'exclusive': 2, 'producer': 1, 'tribe': ..."
<http://dbpedia.org/resou rce/Trevor_Ferguson> ...,Trevor Ferguson,trevor ferguson aka john farrow born 11 november ...,"{'taxi': 1, 'salon': 1, 'gangs': 1, 'being': 1, ..."
<http://dbpedia.org/resou rce/Grant_Nelson> ...,Grant Nelson,grant nelson born 27 april 1971 in london ...,"{'houston': 1, 'frankie': 1, 'labels': 1, ..."
<http://dbpedia.org/resou rce/Cathy_Caruth> ...,Cathy Caruth,cathy caruth born 1955 is frank h t rhodes ...,"{'phenomenon': 1, 'deborash': 1, ..."

tfidf
"{'since': 1.455376717308041, ..."
"{'precise': 6.44320060695519, ..."
"{'just': 2.7007299687108643, ..."
"{'all': 1.6431112434912472, ..."
"{'legendary': 4.280856294365192, ..."
"{'now': 1.96695239252401, 'currently': ..."
"{'exclusive': 10.455187230695827, ..."
"{'taxi': 6.0520214560945025, ..."
"{'houston': 3.935505942157149, ..."
"{'phenomenon': 5.750053426395245, ..."


In [49]:
# Nearest-neighbour model over the raw word counts, using cosine distance.
knn_model_wc = graphlab.nearest_neighbors.create(
    people,
    features=['word_count'],
    distance='cosine',
    label='name',
)

In [50]:
# Nearest-neighbour model over the TF-IDF weights, using cosine distance.
knn_model_tfidf = graphlab.nearest_neighbors.create(
    people,
    features=['tfidf'],
    distance='cosine',
    label='name',
)

## What’s the most similar article, other than itself, to the one on ‘Elton John’ using word count features?

In [51]:
# The rank-1 row is Elton John's own article (its distance is 0 up to
# floating-point round-off), so the answer to the question is the rank-2 row.
knn_model_wc.query(elton)

query_label,reference_label,distance,rank
0,Elton John,2.22044604925e-16,1
0,Cliff Richard,0.16142415259,2
0,Sandro Petrone,0.16822542751,3
0,Rod Stewart,0.168327165587,4
0,Malachi O'Doherty,0.177315545979,5


## What’s the most similar article, other than itself, to the one on ‘Elton John’ using TF-IDF features?

In [52]:
# A tiny negative distance such as -2.2e-16 is floating-point round-off, i.e. 0:
# rank 1 is the query article itself, and the answer is the rank-2 row.
knn_model_tfidf.query(elton)

query_label,reference_label,distance,rank
0,Elton John,-2.22044604925e-16,1
0,Rod Stewart,0.717219667893,2
0,George Michael,0.747600998969,3
0,Sting (musician),0.747671954431,4
0,Phil Collins,0.75119324879,5


## What’s the most similar article, other than itself, to the one on ‘Victoria Beckham’ using word count features?

In [53]:
knn_model_wc.query(vbeckham)

query_label,reference_label,distance,rank
0,Victoria Beckham,-2.22044604925e-16,1
0,Mary Fitzgerald (artist),0.207307036115,2
0,Adrienne Corri,0.214509782788,3
0,Beverly Jane Fry,0.217466468741,4
0,Raman Mundair,0.217695474992,5


## What’s the most similar article, other than itself, to the one on ‘Victoria Beckham’ using TF-IDF features?

In [54]:
knn_model_tfidf.query(vbeckham)

query_label,reference_label,distance,rank
0,Victoria Beckham,1.11022302463e-16,1
0,David Beckham,0.548169610263,2
0,Stephen Dow Beckham,0.784986706828,3
0,Mel B,0.809585523409,4
0,Caroline Rush,0.819826422919,5
