# Document retrieval from Wikipedia data

## Fire up Libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from collections import Counter
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from operator import itemgetter

from sklearn.metrics.pairwise import cosine_similarity

from sklearn.neighbors import NearestNeighbors

from sklearn.linear_model import LogisticRegression

# Load some text data: Wikipedia pages about people

In [2]:
# Load the articles table, using the 'name' column as the row index.
# The path is relative, so 'people_wiki.csv' must sit in the working directory.
people = pd.read_csv('people_wiki.csv', index_col='name')

The data contains, for each person: a link to the Wikipedia article (URI), the person's name (used as the index), and the text of the article.

In [3]:
# Preview the first few rows of the table.
people.head()

Unnamed: 0_level_0,URI,text
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Digby Morrell,<http://dbpedia.org/resource/Digby_Morrell>,digby morrell born 10 october 1979 is a former...
Alfred J. Lewy,<http://dbpedia.org/resource/Alfred_J._Lewy>,alfred j lewy aka sandy lewy graduated from un...
Harpdog Brown,<http://dbpedia.org/resource/Harpdog_Brown>,harpdog brown is a singer and harmonica player...
Franz Rottensteiner,<http://dbpedia.org/resource/Franz_Rottensteiner>,franz rottensteiner born in waidmannsfeld lowe...
G-Enka,<http://dbpedia.org/resource/G-Enka>,henry krvits born 30 december 1974 in tallinn ...


In [4]:
# (number of rows, number of columns) of the loaded table.
people.shape

(59071, 2)

# Explore the dataset and check out the text it contains

## Exploring the entry for president Obama

In [5]:
# Select the row whose index label is 'Barack Obama'. The .copy() gives an
# independent frame, so columns added to `obama` later do not touch `people`.
obama = people[people.index == 'Barack Obama'].copy()

In [6]:
# Display the selected row.
obama

Unnamed: 0_level_0,URI,text
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Barack Obama,<http://dbpedia.org/resource/Barack_Obama>,barack hussein obama ii brk husen bm born augu...


In [12]:
# Print the full article text of the selected row.
print(obama['text'].iloc[0])

barack hussein obama ii brk husen bm born august 4 1961 is the 44th and current president of the united states and the first african american to hold the office born in honolulu hawaii obama is a graduate of columbia university and harvard law school where he served as president of the harvard law review he was a community organizer in chicago before earning his law degree he worked as a civil rights attorney and taught constitutional law at the university of chicago law school from 1992 to 2004 he served three terms representing the 13th district in the illinois senate from 1997 to 2004 running unsuccessfully for the united states house of representatives in 2000in 2004 obama received national attention during his campaign to represent illinois in the united states senate with his victory in the march democratic party primary his keynote address at the democratic national convention in july and his election to the senate in november he began his presidential campaign in 2007 and after

## Exploring the entry for actor George Clooney

In [13]:
# Look up George Clooney's row and print the full article text.
is_clooney = people.index == 'George Clooney'
clooney = people[is_clooney]
print(clooney['text'].iloc[0])

george timothy clooney born may 6 1961 is an american actor writer producer director and activist he has received three golden globe awards for his work as an actor and two academy awards one for acting and the other for producingclooney made his acting debut on television in 1978 and later gained wide recognition in his role as dr doug ross on the longrunning medical drama er from 1994 to 1999 for which he received two emmy award nominations while working on er he began attracting a variety of leading roles in films including the superhero film batman robin 1997 and the crime comedy out of sight 1998 in which he first worked with a director who would become a longtime collaborator steven soderbergh in 1999 clooney took the lead role in three kings a wellreceived war satire set during the gulf warin 2001 clooneys fame widened with the release of his biggest commercial success the heist comedy oceans eleven the first of the film trilogy a remake of the 1960 film with frank sinatra as da

# Get the word counts for Obama article

In [14]:
# Count how often each word occurs in the Obama article.
count_vectorizer    = CountVectorizer()
# NOTE(review): this stores the matrix returned by fit_transform directly in a
# DataFrame column; whether pandas accepts that may depend on its version — confirm.
obama['word_count'] = count_vectorizer.fit_transform(obama['text'].values)

## Showing the features (i.e. words)

In [15]:
# List the vocabulary learned from the Obama article.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; newer versions need get_feature_names_out() — confirm the
# scikit-learn version this notebook targets.
print (count_vectorizer.get_feature_names())

['13th', '1961', '1992', '1996', '1997', '20', '2000in', '2004', '2007', '2008', '2009', '2010', '2011', '2012', '2012obama', '2013', '44th', '63', 'act', 'address', 'administration', 'affordable', 'afghanistan', 'african', 'after', 'against', 'american', 'americans', 'and', 'arms', 'as', 'ask', 'at', 'attention', 'attorney', 'august', 'barack', 'before', 'began', 'bin', 'bm', 'born', 'briefs', 'brk', 'budget', 'by', 'californias', 'called', 'campaign', 'care', 'chicago', 'civil', 'clinton', 'close', 'columbia', 'combat', 'community', 'constitutional', 'consumer', 'continued', 'control', 'convention', 'court', 'creation', 'cuba', 'current', 'death', 'debate', 'debt', 'defeated', 'defeating', 'defense', 'degree', 'delegates', 'democratic', 'district', 'doddfrank', 'domestic', 'dont', 'down', 'during', 'earning', 'economic', 'election', 'elementary', 'ended', 'ending', 'equality', 'federal', 'filed', 'first', 'for', 'foreign', 'form', 'from', 'full', 'gains', 'general', 'graduate', 'grea

## Sort the word counts for the Obama article

In [16]:
# Pair every vocabulary word with its count in the Obama article.
vocab  = list(count_vectorizer.get_feature_names())
# Recover the per-word counts from the 'word_count' column as a flat array.
counts = obama['word_count'].values.sum(axis=0).toarray()[0]

# A Counter provides most_common() for ranking the words by frequency.
freq_distribution = Counter(dict(zip(vocab, counts)))
print (freq_distribution.most_common(10))

[('the', 40), ('in', 30), ('and', 21), ('of', 18), ('to', 14), ('his', 11), ('obama', 9), ('act', 8), ('he', 7), ('as', 6)]


Most common words include uninformative words like "the", "in", "and",...

# Compute TF-IDF for the corpus 

To give more weight to informative words, we weight each word by its TF-IDF score.

In [17]:
# Fit TF-IDF on every article in the corpus.
tfidf_vectorizer = TfidfVectorizer()
# list(...) splits the result into one entry per article so it can be stored
# as a 'tfidf' column alongside the text.
people['tfidf']  = list(tfidf_vectorizer.fit_transform(people['text']))

In [18]:
# Shape after adding the 'tfidf' column.
people.shape

(59071, 3)

In [19]:
# Number of distinct terms in the fitted TF-IDF vocabulary.
len(tfidf_vectorizer.vocabulary_)

548429

## Examine the TF-IDF for the Obama article

In [20]:
# Re-select the Obama row so that the copy includes the new 'tfidf' column.
obama = people[people.index == 'Barack Obama'].copy()

In [21]:
# Inspect the TF-IDF entry stored for the Obama article.
obama['tfidf'].values[0]

<1x548429 sparse matrix of type '<class 'numpy.float64'>'
	with 270 stored elements in Compressed Sparse Row format>

In [23]:
# Rank the terms of the Obama article by TF-IDF weight and show the top ten.
vocab    = tfidf_vectorizer.get_feature_names()
response = obama['tfidf'].values[0]

# `response` is a single-row sparse matrix; nonzero()[1] lists the columns
# (i.e. vocabulary positions) that actually occur in the article.
l = sorted(
    ((vocab[col], response[0, col]) for col in response.nonzero()[1]),
    key=itemgetter(1),
    reverse=True,
)
print(l[:10])

[('obama', 0.3650175898187781), ('the', 0.2793227400023615), ('act', 0.2490890416206761), ('in', 0.20967299876631698), ('iraq', 0.15180855532927304), ('and', 0.14673880270062417), ('law', 0.14468744228550123), ('control', 0.1318571790693225), ('of', 0.1262048162788276), ('us', 0.12283397315748205)]


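Words with the highest TF-IDF scores are much more informative, although a few very common words ("the", "in", "and", "of") still rank highly. Removing stop words, as done next, takes care of those.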

# Eliminating stopwords

In [24]:
# Word counts for the Obama article with English stop words removed.
# Requires `obama` from the cells above.
count_vectorizer    = CountVectorizer(stop_words='english')
obama['word_count'] = count_vectorizer.fit_transform(obama['text'].values)

# Keep the count vocabulary under its own name. `counts` is aligned with THIS
# vectorizer's columns; zipping it with the corpus-wide TF-IDF vocabulary
# (as the cell previously did after reassigning `vocab`) pairs counts with
# the wrong words.
count_vocab = list(count_vectorizer.get_feature_names())
counts      = obama['word_count'].values.sum(axis=0).toarray()[0]

freq_distribution = Counter(dict(zip(count_vocab, counts)))
print (freq_distribution.most_common(10))

# The stop-word-filtered TF-IDF ranking is computed in the following cell, so
# the (expensive) corpus-wide refit is not repeated here.

[('obama', 9), ('act', 8), ('law', 6), ('control', 4), ('democratic', 4), ('iraq', 4), ('military', 4), ('president', 4), ('2004', 3), ('2009', 3)]


In [25]:
# Refit TF-IDF over the whole corpus, this time dropping English stop words,
# and refresh the per-article 'tfidf' column.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
people['tfidf'] = list(tfidf_vectorizer.fit_transform(people['text']))

vocab = tfidf_vectorizer.get_feature_names()

# Re-select Obama so the copy carries the refreshed 'tfidf' entry, then rank
# the article's terms by weight and show the top ten.
obama    = people[people.index == 'Barack Obama'].copy()
response = obama['tfidf'].values[0]
l = sorted(
    ((vocab[col], response[0, col]) for col in response.nonzero()[1]),
    key=itemgetter(1),
    reverse=True,
)
print(l[:10])

[('obama', 0.4134945526753888), ('act', 0.28216985897150365), ('iraq', 0.1719698240003707), ('law', 0.1639029758957432), ('control', 0.14936876138866909), ('ordered', 0.13863257254400477), ('military', 0.1353676447383487), ('democratic', 0.12979169658143577), ('response', 0.12482116078603717), ('involvement', 0.12482116078603717)]


# Manually compute distances between a few people

Let's manually compare the distances between the articles for a few famous people.  

In [26]:
# Row for Bill Clinton (copied so it is independent of `people`).
clinton = people[people.index == 'Bill Clinton'].copy()

In [27]:
# Row for David Beckham (copied so it is independent of `people`).
beckham = people[people.index == 'David Beckham'].copy()

## Is Obama closer to Clinton than to Beckham?

We will find that the article about President Obama is closer to the one about former President Clinton than to that of footballer David Beckham. The cells below compute cosine similarity, so a larger value means the two articles are closer.

In [28]:
# Cosine similarity between the Obama and Clinton TF-IDF entries.
cosine_similarity(obama['tfidf'].values[0], clinton['tfidf'].values[0])

array([[0.18896718]])

In [29]:
# Cosine similarity between the Obama and Beckham TF-IDF entries.
cosine_similarity(obama['tfidf'].values[0], beckham['tfidf'].values[0])

array([[0.02556581]])

# Build a nearest neighbor model for document retrieval

We now create a nearest-neighbors model and apply it to document retrieval.  

In [30]:
# Vectorise the corpus again to obtain the TF-IDF matrix itself (one row per
# article, in the same order as `people`), then build a 10-nearest-neighbour
# index over it.
tfidf_matrix = tfidf_vectorizer.fit_transform(people['text'])
nbrs = NearestNeighbors(n_neighbors=10)
nbrs.fit(tfidf_matrix);

In [31]:
def get_closest_neighbors(name):
    """Return the nearest TF-IDF neighbours of the article titled `name`.

    Relies on the notebook-level objects `people` (articles indexed by name),
    `tfidf_matrix` (one row per article, same order as `people`) and `nbrs`
    (the fitted nearest-neighbour model).

    Parameters
    ----------
    name : str
        Index label of the query article in `people`.
        NOTE(review): assumes the label occurs exactly once in the index, so
        that get_loc yields a single row position — confirm.

    Returns
    -------
    pandas.DataFrame
        One row per neighbour, in the order returned by `nbrs.kneighbors`,
        with columns 'distance' and 'name'.
    """
    row                = people.index.get_loc(name)
    distances, indices = nbrs.kneighbors(tfidf_matrix.getrow(row))
    # Neighbour row positions index straight into the name index; this avoids
    # copying the whole frame with reset_index() on every call.
    names_similar      = people.index.values[indices.flatten()]
    result             = pd.DataFrame({'distance':distances.flatten(), 'name':names_similar})
    return result

# Applying the nearest-neighbors model for retrieval

## Who is closest to Obama?

In [32]:
# Nearest articles to Barack Obama; the query article itself comes back first.
get_closest_neighbors('Barack Obama')

Unnamed: 0,distance,name
0,0.0,Barack Obama
1,1.165145,Joe Biden
2,1.207369,Samantha Power
3,1.21964,Hillary Rodham Clinton
4,1.222509,Eric Stern (politician)
5,1.236178,Robert Gibbs
6,1.243057,Henry Waxman
7,1.244667,Jesse Lee (politician)
8,1.248296,Eric Holder
9,1.251607,Joe the Plumber


As we can see, president Obama's article is closest to the one about his vice-president Biden, and those of other politicians.  

## Other examples of document retrieval

In [33]:
# Nearest articles to Taylor Swift.
get_closest_neighbors('Taylor Swift')

Unnamed: 0,distance,name
0,0.0,Taylor Swift
1,1.183004,Carrie Underwood
2,1.187754,Al Swift
3,1.193938,Ed Sheeran
4,1.197285,Tim McGraw
5,1.199189,Kelly Clarkson
6,1.19979,Adele
7,1.204965,Bill Swift
8,1.207081,Dolly Parton
9,1.208139,Joss Stone


In [34]:
# Nearest articles to Angelina Jolie.
get_closest_neighbors('Angelina Jolie')

Unnamed: 0,distance,name
0,0.0,Angelina Jolie
1,1.173973,Brad Pitt
2,1.241878,Keith Jolie
3,1.25319,Jodie Foster
4,1.254573,Maggie Smith
5,1.259312,Jessica Chastain
6,1.26016,Anne Hathaway
7,1.262106,Nicole Kidman
8,1.262143,Barry Voight
9,1.263898,Billy Bob Thornton


In [35]:
# Nearest articles to Arnold Schwarzenegger.
get_closest_neighbors('Arnold Schwarzenegger')

Unnamed: 0,distance,name
0,0.0,Arnold Schwarzenegger
1,1.259683,Bonnie Garcia
2,1.263233,Paul Grant (bodybuilder)
3,1.283846,Gray Davis
4,1.284463,James Tramel
5,1.2851,Abel Maldonado
6,1.29324,Bruce McPherson
7,1.294107,Charlene Zettel
8,1.301621,Russell Gould
9,1.301828,David Israel


# Quiz questions

### Answer 1

In [1]:
# NOTE: this cell depends on `people` and `tfidf_vectorizer` defined by the
# cells above; run on a fresh kernel it fails with NameError.

# --- Raw word counts for the Elton John article (stop words included) ---
elton = people[people.index == 'Elton John'].copy()
count_vectorizer    = CountVectorizer()
elton['word_count'] = count_vectorizer.fit_transform(elton['text'].values)

vocab  = list(count_vectorizer.get_feature_names())
counts = elton['word_count'].values.sum(axis=0).toarray()[0]

freq_distribution = Counter(dict(zip(vocab, counts)))
print (freq_distribution.most_common(3))
print ()

# --- TF-IDF ---
# NOTE(review): this reuses whichever `tfidf_vectorizer` / people['tfidf'] is
# current. If the stop-word-filtered fit from the earlier cells ran last, this
# ranking already excludes stop words — confirm that is intended.
elton = people[people.index == 'Elton John'].copy()

vocab = tfidf_vectorizer.get_feature_names()

response = elton['tfidf'].values[0]
l = [(vocab[col], response[0, col]) for col in response.nonzero()[1]]
l = sorted(l, key=itemgetter(1),reverse=True)
print (l[:3])
print ()

# --- Removing stop words ---
count_vectorizer    = CountVectorizer(stop_words='english')
elton['word_count'] = count_vectorizer.fit_transform(elton['text'].values)

vocab  = list(count_vectorizer.get_feature_names())
counts = elton['word_count'].values.sum(axis=0).toarray()[0]

freq_distribution = Counter(dict(zip(vocab, counts)))
#print (freq_distribution.most_common(3))
print()

# Refit TF-IDF without stop words; later cells read people['tfidf'] and the
# refreshed `elton` row produced here.
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
people['tfidf']  = list(tfidf_vectorizer.fit_transform(people['text']))

vocab = tfidf_vectorizer.get_feature_names()

elton    = people[people.index == 'Elton John'].copy()
response = elton['tfidf'].values[0]
l        = [(vocab[col], response[0, col]) for col in response.nonzero()[1]]
l        = sorted(l, key=itemgetter(1),reverse=True)
#print (l[:3])

NameError: name 'people' is not defined

### Answer 2

In [62]:
# Compare Elton John's TF-IDF entry with Victoria Beckham's and Paul McCartney's.
# Dedicated names are used so that the David Beckham row held in `beckham` by
# an earlier cell is not silently replaced by a different person.
victoria_beckham = people[people.index == 'Victoria Beckham'].copy()
paul_mccartney   = people[people.index == 'Paul McCartney'].copy()

print (cosine_similarity(elton['tfidf'].values[0], victoria_beckham['tfidf'].values[0]))
print (cosine_similarity(elton['tfidf'].values[0], paul_mccartney['tfidf'].values[0]))

[[0.03407023]]
[[0.18991373]]


### Answer 3

In [67]:
# TF-IDF representation of the corpus and its 10-nearest-neighbour index.
tfidf_matrix = tfidf_vectorizer.fit_transform(people['text'])
nbrs = NearestNeighbors(n_neighbors=10)
nbrs.fit(tfidf_matrix);

# Raw word-count representation (no stop-word filtering) and a separate
# 10-nearest-neighbour index over it, for comparison with TF-IDF.
count_vectorizer = CountVectorizer()
wordCount = count_vectorizer.fit_transform(people['text'])
nbrsWC = NearestNeighbors(n_neighbors=10)
nbrsWC.fit(wordCount);

In [68]:
def get_closest_neighbors(name):
    """Return the nearest TF-IDF neighbours of the article titled `name`.

    Relies on the notebook-level objects `people` (articles indexed by name),
    `tfidf_matrix` (one row per article, same order as `people`) and `nbrs`
    (the nearest-neighbour model fitted on `tfidf_matrix`).

    Parameters
    ----------
    name : str
        Index label of the query article in `people`.
        NOTE(review): assumes the label occurs exactly once in the index, so
        that get_loc yields a single row position — confirm.

    Returns
    -------
    pandas.DataFrame
        One row per neighbour, in the order returned by `nbrs.kneighbors`,
        with columns 'distance' and 'name'.
    """
    row                = people.index.get_loc(name)
    distances, indices = nbrs.kneighbors(tfidf_matrix.getrow(row))
    # Neighbour row positions index straight into the name index; this avoids
    # copying the whole frame with reset_index() on every call.
    names_similar      = people.index.values[indices.flatten()]
    result             = pd.DataFrame({'distance':distances.flatten(), 'name':names_similar})
    return result

def get_closest_neighborsWC(name):
    """Return the nearest neighbours of the article titled `name` by raw word counts.

    Relies on the notebook-level objects `people` (articles indexed by name),
    `wordCount` (word-count matrix, one row per article in the same order as
    `people`) and `nbrsWC` (the nearest-neighbour model fitted on `wordCount`).

    Parameters
    ----------
    name : str
        Index label of the query article in `people`.
        NOTE(review): assumes the label occurs exactly once in the index, so
        that get_loc yields a single row position — confirm.

    Returns
    -------
    pandas.DataFrame
        One row per neighbour, in the order returned by `nbrsWC.kneighbors`,
        with columns 'distance' and 'name'.
    """
    row                = people.index.get_loc(name)
    distances, indices = nbrsWC.kneighbors(wordCount.getrow(row))
    # Neighbour row positions index straight into the name index; this avoids
    # copying the whole frame with reset_index() on every call.
    names_similar      = people.index.values[indices.flatten()]
    result             = pd.DataFrame({'distance':distances.flatten(), 'name':names_similar})
    return result

In [69]:
# Nearest articles to Elton John using raw word counts.
get_closest_neighborsWC('Elton John')

Unnamed: 0,distance,name
0,0.0,Elton John
1,27.748874,Roger Daltrey
2,27.92848,Rod Stewart
3,28.913665,John Ronane
4,29.223278,Matthew Kalman
5,29.223278,Brad Fiedel
6,29.274562,Robert E. Lerner
7,29.274562,Robbie Williams
8,29.291637,Michael White (author)
9,29.444864,Alex Scott (actor)


In [70]:
# Nearest articles to Elton John using TF-IDF.
get_closest_neighbors('Elton John')

Unnamed: 0,distance,name
0,0.0,Elton John
1,1.186742,Rod Stewart
2,1.19651,Sting (musician)
3,1.203888,George Michael
4,1.204019,Phil Collins
5,1.218337,Kelly Clarkson
6,1.221504,Usher (entertainer)
7,1.223884,Adele
8,1.229677,Rihanna
9,1.232415,Bryan Adams


In [71]:
# Nearest articles to Victoria Beckham using raw word counts.
get_closest_neighborsWC('Victoria Beckham')

Unnamed: 0,distance,name
0,0.0,Victoria Beckham
1,22.538855,Kelly Bell
2,23.366643,Rhian Samuel
3,23.366643,Rikke Karlsson
4,23.388031,Marie Brassard
5,23.49468,Renee Nele
6,23.49468,Hilary Alexander
7,23.515952,Yeojin Bae
8,23.515952,Ali Hewson
9,23.579652,Gillian Mann


In [72]:
# Nearest articles to Victoria Beckham using TF-IDF.
get_closest_neighbors('Victoria Beckham')

Unnamed: 0,distance,name
0,0.0,Victoria Beckham
1,1.072772,David Beckham
2,1.264277,Stephen Dow Beckham
3,1.274599,Caroline Rush
4,1.276112,Angelique Westerhof
5,1.280018,Wal%C3%A9 Adeyemi
6,1.280657,Colin McDowell
7,1.280896,Zurain Imam
8,1.281365,Mel B
9,1.281901,Yuliya Polishchuk
