In [12]:
#!/usr/bin/env python

# In this piece of code, I'm trying to retrieve documents relevant to Emmanuel :)
import pandas as pd
import numpy as np
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neighbors import KDTree

## Download the dataset here: https://drive.google.com/file/d/1quiVJqTHnDzCxFT6xgJqhkNpj48rz-49/view

In [2]:
people = pd.read_csv('people_data.csv', nrows=7799)  # my Ipython kernel died a few times, better choose less text
people.shape

(7799, 3)

In [44]:
people.head()

Unnamed: 0,URI,name,text
0,<http://dbpedia.org/resource/Bill_Clinton>,Bill Clinton,william jefferson bill clinton born william je...
1,<http://dbpedia.org/resource/Polyana_L%C3%B3pez>,Polyana L%C3%B3pez,polyana lpez born circa 1985 is an argentine a...
2,<http://dbpedia.org/resource/Harpdog_Brown>,Harpdog Brown,harpdog brown is a singer and harmonica player...
3,<http://dbpedia.org/resource/Franz_Rottensteiner>,Franz Rottensteiner,franz rottensteiner born in waidmannsfeld lowe...
4,<http://dbpedia.org/resource/G-Enka>,G-Enka,henry krvits born 30 december 1974 in tallinn ...


In [3]:
people.text[4:10]

4    henry krvits born 30 december 1974 in tallinn ...
5    sam henderson born october 18 1969 is an ameri...
6    aaron lacrate is an american music producer re...
7    trevor ferguson aka john farrow born 11 novemb...
8    grant nelson born 27 april 1971 in london also...
9    cathy caruth born 1955 is frank h t rhodes pro...
Name: text, dtype: object

In [4]:
count_vect = CountVectorizer()  # word vector
X_train_counts = count_vect.fit_transform(people.text.astype('U'))  # convert to sparse matrix
X_train_counts

<7799x134997 sparse matrix of type '<type 'numpy.int64'>'
	with 1354646 stored elements in Compressed Sparse Row format>

In [5]:
X_train_counts[7]

<1x134997 sparse matrix of type '<type 'numpy.int64'>'
	with 304 stored elements in Compressed Sparse Row format>

In [6]:
# remove unimpotant words through tf-idf
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf

<7799x134997 sparse matrix of type '<type 'numpy.float64'>'
	with 1354646 stored elements in Compressed Sparse Row format>

In [7]:
people['tfidf']=list(X_train_tfidf.toarray())
print len(people['tfidf'])
print len(people['tfidf'][7])

7799
134997


In [8]:
kdt = KDTree(people['tfidf'].tolist(), leaf_size=7)

In [11]:
kdt

<sklearn.neighbors.kd_tree.KDTree at 0x7f9fe86b9c10>

In [23]:
print people['name'].str.contains('Emmanuel').value_counts()  # 4 articles contains Emmanuel

False    7794
True        4
Name: name, dtype: int64


In [32]:
# Find most relevant 7 articles close to Emmanuel!

# 1. you find articles that contains Emmanuel in name column, in this case (you can also use text column)
# 2. the relevant tfidf features will be used in KDTree

dist, idx = kdt.query(people['tfidf'][people['name'].str.contains('Emmanuel').astype(bool)].tolist(), k=7)

In [35]:
print dist
print idx

[[ 0.          1.16889527  1.20133731  1.25841642  1.27332873  1.28143053
   1.28154608]
 [ 0.          1.06962209  1.07685893  1.18726699  1.19637047  1.20821813
   1.21485208]
 [ 0.          1.16217917  1.19242797  1.19935786  1.22984895  1.23150622
   1.23551565]
 [ 0.          1.21865095  1.2362159   1.27582766  1.28159673  1.28656549
   1.2885435 ]
 [ 0.          1.3257826   1.34413744  1.38912252  1.41421356  1.41421356
   1.41421356]]
[[ 193 7788 5603 5676 6437 7507 2782]
 [2446 2610 6145 4475 3411 5776 3468]
 [3922 1048 5030 6140 1658 1134 2129]
 [4175 5823 5643 2131  491 4258 3579]
 [5880 1987 1531 7791 1681 3030 2063]]


In [53]:
print people['name'][193] + ', ' + people['name'][7788] + ', ' + people['name'][6437]
print people['name'][2446]
print people['name'][3922]
print people['name'][4175]
print people['name'][5880]
print people['name'][7791]

Emmanuel Pierre-Antoine, Kylie Jones, Sammy Stopford
Emmanuel Kolini
Emmanuel N. Onwubiko
Emmanuel Paulker
nan
Margaret Leng Tan
