## Topic Modeling with Non-negative Matrix Factorization

In [1]:
import pandas as pd
import numpy as np
import re
import os
import string
import pickle
from miscScripts import *

import nltk
from nltk.corpus import wordnet 
from nltk import wordpunct_tokenize

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn import svm

from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
# nltk.data.path.append('/Users/jhonsen/Documents/DS/nltk_data/')
# nltk.download('wordnet', download_dir='/Users/jhonsen/Documents/DS/nltk_data/')

### Importing Pickled Dataframe

In [3]:
# Unpickle the clean data
with open('../data/fclean.pkl','rb') as fin:
    df = pickle.load(fin)

---

#### Vectorizing and Creating Document-To-Term Matrix 

In [4]:
# Turning texts into a list
sentences_tokens = df['summary'].tolist()

Using **TF-IDF** vectorizer
- English stop word
- consider uni-, bi-, trigrams

In [5]:
# TFIDF
vectorizer = TfidfVectorizer(token_pattern = r'\b[a-zA-Z]{3,}\b',
                             stop_words='english', 
                             strip_accents = 'unicode',
                             lowercase=True,
                             ngram_range=(1,3))

# Fit transform the documents
vectorizer.fit(sentences_tokens)
doc_word = vectorizer.transform(sentences_tokens)

In [6]:
# Save vectorizer and documents as pickle files for visualization (another notebook)
with open('../data/doc_words.pkl','wb') as fout:
    pickle.dump(doc_word, fout)

with open('../data/vectorizer.pkl', 'wb') as fout:
    pickle.dump(vectorizer, fout)

In [7]:
# Create Doc-Term-Matrix
dtm = pd.DataFrame(doc_word.toarray(),
             index= df.title,
            columns = vectorizer.get_feature_names())
dtm.head()

Unnamed: 0_level_0,aaa,aaa champion,aaa champion placed,aaa coaching,aaa coaching committee,aaai,aaai allen,aaai allen newell,aaai association,aaai association advancement,...,zykan heinz karl,zylberbaum,zylberbaum mexican,zylberbaum mexican neurophysiologist,zylinski,zylinski november,zylinski november vilnius,zytle,zytle portrayed,zytle portrayed peter
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A. Aiyappan,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A. D. Gardner,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A. E. V. Richardson,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A. G. Mearns,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A. M. Mubarak,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Try TOPIC MODELING using NMF 
- at the current state 9 topics seem to make sense
- Truncated Singular Value Decomposition of this dtm matrix indicates that 9 components capture 80% of variance explained ratio (another notebook)

In [8]:
# Build an NMF with 9 topics

nmf = NMF(9)
nmf.fit(doc_word)
doc_topic = nmf.transform(doc_word)

In [9]:
# Save the model as a pickled file for visualization later (another notebook) 
with open('../data/nmf.pkl','wb') as fout:
    pickle.dump(nmf, fout)
with open('../data/doc_nmf.pkl','wb') as fout:
    pickle.dump(doc_topic, fout)

In [10]:
# Check components and feature vectors 
df_topic_word = pd.DataFrame(nmf.components_.round(3),
                         index=['comp'+str(k+1) for k in range(9)],
                         columns= vectorizer.get_feature_names())
df_topic_word

Unnamed: 0,aaa,aaa champion,aaa champion placed,aaa coaching,aaa coaching committee,aaai,aaai allen,aaai allen newell,aaai association,aaai association advancement,...,zykan heinz karl,zylberbaum,zylberbaum mexican,zylberbaum mexican neurophysiologist,zylinski,zylinski november,zylinski november vilnius,zytle,zytle portrayed,zytle portrayed peter
comp1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.001,0.001,0.001,0.0,0.0,0.0
comp2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
comp3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
comp4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
comp5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
comp6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
comp7,0.0,0.0,0.0,0.0,0.0,0.003,0.001,0.001,0.002,0.001,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
comp8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001,0.001,0.001
comp9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Create a dataframe of scientists (observations) and TOPICS

df_sci_topic = pd.DataFrame(doc_topic.round(3),
                index= df.title,
                columns = ['comp'+str(k+1) for k in range(9)])

In [12]:
df_sci_topic.sample(5)

Unnamed: 0_level_0,comp1,comp2,comp3,comp4,comp5,comp6,comp7,comp8,comp9
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
David_Lynch_(wine_expert),0.01,0.002,0.0,0.0,0.022,0.0,0.0,0.001,0.0
Chaim Samuel Hönig,0.016,0.0,0.0,0.0,0.004,0.035,0.0,0.002,0.008
Bucking Broncho,0.0,0.001,0.001,0.005,0.015,0.017,0.0,0.01,0.0
Ayyappanpillai Ajayaghosh,0.0,0.0,0.313,0.001,0.0,0.036,0.0,0.0,0.0
Lixia_Zhang,0.003,0.0,0.0,0.0,0.0,0.009,0.059,0.0,0.0


#### Convert NMF result into percentages, for intuitive inspection

In [13]:
# Turn these into percentages
df_Hp = getPercentages(df_sci_topic)

In [14]:
df_Hp.head()

Unnamed: 0_level_0,comp1,comp2,comp3,comp4,comp5,comp6,comp7,comp8,comp9
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A. Aiyappan,0.0,0.0,50.0,0.0,50.0,0.0,0.0,0.0,0.0
A. D. Gardner,65.517241,0.0,0.0,0.0,24.137931,10.344828,0.0,0.0,0.0
A. E. V. Richardson,35.897436,0.0,10.25641,0.0,43.589744,0.0,10.25641,0.0,0.0
A. G. Mearns,12.5,0.0,8.333333,0.0,62.5,0.0,8.333333,8.333333,0.0
A. M. Mubarak,78.571429,0.0,0.0,0.0,14.285714,0.0,0.0,7.142857,0.0


In [15]:
# Collect the 9 topics into a col 
# this object can be used to create wordclouds
dict_topics_9 = collect_topics(nmf, vectorizer.get_feature_names(), no_top_words=10)

In [16]:
# Display words in NMF TOPICS 
display_topics(nmf, vectorizer.get_feature_names(), no_top_words=10)


Topic  1
university, professor, research, science, born, institute, department, director, college, engineering

Topic  2
marvel, comics, comic, comic books, comic books published, books published, american comic books, appearing american comic, appearing american, american comic

Topic  3
indian, science, academy, sciences, india, academy sciences, indian science, research, scientific, national

Topic  4
characters, star trek, trek, star, fictional, created star trek, created star, created, major characters, lists characters

Topic  5
known, german, french, work, april, march, february, august, december, january

Topic  6
russian, academy, sciences, chinese, academy sciences, member, soviet, academician, national, national academy

Topic  7
computer, computer science, science, computer scientist, scientist, computing, professor computer, professor computer science, programming, american computer

Topic  8
series, characters, television, television series, character, fictional, charact

  
Breakdown of Topics
- Topic 1 :  Scientist in academia
- Topic 2 :  ---> Comic-books scientists  
- Topic 3 :  Indian scientist
- Topic 4 :  ---> fictional scientist (Star Trek)
- Topic 5 :  European? 
- Topic 6 :  Russian
- Topic 7 :  Computer scientist
- Topic 8 :  ---->  TV
- Topic 9 :  Physicist

Note: 
- Topic-2, -4, and -8 seem to not be real scientists (persons). These were unintentionally collected during acquisition , as they must have some relations (on some level) with articles categorized under `Category:scientists`  
- Topic 5 seems to have months (april, february, etc.), which may have been people's date of births. Topic 4 
- Exploratory to double check these entries
- Cleaning the data further may fix this issue
---

#### Assign Names to Topics

In [17]:
# Rename Column names
df_Hp.columns = ['academia','comic','indian','fictional','european','russian','compsci','TV','physicist']

In [18]:
df_Hp.head()

Unnamed: 0_level_0,academia,comic,indian,fictional,european,russian,compsci,TV,physicist
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
A. Aiyappan,0.0,0.0,50.0,0.0,50.0,0.0,0.0,0.0,0.0
A. D. Gardner,65.517241,0.0,0.0,0.0,24.137931,10.344828,0.0,0.0,0.0
A. E. V. Richardson,35.897436,0.0,10.25641,0.0,43.589744,0.0,10.25641,0.0,0.0
A. G. Mearns,12.5,0.0,8.333333,0.0,62.5,0.0,8.333333,8.333333,0.0
A. M. Mubarak,78.571429,0.0,0.0,0.0,14.285714,0.0,0.0,7.142857,0.0


In [19]:
# Turn the Name-index into a column and then reset index
df_Hp['title'] = df_Hp.index

In [20]:
print("Are the dimensions of the dataframes equal?")
df_Hp.shape[0] == df.shape[0]

Are the dimensions of the dataframes equal?


True

In [21]:
# Resetting the indexes in dataframes
df.reset_index(inplace=True)
df.drop(columns='index', inplace=True)
df_Hp.index=np.arange(df_Hp.shape[0])

In [56]:
# This is the original dataframe containing unprocessed summary
df.head()

Unnamed: 0,title,summary
0,A. Aiyappan,Ayinapalli Aiyappan (5 February 1905 – 28 June...
1,A. D. Gardner,"Arthur Duncan Gardner, FRCP, FRCS (28 March 18..."
2,A. E. V. Richardson,"Arnold Edwin Victor Richardson MA, BSc., (12 S..."
3,A. G. Mearns,Dr Alexander Gow Mearns FRSE MBE (1903–1968) w...
4,A. M. Mubarak,Azeez Mohamed Mubarak (born 4 July 1951) is a ...


In [57]:
# This dataframe is the one to use for cosine similarity (below)
df_Hp.head()

Unnamed: 0,academia,comic,indian,fictional,european,russian,compsci,TV,physicist,title
0,0.0,0.0,50.0,0.0,50.0,0.0,0.0,0.0,0.0,A. Aiyappan
1,65.517241,0.0,0.0,0.0,24.137931,10.344828,0.0,0.0,0.0,A. D. Gardner
2,35.897436,0.0,10.25641,0.0,43.589744,0.0,10.25641,0.0,0.0,A. E. V. Richardson
3,12.5,0.0,8.333333,0.0,62.5,0.0,8.333333,8.333333,0.0,A. G. Mearns
4,78.571429,0.0,0.0,0.0,14.285714,0.0,0.0,7.142857,0.0,A. M. Mubarak


In [58]:
# merged ataframe
dfc = pd.merge(df_Hp, df, how='left', on='title')
dfc.sample()

Unnamed: 0,academia,comic,indian,fictional,european,russian,compsci,TV,physicist,title,summary
10498,23.529412,0.0,11.764706,0.0,41.176471,0.0,15.686275,1.960784,5.882353,Ralph_E._Gomory,Ralph Edward Gomory (born 7 May 1929) is an Am...


In [59]:
# Save clean dataframe
with open('../data/df_Hp.pkl','wb') as fout:
    pickle.dump(df_Hp, fout)    
with open('../data/df_combined.pkl','wb') as fout:
    pickle.dump(dfc, fout)

---

#### Find a representative example for each category

  
**Keywords** in TOPICS:  
  
Topic  1 (**Academia**)
university, professor, research, science, born, institute, department, director, college, engineering

Topic  2 (**Comica**)
marvel, comics, comic, comic books, comic books published, books published, american comic books, appearing american, appearing american comic, american comic

Topic  3 (**Indian**)
indian, science, india, academy, sciences, indian science, academy sciences, research, scientific, government india

Topic  4 (**Fictional**)
characters, star trek, trek, star, fictional, created star trek, created star, created, major characters, lists characters

Topic  5 (**European**)
german, known, french, work, march, april, august, february, december, january

Topic  6 (**Russian**)
russian, academy, sciences, chinese, academy sciences, member, soviet, academician, national, national academy

Topic 7 (**CompSci**)
computer, computer science, science, computer scientist, scientist, computing, professor computer, programming, professor computer science, american computer

Topic  8 (**TV**)
series, characters, television, television series, character, fictional, characters television series, characters television, fictional characters television, main recurring

Topic  9 (**Physicists**)
physics, theory, quantum, physicist, theoretical, prize, nobel, nuclear, einstein, nobel prize


In [26]:
# Example of TOPIC-1, 'Academia' 
df[df.index==1157].summary.values[0]

# keywords: university, professor, research

"Asmeret Asefaw Berhe is a soil biogeochemist, political ecologist and associate professor at University of California, Merced. Her research group works to understand how soil helps regulate the earth's climate."

In [27]:
# Example of TOPIC-5 'European' 
df[df.index==1787].summary.values[0]

# keywords: August, September

'Carl Auer von Welsbach, also known as Carl Auer, Freiherr von Welsbach (1 September 1858 – 4 August 1929) was an Austrian scientist and inventor, who had a talent not only for discovering advances, but also for turning them into commercially successful products. He is particularly well known for his work on rare-earth elements, which led to the development of the ferro rod used in modern lighters, the gas mantle, which brought light to the streets of Europe in the late 19th century, and for the development of the metal-filament light bulb.'

In [28]:
# Example of TOPIC-6 'CompSci' 
df[df.index==5283].summary.values[0]

# keywords: computing, computer

"Henry Givens Baker Jr. is an American computer scientist who has made contributions in garbage collection, functional programming languages, and linear logic. He was also one of the founders of Symbolics, a company that designed and manufactured a line of Lisp machines. In 2006 he was recognized as a Distinguished Scientist by the Association for Computing Machinery. He is notable for his research in garbage collection, particularly Baker's real-time copying collector, and on the Actor model. Baker received his B.Sc. (1969), S.M. (1973), E.E. (1973), and Ph.D. (1978) degrees at M.I.T."

In [29]:
# Example of TOPIC-3 'Indian'
df[df.index==851].summary.values[0]

# keywords: Indian, Science, scientific

'Animesh Chakraborty FNA, FASc (born 30 June 1935) is a Bengali Indian academic and a professor of chemistry. In 1975, he was awarded the Shanti Swarup Bhatnagar Prize for Science and Technology in chemistry by the Council of Scientific and Industrial Research.'

In [30]:
# Example of TOPIC-2 'comic'
df[df.index==994].summary.values[0]

# keywords: comic, comic book, marvel

"Apocalypse (En Sabah Nur) is a fictional supervillain appearing in comic books published by Marvel Comics. He is one of the world's first mutants, and was originally a principal villain for the original X-Factor team and now for the X-Men and related spinoff teams. Created by writer Louise Simonson and artist Jackson Guice, Apocalypse first appeared in X-Factor #5 (May 1986).Since his introduction, the character has appeared in a number of X-Men titles, including spin-offs and several limited series. Apocalypse has also been featured in various forms of media. In 2016, Oscar Isaac portrayed the villain in the film X-Men: Apocalypse. In 2009, Apocalypse was ranked as IGN's 24th Greatest Comic Book Villain of All Time."

In [31]:
# Example of TOPIC-4 'fictional'
df[df.index==2522].summary.values[0]

# keywords: Star Trek, fiction

'Data ( DAY-tə) is a character in the fictional Star Trek franchise. He appears in the television series Star Trek: The Next Generation (TNG) and the feature films Star Trek Generations (1994), Star Trek: First Contact (1996), Star Trek: Insurrection (1998), and Star Trek: Nemesis (2002). Data is portrayed by actor Brent Spiner. Data was found by Starfleet in 2338 as the sole survivor on Omicron Theta in the rubble of a colony left after an attack from the Crystalline Entity. He was a synthetic life form with artificial intelligence and designed and built by Doctor Noonien Soong in his own likeness (likewise portrayed by Spiner). Data is a self-aware, sapient, sentient and anatomically fully functional android who serves as the second officer and chief operations officer aboard the Federation starship USS Enterprise-D and later the USS Enterprise-E.  His positronic brain allows him impressive computational capabilities. He experienced ongoing difficulties during the early years of his 

In [32]:
# Example of TOPIC-7 'TV'
df[df.index==126].summary.values[0]

# keywords: television, 

'Adam Ross is a fictional character on the television series CSI: NY. He is portrayed by A. J. Buckley.'

In [33]:
# Example of TOPIC-8 'physicist' 
df[df.index==19].summary.values[0]

# keywords: physicist, nobel prize

'Aage Niels Bohr (Danish: [ˈɔːwə ˌnels ˈboɐ̯ˀ] (listen); 19 June 1922 – 8 September 2009) was a Danish nuclear physicist who shared the Nobel Prize in Physics in 1975 with Ben Mottelson and James Rainwater "for the discovery of the connection between collective motion and particle motion in atomic nuclei and the development of the theory of the structure of the atomic nucleus based on this connection". Starting from Rainwater\'s concept of an irregular-shaped liquid drop model of the nucleus, Bohr and Mottelson developed a detailed theory that was in close agreement with experiments. Since his father, Niels Bohr, had won the prize in 1922, he and his father were one of the six pairs of fathers and sons who have both won the Nobel Prize and one of the four pairs who have both won the Nobel Prize in Physics.'

In [34]:
# Example of TOPIC-9 'Russian'
df[df.index==448].summary.values[0]

# keywords: russion, soviet, academy sciences

'Alexander Petrovich Karpinsky (Russian: Александр Петрович Карпинский; 7 January 1847 (NS) – 15 July 1936) was a prominent Russian and Soviet geologist and mineralogist, and the president of the Russian Academy of Sciences, and later Academy of Sciences of the USSR, in 1917–1936.'

---

### QUERY
- First, let's see if we could locate people in the dataframe, based on cosine similarity
- Then, we check a random query outside of the dataframe

**Query 1** Hadley Wickham (is one of the observation in the dataframe)

In [35]:
query1= "Hadley Wickham is a statistician from New Zealand who is currently Chief Scientist at RStudio and an adjunct Professor of statistics at \
the University of Auckland,[3] Stanford University, \
and Rice University. He is best known for his development of open-source statistical analysis software packages for R (programming language) \
that implement logics of data visualisation and data transformation. Wickham's packages and writing are known for advocating a tidy data approach to data \
import, analysis and modelling methods."

In [36]:
df_Hp1,df_Qp1 = Recommender(df_Hp, query1, 'Hadley Wickham', vectorizer, nmf, top_n=10)

(13625, 10)
(1, 9)
(13625, 9)
(1, 9)


In [37]:
# Check the transformed query1
df_Qp1

Unnamed: 0,academia,comic,indian,fictional,european,russian,compsci,TV,physicist
Hadley Wickham,36.170213,0.0,0.0,0.0,23.404255,0.0,40.425532,0.0,0.0


In [38]:
# Compare query1 above with its top 10 similar people 
df_Hp1

Unnamed: 0,academia,comic,indian,fictional,european,russian,compsci,TV,physicist,title,similarity
4958,36.170213,0.0,0.0,0.0,23.404255,0.0,40.425532,0.0,0.0,Hadley_Wickham,1.0
7648,34.285714,0.0,0.0,0.0,25.714286,0.0,40.0,0.0,0.0,Larry_Yaeger,0.998721
891,33.333333,0.0,0.0,0.0,22.222222,0.0,44.444444,0.0,0.0,Anne Castles,0.99646
8094,38.709677,0.0,3.225806,0.0,20.967742,0.0,37.096774,0.0,0.0,Lucy_Suchman,0.995324
4764,31.25,0.0,0.0,0.0,25.0,0.0,40.625,0.0,3.125,Gordon_S._Brown,0.995156
5,40.909091,2.272727,0.0,0.0,20.454545,0.0,36.363636,0.0,0.0,A. Michael Noll,0.992408
2362,41.860465,0.0,0.0,0.0,23.255814,0.0,34.883721,0.0,0.0,Cornelis H. A. Koster,0.990988
2561,33.333333,6.060606,0.0,0.0,18.181818,3.030303,39.393939,0.0,0.0,David Fries,0.989758
12378,38.888889,0.0,0.0,0.0,27.777778,0.0,33.333333,0.0,0.0,Thomas_Chesney,0.988938
6420,29.032258,1.612903,0.0,0.0,22.580645,0.0,45.16129,0.0,1.612903,Jill_H._Larkin,0.988582


In [40]:
# Other people similar to Hadley
df[df.index==7648].summary.values

array(["Larry Steven Yaeger (1950) is a former Apple Distinguished Scientist and Full Professor of Informatics at Indiana University Bloomington, currently employed at Google.  Outside of academia he is best known for designing the handwriting recognition software used in the Apple Newton and Inkwell. Yaeger's academic research focused on the evolution of true artificial intelligence through natural selection.  He is the lead developer of Polyworld."],
      dtype=object)

- Hadley Wickham found himself on Wikipedia

---

**Query #2**: JD

In [43]:
# JD's LinkedIn Summary
query2 = "Research Chemist transitioning into data science. I have an academic background in spectroscopy and molecular modeling, \
which are the science of extracting signals out of the noise their interactions.\
I love integrating experimental techniques with computational approaches, to find actionable insights and most consistent answer to research questions. Hands-on\
laboratory work is fun, but in silico computational projects have always peaked my interest. The latter has led me to this exciting field of data science.\
I'm currently taking a deeper dive into machine learning and AI for industrial applications, especially, in the biomedical and healthcare fields.\
I'm also passionate about science communication, which is a way of “storytelling using data”. I find it enjoyable to decompose technical concepts, \
and convey them to a mixed audience, e.g., scientists in different disciplines or others without scientific backgrounds."


In [44]:
# Get recommendations of articles to look up
df_Hp2,df_Qp2 = Recommender(df_Hp, query2,'JD', vectorizer, nmf, top_n=10)

(13625, 10)
(1, 9)
(13625, 9)
(1, 9)


In [45]:
# Check the transformed query3
df_Qp2

Unnamed: 0,academia,comic,indian,fictional,european,russian,compsci,TV,physicist
JD,18.072289,0.0,20.481928,0.0,8.433735,0.0,39.759036,3.614458,9.638554


In [46]:
# List of top 10 most scientists, similar to JD
df_Hp2

Unnamed: 0,academia,comic,indian,fictional,european,russian,compsci,TV,physicist,title,similarity
9016,18.644068,0.0,16.949153,0.0,13.559322,0.0,37.288136,0.0,13.559322,Moe_Z._Win,0.985219
11262,18.75,0.0,12.5,0.0,3.125,3.125,46.875,3.125,12.5,Saddek_Rabah,0.972623
10306,25.0,0.0,18.75,0.0,12.5,0.0,43.75,0.0,0.0,Prasad_Ram,0.970528
7210,11.538462,0.0,31.730769,0.0,0.0,0.0,46.153846,3.846154,6.730769,Kalyanmoy_Deb,0.960011
11663,16.27907,0.0,32.55814,0.0,11.627907,0.0,34.883721,0.0,4.651163,Sivasubramanian_Srikantan,0.958202
12047,26.315789,0.0,14.035088,0.0,5.263158,0.0,54.385965,0.0,0.0,Sylvia_Ratnasamy,0.953386
5612,20.833333,0.0,10.416667,0.0,14.583333,8.333333,37.5,0.0,8.333333,IEEE_Edison_Medal,0.951719
5894,28.571429,0.0,14.285714,0.0,14.285714,0.0,42.857143,0.0,0.0,Iván_Guzmán_de_Rojas,0.951155
10676,25.531915,0.0,10.638298,0.0,0.0,0.0,53.191489,0.0,10.638298,Ricardo_Bianchini,0.950697
10563,14.678899,0.0,22.93578,0.0,0.917431,6.422018,55.045872,0.0,0.0,Ravi_Sethi,0.948597


In [47]:
# Other people similar to JD
df[df.index==11262].summary.values

array(['Saddek Rabah (born 1968) is a researcher in the field of information science and communication.'],
      dtype=object)

In [48]:
# Other people similar to JD
df[df.index==10676].summary.values

array(['Ricardo Bianchini from Rutgers University & Microsoft Research, Bellevue, WA was named Fellow of the Institute of Electrical and Electronics Engineers (IEEE) in 2015 for contributions to server and data center energy management. He was named an Association for Computing Machinery (ACM) Fellow in 2016  for contributions to power, energy and thermal management of servers and datacenters.'],
      dtype=object)

In [51]:
# Other people similar to JD
df[df.index==10306].summary.values

array(['Prasad Ram (aka Pram) is the founder and CEO of Gooru, a non-profit education technology start-up. Ram previously worked at Xerox PARC, Dynamx Technology, Yahoo! and Google.'],
      dtype=object)

- JD's summary sounds similar to other computer scientists, w/ cosine similarity of ~0.9

---