In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

In [2]:
# Load the article vectors; each CSV column after the first is one article title
df = pd.read_csv(filepath_or_buffer="./data/wikipedia-vectors.csv")

In [3]:
# Remove the exported index column that read_csv picked up as "Unnamed: 0".
# errors="ignore" keeps this cell idempotent: df is overwritten here, so
# re-running the cell would otherwise raise KeyError on the missing column.
df = df.drop(columns="Unnamed: 0", errors="ignore")

In [4]:
# Preview the first five rows of the loaded matrix
df.head(n=5)

Unnamed: 0,HTTP 404,Alexa Internet,Internet Explorer,HTTP cookie,Google Search,Tumblr,Hypertext Transfer Protocol,Social search,Firefox,LinkedIn,...,Chad Kroeger,Nate Ruess,The Wanted,Stevie Nicks,Arctic Monkeys,Black Sabbath,Skrillex,Red Hot Chili Peppers,Sepsis,Adam Levine
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.008878,0.0,0.0,0.049502,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00611,0.0
2,0.0,0.029607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005646,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Raw numeric matrix behind the DataFrame (no labels attached)
documents = df.to_numpy()

In [6]:
# Article titles are stored as the DataFrame's column labels
titles = df.columns.values

In [7]:
# Dimensions of the raw matrix as (rows, columns)
documents.shape

(13125, 60)

In [8]:
# Build the sparse matrix with one row per article.
# The dense array is transposed *before* the conversion so the result really
# is in CSR format; calling .T on a csr_matrix returns a CSC matrix instead.
articles = scipy.sparse.csr_matrix(documents.T)

In [9]:
# Dimensions after transposing: (articles, vocabulary terms)
articles.shape

(60, 13125)

In [69]:
# Read the vocabulary text file into a list named words (one entry per line).
# The encoding is pinned to UTF-8, matching the file name, because open()
# otherwise falls back to the platform's default encoding.
with open('data/wikipedia-vocabulary-utf8.txt', encoding='utf-8') as f:
    words = f.read().splitlines()

In [70]:
# Number of vocabulary entries read from the file
len(words)

13125

## Cluster Wikipedia articles using a pipeline with TruncatedSVD (a PCA-like reduction for sparse data) and KMeans

In [10]:
# Dimension reduction with TruncatedSVD, used in place of PCA because it
# accepts scipy sparse matrices directly.
# random_state is fixed so the randomized solver is reproducible between runs.
svd = TruncatedSVD(n_components=50, random_state=42)

In [11]:
# Instantiate a KMeans instance with 6 clusters.
# random_state pins the centroid initialisation so cluster labels are
# reproducible; n_init is spelled out because its default differs between
# scikit-learn releases.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)

In [12]:
# Chain the dimension reduction and the clustering step into one estimator
pipeline = make_pipeline(svd, kmeans)

In [13]:
# Fit the pipeline on the sparse article matrix
pipeline.fit(articles)

Pipeline(steps=[('truncatedsvd', TruncatedSVD(n_components=50)),
                ('kmeans', KMeans(n_clusters=6))])

In [14]:
# Assign each article to a cluster using the fitted pipeline: labels
labels = pipeline.predict(articles)

In [15]:
# Pair every article title with the cluster it was assigned to: df
df = pd.DataFrame(dict(label=labels, article=titles))

# Show the table ordered by cluster so members of a cluster sit together
print(df.sort_values(by='label'))

    label                                        article
0       0                                       HTTP 404
8       0                                        Firefox
7       0                                  Social search
6       0                    Hypertext Transfer Protocol
5       0                                         Tumblr
9       0                                       LinkedIn
3       0                                    HTTP cookie
2       0                              Internet Explorer
1       0                                 Alexa Internet
4       0                                  Google Search
50      1                                   Chad Kroeger
57      1                          Red Hot Chili Peppers
56      1                                       Skrillex
55      1                                  Black Sabbath
54      1                                 Arctic Monkeys
53      1                                   Stevie Nicks
52      1                      

## NMF applied to Wikipedia articles

In [48]:
# Import dependencies
from sklearn.decomposition import NMF

In [49]:
# Instantiate an NMF model with 6 components.
# random_state is fixed so the initialisation, and therefore the learned
# components, are reproducible between runs.
model = NMF(n_components=6, random_state=42)

In [50]:
# Fit the model to the sparse articles matrix
model.fit(articles)



NMF(n_components=6)

In [51]:
# Shape of the matrix the NMF model was fitted on
print(articles.shape)

(60, 13125)


In [52]:
# Shape of the learned components array: (n_components, input columns)
print(model.components_.shape)

(6, 13125)


In [53]:
# Project the articles onto the fitted NMF components: nmf_features
nmf_features = model.transform(articles)

In [54]:
# Show the NMF features rounded to two decimal places
print(nmf_features.round(decimals=2))

[[0.   0.   0.   0.   0.   0.44]
 [0.   0.   0.   0.   0.   0.57]
 [0.   0.   0.   0.   0.   0.4 ]
 [0.   0.   0.   0.   0.   0.38]
 [0.   0.   0.   0.   0.   0.49]
 [0.01 0.01 0.01 0.03 0.   0.33]
 [0.   0.   0.02 0.   0.01 0.36]
 [0.   0.   0.   0.   0.   0.49]
 [0.02 0.01 0.   0.02 0.03 0.48]
 [0.01 0.03 0.03 0.07 0.02 0.34]
 [0.   0.   0.53 0.   0.03 0.  ]
 [0.   0.   0.36 0.   0.   0.  ]
 [0.01 0.01 0.31 0.06 0.01 0.02]
 [0.   0.01 0.34 0.01 0.   0.  ]
 [0.   0.   0.43 0.   0.04 0.  ]
 [0.   0.   0.48 0.   0.   0.  ]
 [0.01 0.02 0.38 0.03 0.   0.01]
 [0.   0.   0.48 0.   0.   0.  ]
 [0.   0.01 0.55 0.   0.   0.  ]
 [0.   0.   0.47 0.   0.   0.  ]
 [0.   0.01 0.02 0.52 0.06 0.01]
 [0.   0.   0.   0.51 0.   0.  ]
 [0.   0.01 0.   0.42 0.   0.  ]
 [0.   0.   0.   0.44 0.   0.  ]
 [0.   0.   0.   0.5  0.   0.  ]
 [0.1  0.09 0.   0.38 0.   0.01]
 [0.   0.   0.   0.57 0.   0.01]
 [0.01 0.01 0.   0.47 0.   0.01]
 [0.   0.   0.   0.58 0.   0.  ]
 [0.   0.   0.   0.53 0.01 0.01]
 [0.   0.4

In [55]:
# Wrap nmf_features in a DataFrame indexed by article title for label-based lookup
df = pd.DataFrame(data=nmf_features, index=titles)

In [56]:
# The index holds the article titles supplied when the DataFrame was built
df.index

Index(['HTTP 404', 'Alexa Internet', 'Internet Explorer', 'HTTP cookie',
       'Google Search', 'Tumblr', 'Hypertext Transfer Protocol',
       'Social search', 'Firefox', 'LinkedIn', 'Global warming',
       'Nationally Appropriate Mitigation Action', 'Nigel Lawson',
       'Connie Hedegaard', 'Climate change', 'Kyoto Protocol', '350.org',
       'Greenhouse gas emissions by the United States',
       '2010 United Nations Climate Change Conference',
       '2007 United Nations Climate Change Conference', 'Angelina Jolie',
       'Michael Fassbender', 'Denzel Washington', 'Catherine Zeta-Jones',
       'Jessica Biel', 'Russell Crowe', 'Mila Kunis', 'Dakota Fanning',
       'Anne Hathaway', 'Jennifer Aniston', 'France national football team',
       'Cristiano Ronaldo', 'Arsenal F.C.', 'Radamel Falcao',
       'Zlatan Ibrahimović', 'Colombia national football team',
       '2014 FIFA World Cup qualification', 'Football', 'Neymar',
       'Franck Ribéry', 'Tonsillitis', 'Hepatitis B', '

In [57]:
# NMF feature values for the 'Anne Hathaway' article
df.loc['Anne Hathaway']

0    0.003846
1    0.000000
2    0.000000
3    0.575626
4    0.000000
5    0.000000
Name: Anne Hathaway, dtype: float64

In [58]:
# NMF feature values for the 'Denzel Washington' article, for comparison
df.loc['Denzel Washington']

0    0.000000
1    0.005601
2    0.000000
3    0.422318
4    0.000000
5    0.000000
Name: Denzel Washington, dtype: float64

Both of these articles are reconstructed primarily from the 4th NMF component (column index 3).

In [77]:
# Label every column of the learned components with its vocabulary word
components_df = pd.DataFrame(data=model.components_, columns=words)

In [78]:
# Display the shape of the components DataFrame
components_df.shape

(6, 13125)

In [79]:
# Select the 4th row (positional index 3), i.e. the 4th NMF component
component = components_df.iloc[3]

In [80]:
# Print the 5 highest-weighted words of the component to establish its "theme"
print(component.nlargest(n=5))

film       0.627969
award      0.253169
starred    0.245320
role       0.211482
actress    0.186425
Name: 3, dtype: float64


The top words of the 4th NMF component (film, award, starred, role, actress) suggest that its topic is films and the actors who starred in them.

## Find articles similar to 'Cristiano Ronaldo' using cosine similarity of nmf_features

In [83]:
# Import dependency 'normalize' to normalize the nmf_features array
from sklearn.preprocessing import normalize

In [84]:
# Scale every row of nmf_features to unit L2 length: norm_features
# (with unit-length rows, a dot product equals the cosine similarity)
norm_features = normalize(nmf_features, norm='l2')

In [85]:
# Index the normalized features by article title: df
df = pd.DataFrame(data=norm_features, index=titles)

In [86]:
# Select the normalized feature row for 'Cristiano Ronaldo': article
article = df.loc['Cristiano Ronaldo']

In [87]:
# Dot every article's feature row with the selected one to get cosine similarities
similarities = df @ article

In [89]:
# Display the 6 largest cosine similarities: the article itself (similarity 1.0)
# followed by its 5 most similar articles
similarities.nlargest(n=6)

Cristiano Ronaldo                  1.000000
Franck Ribéry                      0.999972
Radamel Falcao                     0.999942
Zlatan Ibrahimović                 0.999942
France national football team      0.999923
Colombia national football team    0.999897
dtype: float64