In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import scipy
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

In [2]:
df = pd.read_csv("./data/wikipedia-vectors.csv")

In [3]:
df = df.drop("Unnamed: 0", axis=1)

In [4]:
df.head()

Unnamed: 0,HTTP 404,Alexa Internet,Internet Explorer,HTTP cookie,Google Search,Tumblr,Hypertext Transfer Protocol,Social search,Firefox,LinkedIn,...,Chad Kroeger,Nate Ruess,The Wanted,Stevie Nicks,Arctic Monkeys,Black Sabbath,Skrillex,Red Hot Chili Peppers,Sepsis,Adam Levine
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.008878,0.0,0.0,0.049502,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00611,0.0
2,0.0,0.029607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005646,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
documents = df.values

In [6]:
titles = df.columns.values

In [7]:
scipy.sparse.issparse(documents)

False

In [8]:
articles = scipy.sparse.csr_matrix(documents)

## Cluster Wikipedia articles using a pipeline with PCA and KMeans

In [9]:
# Instantiate PCA model using TruncatedSVD in order to be able to use csr_matrix sparse matrixes
svd = TruncatedSVD(n_components=50)

In [10]:
# Instantiate a KMeans instance
kmeans = KMeans(n_clusters=6)

In [11]:
# Create a pipeline
pipeline = make_pipeline(svd, kmeans)

In [12]:
# Fit the documents
pipeline.fit(articles.T)

Pipeline(steps=[('truncatedsvd', TruncatedSVD(n_components=50)),
                ('kmeans', KMeans(n_clusters=6))])

In [13]:
# Calculate the cluster labels: labels
labels = pipeline.predict(articles.T)

In [14]:
# Create a DataFrame aligning labels and titles: df
df = pd.DataFrame({'label': labels, 'article': titles})

# Display df sorted by cluster label
print(df.sort_values('label'))

    label                                        article
49      0                                       Lymphoma
41      0                                    Hepatitis B
42      0                                    Doxycycline
43      0                                       Leukemia
44      0                                           Gout
45      0                                    Hepatitis C
46      0                                     Prednisone
47      0                                          Fever
48      0                                     Gabapentin
40      0                                    Tonsillitis
59      1                                    Adam Levine
51      1                                     Nate Ruess
52      1                                     The Wanted
53      1                                   Stevie Nicks
54      1                                 Arctic Monkeys
55      1                                  Black Sabbath
56      1                      