In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import TruncatedSVD
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline

Setting up pipeline

In [2]:
svd = TruncatedSVD(n_components=50)
kmeans = KMeans(n_clusters=6)


In [3]:
pipeline = make_pipeline(svd, kmeans)

In [4]:
df_1 = pd.read_csv('wikipedia-vectors.csv')


In [5]:
df_2 = pd.read_csv('wikipedia-vocabulary-utf8.txt')


In [6]:
titles = df_1.columns

In [7]:
titles.shape

(61,)

In [8]:
titles.dtype

dtype('O')

In [9]:
remove_item = ['Unnamed: 0']

In [10]:
titles_arr = titles.to_numpy()

np.setdiff1d to set difference between 2 matrix 

In [11]:
titles_def = np.setdiff1d(titles_arr,remove_item)

In [12]:
titles_def.shape

(60,)

In [13]:
df_2

Unnamed: 0,aaron
0,abandon
1,abandoned
2,abandoning
3,abandonment
4,abbas
...,...
13119,zimbabwe
13120,zinc
13121,zone
13122,zones


In [14]:
df_1 = df_1.reset_index(drop=True)


In [15]:
del df_1['Unnamed: 0']

In [29]:
df_1T = df_1.T

In [30]:
df_1T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,13115,13116,13117,13118,13119,13120,13121,13122,13123,13124
HTTP 404,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Alexa Internet,0.0,0.0,0.029607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Internet Explorer,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003772,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.011594,0.0,0.0
HTTP cookie,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Google Search,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.006649,0.0
Tumblr,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Hypertext Transfer Protocol,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Social search,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Firefox,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.031222,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
LinkedIn,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


Make pipeline to cluster articles

In [31]:
pipeline.fit(df_1T)

Pipeline(steps=[('truncatedsvd', TruncatedSVD(n_components=50)),
                ('kmeans', KMeans(n_clusters=6))])

In [32]:
labels = pipeline.predict(df_1T)
len(labels)


60

In [33]:
labels

array([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int32)

In [34]:
len(titles_def)

60

In [35]:
titles_def.ndim

1

In [37]:

df = pd.DataFrame({'label':labels, 'articles':titles_def})

In [38]:
df

Unnamed: 0,label,articles
0,2,2007 United Nations Climate Change Conference
1,2,2010 United Nations Climate Change Conference
2,2,2014 FIFA World Cup qualification
3,2,350.org
4,2,Adam Levine
5,2,Alexa Internet
6,2,Angelina Jolie
7,2,Anne Hathaway
8,2,Arctic Monkeys
9,2,Arsenal F.C.
