In [70]:
from __future__ import print_function

import os
import warnings
# Suppress warnings from pandas library warnings.filterwarnings("ignore", category=DeprecationWarning,

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
# Import all of the scikit learn stuff
from sklearn import metrics
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import Normalizer

import languageProcess as lp

In [None]:
# Load the word list of every category that has an extracted dump file.
# `categories` and `articles` are built in lock-step so that row i of every
# later table corresponds to categories[i].
articles = []
path = '../RandomBaseline/plaindata'
# NOTE: os.listdir returns entries in arbitrary order, so the row order of the
# tables below follows whatever order the file system reports.
categories = [
    cat for cat in os.listdir(path)
    if os.path.exists(os.path.join(path, cat, "AA", "wiki_00"))
]
for cat in categories:
    # The file was already checked above; no second existence test is needed,
    # and skipping an entry here would desynchronise `articles` from `categories`.
    filepath = os.path.join(path, cat, "AA", "wiki_00")
    articles.append(lp.languageProcess(filepath).getWords())

In [None]:
# Collapse each article's word sequence into one space-separated string,
# which is the input format the vectorizer consumes.
test = list(map(' '.join, articles))


In [None]:
# Build the TF-IDF document-term matrix: one row per entry of `test`
# (i.e. per category), one column per vocabulary term.
vectorizer = TfidfVectorizer()
dtm = vectorizer.fit_transform(test)
# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(), so use whichever accessor the installed version offers.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
# Preview the first ten documents only.
pd.DataFrame(dtm.toarray(), index=categories, columns=feature_names).head(10)

In [None]:
# Get the words that correspond to each column of the document-term matrix.
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# when the newer get_feature_names_out() accessor is unavailable.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
feature_names

In [None]:
# Fit LSA (truncated SVD) with two components.
# Use algorithm='randomized' instead of 'arpack' for large datasets.
# (An earlier experiment used 99 components.)
lsa = TruncatedSVD(algorithm='arpack', n_components=2)
# Project the documents onto the components, then rescale the projected rows
# in place (copy=False) with the Normalizer.
dtm_lsa = Normalizer(copy=False).fit_transform(
    lsa.fit_transform(dtm.asfptype())
)

In [None]:
# Show the weight of every vocabulary term in each LSA component.
# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# when the newer get_feature_names_out() accessor is unavailable.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()
pd.DataFrame(
    lsa.components_,
    index=["component_1", "component_2"],
    columns=feature_names,
)

In [None]:
# Coordinates of every category document in the two-component LSA space.
pd.DataFrame(
    data=dtm_lsa,
    index=categories,
    columns=["component_1", "component_2"],
)

In [None]:
# Split the projected documents into per-axis coordinate lists for plotting:
# first LSA component on x, second on y.
xs = list(dtm_lsa[:, 0])
ys = list(dtm_lsa[:, 1])
xs, ys

In [None]:
# Scatter plot of the documents in the two-component LSA space.
# pyplot is imported once in the notebook's import cell; the explicit
# figure/axes objects avoid relying on pyplot's implicit "current figure".
fig, ax = plt.subplots()
ax.scatter(xs, ys)
ax.set_xlabel('First principal component')
ax.set_ylabel('Second principal component')
ax.set_title('Plot of points against LSA principal components')

plt.show()

In [None]:
# Draw every document as an arrow from the origin in the two-component LSA
# space and save the figure to disk.
# pyplot is imported once in the notebook's import cell.
fig, ax = plt.subplots()
# quiver expects one (x, y) anchor per arrow: recent matplotlib raises a
# size-mismatch ValueError when a single scalar origin is combined with many
# arrow components, so repeat the origin once per document.
origin = [0] * len(xs)
ax.quiver(origin, origin, xs, ys,
          angles='xy', scale_units='xy', scale=1, linewidth=.001)
ax.set_xlim([-1, 1])
ax.set_ylim([-1, 1])
ax.set_xlabel('First principal component')
ax.set_ylabel('Second principal component')
ax.set_title('Plot of points against LSA principal components')
# savefig renders the figure itself, so no explicit draw() call is required.
fig.savefig('../TestResults/Vectorgraph.png', dpi=500)
plt.show()

In [None]:
def renameCat(res):
    """Return the labels in `res` cut off at their first underscore.

    Each label keeps only the text before its first '_' (a label without an
    underscore is returned unchanged). The input sequence is not modified.
    """
    shortened = []
    for label in res:
        prefix, _sep, _rest = label.partition('_')
        shortened.append(prefix)
    return shortened
# Shortened category labels (text before the first underscore), used as the
# row/column headers of the exported similarity table.
categories2=renameCat(categories)

In [None]:
# Compute document similarity using LSA components: the pairwise dot products
# of the projected document vectors (rows of dtm_lsa). Those rows were rescaled
# by the Normalizer earlier, so with its default L2 norm these values are
# cosine similarities.
# A plain ndarray product replaces the discouraged np.matrix round-trip.
similarity = np.dot(dtm_lsa, np.transpose(dtm_lsa))
res = pd.DataFrame(similarity, index=categories2, columns=categories2)
# Export the full matrix as a LaTeX table (slice `res` with .iloc first if a
# smaller table is wanted).
with open('../TestResults/similarityMatrixLSA_full.tex', 'w') as tf:
    tf.write(res.to_latex())



In [29]:
# Pairwise dot products of the projected document vectors, displayed with the
# original (unshortened) category names as row and column labels.
# A plain ndarray product replaces the discouraged np.matrix round-trip.
similarity = np.dot(dtm_lsa, np.transpose(dtm_lsa))
pd.DataFrame(similarity, index=categories, columns=categories)

Unnamed: 0,Culture_20,Nature_20,Concepts_20,History_20,Technology_20,Arts_20,Entertainment_20,Business_20,Society_20,Life_20,...,Events_20,Health_20,Education_20,Philosophy_20,Humanities_20,World_20,Mathematics_20,Sports_20,Politics_20,Geography_20
Culture_20,1.0,0.944321,0.919145,0.979498,0.728735,0.99244,0.988193,0.406539,0.534523,0.932247,...,0.717986,-0.052531,0.356707,0.955038,0.68813,-0.001397,0.84754,0.64388,0.286417,0.770447
Nature_20,0.944321,1.0,0.997577,0.991244,0.913475,0.8968,0.983583,0.684512,0.782838,0.999389,...,0.90703,0.278965,0.644227,0.999413,0.888552,0.327706,0.974974,0.859776,0.585711,0.937305
Concepts_20,0.919145,0.997577,1.0,0.979658,0.939567,0.863849,0.968647,0.733566,0.824227,0.999399,...,0.934124,0.345092,0.695871,0.994609,0.918314,0.392635,0.988077,0.893218,0.640675,0.959278
History_20,0.979498,0.991244,0.979658,1.0,0.85175,0.947367,0.998799,0.582261,0.693825,0.986025,...,0.843491,0.149724,0.537597,0.995186,0.820196,0.200087,0.937082,0.784819,0.473561,0.88308
Technology_20,0.728735,0.913475,0.939567,0.85175,1.0,0.639179,0.825052,0.921911,0.968283,0.927135,...,0.999879,0.645569,0.899692,0.899001,0.998344,0.683777,0.981075,0.993173,0.864829,0.998012
Arts_20,0.99244,0.8968,0.863849,0.947367,0.639179,1.0,0.961917,0.291333,0.426754,0.880792,...,0.627129,-0.174697,0.239351,0.911429,0.593874,-0.124119,0.775994,0.545107,0.166661,0.68638
Entertainment_20,0.988193,0.983583,0.968647,0.998799,0.825052,0.961917,1.0,0.541722,0.657703,0.976677,...,0.816156,0.101094,0.495632,0.989187,0.791177,0.151835,0.918849,0.753508,0.429833,0.859026
Business_20,0.406539,0.684512,0.733566,0.582261,0.921911,0.291333,0.541722,1.0,0.989465,0.709567,...,0.927831,0.891016,0.998547,0.659138,0.942673,0.913065,0.829452,0.960807,0.991797,0.895662
Society_20,0.534523,0.782838,0.824227,0.693825,0.968283,0.426754,0.657703,0.989465,1.0,0.804102,...,0.972056,0.815908,0.980225,0.761065,0.981054,0.844406,0.90158,0.990818,0.962843,0.950611
Life_20,0.932247,0.999389,0.999399,0.986025,0.927135,0.880792,0.976677,0.709567,0.804102,1.0,...,0.921189,0.31235,0.670558,0.997606,0.90404,0.360518,0.982147,0.877095,0.613675,0.94891
