In [34]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.linalg import svd as scipy_svd
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Notebook-level configuration: figure rendering, extensions and an
# environment stamp so readers can see which versions produced the outputs.
%matplotlib inline
%load_ext watermark
%load_ext autoreload
# mode 2: re-import modules before executing each cell
%autoreload 2
%config InlineBackend.figure_format = 'retina'
# stamp author, date, time, interpreter versions and the listed package versions
# NOTE(review): the recorded output reports `sklearn 0.0`; presumably the
# distribution should be listed as scikit-learn to get the real version - confirm
%watermark -a 'Ethen' -d -t -v -p numpy,scipy,pandas,sklearn,matplotlib

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Ethen 2020-10-18 18:12:38 

CPython 3.7.9
IPython 7.17.0

numpy 1.19.1
scipy 1.5.2
pandas 1.1.1
sklearn 0.0
matplotlib 3.3.1


In [35]:
# toy corpus: the first five documents are about web development,
# the last two are about tennis
example = [
    'Node js is super efficient',
    'Javascript is easy to code',
    'React and Angular js are front end frameworks',
    'Front end development is fun',
    'Javascript is great for Web development',
    'I like tennis',
    'Tennis is great to watch']

# a two-staged model pipeline,
# first convert raw words to a tfidf document-term matrix
# and apply svd decomposition after that
tfidf = TfidfVectorizer(stop_words = 'english')

# TruncatedSVD uses a randomized solver by default, so pin the seed
# to keep the factorization reproducible on a fresh kernel
svd = TruncatedSVD(n_components = 3, random_state = 1234)
pipeline = Pipeline([
    ('tfidf', tfidf),
    ('svd', svd)
])

# one row per document, one column per latent concept
X_lsa = pipeline.fit_transform(example)
X_lsa

array([[ 0.07143456,  0.32603645, -0.49034427],
       [ 0.33701986,  0.13378431,  0.55040488],
       [ 0.21190626,  0.57355785, -0.50593777],
       [ 0.44734903,  0.5483403 , -0.02543792],
       [ 0.71442519,  0.16818299,  0.40582996],
       [ 0.44608969, -0.54180734, -0.40961217],
       [ 0.65992211, -0.4753254 , -0.21076763]])

<h2>TruncatedSVD parameter <code>n_components</code>: int, default = 2<br>
Desired dimensionality of the output data. Must be strictly less than the number of features. The default value is useful for visualisation. For LSA, a value of 100 is recommended.</h2>

In [36]:
# mapping of words to latent factors/concepts,
# i.e. each concept is a linear combination of words
tfidf = pipeline.named_steps['tfidf']

# `get_feature_names` is deprecated and removed in newer scikit-learn releases
# in favour of `get_feature_names_out`; prefer the new accessor and fall back
# to the old one so the cell runs on either version
if hasattr(tfidf, 'get_feature_names_out'):
    vocab = list(tfidf.get_feature_names_out())
else:
    vocab = tfidf.get_feature_names()

# label one row per fitted component instead of hard-coding three names
component_names = ['comp{}'.format(i + 1) for i in range(svd.components_.shape[0])]
pd.DataFrame(svd.components_, index = component_names, columns = vocab)

Unnamed: 0,angular,code,development,easy,efficient,end,frameworks,fun,great,javascript,js,like,node,react,super,tennis,watch,web
comp1,0.067131,0.136232,0.384077,0.136232,0.024653,0.215342,0.067131,0.19229,0.459924,0.337544,0.076189,0.227523,0.024653,0.067131,0.024653,0.424328,0.283663,0.270405
comp2,0.210597,0.062679,0.288008,0.062679,0.130414,0.401578,0.210597,0.273182,-0.135327,0.113272,0.283068,-0.320288,0.130414,0.210597,0.130414,-0.462436,-0.236806,0.073779
comp3,-0.205453,0.285195,0.151806,0.285195,-0.216921,-0.182178,-0.205453,-0.014016,0.067042,0.400177,-0.350607,-0.2678,-0.216921,-0.205453,-0.216921,-0.318696,-0.116131,0.196896


In [37]:
# the raw concept-by-term weight array that backs the labelled table
concept_term_weights = svd.components_
concept_term_weights

array([[ 0.06713137,  0.13623242,  0.38407651,  0.13623242,  0.02465332,
         0.21534198,  0.06713137,  0.19228996,  0.45992403,  0.33754394,
         0.07618917,  0.2275232 ,  0.02465332,  0.06713137,  0.02465332,
         0.4243285 ,  0.28366306,  0.27040495],
       [ 0.2105966 ,  0.06267907,  0.2880077 ,  0.06267907,  0.1304144 ,
         0.40157811,  0.2105966 ,  0.27318239, -0.13532685,  0.11327187,
         0.28306842, -0.32028812,  0.1304144 ,  0.2105966 ,  0.1304144 ,
        -0.46243635, -0.23680647,  0.07377894],
       [-0.20545331,  0.28519478,  0.15180583,  0.28519478, -0.21692141,
        -0.18217843, -0.20545331, -0.01401606,  0.06704163,  0.40017653,
        -0.35060729, -0.26780008, -0.21692141, -0.20545331, -0.21692141,
        -0.31869581, -0.11613105,  0.19689574]])

In [38]:
svd = pipeline.named_steps['svd']

# NOTE(review): `explained_variance_` holds absolute variances, so this sum is
# not a fraction of the total; `explained_variance_ratio_` would give the
# proportion - confirm which one is intended
total_variance = svd.explained_variance_.sum()
print('total variance explained:', total_variance)

# mapping of documents to latent factors/concepts,
# i.e. each document is a linear combination of the concepts
component_names = ['comp1', 'comp2', 'comp3']
pd.DataFrame(data = X_lsa, columns = component_names, index = example)

total variance explained: 0.3787994332385665


Unnamed: 0,comp1,comp2,comp3
Node js is super efficient,0.071435,0.326036,-0.490344
Javascript is easy to code,0.33702,0.133784,0.550405
React and Angular js are front end frameworks,0.211906,0.573558,-0.505938
Front end development is fun,0.447349,0.54834,-0.025438
Javascript is great for Web development,0.714425,0.168183,0.40583
I like tennis,0.44609,-0.541807,-0.409612
Tennis is great to watch,0.659922,-0.475325,-0.210768


In [39]:
# scale every document vector to unit L2 length so that the dot product
# between two rows is their cosine similarity
X_normed = normalize(X_lsa, norm = 'l2', axis = 1)
similarity = np.dot(X_normed, X_normed.T)

# pairwise document-to-document similarity, labelled by the raw text
pd.DataFrame(data = similarity, columns = example, index = example)

Unnamed: 0,Node js is super efficient,Javascript is easy to code,React and Angular js are front end frameworks,Front end development is fun,Javascript is great for Web development,I like tennis,Tennis is great to watch
Node js is super efficient,1.0,-0.517176,0.956394,0.531406,-0.187202,0.116322,-0.008997
Javascript is easy to code,-0.517176,1.0,-0.249138,0.4502,0.880356,-0.275573,0.077306
React and Angular js are front end frameworks,0.956394,-0.249138,1.0,0.751207,0.063896,-0.013941,-0.039219
Front end development is fun,0.531406,0.4502,0.751207,1.0,0.676039,-0.151395,0.067128
Javascript is great for Web development,-0.187202,0.880356,0.063896,0.676039,1.0,0.090008,0.434259
I like tennis,0.116322,-0.275573,-0.013941,-0.151395,0.090008,1.0,0.934874
Tennis is great to watch,-0.008997,0.077306,-0.039219,0.067128,0.434259,0.934874,1.0
