### See https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/decomposition/_truncated_svd.py for the very simple source code

### See https://medium.com/@jonathan_hui/machine-learning-singular-value-decomposition-svd-principal-component-analysis-pca-1d45e885e491 for SVD info

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.decomposition import TruncatedSVD

# Synthetic dataset: 100 samples x 75 features (45 informative + 30 redundant).
# The class labels are discarded; only the feature matrix is used.
# NOTE: no random_state is passed, so X (and every output below) changes on each run.
X, _ = make_classification(
    n_samples=100,
    n_features=75,
    n_informative=45,
    n_redundant=30,
)

Sample size is 100. Number of features is 75.

In [2]:
X.shape

(100, 75)

SVD fit and transform

X_reduced is the result of **`matrix_U * singular_value_diagonal_matrix`**

Remember the original data is decomposed as **`matrix_U * singular_value_diagonal_matrix * matrix_V_transposed`**

In [3]:
# Keep only 10 components; the fixed random_state makes the fit repeatable for a given X.
svd = TruncatedSVD(
    n_components=10,
    n_iter=7,
    random_state=42,
)

In [4]:
X_reduced = svd.fit_transform(X)

The sample size is still 100: the transform reduces the number of features, not the number of samples.

In [5]:
len(X_reduced)

100

For each sample, there are 10 weights for 10 SVD components

In [6]:
X_reduced[0]

array([ 36.0140469 ,  -0.64812602,  14.87204039,  20.33294393,
       -14.73013766, -36.37060579, -11.59688586,   0.13445743,
        22.48782938,   9.86109337])

svd.components_ is the **`matrix_V_transposed`**

There are 10 SVD components

In [7]:
len(svd.components_)

10

For each SVD component, there are 75 weights for 75 features

In [8]:
len(svd.components_[0])

75

In [9]:
svd.components_[0]

array([-2.61571314e-02,  4.46511201e-02,  1.64046670e-01,  4.27456401e-02,
       -2.19160128e-03,  9.53254666e-04, -2.34819799e-04, -1.18035607e-01,
       -2.10482762e-03,  8.51686191e-03,  1.53723441e-01, -1.24696712e-01,
       -2.38043628e-01,  1.35628508e-01,  1.55503635e-01, -1.37368033e-02,
        1.29464013e-02,  1.90940270e-02, -4.80706987e-02,  1.83758418e-01,
       -1.98526943e-01,  8.11738494e-02, -2.22814706e-01,  8.04903034e-03,
       -4.99027630e-02, -8.88138082e-02,  1.69142724e-01,  1.84383393e-02,
       -8.92796643e-03,  4.98793465e-03, -1.85719851e-02,  2.74933534e-01,
       -1.57566905e-02,  2.00823595e-01, -2.35986983e-02, -2.99251970e-02,
        1.39801401e-02, -2.88299932e-02,  2.83156352e-02, -1.89151586e-01,
        3.55569086e-02, -7.77718318e-03,  2.57672727e-02, -1.93745868e-02,
       -3.47311321e-02,  1.82521979e-02, -6.15324384e-02,  2.79188337e-02,
       -4.68931360e-02,  2.00902738e-02, -2.85797241e-02, -5.01392838e-03,
       -1.39791178e-02, -

<br>
<br>

# Playground

Each component vector (each row of `svd.components_`) has unit length

In [10]:
# Squared Euclidean norm of the first component; it should be (numerically) 1.
sum(e ** 2 for e in svd.components_[0])

1.0

Multiplying `svd.components_` by its own transpose should give roughly the identity matrix, because the component vectors are orthonormal

In [11]:
# Gram matrix of the components (V^T V); rounding hides floating-point noise around 0 and 1.
np.round(np.dot(svd.components_, svd.components_.T), decimals=10)

array([[ 1.,  0., -0., -0.,  0.,  0.,  0., -0.,  0., -0.],
       [ 0.,  1.,  0.,  0.,  0.,  0., -0., -0., -0., -0.],
       [-0.,  0.,  1.,  0., -0., -0.,  0., -0., -0., -0.],
       [-0.,  0.,  0.,  1., -0.,  0.,  0.,  0., -0.,  0.],
       [ 0.,  0., -0., -0.,  1., -0., -0., -0.,  0.,  0.],
       [ 0.,  0., -0.,  0., -0.,  1.,  0., -0.,  0.,  0.],
       [ 0., -0.,  0.,  0., -0.,  0.,  1., -0.,  0.,  0.],
       [-0., -0., -0.,  0., -0., -0., -0.,  1., -0., -0.],
       [ 0., -0., -0., -0.,  0.,  0.,  0., -0.,  1., -0.],
       [-0., -0., -0.,  0.,  0.,  0.,  0., -0., -0.,  1.]])

<br>
<br>

Singular values (the square roots of the eigenvalues of `X.T @ X`) are sorted from largest to smallest

In [12]:
svd.singular_values_

array([343.35341345, 285.40897224, 262.93189631, 242.98846562,
       230.76377983, 220.25214352, 199.83515417, 196.47275937,
       193.78967762, 178.61233943])

The singular value diagonal matrix would be this

In [13]:
# Square matrix with the singular values on the main diagonal and zeros elsewhere.
diagonal_matrix = np.diag(svd.singular_values_)
# Wrapped in a DataFrame only to get a readable tabular display.
pd.DataFrame(diagonal_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,343.353413,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,285.408972,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,262.931896,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,242.988466,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,230.76378,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,220.252144,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,199.835154,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,196.472759,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,193.789678,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,178.612339


<br>
<br>

Try to reconstruct the original data

In [14]:
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
0,3.318385,-1.335216,-15.262954,-1.679784,-0.199879,-7.594914,4.333231,-12.976625,-0.073915,-3.430921,...,7.385153,3.805216,16.229995,-2.527131,-2.188776,0.264846,-0.687871,-14.949059,10.836897,-0.277444
1,0.355203,-5.143471,12.584564,-1.743279,-1.965200,6.360234,3.114499,2.869431,-0.643756,-0.918330,...,0.656041,-5.711786,7.563529,18.012245,-8.872849,-5.712571,-5.606692,1.349966,-4.748865,2.210344
2,5.230314,-2.437424,-10.237631,-3.920587,-4.212228,-1.105890,-1.905364,-2.225488,1.426167,0.315109,...,3.950272,-0.310392,9.423190,0.498845,-11.688548,-1.734097,-1.342232,17.844924,12.417675,2.292827
3,7.558886,0.865430,-3.516490,2.584340,-1.573387,-4.420539,1.179696,22.404948,-3.463998,4.149340,...,3.236607,-1.872588,17.151628,29.916688,-5.615180,0.450752,-0.123168,-25.548818,-13.342387,5.278240
4,0.652413,-11.106021,20.743024,-5.673967,4.841244,-0.678726,4.248229,2.845214,0.138787,2.620566,...,0.378366,0.173397,12.719712,26.724844,-9.209252,6.056792,-3.049974,-4.309677,-18.079275,-3.016551
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0.220322,2.423564,-6.486892,-6.149183,-4.128115,3.444268,2.516265,20.143204,-0.304290,0.434138,...,-3.682608,-4.267562,-17.432510,-38.421590,20.088626,2.639086,4.709136,-6.456932,-2.405124,-6.129624
96,-2.008974,-8.220722,28.475435,-4.419521,2.063945,7.277905,8.455173,-0.482255,6.755480,-1.957931,...,2.418628,2.793338,-15.317767,5.495006,8.334748,-2.466309,-1.255424,-1.481747,-19.705743,-1.006912
97,4.526129,5.369899,-23.070700,1.847157,-3.066094,-0.298442,3.810006,-2.681851,-8.490889,5.363236,...,-1.464089,3.902457,29.785262,4.845252,8.630028,5.473790,-2.266765,-18.918922,17.701090,-0.982425
98,-0.023188,2.286599,-4.182125,-0.089146,-2.395845,-6.751925,4.611440,11.232563,0.544451,-0.066751,...,-2.858878,-0.599946,19.225840,-19.212780,-9.528040,-1.751772,0.213968,13.326840,9.850100,-3.496663


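The original data can be approximately reconstructed from the reduced representation, as shown below. The reconstruction is not exact because only 10 components were kept.

(There is also a convenience method that does the same thing: `pd.DataFrame(svd.inverse_transform(X_reduced))`)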

In [15]:
# Project the 10-dimensional representation back into the 75-feature space: (U * S) @ V^T.
pd.DataFrame(np.dot(X_reduced, svd.components_))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
0,0.571061,2.669820,-5.551864,1.720342,-0.436941,-1.711018,-1.558909,-10.305514,-2.368787,0.361780,...,3.280279,1.130273,13.857192,6.635350,-2.774924,4.019531,-3.319371,-16.407382,21.092292,-1.096471
1,1.004427,-4.466049,11.103528,-2.379622,0.923171,2.337306,-1.330895,-11.661616,0.963099,0.790833,...,-2.785789,1.779415,3.655153,6.007220,-11.712323,-4.235121,0.177811,9.774056,-8.897315,-0.866838
2,1.086513,-0.747818,-0.240113,-0.393693,-0.227036,-1.116624,-3.802739,-9.989742,0.789603,-0.731264,...,0.692732,1.667237,4.827877,-1.820457,-8.882132,-1.967096,1.914130,25.653376,2.905334,-2.359595
3,4.732887,0.122203,11.356575,1.105507,0.308976,-1.461753,0.638590,12.216539,-0.578793,3.478069,...,0.210899,2.798914,21.860156,18.347119,-6.764510,3.074380,-0.681546,-15.707401,-3.223935,4.317018
4,-0.893207,-7.603964,8.842674,-1.478988,5.215586,-0.086701,0.449791,-1.209015,4.683823,-1.727173,...,-3.382257,1.896071,9.501088,25.550056,-19.083975,-0.478270,-2.872319,-2.149067,-21.376019,1.730117
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.856675,4.614888,-6.355584,-2.792774,-3.412890,0.749765,-0.006769,16.190606,-4.389744,0.755715,...,-4.335329,-5.532383,-23.580725,-39.163757,19.446458,-3.337998,0.960365,-9.399896,3.094835,-5.113188
96,-3.185519,-5.573383,29.690803,-3.644014,1.287695,6.012206,6.023832,-2.677039,5.040133,-2.143802,...,-1.925723,0.979189,-24.432462,-1.096556,8.629166,-3.093401,-1.001097,4.247888,-19.409432,0.803139
97,-0.638577,3.627866,-14.537215,3.093577,-3.555866,1.204372,-1.222524,-4.530714,-7.227146,2.716879,...,0.535046,-0.850335,12.847372,6.573025,6.283987,0.718314,-1.922161,-14.032451,20.241926,1.217806
98,2.672896,0.360032,-9.664724,-0.284006,-0.170755,-2.110574,-2.763126,3.857570,-1.606867,0.578151,...,-1.594923,-0.920952,4.307027,-6.349943,-3.219971,-0.396551,-0.016383,2.008971,3.328625,-2.665849


<br>
<br>

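Fraction of the total variance explained by each of the selected components (values lie between 0 and 1; multiply by 100 for a percentage). Note that these values are not guaranteed to be in decreasing order, as the output below shows; this is presumably because `TruncatedSVD` does not center the data before decomposing it.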

In [16]:
print(svd.explained_variance_ratio_)

[0.14600053 0.09725607 0.08361613 0.07557309 0.06781736 0.06207754
 0.04770939 0.04912844 0.04804216 0.04077752]


In [17]:
print(svd.explained_variance_ratio_.sum())

0.7179982302012543
