### See https://github.com/scikit-learn/scikit-learn/blob/master/sklearn/decomposition/_truncated_svd.py for the very simple source code

### See https://medium.com/@jonathan_hui/machine-learning-singular-value-decomposition-svd-principal-component-analysis-pca-1d45e885e491 for SVD info

In [1]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.decomposition import TruncatedSVD

# Synthetic data set: 100 samples, 75 features (45 informative + 30 redundant).
# The class labels returned as the second value are discarded.
# NOTE(review): no random_state is passed, so X differs on every run and the
# outputs shown below will not reproduce exactly.
X, _ = make_classification(n_samples=100, n_features=75, n_informative=45, n_redundant=30)

Sample size is 100. Number of features is 75.

In [2]:
# Rows are samples, columns are features.
X.shape

(100, 75)

SVD fit and transform

X_reduced is the product of **"matrix U" and the "singular value diagonal matrix"** (i.e. $U\Sigma$)

In [3]:
# Truncated SVD that keeps 10 components; the seed is fixed for the solver.
svd = TruncatedSVD(
    n_components=10,
    n_iter=7,
    random_state=42,
)

In [4]:
# Fit the SVD on X and return X expressed in the 10-component space.
X_reduced = svd.fit_transform(X)

The sample size stays the same: 100.

In [5]:
# Number of rows (samples) in the reduced data.
X_reduced.shape[0]

100

For each sample, there are 10 weights for 10 SVD components

In [6]:
# The first sample expressed as one weight per SVD component.
X_reduced[0]

array([-60.60070599,  14.35482843,  -6.61405031,  24.7285586 ,
         2.95724388,  -5.97029454, -15.27744183,  -0.97883369,
        -1.50460115,   4.13224018])

svd.components_ is the **"matrix V transposed"**

There are 10 SVD components

In [7]:
# Number of rows in components_, i.e. the number of SVD components.
svd.components_.shape[0]

10

For each SVD component, there are 75 weights for 75 features

In [8]:
# Length of one component row, i.e. one weight per original feature.
svd.components_.shape[1]

75

In [9]:
# Feature weights of the first SVD component.
svd.components_[0]

array([ 1.49494218e-02, -3.29059912e-03, -3.28589283e-02, -2.65577138e-01,
        1.55664326e-02,  1.15693894e-02, -2.79691145e-01,  2.49219476e-01,
       -1.35138034e-01,  8.00635182e-03,  1.57012279e-01,  3.05142768e-02,
        1.38903423e-01, -7.40487700e-02, -1.86645149e-01, -5.68321614e-02,
       -3.37222947e-01,  2.15216633e-01, -1.01777392e-01, -7.94307523e-02,
       -8.50242390e-02,  4.82108718e-02,  3.31602517e-02,  1.42882491e-01,
       -6.79283051e-04, -1.31273903e-02,  2.81749687e-02, -1.42626923e-02,
        2.58295496e-02,  3.44285839e-02,  3.49481704e-02, -3.29532372e-02,
        1.42834258e-02, -7.97056292e-02, -1.18023249e-01,  6.53658295e-02,
        4.40832100e-02, -4.19446634e-02,  2.81374489e-02, -2.42851083e-01,
        3.79228680e-03, -1.48225127e-01,  3.46220225e-02,  1.81813047e-01,
        7.64421274e-02,  1.45742522e-01, -1.65356439e-02, -7.93007930e-03,
       -6.33377914e-02,  1.59853822e-02, -5.01404081e-03, -1.56200440e-02,
       -1.17450117e-02, -

<br>
<br>

# Playground

Each component (each row of `svd.components_`) is a unit-length vector, so its squared entries sum to 1

In [10]:
# Squared Euclidean norm of the first component, accumulated entry by entry.
sum(weight ** 2 for weight in svd.components_[0])

0.9999999999999994

Multiplying `svd.components_` by its transpose should give roughly the identity matrix, because the components are orthonormal

In [11]:
# Pairwise dot products of the component rows, rounded to hide float noise.
np.round(np.dot(svd.components_, svd.components_.T), decimals=10)

array([[ 1.,  0., -0., -0.,  0., -0.,  0.,  0., -0.,  0.],
       [ 0.,  1., -0., -0., -0., -0., -0., -0., -0.,  0.],
       [-0., -0.,  1.,  0.,  0.,  0., -0.,  0.,  0., -0.],
       [-0., -0.,  0.,  1., -0.,  0., -0.,  0., -0., -0.],
       [ 0., -0.,  0., -0.,  1., -0.,  0.,  0.,  0., -0.],
       [-0., -0.,  0.,  0., -0.,  1., -0., -0.,  0., -0.],
       [ 0., -0., -0., -0.,  0., -0.,  1.,  0., -0., -0.],
       [ 0., -0.,  0.,  0.,  0., -0.,  0.,  1.,  0.,  0.],
       [ 0., -0.,  0., -0.,  0.,  0., -0.,  0.,  1.,  0.],
       [ 0.,  0., -0., -0., -0., -0., -0.,  0.,  0.,  1.]])

<br>
<br>

Singular values (the square roots of the eigenvalues of $X^T X$) are sorted from largest to smallest

In [12]:
# One singular value per kept component.
svd.singular_values_

array([317.52552698, 310.54793638, 262.81770972, 258.20563259,
       222.85530245, 208.46620401, 203.8716673 , 191.49558917,
       183.76863174, 178.34522803])

The singular value diagonal matrix would be this

In [13]:
# Place the singular values on the diagonal of a square matrix, then show it
# as a table.
diagonal_matrix = np.diagflat(svd.singular_values_)
pd.DataFrame(data=diagonal_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,317.525527,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,310.547936,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,262.81771,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,258.205633,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,222.855302,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,208.466204,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,203.871667,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,191.495589,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,183.768632,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,178.345228


<br>
<br>

Try to reconstruct the original data

In [14]:
# Original data, shown as a table for comparison with the reconstruction below.
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
0,-9.132332,7.089939,19.223370,27.293425,-3.719563,2.149781,5.449277,-10.519035,13.584725,2.155446,...,6.049880,18.440491,3.339988,0.443339,4.110722,-2.727246,-1.728613,0.622320,-1.302451,-4.266368
1,4.478893,-0.273036,31.510871,-9.415801,-1.086996,-2.283428,-2.514856,38.780182,-11.301611,-0.314553,...,-3.149657,9.550968,-0.321987,-6.225977,4.975531,1.937830,-5.783017,-4.705664,-1.419935,-4.211261
2,3.340361,-6.745025,-5.135661,-22.466159,7.727146,0.398737,1.428882,25.785753,-1.350475,-1.165761,...,-5.125305,-8.603902,1.772804,1.530645,0.086788,-8.505870,-0.793862,-0.775816,5.679367,-2.573401
3,-16.686807,-1.344327,4.565187,-19.896347,3.239832,4.438482,16.635263,-2.883803,-0.663235,0.058284,...,3.722705,12.659561,7.161852,0.468714,-1.529047,-0.023374,1.828285,-4.237812,1.563899,-2.464846
4,-9.576904,-0.502106,-0.247760,-19.392239,5.145535,-3.054811,0.703669,39.414169,-4.156716,1.817312,...,6.684282,-6.491639,-0.680017,-0.520198,6.338722,0.463336,-3.499492,0.319045,2.892976,2.047480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-2.574216,-28.838422,23.323530,-6.855571,-3.965305,-1.584451,-13.882426,-3.388094,16.079069,4.019703,...,-3.369339,8.319965,-1.359570,1.883068,2.519148,1.081027,0.057770,-4.580587,1.026571,3.140596
96,-27.481911,-5.838652,-2.153460,0.231948,-3.504759,-1.235147,44.097789,-6.225608,-13.846832,3.959466,...,2.496861,0.179573,6.094864,-5.138933,3.547445,0.536938,4.000552,-0.120067,4.352163,0.529778
97,11.742710,-13.943011,-17.101825,-24.313567,-1.658159,-1.035551,-30.451898,13.593209,-26.900858,-3.152127,...,-0.181478,30.578265,-9.345561,-5.051479,-1.222967,8.095335,4.546276,5.616065,4.868810,8.547424
98,-0.290524,2.184476,4.878867,10.687342,-1.151827,-0.874794,-2.182505,-3.217682,50.818648,3.837363,...,-3.019897,-6.941452,4.672878,5.667970,1.999020,2.327339,-1.034739,-1.147605,0.026412,3.081097



The data can only be reconstructed approximately, because just 10 components are kept for 75 features

In [15]:
# Map the reduced data back to feature space: (100 x 10) times (10 x 75).
pd.DataFrame(np.dot(X_reduced, svd.components_))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
0,-9.005098,7.108641,6.080660,20.075673,-3.208880,0.685157,15.533624,-19.401207,16.165256,1.054280,...,1.876353,8.766883,5.589905,0.672468,-0.665795,-1.981868,-0.556105,1.039780,-0.762544,-0.141265
1,-2.240511,9.784917,12.048555,0.468493,-0.732603,-2.746804,6.915163,13.451467,-15.810024,2.061764,...,-1.956305,-2.040995,-0.821419,-1.704001,0.553019,0.109687,1.643617,0.226216,0.322785,0.108381
2,6.724059,-0.390361,-15.361935,-8.146333,4.053053,0.564323,-3.718441,16.320811,-7.163843,-0.972427,...,-2.562567,-3.855855,-0.868760,-1.658346,1.169183,-0.584038,-1.420758,-1.091918,0.820352,-1.684132
3,-14.068311,8.124263,6.565413,-10.384853,0.697833,1.909804,9.068818,-6.818132,-0.453243,-1.384073,...,1.046388,13.189088,4.581926,-0.583736,-1.526956,-0.350716,3.516448,-0.040705,3.074580,0.906444
4,3.300060,4.126417,-9.932383,-7.241618,1.447372,0.256810,-17.186842,24.098088,-5.123539,1.740789,...,0.589670,-5.453477,-4.661185,0.404579,0.897533,-0.802202,-3.137478,2.943474,-0.150002,0.068624
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,6.937670,-10.974373,13.954798,-9.537786,-2.575501,1.778166,-18.551286,-14.581011,14.666367,-1.490673,...,1.444582,-7.583875,-4.801230,2.751898,1.140592,-0.186081,1.680781,-2.214406,-1.813916,3.220573
96,-9.593831,7.790979,6.898324,-0.426822,-1.371673,-1.553030,24.825610,-4.259860,-14.352897,0.838263,...,-0.773947,0.722730,2.754002,-2.971248,-0.886724,-0.999455,0.563769,2.289140,-0.960580,-0.812801
97,1.991580,-21.989040,-8.137482,-24.738810,1.469873,-1.021806,-27.507476,-2.323269,-33.979505,-0.140116,...,-2.341862,19.240081,-6.607045,-3.006160,-3.353500,6.278729,-0.783668,1.321582,3.809162,3.260973
98,-4.342033,6.896998,6.122083,21.344061,-3.420001,0.524828,-5.418101,-10.102470,48.617178,2.726431,...,4.460537,-13.746628,4.867639,3.503546,4.624009,0.781699,0.050903,-3.473876,-3.125879,-2.107576


`inverse_transform` is a convenience method that performs the same reconstruction

In [16]:
# Same reconstruction as the manual dot product above, via the estimator's method.
pd.DataFrame(svd.inverse_transform(X_reduced))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,65,66,67,68,69,70,71,72,73,74
0,-9.005098,7.108641,6.080660,20.075673,-3.208880,0.685157,15.533624,-19.401207,16.165256,1.054280,...,1.876353,8.766883,5.589905,0.672468,-0.665795,-1.981868,-0.556105,1.039780,-0.762544,-0.141265
1,-2.240511,9.784917,12.048555,0.468493,-0.732603,-2.746804,6.915163,13.451467,-15.810024,2.061764,...,-1.956305,-2.040995,-0.821419,-1.704001,0.553019,0.109687,1.643617,0.226216,0.322785,0.108381
2,6.724059,-0.390361,-15.361935,-8.146333,4.053053,0.564323,-3.718441,16.320811,-7.163843,-0.972427,...,-2.562567,-3.855855,-0.868760,-1.658346,1.169183,-0.584038,-1.420758,-1.091918,0.820352,-1.684132
3,-14.068311,8.124263,6.565413,-10.384853,0.697833,1.909804,9.068818,-6.818132,-0.453243,-1.384073,...,1.046388,13.189088,4.581926,-0.583736,-1.526956,-0.350716,3.516448,-0.040705,3.074580,0.906444
4,3.300060,4.126417,-9.932383,-7.241618,1.447372,0.256810,-17.186842,24.098088,-5.123539,1.740789,...,0.589670,-5.453477,-4.661185,0.404579,0.897533,-0.802202,-3.137478,2.943474,-0.150002,0.068624
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,6.937670,-10.974373,13.954798,-9.537786,-2.575501,1.778166,-18.551286,-14.581011,14.666367,-1.490673,...,1.444582,-7.583875,-4.801230,2.751898,1.140592,-0.186081,1.680781,-2.214406,-1.813916,3.220573
96,-9.593831,7.790979,6.898324,-0.426822,-1.371673,-1.553030,24.825610,-4.259860,-14.352897,0.838263,...,-0.773947,0.722730,2.754002,-2.971248,-0.886724,-0.999455,0.563769,2.289140,-0.960580,-0.812801
97,1.991580,-21.989040,-8.137482,-24.738810,1.469873,-1.021806,-27.507476,-2.323269,-33.979505,-0.140116,...,-2.341862,19.240081,-6.607045,-3.006160,-3.353500,6.278729,-0.783668,1.321582,3.809162,3.260973
98,-4.342033,6.896998,6.122083,21.344061,-3.420001,0.524828,-5.418101,-10.102470,48.617178,2.726431,...,4.460537,-13.746628,4.867639,3.503546,4.624009,0.781699,0.050903,-3.473876,-3.125879,-2.107576


<br>
<br>

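Fraction of variance (a ratio between 0 and 1, not a percentage) explained by each of the selected components. The values are not strictly decreasing here (the 4th is larger than the 3rd), presumably because `TruncatedSVD` does not center the data, so the variance ordering need not follow the singular-value ordering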

In [17]:
# One ratio per kept component.
print(svd.explained_variance_ratio_)

[0.12771738 0.12391133 0.08375202 0.0855317  0.06115246 0.05423057
 0.0523203  0.04638803 0.04334338 0.04077383]


In [18]:
# Total share of the variance captured by the 10 kept components.
print(np.sum(svd.explained_variance_ratio_))

0.7191209943398573
