# Movies - SVD

In [5]:
import pandas as pd
import numpy as np
from sklearn.decomposition import TruncatedSVD

In [6]:
df = pd.read_parquet("../data/movies-clean.parquet.gzip")
df.head()

Unnamed: 0,RATING,VOTES,RunTime,TYPE,Year_From,Year_To,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Biography,...,Director_Àlex Pastor,Director_Álex de la Iglesia,Director_Álvaro Brechner,Director_Álvaro Fernández Armero,Director_Álvaro Longoria,Director_Ángel Gómez Hernández,Director_Ángeles Reiné,Director_Åke Sandgren,Director_Óscar Pedraza,Director_Ömer Ugur
0,6.1,21062,121.0,Movie,2021,2021,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5.0,17870,25.0,Series,2021,2021,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
2,8.2,885805,44.0,Series,2010,2022,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9.2,414849,23.0,Series,2013,2013,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
5,7.6,25858,50.0,Series,2020,2020,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
selected_col = [col for col in df if col.startswith('Genre_')]
selected_col

['Genre_Action',
 'Genre_Adventure',
 'Genre_Animation',
 'Genre_Biography',
 'Genre_Comedy',
 'Genre_Crime',
 'Genre_Documentary',
 'Genre_Drama',
 'Genre_Family',
 'Genre_Fantasy',
 'Genre_Film-Noir',
 'Genre_Game-Show',
 'Genre_History',
 'Genre_Horror',
 'Genre_Music',
 'Genre_Musical',
 'Genre_Mystery',
 'Genre_News',
 'Genre_Reality-TV',
 'Genre_Romance',
 'Genre_Sci-Fi',
 'Genre_Short',
 'Genre_Sport',
 'Genre_Talk-Show',
 'Genre_Thriller',
 'Genre_War',
 'Genre_Western']

In [8]:
df_filtered = df[selected_col]
print(df_filtered.shape)
df_filtered.head()

(8168, 27)


Unnamed: 0,Genre_Action,Genre_Adventure,Genre_Animation,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Documentary,Genre_Drama,Genre_Family,Genre_Fantasy,...,Genre_News,Genre_Reality-TV,Genre_Romance,Genre_Sci-Fi,Genre_Short,Genre_Sport,Genre_Talk-Show,Genre_Thriller,Genre_War,Genre_Western
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,1,1,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
n_components = df_filtered.shape[1] - 1

svd = TruncatedSVD(n_components=n_components, n_iter=100)
svd_data = svd.fit_transform(df_filtered)

df_svd_data = pd.DataFrame(data=svd_data,
              columns=list(range(n_components)))

df_svd_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,0.506162,0.398868,-0.403222,0.094331,-0.422882,-0.276141,1.268346,0.298285,0.036546,0.319319,...,0.040702,-0.015256,0.020408,0.000337,0.00321,-0.00053,-0.008186,-0.002647,-0.002146,0.000487
1,0.786955,1.4839,-0.354579,-0.076035,0.105925,0.123407,-0.042001,0.082042,0.016278,-0.075018,...,0.004209,0.010854,-0.047056,-0.007694,-6.5e-05,0.003718,-0.001143,0.001122,9.7e-05,-0.000814
2,0.894683,-0.564891,-0.320875,-0.253963,-0.021645,0.276023,1.068701,0.119576,-0.19421,-0.014792,...,0.006917,-0.011048,-0.025356,-0.01721,0.008575,0.006889,-0.008738,-0.001355,-0.001911,0.000396
3,0.800346,1.027905,0.775993,0.026259,0.035271,0.657056,-0.10982,-0.025595,-0.031955,-0.397333,...,-0.026953,0.010212,-0.074908,0.000835,0.006478,-0.005355,0.006139,-0.002194,0.00266,-0.000937
4,1.327456,-0.088091,-0.662928,0.31581,-0.383711,-0.579733,-0.297729,-0.106477,-0.15927,0.199358,...,0.017169,-0.004777,0.019706,0.000149,-0.001447,0.003956,-0.004443,0.000941,-0.005261,0.002139


In [15]:
svd.explained_variance_ratio_

array([0.08759048, 0.19885743, 0.1478297 , 0.08438628, 0.06906947,
       0.05136233, 0.04721967, 0.04165726, 0.03807025, 0.0359145 ,
       0.02889937, 0.02443933, 0.02331205, 0.02200087, 0.01816212,
       0.0165697 , 0.01503935, 0.01172494, 0.01140305, 0.01014208,
       0.00497704, 0.00344116, 0.00298947, 0.00157358, 0.00135841,
       0.00119909])

In [16]:
np.cumsum(svd.explained_variance_ratio_)

array([0.08759048, 0.28644792, 0.43427762, 0.5186639 , 0.58773337,
       0.6390957 , 0.68631538, 0.72797264, 0.76604288, 0.80195738,
       0.83085676, 0.85529608, 0.87860813, 0.900609  , 0.91877112,
       0.93534081, 0.95038016, 0.9621051 , 0.97350815, 0.98365023,
       0.98862727, 0.99206843, 0.99505789, 0.99663147, 0.99798988,
       0.99918897])