# SVD (Singular Value Decomposition)

$\huge R = P_{mk}S_{kk}Q^T_{kn}$

- La matriz S es diagonal. Sus valores están ordenados de mayor a menor.
- Cada valor de la diagonal de S indica qué tan importante es ese feature
- Puedo quedarme con los primeros K valores y aproximar la matriz R
- Tengo que definir las películas no calificadas con algún valor
- Tanto P como Q son ortogonales
- Si R es una matriz real, la descomposición existe
- Elegimos K y el SVD truncado (Truncated SVD) nos garantiza la mejor aproximación de rango K (best rank-K approximation)

In [1]:
from scipy.sparse.linalg import svds
import pandas as pd
import numpy as np

In [3]:
def SVD(users_items_train_matrix, k = 20):
    """Return the rank-``k`` reconstruction of a user-item matrix.

    The ``k`` singular triplets computed by ``scipy.sparse.linalg.svds`` are
    multiplied back together (``U @ diag(s) @ Vt``), yielding a dense matrix
    with the same shape as the input.

    Parameters
    ----------
    users_items_train_matrix : array-like or sparse matrix
        Matrix to factorize.
    k : int, default 20
        Number of singular values/vectors to keep.

    Returns
    -------
    numpy.ndarray
        Low-rank approximation of the input matrix.
    """
    left_vectors, singular_values, right_vectors_t = svds(users_items_train_matrix, k)
    sigma = np.diag(singular_values)
    scaled_left = left_vectors.dot(sigma)
    return scaled_left.dot(right_vectors_t)

In [4]:
def rmse(R, R_estimated):
    """Root-mean-square error evaluated only on the non-zero entries of ``R``.

    Not commutative: the ground truth (normally the test matrix) goes first
    and the predictions second, because the positions being scored are taken
    from the non-zero entries of the first argument.
    """
    rated_positions = R.nonzero()
    truth = R[rated_positions].flatten()
    guess = R_estimated[rated_positions].flatten()
    squared_error = np.square(guess - truth)
    return np.sqrt(squared_error.sum() / truth.shape[0])

In [9]:
# Ratings file is tab-separated with no header row, so column names are supplied here.
header = ['userId', 'movieId', 'rating', 'timestamp']
df = pd.read_csv('./ml-100k/u.data', sep='\t', names=header)
# The user-item matrices are indexed by (id - 1), so they must be sized by the
# largest id rather than by the number of distinct ids: both agree when ids are
# contiguous 1..N, but a gap in the ids would otherwise index out of bounds.
n_users = int(df.userId.max())
n_items = int(df.movieId.max())

In [6]:
from sklearn.model_selection import train_test_split
# Seeded so the random 80/20 split is reproducible across kernel restarts.
# NOTE: the next cell reassigns train_data/test_data with a positional split,
# so this split is not the one used by the rest of the notebook.
train_data, test_data = train_test_split(df, test_size=0.2, random_state=42)

In [11]:
# Deterministic hold-out: the first n_split rows are the test set, the rest train.
n_split = 20000
test_data = df.iloc[:n_split]
train_data = df.iloc[n_split:]
train_data.shape[0], test_data.shape[0]

(80000, 20000)

In [13]:
# Preview only the first rows instead of rendering the whole 20000-row frame.
test_data.head(10)

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
5,298,474,4,884182806
6,115,265,2,881171488
7,253,465,5,891628467
8,305,451,3,886324817
9,6,86,3,883603013


In [14]:
def build_rating_matrix(ratings, n_rows, n_cols):
    """Return a dense ``(n_rows, n_cols)`` user-item matrix from a ratings frame.

    ``ratings`` must provide ``userId``, ``movieId`` and ``rating`` columns.
    Ids are shifted by one to become zero-based row/column positions, and
    every user-item pair that has no rating in ``ratings`` is left at 0.
    """
    matrix = np.zeros((n_rows, n_cols))
    # Single vectorized assignment instead of a Python-level loop over rows.
    matrix[ratings.userId.values - 1, ratings.movieId.values - 1] = ratings.rating.values
    return matrix


# Create two user-item matrices, one for training and another for testing
train_data_matrix = build_rating_matrix(train_data, n_users, n_items)
test_data_matrix = build_rating_matrix(test_data, n_users, n_items)

# Aplico SVD directamente

In [19]:
# Inspect the test matrix; cells never assigned a rating keep their initial 0.
test_data_matrix

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [20]:
# Rank-20 reconstruction of the raw training matrix, where unrated pairs are stored as 0.
SVD_Predictions = SVD(train_data_matrix, k=20)

In [21]:
# Score the plain SVD reconstruction on the rated entries of each split.
for label, truth in (('Training:', train_data_matrix), ('Testing:', test_data_matrix)):
    print(label, rmse(truth, SVD_Predictions))

Training: 2.301433287702103
Testing: 2.88799384762358


# ¿Qué pasa si predigo con la media?

In [23]:
# Global mean computed over the non-zero (rated) training entries only.
rated_train_values = train_data_matrix[train_data_matrix != 0]
mu = rated_train_values.mean()
print(mu)

3.52835


In [25]:
# Reference point: predict the global mean for every user-item pair.
for label, truth in (('Training:', train_data_matrix), ('Testing:', test_data_matrix)):
    print(label, rmse(truth, np.full(truth.shape, mu)))

Training: 1.1185576773237935
Testing: 1.1536759477860323


# Agrego la media al SVD

In [26]:
# Inspect the training matrix before centering it; zeros are pairs without a rating.
train_data_matrix

array([[5., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

In [27]:
# Subtract the mean from the rated cells only; the mask zeroes out every other cell.
rated_mask = train_data_matrix > 0
R_train_norm = rated_mask * (train_data_matrix - mu)

In [28]:
# Factorize the mean-centered matrix, then add mu back to return to the rating scale.
SVD_Predictions_norm = SVD(R_train_norm, k=20) + mu

In [31]:
# Score the mean-centered SVD predictions on the rated entries of each split.
for label, truth in (('Training:', train_data_matrix), ('Testing:', test_data_matrix)):
    print(label, rmse(truth, SVD_Predictions_norm))

Training: 0.902650106889517
Testing: 1.0834567446952197


# Agrego baselines

In [32]:
from cf_helper_2 import getBaselineEstimates

In [33]:
# Broadcasting: a column vector plus a row vector expands to a full 2-D grid.
# NOTE(review): per the shapes printed in the next cell, bui has 1682 rows and
# bii has 943 rows, while train_data_matrix is (n_users, n_items) and only lines
# up with baseline.T. That suggests bui holds per-item values and bii per-user
# values, i.e. the names may be swapped relative to what getBaselineEstimates
# returns with items_first=True. Confirm against cf_helper_2.
bii, bui = getBaselineEstimates(train_data_matrix, mu, lambda1 = 0, lambda2 = 0, items_first = True, not_rated = 0)
baseline = mu + bui + bii.T

In [34]:
# Check the shapes: baseline has to be transposed before it lines up with train_data_matrix.
bui.shape, bii.shape, baseline.shape

((1682, 1), (943, 1), (1682, 943))

In [35]:
# Residuals against the baseline, kept only where a rating exists (zero elsewhere).
rated_mask = train_data_matrix > 0
R_train_baseline = rated_mask * (train_data_matrix - baseline.T)

In [44]:
# Factorize the baseline residuals, then add the baseline back to get rating-scale predictions.
SVD_Predictions_baseline = SVD(R_train_baseline, k=20) + baseline.T

In [45]:
# Score the baseline-corrected SVD predictions on the rated entries of each split.
for label, truth in (('Training:', train_data_matrix), ('Testing:', test_data_matrix)):
    print(label, rmse(truth, SVD_Predictions_baseline))

Training: 0.7830014251153156
Testing: 0.9439824791061129
