In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
from SVD import SVD_SGD
from SVD import SVD_ALS
from SVD import SVD_SGD_python
from sklearn.model_selection import train_test_split

In [2]:
# Loads movie data
df = pd.read_csv('../Data/ml-latest-small/ratings.csv')
max_U = np.max(df['userId'])
max_I = np.max(df['movieId'])

In [3]:
# Separates into training, cross-validation and testing sets
p_test = 0.2
p_xvalid = 0.2
df_train, df_test = train_test_split(df, test_size=p_test)
df_train, df_xvalid = train_test_split(df_train,test_size=p_xvalid/(1-p_test))

In [4]:
start = time.time()
e_arr, mu, bu, bi, p, t = SVD_ALS(df_train, n_factors=3, n_epochs=6);
end = time.time()
dt_als = end-start
print 'This function took %f seconds' % dt_als

This function took 14.995691 seconds


In [7]:
print e_arr

[ 0.49983682  0.48832385  0.48172155  0.4772566   0.47400444  0.47160115]


In [5]:
# users Id, movie Id, and rating cross-validation
users_xvalid = df_xvalid['userId']
movies_xvalid = df_xvalid['movieId']
r_xvalid = df_xvalid['rating']

# users Id, movie Id, and rating test set
users_test = df_test['userId']
movies_test = df_test['movieId']
r_test = df_test['rating']

In [10]:
# Creates function to help with cross validation 
# Ouputs mean absolute error and model parameters
def SVDcv(lambda_p, lambda_t, n_fact, p_cv=None, t_cv=None):
    
    e_arr, mu, bu, bi, p, t = SVD_ALS(
        df_train,
        n_factors=n_fact, maxUserId=max_U, maxItemId=max_I,
        n_epochs=15, epsilon=1.0, e_tol=0.005,
        lam_p=lambda_p, lam_t=lambda_t,
        p_init=p_cv, t_init=t_cv)
    
    r_preds = mu + bu[users_xvalid] + bi[movies_xvalid]
    r_preds += np.sum(p[users_xvalid]*t[movies_xvalid],axis=1)
    MAE = np.mean(np.abs(r_xvalid-r_preds))

    return (MAE, mu, bu, bi, p, t)

In [11]:
# Random variable sweep
from random import randint
from random import uniform

param_arr = [
    [10.0**uniform(-1,2), 10.0**uniform(-1,2), randint(1,20)]
    for ii in range(50)]
df_params = pd.DataFrame(data=param_arr, 
             columns=['lambda_p', 'lambda_t', 'n_factors']);

In [12]:
MAE_arr = []
n_fact_prev = 0
for idx, params in enumerate(param_arr):
    lam_p, lam_t, n_fact = params
    MAE, mu, bu, bi, p, t = SVDcv(lam_p, lam_t, n_fact)
    print MAE
    MAE_arr.append(MAE)
MAE_arr = np.array(MAE_arr)
df_params.loc[:,'MAE'] = pd.Series(MAE_arr)
df_params.to_csv('MAE_param_sweep_ALS.csv')

0.758585702905
0.857613228338
0.710849768854
0.829695949342
0.818296037796
0.732908630301
0.774835258915
0.706708566791
0.744795060363
0.722749695348
0.878848055636
0.875239223453
0.711589567671
0.791079074863
0.711027526703
0.762126468934
0.831625275253
0.8136855318
0.870742688767
0.875539901831
0.711589550224
0.852548432088
0.725616351943
0.703466693464
0.712049552431
0.711579900848
0.849791066197
0.709859131222
0.771855844355
0.806812253335
0.711407899857
0.713657806522
0.711570020679
0.833028890209
0.921272516863
0.793187717755
0.700420381792
0.707032372567
0.711589528109
0.750916588555
0.855170975184
0.840512048227
0.711589557014
0.699796709758
0.715672603734
0.859218330149
0.877206812729
0.755813764881
0.711589563394
0.701517238048


In [21]:
df_params.iloc[df_params['MAE'].argmin()]

lambda_p      4.291426
lambda_t     25.745868
n_factors    17.000000
MAE           0.699797
Name: 43, dtype: float64

In [23]:
# Finds mean absolute error of test set at best parameters
e_arr, mu, bu, bi, p, t = SVD_ALS(
    df_train,
    n_factors=17, maxUserId=max_U, maxItemId=max_I,
     n_epochs=30, epsilon=1.0, e_tol=0.0025,
    lam_p=4.291426, lam_t=25.745868)
    
r_preds = mu+bu[users_test]+bi[movies_test]
r_preds += np.sum(p[users_test]*t[movies_test],axis=1)
MAE = np.mean(np.abs(r_test-r_preds))

print MAE

0.706698448699
