In [85]:
import pandas as pd
import statistics
import numpy as np
from io import StringIO
import numpy.linalg as lin
import matplotlib.pyplot as plt
import scipy.sparse
from scipy.sparse.linalg import svds
import random
import sys
import time
from pandas import Series
from math import pow
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# Task 1: Naive Approaches

### Import Data

In [63]:
movies = pd.read_table('ml-1m/movies.dat',
                   header=None, 
                   encoding= 'ISO-8859-1',
                   engine='python', 
                   sep = '::')
movies.columns = ['MovieID','Title','Genres']

In [64]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [86]:
ratings = pd.read_table('ml-1m/ratings.dat',
                   header=None, 
                   encoding= 'ISO-8859-1',
                   engine='python', 
                   sep = '::')
ratings.columns = ['UserID','MovieID','Rating','Timestamp']

In [66]:
ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [67]:
users = pd.read_table('ml-1m/users.dat',
                   header=None, 
                   encoding= 'ISO-8859-1',
                   engine='python', 
                   sep = '::')
users.columns = ['UserID','Gender','Age','Occupation','Zip-code']

In [68]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


### Calculating the 5 formulas from slide  17: 
R_global(User, Item)=mean(all ratings)

R_item(User, Item)=mean(all ratings for Item)

R_user(User, Item)=mean(all ratings for User)

In [87]:
#R_global i.e. the mean of all ratings
R_global = ratings['Rating'].mean()

In [88]:
#R_user i.e. the mean of all ratings per user
R_user = ratings.groupby(['UserID'])['Rating'].mean()

In [89]:
#R_item i.e. the mean of all ratings per movie
R_movie = ratings.groupby(['MovieID'])['Rating'].mean()

In [90]:
R = pd.merge(ratings,R_user,on='UserID')
R = pd.merge(R,R_movie, on='MovieID')
R = R[['Rating_x','Rating_y', 'Rating', 'MovieID', 'UserID']]
R = R.rename(columns={"Rating_x": "Rating", "Rating_y": "R_user", "Rating": "R_movie"})
R.head()

Unnamed: 0,Rating,R_user,R_movie,MovieID,UserID
0,5,4.188679,4.390725,1193,1
1,5,3.713178,4.390725,1193,2
2,4,3.826087,4.390725,1193,12
3,4,3.323383,4.390725,1193,15
4,5,4.075829,4.390725,1193,17


### 5-fold Cross Validation 

#### For R_global

In [91]:
random.seed(123)
kf5 = KFold(n_splits=5, shuffle=True)
y = ratings["Rating"]


RMSE_list = []
MAE_list = []


for train_index, test_index in kf5.split(ratings):
    #print("TRAIN:", train_index, "TEST:", test_index)
    y_tst = y[test_index]
    R_global = ratings.iloc[train_index]['Rating'].mean()
    X_global = np.full(len(y_tst), R_global)
    RMSE_list.append(mean_squared_error(X_global, y_tst, squared=False))
    MAE_list.append(mean_absolute_error(X_global, y_tst))
    
    
print(RMSE_list)
print(MAE_list)

print("Average RMSE over the 5 splits:", statistics.mean(RMSE_list))
print("Average MAE over the 5 splits:", statistics.mean(MAE_list))

[1.117169058530917, 1.1156258117098539, 1.1168009239152379, 1.1172677738189722, 1.1186418960789937]
[0.934235411704304, 0.9323082365558799, 0.9332340415656972, 0.934182884033819, 0.9353453802005982]
Average RMSE over the 5 splits: 1.117101092810795
Average MAE over the 5 splits: 0.9338611908120596


#### For R_movie

In [92]:
random.seed(123)
kf5 = KFold(n_splits=5, shuffle=True)


RMSE_list = []
MAE_list = []


for train_index, test_index in kf5.split(ratings):
    tr = ratings.iloc[train_index]
    tst = ratings.iloc[test_index]
    R_global = ratings.iloc[train_index]['Rating'].mean()
    R_movie = tr.groupby('MovieID')['Rating'].mean()
    R_movie = pd.merge(tst, R_movie, how='left', on='MovieID')
    R_movie = R_movie.fillna(R_global)
    

      
    RMSE_list.append(mean_squared_error(R_movie['Rating_x'], R_movie['Rating_y'], squared=False))
    MAE_list.append(mean_absolute_error(R_movie['Rating_x'],  R_movie['Rating_y']))
    
    
print(RMSE_list)
print(MAE_list)

print("Average RMSE over the 5 splits:", statistics.mean(RMSE_list))
print("Average MAE over the 5 splits:", statistics.mean(MAE_list))

[0.9788744705311263, 0.9782521736556755, 0.9808639214280408, 0.9792310060063731, 0.9801191175564901]
[0.7829314324431547, 0.7812823742253857, 0.7827742601735868, 0.7817640268933301, 0.7828220345646255]
Average RMSE over the 5 splits: 0.9794681378355412
Average MAE over the 5 splits: 0.7823148256600165


#### For R_user

In [93]:
random.seed(123)
kf5 = KFold(n_splits=5, shuffle=True)


RMSE_list = []
MAE_list = []


for train_index, test_index in kf5.split(ratings):
    tr = ratings.iloc[train_index]
    tst = ratings.iloc[test_index]
    R_global = ratings.iloc[train_index]['Rating'].mean()
    R_user = tr.groupby('UserID')['Rating'].mean()
    R_user = pd.merge(tst, R_user, how='left', on='UserID')
    R_user = R_user.fillna(R_global)
      
    RMSE_list.append(mean_squared_error(R_user['Rating_x'], R_user['Rating_y'], squared=False))
    MAE_list.append(mean_absolute_error(R_user['Rating_x'],  R_user['Rating_y']))
    
    
print(RMSE_list)
print(MAE_list)

print("Average RMSE over the 5 splits:", statistics.mean(RMSE_list))
print("Average MAE over the 5 splits:", statistics.mean(MAE_list))

[1.0328570408746982, 1.037005050154724, 1.0364735170963355, 1.0330125610965752, 1.0379563143596293]
[0.8262842688843691, 0.8302974358450335, 0.829460153683896, 0.8273412991721857, 0.8313851337836927]
Average RMSE over the 5 splits: 1.0354608967163925
Average MAE over the 5 splits: 0.8289536582738354


#### 5-fold Cross Validation - for linear regression

In [94]:
random.seed(123)
kf5 = KFold(n_splits=5, shuffle=True)
X = R[['R_user', 'R_movie']].to_numpy()
y = np.array(list(R['Rating']))

Index = []
X_train = []
X_test = []
y_train = []
y_test = []

for train_index, test_index in kf5.split(X):
    Index.append(("TRAIN:", train_index, "TEST:", test_index))
    X_tr, X_tst = X[train_index], X[test_index]
    y_tr, y_tst = y[train_index], y[test_index]
    X_train.append(X_tr)
    X_test.append(X_tst)
    y_train.append(y_tr)
    y_test.append(y_tst)

#### Linear Regression without gamma

In [99]:
RMSE_list = []
coefficients = []
MAE_list = []
for i in range(0,5):
    regressor = LinearRegression(fit_intercept=False)
    model = regressor.fit(X_train[i], y_train[i])
    y_predicted = regressor.predict(X_test[i])
    RMSE = mean_squared_error(y_test[i], y_predicted, squared=False)
    RMSE_list.append(RMSE)
    print("RMSE:",RMSE)
    MAE_list.append(mean_absolute_error(y_test[i], y_predicted))
    print('MAE:',mean_absolute_error(y_test[i], y_predicted))
    coefficients.append([model.coef_])
print("average RMSE:", statistics.mean(RMSE_list))
print("Alpha, Beta:", coefficients)
print("average MAE:", statistics.mean(MAE_list))

RMSE: 0.9511589187128604
MAE: 0.7628324948960349
RMSE: 0.9461900080470297
MAE: 0.7584629493148507
RMSE: 0.9460745752096777
MAE: 0.7580882759098737
RMSE: 0.9472732893929404
MAE: 0.7589013327872721
RMSE: 0.9461942359485344
MAE: 0.7578217649466332
average RMSE: 0.9473782054622085
Alpha, Beta: [[array([0.36717047, 0.63999173])], [array([0.36768677, 0.6392783 ])], [array([0.36801343, 0.63900673])], [array([0.36764092, 0.63928471])], [array([0.36780424, 0.63921147])]]
average MAE: 0.759221363570933


#### Linear Regression with gamma

In [100]:
RMSE_list = []
coefficients = []
gamma = []
MAE_list = []
for i in range(0,5):
    regressor = LinearRegression(fit_intercept=True)
    model = regressor.fit(X_train[i], y_train[i])
    y_predicted = regressor.predict(X_test[i])
    RMSE = mean_squared_error(y_test[i], y_predicted, squared=False)
    RMSE_list.append(RMSE)
    print("RMSE:",RMSE)
    MAE_list.append(mean_absolute_error(y_test[i], y_predicted))
    print("MAE:",mean_absolute_error(y_test[i], y_predicted))
    coefficients.append([model.coef_])
    gamma.append(model.intercept_)
print("average RMSE:", statistics.mean(RMSE_list))
print("Alpha, Beta:",coefficients)
print("Gamma:",gamma)
print("average MAE:", statistics.mean(MAE_list))

RMSE: 0.9186387690093408
MAE: 0.728631140621893
RMSE: 0.9142721850603
MAE: 0.7248076043745912
RMSE: 0.915212243262717
MAE: 0.7259136621397649
RMSE: 0.9154728993416854
MAE: 0.7254197602035103
RMSE: 0.9143596331584964
MAE: 0.7245096859149507
average RMSE: 0.9155911459665079
Alpha, Beta: [[array([0.78019129, 0.87555096])], [array([0.7820653 , 0.87565654])], [array([0.78340364, 0.87645838])], [array([0.782409  , 0.87561848])], [array([0.7825708 , 0.87541553])]]
Gamma: [-2.3479621440607548, -2.355900380380261, -2.3634800680999195, -2.3570649278739575, -2.356575558523821]
average MAE: 0.725856370650942


# Task 2: UV Matrix

In [103]:
random.seed(123)

ratings = pd.read_csv('ml-1m/ratings.dat', engine='python', sep='::', names=['user_id', 'movie_id', 'rating', 'timestamp'])
X = ratings.copy()
kf = KFold(n_splits=5, shuffle=True)


#Run for each split
for train_index, test_index in kf.split(X):
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    R_df = X_train.pivot(index = 'user_id', columns ='movie_id', values = 'rating')
    avg_user = R_df.mean(axis=1)
    R = R_df.to_numpy()
    avg_user = avg_user.to_numpy()
    normalized = R - avg_user.reshape(-1,1)
    
    #Innitialize U and V
    M = normalized.copy()
    U = np.full((normalized.shape[0],2), 1)
    V = np.full((2,normalized.shape[1]), 1)
    U = U.astype(np.float32)
    V = V.astype(np.float32)
    UV = np.dot(U,V)
    
    #Iterate 10 times
    print("TRAIN:", train_index, "TEST:", test_index)
    for i in range(10):
        for r in range(6040):
            for s in range(2):
                V_kj = V[:,:]
                U_rk = U[r,:]
                sumer = 0
                
                deletedU = np.delete(U_rk, s, 0)
                deletedV = np.delete(V_kj, s, 0)
                
                V_sj = V[s,:]
                V_2 = V_sj ** 2

                dotUV = np.dot(deletedU, deletedV)
                M_rj = M[r,:]

                error = M_rj - dotUV

                V_error = V_sj * error
                sumer = np.nansum(V_error)
                vsum = np.nansum((V_2)*(~np.isnan(M_rj)))
                newU = sumer / vsum
                U[r,s] = U[r,s] + ((newU - U[r,s]))
                
        # Update V 
        for r in range(2):
            for s in range(R.shape[1]):
                V_ks = V[:,s]
                U_ik = U[:,:]
                sumer = 0
                
                deletedU1 = np.delete(U_ik, r, 1)
                deletedV1 = np.delete(V_ks, r, 0)
                
                U_ir = U[:,r]
                U_2 = U_ir ** 2

                dotUV1 = np.dot(deletedU1, deletedV1)
                M_is = M[:,s]
                error = M_is - dotUV1

                U_error = U_ir * error
                sum_error = np.nansum(U_error)
                usum = np.nansum(U_2*(~np.isnan(M_is)))
                newV = sum_error / usum
                V[r,s] = V[r,s] + ((newV - V[r,s]))

        UV = np.dot(U,V)
        
        UV_normalized = np.nan_to_num((UV - normalized)**2)
        UV_normalized_sum = np.sum(UV_normalized,axis=0)
        UV_sums1 = difference_squared_sum.sum()
        counter1 = np.count_nonzero(UV_normalized)
        RMSE = UV_sums1/counter1
        
        absolute_UV = np.nan_to_num(np.absolute(UV - normalized))
        absolute_UV_sum = np.sum(absolute_UV, axis=0)
        UV_sums2 = absolute_UV_sum.sum()
        counter2 = np.count_nonzero(absolute_UV)
        MAE = UV_sums2/counter2
        
        print("Iteration: ", i)
        print("RMSE:", RMSE)
        print('MAE:', MAE)

TRAIN: [      0       1       2 ... 1000205 1000206 1000207] TEST: [      3       4      13 ... 1000194 1000199 1000208]
Iteration:  0
RMSE: 0.7641297395321478
MAE: 0.7241943643169587
Iteration:  1
RMSE: 0.7641297395321478
MAE: 0.7030331648968386
Iteration:  2
RMSE: 0.7641297395321478
MAE: 0.6979455241535769
Iteration:  3
RMSE: 0.7641297395321478
MAE: 0.6945749573046044
Iteration:  4
RMSE: 0.7641297395321478
MAE: 0.6921520894195488
Iteration:  5
RMSE: 0.7641297395321478
MAE: 0.6903959383014484
Iteration:  6
RMSE: 0.7641297395321478
MAE: 0.6891142927850505
Iteration:  7
RMSE: 0.7641297395321478
MAE: 0.6881631226156776
Iteration:  8
RMSE: 0.7641297395321478
MAE: 0.6874386581166821
Iteration:  9
RMSE: 0.7641297395321478
MAE: 0.6868662192266337
TRAIN: [      0       1       2 ... 1000203 1000204 1000208] TEST: [      8      11      20 ... 1000205 1000206 1000207]
Iteration:  0
RMSE: 0.7641335594120946
MAE: 0.7251144353720037
Iteration:  1
RMSE: 0.7641335594120946
MAE: 0.7040688934815831
It

# Task 3:  Matrix Factorization with Gradient Descent and Regularization

In [114]:
random.seed(123)

def MF_gravity(X, U, M, K, num_iter = 3, learn_rate=0.005, regularization=0.05):
    M = M.T
    All_Errors = []
    iteration = 0
    for step in range(num_iter):
        print('Iteration', iteration)
        iteration += 1
        for i in range(len(X)):
             for j in range(len(X[i])):
                    if X[i][j] > 0:
                    #Training error 
                        eij = X[i][j]-np.dot(U[i,:], M[:,j])
                    #Regularization
                        for k in range(K):
                            U[i][k] += learn_rate * (2 * eij * M[k][j] - regularization * U[i][k])
                            M[k][j] += learn_rate * (2 * eij * U[i][k] - regularization * M[k][j])

        StandardError = 0
        for i in range(len(X)):
             for j in range(len(X[i])):
                    if X[i][j] > 0:
                        StandardError += pow(X[i][j]-np.dot(U[i,:],M[:,j]),2)
                        for k in range(K):
                            StandardError += (regularization / 2)*(pow(U[i][k],2)+pow(M[k][j],2))
        All_Errors.append(StandardError)
        if StandardError < 0.001:
            break
    return U, M.T, All_Errors,StandardError

In [115]:
random.seed(123)
ratings = pd.read_csv('ml-1m/ratings.dat', engine='python', sep='::', names=['UserID', 'MovieID', 'Rating', 'timestamp'])
kf = KFold(n_splits=5, shuffle=True)
X = ratings[["UserID", "Rating", "MovieID"]]
y = ratings["Rating"]


for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X.loc[train_index], X.loc[test_index]
    y_train, y_test = y.loc[train_index], y.loc[test_index]
    X_train = pd.DataFrame(X_train)
    X_test = pd.DataFrame(X_test)
    y_train = pd.DataFrame(y_train)
    y_test = pd.DataFrame(y_test)
    R_df = X_train.pivot(index = 'UserID', columns ='MovieID', values = 'Rating', ).fillna(0)
    
    avg_user = R_df.mean(axis=1)
    R = R_df.to_numpy()
    avg_user = avg_user.to_numpy()
    normalized = R - avg_user.reshape(-1,1)
    
    userids = R_df.index
    movieids = R_df.columns
    Nusers = R.shape[0]
    Nmovies = R.shape[1]

   
    users = np.random.rand(Nusers, 10)
    movies = np.random.rand(Nmovies, 10)

    predicted_users, predicted_movies, All_Errors, error = MF_gravity(R, users, movies, 10)
    
    predicted = np.dot(predicted_users, predicted_movies.T)
    print(predicted)
    print('Error:',error)
    print('Result:', All_Errors)

TRAIN: [      0       1       2 ... 1000203 1000206 1000207] TEST: [      4      18      23 ... 1000204 1000205 1000208]
Iteration 0


KeyboardInterrupt: 