In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn import tree

from surprise import KNNBasic
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise import BaselineOnly
from surprise import SVD

import time

In [2]:
# Reads the data from CSV files, converts it into Dataframe and returns x and y dataframes
def getTrainDataframe(filePath):
    dataframe = pd.read_csv(filePath)
    y = dataframe['rating']
    x = dataframe.drop('rating', axis=1)
    return x, y


# Reads the data from CSV files, converts it into Dataframe and returns x and y dataframes
def getTestDataframe(filePath):
    dataframe = pd.read_csv(filePath)
    ids = dataframe['Id']
    user = dataframe['userId']
    movies = dataframe['movieId']

    return ids, user, movies


# predicted_y and test_y are the predicted and actual y values respectively as numpy arrays
# function prints the mean squared error value for the test dataset
def compute_rmse(predicted_y, y):
    rmse = np.sum((predicted_y - y)**2)/predicted_y.shape[0]
    return np.sqrt(rmse)

In [3]:
train_file = 'train_ratings.csv'
val_file = 'val_ratings.csv'
test_file = 'test_ratings.csv'

val_x, val_y = getTrainDataframe(val_file)
test_ids, test_users, test_movies = getTestDataframe(test_file)

reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
train_data = Dataset.load_from_file(train_file, reader)
#self.full_trainset = train_data.build_full_trainset()
trainset, gobruins = train_test_split(train_data, test_size=0.001)
print(type(trainset))

val_data = Dataset.load_from_file(val_file, reader)
gobruins, valset = train_test_split(val_data, test_size=1.0)
print(valset[0])

d = {'1user': [str(item) for item in test_users], '2item': [str(item) for item in test_movies], '3rating': test_ids}
df = pd.DataFrame(data=d)
#print(df)
test_data = Dataset.load_from_df(df, reader)
test_trainset = test_data.build_full_trainset()
testset = test_trainset.build_testset()

print(testset[0])    

<class 'surprise.trainset.Trainset'>
('82766', '2643.0', 3.0)
('1', '4993.0', 0.0)


In [6]:
print("Starting training")

als = SVD(n_epochs = 27,n_factors = 90, lr_all = 0.00645, reg_all = 0.0298)
start = time.time()
als.fit(trainset)
end = time.time()
print("Model Trained, took ", end - start)

Starting training
Model Trained, took  2802.852623939514


In [7]:

start = time.time()
predictions = als.test(valset, verbose=False)
end = time.time()
rmse = accuracy.rmse(predictions)
print("Validation RMSE: ", rmse, ", took ", end - start)

RMSE: 0.8117
Validation RMSE:  0.8117295643453344 , took  186.5482838153839


In [8]:
start = time.time()
predicted_y = als.test(testset, verbose=False)
end = time.time()
print("Predicted test values, took ", end - start)

Predicted test values, took  89.97021007537842


In [9]:
file_name = "SVDv2.1"

predicted_y_est = [item[3] for item in predicted_y]

d = {'Id': test_ids.values, 'rating': predicted_y_est}
df = pd.DataFrame(data=d)
df.to_csv(file_name + "_output.csv", index=None)
print("Logged")

Logged
