In [71]:
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import GridSearchCV
from surprise import KNNWithMeans
import numpy as np
import pandas as pd

In [73]:
# Load Data
data = Dataset.load_builtin("ml-100k")

In [6]:
# Find optimal hyper-parameters for SVD algorithm
param_grid = {
    "n_epochs": [10],
    "lr_all": [0.005],
    "reg_all": [0.4]
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3, refit=True)

gs.fit(data)

print(gs.best_score["rmse"])
print(gs.best_params["rmse"])

0.9630304753096999
{'n_epochs': 10, 'lr_all': 0.005, 'reg_all': 0.4}


In [74]:
trainingSet = data.build_full_trainset()

svd = SVD(n_epochs=10, lr_all=0.005, reg_all=0.04)

svd.fit(trainingSet)

prediction = svd.predict('E', 2)
print(prediction.est)

3.52986


In [131]:
df = pd.DataFrame(columns=['user', 'item', 'rating'], data=list(trainingSet.all_ratings()))
ratings = np.asarray(pd.pivot_table(df, index='item', columns = 'user', values='rating'))
ratings = np.where(np.isnan(ratings), 0, ratings)

In [249]:
# User distance matrix
users_distance_matrix = ratings.T.dot(ratings)
norms = np.array([np.sqrt(np.diagonal(users_distance_matrix))])
users_distance_matrix = users_distance_matrix / (norms * norms.T) + 1e-9

# Movie distance matrix
movies_distance_matrix = ratings.dot(ratings.T)
norms = np.array([np.sqrt(np.diagonal(movies_distance_matrix))])
movies_distance_matrix = movies_distance_matrix / (norms * norms.T) + 1e-9

# Get movie ratings via user-item filtering
# 1. Dot multiply the ratings matrix by the user_similarity_matrix
user_item_predictions = users_distance_matrix.dot(ratings.T) / np.array([np.abs(users_distance_matrix).sum(axis=1)]).T

# Get movie ratings via item-item filtering
# 1. Dot multiply the ratings matrix by the user_similarity_matrix
item_item_predictions = ratings.T.dot(movies_distance_matrix) / np.array([np.abs(movies_distance_matrix).sum(axis=1)])

In [248]:
from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

print('User-based CF MSE: ' + str(get_mse(user_item_predictions.T, ratings)))
print('Item-based CF MSE: ' + str(get_mse(item_item_predictions.T, ratings)))

User-based CF MSE: 7.843480858975423
Item-based CF MSE: 8.70364050625722
