In [1]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform, euclidean, seuclidean, sqeuclidean
import matplotlib.pyplot as plt
import networkx as nx
import collections

In [95]:
from surprise import SVD, SVDpp, CoClusteringlusteringlusteringlustering, KNNBaseline
from surprise import Dataset, Reader

In [2]:
data = pd.read_csv("Data/ml-100k/u.data", sep = "\t", header = None, names = ["user_id", "item_id", "rating", "ts"])
users = pd.read_csv("Data/ml-100k/u.user", sep = "|", header = None, names = ["user_id", "age", "sex", "occupation", "ts"])
movies = pd.read_csv("Data/ml-100k/u.item", sep = "|", encoding="ISO-8859-1", header = None, names = ["item_id", "movie_title", "release_date", "3", "IMdb_url", "unknown",
"Action",
"Adventure",
"Animation",
"Children\'s",
"Comedy",
"Crime",
"Documentary",
"Drama",
"Fantasy",
"Film-Noir",
"Horror",
"Musical",
"Mystery",
"Romance",
"Sci-Fi",
"Thriller",
"War",
"Western"])

# reduce ids by 1 for easier indexing
users.user_id = users.user_id.apply(lambda x: x-1)
movies.item_id = movies.item_id.apply(lambda x: x-1)
data.user_id = data.user_id.apply(lambda x: x-1)
data.item_id = data.item_id.apply(lambda x: x-1)


In [82]:
ratings = np.zeros((943, 1682))
for i in range(data.shape[0]):
                   ratings[data.user_id[i], data.item_id[i]] = data.rating[i]

indices = ((ratings != 0).sum(axis = 0) >= 5)
old_indices = [i for i, x in enumerate(indices) if x]
new_indices = np.arange(np.sum(indices))
indices_map = zip(new_indices, old_indices)

filtered_ratings = ratings[:, indices]
filtered_ratings.shape

filtered_data_indices = data['item_id'].map(lambda x: indices[x])
filtered_data = data[filtered_data_indices]

In [85]:
def evaluate_model(model, filtered_data, user_ratings, remove_split=0.4):
    n_users, n_movies = user_ratings.shape
    n_movies_to_remove = int(remove_split*n_movies)
    cols_to_remove = np.arange(n_movies)
    np.random.seed(1)
    np.random.shuffle(cols_to_remove)
    cols_to_remove = cols_to_remove[:n_movies_to_remove]

    # create test and train dataset
    test_indexes = filtered_data['item_id'].map(lambda movie_id: movie_id in cols_to_remove)
    train_indexes = ~test_indexes
    train_dataset = Dataset.load_from_df(filtered_data[train_indexes].drop("ts", axis=1), 
                                         Reader())
    train_dataset = train_dataset.build_full_trainset()
    
    # fit the model
    model.fit(train_dataset)

    # compute the RMSE on the test dataset
    mse = 0
    for i, row in filtered_data[test_indexes].iterrows():
        ground_truth = row['rating']
        estimated = model.predict(row['user_id'], row['item_id']).est
        mse += (ground_truth - estimated) ** 2
    rmse = np.sqrt(1 / 2 * mse / sum(test_indexes))
    return rmse

In [96]:
evaluate_model(CoClustering(), filtered_data, user_ratings)

0.7908705646905959

In [86]:
evaluate_model(SVD(), filtered_data, user_ratings)

0.7564376928469153

In [94]:
evaluate_model(SVDpp(), filtered_data, user_ratings)

0.7580321038775139