In [157]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform, euclidean, seuclidean, sqeuclidean
import matplotlib.pyplot as plt
import networkx as nx
import collections
from copy import copy

In [95]:
from surprise import SVD, SVDpp, CoClusteringlusteringlusteringlustering, KNNBaseline
from surprise import Dataset, Reader

In [2]:
data = pd.read_csv("Data/ml-100k/u.data", sep = "\t", header = None, names = ["user_id", "item_id", "rating", "ts"])
users = pd.read_csv("Data/ml-100k/u.user", sep = "|", header = None, names = ["user_id", "age", "sex", "occupation", "ts"])
movies = pd.read_csv("Data/ml-100k/u.item", sep = "|", encoding="ISO-8859-1", header = None, names = ["item_id", "movie_title", "release_date", "3", "IMdb_url", "unknown",
"Action",
"Adventure",
"Animation",
"Children\'s",
"Comedy",
"Crime",
"Documentary",
"Drama",
"Fantasy",
"Film-Noir",
"Horror",
"Musical",
"Mystery",
"Romance",
"Sci-Fi",
"Thriller",
"War",
"Western"])

# reduce ids by 1 for easier indexing
users.user_id = users.user_id.apply(lambda x: x-1)
movies.item_id = movies.item_id.apply(lambda x: x-1)
data.user_id = data.user_id.apply(lambda x: x-1)
data.item_id = data.item_id.apply(lambda x: x-1)


In [82]:
ratings = np.zeros((943, 1682))
for i in range(data.shape[0]):
                   ratings[data.user_id[i], data.item_id[i]] = data.rating[i]

indices = ((ratings != 0).sum(axis = 0) >= 5)
old_indices = [i for i, x in enumerate(indices) if x]
new_indices = np.arange(np.sum(indices))
indices_map = zip(new_indices, old_indices)

filtered_ratings = ratings[:, indices]
filtered_ratings.shape

filtered_data_indices = data['item_id'].map(lambda x: indices[x])
filtered_data = data[filtered_data_indices]

In [103]:
def evaluate_model(model, filtered_data, user_ratings, remove_split=0.4):
    n_users, n_movies = user_ratings.shape
    n_movies_to_remove = int(remove_split*n_movies)
    cols_to_remove = np.arange(n_movies)
    np.random.seed(1)
    np.random.shuffle(cols_to_remove)
    cols_to_remove = cols_to_remove[:n_movies_to_remove]

    # create test and train dataset
    test_indexes = filtered_data['item_id'].map(lambda movie_id: movie_id in cols_to_remove)
    train_indexes = ~test_indexes
    train_dataset = Dataset.load_from_df(filtered_data[train_indexes].drop("ts", axis=1), 
                                         Reader())
    train_dataset = train_dataset.build_full_trainset()
    
    # fit the model
    model.fit(train_dataset)

    # compute the RMSE on the test dataset
    mse = 0
    for i, row in filtered_data[test_indexes].iterrows():
        ground_truth = row['rating']
        estimated = model.predict(row['user_id'], row['item_id']).est
        mse += (ground_truth - estimated) ** 2
    rmse = np.sqrt(1 / 2 * mse / sum(test_indexes))
    return rmse, model

In [104]:
rmse_cocl, cocl = evaluate_model(CoClustering(), filtered_data, user_ratings)
print(rmse_cocl)

0.7908705646905959


In [86]:
rmse_svd, svd = evaluate_model(SVD(), filtered_data, user_ratings)
print(rmse_svd)

0.7564376928469153

In [111]:
n_users, n_movies = ratings.shape

In [112]:
def get_all_estimates(model, user):
    result = np.zeros(n_movies)
    for i, movie_id in enumerate(range(n_movies)):
        result[i] = model.predict(user, movie_id).est
    return result

In [193]:
def get_recommendations(signal, s, n_recommendations, n_user):
    indices = ((ratings != 0).sum(axis = 0) >= 5)
    rated_indices = (signal != 0)
    s[rated_indices] = -1
    ratings_user = copy(ratings[n_user])
    ratings_user[indices] = s[indices]
    sorted_indices = np.argsort(-ratings_user)
    sorted_values = ratings_user[sorted_indices]
    return sorted_indices[:n_recommendations], sorted_values[:n_recommendations]

In [194]:
def print_recommendations(recommendation_indices):
    movie_names = movies.movie_title[recommendation_indices]
    print(movie_names)

In [200]:
n_user = 133
signal = ratings[n_user]
s = get_all_estimates(cocl, n_user)
get_recommendations(signal, s, 10, n_user)

(array([ 173, 1448,  965,   21, 1250,  407,  317,  168,   63, 1202]),
 array([4.77896795, 4.76549856, 4.71889469, 4.67810215, 4.651587  ,
        4.63156999, 4.60694151, 4.60660025, 4.58572824, 4.57420604]))

In [202]:
recommendation_indices, recommendation_values = get_recommendations(signal, s, 20, n_user)
print(recommendation_values[:10])
print_recommendations(recommendation_indices)

[4.77896795 4.76549856 4.71889469 4.67810215 4.651587   4.63156999
 4.60694151 4.60660025 4.58572824 4.57420604]
173                        Raiders of the Lost Ark (1981)
1448                               Pather Panchali (1955)
965                         Affair to Remember, An (1957)
21                                      Braveheart (1995)
1250                                A Chef in Love (1996)
407                                 Close Shave, A (1995)
317                               Schindler's List (1993)
168                            Wrong Trousers, The (1993)
63                       Shawshank Redemption, The (1994)
1202                                       Top Hat (1935)
95                      Terminator 2: Judgment Day (1991)
11                             Usual Suspects, The (1995)
209             Indiana Jones and the Last Crusade (1989)
113     Wallace & Gromit: The Best of Aardman Animatio...
944                                        Charade (1963)
530              