In [2]:
import os
import pandas as pd

workdir = os.getcwd()
def film_data():
    with open(os.path.join(workdir, 'selected_films.txt')) as file:
        lines = file.readlines()
        
    lines = map(str.strip, lines)
    lines = [[line[:4], line[5:]] for line in lines]
    lines = [[int(line[0])] + [line[1]] + [i] for i, line in enumerate(lines, 1)]   
    lines = list(lines)
    df = pd.DataFrame(lines, columns=['Year', 'Title', 'Index'])
    return df

def user_data():
    with open(os.path.join(workdir, 'm_u_ratings.txt')) as file:
        lines = file.readlines()
    
    lines = map(str.strip, lines)
    lines = [map(int, line.split()) for line in lines]
    df = pd.DataFrame(lines, columns=['MovieID', 'UserID', 'Rating'])
    return df


In [3]:
ratings = user_data()
ratings.head()

Unnamed: 0,MovieID,UserID,Rating
0,1,2897,3
1,1,6549,4
2,1,389,4
3,1,287,3
4,1,8867,3


In [4]:
films = film_data()
films.head()

Unnamed: 0,Year,Title,Index
0,2000,Miss Congeniality,1
1,1996,Independence Day,2
2,2000,The Patriot,3
3,2004,The Day After Tomorrow,4
4,2003,Pirates of the Caribbean: The Curse of the Bla...,5


In [5]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(ratings, test_size=0.1)

In [6]:
test.head()

Unnamed: 0,MovieID,UserID,Rating
4165088,551,5510,5
4786361,660,5321,4
962454,111,4820,2
4839094,669,3103,5
5710785,832,1443,5


In [7]:
n_users = ratings.loc[ratings['UserID'].idxmax()]['UserID']
n_items = int(films["Index"].count())


In [36]:
import numpy as np

def calc_data_matrix(train):

    data_matrix = np.zeros((n_users, n_items))

    for line in train.itertuples():
                    # User ID .  #movieID . #rating
        data_matrix[line.UserID-1, line.MovieID-1] = line.Rating
    return data_matrix


In [37]:
from sklearn.metrics.pairwise import pairwise_distances 

def get_similarities(data_matrix):
    user_similarity = pairwise_distances(data_matrix, metric='cosine')
    item_similarity = pairwise_distances(data_matrix.T, metric='cosine')
    return user_similarity, item_similarity


In [38]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

In [34]:
from sklearn.metrics import mean_absolute_error as mse

def get_y_pred(prediction, test):

    y_pred = []
    for i, movieID, userID, Rating in test.itertuples():
        rating = prediction[userID-1][movieID-1]
        y_pred.append(rating)
    return y_pred

def predict_mse(prediction, test):
    y_pred = get_y_pred(prediction, test)
    y_true = test['Rating'].values
    return mse(y_true, y_pred, sample_weight=None, multioutput='uniform_average')


def pred_mse(train, test):
    """ returns mse for user_prediction and item_prediction """
    data_matrix = calc_data_matrix(train)
    user_similarity, item_similarity = get_similarities(data_matrix)
    
    user_prediction = predict(data_matrix, user_similarity, type='user')
    item_prediction = predict(data_matrix, item_similarity, type='item')
    
    mse_user = predict_mse(user_prediction, test) 
    mse_item = predict_mse(item_prediction, test)

    return mse_user, mse_item

In [15]:
from sklearn.metrics import mean_absolute_error as mse
from sklearn.model_selection import KFold

kf = KFold(n_splits=10)

for train_index, test_index in kf.split(ratings):
    train, test = ratings.iloc[train_index], ratings.iloc[test_index] 
    train, test = train_test_split(ratings, test_size=0.1)
    mse_user, mse_item = pred_mse(train, test)
    mse_user_error += mse_user
    mse_item_error += mse_item


mse_user_error /= 10
mse_item_error /= 10

In [16]:
mse_user_error, mse_item_error

(1.7049388250414481, 2.105491518290044)

# Question 1


a) MAE for user based collaborative filtering: 1.705

b) MAE for item based collaborative filtering: 2.105



In [18]:
import random
from functools import reduce

# Selected 100 users ratings, 
users = [random.randint(1, n_users) for _ in range(100)]

# Select 10% of each user's ratings 
test = [ratings[ratings.UserID==userid].sample(frac=.1) for userid in users]
test = pd.concat(test)

In [32]:
# Drop the test from the training set
train = ratings.drop(test.index)

9861256 9871158


In [39]:
data_matrix = calc_data_matrix(train)
user_similarity, item_similarity = get_similarities(data_matrix)

user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')

In [51]:
test.head()

Unnamed: 0,MovieID,UserID,Rating
5006431,699,4633,4
2060814,252,4633,3
7385980,1207,4633,4
5132607,722,4633,2
8267774,1441,4633,2


In [74]:
from collections import namedtuple

def nlargest(n, user_movie_ratings):
    Rating = namedtuple("Rating", ['movieID', 'rating'])
    ratings = []
    
    for i, predicted_rating in enumerate(user_movie_ratings):
        pass
    
def datamatrix_to_df(data_matrix):
    return pd.DataFrame(data_matrix)
    
        
df = datamatrix_to_df(data_matrix)
df.index = np.arange(1, len(df)+1)
df.columns = np.arange(1, len(df.columns.values) + 1)

# Predcitions for the 100 users
predictions = df[df.index == 433]

predictions.mask(prediction!=4633)

# # Need to select the movies for each user
# for _, ratings in predictions.iterrows():
#     userID = ratings.index
#     movieRatings = list(enumerate(ratings.values, 1))
#     print(movieRatings)
#     break

NameError: name 'prediction' is not defined