# Setup

The setup is mostly the same of user-user, so for explanation of what is being done is in the other file, the differences will be explained.

In [46]:
import pandas as pd
import numpy as np

In [47]:
def crop(df, percent):
    print("Users:", len(df["userId"].unique()))
    print("Movies:", len(df["movieId"].unique()))
    user_counts = df["userId"].value_counts()
    top_users = user_counts[user_counts > user_counts.quantile(percent)]
    df = df[df["userId"].isin(top_users.index)]
    movie_counts = df["movieId"].value_counts()
    top_movies = movie_counts[movie_counts > movie_counts.quantile(percent)]
    df = df[df["movieId"].isin(top_movies.index)]
    print(df.shape)
    print("Users:", len(df["userId"].unique()))
    print("Movies:", len(df["movieId"].unique()))
    return df

In [48]:
df = pd.read_csv("./rating.csv", sep=",")
df = crop(df, 0.97)
df.drop(columns=["timestamp"], inplace=True)

Users: 138493
Movies: 26744
(1761018, 4)
Users: 4144
Movies: 770


In [49]:
matrix = df.pivot(index="userId", columns="movieId", values="rating")
matrix.head()

movieId,1,2,6,10,11,16,17,19,21,22,...,50872,51255,51662,54286,55820,56367,58559,59315,60069,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54,4.0,3.0,3.0,4.0,5.0,2.0,2.0,,2.0,3.0,...,,,,,,,,,,
104,,,,,,,3.0,,3.0,,...,,,,,,,,,,
116,3.0,2.0,1.5,2.0,2.0,3.5,,2.5,3.5,2.0,...,,,,,,,,,,
156,5.0,5.0,4.0,4.0,5.0,4.0,4.0,,4.0,4.0,...,,,,,,,,,,
208,4.0,,,,3.0,1.5,5.0,,4.5,,...,4.0,,3.0,4.5,4.0,4.5,3.5,,3.5,


Instead of getting the deviation for the users, we will get the deviation for the movies

In [50]:
movie_mean = matrix.apply(lambda column: column.mean(), axis=0)
movie_mean.head()

movieId
1     3.874794
2     2.918415
6     3.781544
10    3.258226
11    3.407371
dtype: float64

In [51]:
matrix = matrix.apply(lambda row: row - row.mean(), axis=1)
matrix.head()

movieId,1,2,6,10,11,16,17,19,21,22,...,50872,51255,51662,54286,55820,56367,58559,59315,60069,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54,0.289617,-0.710383,-0.710383,0.289617,1.289617,-1.710383,-1.710383,,-1.710383,-0.710383,...,,,,,,,,,,
104,,,,,,,0.027132,,0.027132,,...,,,,,,,,,,
116,0.43763,-0.56237,-1.06237,-0.56237,-0.56237,0.93763,,-0.06237,0.93763,-0.56237,...,,,,,,,,,,
156,0.913861,0.913861,-0.086139,-0.086139,0.913861,-0.086139,-0.086139,,-0.086139,-0.086139,...,,,,,,,,,,
208,0.381643,,,,-0.618357,-2.118357,1.381643,,0.881643,,...,0.381643,,-0.618357,0.881643,0.381643,0.881643,-0.118357,,-0.118357,


# Item-Item Collaborative Filtering 

## Correlation Matrix

In [52]:
min_common_items = 10
correlation_matrix = matrix.corr(method="pearson", min_periods=min_common_items)

In [53]:
correlation_matrix.head()

movieId,1,2,6,10,11,16,17,19,21,22,...,50872,51255,51662,54286,55820,56367,58559,59315,60069,79132
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.03235,-0.045474,0.00793,0.09192,-0.101956,0.029521,-0.118515,0.017264,-0.067205,...,0.206318,0.04864,-0.127633,0.027838,0.017583,0.050641,0.035911,0.052996,0.174475,-0.015871
2,0.03235,1.0,-0.077871,0.048924,0.022701,-0.082803,-0.022878,0.118032,-0.053834,0.143411,...,-0.032416,-0.008662,0.03162,-0.093059,-0.191137,-0.06212,-0.075859,-0.002581,-0.014914,-0.012359
6,-0.045474,-0.077871,1.0,0.070428,-0.009265,0.300902,-0.059671,-0.039032,0.045133,-0.048892,...,-0.013473,0.021281,-0.007797,0.089102,0.139498,-0.052589,0.094676,0.01164,0.064027,0.121563
10,0.00793,0.048924,0.070428,1.0,0.094478,0.020054,-0.048068,0.106639,-0.004536,0.110075,...,-0.037923,-0.001136,0.007451,0.059771,-0.092602,-0.134255,0.053913,0.138779,-0.024307,-0.006317
11,0.09192,0.022701,-0.009265,0.094478,1.0,-0.113211,0.097418,-0.135341,-0.028771,0.205496,...,0.040852,-0.086583,-0.058262,0.039799,-0.16752,-0.02482,-0.06872,-0.000442,-0.020111,-0.059836


## Recomendation

In [54]:

def predict_ratings(user_id, top_n=20):
    def weighted_avg_rating(movie):
        movie = movie.iloc[0]
        top_20_corr_movies = correlation_matrix.loc[matrix.loc[user_id].notna()].abs()[movie].nlargest(top_n).index[1:]
        rating = (matrix.loc[user_id, top_20_corr_movies] * correlation_matrix.loc[movie, top_20_corr_movies]).sum() / correlation_matrix.loc[movie, top_20_corr_movies].abs().sum() if correlation_matrix.loc[movie, top_20_corr_movies].abs().sum() != 0 else 0 
        return rating
    unrated_movies = matrix.loc[user_id][matrix.loc[user_id].isna()].index
    predicted_ratings = pd.DataFrame(index=unrated_movies, data=unrated_movies).apply(weighted_avg_rating, axis=1)
    predicted_ratings.columns = ['movie_id', 'predicted_rating']

    return (predicted_ratings).dropna()

# Tests

In [55]:
def predict_ratings_test(user_id, top_n=20):
    def weighted_avg_rating(movie):
        movie = movie.iloc[0]
        top_20_corr_movies = correlation_matrix.loc[matrix.loc[user_id].notna()].abs()[movie].nlargest(top_n).index[1:]
        rating = (matrix.loc[user_id, top_20_corr_movies] * correlation_matrix.loc[movie, top_20_corr_movies]).sum() / correlation_matrix.loc[movie, top_20_corr_movies].abs().sum() if correlation_matrix.loc[movie, top_20_corr_movies].abs().sum() != 0 else 0 
        return rating
    unrated_movies = matrix.loc[user_id][matrix.loc[user_id].notna()].index
    predicted_ratings = pd.DataFrame(index=unrated_movies, data=unrated_movies).apply(weighted_avg_rating, axis=1)
    predicted_ratings.columns = ['movie_id', 'predicted_rating']

    return (predicted_ratings).dropna()

In [56]:
users = matrix.index.tolist()

In [57]:
y_pred = np.array(list())
y_true = np.array(list())
for user in users:
    deviations = np.array(predict_ratings_test(user))
    ratings = np.array(matrix.loc[user][matrix.loc[user].notna()])
    y_pred = np.append(y_pred, deviations)
    y_true = np.append(y_true, ratings)  

In [58]:
mse = np.mean((y_true - y_pred) ** 2)
mae = np.mean(np.abs(y_true - y_pred))
rmse = np.sqrt(np.mean((y_true - y_pred)**2))


In [59]:
print("MSE: ", round(mse, 3))
print("MAE: ", round(mae, 3))
print("RMSE: ", round(rmse, 3))

MSE:  0.546
MAE:  0.563
RMSE:  0.739


In [60]:
error = y_true - y_pred
lower, higher = np.percentile(error, [10,90], method="normal_unbiased")

In [61]:
print(f"Lower bound: {lower}; High bound: {higher}")

Lower bound: -0.9739502797609579; High bound: 0.8232889251775786
