# Item-to-Item Collaborative Filtering

Author: Group 22

## Import libraries

In [57]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import itertools
import math



In [58]:
ratings = "ratings.dat"
movies = "movies.dat"

## Read dataset and create dataframes

In [59]:
df_rating = pd.read_csv(
    ratings,
    sep="::",
    header=None, 
    names=['user_id', 'movie_id', 'rating', 'timestamp']
)

  df_rating = pd.read_csv(


In [60]:
df_movie = pd.read_csv(
    movies, 
    sep='::', 
    header=None, 
    names=['movie_id', 'title', 'genre'], 
    encoding = "ISO-8859-1")

  df_movie = pd.read_csv(


## Data Preprocessing

In [61]:
actual_movie_ratings = df_rating.pivot(index='movie_id', columns='user_id', values='rating')

In [62]:
actual_movie_ratings

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,,4.0,,,4.0,,,,,3.0
2,,,,,,,,,,5.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,,,,,,,,,3.0,4.0,...,,,,,,,,,,
3949,,,,,,,,,,,...,,,,,,,,,,
3950,,,,,,,,,,,...,,,,,,,,,,
3951,,,,,,,,,,,...,,,,,,,,,,


In [63]:
# Drop movies wtih less than 10 ratings
actual_movie_ratings = actual_movie_ratings.dropna(thresh=10, axis=0)

In [64]:
actual_movie_ratings

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,4.0,5.0,5.0,...,,4.0,,,4.0,,,,,3.0
2,,,,,,,,,,5.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,,,,,,,,,3.0,4.0,...,,,,,,,,,,
3949,,,,,,,,,,,...,,,,,,,,,,
3950,,,,,,,,,,,...,,,,,,,,,,
3951,,,,,,,,,,,...,,,,,,,,,,


In [65]:
# Use random number generator to create training and testing dataset.
def create_test_movie_ratings(x):
    if pd.isna(x):
        return np.nan
    else:
        if np.random.rand() < 0.7:
            return x
        else:
            return np.nan

In [66]:
test_movie_ratings = actual_movie_ratings.applymap(create_test_movie_ratings)

In [67]:
test_movie_ratings

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,,,,...,,4.0,,,,,,,,3.0
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,,,,,,,,,,4.0,...,,,,,,,,,,
3949,,,,,,,,,,,...,,,,,,,,,,
3950,,,,,,,,,,,...,,,,,,,,,,
3951,,,,,,,,,,,...,,,,,,,,,,


In [68]:
train_movie_ratings = test_movie_ratings.copy(deep=True)

In [69]:
train_movie_ratings

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,,,,4.0,,,,,...,,4.0,,,,,,,,3.0
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,1.0,,,,,
4,,,,,,,,3.0,,,...,,,,,2.0,2.0,,,,
5,,,,,,,,,,,...,,,,,1.0,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,,,,,,,,,,4.0,...,,,,,,,,,,
3949,,,,,,,,,,,...,,,,,,,,,,
3950,,,,,,,,,,,...,,,,,,,,,,
3951,,,,,,,,,,,...,,,,,,,,,,


In [70]:
mean_movie_ratings = train_movie_ratings.mean(axis=1)

In [71]:
mean_movie_ratings

movie_id
1       4.122973
2       3.189300
3       3.021341
4       2.774436
5       3.019139
          ...   
3948    3.628425
3949    4.094527
3950    3.720930
3951    3.757576
3952    3.782288
Length: 3260, dtype: float64

In [72]:
train_movie_ratings_centered = train_movie_ratings.sub(mean_movie_ratings, axis=0)

In [73]:
train_movie_ratings_centered

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.877027,,,,,-0.122973,,,,,...,,-0.122973,,,,,,,,-1.122973
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,-2.021341,,,,,
4,,,,,,,,0.225564,,,...,,,,,-0.774436,-0.774436,,,,
5,,,,,,,,,,,...,,,,,-2.019139,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,,,,,,,,,,0.371575,...,,,,,,,,,,
3949,,,,,,,,,,,...,,,,,,,,,,
3950,,,,,,,,,,,...,,,,,,,,,,
3951,,,,,,,,,,,...,,,,,,,,,,


In [74]:
train_movie_ratings_centered = train_movie_ratings_centered.fillna(0)

In [75]:
train_movie_ratings_centered

user_id,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.877027,0.0,0.0,0.0,0.0,-0.122973,0.0,0.000000,0.0,0.000000,...,0.0,-0.122973,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,-1.122973
2,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
3,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.0,-2.021341,0.000000,0.0,0.0,0.0,0.000000
4,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.225564,0.0,0.000000,...,0.0,0.000000,0.0,0.0,-0.774436,-0.774436,0.0,0.0,0.0,0.000000
5,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.0,-2.019139,0.000000,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.371575,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
3949,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
3950,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000
3951,0.000000,0.0,0.0,0.0,0.0,0.000000,0.0,0.000000,0.0,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000


## Creating a similiarity matrix between items

In [76]:
cos_similarity = cosine_similarity(train_movie_ratings_centered)

In [77]:
similarity = pd.DataFrame(cos_similarity, index=train_movie_ratings.index, columns=train_movie_ratings.index)

In [78]:
similarity

movie_id,1,2,3,4,5,6,7,8,9,10,...,3942,3943,3945,3946,3947,3948,3949,3950,3951,3952
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.045907,0.037419,0.039728,0.036284,0.016748,0.039566,4.054092e-03,-0.014043,0.041039,...,-0.007238,-0.014775,-0.018284,0.010593,0.031372,0.036178,0.027642,0.009169,0.005144,0.021883
2,0.045907,1.000000,0.014990,0.013630,0.068968,0.045032,0.053365,3.248830e-02,0.056063,0.081257,...,-0.003201,0.001958,0.022051,0.012754,-0.007282,0.053652,0.031424,-0.013185,-0.005419,-0.003091
3,0.037419,0.014990,1.000000,0.027635,0.096574,0.039724,0.064706,3.975065e-02,-0.024274,0.054249,...,0.007177,0.018149,-0.004900,0.023689,0.023188,0.026329,-0.007568,0.036042,-0.004294,0.003964
4,0.039728,0.013630,0.027635,1.000000,0.027288,0.028818,0.030050,1.027005e-02,-0.053294,0.005323,...,0.000000,0.028436,0.000000,0.017154,-0.004812,0.026071,-0.028867,0.017286,-0.010814,0.000769
5,0.036284,0.068968,0.096574,0.027288,1.000000,0.044625,0.112310,3.055309e-02,0.036797,0.043302,...,0.010691,-0.008970,0.000793,0.012408,0.040087,0.066593,-0.012886,0.009535,0.000538,-0.007485
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3948,0.036178,0.053652,0.026329,0.026071,0.066593,-0.010717,0.003584,6.152687e-03,-0.011131,0.024530,...,0.026829,-0.029911,0.045399,0.050385,-0.003838,1.000000,0.011678,-0.005787,-0.018341,0.010090
3949,0.027642,0.031424,-0.007568,-0.028867,-0.012886,0.016515,-0.001472,-2.557446e-02,0.018941,0.005841,...,0.001077,0.021455,-0.033365,0.072610,0.032020,0.011678,1.000000,0.014194,0.173332,0.002742
3950,0.009169,-0.013185,0.036042,0.017286,0.009535,-0.007621,0.019112,9.444779e-02,0.000000,0.000103,...,0.038649,-0.009760,-0.038539,-0.098102,0.045382,-0.005787,0.014194,1.000000,0.063745,0.060291
3951,0.005144,-0.005419,-0.004294,-0.010814,0.000538,-0.007348,0.004715,-3.469447e-18,0.000000,-0.009418,...,-0.006192,0.035264,-0.030290,0.085087,0.008144,-0.018341,0.173332,0.063745,1.000000,0.074503


In [35]:
test_movie_ratings.to_csv("testing")
actual_movie_ratings.to_csv("actual")

In [85]:
similarity.to_csv("similarity")

In [79]:
train_movie_ratings.index

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            3942, 3943, 3945, 3946, 3947, 3948, 3949, 3950, 3951, 3952],
           dtype='int64', name='movie_id', length=3260)

In [80]:
test_movie_ratings.index

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            3942, 3943, 3945, 3946, 3947, 3948, 3949, 3950, 3951, 3952],
           dtype='int64', name='movie_id', length=3260)

In [81]:
actual_movie_ratings.index

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            3942, 3943, 3945, 3946, 3947, 3948, 3949, 3950, 3951, 3952],
           dtype='int64', name='movie_id', length=3260)

In [82]:
similarity.index

Int64Index([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
            ...
            3942, 3943, 3945, 3946, 3947, 3948, 3949, 3950, 3951, 3952],
           dtype='int64', name='movie_id', length=3260)

## Test Cases: Getting the missing ratings for user 1 and user 19 only.

Test cases make sure that our algorithm is correct

In [89]:
rating_1 = []
similarity_1 = []
rating_similarity_pair = []
for m in train_movie_ratings.index:
    if(not pd.isna(train_movie_ratings.loc[m, 19]) and similarity.loc[m, 1] > 0):
        rating_similarity_pair.append((train_movie_ratings.loc[m, 19], similarity.loc[m, 1]))
    rating_1.append(train_movie_ratings.loc[m, 19])
    similarity_1.append(similarity.loc[m, 1])

In [92]:
numerator = 0
denominator = 0
for pair_ in rating_similarity_pair:
    product = pair_[0] * pair_[1]
    numerator = numerator + product
    denominator = denominator + pair_[1]

In [96]:
rating_1 = []
similarity_1 = []
rating_similarity_pair = []
for m in train_movie_ratings.index:
    if(not pd.isna(train_movie_ratings.loc[m, 23]) and similarity.loc[m, 1] > 0):
        rating_similarity_pair.append((train_movie_ratings.loc[m, 23], similarity.loc[m, 1]))
    rating_1.append(train_movie_ratings.loc[m, 23])
    similarity_1.append(similarity.loc[m, 1])

In [98]:
numerator = 0
denominator = 0
for pair_ in rating_similarity_pair:
    product = pair_[0] * pair_[1]
    numerator = numerator + product
    denominator = denominator + pair_[1]

In [99]:
numerator

22.83481782322176

In [100]:
denominator

6.823086461733088

In [101]:
numerator / denominator

3.3466991736495793

In [87]:
similarity.loc[1,19]

-0.000597893720109349

In [100]:
similarity.loc[2,]

movie_id
1       0.045907
2       1.000000
3       0.014990
4       0.013630
5       0.068968
          ...   
3948    0.053652
3949    0.031424
3950   -0.013185
3951   -0.005419
3952   -0.003091
Name: 2, Length: 3260, dtype: float64

In [102]:
train_movie_ratings.loc[1,]

user_id
1       5.0
2       NaN
3       NaN
4       NaN
5       NaN
       ... 
6036    NaN
6037    NaN
6038    NaN
6039    NaN
6040    3.0
Name: 1, Length: 6040, dtype: float64

In [72]:
rating_1 = []
similarity_1 = []
rating_similarity_pair = []
for m in train_movie_ratings.index:
    if(not pd.isna(train_movie_ratings.loc[m, 23]) and similarity.loc[m, 23] > 0):
        rating_similarity_pair.append((train_movie_ratings.loc[m, 23], similarity.loc[m, 23]))
    rating_1.append(train_movie_ratings.loc[m, 23])
    similarity_1.append(similarity.loc[m, 23])

In [75]:
numerator = 0
denominator = 0
for pair_ in rating_similarity_pair:
    product = pair_[0] * pair_[1]
    numerator = numerator + product
    denominator = denominator + pair_[1]

In [117]:
train_movie_ratings.loc[:, 1]

movie_id
1       5.0
2       NaN
3       NaN
4       NaN
5       NaN
       ... 
3948    NaN
3949    NaN
3950    NaN
3951    NaN
3952    NaN
Name: 1, Length: 3260, dtype: float64

## Getting the top 10 recommended list of movies for user 1

In [125]:
movies_ = []
for i in tqdm(train_movie_ratings.index):
    if (not pd.isna(train_movie_ratings.loc[i,1])):
        continue
    rating_similarity_pair = []
    for j in train_movie_ratings.index:
        if(not pd.isna(train_movie_ratings.loc[j, 1]) and similarity.loc[j, i] > 0):
            rating_similarity_pair.append((train_movie_ratings.loc[j, 1], similarity.loc[j, i]))
    if(len(ratings) > 0):
        numerator = 0
        denominator = 0
        predicted = 0
        for pair_ in rating_similarity_pair:
            product = pair_[0] * pair_[1]
            numerator = numerator + product
            denominator = denominator + pair_[1]
        if (denominator > 0):
            predicted = numerator / denominator
            movies_.append((i, predicted))
        else:
            predicted = mean_movie_ratings.loc[i]
            movies_.append((i, predicted))
    else:
        predicted = mean_movie_ratings.loc[i]
        movies_.append((i, predicted))
            
        
        
        
    
    

100%|██████████| 3260/3260 [01:45<00:00, 31.02it/s]


In [128]:
# sort the list of tuples based on the score in descending order
sorted_list = sorted(movies_, key=lambda x: x[1], reverse=True)

# get the top n tuples
n = 10
top_n_tuples = sorted_list[:n]

print(top_n_tuples)

[(2441, 4.881680024980012), (2537, 4.693988672794359), (283, 4.6930481384500675), (1656, 4.692141991102986), (939, 4.668340788190146), (1529, 4.665811178575538), (3284, 4.663578879237756), (2843, 4.6575015478548485), (3487, 4.657150674424384), (2627, 4.653274640426725)]


In [153]:
data = {}
for movie_id, score_ in top_n_tuples:
    title = df_movie.loc[df_movie['movie_id'] == movie_id, 'title'].iloc[0]
    data[movie_id] = title
    
df_rec = pd.DataFrame.from_dict(data, orient ='index') 

In [154]:
df_rec = df_rec.rename(columns={0: 'title'})

In [155]:
df_rec

Unnamed: 0,title
2441,"Hi-Lo Country, The (1998)"
2537,Beyond the Poseidon Adventure (1979)
283,New Jersey Drive (1995)
1656,Swept from the Sea (1997)
939,"Reluctant Debutante, The (1958)"
1529,Nowhere (1997)
3284,They Might Be Giants (1971)
2843,"Black Cat, White Cat (Crna macka, beli macor) ..."
3487,"Dorado, El (1967)"
2627,Endurance (1998)


In [156]:
df_rec.index

Int64Index([2441, 2537, 283, 1656, 939, 1529, 3284, 2843, 3487, 2627], dtype='int64')

In [142]:
data

{'Hi-Lo Country, The (1998)': 2441,
 'Beyond the Poseidon Adventure (1979)': 2537,
 'New Jersey Drive (1995)': 283,
 'Swept from the Sea (1997)': 1656,
 'Reluctant Debutante, The (1958)': 939,
 'Nowhere (1997)': 1529,
 'They Might Be Giants (1971)': 3284,
 'Black Cat, White Cat (Crna macka, beli macor) (1998)': 2843,
 'Dorado, El (1967)': 3487,
 'Endurance (1998)': 2627}

## Finding all the missing ratings in testing dataset

In [107]:
scores = []

for movie_id in tqdm(test_movie_ratings.index):
    for user_id in test_movie_ratings.columns:
        if pd.isna(test_movie_ratings.loc[movie_id, user_id]) and not pd.isna(actual_movie_ratings.loc[movie_id, user_id]):
            #print(test_movie_ratings.loc[movie_id, user_id], actual_movie_ratings.loc[movie_id, user_id], "!", movie_id, user_id)
            rating_similarity_pair = []
            for m in train_movie_ratings.index:
                if(not pd.isna(train_movie_ratings.loc[m, user_id]) and similarity.loc[m, movie_id] > 0):
                    rating_similarity_pair.append((train_movie_ratings.loc[m, user_id], similarity.loc[m, movie_id]))
            numerator = 0
            denominator = 0
            for pair_ in rating_similarity_pair:
                product = pair_[0] * pair_[1]
                numerator = numerator + product
                denominator = denominator + pair_[1]
            if (denominator > 0):
                predicted = numerator / denominator
                actual = actual_movie_ratings.loc[movie_id, user_id]
                scores.append((predicted, actual))
        #else:
            #print(test_movie_ratings.loc[movie_id, user_id], actual_movie_ratings.loc[movie_id, user_id], "O", movie_id, user_id)

    #break

100%|██████████| 3260/3260 [3:36:26<00:00,  3.98s/it]  


In [108]:
len(scores)

298750

## Calculate RMSE and MAE based on scores

In [110]:
def calculate_rmse_and_mae(predictions):
    n = len(predictions)
    rmse = math.sqrt(sum([(p[0]-p[1])**2 for p in predictions])/n)
    mae = sum([abs(p[0]-p[1]) for p in predictions])/n
    return rmse, mae

In [112]:
rsme, mae = calculate_rmse_and_mae(scores)


In [115]:
print("RSME: ", rsme)
print("MAE: ", mae)

RSME:  0.960485444195601
MAE:  0.7645647527244461


## Conclusion: 
### In conclusion, the item-to-item collaborative filtering approach yielded an RSME of 0.96 and an MAE of 0.76.