In [1]:
import numpy as np
import pandas as pd

In [2]:
import math
import copy

In [3]:
rating_df_columns = ["UserId", "MovieId", "Rating", "TimeStamp"]
rating_df = pd.read_table("ml-1m/ratings.dat", sep="::", names=rating_df_columns)


  


In [4]:
rating_df.drop(index= range(20000,1000209), inplace=True)

In [5]:
### Train(0.80) test(0.20) random split 

train_rating_df = rating_df.sample(frac = 0.8, random_state=201)
test_rating_df = rating_df.drop(train_rating_df.index)

train_users = max(train_rating_df["UserId"])
train_movies = max(train_rating_df["MovieId"])

test_users = max(test_rating_df["UserId"])
test_movies = max(test_rating_df["MovieId"])

assert train_users == test_users
assert train_movies == test_movies

In [6]:
train_rating_df = train_rating_df.pivot(index = 'UserId', columns='MovieId', values='Rating').fillna(0)
test_rating_df = test_rating_df.pivot(index = 'UserId', columns='MovieId', values='Rating').fillna(0)

In [7]:
zero_mat = np.zeros(train_users)
for i in range(1, train_movies + 1):
    print(i, end = "\r")
    if i not in train_rating_df.columns:   
        zero_df = pd.DataFrame(zero_mat,columns=[i],index=range(1,150))
        train_rating_df = train_rating_df.join(zero_df)
    if i not in test_rating_df.columns:
        zero_df = pd.DataFrame(zero_mat,columns=[i],index=range(1,150))
        test_rating_df = test_rating_df.join(zero_df)

train_rating_df = train_rating_df[range(1, train_movies + 1)]
test_rating_df = test_rating_df[range(1, test_movies + 1)]

3952

In [8]:
train_rating_matrix = np.asarray(train_rating_df)
test_rating_matrix = np.asarray(test_rating_df)

assert train_rating_matrix.shape == test_rating_matrix.shape

In [9]:
### Mean center
### Handle strict and lineant

def mean_center(A):
    mean_centered_A = np.copy(A)
    for row in range(A.shape[0]):
        ratings_sum = A[row].sum()
        movies_rated = (A[row] > 0).sum()
        mean = ratings_sum/movies_rated
        for i in A[row].nonzero():
            mean_centered_A[row][i] -= mean
    return mean_centered_A

In [10]:
train_rating_matrix_centered = mean_center(train_rating_matrix)
test_rating_matrix_centered = mean_center(test_rating_matrix)

In [11]:
#### User-User collabarative filtering
#### Find k most similar users who have rated that movie.

### Find similarity matrix based on row

def find_similarity(A):
    similarity = np.zeros((A.shape[0], A.shape[0]))
    for i in range(A.shape[0]):
        for j in range(i+1, A.shape[0]):
            similarity_ij = np.dot(A[i], A[j])
            similarity_ij /= (np.sqrt(np.dot(A[i], A[i])) * np.sqrt(np.dot(A[j], A[j])))
            similarity[i][j] = similarity[j][i] = similarity_ij
        print(i, end = "\r")
    return similarity

In [12]:
### User User collabrative filtering with k = 5

k = 15
predicted_rating_matrix = np.zeros(train_rating_matrix_centered.shape)
for user in range(train_rating_matrix_centered.shape[0]):
    print(user, end = '\r')
    user_user_similarity = []
    mod = np.sqrt(np.dot(train_rating_matrix_centered[user], train_rating_matrix_centered[user]))
    for i in range(train_rating_matrix_centered.shape[0]):
        similarity = np.dot(train_rating_matrix_centered[user], train_rating_matrix_centered[i])
        similarity /= (mod  * np.sqrt(np.dot(train_rating_matrix_centered[i], train_rating_matrix_centered[i])))
        user_user_similarity.append(similarity)
    
    similarity = np.asarray(user_user_similarity)
    sorted_similarity = np.argsort(-similarity)
    
    for movie in range(train_rating_matrix_centered.shape[1]):
        if train_rating_matrix[user][movie] == 0 and test_rating_matrix[user][movie] != 0:
            cnt = 0
            weighted_sum = 0
            sum_of_weights = 0
            for similar_user in sorted_similarity:
                if train_rating_matrix[similar_user][movie] != 0 and similarity[similar_user] > 0:
                    cnt += 1
                    weighted_sum += similarity[similar_user] * train_rating_matrix_centered[similar_user][movie]
                    sum_of_weights += similarity[similar_user]
                    predicted_rating_matrix[user][movie] = weighted_sum/sum_of_weights
                if cnt == k:
                    break

        
                

148

In [13]:
error = 0
cnt = 0
for user in range(test_rating_matrix.shape[0]):
    for movie in range(test_rating_matrix_centered.shape[1]):
        ## Both conditions are necessary as both are required.
        if test_rating_matrix[user][movie] != 0 and predicted_rating_matrix[user][movie] != 0:
            error += ((predicted_rating_matrix[user][movie] - test_rating_matrix_centered[user][movie]) ** 2)
            cnt += 1
mse = np.sqrt(error/cnt)

In [14]:
print(mse)

1.0254699630900137


In [19]:
### Item-Item collabarative filtering

train_rating_matrix_centered_transpose = train_rating_matrix_centered.T
test_rating_matrix_centered_transose = test_rating_matrix_centered.T
test_rating_matrix_transpose = test_rating_matrix.T
k = 15
predicted_rating_matrix_item = np.zeros(train_rating_matrix_centered_transpose.shape)

for movie in range(train_rating_matrix_centered_transpose.shape[0]):
    item_item_similarity = []
    mod = np.sqrt(np.dot(train_rating_matrix_centered_transpose[movie], train_rating_matrix_centered_transpose[movie]))
    print(movie, end = "\r")
    for i in range(train_rating_matrix_centered_transpose.shape[0]):
        similarity = np.dot(train_rating_matrix_centered_transpose[movie], train_rating_matrix_centered_transpose[i])
        mod2 = np.sqrt(np.dot(train_rating_matrix_centered_transpose[i], train_rating_matrix_centered_transpose[i]))
        if mod2 == 0:
            similarity = 0
        else:
            similarity /= (mod * mod2) 
        item_item_similarity.append(similarity)
    
    similarity = np.asarray(item_item_similarity)
    sorted_similarity = np.argsort(-similarity)
    
    for user in range(train_rating_matrix_centered_transpose.shape[1]):
        if test_rating_matrix_transpose[movie][user] != 0:
            cnt = 0
            weighted_sum = 0
            sum_of_weights = 0
            for similar_movie in sorted_similarity:
                if train_rating_matrix_centered_transpose[similar_movie][user] != 0 and similarity[similar_movie] > 0:
                    cnt += 1
                    weighted_sum += similarity[similar_movie] * train_rating_matrix_centered_transpose[similar_movie][user]
                    sum_of_weights += similarity[similar_movie]
                    predicted_rating_matrix_item[movie][user] = weighted_sum/sum_of_weights
                    if cnt == k:
                        break
            
                
                

012345678910



3951

In [16]:
error = 0
cnt = 0
for movie in range(test_rating_matrix_transpose.shape[0]):
    for user in range(test_rating_matrix_transpose.shape[1]):
        if test_rating_matrix_transpose[movie][user] != 0 and predicted_rating_matrix_item[movie][user] != 0:
            error += ((predicted_rating_matrix_item[movie][user] - test_rating_matrix_transpose[movie][user]) ** 2)
            cnt += 1
mse = np.sqrt(error/cnt)

In [17]:
print(error)

52646.20702861256


In [18]:
print(mse)

3.705591055075076
