In [65]:
import numpy as np
import pandas as pd
import copy
import math

In [66]:
# Train-Test-Split
def train_test_split(train_fraction, ratings_path="ml-1m/ratings.dat", max_rows=100000, random_state=201):
    """Load MovieLens-style ratings and split them into train/test rating matrices.

    The ratings file is expected to hold ``UserId::MovieId::Rating::TimeStamp``
    lines. Only the first ``max_rows`` lines are used.

    Parameters
    ----------
    train_fraction : float
        Fraction of the loaded ratings sampled into the training split; the
        remaining ratings form the test split.
    ratings_path : str, optional
        Location of the ``::``-separated ratings file.
    max_rows : int or None, optional
        Keep only the first ``max_rows`` ratings (``None`` keeps everything).
    random_state : int, optional
        Seed passed to ``DataFrame.sample`` so the split is reproducible.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``(train_rating_matrix, test_rating_matrix)``, both of shape
        ``(max UserId, max MovieId)``. Row ``u - 1`` / column ``m - 1`` holds the
        rating user ``u`` gave movie ``m`` in that split, or 0 when there is none.
    """
    rating_df_columns = ["UserId", "MovieId", "Rating", "TimeStamp"]
    # A multi-character separator is only handled by the python parsing engine;
    # requesting it explicitly avoids the ParserWarning about the silent fallback.
    rating_df = pd.read_table(ratings_path, sep="::", names=rating_df_columns, engine="python")

    # Positional truncation works for files of any length (dropping a fixed range
    # of index labels fails as soon as the file is shorter than that range).
    if max_rows is not None:
        rating_df = rating_df.iloc[:max_rows]

    train_rating_df = rating_df.sample(frac=train_fraction, random_state=random_state)
    test_rating_df = rating_df.drop(train_rating_df.index)

    # Size both matrices from the full set of loaded ratings so they always agree,
    # even when a user or movie ends up in only one of the two splits.
    users = int(rating_df["UserId"].max())
    movies = int(rating_df["MovieId"].max())
    user_ids = range(1, users + 1)
    movie_ids = range(1, movies + 1)

    def _to_matrix(split_df):
        # Pivot to users x movies, then add every missing user row / movie column
        # in a single reindex instead of joining one zero column at a time.
        return (
            split_df.pivot(index="UserId", columns="MovieId", values="Rating")
            .reindex(index=user_ids, columns=movie_ids)
            .fillna(0)
            .to_numpy(dtype=float)
        )

    train_rating_matrix = _to_matrix(train_rating_df)
    test_rating_matrix = _to_matrix(test_rating_df)

    assert train_rating_matrix.shape == test_rating_matrix.shape

    return train_rating_matrix, test_rating_matrix

In [67]:
### Mean-center the ratings
### Handle strict and lenient

def mean_center(train_rating_matrix, test_rating_matrix):
    """Subtract each row's mean rating from the rated entries of both splits.

    The row mean is taken over the combined ratings ``train + test`` (entries
    greater than 0), so the test ratings contribute to the mean used to centre
    the training matrix.

    Parameters
    ----------
    train_rating_matrix, test_rating_matrix : array-like, 2-D, same shape
        Rating matrices in which 0 marks "no rating".

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Centred train and test matrices. Positions that are 0 in the respective
        input stay 0. Rows without any rating are returned as all zeros.
    """
    train_rating_matrix = np.asarray(train_rating_matrix)
    test_rating_matrix = np.asarray(test_rating_matrix)
    assert train_rating_matrix.shape == test_rating_matrix.shape
    A = train_rating_matrix + test_rating_matrix

    ratings_sum = A.sum(axis=1)
    movies_rated = (A > 0).sum(axis=1)
    # Guarded division: a row with no ratings gets mean 0 instead of a 0/0 NaN
    # (and the RuntimeWarning that comes with it).
    mean = np.divide(
        ratings_sum,
        movies_rated,
        out=np.zeros(A.shape[0]),
        where=movies_rated > 0,
    )

    centered = A - mean[:, np.newaxis]
    # Keep the centred value only where the respective split holds a rating.
    train_rating_matrix_centered = np.where(train_rating_matrix != 0, centered, 0.0)
    test_rating_matrix_centered = np.where(test_rating_matrix != 0, centered, 0.0)
    return train_rating_matrix_centered, test_rating_matrix_centered

In [68]:
# Build the train/test rating matrices; 80% of the loaded ratings are sampled into the training split.
train_rating_matrix, test_rating_matrix = train_test_split(0.8)

  after removing the cwd from sys.path.


3952

In [69]:
# Mean-centre both splits row by row; the later cells use these centred matrices for similarity and error.
train_rating_matrix_centered, test_rating_matrix_centered = mean_center(train_rating_matrix, test_rating_matrix)

In [89]:
#### Collaborative filtering
#### Find the k most similar users who have rated that movie.

### Compute the similarity matrix between the rows of a matrix

def find_similarity(A):
    """Return the pairwise cosine similarity between the rows of ``A``.

    Parameters
    ----------
    A : array-like, shape (n, d)
        One vector per row.

    Returns
    -------
    numpy.ndarray, shape (n, n)
        Symmetric matrix whose entry ``[i, j]`` is
        ``dot(A[i], A[j]) / (|A[i]| * |A[j]|)``. The entry is 0 whenever either
        row has zero norm, and the diagonal is left at 0 (a row is never
        reported as similar to itself).
    """
    A = np.asarray(A, dtype=float)

    # Row norms are computed once instead of once per pair.
    norms = np.sqrt(np.einsum("ij,ij->i", A, A))
    dot_products = A @ A.T
    norm_products = np.outer(norms, norms)

    # Divide only where BOTH norms are non-zero; checking just one of them lets
    # a zero row produce 0/0 = NaN (with a RuntimeWarning) for its pairs.
    similarity = np.divide(
        dot_products,
        norm_products,
        out=np.zeros_like(dot_products),
        where=norm_products > 0,
    )
    np.fill_diagonal(similarity, 0.0)
    return similarity

In [91]:
### User-user collaborative filtering with k = 20

# Neighbourhood size: at most this many similar users contribute to one prediction.
k = 20

# One prediction slot per (user, movie) pair; a slot left at 0 is treated as
# "no prediction" by the evaluation cell.
predicted_rating_matrix = np.zeros(shape=train_rating_matrix_centered.shape)

# Pairwise similarity between users, i.e. between rows of the centred training matrix.
user_user_similarity = find_similarity(train_rating_matrix_centered)


668

In [125]:

# For every held-out (user, movie) rating, predict the mean-centred rating as the
# similarity-weighted average of the centred training ratings given to that movie
# by the (at most) k most similar users.
for user in range(train_rating_matrix_centered.shape[0]):
    print(user, end = '\r')
    similarity = user_user_similarity[user]
    sorted_similarity = np.argsort(-similarity)
    # Only neighbours with strictly positive similarity may vote (a NaN similarity
    # fails this comparison as well); the most-similar-first order is preserved.
    positive_neighbours = sorted_similarity[similarity[sorted_similarity] > 0]

    # Visit only the movies this user rated in the test split.
    for movie in np.flatnonzero(test_rating_matrix[user]):
        rated_in_train = train_rating_matrix[positive_neighbours, movie] != 0
        neighbours = positive_neighbours[rated_in_train][:k]
        if neighbours.size == 0:
            # No usable neighbour: the prediction slot is left untouched.
            continue
        weights = similarity[neighbours]
        deviations = train_rating_matrix_centered[neighbours, movie]
        # Written once per pair instead of on every neighbour iteration.
        predicted_rating_matrix[user][movie] = np.dot(weights, deviations) / weights.sum()

        
                

668

In [126]:
# Evaluate the user-user predictions on the held-out ratings.
# A pair is scored only when it has a test rating AND a non-zero prediction.
# NOTE: a prediction that is exactly 0 cannot be told apart from "no prediction"
# and is therefore skipped as well.
scored_pairs = (test_rating_matrix != 0) & (predicted_rating_matrix != 0)
residuals = predicted_rating_matrix[scored_pairs] - test_rating_matrix_centered[scored_pairs]

error = float(np.sum(residuals ** 2))   # sum of squared errors
cnt = int(np.count_nonzero(scored_pairs))

# Despite its name, `mse` holds the ROOT-mean-square error (name kept for later cells).
# With no scored pair the result is NaN instead of a ZeroDivisionError.
mse = np.sqrt(error / cnt) if cnt else float("nan")
print(error)
print(mse)

16961.80813370254
0.9234612609839605


In [139]:
### Item-item collaborative filtering

# Neighbourhood size: at most this many similar movies contribute to one prediction.
k = 20

# Movies-as-rows views of the four rating matrices.
train_rating_matrix_transpose = np.transpose(train_rating_matrix)
train_rating_matrix_centered_transpose = np.transpose(train_rating_matrix_centered)
test_rating_matrix_transpose = np.transpose(test_rating_matrix)
test_rating_matrix_centered_transpose = np.transpose(test_rating_matrix_centered)

# Pairwise similarity between movies (rows of the transposed centred training matrix).
item_item_similarity = find_similarity(train_rating_matrix_centered_transpose)

# One prediction slot per (movie, user) pair; a slot left at 0 is treated as
# "no prediction" by the evaluation cell.
predicted_rating_matrix_item = np.zeros(shape=train_rating_matrix_centered_transpose.shape)


31

  app.launch_new_instance()


3951

In [149]:
%%time

# For every held-out (movie, user) rating, predict the mean-centred rating as the
# similarity-weighted average of the centred training ratings that user gave to
# the (at most) k most similar movies.
for movie in range(train_rating_matrix_centered_transpose.shape[0]):
    similarity = item_item_similarity[movie]
    sorted_similarity = np.argsort(-similarity)
    # Only movies with strictly positive similarity may vote (a NaN similarity
    # fails this comparison as well); the most-similar-first order is preserved.
    positive_neighbours = sorted_similarity[similarity[sorted_similarity] > 0]

    # Visit only the users who rated this movie in the test split.
    for user in np.flatnonzero(test_rating_matrix_transpose[movie]):
        rated_in_train = train_rating_matrix_transpose[positive_neighbours, user] != 0
        neighbours = positive_neighbours[rated_in_train][:k]
        if neighbours.size == 0:
            # No usable neighbour: the prediction slot is left untouched.
            continue
        weights = similarity[neighbours]
        deviations = train_rating_matrix_centered_transpose[neighbours, user]
        # Written once per pair instead of on every neighbour iteration.
        predicted_rating_matrix_item[movie][user] = np.dot(weights, deviations) / weights.sum()
            
                
                

CPU times: user 10 s, sys: 3.99 ms, total: 10 s
Wall time: 10 s


In [150]:
# Evaluate the item-item predictions on the held-out ratings.
# `predicted_rating_matrix_item` is laid out (movie, user), so the test matrices
# are transposed to match it.
# A pair is scored only when it has a test rating AND a non-zero prediction.
# NOTE: a prediction that is exactly 0 cannot be told apart from "no prediction"
# and is therefore skipped as well.
scored_pairs = (test_rating_matrix.T != 0) & (predicted_rating_matrix_item != 0)
residuals = predicted_rating_matrix_item[scored_pairs] - test_rating_matrix_centered.T[scored_pairs]

error = float(np.sum(residuals ** 2))   # sum of squared errors
cnt = int(np.count_nonzero(scored_pairs))

# Despite its name, `mse` holds the ROOT-mean-square error (name kept for later cells).
# With no scored pair the result is NaN instead of a ZeroDivisionError.
mse = np.sqrt(error / cnt) if cnt else float("nan")
print(error)
print(mse)

16739.539506627785
0.9167917458033292
