In [1]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, Reshape, Dot
from keras.layers import Embedding
from keras.layers import Concatenate, Dense, Dropout
from keras.optimizers import Adam
from keras.regularizers import l2
import numpy as np
import torch
import torch.nn as nn                 # the torch module to implement the Neural Networks
import torch.nn.parallel              # for parallel computations
import torch.optim as optim           # for optimizers
import torch.utils.data               # tools
from torch.autograd import Variable   # for Stochastic Gradient Descent
import random as rnd

df = pd.read_csv('netflix_price.csv').astype({
    'movie_id' : 'int16',
    'user_id' : 'int32',
    'rating' : 'int8',
    'date' : 'datetime64[us]'
})
probe_df = pd.read_csv('probe_data_new.csv')
print(df)
df['user_id'] = df['user_id'].astype(np.uint32)
df['movie_id'] = df['movie_id'].astype(np.uint16)
df['rating'] = df['rating'].astype(np.uint8)

# https://stackoverflow.com/questions/57507832/unable-to-allocate-array-with-shape-and-data-type

print(df.shape)


         Unnamed: 0  user_id  rating       date  movie_id
0                 0  1488844       3 2005-09-06         1
1                 1   822109       5 2005-05-13         1
2                 2   885013       4 2005-10-19         1
3                 3    30878       4 2005-12-26         1
4                 4   823519       3 2004-05-03         1
...             ...      ...     ...        ...       ...
5010194         755  1954284       5 2005-05-04      1000
5010195         756   299636       2 2005-10-27      1000
5010196         757  1635449       1 2005-01-10      1000
5010197         758   906984       4 2005-05-13      1000
5010198         759  2633357       3 2005-10-13      1000

[5010199 rows x 5 columns]
(5010199, 5)


In [2]:
df_movie_summary = df.groupby('movie_id')['rating'].agg(['count'])
df_movie_summary.index = df_movie_summary.index.map(int)
movie_benchmark = round(df_movie_summary['count'].quantile(0.96),0)
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

print(f'Movie minimum times of review: {movie_benchmark}')

df_cust_summary = df.groupby('user_id')['rating'].agg(['count'])
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.99),0)
drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index

print(f'Customer minimum times of review: {cust_benchmark}')

df = df[~df['movie_id'].isin(drop_movie_list)]
df = df[~df['user_id'].isin(drop_cust_list)]
print('After Trim Shape: {}'.format(df.shape))


Movie minimum times of review: 30631.0
Customer minimum times of review: 76.0
After Trim Shape: (112763, 5)


In [3]:
num_movies = df['movie_id'].nunique()
num_users = df['user_id'].nunique()

unique_users = df['user_id'].unique()
unique_movies = df['movie_id'].unique()
unique_users.sort()
unique_users.sort()

print("Users: ", num_users)
print("Movies :",num_movies)

x = df[['user_id','rating', 'movie_id']].values
y = df['rating'].values
print(df)
x_training, x_test, y_training, y_test = train_test_split(x, y, test_size=0.2)
# x_train, x_val, y_train, y_val = train_test_split(x_training, y_training, test_size=0.1)

# convert to a users x movies matrix
def transform(data):
    new_data = []
    for cust_id in unique_users:
        # get all movies from this user
        movie_ids = data[:,2][data[:,0]==cust_id]
        # get all ratings from this user
        rating_ids = data[:,1][data[:,0]==cust_id]
        ratings = np.zeros(num_movies)
        # fill in the user's row with the ratings
        for movie_id in movie_ids :
          idx_in_rating_list =  list(unique_movies).index(movie_id)
          ratings[idx_in_rating_list] = rating_ids[list(movie_ids).index(movie_id)]
        new_data.append(list(ratings))

    return new_data

x_training = np.array(x_training)
x_test = np.array(x_test)
x_training = transform(x_training)
print(x_training)
print(len(x_training))
print(x_test)
print(len(x_test))
print(len(y_test))
x_test_original = x_test
x_test = transform(x_test)

print(len(x_test))

Users:  4175
Movies : 40
         Unnamed: 0  user_id  rating       date  movie_id
52525             2  1990901       5 2004-02-16        28
52533            10  1468812       4 2003-01-14        28
52558            35  1276913       5 2003-09-08        28
52560            37  1663216       4 2004-06-08        28
52567            44  1744889       1 2003-04-29        28
...             ...      ...     ...        ...       ...
4945478       89891   453585       4 2005-03-06       985
4945488       89901  1385356       3 2005-04-06       985
4945492       89905  2633357       3 2005-04-21       985
4945522       89935    15737       3 2005-06-22       985
4945577       89990  2180413       4 2005-12-02       985

[112763 rows x 5 columns]
[[2.0, 3.0, 0.0, 0.0, 1.0, 0.0, 0.0, 3.0, 0.0, 0.0, 4.0, 0.0, 0.0, 3.0, 0.0, 2.0, 3.0, 5.0, 3.0, 3.0, 0.0, 2.0, 0.0, 1.0, 0.0, 2.0, 4.0, 2.0, 2.0, 0.0, 0.0, 3.0, 2.0, 3.0, 0.0, 4.0, 4.0, 0.0, 2.0, 1.0], [0.0, 0.0, 0.0, 0.0, 5.0, 4.0, 0.0, 3.0, 0.0, 4.0

In [4]:
# For each user, get it's neighbors using cosine similarity
from scipy.spatial.distance import cosine
from scipy import stats

x_training_array = np.array(x_training, dtype=float)
print(x_training_array)
x_training_array[x_training_array == 0] = np.nan

# Calculate mean of each user's ratings, excluding NaN values
user_means = np.nanmean(x_training_array, axis=1)

# # Zero-center the user vectors, replacing NaN with 0
zero_centered_users = np.nan_to_num(x_training_array - user_means.reshape(-1, 1), nan=0)


# Precompute cosine similarities
cosine_similarities = np.zeros((len(unique_users), len(unique_users)))
for i in range(0, len(unique_users)):
    for j in range(0, len(unique_users)):
        cosine_similarities[i, j] = cosine(zero_centered_users[i], zero_centered_users[j])

print(cosine_similarities)
import numpy as np
final_predictions = []
for row in x_test_original:
    userId = row[0]
    movie_id = row[2]
    index = list(unique_users).index(userId)
    user = x_training[index]
    # cosine_similarities = {}
    # for i, row in enumerate(x_training):
    #     if i % 3 != 0 :
    #         continue
    #     # get cosine similarity
    #     compariative_user_id = unique_users[i]
    #     cosine_similarity = cosine(user, row)
    #     cosine_similarities[compariative_user_id] = cosine_similarity
    similarities = cosine_similarities[index]
    # print(similarities)
    # Get the indexes of the top 5 most similar users

    similar_indices = np.argsort(similarities)[-30:]
    # similar_indices = np.argsort(similarities)[::-1]
    # print(similarities)
    # print(similar_indices)
    # sort the cosine similarities
    # cs_sort  = {key: val for key, val in sorted(cosine_similarities.items(), key = lambda ele: ele[1], reverse = True)}
    # print the highest value in cs_sort
    # print(cs_sort.items())
    # count = 0
    movie_ratings = []
    # for key, value in cs_sort.items():
    #     if(count == 0): # itself, skip
    #         count+=1
    #         continue
    #     if(count == 4): # top 5 users
    #         break
    #     # print(key, value)
    #     # print(key)
    #     similar_idx = list(unique_users).index(key)
    #     similar_user = x_training[similar_idx]
    #     # print(similar_user)
    #     # print(user)
    #     movie_idx = list(unique_movies).index(movie_id)
    #     rating = similar_user[movie_idx]
    #     # print(rating)
    #     if(rating > 0) :
    #         movie_ratings.append(rating)
    #         count+=1
    # # print(movie_ratings)
    # predicted_rating = stats.mode(movie_ratings)
    # final_predictions.append(predicted_rating.mode)
    # length = len(final_predictions)
    # if(length % 300 == 0):
    #     print(length, len(y_test))
    for value in similar_indices:
        similar_user_ratings = x_training[value]
        rating = similar_user_ratings[list(unique_movies).index(movie_id)]
        if(rating > 0):
            movie_ratings.append(rating)
    if(len(movie_ratings) == 0):
        final_predictions.append(4)
    else:
        print(movie_ratings)
        final_predictions.append(stats.mode(movie_ratings).mode)
    # print(similar_indices)
    


MSE = mean_squared_error(y_test, final_predictions)
print("RMSE: ", pow(MSE,1/2))

# 25 mins at 98% users wasn't enough

# 99% users took 11.5m

    

[[2. 3. 0. ... 0. 2. 1.]
 [0. 0. 0. ... 0. 4. 5.]
 [0. 0. 0. ... 4. 5. 0.]
 ...
 [0. 5. 3. ... 5. 0. 5.]
 [0. 0. 4. ... 0. 0. 0.]
 [3. 4. 5. ... 0. 2. 2.]]


  dist = 1.0 - uv / np.sqrt(uu * vv)


[[0.         1.13503128 1.000832   ... 1.04460861 1.01136396 0.65305005]
 [1.13503128 0.         0.94709427 ... 0.6919541  1.18419621 0.94770104]
 [1.000832   0.94709427 0.         ... 0.93592975 0.66682359 1.19856894]
 ...
 [1.04460861 0.6919541  0.93592975 ... 0.         0.86699106 1.09978902]
 [1.01136396 1.18419621 0.66682359 ... 0.86699106 0.         1.07090627]
 [0.65305005 0.94770104 1.19856894 ... 1.09978902 1.07090627 0.        ]]
[4.0, 4.0, 5.0, 3.0, 4.0, 4.0, 5.0, 4.0, 4.0, 5.0, 3.0, 3.0, 5.0, 5.0, 3.0, 4.0, 5.0, 4.0, 3.0, 1.0, 2.0, 2.0, 4.0, 2.0]
[5.0, 1.0, 5.0, 5.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 3.0, 4.0, 2.0]
[5.0, 4.0, 5.0, 4.0, 5.0, 4.0, 5.0, 4.0, 4.0, 4.0, 5.0, 4.0, 5.0, 4.0, 4.0, 5.0]
[5.0, 4.0, 4.0, 4.0, 4.0, 4.0, 3.0, 4.0, 3.0, 4.0, 3.0]
[3.0, 2.0, 2.0, 3.0, 4.0, 3.0]
[4.0, 3.0, 5.0, 3.0, 4.0, 4.0, 5.0, 5.0, 4.0, 5.0, 4.0]
[2.0, 2.0, 3.0, 5.0, 4.0, 5.0, 4.0, 3.0, 5.0, 1.0, 5.0, 3.0, 3.0, 4.0, 3.0, 5.0, 3.0, 5.0, 4.0]
[4.0, 5.0, 3.0, 3.0, 4.0, 4.0, 4.0, 2.0