In [36]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing packages**

In [37]:
# Install packages here
# Packages for data processing
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy as sp
import datetime
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.metrics.pairwise import cosine_similarity
import re
from scipy.sparse import csr_matrix
import scipy as sp


# Packages for visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Packages for model evaluation
from math import sqrt
from sklearn.metrics import mean_squared_error
from time import time

# Package to suppress warnings
import warnings
warnings.filterwarnings("ignore")

# Packages for saving models
import pickle

**Load the data**

In [38]:
# Every competition file is read from the same input folder; keeping that folder
# in one constant avoids repeating (and possibly mistyping) the path nine times.
DATA_DIR = '../input/edsa-movie-recommender-challenge-2022'

df_train = pd.read_csv(f'{DATA_DIR}/train.csv')
df_test = pd.read_csv(f'{DATA_DIR}/test.csv')
df_tags = pd.read_csv(f'{DATA_DIR}/tags.csv')
df_movies = pd.read_csv(f'{DATA_DIR}/movies.csv')
df_links = pd.read_csv(f'{DATA_DIR}/links.csv')
df_imbd = pd.read_csv(f'{DATA_DIR}/imdb_data.csv')
genome_tags = pd.read_csv(f'{DATA_DIR}/genome_tags.csv')
genome_score = pd.read_csv(f'{DATA_DIR}/genome_scores.csv')
sample_submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')


In [39]:
sample_submission

In [40]:
df_train.head(10)

In [41]:
df_movies.head()

In [42]:
genome_score.head()

In [43]:
df_test.head(40)

**EDA**


In [44]:
# Find the number of times a user has rated, create a data frame with the count by userId.
# rename_axis + reset_index(name=...) yields the columns ['userId', 'count'] on
# every pandas version. The previous rename() assumed the reset columns were
# called 'index' and 'userId' (pandas < 2.0); on pandas >= 2.0 they are already
# 'userId' and 'count', so the rename produced two columns named 'count'.
train_user = (
    df_train['userId']
    .value_counts()
    .rename_axis('userId')
    .reset_index(name='count')
)
train_user.head()

**Checking the datatypes in the dataframes**

In [45]:
df_movies.info() 

In [46]:
df_train.info()

In [47]:
#Checking Null Values

In [48]:
# Obtaining total null values in each Data Frames columns
train_count = pd.DataFrame(df_train.isnull().sum())
# The test counts must come from df_test; they were previously computed from
# df_train, which made test_count a copy of train_count.
test_count = pd.DataFrame(df_test.isnull().sum())
tags_count = pd.DataFrame(df_tags.isnull().sum())
movies_count = pd.DataFrame(df_movies.isnull().sum())
links_count = pd.DataFrame(df_links.isnull().sum())
imdb_count = pd.DataFrame(df_imbd.isnull().sum())
genomet_count = pd.DataFrame(genome_tags.isnull().sum())
genomes_count = pd.DataFrame(genome_score.isnull().sum())

In [49]:
train_count

In [50]:
movies_count

In [51]:
links_count

In [52]:
# Bar chart of the null-value count for each column of the links table.
# links_count holds the counts in a single column, so flatten it to 1-D heights.
links_null_heights = links_count.values.ravel()
plt.bar(links_count.index, links_null_heights, color='orange')
plt.title('Null value count in links_df')
plt.xlabel('column_name')
plt.ylabel('count')
plt.show()

In [53]:
imdb_count

In [54]:
# Bar chart of the null-value count for each column of the imdb table.
# imdb_count holds the counts in a single column, so flatten it to 1-D heights.
imdb_null_heights = imdb_count.values.ravel()
plt.bar(imdb_count.index, imdb_null_heights, color='red')
plt.title('Null value count in imdb_df')
plt.xlabel('column_name')
plt.ylabel('count')
plt.show()

**Creating sample df**

In [55]:
# Draw 55,000 ratings without replacement to keep the user-item matrix small.
# The seed makes the draw reproducible: later cells look up hard-coded ids
# (user 12, movie 1212), and with an unseeded sample whether those ids are
# present changes from run to run.
# NOTE(review): the seed does not guarantee the ids are in the sample — verify.
sample_train = df_train.sample(n=55000, replace=False, random_state=42)


**Cleaning sample_train**

In [56]:
# Preview the sample without its timestamp column. The result is only displayed,
# not assigned, so sample_train itself still contains 'timestamp' afterwards.
sample_train.drop(columns=['timestamp'])

**Creating pivot table**

In [57]:
# Utility matrix: one row per userId, one column per movieId, cells filled from
# 'rating' (aggregated with pivot_table's default aggregation).
sample_pivot = pd.pivot_table(
    sample_train,
    values='rating',
    index=['userId'],
    columns=['movieId'],
)
sample_pivot.shape

In [58]:
def _center_and_scale(user_ratings):
    """Subtract the row mean and divide by the row's (max - min) range."""
    spread = np.max(user_ratings) - np.min(user_ratings)
    return (user_ratings - np.mean(user_ratings)) / spread


# Normalise each row (a given user's ratings), replace NaN entries by 0 and
# transpose so that rows are movies and columns are users.
movies_by_users = sample_pivot.apply(_center_and_scale, axis=1).fillna(0).T
# Keep only the users (columns) that have at least one non-zero entry.
has_nonzero_entry = (movies_by_users != 0).any(axis=0)
sample_pivot_norm = movies_by_users.loc[:, has_nonzero_entry]
# Store the utility matrix in SciPy's compressed sparse row format.
sample_pivot_sparse = sp.sparse.csr_matrix(sample_pivot_norm.values)

**Creating Similarity table**

In [59]:
# Cosine similarity between the columns of the normalised matrix (the
# transposed sparse matrix is passed in, so each of its rows is one column).
user_similarity = cosine_similarity(sample_pivot_sparse.T)
# Wrap the result in a DataFrame labelled on both axes for easier indexing.
user_labels = sample_pivot_norm.columns
user_sim_df = pd.DataFrame(user_similarity, index=user_labels, columns=user_labels)



In [60]:
# Review a small portion of the constructed similarity matrix
user_sim_df.head(20)

**Sorting similar users in Descending order**

In [61]:
# The 20 highest similarity scores in user 12's column, largest first.
# The column also contains user 12's own row, so that entry takes part in the ranking.
scores_for_user_12 = user_sim_df[12]
similar_users_df = scores_for_user_12.sort_values(ascending=False).head(20)

In [62]:
similar_users_df.head(20)

In [63]:
similar_users_df.index

In [64]:
# Normalised values in row 1212 for the selected similar users only.
neighbour_ids = similar_users_df.index
final1 = sample_pivot_norm.loc[1212, neighbour_ids]

In [65]:
final1

In [66]:
# Row 1212 of the normalised matrix across all users, sorted from highest to lowest.
row_1212 = sample_pivot_norm.loc[1212]
sample_piv = row_1212.sort_values(ascending=False)

In [67]:
sample_piv.head()

In [68]:
# Pair each similar user's similarity score with that user's value from
# sample_piv, matching the two series on 'userId'.
final = pd.merge(left=similar_users_df, right=sample_piv, on='userId')

In [69]:
final

In [70]:
 # Give the two merged columns descriptive names; the cells below refer to them by these names.
 final.columns = ['similarity', 'sim_user_rating']

In [71]:
final

**Calculating weighted average**

In [72]:
# Similarity-weighted mean: sum(rating * similarity) / sum(similarity).
similarity_weights = final["similarity"]
weighted_total = (final["sim_user_rating"] * similarity_weights).sum()
weighted_avg = weighted_total / similarity_weights.sum()
weighted_avg

In [73]:
df_test.head()

**Isolating reference user and reference movie**

In [74]:
# Join the test rows with the sampled training rows that share the same userId.
user_id = df_test.merge(sample_train, on='userId')

In [75]:
user_id

In [76]:
# Mean rating over the merged rows. mean() has to be called: without the
# parentheses the variable held the bound method instead of a number.
user_avg_rating = user_id['rating'].mean()

In [77]:
user_avg_rating

In [78]:
# Drop the timestamp, both movieId columns and the rating from the merged frame.
user_columns_to_drop = ['timestamp', 'movieId_x', 'movieId_y', 'rating']
ref_user = user_id.drop(columns=user_columns_to_drop)

In [79]:
# Join the test rows with the sampled training rows that share the same movieId.
movie_id = df_test.merge(sample_train, on='movieId')

In [80]:
# Mean rating over the merged rows. mean() has to be called: without the
# parentheses the variable held the bound method instead of a number.
movie_avg_rating = movie_id['rating'].mean()

In [81]:
movie_avg_rating

In [82]:
# Drop the timestamp, both userId columns and the rating from the merged frame.
movie_columns_to_drop = ['timestamp', 'userId_x', 'userId_y', 'rating']
ref_movie = movie_id.drop(columns=movie_columns_to_drop)

In [83]:
ref_movie

**Create a function that applies all the steps made for the sampled train data**

In [84]:
df_test.index

In [85]:
def user_rating(ref_user, ref_movie):
    """Return the similarity-weighted average stored in the notebook-level ``final`` frame.

    The value is ``sum(sim_user_rating * similarity) / sum(similarity)`` over the
    rows of the global ``final`` DataFrame built in the earlier cells.

    NOTE(review): ``ref_user`` and ``ref_movie`` are kept so existing calls
    still work, but neither is used, so every call returns the same number
    whatever is passed in. A per-user / per-movie prediction would have to
    rebuild ``final`` from these arguments — TODO.

    Parameters
    ----------
    ref_user : any
        Unused.
    ref_movie : any
        Unused.

    Returns
    -------
    The weighted average described above.
    """
    # The previous body also re-sliced the similarity matrix, re-sorted user 12's
    # column and re-merged the series on every call, but discarded each result;
    # only the expression below ever contributed to the return value.
    weights = final["similarity"]
    rating = (final["sim_user_rating"] * weights).sum() / weights.sum()
    return rating


In [86]:
user_rating

In [87]:
# user_rating() receives the same two arguments for every test row (the loop
# variable was never passed to it), so evaluate it once and repeat the value
# instead of recomputing it once per row of df_test.
shared_rating = user_rating(ref_user, ref_movie)
ratings = [shared_rating] * len(df_test.index)


In [88]:
ratings

In [89]:
submission_df = df_test.copy()

In [90]:
# NOTE(review): `ratings` is a plain Python list, so this displays the bound
# `list.index` method rather than an index of values; presumably a length or
# alignment check against df_test was intended — confirm.
ratings.index

In [91]:
# Build the submission key "<userId>_<movieId>" for every row.
user_part = submission_df['userId'].astype(str)
movie_part = submission_df['movieId'].astype(str)
submission_df['Id'] = user_part + '_' + movie_part

In [92]:
submission_df

In [93]:
submission_df['rating'] = ratings

In [94]:
submission_df

In [95]:
# Replace zero predictions with the mean rating of the merged user_id frame.
# The previous call passed the whole Series of column means as the replacement
# value; pandas then works column by column on every column named in that
# Series, so zeros in 'userId' would also have been overwritten. Only the
# 'rating' column is touched here, and the result is assigned rather than
# mutated in place.
fallback_rating = user_id['rating'].mean()
submission_df['rating'] = submission_df['rating'].replace(0, fallback_rating)

In [100]:
# The two id columns are no longer needed once 'Id' has been built from them.
id_part_columns = ['userId', 'movieId']
submission_df = submission_df.drop(columns=id_part_columns)

In [101]:
submission_df

In [102]:
# Write the submission file to the working directory, without the row index.
submission_df.to_csv(path_or_buf='submission.csv', index=False)

In [99]:
# Obtain the RMSE for the sample provided.
# NOTE(review): both arguments are the same column, so this always evaluates to
# 0.0; a meaningful score needs the true ratings for the same rows — TODO.
# sqrt(MSE) replaces the `squared=False` keyword, which scikit-learn deprecated
# in 1.4 and removed in 1.6 (the old call raises TypeError there).
predicted_ratings = submission_df['rating'].values
rmse = sqrt(mean_squared_error(predicted_ratings, predicted_ratings))
print("RMSE: ", rmse)