Names: Kalyan Kumar Alisetty, Maruthi Sankar Nanduri

In [1]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime at /content/drive so that the
# dataset CSV files stored under "My Drive" can be opened through ordinary
# file paths in the cells below.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd
import numpy as np 
from numpy import dot
from numpy.linalg import norm # importing required packages

In [3]:
# Load the training ratings into a pandas DataFrame and preview the first rows.
training_csv = "/content/drive/My Drive/CS5683/training_dataset.csv"
ratings = pd.read_csv(training_csv)
ratings.head()

Unnamed: 0,user_id,item_id,rating,movie_name
0,196,242,3,Kolya (1996)
1,186,302,3,L.A. Confidential (1997)
2,22,377,1,Heavyweights (1994)
3,244,51,2,Legends of the Fall (1994)
4,166,346,1,Jackie Brown (1997)


In [4]:
# User and item IDs are used as 1-based matrix indices further down, so the
# largest ID present in the training data fixes the size of each axis.
users = ratings.user_id.max()   # largest user ID -> number of matrix columns
movies = ratings.item_id.max()  # largest item ID -> number of matrix rows

In [5]:
# Utility matrix of raw ratings: one row per movie, one column per user.
# Every cell starts as NaN, meaning "this user has not rated this movie".
R = np.full((movies, users), np.nan)

In [6]:
# Second matrix with the same movies x users layout; it will later hold the
# ratings centred on each movie's mean. Starts out entirely NaN.
R_avg = np.full((movies, users), np.nan)

In [7]:
# Fill the utility matrix directly from the `ratings` DataFrame that was loaded
# above instead of re-reading and hand-parsing the CSV file. Selecting columns
# by name does not depend on the column order in the file, is not affected by
# blank lines or commas in movie titles, and a single vectorised NumPy
# assignment is much faster than a Python loop over every line.
movie_rows = ratings['item_id'].to_numpy() - 1  # 1-based item IDs -> 0-based row indices
user_cols = ratings['user_id'].to_numpy() - 1   # 1-based user IDs -> 0-based column indices
# If the same (movie, user) pair occurs more than once, the last rating wins,
# exactly as it did with the sequential line-by-line update.
R[movie_rows, user_cols] = ratings['rating'].to_numpy()

In [8]:
# Centre every movie's ratings on that movie's own mean rating. NaN entries
# (unrated) are ignored when computing the mean and remain NaN after the
# subtraction. The result is written into the existing R_avg array in place.
movie_means = np.nanmean(R, axis=1, keepdims=True)  # shape (movies, 1), broadcasts over users
R_avg[:] = R - movie_means

  


In [9]:
# Replace the NaN placeholders (unrated entries) in the mean-centred matrix
# with 0. After centring, 0 means "no deviation from the movie's mean", so an
# unrated entry contributes nothing to the dot products and norms used by the
# similarity functions below.
R_avg = np.nan_to_num(R_avg)

In [29]:
def cosine(a, b, threshold=0.5, matrix=None):
  """Return the movies in ``b`` whose cosine similarity to ``a`` exceeds ``threshold``.

  Parameters
  ----------
  a : 1-D array
      Rating vector of the target movie (one entry per user).
  b : iterable of int
      1-based movie IDs to compare against; ID ``k`` is read from row ``k-1``
      of ``matrix``.
  threshold : float, default 0.5
      Only similarities strictly greater than this value are kept.
  matrix : 2-D array, optional
      Movies x users matrix that supplies the candidate vectors. Defaults to
      the notebook-level ``R_avg`` matrix.

  Returns
  -------
  dict
      ``{movie_id: cosine_similarity}`` for every qualifying movie, in the
      order the IDs appear in ``b``.
  """
  if matrix is None:
    matrix = R_avg
  a_norm = norm(a)  # loop-invariant, so compute it once
  similarity = {}
  for movie in b:
    candidate = matrix[movie-1]
    denom = a_norm * norm(candidate)
    if denom == 0:
      # An all-zero vector has no direction: the similarity is undefined.
      # Skip it explicitly instead of dividing 0 by 0 (NaN plus a RuntimeWarning).
      continue
    cos_sim = dot(a, candidate) / denom
    if cos_sim > threshold:
      similarity[movie] = cos_sim
  return similarity

In [None]:
testpath = '/content/drive/My Drive/CS5683/test_dataset.csv' # Testing dataset path
true_rating = [] # Ground-truth ratings read from the test file
pred_rating = [] # Predicted ratings, aligned index-for-index with true_rating

# Build the "movies rated by each user" lookup once. Filtering the whole
# training DataFrame again for every test row is needlessly slow.
rated_by_user = ratings.groupby('user_id')['item_id'].apply(list).to_dict()

with open(testpath) as f:
  next(f, None) # Skip the header line (no error if the file is empty)
  for line in f: # Read each line of the test dataset file
    if not line.strip(): # Ignore blank lines such as a trailing newline
      continue
    lis = line.split(',') # The first three comma-separated fields are user, item, rating
    u = int(lis[0])-1 # 0-based user index
    m = int(lis[1])-1 # 0-based movie index
    r = int(lis[2]) # True rating
    other_movies = rated_by_user.get(u+1, []) # 1-based IDs of movies this user rated in training
    similar = cosine(R_avg[m], other_movies) # Rated movies that are more than 50% similar to the target movie
    num = 0
    den = 0
    for movie1, weight in similar.items():
      num += R[movie1-1, u] * weight # Numerator: similarity-weighted sum of the user's ratings
      den += weight # Denominator: sum of the similarity weights
    true_rating.append(r)
    if den == 0:
      # No sufficiently similar movie was found: fall back to the user's
      # average rating over everything they rated in training.
      pred_rating.append(np.nanmean(R[:,u]))
    else:
      pred_rating.append(num/den) # Similarity-weighted average rating

In [21]:
from sklearn.metrics import mean_squared_error # This method is used to calculate RMSE
from math import sqrt
def RMSE(y_actual, y_predicted):
  """Root-mean-squared error between the two rating sequences, rounded to 4 decimals."""
  mse = mean_squared_error(y_actual, y_predicted)
  return round(sqrt(mse), 4)

In [31]:
print(RMSE(true_rating,pred_rating)) # RMSE of the cosine-similarity predictions against the true test ratings

1.0377


In [15]:
# Average rating given by each user (one entry per column of R); NaN cells,
# i.e. movies the user has not rated, are left out of the mean.
avg_userrating = [np.nanmean(R[:, user1]) for user1 in range(users)]

In [32]:
def adj_cosine(a, b, threshold=0.5, matrix=None, user_means=None):
  """Return the movies in ``b`` whose adjusted cosine similarity to ``a`` exceeds ``threshold``.

  Each user's average rating is subtracted from both rating vectors before the
  cosine of the angle between them is taken.

  Parameters
  ----------
  a : 1-D array
      Rating vector of the target movie (one entry per user).
  b : iterable of int
      1-based movie IDs to compare against; ID ``k`` is read from row ``k-1``
      of ``matrix``.
  threshold : float, default 0.5
      Only similarities strictly greater than this value are kept.
  matrix : 2-D array, optional
      Movies x users matrix that supplies the candidate vectors. Defaults to
      the notebook-level ``R_avg`` matrix.
  user_means : 1-D array-like, optional
      Average rating of every user. Defaults to the notebook-level
      ``avg_userrating`` list.

  Returns
  -------
  dict
      ``{movie_id: adjusted_cosine_similarity}`` for every qualifying movie,
      in the order the IDs appear in ``b``.

  NOTE(review): in this notebook both ``a`` and the rows of ``R_avg`` are
  already centred on the movie mean, with unrated entries set to 0. Subtracting
  the user mean on top of that turns every unrated entry into ``-user_mean``
  and centres real ratings twice. The usual adjusted cosine subtracts the user
  mean from the raw ratings of co-rating users only -- confirm this is intended.
  """
  if matrix is None:
    matrix = R_avg
  if user_means is None:
    user_means = avg_userrating
  user_means = np.asarray(user_means)
  # The centred target vector and its norm do not depend on the candidate
  # movie, so compute them once rather than on every iteration.
  a_centered = np.subtract(a, user_means)
  a_norm = norm(a_centered)
  adj_similarity = {}
  for adj_movie in b:
    candidate = np.subtract(matrix[adj_movie-1], user_means)
    denom = a_norm * norm(candidate)
    if denom == 0:
      # Zero-length vector: similarity is undefined. Skip it instead of
      # dividing 0 by 0 (NaN plus a RuntimeWarning).
      continue
    adjcos_sim = dot(a_centered, candidate) / denom
    if adjcos_sim > threshold:
      adj_similarity[adj_movie] = adjcos_sim
  return adj_similarity

In [33]:
testpath = '/content/drive/My Drive/CS5683/test_dataset.csv' # Testing dataset path
true_rating = [] # Ground-truth ratings read from the test file
pred_rating = [] # Predicted ratings, aligned index-for-index with true_rating

# Build the "movies rated by each user" lookup once. Filtering the whole
# training DataFrame again for every test row is needlessly slow.
rated_by_user = ratings.groupby('user_id')['item_id'].apply(list).to_dict()

with open(testpath) as f:
  next(f, None) # Skip the header line (no error if the file is empty)
  for line in f: # Read each line of the test dataset file
    if not line.strip(): # Ignore blank lines such as a trailing newline
      continue
    lis = line.split(',') # The first three comma-separated fields are user, item, rating
    u = int(lis[0])-1 # 0-based user index
    m = int(lis[1])-1 # 0-based movie index
    r = int(lis[2]) # True rating
    other_movies = rated_by_user.get(u+1, []) # 1-based IDs of movies this user rated in training
    adj_similar = adj_cosine(R_avg[m], other_movies) # Rated movies whose adjusted cosine similarity to the target exceeds 0.5
    num = 0
    den = 0
    for movie1, weight in adj_similar.items():
      num += R[movie1-1, u] * weight # Numerator: similarity-weighted sum of the user's ratings
      den += weight # Denominator: sum of the similarity weights
    true_rating.append(r)
    if den == 0:
      # No sufficiently similar movie was found: fall back to the user's
      # average rating over everything they rated in training.
      pred_rating.append(np.nanmean(R[:,u]))
    else:
      pred_rating.append(num/den) # Similarity-weighted average rating

In [34]:
print(RMSE(true_rating,pred_rating))  # RMSE on the test dataset using the adjusted cosine similarity metric

1.0434


**Performance:** Maruthi Sankar Nanduri implemented the cosine similarity model, and Kalyan Kumar Alisetty implemented the adjusted cosine similarity model.

**Comparison of the two metric models (Cosine and Adjusted Cosine):** The RMSE values on the test dataset are almost the same for both methods (1.0377 for cosine and 1.0434 for adjusted cosine), but cosine similarity has a slight edge over adjusted cosine similarity.