# Naive collaborative filtering

In [8]:
import time
import random
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
from collections import Counter

#surprise package
from surprise import Reader
from surprise import Dataset
from surprise import accuracy

from surprise import KNNWithMeans
from surprise import NMF
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold
from surprise.model_selection import train_test_split
from surprise.prediction_algorithms.predictions import Prediction
from sklearn import metrics
from sklearn.metrics import roc_curve, auc

In [11]:
#----THIS IS NAIVE COLLABORATIVE FILTERING----#
#Part 6

class NaiveFilter(object):
    """Baseline recommender that predicts each user's mean rating.

    After ``fit``:
      * ``user_predictor`` maps user id -> mean of that user's ratings.
      * ``global_mean`` is the mean over every rating seen in ``fit`` (or
        ``None`` when no ratings were seen); it is the fallback estimate for
        users that have no entry in ``user_predictor``.
    """

    def __init__(self):
        self.user_predictor = {}
        self.global_mean = None

    def fit(self, user_ratings_dict):
        """Compute the per-user means and the global mean.

        ``user_ratings_dict`` maps a user id to a sequence of numeric ratings.
        Users with an empty sequence get no entry (their mean would be NaN),
        so they are later served the global mean instead.
        """
        self.user_predictor = {}
        rating_total = 0.0
        rating_count = 0
        for user_id, user_ratings in user_ratings_dict.items():
            # len() rather than truthiness so numpy arrays are accepted too.
            if len(user_ratings) == 0:
                continue
            self.user_predictor[user_id] = np.mean(user_ratings)
            rating_total += np.sum(user_ratings)
            rating_count += len(user_ratings)
        self.global_mean = rating_total / rating_count if rating_count else None

    def _estimate(self, user_id):
        """Return the fitted mean for ``user_id``, else the global mean.

        Raises ``KeyError`` only when the user is unknown and no global mean
        is available (nothing was fitted).
        """
        try:
            return self.user_predictor[user_id]
        except KeyError:
            if self.global_mean is None:
                raise
            return self.global_mean

    def predict(self, testing_data):
        """Build one ``Prediction`` per row of ``testing_data``.

        Each row is indexed as ``row[0]`` (user id), ``row[1]`` and ``row[2]``;
        the three values are passed through unchanged as the first three
        ``Prediction`` fields, followed by the estimate and ``None``.
        """
        return [
            Prediction(row[0], row[1], row[2], self._estimate(row[0]), None)
            for row in testing_data
        ]
    
# Load ratings.csv into three parallel columns plus two groupings.
users_arr, movies_arr, rating_arr = [], [], []
movie_dic = {}          # movie id -> list of ratings given to that movie
user_ratings_dict = {}  # user id  -> list of ratings given by that user

with open('ratings.csv') as csvRatingFile:
    csvReader = csv.reader(csvRatingFile)
    next(csvReader)  # the first row is skipped (header line)
    for row in csvReader:
        # The first three columns are parsed as user id, movie id, rating.
        user_id = int(row[0])
        movies_id = int(row[1])
        rating = float(row[2])

        users_arr.append(user_id)
        movies_arr.append(movies_id)
        rating_arr.append(rating)

        movie_dic.setdefault(movies_id, []).append(rating)
        user_ratings_dict.setdefault(user_id, []).append(rating)

# Dataset summary: total number of ratings and of distinct users / movies.
num_ratings = len(rating_arr)
num_users = len(set(users_arr))
num_movies = len(set(movies_arr))

In [12]:
#Q 31
# Score the naive filter on untrimmed test sets: one 90/10 hold-out split,
# then 10-fold cross-validation.
# NOTE(review): `data` is not defined in the cells shown here; presumably it is
# the surprise Dataset built from the same ratings -- confirm. If so, fitting
# on user_ratings_dict (every rating read from ratings.csv) means each test
# rating also contributes to its own user's mean.
nf = NaiveFilter()
reader = Reader(rating_scale=(0,5))  # not referenced again in this cell
trainset, testset = train_test_split(data, test_size=0.1)
# Fit once: fit() only reads user_ratings_dict, so the result does not depend
# on the split and the same fitted filter serves every fold below.
nf.fit(user_ratings_dict)
r = nf.predict(testset)
accuracy.rmse(r, verbose=True)

n_splits = 10
kf = KFold(n_splits=n_splits)
set_kf = kf.split(data)

rmse_sum = 0
for trainset, testset in set_kf:  # accumulate the RMSE of each fold
    predictions = nf.predict(testset)
    rmse_sum += accuracy.rmse(predictions, verbose=True)

rmse_avg = rmse_sum / n_splits
# Single formatted string: print("...", x) shows a tuple under Python 2.
print("avg rmse: %s" % rmse_avg)

RMSE: 1.3833
RMSE: 1.3852
RMSE: 1.3788
RMSE: 1.3772
RMSE: 1.3915
RMSE: 1.3843
RMSE: 1.3744
RMSE: 1.3918
RMSE: 1.3827
RMSE: 1.3797
RMSE: 1.3849
('avg rmse: ', 1.3830500261241228)


In [45]:
#Q 32
# Score the naive filter with 10-fold cross-validation, keeping in each test
# fold only the movies that have more than 2 ratings.
# NOTE(review): the rating count is taken inside the test fold, not over the
# whole dataset -- confirm this is the intended definition of the trimming.
nf = NaiveFilter()
reader = Reader(rating_scale=(0,5))  # not referenced again in this cell
trainset, testset = train_test_split(data, test_size=0.1)
# Fit once: fit() only reads user_ratings_dict, so it is split-independent.
nf.fit(user_ratings_dict)
# This hold-out split is scored without any trimming.
r = nf.predict(testset)
accuracy.rmse(r, verbose=True)

n_splits = 10
kf = KFold(n_splits=n_splits)
set_kf = kf.split(data)

rmse_sum = 0
folds_scored = 0
for trainset, testset in set_kf:  # accumulate the RMSE of each trimmed fold
    testset = pd.DataFrame(testset, columns=['userId','movieId','rating'])
    testset = testset.groupby('movieId').filter(lambda x: len(x)>2)
    testset = np.array(testset).tolist()
    if not testset:
        # accuracy.rmse raises ValueError on an empty prediction list, so a
        # fold with nothing left after trimming is skipped.
        continue
    predictions = nf.predict(testset)
    rmse_sum += accuracy.rmse(predictions, verbose=True)
    folds_scored += 1

# Average over the folds that were actually scored; NaN if none were.
rmse_avg = rmse_sum / folds_scored if folds_scored else float('nan')
# Single formatted string: print("...", x) shows a tuple under Python 2.
print("avg rmse: %s" % rmse_avg)

RMSE: 1.3905
RMSE: 1.4260
RMSE: 1.4167
RMSE: 1.4251
RMSE: 1.4096
RMSE: 1.4151
RMSE: 1.4265
RMSE: 1.4140
RMSE: 1.4145
RMSE: 1.4125
RMSE: 1.4231
('avg rmse: ', 1.4183049759785384)


In [46]:
#Q 33
# Score the naive filter with 10-fold cross-validation, keeping in each test
# fold only the movies that have fewer than 2 ratings in that fold.
# NOTE(review): `< 2` keeps single-rating movies only. The Q 32 cell keeps
# `> 2`, so movies with exactly 2 ratings land in neither set -- confirm
# whether `<= 2` was intended. The count is also per test fold, not over the
# whole dataset.
nf = NaiveFilter()
reader = Reader(rating_scale=(0,5))  # not referenced again in this cell
trainset, testset = train_test_split(data, test_size=0.1)
# Fit once: fit() only reads user_ratings_dict, so it is split-independent.
nf.fit(user_ratings_dict)
# This hold-out split is scored without any trimming.
r = nf.predict(testset)
accuracy.rmse(r, verbose=True)

n_splits = 10
kf = KFold(n_splits=n_splits)
set_kf = kf.split(data)

rmse_sum = 0
folds_scored = 0
for trainset, testset in set_kf:  # accumulate the RMSE of each trimmed fold
    testset = pd.DataFrame(testset, columns=['userId','movieId','rating'])
    testset = testset.groupby('movieId').filter(lambda x: len(x)<2)
    testset = np.array(testset).tolist()
    if not testset:
        # accuracy.rmse raises ValueError on an empty prediction list, so a
        # fold with nothing left after trimming is skipped.
        continue
    predictions = nf.predict(testset)
    rmse_sum += accuracy.rmse(predictions, verbose=True)
    folds_scored += 1

# Average over the folds that were actually scored; NaN if none were.
rmse_avg = rmse_sum / folds_scored if folds_scored else float('nan')
# Single formatted string: print("...", x) shows a tuple under Python 2.
print("avg rmse: %s" % rmse_avg)

RMSE: 1.3693
RMSE: 1.2986
RMSE: 1.2531
RMSE: 1.2775
RMSE: 1.2875
RMSE: 1.2885
RMSE: 1.3118
RMSE: 1.3161
RMSE: 1.3034
RMSE: 1.2887
RMSE: 1.2845
('avg rmse: ', 1.2909739122289225)


In [47]:
#Q 34
# Score the naive filter with 10-fold cross-validation, keeping in each test
# fold only the movies with more than 5 ratings whose rating variance
# (np.var with its default arguments) is at least 2.
# NOTE(review): `> 5` is strict -- confirm whether "at least 5" was intended.
# Count and variance are both computed inside the test fold, not over the
# whole dataset.
nf = NaiveFilter()
reader = Reader(rating_scale=(0,5))  # not referenced again in this cell
trainset, testset = train_test_split(data, test_size=0.1)
# Fit once: fit() only reads user_ratings_dict, so it is split-independent.
nf.fit(user_ratings_dict)
# This hold-out split is scored without any trimming.
r = nf.predict(testset)
accuracy.rmse(r, verbose=True)

n_splits = 10
kf = KFold(n_splits=n_splits)
set_kf = kf.split(data)

rmse_sum = 0
folds_scored = 0
for trainset, testset in set_kf:  # accumulate the RMSE of each trimmed fold
    testset = pd.DataFrame(testset, columns=['userId','movieId','rating'])
    testset = testset.groupby('movieId').filter(lambda x: len(x)>5 and np.var(x['rating'])>=2)
    testset = np.array(testset).tolist()
    if not testset:
        # accuracy.rmse raises ValueError on an empty prediction list, which
        # is a real possibility with a filter this selective; skip the fold.
        continue
    predictions = nf.predict(testset)
    rmse_sum += accuracy.rmse(predictions, verbose=True)
    folds_scored += 1

# Average over the folds that were actually scored; NaN if none were.
rmse_avg = rmse_sum / folds_scored if folds_scored else float('nan')
# Single formatted string: print("...", x) shows a tuple under Python 2.
print("avg rmse: %s" % rmse_avg)

RMSE: 1.3978
RMSE: 1.7345
RMSE: 1.5251
RMSE: 1.6742
RMSE: 1.6040
RMSE: 1.6637
RMSE: 1.5689
RMSE: 1.5904
RMSE: 1.4622
RMSE: 1.5084
RMSE: 1.6369
('avg rmse: ', 1.5968301985877393)
