<h3 align="center">Movie Rating Prediction using Collaborative Filtering</h3> 

#### Dependencies for the project

In [3]:
import pandas as pd
import numpy as np
import operator
from math import sqrt
import timeit

####    Reading the dataset and assigning column names.

In [4]:
columns = ['user','movie','rating','idk']
data = pd.read_csv("ratings_training_95.csv",names = columns)
test = pd.read_csv("ratings_test_05.csv",names = columns)

#### Removing the extra feature/column

In [5]:
data = data.drop('idk', 1)
test = test.drop('idk', 1)

### Scaling Data
    * Scaling data for pearson corelation or centered cosine

In [6]:
#
# Getting average rating for a user
#
all_user = list(set(data.user))
avg_rating_of_user = []
for i in range(len(all_user)):
    df = data.ix[data.user==i]
    avg_rating_of_user.append([i,np.average(df.rating.values)])

In [7]:
all_users= data.user.unique()
avg_rating = []
for i in all_users:
    avg_rating.append([i,np.average(data[data.user==i].rating.values)])
data['scaled'] = 0
for i in all_users:
    data['scaled'][data.user == i] = avg_rating[i][1]

data['adj_rating'] = data.rating - data.scaled

# creates pivoted dataframe
data_ibs = data.pivot(index='user', columns='movie', values='adj_rating')
data_ibs = data_ibs.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [8]:
# this similarity should be between -1 to 1
def my_cosine(vec1,vec2):
    num = 0
    den1 = 0
    den2 = 0
    for i in range(len(vec1)):
        num += vec1[i]*vec2[i]
        if(vec1[i]==0 or vec2[i]==0):
            continue
        else:
            den1 += vec1[i]**2
            den2 += vec2[i]**2
    den = sqrt(den1) * sqrt(den2)
    sim = num/den
    return sim

# Only uncomment when you want to create one.

sim_mat = pd.DataFrame(np.zeros((9216, 9216)))
sim_mat = sim_mat.replace(0,-5)

# Sim function is the main Guy

In [9]:
def sim(test_instance,data,pivot_df):
    similarity = []
    item_for_test = int(test_instance.movie.values)
    for_user = int(test_instance.user.values)
    checkonly_movies = data.ix[data.user==for_user].movie.values  # normal df is used here
    for m in checkonly_movies:
        # using pivot data
        if sim_mat[item_for_test][m] == -5:
            sim = my_cosine(pivot_df[item_for_test], pivot_df[m])
            sim_mat[item_for_test][m] = sim
            sim_mat[m][item_for_test] = sim
            rating = pivot_df[m][for_user]
            similarity.append((m,sim,for_user,item_for_test,rating))
        else:
            sim = sim_mat[item_for_test][m]
            rating = pivot_df[m][for_user]
            similarity.append((m,sim,for_user,item_for_test,rating))
    #similarity.sort(key=operator.itemgetter(1))
    return similarity

In [10]:
from datetime import datetime
prediction = []
count = 0
start = timeit.default_timer()
for i in range(len(test)):
    count += 1
    simi = sim(test.loc[i:i,],data,data_ibs)
    sim1 = 0
    semi_sim = 0
    for i in simi:
        from_movie = int(i[0])
        similar = i[1]
        user = int(i[2])
        pred_movie = i[3]
        if(similar > 0):
            tmp = data[(data.movie==from_movie) & (data.user == user)].rating.values*similar
            semi_sim += tmp[0]
            sim1 += similar
            temprate = test[(test.movie==pred_movie) & (test.user == user)].rating.values
    prediction.append([user,pred_movie,temprate,semi_sim,sim1])
    if(count%100 == 0):
        print('Step:',count, 'time:',str(datetime.now().time()))
stop = timeit.default_timer()
print('total time:',stop - start) 

Step: 100 time: 16:51:42.206831
Step: 200 time: 17:37:15.806758
Step: 300 time: 18:04:27.629234
Step: 400 time: 18:13:20.191576
Step: 500 time: 18:27:40.436211
Step: 600 time: 18:50:23.642408
Step: 700 time: 18:53:08.708262
Step: 800 time: 19:00:51.304764
Step: 900 time: 19:08:32.679912
Step: 1000 time: 19:10:34.263853
Step: 1100 time: 19:40:15.706248
Step: 1200 time: 20:10:14.123741
Step: 1300 time: 20:14:17.774355
Step: 1400 time: 20:17:02.797299
Step: 1500 time: 20:20:53.164768
Step: 1600 time: 20:32:14.826082
Step: 1700 time: 20:38:18.486291
Step: 1800 time: 20:46:34.106939
Step: 1900 time: 20:51:23.369293
Step: 2000 time: 20:55:47.917921
Step: 2100 time: 20:57:25.349532
Step: 2200 time: 21:01:37.305512
Step: 2300 time: 21:05:31.542956
Step: 2400 time: 21:09:48.616596
Step: 2500 time: 21:13:05.999485
Step: 2600 time: 21:18:09.027233
Step: 2700 time: 21:23:52.460510
Step: 2800 time: 21:27:30.542572
Step: 2900 time: 21:44:33.096206
Step: 3000 time: 21:56:30.182921
Step: 3100 time: 21

In [15]:
# save it for future use.
sim_mat.to_csv('similaritymatrix_95_05.csv')

col = ['user','movie','rating','sim_num','sim_den']
df= pd.DataFrame(prediction, columns=col)
df['prediction'] = round(df.sim_num/df.sim_den,1)
actual_rating = []
for i in range(len(df.rating)):
    actual_rating.append(df.rating[i][0])
df = df.drop('rating', 1)
df = df.drop('sim_num', 1)
df = df.drop('sim_den', 1)
df['rating'] = actual_rating

df = df[['user','movie','rating','prediction']]
df.to_csv('ratings_predictions_05_cf.csv')

In [12]:
mse1_20 = (((df.prediction - df.rating) ** 2).sum()) / len(df.prediction)
pow(mse1_20,0.5)

0.8986882417760402