In [1]:
# Import relevant libraries 

import pandas as pd
import numpy as np
import scipy as sp

import operator
%matplotlib inline

In [155]:
#Read the Dataset
anime = pd.read_csv('anime.csv')
rating = pd.read_csv('rating.csv')

In [156]:
#Replace the -1 rating with Nan
rating.rating.replace({-1: np.nan}, inplace = True)
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,
1,1,24,
2,1,79,
3,1,226,
4,1,241,


In [157]:
# Selecte Only TV series from the dataset

anime_tv = anime[anime['type']=='TV']
anime_tv.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266
5,32935,Haikyuu!!: Karasuno Koukou VS Shiratorizawa Ga...,"Comedy, Drama, School, Shounen, Sports",TV,10,9.15,93351


In [159]:
# Join the two dataframes on the anime_id columns

merged = rating.merge(anime_tv, left_on = 'anime_id', right_on = 'anime_id', suffixes= ['_user', ''])
merged.rename(columns = {'rating_user':'user_rating'}, inplace = True)

In [163]:
# For computing reasons I'm limiting the dataframe length to 1000 users and 2000 animes

merged=merged[['user_id', 'anime_id', 'user_rating']]
#merged = merged[pd.notnull(merged['user_rating'])]
merged_sub= merged[merged.user_id <= 1000]
train = merged_sub[merged_sub.anime_id<=2000]
train.head()

Unnamed: 0,user_id,anime_id,user_rating
0,1,20,
1,3,20,8.0
2,5,20,6.0
3,6,20,
4,10,20,


In [169]:
train = train[pd.notnull(train['user_rating'])]
train.head()

Unnamed: 0,user_id,anime_id,user_rating
1,3,20,8.0
2,5,20,6.0
5,21,20,8.0
6,28,20,9.0
7,34,20,9.0


## Calculate Average Episodes and construct user x episodes matrix

In [191]:
avg_good_epi=np.zeros(1001)
avg_bad_epi=np.zeros(1001)
avg_epi=np.zeros(1001)

In [240]:
count_good=np.zeros(1001)
count_bad=np.zeros(1001)
count_all=np.zeros(1001)

In [249]:
for records in train.iterrows():
    movie=records[1]['anime_id']
    usr=int(records[1]['user_id'])
    if(usr==1):
        print(movie)
    rating=records[1]['user_rating']
    number=anime_tv[anime_tv['anime_id']==movie]['episodes']
    epi=int(number)
    if rating>5:
        avg_good_epi[usr]+=epi
        count_good[usr]+=1
    else:
        count_bad[usr]+=1
        avg_bad_epi[usr]+=epi
    avg_epi[usr]+=epi
    count_all[usr]+=1
    

In [254]:
for x in range(1001):
    if count_good[x]>0:
        avg_good_epi[x]/=count_good[x]
    if count_bad[x]>0:
        avg_bad_epi[x]/=count_bad[x]
    if count_all[x]>0:
        avg_epi/=count_all[x]

In [196]:
user_sim=np.zeros((1001,1001))

In [212]:
t=train.sample(frac=0.75,random_state=5)
test=train.drop(t.index)

In [227]:
u=np.concatenate((avg_bad_epi,avg_good_epi),axis=0)

In [256]:
x=np.reshape(avg_bad_epi,(1001,1))
y=np.reshape(avg_good_epi,(1001,1))
z=np.reshape(avg_epi,(1001,1))

In [257]:
u=np.concatenate((x,y),axis=1)

In [233]:
u=np.concatenate((x,y),axis=1)

In [263]:
user_epi_matrix=np.concatenate((u,z),axis=1)

## Create User x Anime matrix for train and test

In [213]:
user_movie_matrix_test=np.zeros((1001,2001))

In [213]:
user_movie_matrix_test=np.zeros((1001,2001))

In [201]:
for x in train.iterrows():
    i=int(x[1]['user_id'])
    j=int(x[1]['anime_id'])
    rating=x[1]['user_rating']
    user_movie_matrix[i][j]=rating

In [214]:
for x in test.iterrows():
    i=int(x[1]['user_id'])
    j=int(x[1]['anime_id'])
    rating=x[1]['user_rating']
    user_movie_matrix_test[i][j]=rating

## Calculate Mean Rating for every user

In [211]:
mean_rating=[0]*1001
for x in range(1001):
    u = train.loc[train['user_id']==x]
    if(len(u.user_rating)>0):
        mean_rating[x]=sum(u.user_rating)/len(u.user_rating)
mean_rating[3]

7.769230769230769

## Define function for cosine similarity

In [208]:
def cos_sim(x,y):
    dot=np.dot(x,y)
    norm=np.linalg.norm(x)*np.linalg.norm(y)
    sim=dot/norm
    return sim

## Baseline prediction

In [267]:
k=0.05
pred= {'user_id':[], 'anime_id':[], 'rating':[]}
for x in range(1001):
    movies=test['anime_id'][test['user_id']==x]
    for m in movies:
        pred['anime_id'].append(m)
        similar_users=train['user_id'][train['anime_id']==m]
        rating=0
        for u in similar_users:
            if u!=x:
                rating+=cos_sim(user_movie_matrix[x],user_movie_matrix[u])*(user_movie_matrix[u][m]-mean_rating[u])
        r=mean_rating[x]+k*rating
        pred['user_id'].append(x)
        pred['rating'].append(r)

## Prediction using sum of cosine similarity of rating and episodes

In [329]:
k=0.05
b=0.8
pred_epi_sum= {'user_id':[], 'anime_id':[], 'rating':[]}
for x in range(1001):
    movies=test['anime_id'][test['user_id']==x]
    for m in movies:
        pred_epi_sum['anime_id'].append(m)
        similar_users=train['user_id'][train['anime_id']==m]
        rating=0
        for u in similar_users:
            if u!=x:
                cos_sim_rat=cos_sim(user_movie_matrix[x],user_movie_matrix[u])
                cos_sim_epi=cos_sim(user_epi_matrix[x],user_epi_matrix[u])
                rating+=((b*cos_sim_rat)+((1-b)*cos_sim_epi))*(user_movie_matrix[u][m]-mean_rating[u])
        r=mean_rating[x]+k*rating
        pred_epi_sum['user_id'].append(x)
        pred_epi_sum['rating'].append(r)

## Prediction using product of cosine similarity of rating and episodes

In [281]:
k=0.05
pred_epi= {'user_id':[], 'anime_id':[], 'rating':[]}
for x in range(1001):
    movies=test['anime_id'][test['user_id']==x]
    for m in movies:
        pred_epi['anime_id'].append(m)
        similar_users=train['user_id'][train['anime_id']==m]
        rating=0
        for u in similar_users:
            if u!=x:
                cos_sim_rat=cos_sim(user_movie_matrix[x],user_movie_matrix[u])
                cos_sim_epi=cos_sim(user_epi_matrix[x],user_epi_matrix[u])
                rating+=((cos_sim_rat)*(cos_sim_epi))*(user_movie_matrix[u][m]-mean_rating[u])
        r=mean_rating[x]+k*rating
        pred_epi['user_id'].append(x)
        pred_epi['rating'].append(r)

## Prediction by taking in account the closeness from the anime to be predicted

In [303]:
k=0.05
pred_rating= {'user_id':[], 'anime_id':[], 'rating':[]}
for x in range(1001):
    movies=test['anime_id'][test['user_id']==x]
    for m in movies:
        pred_rating['anime_id'].append(m)
        similar_users=train['user_id'][train['anime_id']==m]
        rating=0
        for u in similar_users:
            if u!=x:
                rating+=cos_sim(user_movie_matrix[x],user_movie_matrix[u])*(user_movie_matrix[u][m]-mean_rating[u])
        r=mean_rating[x]+k*rating
        number=anime_tv[anime_tv['anime_id']==m]['episodes']
        epi=int(number)
        good_close=abs(avg_good_epi[x]-epi)
        bad_close=abs(avg_bad_epi[x]-epi)
        if(good_close>bad_close):
            if(avg_good_epi[x]>0):
                r+=0.2*(abs(avg_good_epi[x]-epi)/avg_good_epi[x])
        else:
            if(avg_bad_epi[x]>0):
                r-=0.2*(abs(avg_bad_epi[x]-epi)/avg_bad_epi[x])
        pred_rating['user_id'].append(x)
        pred_rating['rating'].append(r)

In [268]:
predictions = pd.DataFrame.from_dict(pred)

In [330]:
predictions_epi_sum=pd.DataFrame.from_dict(pred_epi_sum)

In [282]:
predictions_epi=pd.DataFrame.from_dict(pred_epi)

In [304]:
predictions_rating=pd.DataFrame.from_dict(pred_rating)

## Baseline Results

In [269]:
import math
RMSE=0.0
MAE=0.0
count=0
for pred in predictions.iterrows():
    i=int(pred[1]['user_id'])
    j=int(pred[1]['anime_id'])
    rating=pred[1]['rating']
    MAE+=abs(rating-user_movie_matrix_test[i][j])
    RMSE+=(rating-user_movie_matrix_test[i][j])**2
    count+=1
MAE/=count
RMSE/=count
RMSE=math.sqrt(RMSE)
print("MAE",MAE)
print("RMSE",RMSE)


MAE 1.0895249319737477
RMSE 1.5054484040036809


## Results on sum of cosine similarity of rating and episodes

In [331]:
import math
RMSE=0.0
MAE=0.0
count=0
for pred in predictions_epi_sum.iterrows():
    i=int(pred[1]['user_id'])
    j=int(pred[1]['anime_id'])
    rating=pred[1]['rating']
    MAE+=abs(rating-user_movie_matrix_test[i][j])
    RMSE+=(rating-user_movie_matrix_test[i][j])**2
    count+=1
MAE/=count
RMSE/=count
RMSE=math.sqrt(RMSE)
print("MAE",MAE)
print("RMSE",RMSE)


MAE 1.2417140934955182
RMSE 1.8315199611732704


## Result on product of both the similarity

In [283]:
import math
RMSE=0.0
MAE=0.0
count=0
for pred in predictions_epi.iterrows():
    i=int(pred[1]['user_id'])
    j=int(pred[1]['anime_id'])
    rating=pred[1]['rating']
    MAE+=abs(rating-user_movie_matrix_test[i][j])
    RMSE+=(rating-user_movie_matrix_test[i][j])**2
    count+=1
MAE/=count
RMSE/=count
RMSE=math.sqrt(RMSE)
print("MAE",MAE)
print("RMSE",RMSE)


MAE 1.0273690125119073
RMSE 1.390845600498123


## Result using Closeness from the average good or bad episode

In [305]:
# Your Code Here...
# Report Mean Absolute Error and Root Mean Squared Error for test set

import math
RMSE=0.0
MAE=0.0
count=0
for pred in predictions_rating.iterrows():
    i=int(pred[1]['user_id'])
    j=int(pred[1]['anime_id'])
    rating=pred[1]['rating']
    MAE+=abs(rating-user_movie_matrix_test[i][j])
    RMSE+=(rating-user_movie_matrix_test[i][j])**2
    count+=1
MAE/=count
RMSE/=count
RMSE=math.sqrt(RMSE)
print("MAE",MAE)
print("RMSE",RMSE)


MAE 1.1874551048829776
RMSE 1.663290973754889
