In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [3]:
# Both CSV files live in the same dataset folder; keeping the location in one
# constant means it only has to be changed in one place.
DATA_DIR = '../anime-recommendations-database'

# anime.csv  -> anime_id, name, genre, type, episodes, rating, members
# rating.csv -> user_id, anime_id, rating
anime = pd.read_csv(DATA_DIR + '/anime.csv')
animeRating = pd.read_csv(DATA_DIR + '/rating.csv')

### Cleaning Data for Missing Values

In [4]:
# Count the missing values per column in both tables.
# print(...) with a single argument gives the same output on Python 2 and
# Python 3, and matches the print(...) calls used elsewhere in this notebook.
print(anime.isnull().sum())
print(animeRating.isnull().sum())

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64
user_id     0
anime_id    0
rating      0
dtype: int64


In [5]:
# Replace missing values with the literal string 'None' in the three columns
# that reported nulls in the check above.
# NOTE(review): 'rating' otherwise holds numbers (e.g. 7.46), so filling it
# with a string presumably leaves the column with mixed types -- confirm that
# nothing downstream needs it to stay numeric.
anime = anime.assign(
    genre=anime['genre'].fillna('None'),
    type=anime['type'].fillna('None'),
    rating=anime['rating'].fillna('None'),
)
anime.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [6]:
# Keep only rows with a strictly positive user rating, then list the distinct
# rating values that remain.
# NOTE(review): presumably non-positive values mark "no rating given" -- confirm
# against the dataset description.
animeRating = animeRating.loc[animeRating['rating'] > 0]
animeRating['rating'].unique()

array([10,  8,  6,  9,  7,  3,  5,  4,  1,  2], dtype=int64)

In [7]:
# Attach the anime metadata to every user rating. Both tables have a 'rating'
# column: the user's own rating gets the '_user' suffix, the anime table's
# rating keeps its plain name.
fullMergedAnime = pd.merge(
    animeRating,
    anime,
    on='anime_id',
    suffixes=['_user', ''],
)
fullMergedAnime.head()

Unnamed: 0,user_id,anime_id,rating_user,name,genre,type,episodes,rating,members
0,1,8074,10,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
1,3,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
2,5,8074,2,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
3,12,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
4,14,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892


### Splitting and Normalizing Data

In [9]:
# Hold out 20% of the merged rating rows for evaluation. A fixed random_state
# makes the split -- and every number computed from it below -- identical after
# a kernel restart and full re-run.
trainData, testData = train_test_split(fullMergedAnime, test_size=0.2, random_state=42)

In [204]:
# Keep only the columns used below, restricted to user_id <= 6000 to limit the
# size of the frame.
subTrainData = trainData.loc[
    trainData['user_id'] <= 6000,
    ['user_id', 'anime_id', 'name', 'rating_user'],
]

In [202]:
# user_id x anime-name matrix of training ratings; cells without a rating are
# NaN. pivot_table averages the values if a (user_id, name) pair occurs more
# than once (its default aggregation).
pivTrain = pd.pivot_table(
    subTrainData,
    index=['user_id'],
    columns=['name'],
    values='rating_user',
)
print(pivTrain.shape)
pivTrain.head()
# To inspect a single user's row: pivTrain.loc[3, :]

(5592, 6824)


name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,gdgd Fairies Movie: tte Iu Eiga wa Dou kana...?,iDOLM@STER Xenoglossia,iDOLM@STER Xenoglossia Specials,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,2.0,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,


In [23]:
# Same column selection and user_id <= 6000 limit as for the training frame,
# applied to the held-out rows.
subTestData = testData.loc[
    testData['user_id'] <= 6000,
    ['user_id', 'anime_id', 'name', 'rating_user'],
]

In [24]:
# user_id x anime-name matrix of the held-out ratings (NaN where unrated).
pivTest = pd.pivot_table(
    subTestData,
    index=['user_id'],
    columns=['name'],
    values='rating_user',
)
print(pivTest.shape)

(5087, 5080)


In [30]:
def create_Normalized_Matrix(piv):
    """Mean-centre and range-scale each row of `piv`, then transpose.

    Every row is transformed to (value - row mean) / (row max - row min).
    Missing entries, and entries that become NaN because a row's max equals
    its min, are set to 0. The result is transposed (rows of `piv` become
    columns) and columns that contain only zeros are dropped.

    Parameters
    ----------
    piv : pandas.DataFrame
        Numeric matrix, NaN where no value is present. It is not modified.

    Returns
    -------
    pandas.DataFrame
        The transposed, normalised matrix without all-zero columns.
    """
    def _scale_row(row):
        # 0/0 for a constant row yields NaN here; it is zero-filled below.
        spread = np.max(row) - np.min(row)
        return (row - np.mean(row)) / spread

    scaled = piv.apply(_scale_row, axis=1).fillna(0)
    transposed = scaled.transpose()
    # Keep only the columns that still carry at least one non-zero value.
    hasSignal = (transposed != 0).any(axis=0)
    return transposed.loc[:, hasSignal]

In [137]:
# Normalised training matrix: anime names as rows, user_ids as columns.
pivTrainNorm = create_Normalized_Matrix(piv=pivTrain)
print(pivTrainNorm.shape)

(6824, 5121)


In [241]:
# Normalised held-out matrix: anime names as rows, user_ids as columns.
pivTestNorm = create_Normalized_Matrix(piv=pivTest)
print(pivTestNorm.shape)

(5080, 4358)


### Create Sparse Matrices

In [36]:
def create_Sparse_Matrix(userAnimeMatrix):
    """Return the values of `userAnimeMatrix` as a SciPy CSR sparse matrix.

    Only the underlying array is converted; the DataFrame's row and column
    labels are not carried over.
    """
    denseValues = userAnimeMatrix.values
    return sp.sparse.csr_matrix(denseValues)

In [37]:
# Sparse (CSR) copy of the normalised training matrix for the similarity step.
pivTrainSparse = create_Sparse_Matrix(userAnimeMatrix=pivTrainNorm)

### User-User Cosine Similarity

In [49]:
def user_Cosine_Similarity(userMatrix):
    """Pairwise cosine similarity between the columns of `userMatrix`.

    `cosine_similarity` compares rows, so the matrix is transposed first; the
    result is a square array with one row and one column per column of
    `userMatrix`.
    """
    columnsAsRows = userMatrix.T
    return cosine_similarity(columnsAsRows)

def user_Similarity_DataFrame(userSim, pivNorm):
    """Wrap a square similarity array in a labelled DataFrame.

    Both the index and the columns of the result are taken from
    `pivNorm.columns`, so `userSim` must have one row and one column per
    column of `pivNorm`, in the same order.
    """
    labels = pivNorm.columns
    return pd.DataFrame(data=userSim, index=labels, columns=labels)

In [54]:
# User-user cosine similarities on the training data, labelled by user_id on
# both axes.
userCosineSim = user_Cosine_Similarity(userMatrix=pivTrainSparse)
trainUserSimData = user_Similarity_DataFrame(
    userSim=userCosineSim,
    pivNorm=pivTrainNorm,
)
trainUserSimData.head()

user_id,3,5,7,8,10,11,12,14,16,17,...,5986,5989,5990,5991,5992,5993,5994,5997,5999,6000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1.0,0.036955,0.101795,-0.021938,-0.034424,0.044962,-0.001709,-0.006528,0.086482,0.157174,...,0.023348,0.033997,0.001897,0.004693,-0.025995,-0.011623,0.004257,0.03961,0.0,0.0
5,0.036955,1.0,0.068027,0.033965,0.031426,0.012828,0.074144,0.086708,0.006298,0.108224,...,0.020448,-0.011153,0.018907,0.029335,0.034503,0.003326,0.076578,0.021865,-0.038883,0.003053
7,0.101795,0.068027,1.0,0.0,0.02931,0.007777,0.03288,0.064169,0.009683,0.094038,...,0.026154,-0.056127,-0.016382,-0.006811,0.014396,-0.021295,0.016623,0.034716,0.0,0.010726
8,-0.021938,0.033965,0.0,1.0,0.074536,-0.010739,0.00315,0.079841,0.0,-0.005092,...,0.0,0.0,0.0,0.0,0.0,0.0,0.010887,0.0,0.0,0.0
10,-0.034424,0.031426,0.02931,0.074536,1.0,0.048028,0.007043,0.05951,0.0,0.067629,...,0.0,0.0,0.275519,0.0,0.0,0.0,0.108379,0.0,0.0,0.0


### User-User Implementation

In [98]:
def users_Avg_Rating_Matrix(userMatrix):
    """Return the mean of the non-NaN entries of every column as a dict.

    Parameters
    ----------
    userMatrix : pandas.DataFrame
        Numeric matrix with one column per user and NaN where the user has
        not rated the item.

    Returns
    -------
    dict
        Maps each column label to that column's mean over its non-NaN values.
        A column without any rating maps to NaN instead of raising
        ZeroDivisionError.
    """
    # One vectorised reduction replaces the nested Python loops over every
    # cell of the matrix.
    return userMatrix.mean(axis=0, skipna=True).to_dict()

def user_Avg_Rating(userID):
    """Return the mean training rating of one user.

    Reads the row labelled `userID` from the module-level `pivTrain` and
    averages its non-NaN entries. Returns NaN (instead of raising
    ZeroDivisionError) when the row holds no rating at all. An unknown
    `userID` still raises KeyError from the `.loc` lookup.
    """
    # Series.mean skips NaN by default, which replaces the manual
    # sum-and-count loop.
    return pivTrain.loc[userID, :].mean()

In [99]:
# Mean training rating per user, keyed by user_id. pivTrain is transposed
# because users_Avg_Rating_Matrix works column by column.
avgUserRatings = users_Avg_Rating_Matrix(userMatrix=pivTrain.transpose())
# Spot check for a single user: user_Avg_Rating(3)

In [239]:
def rating_User_Prediction(userID, animeName, topN=20):
    """Predict the rating `userID` would give `animeName` (user-user CF).

    The prediction is the user's own mean rating plus the similarity-weighted
    average of how far the most similar raters of `animeName` deviate from
    their own mean rating:

        userAvg + sum(sim_x * (rating_x - avg_x)) / sum(|sim_x|)

    Reads the module-level `pivTrainNorm`, `pivTrain`, `trainUserSimData` and
    `avgUserRatings`.

    Parameters
    ----------
    userID : label of the user in `pivTrainNorm.columns`.
    animeName : column label of the anime in `pivTrain`.
    topN : int, optional
        Maximum number of most-similar raters to use (default 20).

    Returns
    -------
    float or str
        The predicted rating. When `userID` is not in `pivTrainNorm.columns`
        the "No data available ..." message string is returned instead. When
        the anime is absent from `pivTrain`, or no usable neighbour exists,
        the user's own mean rating is returned.
    """
    if userID not in pivTrainNorm.columns:
        return ("No data available for UserID: {}".format(userID))

    userAvg = avgUserRatings[userID]

    # An anime that never occurs in the training pivot has no neighbour
    # ratings; fall back to the user's mean instead of raising KeyError.
    if animeName not in pivTrain.columns:
        return userAvg

    # Sort only the single similarity column that is needed, rather than
    # reordering the entire user-user DataFrame on every call.
    allUserSimilarity = trainUserSimData[userID].sort_values(ascending=False)
    neighbourSimilarity = allUserSimilarity[allUserSimilarity.index != userID]

    # Ratings of this anime aligned to the similarity order (NaN = unrated).
    neighbourRatings = pivTrain[animeName].reindex(neighbourSimilarity.index)
    # `>= 0.01` is False for NaN, so this drops both missing and near-zero
    # ratings; head() then keeps the topN most similar remaining raters.
    topRatings = neighbourRatings[neighbourRatings >= 0.01].head(topN)

    ratingWeight = 0.0
    similarityWeight = 0.0
    for userX, userXRating in topRatings.items():
        userXSimilarity = neighbourSimilarity.loc[userX]
        ratingWeight += (userXRating - avgUserRatings[userX]) * userXSimilarity
        similarityWeight += abs(userXSimilarity)

    # No rater found, or every rater has zero similarity: nothing to weigh by.
    if similarityWeight == 0:
        return userAvg

    return userAvg + (ratingWeight / similarityWeight)

In [240]:
# Example: predicted rating of user 3 for "Zombie-Loan".
# print(...) with a single argument works on both Python 2 and Python 3.
prediction = rating_User_Prediction(3, "Zombie-Loan")
print(prediction)

6.423995174734735


### Evaluation using MAE and RMSE

In [288]:
import math


def _iter_Prediction_Errors():
    """Yield (predicted - true) for every scorable held-out rating.

    Walks every user in `pivTest` that is also a column of `pivTestNorm` and
    every anime that user actually rated. Predictions that are not usable
    numbers are skipped: `rating_User_Prediction` returns a message string
    for users it has no data for, and a NaN prediction would otherwise turn
    the whole metric into NaN.
    """
    for userID in pivTest.index:
        if userID not in pivTestNorm.columns:
            continue
        # dropna() leaves only the anime this user rated in the test split.
        userRatings = pivTest.loc[userID, :].dropna()
        for animeName, trueRating in userRatings.items():
            predictedRating = rating_User_Prediction(userID, animeName)
            if isinstance(predictedRating, str) or np.isnan(predictedRating):
                continue
            yield predictedRating - trueRating


# Mean Absolute Error
def calc_Mean_Absolute_Error():
    """Mean absolute error of `rating_User_Prediction` on the held-out ratings.

    Returns NaN when no held-out rating could be scored.
    """
    absErrorSum = 0.0
    count = 0
    for error in _iter_Prediction_Errors():
        absErrorSum += abs(error)
        count += 1
    if count == 0:
        return float('nan')
    return absErrorSum / count


# Root Mean Squared Error
def calc_Root_Mean_Square_Error():
    """Root mean squared error of `rating_User_Prediction` on the held-out ratings.

    Returns NaN when no held-out rating could be scored.
    """
    squaredErrorSum = 0.0
    count = 0
    for error in _iter_Prediction_Errors():
        squaredErrorSum += error ** 2
        count += 1
    if count == 0:
        return float('nan')
    return math.sqrt(squaredErrorSum / count)

In [287]:
# NOTE: this predicts every held-out rating and can run for a long time.
# print(...) with a single argument works on both Python 2 and Python 3.
meanAbsError = calc_Mean_Absolute_Error()
print(meanAbsError)

KeyboardInterrupt: 

In [None]:
# NOTE: this predicts every held-out rating and can run for a long time.
# print(...) with a single argument works on both Python 2 and Python 3.
rootMeanSquareError = calc_Root_Mean_Square_Error()
print(rootMeanSquareError)