In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
anime = pd.read_csv('../anime-recommendations-database/anime.csv')
animeRating = pd.read_csv('../anime-recommendations-database/rating.csv')

### Cleaning Data for Missing Values

In [3]:
#Checking which values are null
print anime.isnull().sum()
print animeRating.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64
user_id     0
anime_id    0
rating      0
dtype: int64


In [4]:
anime['genre'] = anime['genre'].fillna('None')
anime['type'] = anime['type'].fillna('None')
anime['rating'] = anime['rating'].fillna('None')
anime.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [5]:
animeRating = animeRating[animeRating.rating > 0]
animeRating.rating.unique()

array([10,  8,  6,  9,  7,  3,  5,  4,  1,  2], dtype=int64)

In [6]:
fullMergedAnime = animeRating.merge(anime, left_on = 'anime_id', right_on = 'anime_id', suffixes= ['_user', ''])
fullMergedAnime.head()

Unnamed: 0,user_id,anime_id,rating_user,name,genre,type,episodes,rating,members
0,1,8074,10,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
1,3,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
2,5,8074,2,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
3,12,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
4,14,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892


### Splitting and Normalizing Data

In [7]:
trainData, testData = train_test_split(fullMergedAnime, test_size=0.2)
fullMergedAnime.shape

(6337239, 9)

In [8]:
subTrainData = trainData[['user_id', 'anime_id', 'name', 'rating_user']]
#Limiting the dataframe
subTrainData = subTrainData[subTrainData.user_id <= 6000]

In [9]:
pivTrain = subTrainData.pivot_table(index=['user_id'], columns=['name'], values='rating_user')
print(pivTrain.shape)
pivTrain.head()

(5604, 6794)


name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,gdgd Fairies 2,gdgd Fairies Movie: tte Iu Eiga wa Dou kana...?,iDOLM@STER Xenoglossia,iDOLM@STER Xenoglossia Specials,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,2.0,,,,
7,,,,,,,,,,,...,,,,,,,,,,


In [10]:
subTestData = testData[['user_id', 'anime_id', 'name', 'rating_user']]
#Limiting the dataframe
subTestData = subTestData[subTestData.user_id <= 6000]

In [11]:
pivTest = subTestData.pivot_table(index=['user_id'], columns=['name'], values='rating_user')
print(pivTest.shape)

(5085, 5118)


In [12]:
def create_Normalized_Matrix(piv):
    piv = piv.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)
    piv.fillna(0, inplace=True)
    piv = piv.T
    piv = piv.loc[:, (piv != 0).any(axis=0)]
    return piv

In [13]:
pivTrainNorm = create_Normalized_Matrix(pivTrain)
print (pivTrainNorm.shape)

(6794, 5132)


In [14]:
pivTestNorm = create_Normalized_Matrix(pivTest)
print (pivTestNorm.shape)

(5118, 4428)


### Create Sparse Matrices

In [15]:
def create_Sparse_Matrix(userAnimeMatrix):
    return sp.sparse.csr_matrix(userAnimeMatrix.values)

In [16]:
pivTrainSparse = create_Sparse_Matrix(pivTrainNorm)

### User-User Cosine Similarity

In [17]:
def user_Cosine_Similarity(userMatrix):
    return cosine_similarity(userMatrix.T)

def user_Similarity_DataFrame(userSim, pivNorm):
    return pd.DataFrame(userSim, index = pivNorm.columns, columns = pivNorm.columns)

In [18]:
userCosineSim = user_Cosine_Similarity(pivTrainSparse)
trainUserSimData = user_Similarity_DataFrame(userCosineSim, pivTrainNorm)
trainUserSimData.head()

user_id,3,5,7,8,10,11,12,14,16,17,...,5986,5989,5990,5991,5992,5993,5994,5997,5999,6000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1.0,0.055701,0.125425,0.036178,0.125929,0.078598,0.004161,0.011674,0.0,0.178855,...,-0.006609,0.032449,0.072245,0.041946,0.018856,-0.027929,0.047985,0.057,0.0,0.022961
5,0.055701,1.0,0.070479,0.011412,0.029557,0.040393,0.052373,0.089523,-0.002409,0.09085,...,0.006038,-0.040987,0.016813,0.026507,0.035207,0.006757,0.063902,0.023104,-0.006876,0.020706
7,0.125425,0.070479,1.0,-0.012529,0.013704,-0.018999,0.066674,0.051968,-0.026259,0.117599,...,0.015143,-0.064445,-0.014332,-0.00478,0.007015,-0.027435,0.024485,0.036133,0.0,0.018543
8,0.036178,0.011412,-0.012529,1.0,-0.06455,0.007982,0.005315,-0.045939,0.0,0.00289,...,0.0,0.0,0.0,0.0,0.0,-0.049387,0.00744,-0.015816,0.0,0.0
10,0.125929,0.029557,0.013704,-0.06455,1.0,0.041218,-0.013723,0.051495,0.0,-0.017055,...,0.0,0.0,0.282044,0.0,0.0,0.0,0.083346,0.0,0.0,0.0


### User-User Cosine Implementation

In [19]:
def users_Avg_Rating_Matrix(userMatrix):
    avgUsersRating = {}
    for userID in userMatrix:
        ratingsSum = 0.0
        counter = 0
        for rating in userMatrix[userID]:
            if not np.isnan(rating):
                ratingsSum += rating
                counter += 1
            else:
                continue
        avgUsersRating[userID] = ratingsSum / counter
    return avgUsersRating

def user_Avg_Rating(userID):
    userRatings = pivTrain.loc[userID, :]
    avgUserRatingList = 0.0
    counter = 0
    for rating in userRatings:
        if not np.isnan(rating):
            avgUserRatingList += rating
            counter += 1
        else:
            continue
    return avgUserRatingList/counter

In [20]:
avgUserRatings = users_Avg_Rating_Matrix(pivTrain.T)
avgTestUserRatings = users_Avg_Rating_Matrix(pivTest.T)
# print avgUserRatings
# avgUserRating = user_Avg_Rating(3)
# print avgUserRating

In [21]:
def rating_User_Cosine_Prediction(userID, animeName):
    if userID in avgUserRatings:
        userAvg = avgUserRatings[userID]
    else:
        userAvg = avgTestUserRatings[userID]

    if userID in pivTrainNorm.columns:
        allUserSimilarity = trainUserSimData.sort_values(by=userID, ascending=False).loc[:,userID]
        ratingWeight = 0.0
        similarityWeight = 0.0
        topNUsers = 0
        
        for userX in allUserSimilarity.index:
            if userID == userX:
                continue
            
            if animeName in pivTrain.columns:
                userXRating = pivTrain.loc[userX, animeName]
            else:
                userXRating = 0.0
            
            if userXRating < 0.01:
                continue
    
            if topNUsers < 50:
                userXAvg = avgUserRatings[userX]
                userXSimilarity = allUserSimilarity.loc[userX]
                if np.isnan(userXRating):
                    continue
                elif not np.isnan(userXRating):
                    ratingWeight += (userXRating - userXAvg)*userXSimilarity
                    similarityWeight += abs(userXSimilarity)
                    topNUsers += 1

    else:
        return userAvg
    
    if similarityWeight > 0:
        return userAvg + (ratingWeight / similarityWeight)
    else:
        return userAvg

In [22]:
userCosinePrediction = rating_User_Cosine_Prediction(3, "Zombie-Loan")
print userCosinePrediction

6.246602456190452


### Evaluation of User-User Cosine Similarity using MAE and RMSE

In [23]:
# Mean Absolute Error
def calc_Mean_Absolute_Error():
    meanAbsErrorList = 0.0
    count = 0
    for userID in pivTest.index:
        if userID in pivTestNorm.columns:
            userInfoItem = pivTest.loc[userID, :]
            for anime in userInfoItem.index:
                trueRating = userInfoItem[anime]
                if np.isnan(trueRating):
                    continue
                predictedRating = rating_User_Cosine_Prediction(userID, anime)
                meanAbsErrorList += abs(predictedRating - trueRating)
                count += 1
    return meanAbsErrorList/count


# Root Mean Squared Error
import math
def calc_Root_Mean_Square_Error():
    rootMSEList = 0.0
    count = 0
    for userID in pivTest.index:
        if userID in pivTestNorm.columns:
            userInfoItem = pivTest.loc[userID, :]
            for anime in userInfoItem.index:
                trueRating = userInfoItem[anime]
                if np.isnan(trueRating):
                    continue
                predictedRating = rating_User_Cosine_Prediction(userID, anime)
                rootMSEList += (predictedRating - trueRating)**2
                count += 1
    return math.sqrt(rootMSEList / count)

In [24]:
meanAbsError = calc_Mean_Absolute_Error()
print meanAbsError

0.8975660899823676


In [25]:
rootMeanSquareError = calc_Root_Mean_Square_Error()
print rootMeanSquareError

1.18201986274


### User-User Pearson Similarity

In [26]:
def pearson_Covariance(userMatrix):
    return np.cov(userMatrix.T)

def pearson_Similarity(userCovariance):
    return np.corrcoef(userCovariance)

In [27]:
userPearsonCovariance = pearson_Covariance(pivTrainNorm.values)

In [28]:
userPearsonSim = pearson_Similarity(userPearsonCovariance)

In [29]:
trainUserPSimData = user_Similarity_DataFrame(userPearsonSim, pivTrainNorm)
trainUserPSimData.head()

user_id,3,5,7,8,10,11,12,14,16,17,...,5986,5989,5990,5991,5992,5993,5994,5997,5999,6000
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,1.0,0.329532,0.419087,0.137756,0.239213,0.34971,0.200837,0.295468,0.097943,0.55744,...,0.08499,0.021806,0.213521,0.168114,0.020697,0.019535,0.295071,0.344123,-0.003431,0.090273
5,0.329532,1.0,0.373202,-0.021888,0.22924,0.194877,0.255306,0.439671,0.066199,0.501576,...,0.174141,0.002777,0.03895,0.134144,0.064642,0.009644,0.38468,0.287975,-0.04412,0.166375
7,0.419087,0.373202,1.0,0.022168,0.119118,0.060181,0.334814,0.424087,0.086328,0.600363,...,0.156454,-0.01614,-0.016303,0.032253,-0.003132,-0.002961,0.302467,0.359471,0.027521,0.154171
8,0.137756,-0.021888,0.022168,1.0,-0.18508,0.003646,0.011616,-0.085795,0.006462,-0.014202,...,-0.02192,-0.032677,0.016222,0.010009,-0.010967,-0.013811,-0.020098,-0.005956,0.006436,0.001042
10,0.239213,0.22924,0.119118,-0.18508,1.0,0.149689,0.147549,0.243151,0.055663,0.232109,...,0.137408,0.04274,0.441788,0.05146,-0.003544,0.022331,0.348561,0.170607,-0.016593,0.08128


### User-User Pearson Implementation

In [30]:
def users_Avg_Rating_Matrix(userMatrix):
    avgUsersRating = {}
    for userID in userMatrix:
        ratingsSum = 0.0
        counter = 0
        for rating in userMatrix[userID]:
            if not np.isnan(rating):
                ratingsSum += rating
                counter += 1
            else:
                continue
        avgUsersRating[userID] = ratingsSum / counter
    return avgUsersRating

def user_Avg_Rating(userID):
    userRatings = pivTrain.loc[userID, :]
    avgUserRatingList = 0.0
    counter = 0
    for rating in userRatings:
        if not np.isnan(rating):
            avgUserRatingList += rating
            counter += 1
        else:
            continue
    return avgUserRatingList/counter

In [31]:
avgUserRatings = users_Avg_Rating_Matrix(pivTrain.T)
avgTestUserRatings = users_Avg_Rating_Matrix(pivTest.T)
# print avgUserRatings
# avgUserRating = user_Avg_Rating(3)
# print avgUserRating

In [32]:
def rating_User_Pearson_Prediction(userID, animeName):
    if userID in avgUserRatings:
        userAvg = avgUserRatings[userID]
    else:
        userAvg = avgTestUserRatings[userID]

    if userID in pivTrainNorm.columns:
        allUserSimilarity = trainUserPSimData.sort_values(by=userID, ascending=False).loc[:,userID]
        ratingWeight = 0.0
        similarityWeight = 0.0
        topNUsers = 0
        
        for userX in allUserSimilarity.index:
            if userID == userX:
                continue
            
            if animeName in pivTrain.columns:
                userXRating = pivTrain.loc[userX, animeName]
            else:
                userXRating = 0.0
            
            if userXRating < 0.01:
                continue
    
            if topNUsers < 50:
                userXAvg = avgUserRatings[userX]
                userXSimilarity = allUserSimilarity.loc[userX]
                if np.isnan(userXRating):
                    continue
                elif not np.isnan(userXRating):
                    ratingWeight += (userXRating - userXAvg)*userXSimilarity
                    similarityWeight += abs(userXSimilarity)
                    topNUsers += 1

    else:
        return userAvg
    
    if similarityWeight > 0:
        return userAvg + (ratingWeight / similarityWeight)
    else:
        return userAvg

In [33]:
userPearsonPrediction = rating_User_Pearson_Prediction(3, "Zombie-Loan")
print userPearsonPrediction

6.258062208433782


### Evaluation of User-User Pearson Similarity using MAE and RMSE

In [34]:
# Mean Absolute Error and Root Mean Square Error
import math
def calc_Pearson_Error():
    meanAbsErrorList = 0.0
    rootMSEList = 0.0
    count = 0
    for userID in pivTest.index:
        if userID in pivTestNorm.columns:
            userInfoItem = pivTest.loc[userID, :]
            for anime in userInfoItem.index:
                trueRating = userInfoItem[anime]
                if np.isnan(trueRating):
                    continue
                predictedRating = rating_User_Pearson_Prediction(userID, anime)
                meanAbsErrorList += abs(predictedRating - trueRating)
                rootMSEList += (predictedRating - trueRating)**2
                count += 1
    return meanAbsErrorList/count, math.sqrt(rootMSEList/count)

In [35]:
pearsonMAE, pearsonRMSE = calc_Pearson_Error()
print "MAE: {}".format(pearsonMAE)
print "RMSE: {}".format(pearsonRMSE)

MAE: 0.90859305776
RMSE: 1.19552284849
