In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import scipy.sparse
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [2]:
anime = pd.read_csv('../anime-recommendations-database/anime.csv')
animeRating = pd.read_csv('../anime-recommendations-database/rating.csv')

### Cleaning Data for Missing Values

In [3]:
#Checking which values are null
print anime.isnull().sum()
print animeRating.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64
user_id     0
anime_id    0
rating      0
dtype: int64


In [4]:
anime['genre'] = anime['genre'].fillna('None')
anime['type'] = anime['type'].fillna('None')
anime['rating'] = anime['rating'].fillna('None')
anime.isnull().sum()

anime_id    0
name        0
genre       0
type        0
episodes    0
rating      0
members     0
dtype: int64

In [5]:
animeRating = animeRating[animeRating.rating > 0]
animeRating.rating.unique()

array([10,  8,  6,  9,  7,  3,  5,  4,  1,  2], dtype=int64)

In [6]:
fullMergedAnime = animeRating.merge(anime, left_on = 'anime_id', right_on = 'anime_id', suffixes= ['_user', ''])
fullMergedAnime.head()

Unnamed: 0,user_id,anime_id,rating_user,name,genre,type,episodes,rating,members
0,1,8074,10,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
1,3,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
2,5,8074,2,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
3,12,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892
4,14,8074,6,Highschool of the Dead,"Action, Ecchi, Horror, Supernatural",TV,12,7.46,535892


### Splitting and Normalizing Data

In [7]:
trainData, testData = train_test_split(fullMergedAnime, test_size=0.2)

In [8]:
subTrainData = trainData[['user_id', 'anime_id', 'name', 'rating_user']]
#Limiting the dataframe
subTrainData = subTrainData[subTrainData.user_id <= 6000]

In [9]:
pivTrain = subTrainData.pivot_table(index=['user_id'], columns=['name'], values='rating_user')
print(pivTrain.shape)
pivTrain.head()

(5598, 6796)


name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,gdgd Fairies 2,iDOLM@STER Xenoglossia,iDOLM@STER Xenoglossia Specials,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,2.0,,,,,
7,,,,,,,,,,,...,,,,,,,,,,


In [10]:
subTestData = testData[['user_id', 'anime_id', 'name', 'rating_user']]
#Limiting the dataframe
subTestData = subTestData[subTestData.user_id <= 6000]

In [11]:
pivTest = subTestData.pivot_table(index=['user_id'], columns=['name'], values='rating_user')
print(pivTest.shape)
pivTest.head()

(5086, 5150)


name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,ef: A Tale of Memories. - Recollections,gdgd Fairies 2,gdgd Fairies Movie: tte Iu Eiga wa Dou kana...?,iDOLM@STER Xenoglossia,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,,,,,,,,,...,,,,,,,,,,


In [12]:
def create_Normalized_Matrix(piv):
    piv = piv.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)
    piv.fillna(0, inplace=True)
    piv = piv.T
    piv = piv.loc[:, (piv != 0).any(axis=0)]
    return piv

In [13]:
pivTrainNorm = create_Normalized_Matrix(pivTrain)
print (pivTrainNorm.shape)

(6796, 5129)


In [14]:
pivTestNorm = create_Normalized_Matrix(pivTest)
print (pivTestNorm.shape)

(5150, 4365)


### Create Sparse Matrices

In [15]:
def create_Sparse_Matrix(userAnimeMatrix):
    return sp.sparse.csr_matrix(userAnimeMatrix.values)

In [16]:
pivTrainSparse = create_Sparse_Matrix(pivTrainNorm)

### Item-Item Cosine Similarity

In [17]:
def item_Cosine_Similarity(itemMatrix):
    return cosine_similarity(itemMatrix)

def item_Similarity_DataFrame(itemSim, pivNorm):
    return pd.DataFrame(itemSim, index = pivNorm.index, columns = pivNorm.index)

In [70]:
itemCosineSim = item_Cosine_Similarity(pivTrainSparse)
trainItemSimData = item_Similarity_DataFrame(itemCosineSim, pivTrainNorm)
trainItemSimData.head()
# df_wc.loc[df_wc['nation']=='Germany', :]
# trainItemSimData.sort_values(by="1000-nen Joou: Queen Millennia", ascending=False)
# print pivTrainNorm.index
if "Zombie-Loan" in pivTrainNorm.index:
    print trainItemSimData.loc[:, "Zombie-Loan"]
    

name
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi      -0.016634
&quot;Bungaku Shoujo&quot; Memoire                        0.018036
&quot;Bungaku Shoujo&quot; Movie                          0.002700
.hack//G.U. Returner                                     -0.001762
.hack//G.U. Trilogy                                      -0.022605
.hack//G.U. Trilogy: Parody Mode                         -0.080269
.hack//Gift                                               0.002190
.hack//Intermezzo                                         0.014187
.hack//Liminality                                         0.000438
.hack//Quantum                                            0.016871
.hack//Quantum: Sore ike! Bokura no Chimuchimu-chan!!     0.018936
.hack//Roots                                              0.033806
.hack//Sign                                               0.020084
.hack//Tasogare no Udewa Densetsu                         0.049190
.hack//Tasogare no Udewa Densetsu: Offline de Aimashou   

### Item-Item Cosine Implementation

In [19]:
def items_Avg_Rating_Matrix(itemMatrix):
    avgItemsRating = {}
    for itemID in itemMatrix:
        ratingsSum = 0.0
        counter = 0
        for rating in itemMatrix[itemID]:
            if not np.isnan(rating):
                ratingsSum += rating
                counter += 1
            else:
                continue
        avgItemsRating[itemID] = ratingsSum / counter
    return avgItemsRating

def item_Avg_Rating(itemID):
    itemRatings = pivTrain.loc[itemID, :]
    avgItemRatingList = 0.0
    counter = 0
    for rating in itemRatings:
        if not np.isnan(rating):
            avgItemRatingList += rating
            counter += 1
        else:
            continue
    return avgItemRatingList/counter

In [20]:
avgItemRatings = items_Avg_Rating_Matrix(pivTrain)
avgTestItemRatings = items_Avg_Rating_Matrix(pivTest)
# print avgItemRatings
# avgItemRating = item_Avg_Rating(3)
# print avgItemRating

In [75]:
def rating_Item_Cosine_Prediction(userID, animeName):
    if animeName in avgItemRatings:
        itemAvg = avgItemRatings[animeName]
    else:
        itemAvg = avgTestItemRatings[animeName]
        
    if animeName in pivTrainNorm.index and userID in pivTrainNorm.columns:
        allItemSimilarity = trainItemSimData.sort_values(by=animeName, ascending=False).loc[:,animeName]
        ratingWeight = 0.0
        similarityWeight = 0.0
        topNUsers = 0
        
        for itemX in allItemSimilarity.index:
            if animeName == itemX:
                continue
            
            itemXRating = pivTrain.loc[userID, itemX]
            if itemXRating < 0.01:
                continue
    
            if topNUsers < 50:
                itemXAvg = avgItemRatings[itemX]
                itemXSimilarity = allItemSimilarity.loc[itemX]
                if np.isnan(itemXRating):
                    continue
                elif not np.isnan(itemXRating):
                    ratingWeight += (itemXRating - itemXAvg)*itemXSimilarity
                    similarityWeight += abs(itemXSimilarity)
                    topNUsers += 1
    else:
        return itemAvg
    
    if similarityWeight > 0:
        return itemAvg + (ratingWeight / similarityWeight)
    else:
        return itemAvg

In [76]:
itemCosinePrediction = rating_Item_Cosine_Prediction(3, "Zombie-Loan")
print itemCosinePrediction

6.748133946140575


### Evaluation of Item-Item Cosine Similarity using MAE and RMSE

In [77]:
# Mean Absolute Error and Root Mean Squared Error
import math
def calc_Cosine_Error():
    meanAbsErrorList = 0.0
    rootMSEList = 0.0
    count = 0
    for anime in pivTest.columns:
        if anime in pivTestNorm.index:
            itemInfo = pivTest.loc[:, anime]
            for userID in itemInfo.index:
                trueRating = itemInfo[userID]
                if np.isnan(trueRating):
                    continue
                predictedRating = rating_Item_Cosine_Prediction(userID, anime)
                meanAbsErrorList += abs(predictedRating - trueRating)
                rootMSEList += (predictedRating - trueRating)**2
                count += 1
    return meanAbsErrorList/count, math.sqrt(rootMSEList/count)

In [78]:
cosineMAE, cosineRMSE = calc_Cosine_Error()
print "MAE: {}".format(cosineMAE)
print "RMSE: {}".format(cosineRMSE)

MAE: 0.869646313672
RMSE: 1.1549518603


### Item-Item Pearson Similarity

In [79]:
def pearson_Covariance(itemMatrix):
    return np.cov(itemMatrix)

def pearson_Similarity(itemCovariance):
    return np.corrcoef(itemCovariance)

In [80]:
itemPearsonCovariance = pearson_Covariance(pivTrainNorm.values)

In [81]:
itemPearsonSim = pearson_Similarity(itemPearsonCovariance)

  c /= stddev[:, None]
  c /= stddev[None, :]


In [82]:
trainItemPSimData = item_Similarity_DataFrame(itemPearsonSim, pivTrainNorm)
trainItemPSimData.head()

name,&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,&quot;Bungaku Shoujo&quot; Memoire,&quot;Bungaku Shoujo&quot; Movie,.hack//G.U. Returner,.hack//G.U. Trilogy,.hack//G.U. Trilogy: Parody Mode,.hack//Gift,.hack//Intermezzo,.hack//Liminality,.hack//Quantum,...,gdgd Fairies 2,iDOLM@STER Xenoglossia,iDOLM@STER Xenoglossia Specials,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei,xxxHOLiC Movie: Manatsu no Yoru no Yume,xxxHOLiC Rou,xxxHOLiC Shunmuki,◯
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
&quot;Bungaku Shoujo&quot; Kyou no Oyatsu: Hatsukoi,1.0,0.089215,0.151401,-0.072505,-0.015024,-0.144006,-0.053309,-0.016488,-0.107072,0.080694,...,0.004298,-0.006761,-0.005966,-0.004126,0.133373,0.074438,0.279446,0.225721,0.120732,0.072148
&quot;Bungaku Shoujo&quot; Memoire,0.089215,1.0,0.31934,-0.100505,-0.00421,-0.322874,-0.061399,0.03743,-0.079327,0.134922,...,-0.001993,0.021003,-0.032716,-0.004766,-0.150453,-0.023785,-0.233713,-0.245689,-0.238208,0.04846
&quot;Bungaku Shoujo&quot; Movie,0.151401,0.31934,1.0,-0.245691,-0.148294,-0.317159,-0.204142,-0.159989,-0.032996,0.094942,...,0.01982,-0.076308,-0.069012,-0.00233,-0.030397,0.066061,0.066502,0.066063,0.09796,-0.090761
.hack//G.U. Returner,-0.072505,-0.100505,-0.245691,1.0,0.59288,0.411574,0.645326,0.627545,0.551149,-0.028338,...,-0.017738,0.090986,0.086478,0.056339,-0.215863,-0.290169,-0.338123,-0.379819,-0.271721,0.129778
.hack//G.U. Trilogy,-0.015024,-0.00421,-0.148294,0.59288,1.0,0.362717,0.447771,0.47558,0.349264,0.040664,...,-0.031944,0.067386,0.077217,0.039629,-0.227933,-0.297508,-0.238288,-0.300114,-0.3301,0.076963


### Item-Item Pearson Implementation

In [83]:
def items_Avg_Rating_Matrix(itemMatrix):
    avgItemsRating = {}
    for itemID in itemMatrix:
        ratingsSum = 0.0
        counter = 0
        for rating in itemMatrix[itemID]:
            if not np.isnan(rating):
                ratingsSum += rating
                counter += 1
            else:
                continue
        avgItemsRating[itemID] = ratingsSum / counter
    return avgItemsRating

def item_Avg_Rating(itemID):
    itemRatings = pivTrain.loc[itemID, :]
    avgItemRatingList = 0.0
    counter = 0
    for rating in itemRatings:
        if not np.isnan(rating):
            avgItemRatingList += rating
            counter += 1
        else:
            continue
    return avgItemRatingList/counter

In [84]:
avgItemRatings = items_Avg_Rating_Matrix(pivTrain)
avgTestItemRatings = items_Avg_Rating_Matrix(pivTest)
# print avgItemRatings
# avgItemRating = item_Avg_Rating(3)
# print avgItemRating

In [85]:
def rating_Item_Pearson_Prediction(userID, animeName):
    if animeName in avgItemRatings:
        itemAvg = avgItemRatings[animeName]
    else:
        itemAvg = avgTestItemRatings[animeName]
        
    if animeName in pivTrainNorm.index and userID in pivTrainNorm.columns:
        allItemSimilarity = trainItemPSimData.sort_values(by=animeName, ascending=False).loc[:,animeName]
        ratingWeight = 0.0
        similarityWeight = 0.0
        topNUsers = 0
        
        for itemX in allItemSimilarity.index:
            if animeName == itemX:
                continue
            
            itemXRating = pivTrain.loc[userID, itemX]
            if itemXRating < 0.01:
                continue
    
            if topNUsers < 50:
                itemXAvg = avgItemRatings[itemX]
                itemXSimilarity = allItemSimilarity.loc[itemX]
                if np.isnan(itemXRating):
                    continue
                elif not np.isnan(itemXRating):
                    ratingWeight += (itemXRating - itemXAvg)*itemXSimilarity
                    similarityWeight += abs(itemXSimilarity)
                    topNUsers += 1
    else:
        return itemAvg
    
    if similarityWeight > 0:
        return itemAvg + (ratingWeight / similarityWeight)
    else:
        return itemAvg

In [86]:
itemPearsonPrediction = rating_Item_Pearson_Prediction(3, "Zombie-Loan")
print itemPearsonPrediction

6.5980589845791995


### Evaluation of Item-Item Pearson Similarity using MAE and RMSE

In [87]:
# Mean Absolute Error and Root Mean Squared Error
import math
def calc_Pearson_Error():
    meanAbsErrorList = 0.0
    rootMSEList = 0.0
    count = 0
    for anime in pivTest.columns:
        if anime in pivTestNorm.index:
            itemInfo = pivTest.loc[:, anime]
            for userID in itemInfo.index:
                trueRating = itemInfo[userID]
                if np.isnan(trueRating):
                    continue
                predictedRating = rating_Item_Pearson_Prediction(userID, anime)
                meanAbsErrorList += abs(predictedRating - trueRating)
                rootMSEList += (predictedRating - trueRating)**2
                count += 1
    return meanAbsErrorList/count, math.sqrt(rootMSEList/count)

In [88]:
pearsonMAE, pearsonRMSE = calc_Pearson_Error()
print "MAE: {}".format(pearsonMAE)
print "RMSE: {}".format(pearsonRMSE)

MAE: 0.8654
RMSE: 1.153