In [2]:
# Import necessary packages
import pandas as pd
import numpy as np
import random
from math import sqrt

In [3]:
# load the ratings dataset from the CSV file into ratings_matrix,
# using the 'User' column as the row index
ratings_matrix = pd.read_csv('dataset-ratingfix.csv', index_col='User')

In [4]:
# Drop every item column in which at least one user has an empty (0) rating;
# the remaining columns feed the mean-centring and similarity steps below.
has_empty_rating = ratings_matrix.eq(0).any(axis=0)
ratings_matrix_clean = ratings_matrix.loc[:, ~has_empty_rating]
ratings_matrix_clean

Unnamed: 0_level_0,Item2,Item3,Item4,Item5,Item6,Item7,Item8,Item10,Item12,Item13,Item14,Item15,Item16,Item18,Item19
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ActiveUser,4,1,1,4,5,5,4,2,2,3,1,1,1,2,3
User1,5,3,5,4,2,2,2,1,2,5,5,4,2,3,1
User2,5,4,1,5,1,3,4,1,5,5,5,1,1,3,1
User3,4,2,4,1,1,1,4,3,4,1,5,5,5,2,5
User4,5,1,1,4,3,4,5,1,5,2,5,4,1,1,2
User5,4,3,1,4,5,5,4,2,2,3,1,1,1,2,3
User6,1,4,5,4,3,5,3,4,4,1,5,5,4,5,3
User7,5,2,1,4,2,1,4,4,4,5,1,1,1,1,1
User8,2,3,3,4,5,4,5,4,2,4,4,3,3,5,3
User9,2,4,4,4,1,1,4,2,2,4,2,3,1,2,2


In [5]:
# Per-user mean rating, taken over the item columns kept in
# ratings_matrix_clean (columns containing an empty rating are excluded).
user_mean = ratings_matrix_clean.mean(axis='columns')
print(user_mean)

# Mean-centre the ratings: subtract each user's own mean from that user's row.
difference_mean_matrix = ratings_matrix_clean.sub(user_mean, axis='index')
print(difference_mean_matrix)

User
ActiveUser    2.600000
User1         3.066667
User2         3.000000
User3         3.133333
User4         2.933333
User5         2.733333
User6         3.733333
User7         2.466667
User8         3.600000
User9         2.533333
User10        3.000000
User11        2.733333
User12        3.466667
User13        3.066667
User14        3.133333
User15        3.800000
User16        3.066667
User17        3.000000
User18        2.800000
User19        3.133333
dtype: float64
               Item2     Item3     Item4     Item5     Item6     Item7  \
User                                                                     
ActiveUser  1.400000 -1.600000 -1.600000  1.400000  2.400000  2.400000   
User1       1.933333 -0.066667  1.933333  0.933333 -1.066667 -1.066667   
User2       2.000000  1.000000 -2.000000  2.000000 -2.000000  0.000000   
User3       0.866667 -1.133333  0.866667 -2.133333 -2.133333 -2.133333   
User4       2.066667 -1.933333 -1.933333  1.066667  0.066667  1.066667   
Us

In [6]:
# Pearson's correlation between two users
def pearson_correlation(user1_id, user2_id, difference_mean_matrix):
    """Return the Pearson correlation between two users' mean-centred ratings.

    Parameters
    ----------
    user1_id, user2_id : index labels
        Row labels of the two users in ``difference_mean_matrix``.
    difference_mean_matrix : pandas.DataFrame
        One row per user; each value is that user's rating minus the user's
        own mean rating.

    Returns
    -------
    float
        Sum of the element-wise products of the two rows divided by the
        product of their Euclidean norms. Returns 0.0 when either row is all
        zeros (a user whose ratings never deviate from their mean), because
        the correlation is undefined there and would otherwise raise
        ZeroDivisionError.

    Raises
    ------
    IndexError
        If either user id is not present in the index.
    """
    row_labels = difference_mean_matrix.index
    # Select by boolean mask and take the first match, as the original did.
    deviations1 = difference_mean_matrix[row_labels == user1_id].iloc[0].to_numpy(dtype=float)
    deviations2 = difference_mean_matrix[row_labels == user2_id].iloc[0].to_numpy(dtype=float)

    numerator = float(np.dot(deviations1, deviations2))
    denominator = sqrt(np.dot(deviations1, deviations1)) * sqrt(np.dot(deviations2, deviations2))
    if denominator == 0:
        # At least one user has zero variance: no linear relationship to measure.
        return 0.0
    return numerator / denominator


In [7]:
# Row labels of the dataframe: 'ActiveUser' first, then 'User1' .. 'User19'.
listUser = ['ActiveUser']
listUser.extend('User%s' % number for number in range(1, 20))

# Column labels of the dataframe: 'Item1' .. 'Item20'.
listItem = []
for number in range(1, 21):
    listItem.append('Item%s' % number)

In [19]:
# Build one [user_x, user_y, similarity] row for every ordered pair of users
# (self-pairs included), scoring each pair with pearson_correlation on the
# mean-centred ratings. Iterating over listUser itself, rather than over a
# hard-coded count, keeps this in sync with the list of users.
datasimilarity = [
    [user_x, user_y, pearson_correlation(user_x, user_y, difference_mean_matrix)]
    for user_x in listUser
    for user_y in listUser
]

# Collect the rows into a long-format dataframe and pivot it into a
# user-by-user similarity matrix.
similarities = pd.DataFrame(datasimilarity, columns=['UserX', 'UserY', 'Similarity'])
similarities_matrix = similarities.pivot_table(index='UserX', columns='UserY', values='Similarity')

In [9]:
# display the pivoted user-by-user similarity matrix
similarities_matrix

UserY,ActiveUser,User1,User10,User11,User12,User13,User14,User15,User16,User17,User18,User19,User2,User3,User4,User5,User6,User7,User8,User9
UserX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
ActiveUser,1.0,-0.211099,0.283024,0.011404,0.589139,-0.065413,0.843148,0.074648,-0.317489,-0.174437,0.087507,0.113085,0.183601,-0.556026,0.405948,0.939238,-0.448887,0.370684,0.366607,-0.19222
User1,-0.211099,1.0,-0.413195,0.065315,-0.158315,0.699706,-0.278018,-0.578438,0.031198,-0.141046,0.069492,-0.094245,0.450668,-0.003903,0.198592,-0.22507,-0.134429,0.132691,-0.175517,0.464113
User10,0.283024,-0.413195,1.0,0.283312,0.348479,-0.12188,-0.044567,0.370902,-0.032864,0.104006,-0.434791,0.177435,-0.156386,-0.518006,-0.248673,0.29578,0.106208,-0.172669,0.335547,-0.278563
User11,0.011404,0.065315,0.283312,1.0,-0.383329,-0.39533,-0.261282,0.306369,0.484662,0.314306,-0.135147,0.041109,-0.04726,-0.142626,0.01837,0.027808,0.15834,-0.133929,-0.373739,0.221277
User12,0.589139,-0.158315,0.348479,-0.383329,1.0,0.069466,0.549995,0.110295,-0.383585,-0.425263,-0.090044,-0.26162,-0.145326,-0.447141,0.014379,0.508188,-0.242135,0.248174,0.684215,-0.032543
User13,-0.065413,0.699706,-0.12188,-0.39533,0.069466,1.0,-0.123605,-0.685784,-0.301927,-0.225356,-0.11305,0.187103,0.508275,-0.079815,0.146079,-0.074058,-0.401954,0.207019,0.024927,0.024143
User14,0.843148,-0.278018,-0.044567,-0.261282,0.549995,-0.123605,1.0,0.082283,-0.428075,-0.247214,0.372046,0.073103,0.074343,-0.421363,0.281089,0.868648,-0.477965,0.372113,0.396505,-0.060537
User15,0.074648,-0.578438,0.370902,0.306369,0.110295,-0.685784,0.082283,1.0,-0.164693,0.0,0.057339,-0.140397,-0.515589,-0.326383,-0.480979,0.095349,0.33615,-0.174577,0.050572,-0.125951
User16,-0.317489,0.031198,-0.032864,0.484662,-0.383585,-0.301927,-0.428075,-0.164693,1.0,0.656276,-0.385399,-0.004147,-0.246697,0.449937,0.176284,-0.336406,0.233317,-0.165448,-0.483952,-0.02232
User17,-0.174437,-0.141046,0.104006,0.314306,-0.425263,-0.225356,-0.247214,0.0,0.656276,1.0,0.068908,0.557732,-0.289157,0.159632,0.030653,-0.182299,0.392756,-0.415044,-0.212718,-0.48563


In [10]:
# Keep only the similarity rows whose first user is ActiveUser, since only
# ActiveUser's empty ratings are going to be predicted. Selecting by name
# (rather than slicing the first 20 rows) does not depend on ActiveUser being
# first in datasimilarity or on the number of users.
ActiveUserSimilarityList = [row for row in datasimilarity if row[0] == 'ActiveUser']
ActiveUserSimilarityList

[['ActiveUser', 'ActiveUser', 1.0000000000000002],
 ['ActiveUser', 'User1', -0.2110994167952127],
 ['ActiveUser', 'User2', 0.18360122852864083],
 ['ActiveUser', 'User3', -0.5560260019025979],
 ['ActiveUser', 'User4', 0.4059483293442529],
 ['ActiveUser', 'User5', 0.9392378019452414],
 ['ActiveUser', 'User6', -0.44888679746117377],
 ['ActiveUser', 'User7', 0.37068400126839857],
 ['ActiveUser', 'User8', 0.3666068680225597],
 ['ActiveUser', 'User9', -0.1922195118758996],
 ['ActiveUser', 'User10', 0.283024465552478],
 ['ActiveUser', 'User11', 0.011403959229610947],
 ['ActiveUser', 'User12', 0.589138924528476],
 ['ActiveUser', 'User13', -0.06541282429659163],
 ['ActiveUser', 'User14', 0.8431482365061614],
 ['ActiveUser', 'User15', 0.07464832243143299],
 ['ActiveUser', 'User16', -0.31748883446036325],
 ['ActiveUser', 'User17', -0.17443747397611128],
 ['ActiveUser', 'User18', 0.08750683673869077],
 ['ActiveUser', 'User19', 0.11308455411825565]]

In [11]:
# Pick the nearest neighbours: every other user whose similarity with
# ActiveUser is strictly above the threshold.
SIMILARITY_THRESHOLD = 0.5
nearest = [
    row for row in ActiveUserSimilarityList
    if row[2] > SIMILARITY_THRESHOLD and row[1] != row[0]  # skip ActiveUser paired with themselves
]

# If no neighbour meets the threshold, fall back to the (at most) two most
# similar other users. Only positive similarities are considered, as in the
# original search that started from 0. Sorting avoids the earlier bug where
# the previous best was never demoted to second place, which could leave a
# [0, 0, 0] placeholder in the result.
if not nearest:
    candidates = [
        row for row in ActiveUserSimilarityList
        if row[2] > 0 and row[1] != row[0]
    ]
    nearest = sorted(candidates, key=lambda row: row[2], reverse=True)[:2]

nearest

[['ActiveUser', 'User5', 0.9392378019452414],
 ['ActiveUser', 'User12', 0.589138924528476],
 ['ActiveUser', 'User14', 0.8431482365061614]]

In [12]:
# Mean rating of every user across all item columns of the raw ratings matrix.
# NOTE(review): empty ratings are stored as 0 and are included in this mean,
# which lowers it for users with unrated items - confirm this is intended.
user_mean_all_item = ratings_matrix.mean(axis='columns')
user_mean_all_item

User
ActiveUser    1.95
User1         2.75
User2         3.00
User3         2.95
User4         2.95
User5         2.85
User6         3.75
User7         2.60
User8         3.50
User9         2.65
User10        2.95
User11        2.70
User12        3.55
User13        3.05
User14        3.15
User15        3.60
User16        3.05
User17        2.90
User18        2.90
User19        3.20
dtype: float64

In [13]:
# predict rating
def predict_rating(user_mean_all_item, ratings_matrix, nearest, item):
    """Predict the active user's rating for ``item`` from their nearest neighbours.

    The prediction is the active user's mean rating plus the
    similarity-weighted average of each neighbour's deviation from that
    neighbour's own mean on ``item``.

    Parameters
    ----------
    user_mean_all_item : pandas.Series
        Mean rating per user, indexed by user id; used for the neighbours.
    ratings_matrix : pandas.DataFrame
        Raw ratings, one row per user and one column per item. A rating of 0
        marks an item the user has not rated.
    nearest : list
        Rows of ``[active_user_id, neighbour_id, similarity]``. The active
        user is read from the first row.
    item : column label
        The item whose rating is predicted.

    Returns
    -------
    float
        The predicted rating. If the similarities sum to 0 the active user's
        mean is returned unchanged, since the neighbours carry no weight.

    Raises
    ------
    IndexError
        If ``nearest`` is empty or a user id is missing from ``ratings_matrix``.

    Notes
    -----
    The active user's mean is computed here from their rated (non-zero)
    entries in ``ratings_matrix`` rather than read from a notebook-level
    global, so the function depends only on its arguments.
    """
    row_labels = ratings_matrix.index
    active_user = nearest[0][0]
    active_ratings = ratings_matrix[row_labels == active_user].iloc[0]
    # Zeros are empty ratings, so they are left out of the active user's mean.
    active_mean = active_ratings[active_ratings != 0].mean()

    numerator = 0
    denominator = 0
    for row in nearest:
        neighbour_id, similarity = row[1], row[2]
        # .iloc[0] takes the single matching row by position explicitly;
        # plain [0] on a label-indexed Series is a deprecated fallback.
        neighbour_rating = ratings_matrix[row_labels == neighbour_id][item].iloc[0]
        numerator += similarity * (neighbour_rating - user_mean_all_item[neighbour_id])
        denominator += similarity

    if denominator == 0:
        return active_mean
    return active_mean + (numerator / denominator)

In [14]:
# One-row dataframe holding every rating of ActiveUser, empty (0) ones included.
is_active_user = ratings_matrix.index == 'ActiveUser'
rating_ActiveUser = ratings_matrix.loc[is_active_user]
rating_ActiveUser

Unnamed: 0_level_0,Item1,Item2,Item3,Item4,Item5,Item6,Item7,Item8,Item9,Item10,Item11,Item12,Item13,Item14,Item15,Item16,Item17,Item18,Item19,Item20
User,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
ActiveUser,0,4,1,1,4,5,5,4,0,2,0,2,3,1,1,1,0,2,3,0


In [20]:
# Collect every item ActiveUser has not rated yet (stored as 0); these are the
# items whose ratings get predicted later.
# .iloc[0] reads the single ActiveUser row by position explicitly; plain [0] on
# a label-indexed Series relies on a deprecated positional fallback.
noRating_ActiveUser = [
    item for item in listItem
    if rating_ActiveUser[item].iloc[0] == 0
]
noRating_ActiveUser

['Item1', 'Item9', 'Item11', 'Item17', 'Item20']

In [16]:
# Predict a rating for each item ActiveUser has not rated, keyed by item name.
prediction = {
    item: predict_rating(user_mean_all_item, ratings_matrix, nearest, item)
    for item in noRating_ActiveUser
}
prediction

{'Item1': 2.214711555925965,
 'Item9': 4.221023639964569,
 'Item11': 1.4694456189549177,
 'Item17': 2.9662895769356163,
 'Item20': 3.2210236399645686}

In [17]:
# Item names ordered from the highest predicted rating to the lowest.
sorted_prediction = sorted(
    prediction,
    key=lambda item_name: prediction[item_name],
    reverse=True,
)
sorted_prediction

['Item9', 'Item20', 'Item17', 'Item1', 'Item11']

In [18]:
# Recommend to Active User every item whose predicted rating is at least the
# threshold, printed from the highest predicted rating down.
threshold = 2
recommended_items = [name for name in sorted_prediction if prediction[name] >= threshold]
for name in recommended_items:
    print('{item} direkomendasikan dengan prediksi rating {rating}'.format(item=name, rating=prediction[name]))

Item9 direkomendasikan dengan prediksi rating 4.221023639964569
Item20 direkomendasikan dengan prediksi rating 3.2210236399645686
Item17 direkomendasikan dengan prediksi rating 2.9662895769356163
Item1 direkomendasikan dengan prediksi rating 2.214711555925965
