In [1]:
import pandas as pd
import csv
from collections import defaultdict
from datetime import datetime
import matplotlib.patches as mpatches
import matplotlib
import time
import math
from operator import itemgetter
from scipy.spatial import distance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

In [2]:
ratings = pd.read_csv('./data/rating.csv')
ratings.drop_duplicates()
del ratings['Unnamed: 0']
ratings.tail(5)

Unnamed: 0,userId,rating,movieId
5111,huya,10.0,73394.0
5112,huya,10.0,70773.0
5113,huya,8.0,16220.0
5114,huya,10.0,36666.0
5115,huya,10.0,37235.0


In [3]:
print(ratings.dtypes)

userId      object
rating     float64
movieId    float64
dtype: object


In [4]:
from pandas import Series, DataFrame

missing_rating = DataFrame({'userId':['patl', 'yoya', 'ebc8', 'imag', 'glab', 'zxcv', 'kktw', 'kktw'],
                           'rating':[10.0, 10.0, 1.0, 10.0, 8.0, 1.0, 7.0, 6.0],
                           'movieId':[161967.0, 161967.0, 174065.0, 161967.0, 137327.0, 174065.0, 47701.0, 62328.0]})
missing_rating

Unnamed: 0,userId,rating,movieId
0,patl,10.0,161967.0
1,yoya,10.0,161967.0
2,ebc8,1.0,174065.0
3,imag,10.0,161967.0
4,glab,8.0,137327.0
5,zxcv,1.0,174065.0
6,kktw,7.0,47701.0
7,kktw,6.0,62328.0


In [5]:
ratings = ratings.append(missing_rating)
ratings.tail(10)

Unnamed: 0,userId,rating,movieId
5114,huya,10.0,36666.0
5115,huya,10.0,37235.0
0,patl,10.0,161967.0
1,yoya,10.0,161967.0
2,ebc8,1.0,174065.0
3,imag,10.0,161967.0
4,glab,8.0,137327.0
5,zxcv,1.0,174065.0
6,kktw,7.0,47701.0
7,kktw,6.0,62328.0


In [6]:
ratings = ratings.drop_duplicates()
ratings

Unnamed: 0,userId,rating,movieId
0,airf,2.0,136900.0
1,airf,10.0,163788.0
2,airf,10.0,174065.0
3,nanw,10.0,154667.0
4,nanw,10.0,136900.0
5,nanw,10.0,145335.0
6,nanw,9.0,120141.0
7,nanw,9.0,54704.0
8,zxcv,7.0,113351.0
9,zxcv,10.0,86507.0


In [7]:
user_data = pd.read_csv('./data/naver_user.csv')
user_data.tail(5)

Unnamed: 0,reviewNo,userId
95,15771929,eyku
96,15771927,geon
97,15771926,myil
98,15771925,hous
99,15771922,huya


In [8]:
merge_rating = pd.merge(ratings, user_data, how='outer')

result = merge_rating
result

Unnamed: 0,userId,rating,movieId,reviewNo
0,airf,2.0,136900.0,15772038
1,airf,10.0,163788.0,15772038
2,airf,10.0,174065.0,15772038
3,nanw,10.0,154667.0,15772037
4,nanw,10.0,136900.0,15772037
5,nanw,10.0,145335.0,15772037
6,nanw,9.0,120141.0,15772037
7,nanw,9.0,54704.0,15772037
8,zxcv,7.0,113351.0,15772036
9,zxcv,10.0,86507.0,15772036


In [9]:
rating_matrix = result.pivot(index='reviewNo', columns='movieId', values='rating')

rating_matrix.head(5)

movieId,10002.0,10003.0,10004.0,10005.0,10006.0,10008.0,10009.0,10012.0,10016.0,10018.0,...,181409.0,181410.0,181411.0,181414.0,181419.0,181711.0,182348.0,182360.0,183132.0,183877.0
reviewNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15771922,,,,,,,,,,,...,,,,,,,,,,
15771925,,,,,,,,,,,...,,8.0,,,,,,,,
15771926,,,,,,,,,,,...,,,,,,,,,,
15771927,,,,,,,,,,,...,,,,,,,,,,
15771929,,,,,,,,,,,...,,,,,,,,,,


In [10]:
merge_rating['count'] = 0
merge_rating = merge_rating.groupby(['userId', 'reviewNo'], as_index=False)['count'].count()
merge_rating = merge_rating.sort_values(by='count', ascending=False)

In [11]:
top10_user = merge_rating[0:10]
top10_user

Unnamed: 0,userId,reviewNo,count
89,ykm3,15771936,700
67,sang,15771961,691
82,tsp0,15771934,677
32,hosu,15771998,564
99,zxcv,15772036,357
96,zard,15772012,276
4,artn,15771948,192
76,suha,15771976,108
49,ldsl,15771977,105
37,imag,15771940,102


In [12]:
def distance_cosine(a, b):
    return 1-distance.cosine(a,b)

def distance_euclidean(a, b):
    return 1/(1+distance.euclidean(a,b))

def distance_correlation(a, b):
    return 1-distance.correlation(a,b)

def nearest_neighbor_user(user, topN, simFunc):
    ul= rating_matrix.loc[user].dropna()
    ratedIndex = ul.index
    nn = {}
    
    for uid, row in rating_matrix.iterrows():
        interSectionU1 = []
        interSectionU2 = []
        if uid == user: continue
            
        for i in ratedIndex:
            if not math.isnan(row[i]):
                interSectionU1.append(ul[i])
                interSectionU2.append(row[i])
        interSectionLen = len(interSectionU1)
        
        if interSectionLen < 3: continue
        sim = simFunc(interSectionU1, interSectionU2)
        sim = round(sim, 2)
        if not math.isnan(sim): nn[uid] = sim
    return sorted(nn.items(), key=itemgetter(1))[:-(topN+1):-1]

In [13]:
print('=Cosine=')
for user in top10_user['reviewNo']:
    print('User {0} neighbors : {1}'.format(user, nearest_neighbor_user(int(user), 3, distance_cosine)))

=Cosine=
User 15771936 neighbors : [(15771972, 1.0), (15771993, 0.97), (15772003, 0.95)]
User 15771961 neighbors : [(15771970, 1.0), (15771966, 1.0), (15771947, 1.0)]
User 15771934 neighbors : [(15772009, 1.0), (15771942, 1.0), (15772030, 0.99)]
User 15771998 neighbors : [(15771970, 1.0), (15771965, 1.0), (15771947, 1.0)]
User 15772036 neighbors : [(15771959, 1.0), (15772000, 0.98), (15771966, 0.98)]
User 15772012 neighbors : [(15772003, 1.0), (15772016, 0.95), (15772000, 0.92)]
User 15771948 neighbors : [(15771966, 1.0), (15771945, 1.0), (15771943, 0.98)]
User 15771976 neighbors : [(15771974, 1.0), (15771966, 1.0), (15771947, 1.0)]
User 15771977 neighbors : [(15771971, 1.0), (15771959, 1.0), (15771983, 0.99)]
User 15771940 neighbors : [(15771993, 1.0), (15771981, 1.0), (15771947, 1.0)]


In [14]:
print('=Correlation=')
for user in top10_user['reviewNo']:
    print('User {0} neighbors : {1}'.format(user, nearest_neighbor_user(int(user), 3, distance_correlation)))

=Correlation=


  dist = 1.0 - uv / np.sqrt(uu * vv)


User 15771936 neighbors : [(15771993, 1.0), (15772022, 0.69), (15772019, 0.5)]
User 15771961 neighbors : [(15771972, 1.0), (15772015, 0.94), (15771974, 0.93)]
User 15771934 neighbors : [(15771942, 1.0), (15772030, 0.98), (15771947, 0.96)]
User 15771998 neighbors : [(15772005, 0.89), (15771922, 0.88), (15771980, 0.66)]
User 15772036 neighbors : [(15771966, 0.97), (15771943, 0.94), (15771971, 0.62)]
User 15772012 neighbors : [(15772016, 0.94), (15771974, 0.58), (15771969, 0.58)]
User 15771948 neighbors : [(15771943, 0.94), (15771938, 0.73), (15771926, 0.59)]
User 15771976 neighbors : [(15771940, 0.85), (15771974, 0.58), (15771927, 0.53)]
User 15771977 neighbors : [(15771971, 1.0), (15771959, 1.0), (15771998, 0.66)]
User 15771940 neighbors : [(15771981, 0.98), (15771995, 0.97), (15771974, 0.94)]


In [15]:
print('=Euclidean=')
for user in top10_user['reviewNo']:
    print('User {0} neighbors : {1}'.format(user, nearest_neighbor_user(int(user), 3, distance_euclidean)))

=Euclidean=
User 15771936 neighbors : [(15771972, 0.25), (15771993, 0.2), (15772020, 0.12)]
User 15771961 neighbors : [(15772031, 0.33), (15771970, 0.33), (15771954, 0.33)]
User 15771934 neighbors : [(15772030, 0.41), (15771988, 0.29), (15772003, 0.25)]
User 15771998 neighbors : [(15771922, 0.17), (15772030, 0.16), (15772005, 0.15)]
User 15772036 neighbors : [(15771959, 1.0), (15771943, 0.24), (15772022, 0.19)]
User 15772012 neighbors : [(15772016, 0.22), (15772003, 0.15), (15771974, 0.13)]
User 15771948 neighbors : [(15771945, 0.41), (15771943, 0.23), (15771938, 0.15)]
User 15771976 neighbors : [(15771974, 0.31), (15771926, 0.31), (15771966, 0.29)]
User 15771977 neighbors : [(15771971, 1.0), (15771959, 0.5), (15771938, 0.25)]
User 15771940 neighbors : [(15771993, 1.0), (15771947, 1.0), (15771981, 0.33)]


In [16]:
def predict_rating_cosine(userid, nn=100, simFunc=distance_cosine):
    neighbor = nearest_neighbor_user(userid, nn, simFunc)
    neighbor_id = [id for id, sim in neighbor]
    
    neighbor_movie = rating_matrix.loc[neighbor_id].dropna(1, how='all', thresh=1)
    neighbor_dict = (dict(neighbor))
    ret =[]
    
    for movieId, row in neighbor_movie.iteritems():
        jsum, wsum = 0, 0
        for v in row.dropna().iteritems():
            sim = neighbor_dict.get(v[0], 0)
            jsum += sim
            wsum += (v[1]*sim)
        ret.append([movieId, wsum/jsum])
        
    return ret

In [17]:
from pandas import DataFrame, Series
temp = DataFrame({'userId':['ykm3', 'sang', 'tsp0', 'hosu', 'zxcv', 'suha', 'ldsl', 'zard', 'artn', 'imag'], 
                  'rating':['3.0', '10.0', '7.0', '7.0', '10.0', '7.0', '1.0', '10.0', '10.0', '9.0'], 
                  'movieId':['145162', '161967', '163788', '180399', '86507', '180399', '157297', '158653', '172174', '181409']})
temp

Unnamed: 0,userId,rating,movieId
0,ykm3,3.0,145162
1,sang,10.0,161967
2,tsp0,7.0,163788
3,hosu,7.0,180399
4,zxcv,10.0,86507
5,suha,7.0,180399
6,ldsl,1.0,157297
7,zard,10.0,158653
8,artn,10.0,172174
9,imag,9.0,181409


In [18]:
selected_user = pd.merge(top10_user, temp, on='userId')
selected_user

Unnamed: 0,userId,reviewNo,count,rating,movieId
0,ykm3,15771936,700,3.0,145162
1,sang,15771961,691,10.0,161967
2,tsp0,15771934,677,7.0,163788
3,hosu,15771998,564,7.0,180399
4,zxcv,15772036,357,10.0,86507
5,zard,15772012,276,10.0,158653
6,artn,15771948,192,10.0,172174
7,suha,15771976,108,7.0,180399
8,ldsl,15771977,105,1.0,157297
9,imag,15771940,102,9.0,181409


In [19]:
result1 = []
for i in range(10):
    userId = int(selected_user.iloc[i].reviewNo)
    movieId = int(selected_user.iloc[i].movieId)
    predict = predict_rating_cosine(userId, 300, distance_cosine)
    
    for movie in predict:
        if movieId == movie[0]:
            result1.append([int(userId), int(movieId), movie[1]])
            
resultdf = pd.DataFrame(result1, columns=['userId', 'movieId', 'rating'])
resultdf

Unnamed: 0,userId,movieId,rating
0,15771936,145162,5.796813
1,15771961,161967,8.595979
2,15771934,163788,9.152855
3,15771998,180399,7.0
4,15772036,86507,8.567416
5,15771976,180399,7.0
6,15771977,157297,5.796262


In [20]:
realdata_rating = []
for i in range(7):
    if i > 4:
        realdata_rating.append(float(selected_user.iloc[i+2, 3]))
    else:
        realdata_rating.append(float(selected_user.iloc[i, 3]))
resultdata_rating = resultdf.rating.tolist()
print(resultdata_rating)
print(realdata_rating)
error_rate_absol = mean_absolute_error(realdata_rating, resultdata_rating)
error_rate_squared = mean_squared_error(realdata_rating, resultdata_rating)
print("=Cosine=")
print("\nError Rate(Absolute) : ", error_rate_absol)
print("Error Rate(Squared) : ", error_rate_squared)

[5.796812749003985, 8.59597924773022, 9.152854511970533, 6.999999999999999, 8.567415730337078, 6.999999999999999, 5.79626168224299]
[3.0, 10.0, 7.0, 7.0, 10.0, 7.0, 1.0]
=Cosine=

Error Rate(Absolute) :  1.797504852164316
Error Rate(Squared) :  5.6406631699636165


In [21]:
def predict_rating_euclidean(userid, nn=100, simFunc=distance_euclidean):
    neighbor = nearest_neighbor_user(userid, nn, simFunc)
    neighbor_id = [id for id, sim in neighbor]
    
    neighbor_movie = rating_matrix.loc[neighbor_id].dropna(1, how='all', thresh=1)
    neighbor_dict = (dict(neighbor))
    ret =[]
    
    for movieId, row in neighbor_movie.iteritems():
        jsum, wsum = 0, 0
        for v in row.dropna().iteritems():
            sim = neighbor_dict.get(v[0], 0)
            jsum += sim
            wsum += (v[1]*sim)
        ret.append([movieId, wsum/jsum])
        
    return ret

In [22]:
result_euclidean = []
for i in range(10):
    userId = int(selected_user.iloc[i].reviewNo)
    movieId = int(selected_user.iloc[i].movieId)
    predict = predict_rating_euclidean(userId, 300, distance_euclidean)
    
    for movie in predict:
        if movieId == movie[0]:
            result_euclidean.append([int(userId), int(movieId), movie[1]])
            
resultdf_euclidean = pd.DataFrame(result_euclidean, columns=['userId', 'movieId', 'rating'])
resultdf_euclidean

Unnamed: 0,userId,movieId,rating
0,15771936,145162,4.166667
1,15771961,161967,9.247312
2,15771934,163788,8.96875
3,15771998,180399,7.0
4,15772036,86507,8.125
5,15771976,180399,7.0
6,15771977,157297,6.525


In [23]:
realdata_rating_euclidean = []
for i in range(7):
    if i > 4:
        realdata_rating_euclidean.append(float(selected_user.iloc[i+2, 3]))
    else:
        realdata_rating_euclidean.append(float(selected_user.iloc[i, 3]))
    
resultdata_rating_euclidean = resultdf_euclidean.rating.tolist()
print(resultdata_rating_euclidean)
print(realdata_rating_euclidean)
error_rate_absol_euclidean = mean_absolute_error(realdata_rating_euclidean, resultdata_rating_euclidean)
error_rate_squared_euclidean = mean_squared_error(realdata_rating_euclidean, resultdata_rating_euclidean)
print("=Euclidean=")
print("\nError Rate(Absolute) : ", error_rate_absol_euclidean)
print("Error Rate(Squared) : ", error_rate_squared_euclidean)

[4.166666666666666, 9.247311827956983, 8.968749999999998, 7.0, 8.125, 7.0, 6.525000000000001]
[3.0, 10.0, 7.0, 7.0, 10.0, 7.0, 1.0]
=Euclidean=

Error Rate(Absolute) :  1.6125864055299546
Error Rate(Squared) :  5.692125308277796


In [24]:
def predict_rating_correlation(userid, nn=100, simFunc=distance_correlation):
    neighbor = nearest_neighbor_user(userid, nn, simFunc)
    neighbor_id = [id for id, sim in neighbor]
    
    neighbor_movie = rating_matrix.loc[neighbor_id].dropna(1, how='all', thresh=1)
    neighbor_dict = (dict(neighbor))
    ret =[]
    
    for movieId, row in neighbor_movie.iteritems():
        jsum, wsum = 0, 0
        for v in row.dropna().iteritems():
            sim = neighbor_dict.get(v[0], 0)
            jsum += sim
            wsum += (v[1]*sim)
        ret.append([movieId, wsum/jsum])
        
    return ret

In [25]:
result_correlation = []
for i in range(10):
    userId = int(selected_user.iloc[i].reviewNo)
    movieId = int(selected_user.iloc[i].movieId)
    predict = predict_rating_correlation(userId, 300, distance_correlation)
    
    for movie in predict:
        if movieId == movie[0]:
            result_correlation.append([int(userId), int(movieId), movie[1]])
            
resultdf_correlation = pd.DataFrame(result_correlation, columns=['userId', 'movieId', 'rating'])
resultdf_correlation

  dist = 1.0 - uv / np.sqrt(uu * vv)
  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


Unnamed: 0,userId,movieId,rating
0,15771936,145162,5.390625
1,15771961,161967,10.71028
2,15771934,163788,15.230769
3,15771998,180399,7.0
4,15772036,86507,11.344828
5,15771976,180399,7.0
6,15771977,157297,0.963964


In [26]:
realdata_rating_correlation = []
for i in range(7):
    if i > 4:
        realdata_rating_correlation.append(float(selected_user.iloc[i+2, 3]))
    else:
        realdata_rating_correlation.append(float(selected_user.iloc[i, 3]))
    
resultdata_rating_correlation = resultdf_correlation.rating.tolist()
print(resultdata_rating_correlation)
print(realdata_rating_correlation)
error_rate_absol_correlation = mean_absolute_error(realdata_rating_correlation, resultdata_rating_correlation)
error_rate_squared_correlation = mean_squared_error(realdata_rating_correlation, resultdata_rating_correlation)
print("=Correlation=")
print("\nError Rate(Absolute) : ", error_rate_absol_correlation)
print("Error Rate(Squared) : ", error_rate_squared_correlation)

[5.390625, 10.71028037383178, 15.230769230769202, 7.0, 11.344827586206897, 7.0, 0.9639639639639633]
[3.0, 10.0, 7.0, 7.0, 10.0, 7.0, 1.0]
=Correlation=

Error Rate(Absolute) :  1.8160768895491308
Error Rate(Squared) :  10.825001151824129
