In [1]:
import pandas as pd
import csv
from collections import defaultdict
from datetime import datetime
import matplotlib.patches as mpatches
import matplotlib
import time
import math
from operator import itemgetter
from scipy.spatial import distance
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

# [과제1] 네이버 평점 데이터 수

In [2]:
df = pd.read_csv('./data/rating.csv')
df.tail()

Unnamed: 0,userId,rating,movieId
5119,huya,10,73394
5120,huya,10,70773
5121,huya,8,16220
5122,huya,10,36666
5123,huya,10,37235


In [3]:
df_user = pd.read_csv('./data/naver_user.csv')
df_user.head()

Unnamed: 0,reviewNo,userId
0,15772038,airf
1,15772037,nanw
2,15772036,zxcv
3,15772035,sdh1
4,15772032,guan


In [4]:
user_num = df.userId.unique().shape[0]
movie_num = df.movieId.unique().shape[0]
print('유저 수는 {0}명 이고, \n영화의 수는 {1}개 입니다'.format(user_num,movie_num))

유저 수는 100명 이고, 
영화의 수는 2697개 입니다


# [과제 2-a] 유사 사용자 탐색 

In [5]:
data = pd.merge(df, df_user, on = ['userId'],how = 'left')
data['Count'] = 1

### 가장 많은 평점을 남긴 사용자 10명의 2번째 리뷰 

In [6]:
Top10 = data.groupby(['userId'])['Count'].sum().nlargest(10).reset_index()
data = data[True == data.duplicated(['reviewNo'], keep='first')]
data = data[False == data.duplicated(['reviewNo'], keep='first')]
del Top10['Count']
del data['Count']
Top10 = pd.merge(Top10, data, on = ['userId'],how = 'left')
Top10.head(10)

Unnamed: 0,userId,rating,movieId,reviewNo
0,ykm3,3,145162,15771936
1,sang,10,161967,15771961
2,tsp0,7,163788,15771934
3,hosu,7,180399,15771998
4,zxcv,10,86507,15772036
5,zard,10,158653,15772012
6,artn,10,172174,15771948
7,suha,7,180399,15771976
8,ldsl,1,157297,15771977
9,imag,9,181409,15771940


In [7]:
df = pd.merge(df, df_user, on = ['userId'],how = 'left')
df.head()

Unnamed: 0,userId,rating,movieId,reviewNo
0,airf,2,136900,15772038
1,airf,10,163788,15772038
2,airf,10,174065,15772038
3,nanw,10,154667,15772037
4,nanw,10,136900,15772037


In [8]:
UM_matrix_ds = df.pivot(index ='reviewNo', columns = 'movieId',values = 'rating')
UM_matrix_ds.head(5)

movieId,10002,10003,10004,10005,10006,10008,10009,10012,10016,10018,...,181409,181410,181411,181414,181419,181711,182348,182360,183132,183877
reviewNo,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15771922,,,,,,,,,,,...,,,,,,,,,,
15771925,,,,,,,,,,,...,,8.0,,,,,,,,
15771926,,,,,,,,,,,...,,,,,,,,,,
15771927,,,,,,,,,,,...,,,,,,,,,,
15771929,,,,,,,,,,,...,,,,,,,,,,


### 유사 사용자 탐색을 위해 선언 

In [9]:
def distance_cosine(a,b):
    return 1-distance.cosine(a,b)

In [10]:
def distance_correlation(a,b):
    return 1-distance.correlation(a,b)

In [11]:
def distance_euclidean(a,b):
    return 1/(1+distance.euclidean(a,b))

In [12]:
def nearest_neighbor_user(user, topN, simFunc):
    u1 = UM_matrix_ds.loc[user].dropna()
    ratedIndex = u1.index
    nn = {}
    
    for uid, row in UM_matrix_ds.iterrows():
        interSectionU1 = []
        interSectionU2 = []
        if uid == user : continue
        for i in ratedIndex:
            if not math.isnan(row[i]):
                interSectionU1.append(u1[i])
                interSectionU2.append(row[i])
        interSectionLen = len(interSectionU1)
        
        if interSectionLen < 3: continue
        sim = simFunc(interSectionU1, interSectionU2)
        
        if not math.isnan(sim): nn[uid] = sim
    return sorted(nn.items(), key=itemgetter(1))[:-(topN+1):-1]

### [과제 2-a] 결과 - Cosine

In [13]:
print('Cosine 결과')
topN = 3
for user in Top10['reviewNo']:
    neighbor = []
    nearest = nearest_neighbor_user(int(user),3,distance_cosine)
    for i in range(0,topN):
        neighbor.append([nearest[i][0], round(nearest[i][1],2)])
    print('User {0} neighbors : {1}'.format(user, neighbor))

Cosine 결과
User 15771936 neighbors : [[15771972, 1.0], [15771993, 0.97], [15772003, 0.95]]
User 15771961 neighbors : [[15771947, 1.0], [15771970, 1.0], [15771944, 1.0]]
User 15771934 neighbors : [[15771942, 1.0], [15772009, 1.0], [15771970, 0.99]]
User 15771998 neighbors : [[15771970, 1.0], [15771965, 1.0], [15771947, 1.0]]
User 15772036 neighbors : [[15771959, 1.0], [15771943, 0.98], [15771966, 0.98]]
User 15772012 neighbors : [[15772003, 1.0], [15772016, 0.95], [15772000, 0.92]]
User 15771948 neighbors : [[15771966, 1.0], [15771945, 1.0], [15771943, 0.98]]
User 15771976 neighbors : [[15771966, 1.0], [15771947, 1.0], [15771926, 1.0]]
User 15771977 neighbors : [[15771971, 1.0], [15771959, 1.0], [15771938, 0.99]]
User 15771940 neighbors : [[15771993, 1.0], [15771947, 1.0], [15771981, 1.0]]


### [과제 2-a] 결과 - Correlation

In [14]:
print('Correlation 결과')
topN = 3
for user in Top10['reviewNo']:
    neighbor = []
    nearest = nearest_neighbor_user(int(user),3,distance_correlation)
    for i in range(0,topN):
        neighbor.append([nearest[i][0], round(nearest[i][1],2)])
    print('User {0} neighbors : {1}'.format(user, neighbor))

Correlation 결과


  dist = 1.0 - uv / np.sqrt(uu * vv)


User 15771936 neighbors : [[15771993, 1.0], [15772022, 0.69], [15772019, 0.5]]
User 15771961 neighbors : [[15771972, 1.0], [15772015, 0.94], [15771974, 0.93]]
User 15771934 neighbors : [[15771942, 1.0], [15772030, 0.98], [15771947, 0.96]]
User 15771998 neighbors : [[15772005, 0.89], [15771922, 0.88], [15771977, 0.66]]
User 15772036 neighbors : [[15771966, 0.97], [15771943, 0.94], [15771971, 0.62]]
User 15772012 neighbors : [[15772016, 0.94], [15771974, 0.58], [15771969, 0.58]]
User 15771948 neighbors : [[15771943, 0.94], [15771938, 0.73], [15771926, 0.59]]
User 15771976 neighbors : [[15771940, 0.85], [15771974, 0.58], [15771927, 0.53]]
User 15771977 neighbors : [[15771971, 1.0], [15771959, 1.0], [15771998, 0.66]]
User 15771940 neighbors : [[15771981, 0.98], [15771995, 0.97], [15771974, 0.94]]


### [과제 2-a] 결과 - Euclidean

In [15]:
print('Euclidean 결과')
topN = 3
for user in Top10['reviewNo']:
    neighbor = []
    nearest = nearest_neighbor_user(int(user),3,distance_euclidean)
    for i in range(0,topN):
        neighbor.append([nearest[i][0], round(nearest[i][1],2)])
    print('User {0} neighbors : {1}'.format(user, neighbor))

Euclidean 결과
User 15771936 neighbors : [[15771972, 0.25], [15771993, 0.2], [15772020, 0.12]]
User 15771961 neighbors : [[15772031, 0.33], [15771970, 0.33], [15771954, 0.33]]
User 15771934 neighbors : [[15772030, 0.41], [15771988, 0.29], [15772003, 0.25]]
User 15771998 neighbors : [[15771922, 0.17], [15772030, 0.16], [15772005, 0.15]]
User 15772036 neighbors : [[15771959, 1.0], [15771943, 0.24], [15772022, 0.19]]
User 15772012 neighbors : [[15772016, 0.22], [15772003, 0.15], [15771974, 0.13]]
User 15771948 neighbors : [[15771945, 0.41], [15771943, 0.23], [15771938, 0.15]]
User 15771976 neighbors : [[15771974, 0.31], [15771926, 0.31], [15771966, 0.29]]
User 15771977 neighbors : [[15771971, 1.0], [15771959, 0.5], [15771938, 0.25]]
User 15771940 neighbors : [[15771993, 1.0], [15771947, 1.0], [15771981, 0.33]]


# [과제 2-b] 영화 평점 예측

In [16]:
def predict_rating(userid, nn= 100, simFunc= distance_cosine):
    neighbor = nearest_neighbor_user(userid,nn,simFunc)
    neigbor_id = [id for id, sim in neighbor]
    
    neighbor_movie = UM_matrix_ds.loc[neigbor_id].dropna(1,how = 'all', thresh = 1)
    neighbor_dict = (dict(neighbor))
    ret = []
    
    for movieId, row in neighbor_movie.iteritems():
        jsum, wsum = 0, 0
        for v in row.dropna().iteritems():
            sim = neighbor_dict.get(v[0],0)
            jsum += sim
            wsum += (v[1]*sim)
        ret.append([movieId,wsum/jsum])
    return ret

### [과제 2-b] Cosine 예측 결과

In [17]:
result = []
for i in range(10):
    userId = int(Top10.iloc[i].reviewNo)
    movieId = int(Top10.iloc[i].movieId)
    predict = predict_rating(userId, 300, distance_cosine)
    
    for movie in predict:
        if movieId == movie[0]:
            result.append([int(userId), int(movieId),movie[1]])
resultdf = pd.DataFrame(result, columns=['userId','movieId','rating'])
print('Cosine 결과')
resultdf

Cosine 결과


Unnamed: 0,userId,movieId,rating
0,15771936,145162,5.796201
1,15771961,161967,8.59564
2,15771934,163788,9.150525
3,15771998,180399,7.0
4,15772036,86507,8.561077
5,15771976,180399,7.0
6,15771977,157297,5.80092


In [18]:
realdata_rating = []
for userid in resultdf['userId']:
    realdata_rating.append(float(Top10[Top10['reviewNo'] == userid]['rating']))

resultdata_rating = resultdf.rating.tolist()

error_rate_absol = mean_absolute_error(realdata_rating, resultdata_rating)
error_rate_squared = mean_squared_error(realdata_rating, resultdata_rating)
print('\nError Rate(Absolute) : ',error_rate_absol)
print('Error Rate(Squared) : ',error_rate_squared)


Error Rate(Absolute) :  1.798704122760153
Error Rate(Squared) :  5.6478653086387975


### [과제 2-b] Correlation 예측 결과

In [19]:
result = []
for i in range(10):
    userId = int(Top10.iloc[i].reviewNo)
    movieId = int(Top10.iloc[i].movieId)
    predict = predict_rating(userId, 300, distance_correlation)
    
    for movie in predict:
        if movieId == movie[0]:
            result.append([int(userId), int(movieId),movie[1]])
resultdf = pd.DataFrame(result, columns=['userId','movieId','rating'])
print('Correlation 결과')
resultdf

  dist = 1.0 - uv / np.sqrt(uu * vv)
  from ipykernel import kernelapp as app


Correlation 결과


Unnamed: 0,userId,movieId,rating
0,15771936,145162,5.433217
1,15771961,161967,10.715807
2,15771934,163788,15.406481
3,15771998,180399,7.0
4,15772036,86507,11.304849
5,15771976,180399,7.0
6,15771977,157297,0.954683


In [20]:
realdata_rating = []
for userid in resultdf['userId']:
    realdata_rating.append(float(Top10[Top10['reviewNo'] == userid]['rating']))

resultdata_rating = resultdf.rating.tolist()

error_rate_absol = mean_absolute_error(realdata_rating, resultdata_rating)
error_rate_squared = mean_squared_error(realdata_rating, resultdata_rating)
print('\nError Rate(Absolute) : ',error_rate_absol)
print('Error Rate(Squared) : ',error_rate_squared)


Error Rate(Absolute) :  1.8436674517673026
Error Rate(Squared) :  11.258077133072339


### [과제 2-b]Euclidean 예측 결과

In [21]:
result = []
for i in range(10):
    userId = int(Top10.iloc[i].reviewNo)
    movieId = int(Top10.iloc[i].movieId)
    predict = predict_rating(userId, 300, distance_euclidean)
    
    for movie in predict:
        if movieId == movie[0]:
            result.append([int(userId), int(movieId),movie[1]])
resultdf = pd.DataFrame(result, columns=['userId','movieId','rating'])
print('Euclidean 결과')
resultdf

Euclidean 결과


Unnamed: 0,userId,movieId,rating
0,15771936,145162,4.649008
1,15771961,161967,9.263031
2,15771934,163788,8.956179
3,15771998,180399,7.0
4,15772036,86507,8.054289
5,15771976,180399,7.0
6,15771977,157297,6.588583


In [22]:
realdata_rating = []
for userid in resultdf['userId']:
    realdata_rating.append(float(Top10[Top10['reviewNo'] == userid]['rating']))

resultdata_rating = resultdf.rating.tolist()

error_rate_absol = mean_absolute_error(realdata_rating, resultdata_rating)
error_rate_squared = mean_squared_error(realdata_rating, resultdata_rating)
print('\nError Rate(Absolute) : ',error_rate_absol)
print('Error Rate(Squared) : ',error_rate_squared)


Error Rate(Absolute) :  1.6966357708958737
Error Rate(Squared) :  6.0152917257030625
