In [201]:
import os

import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
from math import sqrt

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## 데이터 불러오기

In [2]:
data_path = './data/ml-latest-small/'

rating_df = pd.read_csv(os.path.join(data_path, 'ratings.csv'), encoding = 'utf-8')

display(rating_df.head())
print(rating_df.shape)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


(100836, 4)


### 데이터 분리 (Train / Test)

In [145]:
train_df, test_df = train_test_split(rating_df, test_size=0.2, random_state=42)

print(f'Train Dataset Shape : {train_df.shape}\nTest Datset Shape : {test_df.shape}')

Train Dataset Shape : (80668, 4)
Test Datset Shape : (20168, 4)


## Sparse Matrix 생성 (User - Item)

In [4]:
user_idx =  sorted(list(set(train_df['userId'].values)))
movie_idx = sorted(list(set(train_df['movieId'].values)))

print(f'사용자 수 : {len(user_idx)}\n영화 수 : {len(movie_idx)}')
print(user_idx[:3])
print(movie_idx[:3])

사용자 수 : 610
영화 수 : 8983
[1, 2, 3]
[1, 2, 3]


In [5]:
train_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
80568,509,7347,3.0,1435994597
50582,326,71462,4.0,1322252335
8344,57,2115,3.0,965798155
99603,610,1127,4.0,1479544102
71701,462,2409,2.0,1174438249


In [6]:
sparse_matrix = train_df.pivot_table(index='userId', 
                                    columns='movieId',
                                    values = 'rating').transpose()


In [7]:
sparse_matrix.fillna(0, inplace=True)

# NaN에 0이 아닌 다른 숫자를 주면서 실험을 해보는 것도 좋을듯

In [8]:
sparse_matrix

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,...,0.0,0.0,4.0,3.0,4.0,2.5,0.0,2.5,3.0,5.0
2,0.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,0.0,0.0,...,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0,0.0
3,4.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193583,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
193587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## cosine simliarity 구하기

In [9]:
from sklearn.metrics.pairwise import cosine_similarity

def cosine_matrix(a, b):
    cosim_values = cosine_similarity(a.values, b.values)

    return pd.DataFrame(data = cosim_values, columns = a.index.values, index = a.index.values)


## Item-based 이웃기반 협업필터링

In [10]:
item_sparse_df = cosine_matrix(sparse_matrix, sparse_matrix)
item_sparse_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
1,1.000000,0.362258,0.191738,0.000000,0.234845,0.278680,0.184020,0.128930,0.140521,0.329772,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.362258,1.000000,0.185543,0.092463,0.218363,0.228204,0.194008,0.160969,0.058837,0.349398,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.191738,0.185543,1.000000,0.122155,0.223768,0.177748,0.276536,0.246393,0.194060,0.235465,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.000000,0.092463,0.122155,1.000000,0.035394,0.000000,0.208622,0.189295,0.000000,0.080288,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.234845,0.218363,0.223768,0.035394,1.000000,0.209460,0.328209,0.271967,0.193847,0.180153,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193583,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193585,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0
193587,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0


In [11]:
# userId_grouped.groups
# {1: [128, 156, 54, 110, 137, 196, 202, 150, 198, 191, 170, 34, 140, 225, 122, 12, 83, 224, 120, 210, 24, 180, 144, 212, 176, 109, 190, 98, 104, 160, 7, 27, 208, 175, 6, 197, 80, 4, 41, 187, 108, 72, 21, 32, 86, 119, 130, 216, 52, 167, 49, 33, 231, 121, 152, 85, 173, 62, 223, 40, 69, 107, 73, 131, 0, 78, 217, 13, 204, 143, 11, 133, 47, 82, 71, 116, 61, 31, 214, 44, 38, 139, 19, 65, 171, 9, 67, 99, 213, 200, 227, 201, 3, 220, 30, 165, 17, 89, 91, 1, ...], 
#  2: [242, 237, 248, 245, ... }

userId_grouped = train_df.groupby('userId')

item_prediction_result_df = pd.DataFrame(index = list(userId_grouped.indices.keys()),
                                                      columns = sparse_matrix.index)
item_prediction_result_df

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,,,,,,,,,,,...,,,,,,,,,,
607,,,,,,,,,,,...,,,,,,,,,,
608,,,,,,,,,,,...,,,,,,,,,,
609,,,,,,,,,,,...,,,,,,,,,,


In [12]:
# userID  : 1 

# grouped DataFrame
#                 columns
# index 
#       userId  movieId  rating  timestamp
# 128       1     2090     5.0  964982838
# 156       1     2427     5.0  964982242
# 54        1     1030     3.0  964982903
# 110       1     1793     4.0  964981404
# 137       1     2139     5.0  964982791
# ..      ...      ...     ...        ...
# 126       1     2058     5.0  964982400
# 77        1     1220     5.0  964981909
# 117       1     2000     4.0  964982211
# 55        1     1031     5.0  964982653
# 206       1     3243     3.0  964981093

for userId, group in tqdm_notebook(userId_grouped):
    # user가 rating한 movieId * 전체 movieId
    user_sim = item_sparse_df.loc[group['movieId']]
    # user가 rating한 movieId * 1
    user_rating = group['rating']
    # 전체 movieId * 1
    sim_sum = user_sim.sum(axis = 0)

    # userId 전체 rating prediction (8938 * 1)
    pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (sim_sum + 1)
    item_prediction_result_df.loc[userId] = pred_ratings

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for userId, group in tqdm_notebook(userId_grouped):


  0%|          | 0/610 [00:00<?, ?it/s]

In [13]:
item_prediction_result_df

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
1,4.258985,4.225111,4.230892,3.660328,4.061304,4.244936,4.082133,4.115884,3.766055,4.209,...,0.411052,0.411052,0.411052,0.411052,0.411052,0.411052,0.411052,0.411052,0.411052,1.956337
2,3.23691,3.178129,2.639684,0.323271,2.506375,2.997009,1.924204,2.326197,1.241721,2.955827,...,1.2315,1.2315,1.2315,1.2315,1.2315,1.2315,1.2315,1.2315,1.2315,1.767844
3,1.313045,1.18366,1.189929,0.195481,0.724177,1.448498,0.770711,0.651952,0.78813,1.449468,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.348641,3.295965,3.202888,2.606133,3.182028,3.268109,3.260882,3.083274,2.805374,3.218618,...,0.497143,0.497143,0.497143,0.497143,0.497143,0.497143,0.497143,0.497143,0.497143,1.468258
5,3.353742,3.301582,3.086839,2.729331,3.085781,3.244677,3.1241,3.049025,2.194879,3.257455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.706917
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,3.634795,3.566787,3.482549,3.348661,3.52192,3.655325,3.625809,3.472507,3.302513,3.569193,...,1.637355,1.637355,1.637355,1.637355,1.637355,1.637355,1.637355,1.637355,1.637355,2.924673
607,3.625598,3.616726,3.613566,3.201787,3.458089,3.640195,3.489825,3.498278,3.164143,3.620283,...,0.246631,0.246631,0.246631,0.246631,0.246631,0.246631,0.246631,0.246631,0.246631,0.994172
608,3.18849,3.136215,3.014317,2.567629,2.997874,3.24358,2.937567,2.990444,2.660836,3.145266,...,1.134961,1.134961,1.134961,1.134961,1.134961,1.134961,1.134961,1.134961,1.134961,3.437221
609,2.993017,2.930762,2.745635,2.228823,2.69651,2.894252,2.701254,2.516283,2.143655,3.0474,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.306278


In [14]:
## 확인

idx = 0
for userId, group in userId_grouped:
    if idx < 1:
        print('유저 아이디 : %d' % userId)

        user_sim = item_sparse_df.loc[group['movieId']]
        print('유저가 남긴 영화의 유사도 \n', user_sim)

        user_rating = group['rating']
        
        print('유저가 남긴 영화 평점\n', user_rating)
        sim_sum = user_sim.sum(axis = 0)

        pred_ratings = np.matmul(user_sim.T.to_numpy(), user_rating) / (sim_sum + 1)
        print('matmul', np.matmul(user_sim.T.to_numpy(), user_rating))
        print('----',sim_sum+1 ,'---')
        print('예측 평점')
        print(pred_ratings)
    else:
        continue
    idx += 1

유저 아이디 : 1
유저가 남긴 영화의 유사도 
         1         2         3         4         5         6         7       \
2090  0.230178  0.180995  0.155336  0.000000  0.115021  0.072939  0.054743   
2427  0.158116  0.122543  0.212151  0.012819  0.045585  0.203903  0.081421   
1030  0.131946  0.101683  0.176803  0.083896  0.105470  0.081840  0.063435   
1793  0.053865  0.000000  0.134802  0.000000  0.000000  0.078575  0.000000   
2139  0.220132  0.194935  0.240277  0.068009  0.128246  0.125313  0.080220   
...        ...       ...       ...       ...       ...       ...       ...   
2058  0.176429  0.180664  0.143569  0.000000  0.118734  0.259047  0.099275   
1220  0.310261  0.245127  0.218426  0.000000  0.050988  0.318297  0.097920   
2000  0.311418  0.264546  0.246346  0.000000  0.068900  0.319940  0.069541   
1031  0.229687  0.253332  0.278447  0.040146  0.056237  0.169267  0.064110   
3243  0.141659  0.178712  0.321097  0.093764  0.077461  0.070293  0.080822   

        8         9         10     

## User Based 협업필터링

In [15]:
user_matrix = sparse_matrix.transpose()
user_matrix

movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,2.5,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
609,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
user_cosine_df = cosine_matrix(user_matrix, user_matrix)
user_cosine_df

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,1.000000,0.016314,0.049021,0.165799,0.123392,0.118556,0.112563,0.142135,0.056088,0.012906,...,0.070901,0.152097,0.187324,0.067264,0.151517,0.139042,0.198771,0.232811,0.112174,0.143902
2,0.016314,1.000000,0.000000,0.004627,0.000000,0.013391,0.029067,0.032754,0.000000,0.080739,...,0.170123,0.020395,0.014415,0.000000,0.000000,0.019846,0.016076,0.055610,0.032404,0.075810
3,0.049021,0.000000,1.000000,0.000000,0.005770,0.004833,0.000000,0.005911,0.000000,0.000000,...,0.006401,0.005889,0.015344,0.000000,0.012783,0.008884,0.004642,0.009433,0.000000,0.031309
4,0.165799,0.004627,0.000000,1.000000,0.133565,0.090914,0.094497,0.050417,0.000000,0.021991,...,0.075828,0.090252,0.241155,0.054366,0.081585,0.162277,0.083074,0.107276,0.026720,0.068325
5,0.123392,0.000000,0.005770,0.133565,1.000000,0.238812,0.071386,0.393773,0.000000,0.006245,...,0.050523,0.343953,0.101064,0.159651,0.111464,0.086797,0.073278,0.097040,0.205395,0.053090
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.139042,0.019846,0.008884,0.162277,0.086797,0.086447,0.137372,0.080967,0.053366,0.061267,...,0.136437,0.077495,0.244189,0.061137,0.132016,1.000000,0.120745,0.224829,0.064349,0.159929
607,0.198771,0.016076,0.004642,0.083074,0.073278,0.135438,0.171735,0.159539,0.014172,0.012561,...,0.093158,0.158940,0.156456,0.101872,0.083353,0.120745,1.000000,0.208673,0.096324,0.097743
608,0.232811,0.055610,0.009433,0.107276,0.097040,0.136393,0.238417,0.155110,0.091135,0.051562,...,0.134926,0.141069,0.188459,0.111872,0.154623,0.224829,0.208673,1.000000,0.110371,0.260886
609,0.112174,0.032404,0.000000,0.026720,0.205395,0.181736,0.052096,0.439794,0.000000,0.028483,...,0.028450,0.306228,0.055558,0.181878,0.093744,0.064349,0.096324,0.110371,1.000000,0.057971


In [18]:
movieId_grouped = train_df.groupby('movieId')

user_prediction_result_df = pd.DataFrame(index = list(movieId_grouped.indices.keys()),
                                                      columns = user_matrix.index)
user_prediction_result_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,,,,,,,,,,...,,,,,,,,,,
193583,,,,,,,,,,,...,,,,,,,,,,
193585,,,,,,,,,,,...,,,,,,,,,,
193587,,,,,,,,,,,...,,,,,,,,,,


In [19]:
for movieId, group in tqdm_notebook(movieId_grouped):
    user_sim =  user_cosine_df.loc[group['userId']]
    user_rating = group['rating']
    sim_sum = user_sim.sum(axis=0)

    pred_rating = np.matmul(user_sim.T.to_numpy(), user_rating) / (sim_sum + 1)

    user_prediction_result_df.loc[movieId] = pred_rating

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for movieId, group in tqdm_notebook(movieId_grouped):


  0%|          | 0/8983 [00:00<?, ?it/s]

In [20]:
user_prediction_result_df

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
1,3.740314,3.41796,2.382156,3.641485,3.730396,3.788125,3.769571,3.765594,3.436889,3.558568,...,3.704037,3.776657,3.690981,3.72976,3.769396,3.657551,3.737898,3.726095,3.729687,3.730516
2,3.094927,2.674527,1.500534,2.972038,3.258272,3.341466,3.083024,3.332515,2.708752,2.783947,...,2.999499,3.343025,2.973342,3.338386,3.1446,3.044382,3.110844,3.103048,3.25369,3.09359
3,2.789876,1.61624,1.013439,2.473705,2.623135,3.053123,2.446294,2.790097,1.78289,1.7181,...,2.119969,2.848375,2.595595,2.776116,2.428492,2.525752,2.652162,2.668563,2.628101,2.433483
4,0.951121,0.160544,0.073546,0.644564,1.359217,1.761489,0.632034,1.482637,0.201592,0.266761,...,0.459825,1.517327,0.786407,1.314543,0.7722,0.833127,1.00561,0.951733,1.269659,0.591461
5,2.27525,1.448245,0.41241,2.028397,2.573024,2.751361,2.219484,2.707813,1.402479,1.501527,...,1.828184,2.710109,2.063649,2.666476,2.26056,2.08718,2.280287,2.373766,2.631978,2.043206
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.027287,0.260269,0.0,0.046936,0.0,0.0,0.051417,0.0,0.0,0.069101,...,0.472685,0.0,0.023375,0.0,0.092484,0.064043,0.021543,0.050768,0.0,0.37947
193583,0.023876,0.227735,0.0,0.041069,0.0,0.0,0.04499,0.0,0.0,0.060463,...,0.4136,0.0,0.020453,0.0,0.080924,0.056037,0.01885,0.044422,0.0,0.332036
193585,0.023876,0.227735,0.0,0.041069,0.0,0.0,0.04499,0.0,0.0,0.060463,...,0.4136,0.0,0.020453,0.0,0.080924,0.056037,0.01885,0.044422,0.0,0.332036
193587,0.023876,0.227735,0.0,0.041069,0.0,0.0,0.04499,0.0,0.0,0.060463,...,0.4136,0.0,0.020453,0.0,0.080924,0.056037,0.01885,0.044422,0.0,0.332036


In [21]:
print('---- Item 기반 이웃 협업필터링 ----')
display(item_prediction_result_df.head())

print('---- User 기반 이웃 협업필터링 ----')
display(user_prediction_result_df.transpose().head())

---- Item 기반 이웃 협업필터링 ----


movieId,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
1,4.258985,4.225111,4.230892,3.660328,4.061304,4.244936,4.082133,4.115884,3.766055,4.209,...,0.411052,0.411052,0.411052,0.411052,0.411052,0.411052,0.411052,0.411052,0.411052,1.956337
2,3.23691,3.178129,2.639684,0.323271,2.506375,2.997009,1.924204,2.326197,1.241721,2.955827,...,1.2315,1.2315,1.2315,1.2315,1.2315,1.2315,1.2315,1.2315,1.2315,1.767844
3,1.313045,1.18366,1.189929,0.195481,0.724177,1.448498,0.770711,0.651952,0.78813,1.449468,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.348641,3.295965,3.202888,2.606133,3.182028,3.268109,3.260882,3.083274,2.805374,3.218618,...,0.497143,0.497143,0.497143,0.497143,0.497143,0.497143,0.497143,0.497143,0.497143,1.468258
5,3.353742,3.301582,3.086839,2.729331,3.085781,3.244677,3.1241,3.049025,2.194879,3.257455,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.706917


---- User 기반 이웃 협업필터링 ----


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.740314,3.094927,2.789876,0.951121,2.27525,3.703472,2.626654,1.449118,1.70502,3.222512,...,0.030698,0.023876,0.027287,0.027287,0.023876,0.027287,0.023876,0.023876,0.023876,0.228209
2,3.41796,2.674527,1.61624,0.160544,1.448245,3.174517,1.42666,0.520892,0.454905,2.828223,...,0.292802,0.227735,0.260269,0.260269,0.227735,0.260269,0.227735,0.227735,0.227735,0.422754
3,2.382156,1.500534,1.013439,0.073546,0.41241,2.121499,0.650073,0.15634,0.18322,1.803467,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.641485,2.972038,2.473705,0.644564,2.028397,3.512056,2.465829,1.077831,1.288335,3.055768,...,0.052803,0.041069,0.046936,0.046936,0.041069,0.046936,0.041069,0.041069,0.041069,0.178123
5,3.730396,3.258272,2.623135,1.359217,2.573024,3.516378,2.826829,1.608303,1.335225,3.16351,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161087


## 추천시스템 평가 (test_df)

In [96]:
user_prediction_result_df = user_prediction_result_df.transpose()

In [97]:
user_prediction_result_df.head()

Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,191005,193565,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.740314,3.094927,2.789876,0.951121,2.27525,3.703472,2.626654,1.449118,1.70502,3.222512,...,0.030698,0.023876,0.027287,0.027287,0.023876,0.027287,0.023876,0.023876,0.023876,0.228209
2,3.41796,2.674527,1.61624,0.160544,1.448245,3.174517,1.42666,0.520892,0.454905,2.828223,...,0.292802,0.227735,0.260269,0.260269,0.227735,0.260269,0.227735,0.227735,0.227735,0.422754
3,2.382156,1.500534,1.013439,0.073546,0.41241,2.121499,0.650073,0.15634,0.18322,1.803467,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,3.641485,2.972038,2.473705,0.644564,2.028397,3.512056,2.465829,1.077831,1.288335,3.055768,...,0.052803,0.041069,0.046936,0.046936,0.041069,0.046936,0.041069,0.041069,0.041069,0.178123
5,3.730396,3.258272,2.623135,1.359217,2.573024,3.516378,2.826829,1.608303,1.335225,3.16351,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.161087


In [88]:
test_df.head()  ## train_df에 있는 movieId, userId만 평가할 수 있음

Unnamed: 0,userId,movieId,rating,timestamp
67037,432,77866,4.5,1335139641
42175,288,474,3.0,978465565
93850,599,4351,3.0,1498524542
6187,42,2987,4.0,996262677
12229,75,1610,4.0,1158989841


In [89]:
print(user_prediction_result_df.shape)
print(item_prediction_result_df.shape)

(8983, 610)
(610, 8983)


In [98]:
pred_ratings.to_frame()

Unnamed: 0,0
1,4.258985
2,4.225111
3,4.230892
4,3.660328
5,4.061304
...,...
193581,0.411052
193583,0.411052
193585,0.411052
193587,0.411052


In [153]:
def evaluate(test_df, prediction_result_df):
    groups_with_movie_idx = test_df.groupby('movieId')
    groups_with_user_idx = test_df.groupby('userId')

    intersection_movie_idx = sorted(list(set(list(prediction_result_df.columns)).intersection(set(list(groups_with_movie_idx.indices.keys())))))
    intersection_user_idx = sorted(list(set(list(prediction_result_df.index)).intersection(set(list(groups_with_user_idx.indices.keys())))))

    print('Test Data & Prediction 같이 있는 영화 수 : ', len(intersection_movie_idx))
    print('Test Data & Prediction 같이 있는 사용자 수  : ',len(intersection_user_idx))

    evaluate_using_df = prediction_result_df.loc[intersection_user_idx][intersection_movie_idx]
    
    grouped = test_df.groupby('userId')
    result_df = pd.DataFrame(columns= ['rmse'])

    for userId, group in tqdm_notebook(grouped):
        if userId in intersection_user_idx:
            pred_ratings = evaluate_using_df.loc[userId][evaluate_using_df.loc[userId].index.intersection(list(group['movieId'].values))]
            # print('---- pred 1 ----')
            # display(pred_ratings)
            pred_ratings = pred_ratings.to_frame(name='rating').reset_index() # .rename(columns={'index' : 'movieId', 'rating' : 'pred_rating'})

            # print('---- pred 2 ----')
            # display(pred_ratings)
        
            # print('--- group ---')
            # print(group[['rating', 'movieId']])
            actual_ratings = group[['rating', 'movieId']].rename(columns = {'rating' : 'true_rating'})

            # print('--- actual_ratings')
            # display(actual_ratings)

            result_df = pd.merge(actual_ratings, pred_ratings, how = 'inner', on='movieId')

            reusult_df = result_df.round(4) # 반올림
        
    return reusult_df



In [154]:
evaluate(test_df, item_prediction_result_df)

Test Data & Prediction 같이 있는 영화 수 :  4401
Test Data & Prediction 같이 있는 사용자 수  :  610


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for userId, group in tqdm_notebook(grouped):


  0%|          | 0/610 [00:00<?, ?it/s]

Unnamed: 0,true_rating,movieId,rating
0,3.0,4228,3.571375
1,4.0,46970,3.720459
2,3.0,7324,3.577028
3,3.5,27904,3.742933
4,1.0,120635,3.556273
...,...,...,...
218,2.0,7454,3.626313
219,5.0,2076,3.841106
220,4.5,3016,3.711387
221,3.5,4448,3.675157


-----

### 전체 데이터셋 평가 (위에는 마지막 사용자에 대한 평가)

In [162]:
common_user_idx = set(list(user_prediction_result_df.index)).intersection(set(test_df['userId']))
common_movie_idx = set(list(user_prediction_result_df.columns)).intersection(set(test_df['movieId']))

print('Common User', len(common_user_idx))
print('Common Item', len(common_movie_idx))

Common User 610
Common Item 4401


In [150]:
common_predict_df = user_prediction_result_df.loc[common_user_idx][common_movie_idx]
common_predict_df.tail()

  common_predict_df = user_prediction_result_df.loc[common_user_idx][common_movie_idx]
  common_predict_df = user_prediction_result_df.loc[common_user_idx][common_movie_idx]


Unnamed: 0_level_0,1,2,3,4,5,6,7,8,9,10,...,98243,8132,8142,8157,114662,65514,40946,106487,106489,106491
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606,3.657551,3.044382,2.525752,0.833127,2.08718,3.63601,2.541677,1.300969,1.255474,3.155145,...,1.608779,2.563258,0.68939,1.362376,2.480569,2.187383,0.205897,2.46735,2.383898,1.108423
607,3.737898,3.110844,2.652162,1.00561,2.280287,3.652392,2.578136,1.382262,1.519839,3.194088,...,1.254256,2.344404,0.445201,1.327345,2.051076,1.873974,0.186263,2.120152,2.131702,0.923764
608,3.726095,3.103048,2.668563,0.951733,2.373766,3.751769,2.665409,1.670518,1.754752,3.315045,...,1.815619,3.236373,1.034534,1.720237,2.682038,2.536776,0.349079,2.590209,2.586917,1.447144
609,3.729687,3.25369,2.628101,1.269659,2.631978,3.561744,2.840061,1.593327,1.614517,3.272353,...,0.782071,1.999692,0.273972,0.713472,1.763762,1.359055,0.113536,1.825762,1.776084,0.779968
610,3.730516,3.09359,2.433483,0.591461,2.043206,3.704967,2.362356,1.393514,1.222528,3.243026,...,2.116431,2.779762,2.5,1.437907,2.849329,2.737849,0.336432,2.87406,2.693586,1.614371


In [204]:
common_user_idx = set(list(user_prediction_result_df.index)).intersection(set(test_df['userId']))
common_movie_idx = set(list(user_prediction_result_df.columns)).intersection(set(test_df['movieId']))
common_predict_df = user_prediction_result_df.loc[common_user_idx][common_movie_idx]

grouped = test_df.groupby('userId')
user_result = pd.DataFrame(columns = ['movieId', 'pred_rating', 'rating'])

# count = 0

for userId, group in tqdm_notebook(grouped):
    if userId in common_user_idx:
        # if count < 1:
        pred_rating = common_predict_df.loc[userId][common_predict_df.loc[userId].index.intersection(list(group['movieId'].values))]
        # print(pred_rating)
        pred_rating = pred_rating.to_frame().reset_index().rename(columns = {userId : 'pred_rating', 'index' : 'movieId'})
        # display(pred_rating)
        actual_rating = group[['rating', 'movieId']]
        # display(actual_rating)
        final_df = pd.merge(pred_rating, actual_rating, how='inner', on='movieId')
        # display(final_df)
        # count += 1
        user_result = pd.concat([user_result, final_df], axis = 0)

print(user_result.shape)
print('전체 데이터에 대한 RMSE : ', sqrt(mean_squared_error(user_result['pred_rating'], user_result['rating'])))

  common_predict_df = user_prediction_result_df.loc[common_user_idx][common_movie_idx]
  common_predict_df = user_prediction_result_df.loc[common_user_idx][common_movie_idx]
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for userId, group in tqdm_notebook(grouped):


  0%|          | 0/610 [00:00<?, ?it/s]

(19355, 3)
전체 데이터에 대한 RMSE :  1.4130603578370085


In [207]:
common_user_idx = set(list(user_prediction_result_df.index)).intersection(set(test_df['userId']))
common_movie_idx = set(list(user_prediction_result_df.columns)).intersection(set(test_df['movieId']))

common_predict_df = item_prediction_result_df.loc[common_user_idx][common_movie_idx]

grouped = test_df.groupby('userId')
item_result = pd.DataFrame(columns = ['movieId', 'pred_rating', 'rating'])

# count = 0
for userId, group in tqdm_notebook(grouped):
    if userId in common_user_idx:
        pred_rating = common_predict_df.loc[userId][common_predict_df.loc[userId].index.intersection(list(group['movieId'].values))]
        # print(pred_rating)
        pred_rating = pred_rating.to_frame().reset_index().rename(columns = {userId : 'pred_rating', 'index' : 'movieId'})
        # display(pred_rating)
        actual_rating = group[['rating', 'movieId']]
        # display(actual_rating)
        final_df = pd.merge(pred_rating, actual_rating, how='inner', on='movieId')

        item_result = pd.concat([item_result, final_df], axis=0)

print(item_result.shape)
print('전체 데이터에 대한 RMSE : ', sqrt(mean_squared_error(item_result['pred_rating'], item_result['rating'])))

  common_predict_df = item_prediction_result_df.loc[common_user_idx][common_movie_idx]
  common_predict_df = item_prediction_result_df.loc[common_user_idx][common_movie_idx]
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for userId, group in tqdm_notebook(grouped):


  0%|          | 0/610 [00:00<?, ?it/s]

(19355, 3)
전체 데이터에 대한 RMSE :  0.9440534851834362
