In [8]:
import random
import numpy as np
import pandas as pd
import operator
from scipy.sparse import coo_matrix
from numpy.linalg import norm
from sklearn.metrics import mean_squared_error

In [2]:
# List the contents of the current working directory (IPython shell shortcut).
ls

[0m[01;34msample_data[0m/


In [3]:
import os, sys 
from google.colab import drive 

### Running this cell connects Google Drive to the Colab runtime under /content/drive

drive.mount('/content/drive')


Mounted at /content/drive


In [4]:
#### Inspect the format of the movie dataset ####
# Load the movie metadata; the preview below shows 3 columns, starting with movieId and ending with genres.
movies = pd.read_csv("/content/drive/MyDrive/boostcamp/data/graph/others/movies.csv")

print("### Movie Dataset Format ###", end = '\n\n')
print(movies.head())

### Movie Dataset Format ###

   movieId  ...                                       genres
0        1  ...  Adventure|Animation|Children|Comedy|Fantasy
1        2  ...                   Adventure|Children|Fantasy
2        3  ...                               Comedy|Romance
3        4  ...                         Comedy|Drama|Romance
4        5  ...                                       Comedy

[5 rows x 3 columns]


In [5]:
################### Preprocessing for the synthetic users: collect movie ids per genre #################

# These per-genre lists are used later to build synthetic users who love one
# specific genre, so that the recommendation results can be sanity-checked.

movie_dict = {}                     # {movie_id : (movie_title, movie_genre)}
musical_list = []                   # ids of movies whose genre string contains 'Musical'
horror_list = []                    # ... 'Horror'
documentary_list = []               # ... 'Documentary'
comedy_list = []                    # ... 'Comedy'
animation_list = []                 # ... 'Animation'

# Genre keyword -> list that collects the ids of the matching movies.
genre_buckets = (
    ('Musical', musical_list),
    ('Horror', horror_list),
    ('Documentary', documentary_list),
    ('Comedy', comedy_list),
    ('Animation', animation_list),
)

for movie_id, movie_title, movie_genre in movies.itertuples(index=False):
    movie_dict[movie_id] = (movie_title, movie_genre)
    # A movie can carry several genres, so it may land in more than one list.
    for genre_keyword, bucket in genre_buckets:
        if genre_keyword in movie_genre:
            bucket.append(movie_id)

In [6]:
ratings_raw = pd.read_csv("/content/drive/MyDrive/boostcamp/data/graph/others/ratings.csv")

# Show the rating dataset as loaded, then again without the timestamp column,
# which is not used by the rest of the notebook.
print("### Rating Dataset Format ###", end='\n\n')
print(ratings_raw.head(), end='\n\n\n')
ratings = ratings_raw.drop(['timestamp'], axis=1)
print(ratings.head())

### Rating Dataset Format ###

   userId  movieId  rating  timestamp
0       1        1     4.0  964982703
1       1        3     4.0  964981247
2       1        6     4.0  964982224
3       1       47     5.0  964983815
4       1       50     5.0  964982931


   userId  movieId  rating
0       1        1     4.0
1       1        3     4.0
2       1        6     4.0
3       1       47     5.0
4       1       50     5.0


In [9]:
# Users in the dataset rated roughly 130 movies on average, so each synthetic
# user added below rates about that many movies as well.
# The movie order is shuffled and a liked movie does not always get 5 points:
# the rating is drawn at random from a small band (e.g. 4 or 5). Low ratings
# are randomised the same way.
#
# Add a user with uid == 800. This user is a devoted fan of musicals.

random.shuffle(musical_list)
random.shuffle(horror_list)
random.shuffle(documentary_list)
random.shuffle(comedy_list)

print(ratings)
new_uid = 800
rows = list()
for movie_id in musical_list[:100]:
    rows.append([new_uid, movie_id, random.randint(4,5)])      # loves musicals
for movie_id in horror_list[:50]:
    rows.append([new_uid, movie_id, random.randint(1,2)])      # dislikes horror
for movie_id in documentary_list[:20]:
    rows.append([new_uid, movie_id, random.randint(2,3)])      # lukewarm on documentaries

# Append all new rows with a single concat. DataFrame.append was removed in
# pandas 2.0, and appending row by row copies the whole frame on every step.
if rows:
    ratings = pd.concat([ratings, pd.DataFrame(rows, columns=ratings.columns)],
                        ignore_index=True)
print(ratings)


        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
100831     610   166534     4.0
100832     610   168248     5.0
100833     610   168250     5.0
100834     610   168252     5.0
100835     610   170875     3.0

[100836 rows x 3 columns]
        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101001     800    27850     2.0
101002     800   158842     2.0
101003     800     1123     3.0
101004     800     4237     3.0
101005     800   102852     2.0

[101006 rows x 3 columns]


In [10]:
# Add a user with uid == 850. This user is a devoted fan of documentaries.

random.shuffle(musical_list)
random.shuffle(horror_list)
random.shuffle(documentary_list)
random.shuffle(comedy_list)
random.shuffle(animation_list)


print(ratings)
new_uid = 850
rows = list()
for movie_id in documentary_list[:100]:
    rows.append([new_uid, movie_id, random.randint(4,5)])      # loves documentaries
for movie_id in horror_list[:50]:
    rows.append([new_uid, movie_id, random.randint(1,2)])      # dislikes horror
for movie_id in animation_list[:10]:
    rows.append([new_uid, movie_id, random.randint(1,2)])      # dislikes animation

# Append all new rows with a single concat. DataFrame.append was removed in
# pandas 2.0, and appending row by row copies the whole frame on every step.
if rows:
    ratings = pd.concat([ratings, pd.DataFrame(rows, columns=ratings.columns)],
                        ignore_index=True)
print(ratings)

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101001     800    27850     2.0
101002     800   158842     2.0
101003     800     1123     3.0
101004     800     4237     3.0
101005     800   102852     2.0

[101006 rows x 3 columns]
        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101161     850   157296     1.0
101162     850     6857     1.0
101163     850     5159     2.0
101164     850    74791     2.0
101165     850   162578     2.0

[101166 rows x 3 columns]


In [11]:
# Add a user with uid == 900. This user is a devoted fan of horror movies.

random.shuffle(musical_list)
random.shuffle(horror_list)
random.shuffle(documentary_list)
random.shuffle(comedy_list)
random.shuffle(animation_list)

new_uid = 900
rows = list()
for movie_id in horror_list[:120]:
    rows.append([new_uid, movie_id, random.randint(4,5)])      # loves horror
for movie_id in documentary_list[:10]:
    rows.append([new_uid, movie_id, random.randint(1,2)])      # dislikes documentaries
for movie_id in comedy_list[:30]:
    rows.append([new_uid, movie_id, random.randint(1,2)])      # dislikes comedy
for movie_id in animation_list[:20]:
    rows.append([new_uid, movie_id, random.randint(1,2)])      # dislikes animation

# Append all new rows with a single concat. DataFrame.append was removed in
# pandas 2.0, and appending row by row copies the whole frame on every step.
if rows:
    ratings = pd.concat([ratings, pd.DataFrame(rows, columns=ratings.columns)],
                        ignore_index=True)
print(ratings)

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101341     900     3725     1.0
101342     900   143245     1.0
101343     900    62336     1.0
101344     900    46337     1.0
101345     900   128991     1.0

[101346 rows x 3 columns]


In [12]:
# Add a user with uid == 950. This user is a devoted fan of animation.

random.shuffle(musical_list)
random.shuffle(horror_list)
random.shuffle(documentary_list)
random.shuffle(comedy_list)
random.shuffle(animation_list)

new_uid = 950
rows = list()
for movie_id in horror_list[:20]:
    rows.append([new_uid, movie_id, random.randint(1,2)])      # dislikes horror
for movie_id in documentary_list[:10]:
    rows.append([new_uid, movie_id, random.randint(1,3)])      # mostly dislikes documentaries
for movie_id in comedy_list[:30]:
    rows.append([new_uid, movie_id, random.randint(2,4)])      # neutral on comedy
for movie_id in animation_list[:150]:
    rows.append([new_uid, movie_id, random.randint(3,5)])      # likes animation

# Append all new rows with a single concat. DataFrame.append was removed in
# pandas 2.0, and appending row by row copies the whole frame on every step.
if rows:
    ratings = pd.concat([ratings, pd.DataFrame(rows, columns=ratings.columns)],
                        ignore_index=True)
print(ratings)

        userId  movieId  rating
0            1        1     4.0
1            1        3     4.0
2            1        6     4.0
3            1       47     5.0
4            1       50     5.0
...        ...      ...     ...
101551     950   143245     5.0
101552     950   163639     5.0
101553     950   167746     3.0
101554     950   104245     5.0
101555     950    96281     4.0

[101556 rows x 3 columns]


In [13]:
########################################################################
######### Mapping user id, movie id to user index, movie index #########
########################################################################

uid_2_idx = {}    # user id  --> user idx
mid_2_idx = {}    # movie id --> movie idx

# Ids receive consecutive indices in order of first appearance: an unseen id
# is mapped to the current size of its dictionary, a known id is left alone.
for user_id, movie_id, _rating in ratings.itertuples(index=False):
    uid_2_idx.setdefault(user_id, len(uid_2_idx))
    mid_2_idx.setdefault(movie_id, len(mid_2_idx))

num_user, num_movie = len(uid_2_idx), len(mid_2_idx)
print(num_user) # number of users
print(num_movie) # number of movies

# Inverse lookups.
uidx_2_id = dict(zip(uid_2_idx.values(), uid_2_idx.keys()))    # user idx  --> user id
midx_2_id = dict(zip(mid_2_idx.values(), mid_2_idx.keys()))    # movie idx --> movie id


614
9725


In [14]:
# Initialise the rating matrix: a (num_user x num_movie) numpy array in which
# entry [u, m] is the rating user u gave movie m; entries never written stay 0.
rating_matrix = np.zeros((num_user, num_movie))

for record in ratings.itertuples(index=False):
    user_id, movie_id, rating_value = record
    # If the same (user, movie) pair occurs more than once, the last row wins.
    rating_matrix[uid_2_idx[user_id], mid_2_idx[movie_id]] = rating_value

rating_matrix

array([[4., 4., 4., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 4., 0.],
       [5., 0., 0., ..., 0., 0., 0.]])

In [15]:
######################################################################################################################################
# Split a rating matrix into a training set and a test set
######################################################################################################################################
def train_test_split(ratings, seed=None):
    """Hold out one fifth of every user's ratings as a test set.

    Args:
        ratings: 2-D numpy array (users x items); 0 means "no rating".
        seed: optional integer. When given, a dedicated
            ``np.random.RandomState(seed)`` is used so the split is
            reproducible. When ``None`` (default) the global ``np.random``
            state is used, as before.

    Returns:
        (train, test): two arrays shaped like ``ratings``. For each row,
        ``int(k / 5)`` of its ``k`` non-zero entries are moved to ``test`` and
        zeroed in ``train``; everything else stays in ``train``.
    """
    rng = np.random if seed is None else np.random.RandomState(seed)

    test = np.zeros_like(ratings)
    train = ratings.copy()
    for x in range(ratings.shape[0]):
        nonzero_idx = ratings[x, :].nonzero()[0]
        n_test = int(len(nonzero_idx) / 5)
        if n_test == 0:
            # Fewer than 5 ratings (possibly none): nothing to hold out, and
            # sampling from an empty row is avoided altogether.
            continue
        held_out = rng.choice(nonzero_idx, size=n_test, replace=False)
        train[x, held_out] = 0.
        test[x, held_out] = ratings[x, held_out]

    # Verify that the train and test sets are completely disjoint.
    assert np.all((train * test) == 0)

    return train, test

######################################################################################
###################### Subtract each user's mean from their ratings ##################
######################################################################################
####### Only existing ratings enter the mean, and only existing ratings are shifted ##
def subtract_mean_ratings(ratings):
    """Mean-centre every user's ratings.

    Args:
        ratings: 2-D numpy array (users x items); 0 means "no rating".

    Returns:
        (mean_subtracted_ratings, avg_ratings):
        * ``mean_subtracted_ratings`` has the same shape as ``ratings``; each
          non-zero entry has its row's mean subtracted, missing entries stay 0.
        * ``avg_ratings`` is a 1-D float array holding each row's mean over its
          non-zero entries (0 for a row without ratings).

    Note:
        A rating exactly equal to the user's mean becomes 0 here, which is
        indistinguishable from "no rating" for code that treats 0 as missing.
    """
    mean_subtracted_ratings = np.zeros_like(ratings)
    avg_ratings = np.zeros(ratings.shape[0])
    for i in range(ratings.shape[0]):
        nonzero_idx = ratings[i].nonzero()[0]       # positions that hold a rating
        num_nonzero = len(nonzero_idx)
        if num_nonzero == 0:
            # No ratings: the mean stays 0 and the row stays all zeros. Checking
            # first avoids the 0/0 division (and RuntimeWarning) of dividing
            # before testing the count.
            continue
        avg_rating = np.sum(ratings[i]) / num_nonzero
        avg_ratings[i] = avg_rating
        mean_subtracted_ratings[i, nonzero_idx] = ratings[i, nonzero_idx] - avg_rating
    return mean_subtracted_ratings, avg_ratings


In [16]:
# Hold out part of each user's ratings for testing, then mean-centre the
# remaining (training) ratings per user; avg_ratings keeps the per-user means.
train_ratings, test_ratings = train_test_split(rating_matrix)
mean_subtracted_ratings, avg_ratings = subtract_mean_ratings(train_ratings)

![image](https://user-images.githubusercontent.com/48677363/109418269-d9f66f80-7a0a-11eb-844f-a46225867401.png)


![image](https://user-images.githubusercontent.com/48677363/109418422-a9630580-7a0b-11eb-8193-3554c968d3d2.png)


In [33]:
def calculate_rmse(R, U, V, lambda_u, lambda_v):
    """Return the square root of the regularised squared error per stored rating.

    The accumulated quantity is the sum of squared residuals over the entries
    of ``R``, plus ``lambda_u`` times the squared norm of every row of ``U``,
    plus ``lambda_v`` times the squared norm of every column of ``V``. That
    total is divided by the number of stored ratings before the square root.

    Args:
        R: sparse rating matrix exposing ``row``, ``col`` and ``data``.
        U: user factor matrix, indexed as ``U[u]``.
        V: item factor matrix, indexed as ``V[:, i]``.
        lambda_u: regularisation weight for ``U``.
        lambda_v: regularisation weight for ``V``.
    """
    total = 0

    # Squared residual of every stored rating.
    for user, item, rating in zip(R.row, R.col, R.data):
        residual = rating - np.dot(U[user], V[:, item])
        total += residual ** 2

    # Penalty on the user factors (rows of U).
    for user_vec in U:
        total += lambda_u * (norm(user_vec) ** 2)

    # Penalty on the item factors (columns of V).
    for item_vec in V.T:
        total += lambda_v * (norm(item_vec) ** 2)

    return np.sqrt(total / len(R.data))

def SGD(U, V, R, lr, lambda_u, lambda_v):
    """Run one pass of stochastic gradient descent over the given ratings.

    Args:
        U: user factor matrix; row ``U[u]`` is updated in place.
        V: item factor matrix; column ``V[:, i]`` is updated in place.
        R: iterable of ``(u, i, r_ui)`` triples.
        lr: learning rate.
        lambda_u: regularisation weight for ``U``.
        lambda_v: regularisation weight for ``V``.

    Returns:
        The same ``U`` and ``V`` objects that were passed in, after the updates.
    """
    for user, item, rating in R:
        residual = rating - np.dot(U[user], V[:, item])

        # Both gradients are evaluated from the current factors before either
        # factor is modified.
        grad_user = 2 * residual * (-V[:, item]) + 2 * lambda_u * U[user]
        grad_item = 2 * residual * (-U[user, :]) + 2 * lambda_v * V[:, item]

        U[user] = U[user] - lr * grad_user
        V[:, item] = V[:, item] - lr * grad_item

    return U, V


def train(ratings, dim=10, max_epoch=50, lambda_u=0.1, lambda_v=0.1, lr=0.01):
    """Factorise ``ratings`` into ``U`` and ``V`` with SGD, printing progress.

    Args:
        ratings: 2-D rating array (users x items); it is converted with
            ``coo_matrix`` and only the entries stored there are trained on.
        dim: number of latent dimensions.
        max_epoch: number of passes over the ratings.
        lambda_u: regularization coefficient of the U matrix.
        lambda_v: regularization coefficient of the V matrix.
        lr: initial learning rate; halved at epochs 10, 20, 30, ...

    Returns:
        (U, V) with shapes ``(num_users, dim)`` and ``(dim, num_items)``.
    """
    n_users, n_items = ratings.shape

    # Factors start from uniform random values in [0, 1).
    U = np.random.rand(n_users, dim)
    V = np.random.rand(dim, n_items)
    R = coo_matrix(ratings)
    print("Initial RMSE: " + str(calculate_rmse(R, U, V, lambda_u, lambda_v)))

    for epoch in range(max_epoch):
        # Step decay of the learning rate every 10 epochs (not at epoch 0).
        if epoch > 0 and epoch % 10 == 0:
            lr = lr / 2

        # Visit the ratings in a fresh random order every epoch.
        observed = list(zip(R.row, R.col, R.data))
        random.shuffle(observed)

        U, V = SGD(U, V, observed, lr, lambda_u, lambda_v)
        rmse = calculate_rmse(R, U, V, lambda_u, lambda_v)
        print('Epoch: {:5}, RMSE: {:15}, Learning Rate:{}'.format(epoch, rmse, lr))

    return U, V

def predict(U, V, user_id=None):
    """Return predicted scores from the factor matrices.

    Without ``user_id`` the full prediction matrix ``U . V`` is returned.
    With ``user_id`` only that user's prediction vector is returned; the id is
    translated to a row of ``U`` through the module-level ``uid_2_idx``.
    """
    if user_id is not None:
        return np.dot(U[uid_2_idx[user_id]], V )
    return np.dot(U, V)


#######################################################################################
##### Recommend N movies to the user with user id = uid, skipping movies already seen ##
#######################################################################################
def recommend_for_uid(uid, ori_rating_matrix, U, V, top_N):
    """Print the ``top_N`` highest-scored movies the user has not rated yet.

    Args:
        uid: user id (not a matrix row index); mapped through ``uid_2_idx``.
        ori_rating_matrix: rating matrix whose non-zero entries in the user's
            row mark movies the user has already rated.
        U, V: factor matrices passed on to ``predict``.
        top_N: number of movies to print.

    Each printed line is the ``movie_dict`` entry of the movie followed by the
    score returned by ``predict`` for it. The score is printed as-is; no
    per-user mean is added here.
    """
    u_predicted_rating = predict(U, V, user_id=uid)
    uidx = uid_2_idx[uid]

    # Rank every movie index by predicted score, highest first. The sort is
    # stable, so ties keep their original index order.
    ranked = sorted(enumerate(u_predicted_rating), key=operator.itemgetter(1), reverse=True)

    # Indices of movies the user has already rated. A set makes the membership
    # test below O(1) instead of a linear scan of a numpy array per candidate.
    already_seen_movie_idxs = set(np.nonzero(ori_rating_matrix[uidx])[0].tolist())

    print_cnt = 0
    for idx, pred_rating in ranked:
        if print_cnt == top_N:
            break
        if idx not in already_seen_movie_idxs:
            print(str(movie_dict[midx_2_id[idx]])+"    "+str(pred_rating))
            print_cnt = print_cnt+1


In [34]:
# Fit the latent factors on the mean-centred training ratings, using train()'s default hyper-parameters.
U, V = train(mean_subtracted_ratings)

Initial RMSE: 2.7895811373480206
Epoch:     0, RMSE: 0.9780018000324875, Learning Rate:0.01
Epoch:     1, RMSE: 0.9545814739430523, Learning Rate:0.01
Epoch:     2, RMSE: 0.9437708586728852, Learning Rate:0.01
Epoch:     3, RMSE: 0.9353692191382806, Learning Rate:0.01
Epoch:     4, RMSE: 0.9274391133529494, Learning Rate:0.01
Epoch:     5, RMSE: 0.9166994187750706, Learning Rate:0.01
Epoch:     6, RMSE: 0.899381521598756, Learning Rate:0.01
Epoch:     7, RMSE: 0.8817504672331042, Learning Rate:0.01
Epoch:     8, RMSE: 0.8628940873411419, Learning Rate:0.01
Epoch:     9, RMSE: 0.8474557624154331, Learning Rate:0.01
Epoch:    10, RMSE: 0.8374987883785165, Learning Rate:0.005
Epoch:    11, RMSE: 0.830069610156552, Learning Rate:0.005
Epoch:    12, RMSE: 0.8236317109533862, Learning Rate:0.005
Epoch:    13, RMSE: 0.8169700634159396, Learning Rate:0.005
Epoch:    14, RMSE: 0.8113189978569127, Learning Rate:0.005
Epoch:    15, RMSE: 0.8054457580335065, Learning Rate:0.005
Epoch:    16, RMSE:

In [35]:
# Full prediction matrix. The model was fitted on mean-centred ratings, so each
# user's mean is added back to that user's row.
predicted_ratings = predict(U, V) + avg_ratings.reshape([-1,1])
print(predicted_ratings)

[[4.53608618 4.24901643 4.55389726 ... 4.28559529 4.50491747 4.2098708 ]
 [3.73005748 3.92828458 4.12524144 ... 4.49845689 4.7179534  4.40854497]
 [1.35328565 2.66813652 1.20038201 ... 3.05075427 2.96624938 2.04590862]
 ...
 [3.13169854 3.10711626 3.25588901 ... 3.08812031 3.00304178 3.48415122]
 [2.75373305 3.94377944 3.40657115 ... 3.54152527 3.85110451 3.80106804]
 [3.40407582 3.40175766 3.68259398 ... 3.37618886 3.91010663 4.02098585]]


In [36]:
# Pick a target user and summarise the genres of the movies that user rated.
# This helper exists so the recommendation output can be interpreted against
# the user's actual taste.
def print_user_preference(ori_rating_matrix, u_idx, type):
    '''
        Print per-genre statistics of the movies rated by one user.

        ori_rating_matrix : rating matrix; non-zero entries are rated movies
        u_idx : row index of the user in the matrix (not the user id)
        type : ['avg', 'sum', 'cnt']
            'avg' sorts by average rating, 'sum' by the sum of ratings,
            'cnt' by the number of rated movies. Any other value sorts by
            average rating and prints genre, average, count and sum.
    '''
    def ranked(stats):
        # (genre, value) pairs, largest value first.
        return sorted(stats.items(), key=operator.itemgetter(1), reverse=True)

    genre_count = dict()
    genre_rating_sum = dict()
    for movie_idx in np.nonzero(ori_rating_matrix[u_idx])[0]:
        _, genre = movie_dict[midx_2_id[movie_idx]]
        # A movie contributes to every genre listed in its '|'-separated string.
        for g in genre.split("|"):
            genre_count[g] = genre_count.get(g, 0) + 1
            genre_rating_sum[g] = genre_rating_sum.get(g, 0) + ori_rating_matrix[u_idx][movie_idx]

    # Ordered by how many movies of the genre were rated.
    if type=='cnt':
        for k,v in ranked(genre_count):
            print(k,v)
        return

    # Ordered by the sum of the ratings.
    if type=='sum':
        for k,v in ranked(genre_rating_sum):
            print(k,v)
        return

    # Every remaining mode is ordered by the average rating per genre.
    genre_avg_rating = dict()
    for k,v in genre_rating_sum.items():
        genre_avg_rating[k] = v/genre_count[k]

    if type=="avg":
        for k,v in ranked(genre_avg_rating):
            print(k,v)
    else:
        for k,v in ranked(genre_avg_rating):
            print(k,v,genre_count[k],genre_rating_sum[k])

In [37]:
##### Check the RMSE of the test set ####### 
held_out = test_ratings.nonzero()        # positions of the held-out ratings
pred = predicted_ratings[held_out].flatten()
actual = test_ratings[held_out].flatten()

print("### Test RMSE ###")
print(np.sqrt(mean_squared_error(pred,actual)))

### Test RMSE ###
0.8717832142841332


In [38]:
# User ids of the synthetic genre fans that were appended to `ratings` earlier.
musical_fan = 800
docu_fan = 850
horror_fan = 900
animation_fan =  950

In [39]:
# To see the genre distribution of the movies this user rated, run print_user_preference!
# Its second argument is a row index of the rating matrix, so map the user id through uid_2_idx first.
# print_user_preference(rating_matrix, uid_2_idx[musical_fan], 'sum')
recommend_for_uid(musical_fan, rating_matrix, U, V, top_N=30)

('The Artist (2011)', 'Comedy|Drama|Romance')    1.8313238591338274
("Doug's 1st Movie (1999)", 'Animation|Children')    1.7009352861018563
('Death Proof (2007)', 'Action|Adventure|Crime|Horror|Thriller')    1.6842807141551401
('Pearl Jam Twenty (2011)', 'Documentary|Musical')    1.6683720240907736
('Fifty Shades of Grey (2015)', 'Drama|Romance')    1.659167002096692
('Follow Me, Boys! (1966)', 'Comedy|Drama')    1.5314294138544347
('There Once Was a Dog (1982)', 'Animation|Children|Comedy')    1.5209660603888113
("Ivan's Childhood (a.k.a. My Name is Ivan) (Ivanovo detstvo) (1962)", 'Drama|War')    1.5190116491273833
('Last Waltz, The (1978)', 'Documentary')    1.5152458554975428
('Wrong Turn (2003)', 'Horror|Thriller')    1.5145034111930369
('Iron Man (1931)', 'Drama')    1.5092918263409656
('22 Jump Street (2014)', 'Action|Comedy|Crime')    1.4438146420619065
('Jim Jefferies: Freedumb (2016)', 'Comedy')    1.443638808981635
('Year of the Dragon (1985)', 'Action|Crime|Drama')    1.418

In [40]:
# To see the genre distribution of the movies this user rated, run print_user_preference!
# Its second argument is a row index of the rating matrix, so map the user id through uid_2_idx first.
# print_user_preference(rating_matrix, uid_2_idx[docu_fan], 'sum')
recommend_for_uid(docu_fan, rating_matrix, U, V, top_N=30)

('Peeping Tom (1960)', 'Drama|Horror|Thriller')    2.1946486495721715
('Tale of Two Sisters, A (Janghwa, Hongryeon) (2003)', 'Drama|Horror|Mystery|Thriller')    1.868048483557578
('I Walked with a Zombie (1943)', 'Drama|Horror')    1.8302623676622958
('Thing, The (1982)', 'Action|Horror|Sci-Fi|Thriller')    1.8152167874988345
("It's a Very Merry Muppet Christmas Movie (2002)", 'Children|Comedy')    1.8114983729179226
('Phantom, The (1996)', 'Action|Adventure')    1.8093458411014427
('Fifty Shades Darker (2017)', 'Drama|Romance')    1.8031369637974828
('Opera (1987)', 'Crime|Horror|Mystery')    1.75828476069098
('Witchfinder General (Conquerer Worm, The) (1968)', 'Horror')    1.7068386531075426
('Kwaidan (Kaidan) (1964)', 'Horror')    1.6849416188562716
('Babe: Pig in the City (1998)', 'Adventure|Children|Drama')    1.6599706318928873
('Cook the Thief His Wife & Her Lover, The (1989)', 'Comedy|Drama')    1.6522565511810838
("Dead Man's Shoes (2004)", 'Crime|Thriller')    1.6043907284328

In [41]:
# To see the genre distribution of the movies this user rated, run print_user_preference!
# Its second argument is a row index of the rating matrix, so map the user id through uid_2_idx first.
# print_user_preference(rating_matrix, uid_2_idx[horror_fan], 'sum')
recommend_for_uid(horror_fan, rating_matrix, U, V, top_N=30)

('Cable Guy, The (1996)', 'Comedy|Thriller')    2.0305647921061385
('Meatballs III (1987)', 'Comedy')    1.868982882305965
('Jerky Boys, The (1995)', 'Comedy')    1.8316649245057004
("Twelve O'Clock High (1949)", 'Drama|War')    1.804374381231377
('Bad Lieutenant (1992)', 'Crime|Drama')    1.7933511811818719
('Popstar: Never Stop Never Stopping (2016)', 'Comedy')    1.7926306580329427
('Kill Command (2016)', 'Action|Horror|Sci-Fi')    1.792007762643156
('Scream (1996)', 'Comedy|Horror|Mystery|Thriller')    1.7205681238196497
('Cook the Thief His Wife & Her Lover, The (1989)', 'Comedy|Drama')    1.670964887547751
('Dragon Ball Z: Dead Zone (Doragon bôru Z 1: Ora no Gohan wo kaese) (1989)', 'Action|Adventure|Animation|Fantasy|Sci-Fi')    1.6492066375742778
('Hellbenders (2012)', 'Comedy|Horror|Thriller')    1.6459067194056693
('Bats (1999)', 'Horror|Thriller')    1.5941628248321371
('Star Wars: Episode I - The Phantom Menace (1999)', 'Action|Adventure|Sci-Fi')    1.5926681147132626
("Orc

In [42]:
# To see the genre distribution of the movies this user rated, run print_user_preference!
# Its second argument is a row index of the rating matrix, so map the user id through uid_2_idx first.
# print_user_preference(rating_matrix, uid_2_idx[animation_fan], 'sum')
recommend_for_uid(animation_fan, rating_matrix, U, V, top_N=30)

('Heidi Fleiss: Hollywood Madam (1995)', 'Documentary')    1.307743948604379
('Panic Room (2002)', 'Thriller')    1.200012948620719
('84 Charing Cross Road (1987)', 'Drama|Romance')    1.168331773713939
('Shaolin Temple (Shao Lin si) (1976)', 'Action|Adventure|Drama')    1.143231795071151
('Producers, The (1968)', 'Comedy')    1.1363483607156841
('Pure Formality, A (Pura formalità, Una) (1994)', 'Crime|Film-Noir|Mystery|Thriller')    1.0880611540408858
('Yogi Bear (2010)', 'Children|Comedy')    1.0850159975358808
('Meteor (1979)', 'Sci-Fi')    1.0768171240719828
('Madhouse (1990)', 'Comedy')    1.0708116122249656
('Raven, The (2012)', 'Mystery|Thriller')    1.0647061106232663
("Garfield's Pet Force (2009)", 'Animation')    1.0635061563495665
('The End of the Tour (2015)', 'Drama')    1.0605086579553769
('Out-of-Towners, The (1999)', 'Comedy')    1.0395255993019952
('Other Woman, The (2014)', 'Comedy|Romance')    1.0392230323296372
('Circle of Friends (1995)', 'Drama|Romance')    1.0296

# Test Code

In [43]:
################ Generating Synthetic Data #######################
# Small 5-user x 10-item rating matrix used as a fixture for checking SGD and calculate_rmse.
synthetic_rating = np.zeros((5,10))

for i in range(10):
    # Re-seeding with the loop index makes the (user, item, rating) triple drawn
    # in this iteration a deterministic function of i.
    random.seed(i)
    u_idx = random.randint(0,4)
    i_idx = random.randint(0,9)
    r_ui = random.randint(1,5)
    # If two iterations draw the same (u_idx, i_idx) cell, the later one overwrites the earlier.
    synthetic_rating[u_idx ,i_idx] = r_ui

# The same ratings as a COO sparse matrix and as a list of (row, col, value) triples.
synthetic_R = coo_matrix(synthetic_rating)
synthetic_R_zipped = list(zip(synthetic_R.row, synthetic_R.col, synthetic_R.data))
# Fixed numpy seed so the initial factor matrices (3 latent dimensions) are reproducible.
np.random.seed(7)
synthetic_U = np.random.rand(5, 3)
synthetic_V = np.random.rand(3, 10)

In [44]:
# Reference results that test_code compares against: expected U and V after
# one SGD pass on the synthetic data, and the expected calculate_rmse value.
# NOTE(review): presumably produced by a reference implementation — confirm the provenance of these files.
answer_U = np.load('/content/drive/MyDrive/boostcamp/data/graph/others/answer_U.npy')
answer_V = np.load('/content/drive/MyDrive/boostcamp/data/graph/others/answer_V.npy')
answer_rmse = np.load('/content/drive/MyDrive/boostcamp/data/graph/others/answer_rmse.npy')

In [45]:
def test_code(sgd, rmse):
    """Check an SGD and an RMSE implementation against the stored reference answers.

    Args:
        sgd: callable with the signature of ``SGD``; it is run once over the
            synthetic ratings starting from ``synthetic_U`` / ``synthetic_V``.
        rmse: callable with the signature of ``calculate_rmse``; it is
            evaluated on the factors returned by ``sgd``.

    Raises:
        AssertionError: if the resulting U or V, or the RMSE value, is not
            within tolerance of ``answer_U`` / ``answer_V`` / ``answer_rmse``.
    """
    # SGD may update its inputs in place, so hand it copies: the fixtures stay
    # untouched and calling this function again gives the same verdict.
    U, V = sgd(synthetic_U.copy(), synthetic_V.copy(), synthetic_R_zipped,
               lr=0.01, lambda_u=0.1, lambda_v=0.1)
    mse_u = mean_squared_error(U, answer_U)
    mse_v = mean_squared_error(V, answer_V)
    # Both factor matrices must match the reference; fail unless each is within tolerance.
    assert (mse_u < 1e-2 and mse_v < 1e-2), 'calculated U, V is different with the answer : SGD 함수 오류'

    # Use the implementation under test that was passed in, rather than always
    # calling the module-level calculate_rmse.
    rmse_value = rmse(synthetic_R, U, V, lambda_u=0.1, lambda_v=0.1)
    # Fail unless the value is within tolerance of the reference answer.
    assert (abs(answer_rmse - rmse_value) < 1e-05), 'calculated rmse is diferent with the answer : RMSE 함수 오류'

    print("모든 함수 알맞게 구현됨")


In [46]:
# Validate the SGD and calculate_rmse implementations against the stored reference answers.
test_code(SGD, calculate_rmse)

모든 함수 알맞게 구현됨
