In [1]:
import os

import pandas as pd
import numpy as np

from tqdm import tqdm

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

### 데이터 가져오기

In [3]:
os.listdir('./data/ml-latest-small/')

['links.csv', 'tags.csv', 'ratings.csv', 'README.txt', 'movies.csv']

In [4]:
data_path = './data/ml-latest-small/'

rating_df = pd.read_csv(os.path.join(data_path, 'ratings.csv'), encoding='utf-8')

movie_df = pd.read_csv(os.path.join(data_path, 'movies.csv'), encoding='utf-8')
movie_df.set_index('movieId', inplace=True)

tag_df = pd.read_csv(os.path.join(data_path, 'tags.csv'), encoding='utf-8')

print(rating_df.shape)
print(movie_df.shape)
print(tag_df.shape)

(100836, 4)
(9742, 2)
(3683, 4)


### Genre를 이용해 movie representation

In [8]:
movie_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [18]:
total_count = len(movie_df.index)
total_genre = list(set([genre for mlist in movie_df['genres'] for genre in mlist.split('|')]))

print(total_count)
print(total_genre, '\n장르 개수:', len(total_genre))

9742
['Romance', 'Comedy', 'Animation', 'Thriller', 'Adventure', 'IMAX', 'Film-Noir', 'Mystery', 'Crime', 'Documentary', 'Fantasy', 'Drama', 'Action', 'Sci-Fi', 'Horror', '(no genres listed)', 'Children', 'War', 'Musical', 'Western'] 
장르 개수: 20


In [34]:
## 장르의 빈도수

genre_count = dict.fromkeys(total_genre)

for i in movie_df['genres']:
    glist = i.split('|')

    for j in glist:
        if genre_count[j] == None:
            genre_count[j] = 1
        else:
            genre_count[j] += 1

genre_count


{'Romance': 1596,
 'Comedy': 3756,
 'Animation': 611,
 'Thriller': 1894,
 'Adventure': 1263,
 'IMAX': 158,
 'Film-Noir': 87,
 'Mystery': 573,
 'Crime': 1199,
 'Documentary': 440,
 'Fantasy': 779,
 'Drama': 4361,
 'Action': 1828,
 'Sci-Fi': 980,
 'Horror': 978,
 '(no genres listed)': 34,
 'Children': 664,
 'War': 382,
 'Musical': 334,
 'Western': 167}

In [36]:
## 전체 빈도수 나누기 장르 빈도수 -> 역수 -> log (IDF)

for genre in genre_count:
    genre_count[genre] = np.log10(total_count / genre_count[genre])

genre_count

{'Romance': 0.7856152382210405,
 'Comedy': 0.4139225416416778,
 'Animation': 1.2026069149931968,
 'Thriller': 0.7112681505684965,
 'Adventure': 0.8872447746804204,
 'IMAX': 1.7899910382813284,
 'Film-Noir': 2.0491288726171324,
 'Mystery': 1.2304935032683613,
 'Crime': 0.9098289421369025,
 'Documentary': 1.3451954487495636,
 'Fantasy': 1.0971106675631865,
 'Drama': 0.3490620385623247,
 'Action': 0.7266719338379385,
 'Sci-Fi': 0.9974220495432563,
 'Horror': 0.9983092704481497,
 '(no genres listed)': 2.457169208193496,
 'Children': 1.1664800458677336,
 'War': 1.4065847623240424,
 'Musical': 1.4649016584241867,
 'Western': 1.7659316540881678}

In [38]:
genre_representation = pd.DataFrame(columns = sorted(total_genre), index = movie_df.index)

for idx, row in tqdm(movie_df.iterrows()):
    dict_temp = {i : genre_count[i] for i in row['genres'].split('|')}
    row_to_add = pd.DataFrame(dict_temp, index=[idx])
    genre_representation.update(row_to_add)

genre_representation

9742it [00:13, 722.04it/s]


Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.202607,1.16648,0.413923,,,,1.097111,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.097111,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,0.726672,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193583,,,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193585,,,,,,,,,0.349062,,,,,,,,,,,
193587,,0.726672,,1.202607,,,,,,,,,,,,,,,,


### Tag를 이용해 Movie Representation

In [39]:
tag_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [48]:
tag_column = list(map(lambda x: x.split(','), tag_df['tag']))
tag_set = set([tag.strip() for sub_tag in tag_column for tag in sub_tag])

print(len(tag_column))
print(len(tag_set))

3683
1589


In [62]:
# IDF 계산

total_movie_count = len(set(tag_df['movieId']))

tag_count_dict = dict().fromkeys(tag_set)

for tags in tag_df['tag']:
    for tag in tags.split(','):
        if tag_count_dict[tag] == None:
            tag_count_dict[tag] = 1
        else:
            tag_count_dict[tag] += 1


tag_idf = dict()
for tag in tag_count_dict:
    tag_idf[tag] = np.log10(total_movie_count / tag_count_dict[tag])

print(list(tag_idf.items())[:3])
print(len(list(tag_idf.items())))

[('ridiculous', 2.895422546039408), ('courtroom drama', 2.895422546039408), ('melancholic', 3.196452541703389)]
1589


In [67]:
tag_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [96]:
list(tag_idf.items())[:10]

[('ridiculous', 2.895422546039408),
 ('courtroom drama', 2.895422546039408),
 ('melancholic', 3.196452541703389),
 ('brutality', 2.7193312869837265),
 ('threesome', 3.196452541703389),
 ('Quakers', 3.196452541703389),
 ('Hawkeye', 3.196452541703389),
 ('bittersweet', 2.2422100322640643),
 ('old', 3.196452541703389),
 ('E. M. Forster', 3.196452541703389)]

In [127]:
tag_representation = pd.DataFrame(columns = tag_set, index = sorted(list(set(tag_df['movieId']))))

for name, group in tqdm(tag_df.groupby('movieId')):
    temp_list = list(map(lambda x: x.split(','), group['tag']))
    temp_tag_list = list(set(map(lambda x: x.strip(','), list([tag for tags in temp_list for tag in tags]))))

    dict_temp = {i: tag_idf[i.strip()] for i in temp_tag_list}
    row_to_add = pd.DataFrame(dict_temp, index = [group['movieId'].values[0]])
    tag_representation.update(row_to_add)

tag_representation

100%|██████████| 1572/1572 [01:47<00:00, 14.69it/s]


Unnamed: 0,ridiculous,courtroom drama,melancholic,brutality,threesome,Quakers,Hawkeye,bittersweet,old,E. M. Forster,...,humorous,Mexico,American propaganda,jack nicholson,short films,Gal Gadot,art,gore,psychological thriller,Shakespeare sort of
1,,,,,,,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,3.196453,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183611,,,,,,,,,,,...,,,,,,,,,,
184471,,,,,,,,,,,...,,,,,,,,,,
187593,,,,,,,,,,,...,,,,,,,,,,
187595,,,,,,,,,,,...,,,,,,,,,,


In [148]:
print(genre_representation.shape)
print(tag_representation.shape)

(9742, 20)
(1572, 1589)


### genre representation + tag representation

In [149]:
genre_representation

Unnamed: 0_level_0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,,,0.887245,1.202607,1.16648,0.413923,,,,1.097111,,,,,,,,,,
2,,,0.887245,,1.16648,,,,,1.097111,,,,,,,,,,
3,,,,,,0.413923,,,,,,,,,,0.785615,,,,
4,,,,,,0.413923,,,0.349062,,,,,,,0.785615,,,,
5,,,,,,0.413923,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,0.726672,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193583,,,,1.202607,,0.413923,,,,1.097111,,,,,,,,,,
193585,,,,,,,,,0.349062,,,,,,,,,,,
193587,,0.726672,,1.202607,,,,,,,,,,,,,,,,


In [153]:
tag_representation.loc[2]

ridiculous                NaN
courtroom drama           NaN
melancholic               NaN
brutality                 NaN
threesome                 NaN
                         ... 
Gal Gadot                 NaN
art                       NaN
gore                      NaN
psychological thriller    NaN
Shakespeare sort of       NaN
Name: 2, Length: 1589, dtype: object

In [154]:
movie_representation = pd.concat([genre_representation, tag_representation], axis=1).fillna(0)
movie_representation.head()

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,...,humorous,Mexico,American propaganda,jack nicholson,short films,Gal Gadot,art,gore,psychological thriller,Shakespeare sort of
1,0.0,0.0,0.887245,1.202607,1.16648,0.413923,0.0,0.0,0.0,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.887245,0.0,1.16648,0.0,0.0,0.0,0.0,1.097111,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.413923,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.413923,0.0,0.0,0.349062,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.413923,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### 유사도 확인

In [160]:
from sklearn.metrics.pairwise import cosine_similarity

def cos_sim_matrix(a, b):
    cos_sim = cosine_similarity(a, b)
    result_df = pd.DataFrame(data=cos_sim, index=[a.index])

    return result_df

In [162]:
cosim_df = cos_sim_matrix(movie_representation, movie_representation)
cosim_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
1,1.000000,0.124438,0.008403,0.040571,0.011755,0.000000,0.016339,0.331122,0.000000,0.131794,...,0.064466,0.260941,0.071492,0.271710,0.0,0.348295,0.379492,0.000000,0.232553,0.093519
2,0.124438,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.240843,0.000000,0.095861,...,0.000000,0.000000,0.000000,0.000000,0.0,0.108082,0.117763,0.000000,0.000000,0.000000
3,0.008403,0.000000,1.000000,0.179391,0.011294,0.000000,0.072246,0.000000,0.000000,0.000000,...,0.006560,0.000000,0.068686,0.000000,0.0,0.020322,0.022142,0.000000,0.000000,0.089849
4,0.040571,0.000000,0.179391,1.000000,0.054530,0.000000,0.348828,0.000000,0.000000,0.000000,...,0.031674,0.101979,0.567487,0.000000,0.0,0.098119,0.106908,0.365843,0.000000,0.433821
5,0.011755,0.000000,0.011294,0.054530,1.000000,0.000000,0.640342,0.000000,0.000000,0.000000,...,0.009177,0.000000,0.096091,0.000000,0.0,0.028429,0.030976,0.000000,0.000000,0.125697
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.348295,0.108082,0.020322,0.098119,0.028429,0.211466,0.039515,0.000000,0.397065,0.213809,...,0.206804,0.631077,0.172901,0.657123,0.0,1.000000,0.917791,0.000000,0.767770,0.226174
193583,0.379492,0.117763,0.022142,0.106908,0.030976,0.000000,0.043055,0.000000,0.000000,0.000000,...,0.169874,0.687605,0.188388,0.715984,0.0,0.917791,1.000000,0.000000,0.612800,0.246433
193585,0.000000,0.000000,0.000000,0.365843,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.278750,0.644671,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.000000
193587,0.232553,0.000000,0.000000,0.000000,0.000000,0.275428,0.000000,0.000000,0.517166,0.278480,...,0.247849,0.821961,0.000000,0.855885,0.0,0.767770,0.612800,0.000000,1.000000,0.000000


In [163]:
cosim_df.shape

(9742, 9742)

In [165]:
print(cosim_df.shape)
print(cosim_df[1].sort_values(ascending=False))

(9742, 9742)
2         1.000000
46972     0.322201
158813    0.300850
119655    0.300850
80748     0.300850
            ...   
4921      0.000000
4920      0.000000
4919      0.000000
4917      0.000000
193609    0.000000
Name: 1, Length: 9742, dtype: float64


In [175]:
print(movie_df.loc[1], end='\n\n')
print(movie_df.loc[46972], end='\n\n')
print(movie_df.loc[158813], end='\n\n')
print(movie_df.loc[119655], end='\n\n')
print(movie_df.loc[80748], end='\n\n')

title                                Toy Story (1995)
genres    Adventure|Animation|Children|Comedy|Fantasy
Name: 1, dtype: object

title     Night at the Museum (2006)
genres    Action|Comedy|Fantasy|IMAX
Name: 46972, dtype: object

title     Alice Through the Looking Glass (2016)
genres                Adventure|Children|Fantasy
Name: 158813, dtype: object

title             Seventh Son (2014)
genres    Adventure|Children|Fantasy
Name: 119655, dtype: object

title     Alice in Wonderland (1933)
genres    Adventure|Children|Fantasy
Name: 80748, dtype: object



In [181]:
display(tag_df.loc[tag_df['movieId'] == 1])

display(tag_df.loc[tag_df['movieId'] == 46972])

Unnamed: 0,userId,movieId,tag,timestamp
629,336,1,pixar,1139045764
981,474,1,pixar,1137206825
2886,567,1,fun,1525286013


Unnamed: 0,userId,movieId,tag,timestamp
173,62,46972,Ben Stiller,1525554254
174,62,46972,Robin Williams,1525554255


### 추천시스템 확인하기

In [182]:
train_df, test_df = train_test_split(rating_df, test_size=0.2, random_state=42)

print(train_df.shape)
print(test_df.shape)

(80668, 4)
(20168, 4)


In [201]:
test_userid = list(pd.Series(test_df['userId']).value_counts().index)

In [202]:
result_df = pd.DataFrame()

for user_id in tqdm(test_userid):
    user_record_df = train_df.loc[train_df.userId == int(user_id), :]
    
    user_sim_df = cosim_df.loc[user_record_df['movieId']]  # (n, 9742); n은 userId가 평점을 매긴 영화 수
    user_rating_df = user_record_df[['rating']]  # (n, 1)
    sim_sum = np.sum(user_sim_df.T.to_numpy(), -1)  # (9742, 1)
    # print("user_id=", i, user_record_df.shape, user_sim_df.T.shape, user_rating_df.shape, sim_sum.shape)


    # user가 평점을 준 영화의 TF-IDFvector.T와 유저 평점을 준 데이터를 matmul / 유저가 평점을 매긴 영화의 vector합
    prediction = np.matmul(user_sim_df.T.to_numpy(), user_rating_df.to_numpy()).flatten() / (sim_sum+1) # (9742, 1)

    prediction_df = pd.DataFrame(prediction, index=cosim_df.index).reset_index()
    prediction_df.columns = ['movieId', 'pred_rating']    
    prediction_df = prediction_df[['movieId', 'pred_rating']][prediction_df.movieId.isin(test_df[test_df.userId == user_id]['movieId'].values)]

    temp_df = prediction_df.merge(test_df[test_df.userId == user_id], on='movieId')
    result_df = pd.concat([result_df, temp_df], axis=0)


100%|██████████| 610/610 [00:06<00:00, 88.76it/s] 


In [204]:
mse = mean_squared_error(y_true=result_df['rating'].values, y_pred=result_df['pred_rating'].values)
rmse = np.sqrt(mse)

print(mse, rmse)
     

1.4030875287276834 1.1845199570829035
