# 데이터 준비와 전처리

In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
# Read ml-1m/ratings.dat. Fields are separated by the multi-character
# token '::', hence the python parsing engine.
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
rating_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings = pd.read_csv(
    rating_file_path,
    names=ratings_cols,
    sep='::',
    engine='python',
)
# Row count before any filtering; used later to report the retention ratio.
orginal_data_size = ratings.shape[0]
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Keep only the rows whose rating is 3 or higher.
ratings = ratings.loc[ratings['rating'] >= 3]
filtered_data_size = ratings.shape[0]

# Report how many rows survived the filter, in absolute and relative terms.
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# Rename the 'rating' column to 'count' (modifies ratings in place).
ratings.rename(columns={'rating':'count'}, inplace=True)

In [5]:
# Keep only user_id, movie_id and count; every other column is dropped.
ratings = ratings[['user_id', 'movie_id', 'count']]

In [6]:
# Load the movie metadata so that titles can be looked up by movie_id.
cols = ['movie_id', 'title', 'genre']
movie_file_path = os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
movies = pd.read_csv(
    movie_file_path,
    names=cols,
    sep='::',
    engine='python',
    encoding='ISO-8859-1',
)
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


# 분석

* ratings에 있는 유니크한 영화 개수

In [7]:
# Count the distinct movie_id values present in ratings.
ratings['movie_id'].nunique()

3628

* ratings에 있는 유니크한 사용자 수

In [8]:
# Count the distinct user_id values present in ratings.
ratings['user_id'].nunique()

6039

* 가장 인기 있는 영화 30개(인기순)

In [9]:
# Attach title/genre to every rating row (movie_id is the only shared column),
# then count rating rows per title and show the 30 most-rated titles.
movie_data = ratings.merge(movies, on='movie_id')
movie_count = movie_data.groupby('title')['count'].count()
movie_count.sort_values(ascending=False).iloc[:30]

title
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2460
Matrix, The (1999)                                       2434
Jurassic Park (1993)                                     2413
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2297
Schindler's List (1993)                                  2257
Pr

# 내가 선호하는 영화 5가지를 ratings에 추가

In [10]:
# Titles to register as the favorites of the new user (user_id 9999).
my_favorite = ['Star Wars: Episode IV - A New Hope (1977)',
               'Star Wars: Episode V - The Empire Strikes Back (1980)',
               'Star Wars: Episode VI - Return of the Jedi (1983)',
               'Men in Black (1997)',
               'Toy Story (1995)']

# Rows of the movies table whose title is one of the favorites.
favorite_movie_id = movies[movies['title'].isin(my_favorite)]

# One rating row per matched movie: user 9999, count 5.  The constant columns
# are broadcast to however many titles actually matched, so a missing or
# misspelled title no longer triggers a length-mismatch error.
my_movie = favorite_movie_id[['movie_id']].assign(user_id=9999, count=5)
my_movie = my_movie[['user_id', 'movie_id', 'count']]

# Only add the rows once, so re-running this cell does not duplicate them.
if not (ratings['user_id'] == 9999).any():
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    # and, like append, keeps each frame's existing index labels.
    ratings = pd.concat([ratings, my_movie])
ratings.tail(10)

Unnamed: 0,user_id,movie_id,count
1000203,6040,1090,3
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
1000208,6040,1097,4
0,9999,1,5
257,9999,260,5
1178,9999,1196,5
1192,9999,1210,5
1539,9999,1580,5


In [11]:
# Join ratings with the movie metadata on movie_id and display the result.
movie_data = pd.merge(ratings, movies, on='movie_id')
movie_data

Unnamed: 0,user_id,movie_id,count,title,genre
0,1,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama
1,2,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama
2,12,1193,4,One Flew Over the Cuckoo's Nest (1975),Drama
3,15,1193,4,One Flew Over the Cuckoo's Nest (1975),Drama
4,17,1193,5,One Flew Over the Cuckoo's Nest (1975),Drama
...,...,...,...,...,...
836478,5851,3607,5,One Little Indian (1973),Comedy|Drama|Western
836479,5854,3026,4,Slaughterhouse (1987),Horror
836480,5854,690,3,"Promise, The (Versprechen, Das) (1994)",Romance
836481,5938,2909,4,"Five Wives, Three Secretaries and Me (1998)",Documentary


# CSR matrix

In [12]:
from scipy.sparse import csr_matrix

# Distinct user / movie counts (not passed to csr_matrix below).
num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

# Sparse user x movie matrix of 'count' values.  The raw user_id / movie_id
# values are used directly as row / column indices and no shape is given.
csr_data = csr_matrix(
    (ratings['count'], (ratings['user_id'], ratings['movie_id']))
)
csr_data

<10000x3953 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

# als_model 구성 및 훈련

In [13]:
# Thread-limit / OpenMP settings for the numeric back ends.  They are set
# BEFORE importing implicit so they are already in the environment when that
# library (and whatever native runtime it pulls in) is loaded.
# NOTE(review): numpy is imported earlier in this file, so its BLAS may
# already be initialised; for the limits to be fully effective they probably
# need to be set before the first numpy import as well — confirm.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

from implicit.als import AlternatingLeastSquares

In [14]:
# ALS model: 100 factors, regularization 0.01, 15 iterations, GPU disabled,
# float32 factors.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    iterations=15,
    use_gpu=False,
    dtype=np.float32,
)

In [15]:
# Transpose the user x movie matrix into movie x user form for fitting.
# NOTE(review): presumably the installed implicit version's fit() expects an
# item-user matrix — confirm against that version's documentation.
csr_data_transpose = csr_data.T
csr_data_transpose

<3953x10000 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [16]:
# Train the ALS model on the transposed (movie x user) matrix.
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [17]:
# Learned factor vectors for user 9999 and for movie_id 1
# (listed as 'Toy Story (1995)' in the movies table).
my_vector = als_model.user_factors[9999]
toystory_vector = als_model.item_factors[1]

In [18]:
# Display the factor vector taken from als_model.user_factors[9999].
my_vector

array([-0.9614615 ,  0.09623419,  0.00515637, -0.11170446,  0.00884977,
        0.60949194, -0.2101584 ,  0.71586126,  0.23052366,  0.6086106 ,
        0.18349898,  0.61331964, -0.29418042, -0.07947499,  0.22182791,
       -0.8114619 , -0.36097044,  0.9713177 ,  0.2106912 , -0.19761775,
        0.00149618, -0.5302655 ,  0.07940509,  0.32380933,  0.04396937,
        0.4837826 ,  0.19835784,  0.90991694, -0.79561204, -0.33402693,
        0.3078056 , -0.8109019 ,  0.41365564, -0.28748533,  0.47410828,
        0.35878655, -0.2984988 ,  0.2994849 , -0.7823457 , -0.5252666 ,
       -0.01826419, -0.17144805, -0.32208788,  0.18724875,  0.57042277,
        0.73237747, -0.09514275, -0.10910483, -0.03949476,  0.04232701,
       -0.39750218, -0.09393778, -0.10352883, -0.322656  ,  0.2953964 ,
       -1.3642067 , -0.5004925 ,  0.08601248, -0.45801497,  0.08344215,
        0.24734665,  0.4876088 , -0.88186365, -0.5440605 ,  0.56177115,
        0.46166533, -0.6063473 , -0.3045965 ,  0.034336  ,  0.35

In [19]:
# Display the factor vector taken from als_model.item_factors[1].
toystory_vector

array([-0.00362334,  0.01319815,  0.01385839,  0.03010371,  0.0264615 ,
        0.00323669, -0.01712759,  0.02859684, -0.01530854, -0.01043341,
       -0.01985486,  0.01946157,  0.00836221, -0.03172748,  0.00648231,
        0.00167965, -0.00833563,  0.02263536,  0.00249072, -0.01299965,
        0.00284533, -0.00151023,  0.01704675,  0.01823026,  0.01755355,
       -0.00144922,  0.00517896, -0.00142389, -0.00665215, -0.00084948,
       -0.00436715, -0.01361353,  0.02921794,  0.04735756,  0.01946933,
        0.03213778, -0.01121561,  0.01115779, -0.03340486, -0.03791621,
        0.02671244,  0.01949244, -0.0281719 ,  0.00522945, -0.00974048,
       -0.00056593,  0.0282808 ,  0.01506378,  0.02272713, -0.00684432,
        0.01826618, -0.00512244, -0.01049292, -0.00184993,  0.04176442,
       -0.04431012,  0.02921868,  0.00692778,  0.00188705,  0.009052  ,
       -0.00470073,  0.02617746,  0.00249448, -0.00607496,  0.02952346,
        0.01227275, -0.0029493 , -0.0404748 , -0.00284497,  0.02

In [20]:
# Dot product of the user vector and the movie vector; the notebook treats
# this value as the user's predicted preference for the movie.
np.dot(my_vector, toystory_vector)

0.444229

# Same preference score for a movie that is not in the favorites list:
# movie_id 2 is 'Jumanji (1995)' in the movies table.  The assignment had been
# turned into a comment by a stray '#### ' prefix, which left jumanji_vector
# undefined on the next line.
jumanji_vector = als_model.item_factors[2]
np.dot(my_vector, jumanji_vector)

# 내가 좋아하는 영화와 비슷한 영화 추천 받기

In [22]:
def get_similar_movie(movie_name: str):
    """Return the rows of ``movies`` for the items most similar to *movie_name*.

    Looks up the movie_id whose title equals *movie_name* exactly, asks the
    module-level ``als_model`` for its 15 most similar items, and returns the
    matching rows of the module-level ``movies`` table.  Rows come back in
    ``movies`` order, not in similarity order.

    Args:
        movie_name: Exact title as it appears in ``movies['title']``,
            e.g. ``'Toy Story (1995)'``.

    Returns:
        The subset of ``movies`` whose movie_id is among the similar items.

    Raises:
        IndexError: If no row of ``movies`` has the title *movie_name*.
    """
    # Use the requested title; it was previously hard-coded to
    # 'Toy Story (1995)', so the argument had no effect.
    movie_id = movies[movies['title'] == movie_name]['movie_id']
    if movie_id.empty:
        # Same exception type the bare ``values[0]`` lookup would raise, but
        # with a message that names the title that was not found.
        raise IndexError(f'no movie titled {movie_name!r} in movies')

    similar_movie = als_model.similar_items(movie_id.values[0], N=15)
    # Assumes similar_items yields (item_id, score) pairs, as the original
    # code did — verify if the implicit version changes.
    similar_ids = [s[0] for s in similar_movie]
    similar_movies = movies[movies['movie_id'].isin(similar_ids)]
    return similar_movies

In [23]:
# Show the movies the model considers similar to 'Toy Story (1995)'.
get_similar_movie('Toy Story (1995)')

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
33,34,Babe (1995),Children's|Comedy|Drama
352,356,Forrest Gump (1994),Comedy|Romance|War
360,364,"Lion King, The (1994)",Animation|Children's|Musical
584,588,Aladdin (1992),Animation|Children's|Comedy|Musical
591,595,Beauty and the Beast (1991),Animation|Children's|Musical
1245,1265,Groundhog Day (1993),Comedy|Romance
1526,1566,Hercules (1997),Adventure|Animation|Children's|Comedy|Musical
1838,1907,Mulan (1998),Animation|Children's
1854,1923,There's Something About Mary (1998),Comedy


# 내가 가장 좋아할 만한 영화 추천 받기

In [24]:
# Ask the model for 20 recommendations for user 9999, passing the
# user x movie matrix and excluding items the user already has entries for.
user = 9999
movie_recommended = als_model.recommend(
    user,
    csr_data,
    N=20,
    filter_already_liked_items=True,
)
movie_recommended

[(2628, 0.5733394),
 (3114, 0.4069528),
 (1198, 0.4005217),
 (480, 0.36701807),
 (589, 0.3637605),
 (2571, 0.35087487),
 (356, 0.33185685),
 (1270, 0.31703776),
 (2355, 0.2939697),
 (1097, 0.27406126),
 (1197, 0.25820953),
 (1265, 0.25672895),
 (2028, 0.24642056),
 (2858, 0.23777112),
 (2396, 0.22662202),
 (3175, 0.21416788),
 (1240, 0.21138379),
 (1193, 0.20835109),
 (3793, 0.19881597),
 (924, 0.19014095)]

In [25]:
# Look up the metadata rows for the recommended movie ids.  The rows come back
# in the order of the movies table, not in recommendation-score order.
movies[movies['movie_id'].isin([m[0] for m in movie_recommended])]

Unnamed: 0,movie_id,title,genre
352,356,Forrest Gump (1994),Comedy|Romance|War
476,480,Jurassic Park (1993),Action|Adventure|Sci-Fi
585,589,Terminator 2: Judgment Day (1991),Action|Sci-Fi|Thriller
912,924,2001: A Space Odyssey (1968),Drama|Mystery|Sci-Fi|Thriller
1081,1097,E.T. the Extra-Terrestrial (1982),Children's|Drama|Fantasy|Sci-Fi
1176,1193,One Flew Over the Cuckoo's Nest (1975),Drama
1179,1197,"Princess Bride, The (1987)",Action|Adventure|Comedy|Romance
1180,1198,Raiders of the Lost Ark (1981),Action|Adventure
1220,1240,"Terminator, The (1984)",Action|Sci-Fi|Thriller
1245,1265,Groundhog Day (1993),Comedy|Romance


---

# 결과

ToyStory를 좋아하는 영화 리스트에 넣었지만 선호도가 0.44정도로 나온 이유는 나머지 영화와 장르가 달라서인 것 같다.   
   
ToyStory와 유사한 영화를 추천받았을 때는 대부분 animation, children, comedy 장르로 적절하게 추천되었다.   
   
내가 좋아할만한 영화 추천받기에서는 action과 sci-fi가 주를 이루어 제대로 추천받은 것 같다.   