## 데이터 준비와 전처리

In [1]:
import os
import pandas as pd
rating_file_path = os.getenv("HOME")+"/project/aiffel-lms/E7_Recommendation/data/ml-1m/ratings.dat"
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
original_data_size = len(ratings)
ratings.head()


Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'original_data_size: {original_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / original_data_size:.2%}')

original_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
#ratings.rename(columns={'rating':'count'}, inplace=True)

In [4]:
movie_file_path = os.getenv('HOME') + '/project/aiffel-lms/E7_Recommendation/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


---
## 데이터 분석

In [5]:
print(f'Movies in ratings: {ratings["movie_id"].nunique()}')
print(f'Users in ratings: {ratings["user_id"].nunique()}')
famous_movie = ratings.groupby('movie_id')['user_id'].count()
famous_movie.sort_values(ascending=False).head(30)

Movies in ratings: 3628
Users in ratings: 6039


movie_id
2858    3211
260     2910
1196    2885
1210    2716
2028    2561
589     2509
593     2498
1198    2473
1270    2460
2571    2434
480     2413
2762    2385
608     2371
110     2314
1580    2297
527     2257
1197    2252
2396    2213
1617    2210
318     2194
858     2167
1265    2121
1097    2102
2997    2066
2716    2051
296     2030
356     2022
1240    2019
1       2000
457     1941
Name: user_id, dtype: int64

---
## 선호하는 영화 5가지 rating에 추가

In [6]:

df = pd.merge(movies, ratings)

using_cols = ['title', 'user_id', 'rating']
df = df[using_cols]
df['title'] = df['title'].str.slice(start=0, stop=-7)
    
my_favorite = ['Toy Story', 'Bad Boys', 'Grumpier Old Men', 'Waiting to Exhale', 'Father of the Bride Part II']

my_movielist = pd.DataFrame({"user_id":['hwan']*5, 'title':my_favorite, 'rating': 5.0})

if not df.isin({'user_id':['hwan']})['user_id'].any():
    df = df.append(my_movielist)

df.tail(10)


Unnamed: 0,title,user_id,rating
836473,"Contender, The",5682,3.0
836474,"Contender, The",5812,4.0
836475,"Contender, The",5831,3.0
836476,"Contender, The",5837,4.0
836477,"Contender, The",5998,4.0
0,Toy Story,hwan,5.0
1,Bad Boys,hwan,5.0
2,Grumpier Old Men,hwan,5.0
3,Waiting to Exhale,hwan,5.0
4,Father of the Bride Part II,hwan,5.0


In [7]:
user_unique = df['user_id'].unique()
movie_unique = df['title'].unique()

user_to_idx={v:k for k, v in enumerate(user_unique)}
movie_to_idx={v:k for k, v in enumerate(movie_unique)}

print(user_to_idx['hwan'])
print(movie_to_idx['Contender, The'])

6039
3585


In [8]:
temp_user_data = df['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(df):
    print('user_id column indexing complete')
    df['user_id'] = temp_user_data
else:
    print('user_id column indexing fail')

temp_movie_data = df['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(df):
    print('movie column indexing complete')
    df['title'] = temp_movie_data
else:
    print('movie column indexing fail')

df

user_id column indexing complete
movie column indexing complete


Unnamed: 0,title,user_id,rating
0,0,0,5.0
1,0,1,4.0
2,0,2,4.0
3,0,3,5.0
4,0,4,5.0
...,...,...,...
0,0,6039,5.0
1,136,6039,5.0
2,2,6039,5.0
3,3,6039,5.0


---
## CSR matrix 만들기

In [9]:
from scipy.sparse import csr_matrix

num_user = df['user_id'].nunique()
num_movie = df['title'].nunique()

print(df.rating)
csr_data = csr_matrix((df.rating, (df.user_id, df.title)), shape=(num_user, num_movie))
csr_data

0    5.0
1    4.0
2    4.0
3    5.0
4    5.0
    ... 
0    5.0
1    5.0
2    5.0
3    5.0
4    5.0
Name: rating, Length: 836483, dtype: float64


<6040x3586 sparse matrix of type '<class 'numpy.float64'>'
	with 834213 stored elements in Compressed Sparse Row format>

---
## als_model = AlternatingLeastSquares 모델 훈련시키기

In [10]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

als_model=AlternatingLeastSquares(factors=900, regularization=0.01, use_gpu=False, iterations=50, dtype=np.float32)

csr_data_transpose = csr_data.T
csr_data_transpose

<3586x6040 sparse matrix of type '<class 'numpy.float64'>'
	with 834213 stored elements in Compressed Sparse Column format>

---
## 선호하는 5가지 영화 중 하나와 그 외의 영화 하나를 골라 훈련된 모델이 예측한 나의 선호도 파악하기

In [11]:
als_model.fit(csr_data_transpose)

hwan, toystory = user_to_idx['hwan'], movie_to_idx['Toy Story']
hwan_vector, toystory_vector = als_model.user_factors[hwan], als_model.item_factors[toystory]

hwan_vector

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




array([-0.08215957, -0.22033149, -0.09267487,  0.21130836,  0.16860552,
        0.07010721, -0.17089197, -0.21330702, -0.02397619,  0.03063262,
        0.02963717,  0.36706218,  0.23576476, -0.12634017, -0.1314003 ,
       -0.03860078,  0.0035059 ,  0.17473364,  0.00438889,  0.34451565,
        0.00742182, -0.04288349, -0.09093987,  0.20595373, -0.0207769 ,
        0.30932215,  0.13066751,  0.07564159,  0.0956312 , -0.07229344,
       -0.34243855,  0.2660948 , -0.009585  ,  0.15587482, -0.38171813,
       -0.16052598,  0.11563183, -0.23973182, -0.08309272,  0.19675966,
        0.14590375, -0.21142532, -0.20850454,  0.15054768, -0.08171515,
       -0.14281066,  0.10193663, -0.02918687,  0.0291086 , -0.04534946,
        0.25907418,  0.20299973,  0.06533761, -0.36076367, -0.19158392,
        0.25725213, -0.0195307 , -0.383616  , -0.12370528,  0.07875057,
        0.12193955, -0.07396614,  0.05018849, -0.19213188,  0.19509855,
       -0.31475958,  0.2325259 ,  0.19811477, -0.00106487, -0.11

In [12]:
toystory_vector

array([ 3.24554518e-02, -1.10448468e-02,  1.27951149e-02,  1.25024775e-02,
       -1.84864420e-02, -3.04329861e-02, -1.61123574e-02,  9.42623615e-03,
        1.39708156e-02, -6.11374155e-03,  2.33271178e-02,  1.93275735e-02,
        3.19435634e-02,  4.31093387e-03, -1.56950913e-02, -1.43334419e-02,
        2.26891246e-02, -1.01175522e-02,  9.46849119e-03, -2.01236345e-02,
        2.75228284e-02, -2.34912354e-02, -1.57022830e-02, -6.78768149e-03,
        8.19783565e-03,  4.22849730e-02,  1.01609621e-02,  5.01606613e-03,
       -2.00327137e-03,  3.54885571e-02, -1.69760617e-03,  3.21920142e-02,
       -5.09780273e-03,  2.04395819e-02, -2.10418589e-02, -6.05975511e-03,
        1.43755311e-02, -5.65000065e-03, -1.53529653e-02, -1.45179108e-02,
        1.09711410e-02,  1.99621706e-03,  5.57384593e-03, -6.27041468e-03,
       -5.16901957e-03, -3.80584947e-03,  2.18611434e-02,  6.74602482e-03,
        1.58091038e-02,  4.74115973e-03,  1.83060318e-02,  1.13457423e-02,
       -3.83353932e-03,  

In [13]:
np.dot(hwan_vector, toystory_vector)

0.98267025

In [14]:
nikita = movie_to_idx['Nikita (La Femme Nikita)']
nikita_vector = als_model.item_factors[nikita]
np.dot(hwan_vector, nikita_vector)

0.03388393

---
## 내가 좋아하는 영화와 비슷한 영화 추천 받기

In [15]:
favorite_id = movie_to_idx['Bad Boys']
similar_movie = als_model.similar_items(favorite_id, N=15)
similar_movie

[(136, 0.34588084),
 (115, 0.14996842),
 (624, 0.147274),
 (2011, 0.14707409),
 (571, 0.14649357),
 (1656, 0.14626078),
 (752, 0.1454982),
 (1006, 0.14548697),
 (2818, 0.14543653),
 (1574, 0.14542535),
 (3124, 0.14501025),
 (662, 0.14495409),
 (3578, 0.1448411),
 (3137, 0.14476664),
 (3110, 0.14472347)]

In [16]:
idx_to_movie = {v:k for k, v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]

['Bad Boys',
 'Race the Sun',
 'Yankee Zulu',
 'Master Ninja I',
 'Tough and Deadly',
 'Woo',
 'Boy Called Hate, A',
 'American Strays',
 'Bay of Blood (Reazione a catena)',
 'Tokyo Fist',
 'Blood and Sand (Sangre y Arena)',
 'Sunset Park',
 'Bootmen',
 'Horror Hotel (a.k.a. The City of the Dead)',
 'Born American']

In [17]:
def get_similar_movie(movie_title:str):
    movie_id = movie_to_idx[movie_title]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    return similar_movie

get_similar_movie('Enemy of the State')

['Enemy of the State',
 'Second Best',
 'Last Klezmer: Leopold Kozlowski, His Life and Music, The',
 'Shooter, The',
 'Castaway Cowboy, The',
 'Tough and Deadly',
 'Living Dead Girl, The (La Morte Vivante)',
 'Convent, The (Convento, O)',
 'Born to Win',
 "Wooden Man's Bride, The (Wu Kui)"]

---
## 내가 가장 좋아할만한 영화 추천 받기

In [18]:
user = user_to_idx['hwan']
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(3117, 0.24775556),
 (1225, 0.20227511),
 (177, 0.17921175),
 (2930, 0.15291083),
 (416, 0.14768374),
 (2007, 0.14377484),
 (1361, 0.13083397),
 (1454, 0.12070297),
 (1422, 0.118581064),
 (212, 0.1176374),
 (225, 0.11137688),
 (358, 0.109992504),
 (2455, 0.10991386),
 (2423, 0.10934064),
 (2438, 0.10871741),
 (275, 0.10829471),
 (762, 0.10614187),
 (2293, 0.10481716),
 (117, 0.102845594),
 (3289, 0.102474965)]

In [19]:
[idx_to_movie[i[0]] for i in movie_recommended]

['Grumpy Old Men',
 'Mirror Has Two Faces, The',
 'Nine Months',
 'Sister Act 2: Back in the Habit',
 'Cliffhanger',
 'Toys',
 "Romy and Michele's High School Reunion",
 'Soul Food',
 'Conan the Barbarian',
 'Circle of Friends',
 'French Kiss',
 'Reality Bites',
 'Brighton Beach Memoirs',
 'Summer of Sam',
 'Ghostbusters II',
 'Natural Born Killers',
 'First Wives Club, The',
 'Forces of Nature',
 'Boomerang',
 'Moonraker']

In [20]:
french_kiss = movie_to_idx['French Kiss']
explain = als_model.explain(user, csr_data, itemid=french_kiss)

[(idx_to_movie[i[0]], i[1]) for i in explain[1]]

[('Father of the Bride Part II', 0.09443659820804647),
 ('Grumpier Old Men', 0.031392117050336114),
 ('Waiting to Exhale', 0.007482345884377852),
 ('Toy Story', -0.0033212490605346624),
 ('Bad Boys', -0.019552196836540286)]

---
## Report
1. factors 와 iteration 을 늘릴수록 정확도가 높아졌는데, 어느수준까지 올려야 적절한 것인지 알기 어려웠음.
2. 특정 영화에 대한 선호도를 검사하면 음수값이 나오는 경우가 있는데 정상적인 것인지(매우 불호) 잘못된 것인지 모르겠음
3. 추천받는 다른 영화에 대한 선호도가 높게 나오지 않는데 모델이 잘못된 탓인지, favorite 영화들이 공통점이 없어서 찾기 힘든 것인지 모르겠음 