In [1]:
import scipy
import implicit
import os

import numpy as np
import pandas as pd

In [2]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.drop('timestamp', axis=1, inplace=True)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [3]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [5]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# 검색을 위해 타이틀 소문자화
movies['title'] = movies['title'].str.lower()
movies['genre'] = movies['genre'].str.replace('|', ', ', regex=True)
genres = np.unique(', '.join(movies['genre']).split(', '))

for genre in genres:
    movies[genre] = movies['genre'].str.contains(genre).astype('int')

movies.drop('genre', axis=1, inplace=True)

In [7]:
movies

Unnamed: 0,movie_id,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,toy story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,grumpier old men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,waiting to exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,father of the bride part ii (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,meet the parents (2000),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,3949,requiem for a dream (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,3950,tigerland (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,3951,two family house (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [8]:
print('ratings에 있는 유니크한 영화 개수 :' , ratings.movie_id.nunique())
print('ratings에 있는 유니크한 사용자 수 :' , ratings.user_id.nunique())
# print(ratings.groupby('movie_id').sum().sort_values('counts', ascending=False).iloc[:30])
top_30_list = ratings.groupby('movie_id').sum().sort_values('counts', ascending=False).iloc[:30].index.values
top_30_movie = list(movies[movies.movie_id == x].title.values[0] for x in top_30_list)
for i in top_30_movie:
    print(i)

ratings에 있는 유니크한 영화 개수 : 3628
ratings에 있는 유니크한 사용자 수 : 6039
american beauty (1999)
star wars: episode iv - a new hope (1977)
star wars: episode v - the empire strikes back (1980)
saving private ryan (1998)
star wars: episode vi - return of the jedi (1983)
raiders of the lost ark (1981)
silence of the lambs, the (1991)
matrix, the (1999)
sixth sense, the (1999)
terminator 2: judgment day (1991)
fargo (1996)
schindler's list (1993)
braveheart (1995)
shawshank redemption, the (1994)
back to the future (1985)
godfather, the (1972)
princess bride, the (1987)
jurassic park (1993)
l.a. confidential (1997)
shakespeare in love (1998)
men in black (1997)
pulp fiction (1994)
being john malkovich (1999)
groundhog day (1993)
e.t. the extra-terrestrial (1982)
forrest gump (1994)
terminator, the (1984)
toy story (1995)
ghostbusters (1984)
alien (1979)


In [9]:
ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts
1000198,6040,2021,3
1000199,6040,2022,5
1000200,6040,2028,5
1000201,6040,1080,4
1000202,6040,1089,4
1000203,6040,1090,3
1000205,6040,1094,5
1000206,6040,562,5
1000207,6040,1096,4
1000208,6040,1097,4


In [10]:
movies

Unnamed: 0,movie_id,title,Action,Adventure,Animation,Children's,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,toy story (1995),0,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0
1,2,jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,grumpier old men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,waiting to exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0
4,5,father of the bride part ii (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3878,3948,meet the parents (2000),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
3879,3949,requiem for a dream (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3880,3950,tigerland (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3881,3951,two family house (2000),0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0


In [11]:
def fav_movie(x : list):
    tmp = {}
    for name in x:
        for title in movies.title:
            if name in title:
                tmp[title] = (movies[movies.title== title].movie_id.values[0])

    return tmp

my_favorite = fav_movie(['matrix', 'good will', 'shawshank', 'titanic (1997)', 'truman'])
my_movielist = pd.DataFrame({'user_id': ['1000209']*5, 'movie_id': my_favorite.values(), 'counts':[5]*5})

if not ratings.isin({'user_id':['1000209']})['movie_id'].any():
    ratings = ratings.append(my_movielist)

ratings.reset_index(inplace=True, drop=True)
ratings.tail(10)

Unnamed: 0,user_id,movie_id,counts
836473,6040,1090,3
836474,6040,1094,5
836475,6040,562,5
836476,6040,1096,4
836477,6040,1097,4
836478,1000209,2571,5
836479,1000209,1704,5
836480,1000209,318,5
836481,1000209,1721,5
836482,1000209,1682,5


In [12]:
my_favorite

{'matrix, the (1999)': 2571,
 'good will hunting (1997)': 1704,
 'shawshank redemption, the (1994)': 318,
 'titanic (1997)': 1721,
 'truman show, the (1998)': 1682}

In [13]:
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()

# 유저, 아티스트 indexing 하는 코드 idx는 index의 약자입니다.
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [14]:
# 인덱싱이 잘 되었는지 확인해 봅니다. 
print(user_to_idx['1000209'])    # 358869명의 유저 중 마지막으로 추가된 유저이니 358868이 나와야 합니다. 
print(movie_to_idx[2571])

6039
124


In [15]:
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data    
else:
    print('user_id column indexing Fail!!')

    
temp_movie_data = ratings['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('artist column indexing OK!!')
    ratings['movie_id'] = temp_movie_data
else:
    print('artist column indexing Fail!!')

ratings

user_id column indexing OK!!
artist column indexing OK!!


Unnamed: 0,user_id,movie_id,counts
0,0,0,5
1,0,1,3
2,0,2,3
3,0,3,4
4,0,4,5
...,...,...,...
836478,6039,124,5
836479,6039,248,5
836480,6039,157,5
836481,6039,27,5


In [17]:
# 실습 위에 설명보고 이해해서 만들어보기
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.movie_id)), shape = (num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

In [18]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [19]:
als_model = AlternatingLeastSquares(factors=200, regularization=0.01, use_gpu=False, iterations=40, dtype=np.float32)

In [20]:
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [21]:
def get_similar_movie(movie_name: str):
    movie_id = movie_to_idx[movie_name]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    similar_movie_name = [movies[movies.movie_id == i].title.values[0] for i in similar_movie]
    return similar_movie_name

In [22]:
recog_df = {}
factor_list, iter_list, inner_val, mat_sim, rec_list = [], [], [], [], []

for factor in range(100, 601, 100):
    for iteration in range(10, 60, 10):
        als_model = AlternatingLeastSquares(factors=factor, regularization=0.01, use_gpu=False, iterations=iteration, dtype=np.float32)
        csr_data_transpose = csr_data.T
        csr_data_transpose
        als_model.fit(csr_data_transpose)
        lim, matrix = user_to_idx['1000209'], movie_to_idx[2571]
        lim_vector, matrix_vector = als_model.user_factors[lim], als_model.item_factors[matrix]
        print('Factor : {}, iteration : {}, Inner_val : {}'.format(factor, iteration, np.dot(lim_vector, matrix_vector)))
        factor_list.append(factor)
        iter_list.append(iteration)
        inner_val.append(np.dot(lim_vector, matrix_vector))
        
        favorite_movie = 2571
        movie_id = movie_to_idx[favorite_movie]
        similar_movie = als_model.similar_items(movie_id, N=15)
        idx_to_movie = {v:k for k,v in movie_to_idx.items()}
        similar_movie_id = [idx_to_movie[i[0]] for i in similar_movie]
        mat_sim.append(get_similar_movie(2571))
        
        
        user = user_to_idx['1000209']
        movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
        similar_movie_id = [idx_to_movie[i[0]] for i in movie_recommended]
        rec_list.append([movies[movies.movie_id == i].title.values[0] for i in similar_movie_id])

  0%|          | 0/10 [00:00<?, ?it/s]

Factor : 100, iteration : 10, Inner_val : 0.41763848066329956


  0%|          | 0/20 [00:00<?, ?it/s]

Factor : 100, iteration : 20, Inner_val : 0.42323529720306396


  0%|          | 0/30 [00:00<?, ?it/s]

Factor : 100, iteration : 30, Inner_val : 0.41241323947906494


  0%|          | 0/40 [00:00<?, ?it/s]

Factor : 100, iteration : 40, Inner_val : 0.3909377455711365


  0%|          | 0/50 [00:00<?, ?it/s]

Factor : 100, iteration : 50, Inner_val : 0.4190536439418793


  0%|          | 0/10 [00:00<?, ?it/s]

Factor : 200, iteration : 10, Inner_val : 0.6045154333114624


  0%|          | 0/20 [00:00<?, ?it/s]

Factor : 200, iteration : 20, Inner_val : 0.6208409667015076


  0%|          | 0/30 [00:00<?, ?it/s]

Factor : 200, iteration : 30, Inner_val : 0.6491999626159668


  0%|          | 0/40 [00:00<?, ?it/s]

Factor : 200, iteration : 40, Inner_val : 0.6415066719055176


  0%|          | 0/50 [00:00<?, ?it/s]

Factor : 200, iteration : 50, Inner_val : 0.6518236994743347


  0%|          | 0/10 [00:00<?, ?it/s]

Factor : 300, iteration : 10, Inner_val : 0.7709695100784302


  0%|          | 0/20 [00:00<?, ?it/s]

Factor : 300, iteration : 20, Inner_val : 0.7824692130088806


  0%|          | 0/30 [00:00<?, ?it/s]

Factor : 300, iteration : 30, Inner_val : 0.8110610842704773


  0%|          | 0/40 [00:00<?, ?it/s]

Factor : 300, iteration : 40, Inner_val : 0.7994949817657471


  0%|          | 0/50 [00:00<?, ?it/s]

Factor : 300, iteration : 50, Inner_val : 0.8121340274810791


  0%|          | 0/10 [00:00<?, ?it/s]

Factor : 400, iteration : 10, Inner_val : 0.8397751450538635


  0%|          | 0/20 [00:00<?, ?it/s]

Factor : 400, iteration : 20, Inner_val : 0.8771624565124512


  0%|          | 0/30 [00:00<?, ?it/s]

Factor : 400, iteration : 30, Inner_val : 0.8913618326187134


  0%|          | 0/40 [00:00<?, ?it/s]

Factor : 400, iteration : 40, Inner_val : 0.8971906304359436


  0%|          | 0/50 [00:00<?, ?it/s]

Factor : 400, iteration : 50, Inner_val : 0.8976351022720337


  0%|          | 0/10 [00:00<?, ?it/s]

Factor : 500, iteration : 10, Inner_val : 0.9072051644325256


  0%|          | 0/20 [00:00<?, ?it/s]

Factor : 500, iteration : 20, Inner_val : 0.9194056987762451


  0%|          | 0/30 [00:00<?, ?it/s]

Factor : 500, iteration : 30, Inner_val : 0.9217743873596191


  0%|          | 0/40 [00:00<?, ?it/s]

Factor : 500, iteration : 40, Inner_val : 0.9277904629707336


  0%|          | 0/50 [00:00<?, ?it/s]

Factor : 500, iteration : 50, Inner_val : 0.9263901114463806


  0%|          | 0/10 [00:00<?, ?it/s]

Factor : 600, iteration : 10, Inner_val : 0.9337536692619324


  0%|          | 0/20 [00:00<?, ?it/s]

Factor : 600, iteration : 20, Inner_val : 0.9431213736534119


  0%|          | 0/30 [00:00<?, ?it/s]

Factor : 600, iteration : 30, Inner_val : 0.9508850574493408


  0%|          | 0/40 [00:00<?, ?it/s]

Factor : 600, iteration : 40, Inner_val : 0.950366735458374


  0%|          | 0/50 [00:00<?, ?it/s]

Factor : 600, iteration : 50, Inner_val : 0.9485275149345398


In [23]:
recog_df = pd.DataFrame({'factor' : factor_list, 'iteration' : iter_list, 'inner_val' : inner_val, 'mat_sim' : mat_sim, 'recommend' : rec_list})

In [24]:
recog_df

Unnamed: 0,factor,iteration,inner_val,mat_sim,recommend
0,100,10,0.417638,"[matrix, the (1999), terminator 2: judgment da...","[silence of the lambs, the (1991), saving priv..."
1,100,20,0.423235,"[matrix, the (1999), terminator 2: judgment da...","[silence of the lambs, the (1991), saving priv..."
2,100,30,0.412413,"[matrix, the (1999), terminator 2: judgment da...","[silence of the lambs, the (1991), schindler's..."
3,100,40,0.390938,"[matrix, the (1999), terminator 2: judgment da...","[silence of the lambs, the (1991), saving priv..."
4,100,50,0.419054,"[matrix, the (1999), terminator 2: judgment da...","[silence of the lambs, the (1991), schindler's..."
5,200,10,0.604515,"[matrix, the (1999), terminator 2: judgment da...","[jerry maguire (1996), apollo 13 (1995), pulp ..."
6,200,20,0.620841,"[matrix, the (1999), terminator 2: judgment da...","[apollo 13 (1995), jerry maguire (1996), termi..."
7,200,30,0.6492,"[matrix, the (1999), terminator 2: judgment da...","[jerry maguire (1996), apollo 13 (1995), termi..."
8,200,40,0.641507,"[matrix, the (1999), terminator 2: judgment da...","[jerry maguire (1996), apollo 13 (1995), termi..."
9,200,50,0.651824,"[matrix, the (1999), terminator 2: judgment da...","[apollo 13 (1995), jerry maguire (1996), termi..."


In [25]:
recog_df.to_csv('exp.csv')

In [26]:
df = pd.read_csv('./exp.csv')

In [27]:
df

Unnamed: 0.1,Unnamed: 0,factor,iteration,inner_val,mat_sim,recommend
0,0,100,10,0.417638,"['matrix, the (1999)', 'terminator 2: judgment...","['silence of the lambs, the (1991)', 'saving p..."
1,1,100,20,0.423235,"['matrix, the (1999)', 'terminator 2: judgment...","['silence of the lambs, the (1991)', 'saving p..."
2,2,100,30,0.412413,"['matrix, the (1999)', 'terminator 2: judgment...","['silence of the lambs, the (1991)', ""schindle..."
3,3,100,40,0.390938,"['matrix, the (1999)', 'terminator 2: judgment...","['silence of the lambs, the (1991)', 'saving p..."
4,4,100,50,0.419054,"['matrix, the (1999)', 'terminator 2: judgment...","['silence of the lambs, the (1991)', ""schindle..."
5,5,200,10,0.604515,"['matrix, the (1999)', 'terminator 2: judgment...","['jerry maguire (1996)', 'apollo 13 (1995)', '..."
6,6,200,20,0.620841,"['matrix, the (1999)', 'terminator 2: judgment...","['apollo 13 (1995)', 'jerry maguire (1996)', '..."
7,7,200,30,0.6492,"['matrix, the (1999)', 'terminator 2: judgment...","['jerry maguire (1996)', 'apollo 13 (1995)', '..."
8,8,200,40,0.641507,"['matrix, the (1999)', 'terminator 2: judgment...","['jerry maguire (1996)', 'apollo 13 (1995)', '..."
9,9,200,50,0.651824,"['matrix, the (1999)', 'terminator 2: judgment...","['apollo 13 (1995)', 'jerry maguire (1996)', '..."


In [28]:
for i in range(df.shape[0]):
    print('=' * 50)
    print('Factor : {}, Iteration : {}, Inner_Val : {}'.format(df.loc[i, 'factor'], df.loc[i, 'iteration'], df.loc[i, 'inner_val']))
    print('=' * 50)
    print('- 매트릭스와 유사한 영화')
    print('=' * 50)
    print(df.loc[i, 'mat_sim'])
    print('=' * 50)
    print('- 추천 영화 받기')
    print('=' * 50)
    print(df.loc[i, 'recommend'])
    print('=' * 50)
    print('')

Factor : 100, Iteration : 10, Inner_Val : 0.41763848
- 매트릭스와 유사한 영화
['matrix, the (1999)', 'terminator 2: judgment day (1991)', 'total recall (1990)', 'fugitive, the (1993)', 'terminator, the (1984)', 'fifth element, the (1997)', 'jurassic park (1993)', 'face/off (1997)', 'men in black (1997)', 'star wars: episode iv - a new hope (1977)']
- 추천 영화 받기
['silence of the lambs, the (1991)', 'saving private ryan (1998)', "schindler's list (1993)", 'jerry maguire (1996)', 'apollo 13 (1995)', 'pulp fiction (1994)', 'braveheart (1995)', 'fargo (1996)', 'terminator 2: judgment day (1991)', 'sixth sense, the (1999)', 'goodfellas (1990)', 'dead man walking (1995)', 'fugitive, the (1993)', 'usual suspects, the (1995)', 'raiders of the lost ark (1981)', 'jurassic park (1993)', 'forrest gump (1994)', 'thelma & louise (1991)', "mr. holland's opus (1995)", 'scent of a woman (1992)']

Factor : 100, Iteration : 20, Inner_Val : 0.4232353
- 매트릭스와 유사한 영화
['matrix, the (1999)', 'terminator 2: judgment day (1

## 회고

#### 내적수치가 0.43인 경우, 추천영화가 나와 잘 맞았다. 내적수치가 0.95인 경우 내 취향은 아니었다. 따라서, 이 내적수치가 절대적인 기준점이 될 수 없음을 확인하였다.

#### 개인적인 추측으로는 내적수치가 기준이 되어서 내적수치가 높아질수록 추천의 정확도가 올라갔으면 좋았을 것 같다. 이것이 과연 단순히 모른다고 끝내도 되는 일인지 아니면 CSR_matrix에 들어가는 요소들이 올바르지 않아서인지는 살펴볼 계획이다.

#### 내적수치가 성능을 나타내는 지표가 될 수 없는 이러한 문제를 해결할 때는 grid search를 통해서 적절한 값을 선택해야 할 것 같다.

#### 더 좋은 성능의 추천시스템 모델이 존재하는지 찾아볼 계획이다.