## Imports

In [1]:
import os
import scipy
import implicit
import numpy as np
import pandas as pd

from scipy.sparse import csr_matrix
from implicit.als import AlternatingLeastSquares

## Preparation and Preprocessing

In [2]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
# ratings 컬럼의 이름을 counts로 바꿉니다.
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [5]:
ratings

Unnamed: 0,user_id,movie_id,counts,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
...,...,...,...,...
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648


In [6]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## EDA

In [7]:
uni_user = ratings['user_id'].nunique()
uni_movie = ratings['movie_id'].nunique()

print(f' # of unique users : {uni_user} ')
print(f' # of unique movies : {uni_movie} ')

 # of unique users : 6039 
 # of unique movies : 3628 


In [8]:
ratings = pd.merge(movies, ratings)
using_cols = [ 'user_id','title', 'counts']
ratings = ratings[using_cols]

In [9]:
# 30 Most Popular Movies
artist_count = ratings.groupby('title')['counts'].count()
temp1 = artist_count.sort_values(ascending=False).head(30)
temp2 = pd.DataFrame(temp1)
results = temp2.reset_index()
results.head()

Unnamed: 0,title,counts
0,American Beauty (1999),3211
1,Star Wars: Episode IV - A New Hope (1977),2910
2,Star Wars: Episode V - The Empire Strikes Back...,2885
3,Star Wars: Episode VI - Return of the Jedi (1983),2716
4,Saving Private Ryan (1998),2561


## Configuration

In [10]:
my_favorite = ['Matrix, The (1999)' , 'Terminator 2: Judgment Day (1991)' ,'Terminator, The (1984)' ,'Back to the Future (1985)' ,'Back to the Future Part II (1989)']

my_movies = pd.DataFrame({'user_id': [6041]*5, 'title': my_favorite, 'counts':[5]*5})

if not ratings.isin({'user_id':[6041]})['user_id'].any():
    ratings = ratings.append(my_movies) 

ratings.tail(10)

Unnamed: 0,user_id,title,counts
836473,5682,"Contender, The (2000)",3
836474,5812,"Contender, The (2000)",4
836475,5831,"Contender, The (2000)",3
836476,5837,"Contender, The (2000)",4
836477,5998,"Contender, The (2000)",4
0,6041,"Matrix, The (1999)",5
1,6041,Terminator 2: Judgment Day (1991),5
2,6041,"Terminator, The (1984)",5
3,6041,Back to the Future (1985),5
4,6041,Back to the Future Part II (1989),5


In [11]:
user_unique = ratings['user_id'].unique()
title_unique = ratings['title'].unique()

user_to_idx = {v:k for k,v in enumerate(user_unique)}
title_to_idx = {v:k for k,v in enumerate(title_unique)}

In [12]:
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # 모든 row가 정상적으로 인덱싱되었다면
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # data['user_id']을 인덱싱된 Series로 교체해 줍니다. 
else:
    print('user_id column indexing Fail!!')


    
temp_title_data = ratings['title'].map(title_to_idx.get).dropna()
if len(temp_title_data) == len(ratings):
    print('title column indexing OK!!')
    ratings['title'] = temp_title_data
else:
    print('title column indexing Fail!!')
ratings

user_id column indexing OK!!
title column indexing OK!!


Unnamed: 0,user_id,title,counts
0,0,0,5
1,1,0,4
2,2,0,4
3,3,0,5
4,4,0,5
...,...,...,...
0,6039,2325,5
1,6039,569,5
2,6039,1122,5
3,6039,1152,5


## CSR matrix

In [13]:
num_user = ratings['user_id'].nunique()
num_title = ratings['title'].nunique()

print(f' # of unique users : {num_user} ')
print(f' # of unique movies : {num_title} ')

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.title)), shape= (num_user, num_title))
csr_data

 # of unique users : 6040 
 # of unique movies : 3628 


<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

## MF model learning

In [14]:
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [15]:
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [16]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [17]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

In [18]:
my_index, matrix_index = user_to_idx[6041], title_to_idx['Matrix, The (1999)']
my_vector, matrix_vector = als_model.user_factors[my_index], als_model.item_factors[matrix_index]

## My preference

#### For "Matrix (1999)"

In [19]:
np.dot(my_vector, matrix_vector)

0.64945704

#### For "American Beauty (1999)"

In [20]:
american_beauti_vector = als_model.item_factors[3211]
np.dot(my_vector, american_beauti_vector)

0.004925551

## Recommend a movie that is similar to my favorite movie

In [21]:
idx_to_title = {v:k for k,v in title_to_idx.items()}

favorite_movie = title_to_idx['Matrix, The (1999)']

temp1 = als_model.similar_items(favorite_movie, N=15)
temp2 = [ (idx_to_title[i[0]], i[1]) for i in temp1]
similarity_df = pd.DataFrame(temp2, columns = ['title', 'similarity'])
similarity_df
#similarity_df = pd.merge(movies, temp1, on = 'movie_id')
#similarity_df[['title', 'similarity']].sort_values(ascending=False, by = 'similarity')

Unnamed: 0,title,similarity
0,"Matrix, The (1999)",1.0
1,Terminator 2: Judgment Day (1991),0.787006
2,Total Recall (1990),0.668307
3,"Fugitive, The (1993)",0.584623
4,"Terminator, The (1984)",0.574977
5,Jurassic Park (1993),0.557149
6,"Fifth Element, The (1997)",0.527218
7,Face/Off (1997),0.520345
8,Men in Black (1997),0.487031
9,Twelve Monkeys (1995),0.461051


## Recommend my favorite movies

In [22]:
user = user_to_idx[6041]

temp1 = als_model.recommend(user, csr_data, N=15, filter_already_liked_items=True)

temp2 = [ (idx_to_title[i[0]], i[1]) for i in temp1]
recommend_df = pd.DataFrame(temp2, columns = ['title', 'recommend_score'])
recommend_df

Unnamed: 0,title,recommend_score
0,Total Recall (1990),0.41525
1,Aliens (1986),0.388554
2,Jurassic Park (1993),0.384662
3,Star Wars: Episode V - The Empire Strikes Back...,0.374958
4,Alien (1979),0.363311
5,Twelve Monkeys (1995),0.354869
6,Star Wars: Episode IV - A New Hope (1977),0.344625
7,Star Wars: Episode VI - Return of the Jedi (1983),0.323031
8,Blade Runner (1982),0.320965
9,Face/Off (1997),0.320624


## Discussion

1. CSR matrix가 정상적으로 만들어졌다.  
 - **사용자와 아이템 개수를 바탕으로 정확한 사이즈로 만들었다.**  
 
 | **Raw** | **CSR matrix** |  
 | --- | --- |  
 | ![image-3.png](attachment:image-3.png) | ![image-4.png](attachment:image-4.png) |  
 
**'나의 계정'을 추가했으므로 # of user가 1개 증가된 결과를 확인 할 수 있다.**  

2. MF 모델이 정상적으로 훈련되어 그럴듯한 추천이 이루어졌다.  
 - **사용자와 아이템 벡터 내적수치가 의미있게 형성되었다.**  
 
 | **Matrix (1999)** | **American Beauty (1999)**|  
 | --- | --- |  
 | 0.649312 | 0.0035097427 |  
 
**내가 좋아하는 영화 리스트에 속한 Matrix와 속하지 않은 American Beauty에 대한 나의 선호도(내적수치)는 확실히 차이가 나는 것을 확인 할 수 있다.**

3. 비슷한 영화 찾기와 유저에게 추천하기의 과정이 정상적으로 진행되었다.  
 - **MF모델이 예측한 유저 선호도 및 아이템간 유사도, 기여도가 의미있게 측정되었다.**
 
 | **Finding a Similar Movie** | **Recommend to users** |  
 | --- | --- |  
 | ![image-7.png](attachment:image-7.png) | ![image-8.png](attachment:image-8.png) |  
 
**내가 좋아하는 영화(Matrix, Terminator 2, Terminator, Back to the Future, Back to the Future Part 2)와  
유사한 영화를 추천받았을 때는 내가 익히 알던 영화들이 추천되었으며,  
추천 받은 영화 중에서는 12몽키즈를 제외하고 모든 영화를 시청했기 때문에,  
사용자인 내가 좋아하는 장르의 훌륭한 작품들을 소개 받아 유의미하고 만족한 결과를 얻었다.**

## Reference

1. https://github.com/PEBpung/Aiffel/blob/master/Project/Exploration/E7.%20%EC%98%81%ED%99%94%20%EC%B6%94%EC%B2%9C%20%EC%8B%9C%EC%8A%A4%ED%85%9C.ipynb
2. https://github.com/JaeHeee/AIFFEL_Project/blob/master/EXPLORATION/EXPLORATION%207.%20%EC%95%84%EC%9D%B4%EC%9C%A0%ED%8C%AC%EC%9D%B4%20%EC%A2%8B%EC%95%84%ED%95%A0%20%EB%A7%8C%ED%95%9C%20%EB%8B%A4%EB%A5%B8%20%EC%95%84%ED%8B%B0%EC%8A%A4%ED%8A%B8%20%EC%B0%BE%EA%B8%B0.ipynb