These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 
made by 6,040 MovieLens users who joined MovieLens in 2000.

In [1]:
import os
import numpy as np
import pandas as pd

rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python')
orginal_data_size = len(ratings)
ratings.head(10)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291
5,1,1197,3,978302268
6,1,1287,5,978302039
7,1,2804,5,978300719
8,1,594,4,978302268
9,1,919,4,978301368


A timestamp is a sequence of characters or encoded information identifying when a certain event occurred, usually giving date and time of day, sometimes accurate to a small fraction of a second

In [2]:
# Leave ratings >=3
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
ratings.nunique()

user_id        6039
movie_id       3628
rating            3
timestamp    412911
dtype: int64

In [4]:
# Change column names into 'counts'
ratings.rename(columns={'rating':'counts'}, inplace=True)

In [5]:
# Retrieve metadata to see titles.
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
movie_counts = ratings.groupby('movie_id')['user_id'].count()
movie_counts.sort_values(ascending=False).head(30)
print(len(movies.index)) # 3883 due to null samples

3883


In [7]:
my_favourite = ['Toy Story (1995)', 'Jumanji (1995)', 'Grumpier Old Men (1995)', 'Waiting to Exhale (1995)', 'Father of the Bride Part II (1995)']
my_favourite_nums = [1, 2, 3, 4, 5]
my_playlist = pd.DataFrame({'user_id': ['Hwijun']*5, 'movie_id': my_favourite_nums, 'counts':[5]*5})

if not ratings.isin({'user_id':['Hwijun']})['user_id'].any():  # If 'Hwijun' does not exist in 'user_id'
    ratings = ratings.append(my_playlist)     # add to my_favourite

ratings.tail(10)       # See if it's done correctly.

Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6040,1090,3,956715518.0
1000205,6040,1094,5,956704887.0
1000206,6040,562,5,956704746.0
1000207,6040,1096,4,956715648.0
1000208,6040,1097,4,956715569.0
0,Hwijun,1,5,
1,Hwijun,2,5,
2,Hwijun,3,5,
3,Hwijun,4,5,
4,Hwijun,5,5,


In [8]:
# Code to find unique user/movie data
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()
print(user_unique, movie_unique)
# Code for indexing user/movie data
user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

[1 2 3 ... 6039 6040 'Hwijun'] [1193  661  914 ...  690 2909 1360]


In [9]:
# See if indexing is done good. 
print(user_to_idx['Hwijun'])     
print(movie_to_idx[1])

6039
40


In [10]:
# Code to change data inside the column via indexing 
# dropna() removes undesirable rows. 
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):   # If every row is indexed normally
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data   # shift data['user_id'] into indexed Series. 
else:
    print('user_id column indexing Fail!!')

# movie column is indexed in the same way via movie_to_idx. 
temp_artist_data = ratings['movie_id'].map(movie_to_idx.get).dropna()
if len(temp_artist_data) == len(ratings):
    print('movie column indexing OK!!')
    ratings['movie_id'] = temp_artist_data
else:
    print('movie column indexing Fail!!')

ratings.tail(10)

user_id column indexing OK!!
movie column indexing OK!!


Unnamed: 0,user_id,movie_id,counts,timestamp
1000203,6038,1030,3,956715518.0
1000205,6038,986,5,956704887.0
1000206,6038,311,5,956704746.0
1000207,6038,142,4,956715648.0
1000208,6038,26,4,956715569.0
0,6039,40,5,
1,6039,513,5,
2,6039,1862,5,
3,6039,397,5,
4,6039,1180,5,


## CSR Matrix

In [11]:
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

csr_data = csr_matrix((ratings.counts, (ratings.user_id, ratings.movie_id)), shape= (num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

## Train Matrix Fatorisation model

In [12]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

AlternatingLeastSquares 클래스의 __init__ 파라미터를 살펴보겠습니다.    
1. factors : 유저와 아이템의 벡터를 몇 차원으로 할 것인지 2. regularization : 과적합을 방지하기 위해 정규화 값을 얼마나 사용할 것인지 3. use_gpu : GPU를 사용할 것인지 4. iterations : epochs와 같은 의미입니다. 데이터를 몇 번 반복해서 학습할 것인지

1,4를 늘릴수록 학습데이터를 잘 학습하게 되지만 과적합의 우려가 있으니 좋은 값을 찾아야 합니다.

In [13]:
# Implicit AlternatingLeastSquares model
als_model = AlternatingLeastSquares(factors=1000, regularization=0.01, use_gpu=False, iterations=500, dtype=np.float32)

In [14]:
# Since als model takes input of item x user form, the maxrix is transposed.
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [15]:
# Train model
als_model.fit(csr_data_transpose)

HBox(children=(FloatProgress(value=0.0, max=500.0), HTML(value='')))




In [16]:
Hwijun, Toy_Story = user_to_idx['Hwijun'], movie_to_idx[1]
Hwijun_vector, Toy_Story_vector = als_model.user_factors[Hwijun], als_model.item_factors[Toy_Story]


## Predict

In [17]:
# Code to dot-product Hwijun with Toy_Story_vector
np.dot(Hwijun_vector, Toy_Story_vector)

0.98180616

In [18]:
Jumanji = movie_to_idx[2]
Jumanji_vector = als_model.item_factors[Jumanji]
np.dot(Hwijun_vector, Jumanji_vector)

0.90117997

## Find similar film + recommend to users

In [19]:
favorite_movie = 1
film_id = movie_to_idx[favorite_movie]
similar_movie = als_model.similar_items(film_id, N=15)
similar_movie

[(40, 1.4103717),
 (50, 0.2231927),
 (3589, 0.19972041),
 (2938, 0.15846212),
 (33, 0.14913622),
 (3385, 0.1480787),
 (4, 0.14478391),
 (2739, 0.13890174),
 (2970, 0.13881788),
 (3000, 0.13325748),
 (1492, 0.13146798),
 (322, 0.13061464),
 (3266, 0.12970476),
 (2633, 0.1291978),
 (3107, 0.12816566)]

In [50]:
# Flipping movie_to_idx, it creates a dictionary which receives movie id from index.
idx_to_movie = {v:k for k,v in movie_to_idx.items()}
[idx_to_movie[i[0]] for i in similar_movie]

[1,
 3114,
 3290,
 106,
 588,
 3817,
 2355,
 3562,
 3945,
 1312,
 2774,
 34,
 1324,
 2630,
 2892]

In [71]:
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
movie_cols = ['movie_id', 'title', 'genre']
films = pd.read_csv(movie_file_path, sep='::', names=movie_cols, engine='python')
orginal_data_size = len(films)
films.head(10)

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children's
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [76]:
def get_similar_movie(movie_code: int):
    movie_id = movie_to_idx[movie_code]
    similar_movie = als_model.similar_items(movie_id) # film_id = movie_to_idx[favorite_movie]
    similar_movie = [idx_to_movie[i[0]] for i in similar_movie]
    for idx, item in enumerate(similar_movie):
        similar_movie[idx] = films.loc[films['movie_id']==item,'title']
    print('Genre: ', films.loc[movie_code, 'genre'])
    return similar_movie


In [77]:
get_similar_movie(50)

Genre:  Action|Drama|Thriller


[49    Usual Suspects, The (1995)
 Name: title, dtype: object,
 1073    Reservoir Dogs (1992)
 Name: title, dtype: object,
 1575    L.A. Confidential (1997)
 Name: title, dtype: object,
 526    Second Best (1994)
 Name: title, dtype: object,
 604    Fargo (1996)
 Name: title, dtype: object,
 46    Seven (Se7en) (1995)
 Name: title, dtype: object,
 2604    Eternity and a Day (Mia eoniotita ke mia mera ...
 Name: title, dtype: object,
 660    All Things Fair (1996)
 Name: title, dtype: object,
 83    Last Summer in the Hamptons (1995)
 Name: title, dtype: object,
 619    Condition Red (1995)
 Name: title, dtype: object]

In [78]:
get_similar_movie(600)

Genre:  Documentary


[596    Love and a .45 (1994)
 Name: title, dtype: object,
 2523    Joyriders, The (1999)
 Name: title, dtype: object,
 3777    Ilsa, She Wolf of the SS (1974)
 Name: title, dtype: object,
 96    Shopping (1994)
 Name: title, dtype: object,
 150    Addiction, The (1995)
 Name: title, dtype: object,
 3403    Horror Hotel (a.k.a. The City of the Dead) (1960)
 Name: title, dtype: object,
 1098    Funeral, The (1996)
 Name: title, dtype: object,
 517    Romeo Is Bleeding (1993)
 Name: title, dtype: object,
 101    Unforgettable (1996)
 Name: title, dtype: object,
 1343    Zero Kelvin (Kj�rlighetens kj�tere) (1995)
 Name: title, dtype: object]

In [79]:
user = user_to_idx['Hwijun']
# 'recommend'takes user*item form CSR Matrix.
movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(1045, 0.209936),
 (1511, 0.1891703),
 (1165, 0.14502376),
 (1186, 0.14014792),
 (1207, 0.13254151),
 (560, 0.12889057),
 (1586, 0.12768185),
 (285, 0.1235433),
 (610, 0.1229843),
 (530, 0.119756624),
 (1671, 0.11141665),
 (1556, 0.11061254),
 (770, 0.11052165),
 (2046, 0.11051899),
 (2452, 0.10693722),
 (1409, 0.10451311),
 (392, 0.10382391),
 (934, 0.10220136),
 (398, 0.101915196),
 (1649, 0.100624286)]

In [80]:
[idx_to_movie[i[0]] for i in movie_recommended]

[3450,
 1353,
 2253,
 236,
 276,
 186,
 3248,
 1513,
 2693,
 1918,
 3484,
 1064,
 434,
 3324,
 1126,
 2467,
 2702,
 222,
 1621,
 372]

In [81]:
what_movie = movie_to_idx[3450]
explain = als_model.explain(user, csr_data, itemid=39)

In [82]:
[(idx_to_movie[i[0]], i[1]) for i in explain[1]]

[(5, 0.014660119072697835),
 (2, 0.010647938257443788),
 (1, 0.0010763594368420112),
 (4, -0.0018733691674736072),
 (3, -0.0032938185424454675)]