In [1]:
import os
import pandas as pd
import numpy as np

## 1. 데이터 불러오기 및 전처리

In [2]:
rating_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'ratings', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,ratings,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
ratings = ratings[ratings['ratings']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [4]:
ratings.rename(columns={'ratings':'counts'}, inplace=True)

In [5]:
ratings['counts']

0          5
1          3
2          3
3          4
4          5
          ..
1000203    3
1000205    5
1000206    5
1000207    4
1000208    4
Name: counts, Length: 836478, dtype: int64

In [6]:
movie_file_path=os.getenv('HOME') + '/aiffel/recommendata_iu/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [7]:
ratings = pd.merge(ratings, movies)
ratings = ratings[["user_id", "movie_id", "title", "counts"]]
ratings.sort_values('user_id')

Unnamed: 0,user_id,movie_id,title,counts
0,1,1193,One Flew Over the Cuckoo's Nest (1975),5
31113,1,2294,Antz (1998),4
31674,1,3186,"Girl, Interrupted (1999)",4
32044,1,1566,Hercules (1997),4
32415,1,588,Aladdin (1992),4
...,...,...,...,...
657728,6040,334,Vanya on 42nd Street (1994),4
393446,6040,1294,M*A*S*H (1970),4
253075,6040,994,Big Night (1996),3
127665,6040,2396,Shakespeare in Love (1998),3


## 2. 분석

In [8]:
print('ratings에 있는 유니크한 영화 개수 : {}'.format(ratings['movie_id'].nunique()))
print('ratings에 있는 유니크한 사용자 수 : {}'.format(ratings['user_id'].nunique()))

ratings에 있는 유니크한 영화 개수 : 3628
ratings에 있는 유니크한 사용자 수 : 6039


In [9]:
movie_count = ratings.groupby('title')['user_id'].count()
movie_count.sort_values(ascending=False).head(30)

title
American Beauty (1999)                                   3211
Star Wars: Episode IV - A New Hope (1977)                2910
Star Wars: Episode V - The Empire Strikes Back (1980)    2885
Star Wars: Episode VI - Return of the Jedi (1983)        2716
Saving Private Ryan (1998)                               2561
Terminator 2: Judgment Day (1991)                        2509
Silence of the Lambs, The (1991)                         2498
Raiders of the Lost Ark (1981)                           2473
Back to the Future (1985)                                2460
Matrix, The (1999)                                       2434
Jurassic Park (1993)                                     2413
Sixth Sense, The (1999)                                  2385
Fargo (1996)                                             2371
Braveheart (1995)                                        2314
Men in Black (1997)                                      2297
Schindler's List (1993)                                  2257
Pr

## 3. 선호하는 영화

In [10]:
my_movie_genre = movies.loc[movies['genre'].str.contains('Romance')].copy()
my_movie_genre['title'].unique()

array(['Grumpier Old Men (1995)', 'Sabrina (1995)',
       'American President, The (1995)', 'Cutthroat Island (1995)',
       'Sense and Sensibility (1995)', 'Leaving Las Vegas (1995)',
       'Persuasion (1995)', 'Wings of Courage (1995)',
       'Carrington (1995)', 'Clueless (1995)',
       'How to Make an American Quilt (1995)', 'Pocahontas (1995)',
       'When Night Is Falling (1995)', 'Postino, Il (The Postman) (1994)',
       'Two if by Sea (1996)', 'French Twist (Gazon maudit) (1995)',
       'Bed of Roses (1996)',
       "Things to Do in Denver when You're Dead (1995)",
       'Angels and Insects (1995)', 'Vampire in Brooklyn (1995)',
       'Bridges of Madison County, The (1995)', 'If Lucy Fell (1996)',
       'Boomerang (1992)', 'Chungking Express (1994)',
       'Pie in the Sky (1995)', 'Frankie Starlight (1995)',
       'Up Close and Personal (1996)', 'Rob Roy (1995)',
       'Desperado (1995)', 'First Knight (1995)', 'Mad Love (1995)',
       'Moonlight and Valentino (1

In [11]:
my_movie_title = ['Here on Earth (2000)',
                'Place in the Sun, A (1951)',
                'Couch in New York, A (1996)',
                'Love & Sex (2000)',
                'Breathless (1983)']
my_movies = pd.DataFrame({'user_id': ['6039']*5, 'title': my_movie_title, 
                        'movie_id': movies[movies['title'].isin(my_movie_title)]['movie_id'], 'counts':[5]*5})
my_movies

Unnamed: 0,user_id,title,movie_id,counts
3384,6039,Here on Earth (2000),3453,5
3406,6039,"Place in the Sun, A (1951)",3475,5
3515,6039,"Couch in New York, A (1996)",3584,5
3804,6039,Love & Sex (2000),3874,5
3815,6039,Breathless (1983),3885,5


In [14]:
if not ratings.isin({'user_id':['6039']})['user_id'].any():
    ratings = ratings.append(my_movies, ignore_index=True)

ratings.tail(10)

Unnamed: 0,user_id,movie_id,title,counts
836473,5851,3607,One Little Indian (1973),5
836474,5854,3026,Slaughterhouse (1987),4
836475,5854,690,"Promise, The (Versprechen, Das) (1994)",3
836476,5938,2909,"Five Wives, Three Secretaries and Me (1998)",4
836477,5948,1360,Identification of a Woman (Identificazione di ...,5
836478,6039,3453,Here on Earth (2000),5
836479,6039,3475,"Place in the Sun, A (1951)",5
836480,6039,3584,"Couch in New York, A (1996)",5
836481,6039,3874,Love & Sex (2000),5
836482,6039,3885,Breathless (1983),5


In [15]:
user_unique = ratings['user_id'].unique()
movie_unique = ratings['title'].unique()

user_to_idx = {v:k for k,v in enumerate(user_unique)}
movie_to_idx = {v:k for k,v in enumerate(movie_unique)}

In [16]:
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
if len(temp_user_data) == len(ratings):
    print('user_id column indexing OK!!')
    ratings['user_id'] = temp_user_data
else:
    print('user_id column indexing Fail!!')

temp_movie_data = ratings['title'].map(movie_to_idx.get).dropna()
if len(temp_movie_data) == len(ratings):
    print('movie_id column indexing OK!!')
    ratings['movie_id'] = temp_movie_data
else:
    print('movie_id column indexing Fail!!')

ratings

user_id column indexing OK!!
movie_id column indexing OK!!


Unnamed: 0,user_id,movie_id,title,counts
0,0,0,One Flew Over the Cuckoo's Nest (1975),5
1,1,0,One Flew Over the Cuckoo's Nest (1975),5
2,2,0,One Flew Over the Cuckoo's Nest (1975),4
3,3,0,One Flew Over the Cuckoo's Nest (1975),4
4,4,0,One Flew Over the Cuckoo's Nest (1975),5
...,...,...,...,...
836478,6039,2603,Here on Earth (2000),5
836479,6039,2491,"Place in the Sun, A (1951)",5
836480,6039,1567,"Couch in New York, A (1996)",5
836481,6039,3274,Love & Sex (2000),5


## 4. CSR Matrix

In [17]:
from scipy.sparse import csr_matrix

num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

csr_data = csr_matrix((ratings['counts'], (ratings['user_id'], ratings['movie_id'])), shape=(num_user, num_movie))
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Row format>

## 5. 모델 생성 및 훈련

In [18]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [19]:
als_model = AlternatingLeastSquares(factors=2000, regularization=0.01, use_gpu=False, iterations=50, dtype=np.float32)

In [20]:
csr_data_transpose = csr_data.T
csr_data_transpose

<3628x6040 sparse matrix of type '<class 'numpy.int64'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [21]:
als_model.fit(csr_data_transpose)

  0%|          | 0/50 [00:00<?, ?it/s]

## 6. 나의 선호도 파악

In [22]:
how, breathless = user_to_idx[6039], movie_to_idx['Breathless (1983)']
how_vector, breathless_vector = als_model.user_factors[how], als_model.item_factors[breathless]

In [23]:
how_vector

array([-1.0513887 , -0.30856252,  0.37702554, ...,  0.5163862 ,
        0.09233084,  0.00332553], dtype=float32)

In [24]:
breathless_vector

array([-0.00828305,  0.00283687,  0.00706048, ..., -0.00178244,
       -0.00069456,  0.00900057], dtype=float32)

In [25]:
np.dot(how_vector, breathless_vector)

0.023155937

## 7. 내가 좋아하는 영화와 비슷한 영화 추천

In [28]:
my_movie = 'Breathless (1983)'
movie_id = movie_to_idx['Breathless (1983)']
similar_movie = als_model.similar_items(movie_id, N=15)
similar_movie

[(1232, 0.9999998),
 (3241, 0.6221039),
 (3552, 0.62149143),
 (3525, 0.620972),
 (3242, 0.62020826),
 (3246, 0.6200816),
 (3587, 0.6192811),
 (3383, 0.6191497),
 (3551, 0.61824864),
 (3620, 0.6175055),
 (3520, 0.6174341),
 (3035, 0.61676586),
 (3471, 0.6167414),
 (3572, 0.61673516),
 (2960, 0.6162924)]

In [29]:
idx_to_title = {v:k for k,v in movie_to_idx.items()}
[idx_to_title[i[0]] for i in similar_movie]

['Breathless (1983)',
 'American Strays (1996)',
 'Relative Fear (1994)',
 'Penitentiary II (1982)',
 'Boy Called Hate, A (1995)',
 'Brothers in Trouble (1995)',
 'Shopping (1994)',
 'Little Men (1998)',
 'Foreign Student (1994)',
 'Modulations (1998)',
 'Rhyme & Reason (1997)',
 'For Ever Mozart (1996)',
 '3 Ninjas: High Noon On Mega Mountain (1998)',
 "I Don't Want to Talk About It (De eso no se habla) (1993)",
 'Nightmares (1983)']

In [30]:
def get_similar_movie(movie_title: str):
    movie_id = movie_to_idx[movie_title]
    similar_movie = als_model.similar_items(movie_id)
    similar_movie = [idx_to_title[i[0]] for i in similar_movie]
    return similar_movie

In [33]:
get_similar_movie('American Strays (1996)')

['American Strays (1996)',
 'Boy Called Hate, A (1995)',
 'Kicked in the Head (1997)',
 'Male and Female (1919)',
 "I Don't Want to Talk About It (De eso no se habla) (1993)",
 "Brother's Kiss, A (1997)",
 'Neon Bible, The (1995)',
 'War at Home, The (1996)',
 'Number Seventeen (1932)',
 'Relative Fear (1994)']

## 8. 좋아할만한 영화 추천

In [34]:
user = user_to_idx[6039]

movie_recommended = als_model.recommend(user, csr_data, N=20, filter_already_liked_items=True)
movie_recommended

[(2889, 0.1636061),
 (1951, 0.1520332),
 (2306, 0.14124379),
 (2582, 0.11640931),
 (1920, 0.114588566),
 (1342, 0.11115833),
 (1455, 0.110237464),
 (627, 0.109456874),
 (3291, 0.10658172),
 (2430, 0.101728216),
 (2944, 0.09941157),
 (3144, 0.09717184),
 (658, 0.09472695),
 (2040, 0.091856495),
 (1954, 0.09061955),
 (1866, 0.090391494),
 (3163, 0.08797023),
 (2697, 0.08772649),
 (2832, 0.08756846),
 (905, 0.08718266)]

In [35]:
[idx_to_title[i[0]] for i in movie_recommended]

['Damsel in Distress, A (1937)',
 'Broadway Melody, The (1929)',
 'Great Ziegfeld, The (1936)',
 'Harmonists, The (1997)',
 'Gilda (1946)',
 'My Man Godfrey (1957)',
 'Holiday Inn (1942)',
 'March of the Wooden Soldiers (a.k.a. Laurel & Hardy in Toyland) (1934)',
 'Battling Butler (1926)',
 'Sanjuro (1962)',
 'Trouble in Paradise (1932)',
 'Vibes (1988)',
 'White Christmas (1954)',
 'Bhaji on the Beach (1993)',
 'Henry Fool (1997)',
 'Trip to Bountiful, The (1985)',
 'Stage Fright (1950)',
 'Little Princess, The (1939)',
 'Bridge at Remagen, The (1969)',
 'Murder, My Sweet (1944)']

## 9. 기여도 확인

In [41]:
movie_id = movie_to_idx['Henry Fool (1997)']

explain = als_model.explain(user, csr_data, itemid=movie_id)
[(idx_to_title[i[0]], i[1]) for i in explain[1]]

[("You Can't Take It With You (1938)", 0.05419891972320865),
 ('Wings of Desire (Der Himmel über Berlin) (1987)', 0.023824934099730785),
 ('Shall We Dance? (1937)', 0.01769335214770872),
 ('Muppet Treasure Island (1996)', 0.016953140373815675),
 ('Inspector General, The (1949)', 0.015346675365114935),
 ('Victor/Victoria (1982)', 0.01153695075429589),
 ('Arsenic and Old Lace (1944)', 0.009288890767776791),
 ('Around the World in 80 Days (1956)', 0.008886366585718949),
 ('Cinderella (1950)', 0.008839944093450618),
 ('To Catch a Thief (1955)', 0.008009798797283953)]

## 마치며
* 정확도가 그렇게 높지는 않음.
     + 여러 개의 영화를 넣고 돌려봤을 때, 첫 번째의 영화는 대략 0.04 ~ 0.06 정도의 유사도를 보야주고 있음.
* 내가 좋아하는 영화와 추천하는 영화가 비슷한지는 미지수.