# 프로젝트: 영화 추천하기
#### Movielens
* MovieLens 1M Dataset 사용을 권장
* 별점데이터는 explicit 데이터이지만 implicit 데이터로 간주하고 테스트 해볼 수 있다.
* 별점을 시청횟수로 해석해서 생각
* 3점 미만의 별점은 선호하지 않는다고 가정하고 제외

In [172]:
import pandas as pd

## 1. 데이터 준비와 전처리

In [173]:
import os
# Path to the MovieLens 1M ratings file under the user's home directory.
# os.path.expanduser('~') is used instead of os.getenv('HOME') so that an unset
# HOME variable does not end in a `None + str` TypeError.
rating_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'ratings.dat'
)
# The file has no header row and uses '::' as the field separator, so the column
# names are supplied here and the python parsing engine is used.
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(
    rating_file_path, sep='::', names=ratings_cols, engine='python', encoding='ISO-8859-1'
)
# Remember the unfiltered row count so the later filtering step can report a ratio.
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [174]:
# Keep only ratings of 3 or higher.
# .copy() makes the result an independent frame: later cells rename/drop columns
# in place, and doing that on a filtered slice of the original frame is what
# triggers pandas' chained-assignment warnings.
ratings = ratings[ratings['rating'] >= 3].copy()
filtered_data_size = len(ratings)

# Report how much of the data survived the filter.
print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [175]:
# Rename the `rating` column to `count` (the rating is treated as a play count).
ratings.rename({'rating': 'count'}, axis='columns', inplace=True)

In [176]:
# The timestamp column is not used, so remove it.
ratings.drop(columns=['timestamp'], inplace=True)

In [177]:
# Show the first rows of the ratings frame.
ratings.head()

Unnamed: 0,user_id,movie_id,count
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [178]:
# Load the movie title metadata.
# os.path.expanduser('~') is used instead of os.getenv('HOME') so that an unset
# HOME variable does not end in a `None + str` TypeError.
movie_file_path = os.path.join(
    os.path.expanduser('~'), 'aiffel', 'recommendata_iu', 'data', 'ml-1m', 'movies.dat'
)
# No header row in the file; fields are separated by '::'.
cols = ['movie_id', 'title', 'genre']
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


title에 개봉 연도가 함께 붙어 있어서 제목과 개봉 연도를 분리해 주었다.

In [179]:
# Turn "Title (1995)" into "Title _1995" so the year can be split off afterwards.
# regex=False makes '(' and ')' plain text on every pandas version; without it,
# older versions interpret the pattern as a regular expression and emit a warning.
movies['title'] = movies.title.str.replace('(', '_', regex=False)
movies['title'] = movies.title.str.replace(')', '', regex=False)
movies.head()

  """Entry point for launching an IPython kernel.
  


Unnamed: 0,movie_id,title,genre
0,1,Toy Story _1995,Animation|Children's|Comedy
1,2,Jumanji _1995,Adventure|Children's|Fantasy
2,3,Grumpier Old Men _1995,Comedy|Romance
3,4,Waiting to Exhale _1995,Comedy|Drama
4,5,Father of the Bride Part II _1995,Comedy


In [180]:
# Split the "Title _1995" strings into a title and a release year.
# The year is whatever follows the LAST underscore: a title with more than one
# parenthesised part (of the form "A _B _1994") contains several underscores, and
# splitting on the first one would put "B _1994" into release_year.
movies['release_year'] = movies.title.str.rsplit('_', n=1, expand=True)[1].str.strip()
# The title stays the text before the FIRST underscore, exactly as before, so
# title lookups elsewhere in the notebook keep matching the same strings.
movies['title'] = movies.title.str.split('_', n=1, expand=True)[0].str.strip()
movies.head()

Unnamed: 0,movie_id,title,genre,release_year
0,1,Toy Story,Animation|Children's|Comedy,1995
1,2,Jumanji,Adventure|Children's|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama,1995
4,5,Father of the Bride Part II,Comedy,1995


In [181]:
# Helper used later to look up a movie id from a movie title.
def get_movie_id(movie_title):
    """Return the ``movie_id`` of the first row of ``movies`` titled *movie_title*.

    The comparison is an exact string match against the ``title`` column.
    Returns ``None`` when no row matches.
    """
    title_matches = movies['title'] == movie_title
    matching_ids = movies.loc[title_matches, 'movie_id'].values
    if len(matching_ids) > 0:
        return matching_ids[0]
    return None
    
# Helper used later to look up a movie title from a movie id.
def get_movie_title(movie_id):
    """Return the ``title`` of the first row of ``movies`` whose id is *movie_id*.

    Returns ``None`` when no row matches.
    """
    id_matches = movies['movie_id'] == movie_id
    matching_titles = movies.loc[id_matches, 'title'].values
    if len(matching_titles) > 0:
        return matching_titles[0]
    return None

## 2.  EDA

In [182]:
# Number of distinct movies and distinct users present in ratings.
num_movie = ratings['movie_id'].nunique()
num_user = ratings['user_id'].nunique()

# Attach the movie metadata to every rating row.
merge_df = ratings.merge(movies, on='movie_id', how='left')
# Sum `count` per movie, then re-attach the metadata to the per-movie totals.
popular = merge_df.groupby('movie_id', as_index=False)['count'].sum()
popular = popular.merge(movies, on='movie_id', how='left')
# The 30 movies with the largest summed count.
popular.sort_values(by='count', ascending=False).head(30)

Unnamed: 0,movie_id,count,title,genre,release_year
2600,2858,14449,American Beauty,Comedy|Drama,1999
249,260,13178,Star Wars: Episode IV - A New Hope,Action|Adventure|Fantasy|Sci-Fi,1977
1080,1196,12648,Star Wars: Episode V - The Empire Strikes Back,Action|Adventure|Drama|Sci-Fi|War,1980
1810,2028,11348,Saving Private Ryan,Action|Drama|War,1998
1094,1210,11303,Star Wars: Episode VI - Return of the Jedi,Action|Adventure|Romance|Sci-Fi|War,1983
1082,1198,11179,Raiders of the Lost Ark,Action|Adventure,1981
573,593,11096,"Silence of the Lambs, The",Drama|Thriller,1991
2325,2571,10903,"Matrix, The",Action|Sci-Fi|Thriller,1999
2507,2762,10703,"Sixth Sense, The",Thriller,1999
569,589,10513,Terminator 2: Judgment Day,Action|Sci-Fi|Thriller,1991


In [183]:
# Show the first rows of the ratings frame joined with the movie metadata.
merge_df.head()

Unnamed: 0,user_id,movie_id,count,title,genre,release_year
0,1,1193,5,One Flew Over the Cuckoo's Nest,Drama,1975
1,1,661,3,James and the Giant Peach,Animation|Children's|Musical,1996
2,1,914,3,My Fair Lady,Musical|Romance,1964
3,1,3408,4,Erin Brockovich,Drama,2000
4,1,2355,5,"Bug's Life, A",Animation|Children's|Comedy,1998


In [184]:
# My five favourite movies, looked up by exact title.
my_favorite = ['Superman', 'Jumanji', 'Toy Story', 'Batman', 'Back to the Future']
my_favorite_id = []
for favorite in my_favorite:
    find_movie_id = movies.loc[movies.title == favorite, 'movie_id'].astype(int).values
    # Name the offending title instead of failing with a bare IndexError on `[0]`.
    if len(find_movie_id) == 0:
        raise ValueError(f'movie title not found in movies: {favorite!r}')
    # If several rows share the title, the first match is used.
    my_favorite_id.append(find_movie_id[0])

print(my_favorite_id)

[2640, 2, 1, 592, 1270]


In [185]:
# Add my favourites to ratings as a new user (id 931223) with a count of 30 each.
# The list lengths follow my_favorite_id instead of a hard-coded 5, so the frame
# still builds if the favourites list changes size.
my_playlist = pd.DataFrame({
    'user_id': [931223] * len(my_favorite_id),
    'movie_id': my_favorite_id,
    'count': [30] * len(my_favorite_id),
})
# Only append when that user id is not present yet, so the rows are not added twice.
if not (ratings['user_id'] == 931223).any():
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported replacement.
    ratings = pd.concat([ratings, my_playlist])

# Rebuild a clean 0..n-1 index after the concatenation.
ratings.reset_index(inplace=True, drop=True)
ratings.tail(10)

Unnamed: 0,user_id,movie_id,count
836473,6040,1090,3
836474,6040,1094,5
836475,6040,562,5
836476,6040,1096,4
836477,6040,1097,4
836478,931223,2640,30
836479,931223,2,30
836480,931223,1,30
836481,931223,592,30
836482,931223,1270,30


## 3. 전처리

In [186]:
# Distinct user ids and movie ids that occur in ratings.
user_unique = ratings['user_id'].unique()
movie_unique = ratings['movie_id'].unique()

# Map every original id to its position in the corresponding unique array,
# giving contiguous 0-based indices.
user_to_idx = dict(zip(user_unique, range(len(user_unique))))
movie_to_idx = dict(zip(movie_unique, range(len(movie_unique))))

In [187]:
# Replace the raw user ids with their 0-based indices, but only when every row
# could be mapped (no value was lost to dropna()).
temp_user_data = ratings['user_id'].map(user_to_idx.get).dropna()
user_indexed = len(temp_user_data) == len(ratings)
print('user_id indexing succes' if user_indexed else 'user_id indexing fail')
if user_indexed:
    ratings['user_id'] = temp_user_data

# Same procedure for the movie ids.
temp_movie_data = ratings['movie_id'].map(movie_to_idx.get).dropna()
movie_indexed = len(temp_movie_data) == len(ratings)
print('movie indexing succes' if movie_indexed else 'movie indexing fail')
if movie_indexed:
    ratings['movie_id'] = temp_movie_data

user_id indexing succes
movie indexing succes


## 4. CSR matrix

In [188]:
from scipy.sparse import csr_matrix
# Number of distinct users and distinct movies; used as the matrix dimensions.
num_user = ratings['user_id'].nunique()
num_movie = ratings['movie_id'].nunique()

# Sparse matrix with the user_id column as row index, the movie_id column as
# column index and `count` as the stored value.
csr_data = csr_matrix(
    (ratings['count'], (ratings['user_id'], ratings['movie_id'])),
    shape=(num_user, num_movie),
)
csr_data

<6040x3628 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

## 5. 모델 구성 및 훈련

In [189]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# Settings recommended by the implicit library; unrelated to the learning content.
os.environ.update({
    'OPENBLAS_NUM_THREADS': '1',
    'KMP_DUPLICATE_LIB_OK': 'True',
    'MKL_NUM_THREADS': '1',
})

In [190]:
# Declare the ALS model.
als_model = AlternatingLeastSquares(
    factors=100,
    regularization=0.01,
    iterations=20,
    dtype=np.float32,
    use_gpu=False,
)

In [191]:
# Transposed copy of the sparse rating matrix, used as the training input.
# NOTE(review): the matrix orientation fit() expects may depend on the installed
# implicit version — confirm against the version in use.
csr_data_transpose = csr_data.transpose()

# Train the model.
als_model.fit(csr_data_transpose)

  0%|          | 0/20 [00:00<?, ?it/s]

## 6. 모델 평가

In [192]:
# Matrix indices of my user (931223) and of the movie Jumanji.
hyelim = user_to_idx[931223]
Jumanji = movie_to_idx[get_movie_id('Jumanji')]
# Learned factor vectors for that user and that movie.
hyelim_vector = als_model.user_factors[hyelim]
Jumanji_vector = als_model.item_factors[Jumanji]
# Print the dot product of the user vector and the item vector for Jumanji.
print('내가 선호하는 영화')
print(f'쥬만지: {np.dot(hyelim_vector, Jumanji_vector)}')
# Same dot product for Jurassic Park.
jurassic = movie_to_idx[get_movie_id('Jurassic Park')]
jurassic_vector = als_model.item_factors[jurassic]
print('Jurassic Park와의 선호도')
print(np.dot(hyelim_vector, jurassic_vector))

내가 선호하는 영화
쥬만지: 0.7112392783164978
Jurassic Park와의 선호도
0.26388496


In [193]:
# Reverse mapping: matrix index -> original movie_id.
idx_to_movie_id = {v: k for k, v in movie_to_idx.items()}


# Find movies similar to the given movie title.
def get_similar_movie(movie_title: str, n: int = 15):
    """Return the titles of the *n* items the model ranks as most similar to *movie_title*.

    Args:
        movie_title: exact title as stored in ``movies['title']``.
        n: number of similar items requested from the model (default 15).

    Returns:
        A list of titles in the order returned by ``als_model.similar_items``.

    Raises:
        ValueError: if the title is not found in ``movies`` or the movie has no
            entry in ``movie_to_idx``.
    """
    # Look up the requested title (previously 'Jumanji' was hard-coded here and
    # the argument was ignored).
    movie_id = get_movie_id(movie_title)
    if movie_id is None or movie_id not in movie_to_idx:
        raise ValueError(f'no indexed movie for title: {movie_title!r}')
    movie_idx = movie_to_idx[movie_id]
    similar_movie = als_model.similar_items(movie_idx, N=n)
    # The first element of each result entry is the item index; map it back to a title.
    return [get_movie_title(idx_to_movie_id[i[0]]) for i in similar_movie]

In [194]:
# Find movies similar to Jumanji.
get_similar_movie('Jumanji')

['Jumanji',
 'Hook',
 'Indian in the Cupboard, The',
 'Dragonheart',
 'Santa Clause, The',
 'Flubber',
 'Space Jam',
 'NeverEnding Story II: The Next Chapter, The',
 'Small Soldiers',
 'Borrowers, The',
 'NeverEnding Story, The',
 'Escape to Witch Mountain',
 'Legend',
 'Pagemaster, The',
 'Labyrinth']

In [195]:
# Count the genres of the movies similar to Jumanji.
from collections import Counter

# NOTE: rows are selected by title, so every movie sharing one of these titles is included.
simiar_genre = movies.loc[movies['title'].isin(get_similar_movie('Jumanji')), 'genre'].to_frame()
# One genre per column; movies with fewer genres are padded with missing values.
simiar_genre = simiar_genre.genre.str.split('|', expand=True)
# Flatten column by column and drop the padding. pd.notna() rejects both None
# and NaN, whereas the previous `!= None` comparison let NaN through.
count_genre = [genre for genre in simiar_genre.to_numpy().ravel(order='F') if pd.notna(genre)]

count_df = Counter(count_genre)
count_df = pd.DataFrame({'genre': list(count_df.keys()), 'count_': list(count_df.values())})
count_df.sort_values(by='count_', ascending=False)

['Adventure',
 'Adventure',
 "Children's",
 'Action',
 'Action',
 'Adventure',
 'Adventure',
 "Children's",
 'Adventure',
 'Animation',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 "Children's",
 "Children's",
 'Comedy',
 'Adventure',
 'Adventure',
 'Animation',
 "Children's",
 'Comedy',
 "Children's",
 "Children's",
 "Children's",
 'Fantasy',
 "Children's",
 "Children's",
 'Fantasy',
 'Fantasy',
 'Fantasy',
 'Fantasy',
 'Animation',
 'Fantasy',
 "Children's",
 'Fantasy',
 'Fantasy',
 'Comedy',
 'Fantasy',
 'Fantasy',
 'Romance',
 'Fantasy',
 'Fantasy',
 "Children's",
 'Comedy',
 'Fantasy',
 'War',
 'Fantasy',
 'Fantasy']

In [196]:
# Metadata row for Jumanji, found via a title -> id -> matrix index -> id round trip.
movies[movies['movie_id'] == idx_to_movie_id[movie_to_idx[get_movie_id('Jumanji')]]]

Unnamed: 0,movie_id,title,genre,release_year
1,2,Jumanji,Adventure|Children's|Fantasy,1995


Jumanji의 장르가 Adventure, Children's, Fantasy인데 비슷한 장르로 Fantasy, Adventure, Children's가 나온 것을 보니 알맞게 찾았다는 것을 알 수 있다.

In [198]:
# Recommend movies to a user.
def recommend_movie(user_id, n=20):
    """Return the titles of the top *n* movies the model recommends for a user.

    Args:
        user_id: the user's matrix index (a value of ``user_to_idx``), not the
            raw user id from the ratings file.
        n: number of recommendations to request (default 20).

    Returns:
        A list of titles in the order returned by ``als_model.recommend``.
        Items the user already has in ``csr_data`` are filtered out.
    """
    # recommend() is given the user x item CSR matrix.
    movie_recommended = als_model.recommend(user_id, csr_data, N=n, filter_already_liked_items=True)
    # The first element of each result entry is the item index; map it back to a title.
    return [get_movie_title(idx_to_movie_id[i[0]]) for i in movie_recommended]

In [199]:
# Recommend movies for my own user (raw id 931223), addressed by its matrix index.
user = user_to_idx[931223]
recommend = recommend_movie(user)
recommend

['Toy Story 2',
 'Indiana Jones and the Temple of Doom',
 'Indiana Jones and the Last Crusade',
 'Honey, I Shrunk the Kids',
 'E.T. the Extra-Terrestrial',
 'Star Wars: Episode VI - Return of the Jedi',
 'Robocop',
 'Santa Clause, The',
 'Babe',
 'Mask, The',
 'Big',
 'Hook',
 'Back to the Future Part II',
 'Superman II',
 'Willy Wonka and the Chocolate Factory',
 'Tron',
 'Star Trek IV: The Voyage Home',
 'Star Trek: The Wrath of Khan',
 'Mission: Impossible',
 'Rocky']

In [200]:
# Count the genres of the recommended movies.
# NOTE: rows are selected by title, so every movie sharing one of these titles is included.
recommend_genre = movies.loc[movies['title'].isin(recommend), 'genre'].to_frame()
# One genre per column; movies with fewer genres are padded with missing values.
recommend_genre = recommend_genre.genre.str.split('|', expand=True)
# Flatten column by column and drop the padding. pd.notna() rejects both None
# and NaN, whereas the previous `!= None` comparison let NaN through.
count_genre = [genre for genre in recommend_genre.to_numpy().ravel(order='F') if pd.notna(genre)]

count_df = Counter(count_genre)
count_df = pd.DataFrame({'genre': list(count_df.keys()), 'count_': list(count_df.values())})
count_df.sort_values(by='count_', ascending=False)

["Children's",
 "Children's",
 'Comedy',
 'Action',
 'Adventure',
 "Children's",
 'Action',
 'Action',
 'Action',
 'Action',
 'Action',
 'Comedy',
 'Adventure',
 'Action',
 'Action',
 'Action',
 'Comedy',
 'Action',
 'Animation',
 'Adventure',
 'Comedy',
 'Comedy',
 'Crime',
 'Adventure',
 "Children's",
 'Drama',
 'Adventure',
 'Adventure',
 'Adventure',
 'Adventure',
 'Drama',
 'Sci-Fi',
 "Children's",
 'Adventure',
 'Adventure',
 'Adventure',
 'Fantasy',
 'Crime',
 "Children's",
 'Fantasy',
 'Drama',
 'Fantasy',
 'Fantasy',
 'Mystery',
 'Comedy',
 'Fantasy',
 'Romance',
 'Sci-Fi',
 'Sci-Fi',
 'Comedy',
 'Fantasy',
 'Sci-Fi',
 'Sci-Fi',
 'Comedy',
 'Fantasy',
 'Sci-Fi',
 'Sci-Fi',
 'Fantasy',
 'Sci-Fi',
 'War',
 'Sci-Fi']

In [202]:
# Count the genres of the movies I initially picked as favourites.
inital_genre = movies.loc[movies.movie_id.isin(my_favorite_id), 'genre'].to_frame()
# One genre per column; movies with fewer genres are padded with missing values.
inital_genre = inital_genre.genre.str.split('|', expand=True)
# Flatten column by column and drop the padding. pd.notna() rejects both None
# and NaN, whereas the previous `!= None` comparison let NaN through.
count_genre = [genre for genre in inital_genre.to_numpy().ravel(order='F') if pd.notna(genre)]

count_df = Counter(count_genre)
count_df = pd.DataFrame({'genre': list(count_df.keys()), 'count_': list(count_df.values())})
count_df.sort_values(by='count_', ascending=False)

Unnamed: 0,genre,count_
1,Adventure,3
2,Action,2
3,Comedy,2
4,Children's,2
5,Sci-Fi,2
0,Animation,1
6,Fantasy,1
7,Crime,1
8,Drama,1


초기 설정한 장르와 비슷하게 추천한 영화의 장르가 분포해있다는 것을 알 수 있다.    
실제로 추천받은 것 중에 'Willy Wonka and the Chocolate Factory', 'Mission: Impossible'는 재밌게 본 영화이다.

# 회고

1. 처음으로 추천 시스템을 공부해 봤는데, 진행하면서 내가 관심이 있고 잘 아는 분야를 대상으로 개발해야 한다는 것을 알았다.
2. 이번엔 영화로 추천 시스템을 만들었지만 내가 관심있어하는 음악에 대해서도 만들고 싶다.
3. 추천한 결과에 대해서 재미있게 시각화를 해보는 것도 재밌을 것 같다.
4. 게임 상 직업 추천, 소설책 추천 등과 같은 재밌는 작업도 할 수 있을 것 같다.

# Reference

* https://lsjsj92.tistory.com/570