## Load data

## 데이터 준비와 전처리

In [1]:
import os
import pandas as pd

rating_file_path=os.getenv('HOME') + '/aiffel/exploration/ex9/data/ml-1m/ratings.dat'
ratings_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(rating_file_path, sep='::', names=ratings_cols, engine='python', encoding = "ISO-8859-1")
orginal_data_size = len(ratings)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [2]:
# 3점 이상만 남깁니다.
ratings = ratings[ratings['rating']>=3]
filtered_data_size = len(ratings)

print(f'orginal_data_size: {orginal_data_size}, filtered_data_size: {filtered_data_size}')
print(f'Ratio of Remaining Data is {filtered_data_size / orginal_data_size:.2%}')

orginal_data_size: 1000209, filtered_data_size: 836478
Ratio of Remaining Data is 83.63%


In [3]:
# rating 컬럼의 이름을 count로 바꿉니다.
ratings.rename(columns={'rating':'count'}, inplace=True)

In [4]:
ratings['count'].value_counts()

4    348971
3    261197
5    226310
Name: count, dtype: int64

In [5]:
# 영화 제목을 보기 위해 메타 데이터를 읽어옵니다.
movie_file_path=os.getenv('HOME') + '/aiffel/exploration/ex9/data/ml-1m/movies.dat'
cols = ['movie_id', 'title', 'genre'] 
movies = pd.read_csv(movie_file_path, sep='::', names=cols, engine='python', encoding='ISO-8859-1')
movies.head()

Unnamed: 0,movie_id,title,genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# 장르 분리
# genres = movies["genre"].str.split("|").apply(pd.Series)

genre_dict = {}
for i in movies["genre"].str.split("|"):
    for j in i:
        if j not in genre_dict:
            genre_dict[j] = 1
        else:
            genre_dict[j] += 1
genre_dict

{'Animation': 105,
 "Children's": 251,
 'Comedy': 1200,
 'Adventure': 283,
 'Fantasy': 68,
 'Romance': 471,
 'Drama': 1603,
 'Action': 503,
 'Crime': 211,
 'Thriller': 492,
 'Horror': 343,
 'Sci-Fi': 276,
 'Documentary': 127,
 'War': 143,
 'Musical': 114,
 'Mystery': 106,
 'Film-Noir': 44,
 'Western': 68}

In [7]:
def boolean_df(item_lists, unique_items):
    
    bool_dict = {}
    
    for i, item in enumerate(unique_items):
        bool_dict[item] = item_lists.apply(lambda x: item in x)
        
    return pd.DataFrame(bool_dict)

unique_genre = list(genre_dict.keys())
genre_one_hot = boolean_df(movies["genre"].str.split("|"), unique_genre)

movies = pd.concat([movies, genre_one_hot], axis=1)
movies.head()

Unnamed: 0,movie_id,title,genre,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,...,Crime,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western
0,1,Toy Story (1995),Animation|Children's|Comedy,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,2,Jumanji (1995),Adventure|Children's|Fantasy,False,True,False,True,True,False,False,...,False,False,False,False,False,False,False,False,False,False
2,3,Grumpier Old Men (1995),Comedy|Romance,False,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
3,4,Waiting to Exhale (1995),Comedy|Drama,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
4,5,Father of the Bride Part II (1995),Comedy,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [8]:
# 개봉연도 분리
year = movies["title"].str.extract(r"(\d{4})")
movies["year"] = movies["title"].str.extract(r"(\d{4})")
movies["title"] = movies["title"].str.replace(r" \(\d{4}\)","")
movies.head()

  after removing the cwd from sys.path.


Unnamed: 0,movie_id,title,genre,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,...,Thriller,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western,year
0,1,Toy Story,Animation|Children's|Comedy,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1995
1,2,Jumanji,Adventure|Children's|Fantasy,False,True,False,True,True,False,False,...,False,False,False,False,False,False,False,False,False,1995
2,3,Grumpier Old Men,Comedy|Romance,False,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,False,1995
3,4,Waiting to Exhale,Comedy|Drama,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,False,1995
4,5,Father of the Bride Part II,Comedy,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,1995


In [9]:
# view count 추가
merged_data = pd.merge(ratings, movies, how="left", on="movie_id")
movie_play_count = merged_data.groupby("movie_id")["count"].sum()
movies = pd.merge(movies, movie_play_count, how="left", on="movie_id")
movies = movies.rename(columns={'count':'view_count'})
movies.head()

Unnamed: 0,movie_id,title,genre,Animation,Children's,Comedy,Adventure,Fantasy,Romance,Drama,...,Horror,Sci-Fi,Documentary,War,Musical,Mystery,Film-Noir,Western,year,view_count
0,1,Toy Story,Animation|Children's|Comedy,True,True,True,False,False,False,False,...,False,False,False,False,False,False,False,False,1995,8475.0
1,2,Jumanji,Adventure|Children's|Fantasy,False,True,False,True,True,False,False,...,False,False,False,False,False,False,False,False,1995,1986.0
2,3,Grumpier Old Men,Comedy|Romance,False,False,True,False,False,True,False,...,False,False,False,False,False,False,False,False,1995,1208.0
3,4,Waiting to Exhale,Comedy|Drama,False,False,True,False,False,False,True,...,False,False,False,False,False,False,False,False,1995,349.0
4,5,Father of the Bride Part II,Comedy,False,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,1995,754.0


In [10]:
id_to_movie = {x[0]:x[1] for x in movies[["movie_id", "title"]].to_numpy()}
movie_to_id = {v:k for k,v in id_to_movie.items()}

In [11]:
print(id_to_movie)

{1: 'Toy Story', 2: 'Jumanji', 3: 'Grumpier Old Men', 4: 'Waiting to Exhale', 5: 'Father of the Bride Part II', 6: 'Heat', 7: 'Sabrina', 8: 'Tom and Huck', 9: 'Sudden Death', 10: 'GoldenEye', 11: 'American President, The', 12: 'Dracula: Dead and Loving It', 13: 'Balto', 14: 'Nixon', 15: 'Cutthroat Island', 16: 'Casino', 17: 'Sense and Sensibility', 18: 'Four Rooms', 19: 'Ace Ventura: When Nature Calls', 20: 'Money Train', 21: 'Get Shorty', 22: 'Copycat', 23: 'Assassins', 24: 'Powder', 25: 'Leaving Las Vegas', 26: 'Othello', 27: 'Now and Then', 28: 'Persuasion', 29: 'City of Lost Children, The', 30: 'Shanghai Triad (Yao a yao yao dao waipo qiao)', 31: 'Dangerous Minds', 32: 'Twelve Monkeys', 33: 'Wings of Courage', 34: 'Babe', 35: 'Carrington', 36: 'Dead Man Walking', 37: 'Across the Sea of Time', 38: 'It Takes Two', 39: 'Clueless', 40: 'Cry, the Beloved Country', 41: 'Richard III', 42: 'Dead Presidents', 43: 'Restoration', 44: 'Mortal Kombat', 45: 'To Die For', 46: 'How to Make an Amer

### 분석

In [12]:
# ratings에 있는 유니크한 영화 개수
ratings["movie_id"].nunique()

3628

In [13]:
# rating에 있는 유니크한 사용자 수
ratings["user_id"].nunique()

6039

In [14]:
# 가장 인기 있는 영화 30개(인기순)
best_movies = movies.sort_values("view_count", ascending=False)[0:50]
best_movies[["movie_id","title", "view_count"]]

Unnamed: 0,movie_id,title,view_count
2789,2858,American Beauty,14449.0
257,260,Star Wars: Episode IV - A New Hope,13178.0
1178,1196,Star Wars: Episode V - The Empire Strikes Back,12648.0
1959,2028,Saving Private Ryan,11348.0
1192,1210,Star Wars: Episode VI - Return of the Jedi,11303.0
1180,1198,Raiders of the Lost Ark,11179.0
589,593,"Silence of the Lambs, The",11096.0
2502,2571,"Matrix, The",10903.0
2693,2762,"Sixth Sense, The",10703.0
585,589,Terminator 2: Judgment Day,10513.0


### 임의로 시청한 영화 데이터 추가

In [15]:
junil = pd.DataFrame([
    {"user_id":99999, "movie_id":2571, "count":3, "timestamp":978301968},
    {"user_id":99999, "movie_id":2762, "count":5, "timestamp":978301968},
    {"user_id":99999, "movie_id":541, "count":5, "timestamp":978301968},
    {"user_id":99999, "movie_id":1214, "count":4, "timestamp":978301968},
    {"user_id":99999, "movie_id":110, "count":3, "timestamp":978301968}
])

junil_movies = [id_to_movie[x] for x in junil["movie_id"]]
print(junil_movies)

ratings = ratings.append(junil)
ratings.tail(10)

['Matrix, The', 'Sixth Sense, The', 'Blade Runner', 'Alien', 'Braveheart']


Unnamed: 0,user_id,movie_id,count,timestamp
1000203,6040,1090,3,956715518
1000205,6040,1094,5,956704887
1000206,6040,562,5,956704746
1000207,6040,1096,4,956715648
1000208,6040,1097,4,956715569
0,99999,2571,3,978301968
1,99999,2762,5,978301968
2,99999,541,5,978301968
3,99999,1214,4,978301968
4,99999,110,3,978301968


### csr_matrix

In [16]:
from scipy.sparse import csr_matrix

num_users = ratings["user_id"].nunique()
num_movies = ratings["movie_id"].nunique()
ratings["user_id"]

csr_data = csr_matrix((ratings["count"], (ratings.user_id, ratings.movie_id)))
csr_data

<100000x3953 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Row format>

### AlternatingLeastSquares

In [17]:
from implicit.als import AlternatingLeastSquares
import os
import numpy as np

# implicit 라이브러리에서 권장하고 있는 부분입니다. 학습 내용과는 무관합니다.
os.environ['OPENBLAS_NUM_THREADS']='1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'
os.environ['MKL_NUM_THREADS']='1'

In [18]:
# Implicit AlternatingLeastSquares 모델의 선언
als_model = AlternatingLeastSquares(factors=100, regularization=0.01, use_gpu=False, iterations=15, dtype=np.float32)

In [19]:
# als 모델은 input으로 (item X user 꼴의 matrix를 받기 때문에 Transpose해줍니다.)
csr_data_transpose = csr_data.T
csr_data_transpose

<3953x100000 sparse matrix of type '<class 'numpy.longlong'>'
	with 836483 stored elements in Compressed Sparse Column format>

In [20]:
# 모델 훈련
als_model.fit(csr_data_transpose)

  0%|          | 0/15 [00:00<?, ?it/s]

### 비슷한 영화 찾기

In [21]:
favorite_movie = "Toy Story"
favorite_movie_id = movie_to_id[favorite_movie]

similar_movie_idx = als_model.similar_items(favorite_movie_id, N=15)
similar_movies = [id_to_movie[x[0]] for x in similar_movie_idx]
similar_movies

['Toy Story',
 'Toy Story 2',
 'Aladdin',
 'Babe',
 "Bug's Life, A",
 'Groundhog Day',
 'Lion King, The',
 'Pleasantville',
 'Beauty and the Beast',
 'Shakespeare in Love',
 "There's Something About Mary",
 'Hercules',
 'Forrest Gump',
 'Mulan',
 'Tarzan']

### 영화 추천

In [22]:
# 99999 -> junil
movie_recommended = als_model.recommend(99999, csr_data, N=20, filter_already_liked_items=True)
movie_recommended = {id_to_movie[x[0]]:x[1] for x in movie_recommended}
movie_recommended

{'Terminator 2: Judgment Day': 0.53792906,
 'Terminator, The': 0.46004567,
 'Aliens': 0.4243127,
 'Star Wars: Episode IV - A New Hope': 0.33461183,
 'L.A. Confidential': 0.31554013,
 'Saving Private Ryan': 0.3077411,
 'Silence of the Lambs, The': 0.29944456,
 'Fugitive, The': 0.27608258,
 'Back to the Future': 0.2646887,
 'Jurassic Park': 0.25932547,
 'Jaws': 0.2587511,
 'Star Wars: Episode V - The Empire Strikes Back': 0.25185767,
 'Total Recall': 0.24622962,
 'Die Hard': 0.23829868,
 '2001: A Space Odyssey': 0.23264116,
 'Star Wars: Episode VI - Return of the Jedi': 0.2246325,
 'Gladiator': 0.21980107,
 'Maltese Falcon, The': 0.2135589,
 'Mad Max 2 (a.k.a. The Road Warrior)': 0.20139512,
 'Usual Suspects, The': 0.19274387}

### 추천 기여도 확인

In [29]:
terminator_id = movie_to_id["Terminator, The"]
explain = als_model.explain(99999, csr_data, itemid=terminator2_id)
explain
[(id_to_movie[i[0]], i[1]) for i in explain[1]]

[('Alien', 0.1717909793686192),
 ('Blade Runner', 0.14034365654506648),
 ('Matrix, The', 0.09731298564983687),
 ('Sixth Sense, The', 0.05116847896795673),
 ('Braveheart', -0.009175300271301604)]