# 실습 2 - 영화 추천시스템


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

* Movie recommendation is one of the first steps in learning about recommendation systems.
* The MovieLens dataset is a well-known dataset for learning to build recommendation systems.
    * https://grouplens.org/datasets/movielens/
    * https://kaggle.com/grouplens/movielens-20m-dataset

In [None]:
import io
from google.colab import files

# Ask Colab for an upload; the result is indexed by file name below,
# so the file must be uploaded under the exact name 'ratings.csv'.
uploaded = files.upload()

# The uploaded payload is decoded as UTF-8 text and parsed as CSV.
ratings = pd.read_csv(
    io.StringIO(
        uploaded['ratings.csv'].decode('utf-8')
    )
)

In [None]:
# Second upload: the file must be uploaded under the exact name 'movies.csv'.
# Note that this rebinds `uploaded`, replacing the previous upload result.
uploaded = files.upload()

# Decode the uploaded payload as UTF-8 text and parse it as CSV.
movies = pd.read_csv(
    io.StringIO(
        uploaded['movies.csv'].decode('utf-8')
    )
)

* Let's first merge those two dataframes.

In [None]:
# Join every rating row to its movie record via the shared 'movieId' column
# (default inner join: ratings without a matching movie are dropped).
df_movies = ratings.merge(movies, on='movieId')

* Which movies have the highest average user rating?

In [None]:
# Mean rating per title, ordered from highest to lowest.
ratings_sort = (
    df_movies
    .groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
)
ratings_sort.head(10)

* Which movies received the most ratings from users?

In [None]:
# Number of ratings per title, ordered from most-rated to least-rated.
counting_sort = (
    df_movies
    .groupby('title')['rating']
    .count()
    .sort_values(ascending=False)
)
counting_sort.head(10)

* Let's combine those two results.

In [None]:
# Per-title summary built in a single groupby pass (named aggregation):
#   rating  - mean rating of the title
#   numbers - how many ratings the title received
# This avoids grouping twice and avoids assigning a DataFrame to a column.
movie_ratings = df_movies.groupby('title')['rating'].agg(rating='mean', numbers='count')

# Show the ten most-rated titles together with their mean rating.
movie_ratings.sort_values('numbers', ascending=False).head(10)

* How are the average rating and the number of ratings distributed?

In [None]:
# Joint plot of mean rating (x) against number of ratings (y), one point per
# title; alpha=0.2 makes overlapping points translucent.
sns.jointplot(
    data=movie_ratings,
    x='rating',
    y='numbers',
    alpha=0.2,
)
plt.show()

* Now, reshape the dataframe using pivot_table.

In [None]:
# Peek at the merged long-format frame before pivoting it.
df_movies.head(n=5)

In [None]:
# Wide matrix: one row per userId, one column per title, cells hold the rating.
# Combinations with no rating come out as NaN.
user_movie_matrix = df_movies.pivot_table(
    values='rating',
    index='userId',
    columns='title',
)
user_movie_matrix.head()

* Fill the NaN values with 0.

In [None]:
# Replace every missing rating with 0 so the matrix has no NaN entries.
user_movie_matrix = user_movie_matrix.fillna(0)
user_movie_matrix.head()

* Let's take two movies as examples.

In [None]:
# Column of per-user ratings for the first example movie.
Matrix = user_movie_matrix.loc[:, 'Matrix, The (1999)']
Matrix.head(10)

In [None]:
# Column of per-user ratings for the second example movie.
Terminator = user_movie_matrix.loc[:, 'Terminator 2: Judgment Day (1991)']
Terminator.head(10)

* How similar are those two movies?

In [None]:
# Pearson correlation between the two movies' per-user rating columns.
Matrix.corr(Terminator, method='pearson')

* Which movie is the most similar to "Matrix, The (1999)"?

In [None]:
# Correlate every title's rating column with the Matrix column and keep the
# result as a one-column frame named 'correl' (indexed by title).
Matrix_corr = user_movie_matrix.corrwith(Matrix).to_frame(name='correl')

# Titles most correlated with the Matrix column come first.
Matrix_corr.sort_values(by='correl', ascending=False).head(15)

* Now, let's use one more metric for similarity - Cosine similarity

In [None]:
# Same ratings, opposite orientation: one row per title, one column per userId.
movie_user_matrix = df_movies.pivot_table(
    values='rating',
    index='title',
    columns='userId',
)
movie_user_matrix.head()

In [None]:
# Replace missing ratings with 0 before computing similarities.
movie_user_matrix = movie_user_matrix.fillna(0)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows of movie_user_matrix,
# i.e. between titles, each described by its vector of user ratings.
item_based_filter = cosine_similarity(X=movie_user_matrix)

In [None]:
# Label the raw similarity array with titles on both axes so it can be
# looked up by movie name.
item_based_matrix = pd.DataFrame(
    item_based_filter,
    index=movie_user_matrix.index,
    columns=movie_user_matrix.index,
)

In [None]:
# Preview the title-by-title similarity table.
item_based_matrix.head(n=5)

In [None]:
# Twenty highest similarity scores in the 'Matrix, The (1999)' column.
(
    item_based_matrix
    .loc[:, 'Matrix, The (1999)']
    .sort_values(ascending=False)
    .head(20)
)

* You can also check out the following materials to learn more.
    * "Recommender Systems: An Introduction" by D. Jannach et al. (https://www.amazon.com/Recommender-Systems-Introduction-Dietmar-Jannach/dp/0521493366).
    * https://www.youtube.com/watch?v=9siFuMMHNIA
    * https://www.youtube.com/watch?v=39vJRxIPSxw