In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Prepare the Dataset

In [2]:
# Load the raw movie metadata and the (small) ratings table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids mixed-type columns and the warning
# emitted on load. `id` is pinned to text because later cells use `.str`
# methods on it and merge it against string movie ids.
df_movies = pd.read_csv('movies_metadata.csv', low_memory=False, dtype={'id': str})
df_ratings = pd.read_csv('ratings_small.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Drop rows whose id contains a '-', then keep only the first 10,000 remaining movies.
id_has_dash = df_movies['id'].str.contains('-')
df_movies = df_movies.loc[~id_has_dash].iloc[:10000]

In [4]:
# create rating list with movie titles
# df_movies['id'] holds text, so the rating-side key is converted to text before joining
df_ratings['movieId'] = df_ratings['movieId'].astype(str)

movie_titles_by_id = df_movies[['id', 'title']]
df_ratings_with_titles = df_ratings.merge(
    movie_titles_by_id,
    how='inner',
    left_on='movieId',
    right_on='id',
)

df_ratings_with_titles.head()

Unnamed: 0,userId,movieId,rating,timestamp,id,title
0,1,1371,2.5,1260759135,1371,Rocky III
1,4,1371,4.0,949810302,1371,Rocky III
2,7,1371,3.0,851869160,1371,Rocky III
3,19,1371,4.0,855193404,1371,Rocky III
4,21,1371,3.0,853852263,1371,Rocky III


In [5]:
# Summary statistics of the ratings that matched a movie title.
matched_ratings = df_ratings_with_titles['rating']
matched_ratings.describe()

count    30162.000000
mean         3.574929
std          1.049331
min          0.500000
25%          3.000000
50%          4.000000
75%          4.000000
max          5.000000
Name: rating, dtype: float64

In [6]:
# Both matrices have one row per user and hold the rating, with 0 where no rating exists;
# they differ only in how the movie columns are labelled.
shared_pivot_args = dict(index='userId', values='rating', fill_value=0)

# columns labelled by movie title
df_user_movie_matrix = df_ratings_with_titles.pivot_table(columns='title', **shared_pivot_args)

# columns labelled by movie id
df_user_movie_id_matrix = df_ratings_with_titles.pivot_table(columns='movieId', **shared_pivot_args)

In [7]:
# Peek at the raw `genres` value of the row labelled 0: a string holding a list of dicts.
df_movies.loc[0, 'genres']

"[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]"

In [8]:
from ast import literal_eval

def get_genre_list(x, max_genres=5):
    """Extract genre names from a parsed `genres` entry.

    Parameters
    ----------
    x : object
        Expected to be a list of dicts that each have a ``'name'`` key.
        Anything that is not a list is treated as "no genres".
    max_genres : int, default 5
        Maximum number of genre names to keep (the first ones in order).

    Returns
    -------
    list
        Up to ``max_genres`` genre names, or an empty list when ``x`` is not a list.

    Raises
    ------
    KeyError
        If an element of ``x`` has no ``'name'`` key.
    """
    if not isinstance(x, list):
        return []

    # Slicing already copes with lists shorter than the cap, so no length check is needed.
    return [genre['name'] for genre in x][:max_genres]

# Parse the stringified genre lists into Python lists of genre names (one list per movie).
df_movie_genres = df_movies[['id', 'genres']].assign(
    genres=lambda frame: frame['genres'].apply(literal_eval).apply(get_genre_list)
)

In [9]:
df_movie_genres_ix = df_movie_genres.set_index('id')

# One row per (movie id, genre name). Movies with an empty genre list yield NaN
# after explode and are dropped, so they do not appear in the feature matrix.
genre_list = df_movie_genres_ix['genres'].explode().dropna()

# One-hot encode the genre names and collapse back to one row per movie id.
# `DataFrame.sum(level=0)` was removed in pandas 2.0; groupby(level=0) is the
# replacement, and sort=False keeps the movies in their original order.
df_movie_feature_matrix = pd.get_dummies(genre_list).groupby(level=0, sort=False).sum()

# Content-based Recommendation

## 1. Similarity Measure

In [10]:
# Number of title-matched ratings per user, most active users first.
ratings_per_user = df_ratings_with_titles['userId'].value_counts()
ratings_per_user

564    616
547    415
452    351
15     333
311    329
      ... 
566      3
498      2
227      2
71       2
29       1
Name: userId, Length: 671, dtype: int64

In [11]:
# The user we recommend for (an integer userId) and a movie he/she has watched
# (a movie id, kept as text like the other movie ids in this notebook).
current_user, current_movie = 15, '296'

In [12]:
# Ids of the movies the current user has rated.
rated_by_current_user = df_ratings_with_titles['userId'] == current_user
current_user_watched_movies = df_ratings_with_titles.loc[rated_by_current_user, 'movieId']

In [13]:
# Same selection of rows as the watched-movie ids, but keeping the rating next to each id.
current_user_rows = df_ratings_with_titles['userId'] == current_user
current_user_watched_movies_rating = df_ratings_with_titles.loc[
    current_user_rows, ['movieId', 'rating']
]

In [14]:
from sklearn.metrics.pairwise import cosine_similarity

# Movie-to-movie cosine similarity of the genre vectors, labelled by movie id on both axes.
movie_ids = df_movie_feature_matrix.index.tolist()
genre_similarity = cosine_similarity(df_movie_feature_matrix)

df_cosine_matrix = pd.DataFrame(genre_similarity, index=movie_ids, columns=movie_ids)

df_cosine_matrix.head()

Unnamed: 0,862,8844,15602,31357,11862,949,11860,45325,9091,710,...,31065,22033,37992,32850,68883,43379,9393,16972,21325,13409
862,1.0,0.333333,0.408248,0.333333,0.57735,0.0,0.408248,0.288675,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.333333,0.0,0.408248,0.0,0.0
8844,0.333333,1.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.333333,0.333333,...,0.288675,0.0,0.0,0.0,0.288675,0.333333,0.0,0.0,0.333333,0.288675
15602,0.408248,0.0,1.0,0.816497,0.707107,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.353553,0.408248,0.0,0.5,0.0,0.0
31357,0.333333,0.0,0.816497,1.0,0.57735,0.288675,0.816497,0.288675,0.0,0.0,...,0.288675,0.0,0.408248,0.408248,0.57735,0.666667,0.57735,0.408248,0.333333,0.288675
11862,0.57735,0.0,0.707107,0.57735,1.0,0.0,0.707107,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.707107,0.0,0.0


In [15]:
# get most similar movies with current movie
# The similarity column is named after the movie id, so rename it via `current_movie`
# rather than a hard-coded id; otherwise changing `current_movie` would silently
# leave the column un-renamed.
df_current_user_similar_movie = (
    df_cosine_matrix[current_movie]
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={'index': 'id', current_movie: 'sim_score'})
)

# Never recommend something the user already rated, nor the seed movie itself
# (it has similarity 1.0 with itself and would rank first if the user had not rated it).
already_watched = df_current_user_similar_movie['id'].isin(current_user_watched_movies)
is_seed_movie = df_current_user_similar_movie['id'] == current_movie
df_current_user_similar_movie = df_current_user_similar_movie[
    ~already_watched & ~is_seed_movie
]

top_5_recommendation = df_current_user_similar_movie['id'].head(5).tolist()

In [16]:
# Titles of the recommended ids. The rows come out in df_movies order, not ranked by similarity.
df_movies.loc[df_movies['id'].isin(top_5_recommendation), 'title']

282      Nemesis 2 - Nebula
1195         The Terminator
1773               Godzilla
3441    The Crow: Salvation
5765            Equilibrium
Name: title, dtype: object

## 2. Content-based Filtering

In [17]:
# movie-feature matrix for current user: genre rows of the movies he/she has rated

watched_rows = df_movie_feature_matrix.index.isin(current_user_watched_movies)
df_movie_feature_matrix_current_user = df_movie_feature_matrix.loc[watched_rows]

df_movie_feature_matrix_current_user

Unnamed: 0_level_0,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,Foreign,History,Horror,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
524,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
5,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2054,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0
3512,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1909,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4235,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6593,1,0,0,0,0,0,1,0,1,1,0,0,0,0,0,1,0,0,0,0
1093,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0
1589,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0


In [18]:
# movie feature matrix with ratings: attach the user's rating to each watched movie
df_movie_feature_matrix_current_user_pref = (
    df_movie_feature_matrix_current_user
    .reset_index()
    .merge(
        current_user_watched_movies_rating,
        left_on='id',
        right_on='movieId',
        how='left',
    )
)

# every column other than the two join keys and the rating is a genre flag
genres = [
    column
    for column in df_movie_feature_matrix_current_user_pref.columns
    if column not in ('id', 'movieId', 'rating')
]

# weight each genre flag by the rating the user gave that movie
df_movie_feature_matrix_current_user_pref[genres] = (
    df_movie_feature_matrix_current_user_pref[genres]
    .astype(int)
    .mul(df_movie_feature_matrix_current_user_pref['rating'], axis=0)
)

df_movie_feature_matrix_current_user_pref.head()

Unnamed: 0,id,Action,Adventure,Animation,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Music,Mystery,Romance,Science Fiction,TV Movie,Thriller,War,Western,movieId,rating
0,524,0.0,0.0,0.0,0.0,2.5,0.0,2.5,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,524,2.5
1,5,0.0,0.0,0.0,4.5,4.5,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5,4.5
2,2054,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2054,2.0
3,3512,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,3512,3.0
4,1909,0.0,0.0,0.0,2.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,1909,2.0


In [19]:
# user feature vector: each genre's share of the user's total rating-weighted genre mass
genre_rating_totals = df_movie_feature_matrix_current_user_pref[genres].sum()
user_feature_vector = genre_rating_totals / genre_rating_totals.sum()

In [20]:
# Genre rows of every movie the current user has NOT rated.
# .copy() gives an independent frame: the next step writes into its columns, and
# writing into a boolean-filtered slice raises pandas' SettingWithCopyWarning.
df_movie_feature_matrix_not_watched = df_movie_feature_matrix[
    ~df_movie_feature_matrix.index.isin(current_user_watched_movies)
].copy()

In [21]:
# estimate user's preference based on user-feature vector
genres = df_movie_feature_matrix_not_watched.columns.tolist()

# Scale every genre column by the user's weight for that genre in one vectorised
# step. Binding the result to the name (instead of assigning column by column into
# a filtered slice) avoids SettingWithCopyWarning.
# NOTE: this rescales the frame each time the cell is run, so run it once per fresh
# `df_movie_feature_matrix_not_watched`.
df_movie_feature_matrix_not_watched = df_movie_feature_matrix_not_watched.mul(
    user_feature_vector[genres], axis='columns'
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [22]:
# Per-movie score = sum of its weighted genre values, normalised so all scores add up to 1.
raw_pref_score = df_movie_feature_matrix_not_watched.sum(axis=1)
curr_user_est_pref_score = raw_pref_score / raw_pref_score.sum()

In [23]:
# Turn the score Series into a table and attach each movie's title by id.
df_curr_user_est_pref_score = (
    curr_user_est_pref_score
    .reset_index()
    .rename(columns={0: 'est_pref_score'})
    .merge(df_movies[['id', 'title']], how='left', on='id')
)

df_curr_user_est_pref_score.head()

Unnamed: 0,id,est_pref_score,title
0,862,5.3e-05,Toy Story
1,8844,3.9e-05,Jumanji
2,15602,8.3e-05,Grumpier Old Men
3,31357,0.000177,Waiting to Exhale
4,11862,4.5e-05,Father of the Bride Part II


In [24]:
# get top 5 movies based on estimated score
# exact duplicate rows are removed first so one movie cannot take several of the five slots
unique_scored_movies = df_curr_user_est_pref_score.drop_duplicates()
top_5_recommendation = unique_scored_movies.sort_values(
    by='est_pref_score', ascending=False
).iloc[:5]

In [25]:
# Display the five highest-scoring unwatched movies (id, est_pref_score, title).
top_5_recommendation

Unnamed: 0,id,est_pref_score,title
6940,5511,0.000355,Le Samouraï
7634,23305,0.000307,The Warrior
637,105045,0.000264,The Promise
5820,9930,0.00026,Legal Eagles
5526,4912,0.00026,Confessions of a Dangerous Mind


In [26]:
# Inspect the user's genre weights: each genre's share of the rating-weighted
# genre totals, so the values sum to 1.
user_feature_vector

Action             0.076714
Adventure          0.056647
Animation          0.006271
Comedy             0.109532
Crime              0.089047
Documentary        0.004599
Drama              0.228470
Family             0.013796
Fantasy            0.025084
Foreign            0.004390
History            0.026547
Horror             0.028637
Music              0.017559
Mystery            0.047868
Romance            0.091346
Science Fiction    0.041597
TV Movie           0.000418
Thriller           0.112667
War                0.012542
Western            0.006271
dtype: float64

# Collaborative Filtering

## 1. Memory-based Filtering

### User-based

In [123]:
# The user we'll give a recommendation to is `current_user` (set earlier in the notebook).

# create user similarity matrix (cosine similarity between users' rating rows)
user_ids = df_user_movie_matrix.index.tolist()
df_user_similarity_score = pd.DataFrame(
    data=cosine_similarity(df_user_movie_matrix),
    index=user_ids,
    columns=user_ids,
)

# get users with similar preference with current user, most similar first
df_users_similar_w_curr = (
    df_user_similarity_score[current_user]
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={'index': 'userId', current_user: 'sim_score'})
)

top_n = 3
# Take top_n + 1 rows because the current user is expected to be in the list
# (self-similarity is 1.0) and is removed from the neighbour list below.
df_top_n_similar_users = df_users_similar_w_curr[:top_n + 1]

# get movies rated by the top n similar users: rows are titles, columns are user ids
df_top_n_movie_matrix = df_user_movie_matrix[
    df_user_movie_matrix.index.isin(df_top_n_similar_users['userId'])
].T

# neighbour columns = every selected user except the current one
user_id_tmp = df_top_n_movie_matrix.columns.tolist()
user_id_tmp.remove(current_user)

# already watched by all of the top n neighbours, but not yet watched by the current user
# (.all over the neighbour columns works for any top_n, not just 3;
#  .copy() so the new column below is added to an independent frame)
df_curr_user_unwatched_movies = df_top_n_movie_matrix[
    (df_top_n_movie_matrix[current_user] == 0)
    & (df_top_n_movie_matrix[user_id_tmp] > 0).all(axis=1)
].copy()

# order and filter: estimated rating = mean of the neighbours' ratings
df_curr_user_unwatched_movies['est_rating_by_curr_user'] = (
    df_curr_user_unwatched_movies[user_id_tmp].sum(axis=1) / len(user_id_tmp)
)
df_curr_user_unwatched_movies.sort_values(
    by='est_rating_by_curr_user', ascending=False
)['est_rating_by_curr_user'].head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


title
Dogville                 4.000000
One Night at McCool's    3.833333
The Thirteenth Floor     3.666667
The War of the Roses     3.666667
To Kill a Mockingbird    3.500000
Name: est_rating_by_curr_user, dtype: float64

### Item-based

In [125]:
# which movie we'd like to recommend to a user
current_movie_title = df_movies.loc[df_movies['id'] == current_movie, 'title'].values[0]

# create movie similarity matrix (cosine similarity between movies' rating columns)
df_movie_user_matrix = df_user_movie_matrix.T  # rows: titles, columns: user ids
movie_titles = df_movie_user_matrix.index.tolist()
df_movie_similarity_score = pd.DataFrame(
    data=cosine_similarity(df_movie_user_matrix),
    index=movie_titles,
    columns=movie_titles,
)

# get movies which are rated similarly to the current movie, most similar first
df_movies_similar_w_curr = (
    df_movie_similarity_score[current_movie_title]
    .sort_values(ascending=False)
    .reset_index()
    .rename(columns={'index': 'title', current_movie_title: 'sim_score'})
)

top_n = 3
# Take top_n + 1 rows because the current movie is expected to be in the list
# (self-similarity is 1.0) and is removed from the neighbour list below.
df_top_n_similar_movies = df_movies_similar_w_curr[:top_n + 1]

# ratings for the selected movies only: rows are users, columns are titles
df_top_n_users_matrix = df_user_movie_matrix.loc[
    :, df_user_movie_matrix.columns.isin(df_top_n_similar_movies['title'])
]

# neighbour columns = every selected movie except the current one
movie_tmp = df_top_n_users_matrix.columns.tolist()
movie_tmp.remove(current_movie_title)

# users who have already watched all top n similar movies but not yet the current movie
# (.all over the neighbour columns works for any top_n, not just 3;
#  .copy() so the new column below is added to an independent frame)
df_curr_movie_unwatched_users = df_top_n_users_matrix[
    (df_top_n_users_matrix[current_movie_title] == 0)
    & (df_top_n_users_matrix[movie_tmp] > 0).all(axis=1)
].copy()

# estimated rating for the current movie = mean of the user's ratings of the similar movies
df_curr_movie_unwatched_users['est_rating_for_curr_movie'] = (
    df_curr_movie_unwatched_users[movie_tmp].sum(axis=1) / len(movie_tmp)
)
df_curr_movie_unwatched_users.sort_values(
    by='est_rating_for_curr_movie', ascending=False
)['est_rating_for_curr_movie'].head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


userId
496    5.000000
63     4.666667
314    4.666667
354    4.333333
126    4.333333
Name: est_rating_for_curr_movie, dtype: float64