### Content Based Filtering

In [29]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [30]:
movies_df = pd.read_csv("./ml-latest/movies.csv")
ratings_df = pd.read_csv("./ml-latest/ratings.csv")
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [31]:
# clean the data
# Remove year from title and make a new column
# Split genre column
# 
print(movies_df.shape)
print(movies_df.size)
print(movies_df.dtypes)
# print(movies_df.info())
# print(movies_df.describe())

# Extracting Year from title column
movies_df["year"] = movies_df.title.str.extract("(\(\d{4}\))", expand=False)
# Removing parenthesses from year column
movies_df["year"] = movies_df.year.str.extract("(\d{4})", expand=False)
# movies_df["year"] = movies_df.year.str.extract("(\d{4})", expand=False)
# Removing Year from title
movies_df["title"] = movies_df.title.str.replace(r"(\(\d{4}\))", "", regex=True)
movies_df["title"] = movies_df["title"].map(lambda x : x.strip())
movies_df.head()

(34208, 3)
102624
movieId     int64
title      object
genres     object
dtype: object


Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


In [32]:
# Split the gener column to the list
# movies_df["genres"] = movies_df["genres"].map(lambda x : x.split("|"))
# other way
movies_df["genres"] = movies_df.genres.str.split('|')

movies_df.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [33]:

movies_with_genres_df = movies_df.copy()
movies_with_genres_df.head()


for index, row in movies_df.iterrows():
    for genre in row["genres"]:
        movies_with_genres_df.at[index, genre] = 1
movies_with_genres_df = movies_with_genres_df.fillna(0)
movies_with_genres_df.head()

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [34]:
# Refining rating dataset
ratings_df.head()


Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [35]:
# Remove timestamps - Not needed
ratings_df = ratings_df.drop("timestamp", axis= 1)
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


### Content Based Recommendation System

In [36]:
# Two diff types of user input to play with
userInput = [
            {'title':'Breakfast Club, The', 'rating':5},
            {'title':'Toy Story', 'rating':3.5},
            {'title':'Jumanji', 'rating':2},
            {'title':"Pulp Fiction", 'rating':5},
            {'title':'Akira', 'rating':4.5}
         ] 
         
# Predicting based on this input 
userInput1 = [
            {'title':'Interstellar', 'rating':5},
            {'title':'Matrix, The', 'rating':5},
            {'title':'Edge of Tomorrow', 'rating':3.5},
            {'title':"Source Code", 'rating':5}
         ] 
inputMovies = pd.DataFrame(userInput1)
inputMovies


Unnamed: 0,title,rating
0,Interstellar,5.0
1,"Matrix, The",5.0
2,Edge of Tomorrow,3.5
3,Source Code,5.0


In [37]:
inputId = movies_df[movies_df["title"].isin(inputMovies["title"].tolist())]
# print(inputId.head())
inputMovies = pd.merge(inputId, inputMovies)
# inputMovies.head()
inputMovies = inputMovies.drop("genres", axis= 1).drop("year", axis= 1)
inputMovies

Unnamed: 0,movieId,title,rating
0,2571,"Matrix, The",5.0
1,85414,Source Code,5.0
2,109487,Interstellar,5.0
3,111759,Edge of Tomorrow,3.5


In [38]:
userMovies = movies_with_genres_df[movies_with_genres_df["movieId"].isin(inputMovies["movieId"].tolist())]
userMovies

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
2487,2571,"Matrix, The","[Action, Sci-Fi, Thriller]",1999,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16929,85414,Source Code,"[Action, Drama, Mystery, Sci-Fi, Thriller]",2011,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23044,109487,Interstellar,"[Sci-Fi, IMAX]",2014,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
23619,111759,Edge of Tomorrow,"[Action, Sci-Fi, IMAX]",2014,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
# Resetting the index
userMovies = userMovies.reset_index(drop= True)
# Dropping unnecessary columns to save memory
user_genre_table = userMovies.drop("movieId", axis= 1).drop("title", axis= 1).drop("genres", axis= 1).drop("year", axis= 1)
user_genre_table

Unnamed: 0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
inputMovies["rating"]

0    5.0
1    5.0
2    5.0
3    3.5
Name: rating, dtype: float64

In [41]:
# Dot product to get weights

userProfile = user_genre_table.transpose().dot(inputMovies['rating'])

print(user_genre_table.transpose())
print(inputMovies["rating"])
userProfile

                      0    1    2    3
Adventure           0.0  0.0  0.0  0.0
Animation           0.0  0.0  0.0  0.0
Children            0.0  0.0  0.0  0.0
Comedy              0.0  0.0  0.0  0.0
Fantasy             0.0  0.0  0.0  0.0
Romance             0.0  0.0  0.0  0.0
Drama               0.0  1.0  0.0  0.0
Action              1.0  1.0  0.0  1.0
Crime               0.0  0.0  0.0  0.0
Thriller            1.0  1.0  0.0  0.0
Horror              0.0  0.0  0.0  0.0
Mystery             0.0  1.0  0.0  0.0
Sci-Fi              1.0  1.0  1.0  1.0
IMAX                0.0  0.0  1.0  1.0
Documentary         0.0  0.0  0.0  0.0
War                 0.0  0.0  0.0  0.0
Musical             0.0  0.0  0.0  0.0
Western             0.0  0.0  0.0  0.0
Film-Noir           0.0  0.0  0.0  0.0
(no genres listed)  0.0  0.0  0.0  0.0
0    5.0
1    5.0
2    5.0
3    3.5
Name: rating, dtype: float64


Adventure              0.0
Animation              0.0
Children               0.0
Comedy                 0.0
Fantasy                0.0
Romance                0.0
Drama                  5.0
Action                13.5
Crime                  0.0
Thriller              10.0
Horror                 0.0
Mystery                5.0
Sci-Fi                18.5
IMAX                   8.5
Documentary            0.0
War                    0.0
Musical                0.0
Western                0.0
Film-Noir              0.0
(no genres listed)     0.0
dtype: float64

In [42]:
movies_with_genres_df
genreTable = movies_with_genres_df.set_index(movies_with_genres_df['movieId'])
genreTable = genreTable.drop("movieId", axis= 1).drop("title", axis= 1).drop("genres", axis= 1).drop("year", axis= 1)
genreTable.head()

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,Documentary,War,Musical,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [43]:
genreTable.shape

(34208, 20)

In [44]:

print(userProfile.sum())
recommendation_df = ((genreTable*userProfile).sum(axis= 1))/(userProfile.sum())
recommendation_df.head()

60.5


movieId
1    0.000000
2    0.000000
3    0.000000
4    0.082645
5    0.000000
dtype: float64

In [45]:
# sorting recommendation
recommendation_df = recommendation_df.sort_values(ascending= False)
recommendation_df.head()

movieId
60684    1.000000
79132    1.000000
26701    0.859504
198      0.859504
85414    0.859504
dtype: float64

In [46]:
# Movies recommendation for the new user
movies_df[movies_df["movieId"].isin(recommendation_df.head(20).keys())]

Unnamed: 0,movieId,title,genres,year
196,198,Strange Days,"[Action, Crime, Drama, Mystery, Sci-Fi, Thriller]",1995
6261,6365,"Matrix Reloaded, The","[Action, Adventure, Sci-Fi, Thriller, IMAX]",2003
6823,6934,"Matrix Revolutions, The","[Action, Adventure, Sci-Fi, Thriller, IMAX]",2003
9000,26701,Patlabor: The Movie (Kidô keisatsu patorebâ: T...,"[Action, Animation, Crime, Drama, Film-Noir, M...",1989
10872,43932,Pulse,"[Action, Drama, Fantasy, Horror, Mystery, Sci-...",2006
10896,44191,V for Vendetta,"[Action, Sci-Fi, Thriller, IMAX]",2006
11838,52722,Spider-Man 3,"[Action, Adventure, Sci-Fi, Thriller, IMAX]",2007
11984,53996,Transformers,"[Action, Sci-Fi, Thriller, IMAX]",2007
12276,56174,I Am Legend,"[Action, Horror, Sci-Fi, Thriller, IMAX]",2007
12873,60684,Watchmen,"[Action, Drama, Mystery, Sci-Fi, Thriller, IMAX]",2009


In [47]:
# summary
# Learned how to do movie recommendation
# More data cleaning and visualization


In [48]:
print("The, End!")

The, End!
