### Recommender System: Content Based Filtering Using Cosine Similarity

- #### Model 1: Recommendation based on Genre
- #### Model 2: Recommendation based on Title

#### Importing Libraries and Loading Data 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

#### Load Data

In [2]:
# Reading movies file
movies = pd.read_csv('movies.csv')

movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy


In [3]:
# Reading movies file
movies = pd.read_csv('movies.csv', sep=',', encoding='latin-1', usecols=['title', 'genres'])

movies.head()

Unnamed: 0,title,genres
0,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,Jumanji,Adventure|Children|Fantasy
2,Grumpier Old Men,Comedy|Romance
3,Waiting to Exhale,Comedy|Drama|Romance
4,Father of the Bride Part II,Comedy


In [4]:
movies.shape

(9742, 2)

#### Preprocessing
- Break up the big genre string into a string array
- Convert genres to string value

In [5]:
movies['genres'] = movies['genres'].str.split('|')

movies['genres'] = movies['genres'].fillna("").astype('str')

movies.head()

Unnamed: 0,title,genres
0,Toy Story,"['Adventure', 'Animation', 'Children', 'Comedy..."
1,Jumanji,"['Adventure', 'Children', 'Fantasy']"
2,Grumpier Old Men,"['Comedy', 'Romance']"
3,Waiting to Exhale,"['Comedy', 'Drama', 'Romance']"
4,Father of the Bride Part II,['Comedy']


### Model 1: Recommendation based on Genre

- Convert a collection of raw documents to a matrix of TF-IDF features - expressing the importance of a word to a document 
- Using TfidfVectorizer to convert genres in 2-gram words excluding stopwords.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=0, stop_words='english')

tfidf_matrix = tf.fit_transform(movies['genres'])

tfidf_matrix.shape

(9742, 177)

#### Compute & display the tf-idf scores/weight of the words

In [7]:
df = pd.DataFrame(tfidf_matrix[0].T.todense(), index=tf.get_feature_names(), columns=["TF-IDF"])

df = df.sort_values('TF-IDF', ascending=False)

df.head(10)

Unnamed: 0,TF-IDF
comedy fantasy,0.405143
adventure animation,0.400886
animation children,0.383695
children comedy,0.368188
animation,0.323359
children,0.31623
fantasy,0.30254
adventure,0.261108
comedy,0.167614
film noir,0.0


#### Cosine similarity is taken between matrix which is transformed.

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

#### Display first 5 values of the cosine similarity matrix

In [9]:
cosine_sim[:4, :4]

array([[1.        , 0.31379419, 0.0611029 , 0.05271111],
       [0.31379419, 1.        , 0.        , 0.        ],
       [0.0611029 , 0.        , 1.        , 0.35172407],
       [0.05271111, 0.        , 0.35172407, 1.        ]])

#### Build a 1-dimensional array with movie titles


In [10]:
titles = movies['title']

indices = pd.Series(movies.index, index=movies['title'])

indices.head()

title
Toy Story                       0
Jumanji                         1
Grumpier Old Men                2
Waiting to Exhale               3
Father of the Bride Part II     4
dtype: int64

#### Test model

- Generating recommendations based on similar genres and having high cosine similarity.



In [11]:
def genre_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    
    return titles.iloc[movie_indices]

#### Get top 10 movie recommendations based on the cosine similarity score of movie genres


In [12]:
genre_recommendations('Dark Knight ').head(10)

8387                        Need for Speed 
8149    Grandmaster, The (Yi dai zong shi) 
123                              Apollo 13 
8026                            Life of Pi 
8396                                  Noah 
38                         Dead Presidents 
341                            Bad Company 
347           Faster Pussycat! Kill! Kill! 
430                      Menace II Society 
568                        Substitute, The 
Name: title, dtype: object

In [13]:
genre_recommendations('Toy Story ').head(10)

1706                                                Antz 
2355                                         Toy Story 2 
2809             Adventures of Rocky and Bullwinkle, The 
3000                           Emperor's New Groove, The 
3568                                      Monsters, Inc. 
6194                                           Wild, The 
6486                                     Shrek the Third 
6948                             Tale of Despereaux, The 
7760    Asterix and the Vikings (AstÃ©rix et les Vikin...
8219                                               Turbo 
Name: title, dtype: object

In [14]:
genre_recommendations('Jumanji ').head(10)

53                           Indian in the Cupboard, The 
109                           NeverEnding Story III, The 
767                             Escape to Witch Mountain 
1514                  Darby O'Gill and the Little People 
1556                                        Return to Oz 
1617                              NeverEnding Story, The 
1618         NeverEnding Story II: The Next Chapter, The 
1799                              Santa Claus: The Movie 
3574    Harry Potter and the Sorcerer's Stone (a.k.a. ...
6075    Chronicles of Narnia: The Lion, the Witch and ...
Name: title, dtype: object

### Model 2: Recommendation based on Title

- Using TfidfVectorizer to convert genres in 2-gram words excluding stopwords to a matrix of TF-IDF features.

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
tfidf_matrix = tf.fit_transform(movies['title'])

tfidf_matrix.shape

(9742, 20558)

#### Compute & display the tf-idf scores/weight of the words

In [17]:
df = pd.DataFrame(tfidf_matrix[0].T.todense(), index=tf.get_feature_names(), columns=["TF-IDF"])

df = df.sort_values('TF-IDF', ascending=False)

df.head(10)

Unnamed: 0,TF-IDF
toy story,0.653298
toy,0.62319
story,0.429927
00,0.0
outpost,0.0
outsiders,0.0
outside providence,0.0
outside ebbing,0.0
outside,0.0
outro,0.0


#### Cosine similarity is taken between matrix which is transformed.

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

cosine_sim[:4, :4]

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

#### Build a 1-dimensional array with movie titles


In [19]:
titles = movies['title']

indices = pd.Series(movies.index, index=movies['title'])

indices.head()

title
Toy Story                       0
Jumanji                         1
Grumpier Old Men                2
Waiting to Exhale               3
Father of the Bride Part II     4
dtype: int64

#### Function that get movie recommendations based on the cosine similarity score of movie genres


In [20]:
def title_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    movie_indices = [i[0] for i in sim_scores]
    
    return titles.iloc[movie_indices]

#### Get top 10 movie recommendations based on the cosine similarity score of movie genres

In [21]:
title_recommendations('Dark Knight ').head(10)

7768                     Dark Knight Rises, The 
8032    Batman: The Dark Knight Returns, Part 1 
8080    Batman: The Dark Knight Returns, Part 2 
140                                First Knight 
2417                         Cry in the Dark, A 
5778                          Alone in the Dark 
7375                             Knight and Day 
3576                               Black Knight 
3190                           Knight's Tale, A 
6858                       Alone in the Dark II 
Name: title, dtype: object

In [22]:
genre_recommendations('Toy Story ').head(10)

2355           Toy Story 2 
7355           Toy Story 3 
3595              Toy, The 
1570            L.A. Story 
2227      Story of Us, The 
4089          Toy Soldiers 
3187            Love Story 
2110    Christmas Story, A 
4047           Ghost Story 
8736            True Story 
Name: title, dtype: object

In [23]:
genre_recommendations('Jumanji ').head(10)

9636    Jumanji: Welcome to the Jungle 
0                            Toy Story 
2                     Grumpier Old Men 
3                    Waiting to Exhale 
4          Father of the Bride Part II 
5                                 Heat 
6                              Sabrina 
7                         Tom and Huck 
8                         Sudden Death 
9                            GoldenEye 
Name: title, dtype: object