## Import Libraries

In [1]:
import pandas as pd
pd.set_option('display.max_columns', None)

In [11]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import SVD, Reader, Dataset 
from surprise.model_selection import cross_validate

## Data Understanding

In [59]:
movies = pd.read_csv("MovieDatasetOriginal.csv")
movies.head()

Unnamed: 0.1,Unnamed: 0,id,imdb_id,popularity,budget,revenue,original_title,cast,homepage,director,tagline,keywords,overview,runtime,genres,production_companies,release_date,vote_count,vote_average,release_year,budget_adj,revenue_adj,profit,popularity_level
0,0,135397,tt0369610,32.985763,150000000,1513528810,Jurassic World,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,http://www.jurassicworld.com/,Colin Trevorrow,The park is open.,monster|dna|tyrannosaurus rex|velociraptor|island,Twenty-two years after the events of Jurassic ...,124,Action|Adventure|Science Fiction|Thriller,Universal Studios|Amblin Entertainment|Legenda...,9/6/2015,5562,6.5,2015,137999939.3,1392446000.0,1363528810,High
1,1,76341,tt1392190,28.419936,150000000,378436354,Mad Max: Fury Road,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,http://www.madmaxmovie.com/,George Miller,What a Lovely Day.,future|chase|post-apocalyptic|dystopia|australia,An apocalyptic story set in the furthest reach...,120,Action|Adventure|Science Fiction|Thriller,Village Roadshow Pictures|Kennedy Miller Produ...,13/5/2015,6185,7.1,2015,137999939.3,348161300.0,228436354,High
2,2,262500,tt2908446,13.112507,110000000,295238201,Insurgent,Shailene Woodley|Theo James|Kate Winslet|Ansel...,http://www.thedivergentseries.movie/#insurgent,Robert Schwentke,One Choice Can Destroy You,based on novel|revolution|dystopia|sequel|dyst...,Beatrice Prior must confront her inner demons ...,119,Adventure|Science Fiction|Thriller,Summit Entertainment|Mandeville Films|Red Wago...,18/3/2015,2480,6.3,2015,101199955.5,271619000.0,185238201,High
3,3,140607,tt2488496,11.173104,200000000,2068178225,Star Wars: The Force Awakens,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,http://www.starwars.com/films/star-wars-episod...,J.J. Abrams,Every generation has a story.,android|spaceship|jedi|space opera|3d,Thirty years after defeating the Galactic Empi...,136,Action|Adventure|Science Fiction|Fantasy,Lucasfilm|Truenorth Productions|Bad Robot,15/12/2015,5292,7.5,2015,183999919.0,1902723000.0,1868178225,High
4,4,168259,tt2820852,9.335014,190000000,1506249360,Furious 7,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,http://www.furious7.com/,James Wan,Vengeance Hits Home,car race|speed|revenge|suspense|car,Deckard Shaw seeks revenge against Dominic Tor...,137,Action|Crime|Thriller,Universal Pictures|Original Film|Media Rights ...,1/4/2015,2947,7.3,2015,174799923.1,1385749000.0,1316249360,High


In [26]:
movies.columns

Index(['Unnamed: 0', 'id', 'imdb_id', 'popularity', 'budget', 'revenue',
       'original_title', 'cast', 'homepage', 'director', 'tagline', 'keywords',
       'overview', 'runtime', 'genres', 'production_companies', 'release_date',
       'vote_count', 'vote_average', 'release_year', 'budget_adj',
       'revenue_adj', 'profit', 'popularity_level'],
      dtype='object')

## Data Preprocessing for 'Overview' Content Based Recommender System

In [27]:
movies[['overview']].head()

Unnamed: 0,overview
0,Twenty-two years after the events of Jurassic ...
1,An apocalyptic story set in the furthest reach...
2,Beatrice Prior must confront her inner demons ...
3,Thirty years after defeating the Galactic Empi...
4,Deckard Shaw seeks revenge against Dominic Tor...


In [15]:
movies.isnull().sum()

overview    0
dtype: int64

### Constructing TF-IDF Matrix

Term Frequency (TF) is the relative frequency of a word in a document and is given as (term instances/total instances). Inverse Document Frequency (IDF) is the relative count of documents containing the term and is given as log(number of documents/documents with term). The overall importance of each word to the documents in which they appear is equal to TF * IDF

This gives us a matrix where each column represents a word in the overall overview vocabulary and each row represents a movie.This is done to reduce the importance of words that occur frequently in plot overviews and therefore, their significance in computing the final similarity score.

In [28]:
tfidfv=TfidfVectorizer(analyzer='word', stop_words='english')
tfidfv_matrix=tfidfv.fit_transform(movies['overview'])
print(tfidfv_matrix.todense())
tfidfv_matrix.todense().shape

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


(1287, 10470)

### Computing Similarity Score

In [29]:
cosine_sim1 = linear_kernel(tfidfv_matrix, tfidfv_matrix)

In [30]:
cosine_sim1.shape 

(1287, 1287)

In [31]:
indices=pd.Series(data=list(movies.index), index= movies['original_title'] )

In [32]:
indices.head()

original_title
Jurassic World                  0
Mad Max: Fury Road              1
Insurgent                       2
Star Wars: The Force Awakens    3
Furious 7                       4
dtype: int64

## Modelling

In [37]:
# Function that takes in movie title as input and outputs most similar movies
def content_recommendations(title, cosine_sim):
    
    # Get the index of the movie that matches the title
    idx = indices[title]
    
    # Get the pairwsie similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort the movies based on the similarity scores
    sim_scores.sort(key=lambda x: x[1], reverse=True)

    # Get the scores of the 10 most similar movies
    sim_scores=sim_scores[1:11]
    
    # Get the movie indices
    ind=[]
    for (x,y) in sim_scores:
        ind.append(x)
        
    # Return the top 10 most similar movies
    tit=[]
    for x in ind:
        tit.append(movies.iloc[x]['original_title'])
    return pd.Series(data=tit, index=ind)

In [38]:
content_recommendations('The Dark Knight Rises',cosine_sim1)

453                      The Dark Knight
914                        Batman Begins
80                  Secret in Their Eyes
486                            Max Payne
612                              Beastly
112                      American Sniper
815                            Daredevil
1207                              Eraser
521     Justice League: The New Frontier
216                              Case 39
dtype: object

In [39]:
content_recommendations('The Avengers',cosine_sim1)

14                  Avengers: Age of Ultron
24             Kingsman: The Secret Service
220                                    Push
1044                 The Day After Tomorrow
48                                      Spy
1065             Team America: World Police
644                             The Dilemma
981                            The Fountain
1241    Lock, Stock and Two Smoking Barrels
385         Wall Street: Money Never Sleeps
dtype: object

## Movie Cast, Crew, Keywords, Genres Based Recommender

## Data Preprocessing

In [60]:
movies[['cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,cast,director,keywords,genres
0,Chris Pratt|Bryce Dallas Howard|Irrfan Khan|Vi...,Colin Trevorrow,monster|dna|tyrannosaurus rex|velociraptor|island,Action|Adventure|Science Fiction|Thriller
1,Tom Hardy|Charlize Theron|Hugh Keays-Byrne|Nic...,George Miller,future|chase|post-apocalyptic|dystopia|australia,Action|Adventure|Science Fiction|Thriller
2,Shailene Woodley|Theo James|Kate Winslet|Ansel...,Robert Schwentke,based on novel|revolution|dystopia|sequel|dyst...,Adventure|Science Fiction|Thriller
3,Harrison Ford|Mark Hamill|Carrie Fisher|Adam D...,J.J. Abrams,android|spaceship|jedi|space opera|3d,Action|Adventure|Science Fiction|Fantasy
4,Vin Diesel|Paul Walker|Jason Statham|Michelle ...,James Wan,car race|speed|revenge|suspense|car,Action|Crime|Thriller


In [61]:
def format_names(x):
    # Check if the input is a string and contains '|'
    if isinstance(x, str) and '|' in x:
        # Split the string by '|', strip whitespace, and return as a list
        return [name.strip() for name in x.split('|')]
    else:
        # If input is not a string or doesn't contain '|', return an empty list
        return []

# Specify columns to process
features = ['cast', 'keywords', 'genres']

# Apply the format_names function to each specified feature column
for feature in features:
    movies[feature] = movies[feature].apply(format_names)

In [62]:
def format_director(x):
    # Check if the input is a non-empty string
    if isinstance(x, str) and x.strip():  # Ensure the string is not empty after stripping whitespace
        return [x.strip()]  # Return a list with the stripped director name
    else:
        return []  # Return an empty list for invalid or empty input

# Apply the format_director function to the 'director' column
movies['director'] = movies['director'].apply(format_director)

In [63]:
movies[['cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,cast,director,keywords,genres
0,"[Chris Pratt, Bryce Dallas Howard, Irrfan Khan...",[Colin Trevorrow],"[monster, dna, tyrannosaurus rex, velociraptor...","[Action, Adventure, Science Fiction, Thriller]"
1,"[Tom Hardy, Charlize Theron, Hugh Keays-Byrne,...",[George Miller],"[future, chase, post-apocalyptic, dystopia, au...","[Action, Adventure, Science Fiction, Thriller]"
2,"[Shailene Woodley, Theo James, Kate Winslet, A...",[Robert Schwentke],"[based on novel, revolution, dystopia, sequel,...","[Adventure, Science Fiction, Thriller]"
3,"[Harrison Ford, Mark Hamill, Carrie Fisher, Ad...",[J.J. Abrams],"[android, spaceship, jedi, space opera, 3d]","[Action, Adventure, Science Fiction, Fantasy]"
4,"[Vin Diesel, Paul Walker, Jason Statham, Miche...",[James Wan],"[car race, speed, revenge, suspense, car]","[Action, Crime, Thriller]"


In [64]:
def clean_data(x):
    if isinstance(x, list):
        return [str.lower(i.replace(" ", "")) for i in x]
    else:
        #Check if director exists. If not, return empty string
        if isinstance(x, str):
            return str.lower(x.replace(" ", ""))
        else:
            return ''

In [65]:
# Apply clean_data function to your features.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    movies[feature] = movies[feature].apply(clean_data)

In [66]:
movies[['cast', 'director', 'keywords', 'genres']].head()

Unnamed: 0,cast,director,keywords,genres
0,"[chrispratt, brycedallashoward, irrfankhan, vi...",[colintrevorrow],"[monster, dna, tyrannosaurusrex, velociraptor,...","[action, adventure, sciencefiction, thriller]"
1,"[tomhardy, charlizetheron, hughkeays-byrne, ni...",[georgemiller],"[future, chase, post-apocalyptic, dystopia, au...","[action, adventure, sciencefiction, thriller]"
2,"[shailenewoodley, theojames, katewinslet, anse...",[robertschwentke],"[basedonnovel, revolution, dystopia, sequel, d...","[adventure, sciencefiction, thriller]"
3,"[harrisonford, markhamill, carriefisher, adamd...",[j.j.abrams],"[android, spaceship, jedi, spaceopera, 3d]","[action, adventure, sciencefiction, fantasy]"
4,"[vindiesel, paulwalker, jasonstatham, michelle...",[jameswan],"[carrace, speed, revenge, suspense, car]","[action, crime, thriller]"


In [69]:
def create_soup(row):
    # Join keywords, cast, and genres into a single string
    soup_parts = []
    
    # Append keywords (if available)
    if isinstance(row['keywords'], list):
        soup_parts.extend(row['keywords'])
    
    # Append cast (if available)
    if isinstance(row['cast'], list):
        soup_parts.extend(row['cast'])
    
    # Append director (if available)
    if isinstance(row['director'], str):
        soup_parts.append(row['director'])
    
    # Append genres (if available)
    if isinstance(row['genres'], list):
        soup_parts.extend(row['genres'])
    
    # Join all soup parts into a single string
    return ' '.join(soup_parts)

# Apply the create_soup function to each row of the DataFrame along axis=1
movies['soup'] = movies.apply(create_soup, axis=1)

In [71]:
from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(movies['soup'])

In [72]:
cosine_sim2 = cosine_similarity(count_matrix, count_matrix)

In [73]:
movies = movies.reset_index()
indices = pd.Series(movies.index, index=movies['original_title'])

In [75]:
content_recommendations('The Dark Knight Rises', cosine_sim2)

453         The Dark Knight
914           Batman Begins
129     A Most Violent Year
946             Harsh Times
487           Vantage Point
1126                 Hitman
352                  Takers
45                 Blackhat
504              RockNRolla
883          Gangster Squad
dtype: object

In [80]:
content_recommendations('The Avengers', cosine_sim2)

14                 Avengers: Age of Ultron
282                             Iron Man 2
95     Captain America: The Winter Soldier
455                               Iron Man
535     Captain America: The First Avenger
848                   Thor: The Dark World
17                                 Ant-Man
849                             Iron Man 3
539                                   Thor
521       Justice League: The New Frontier
dtype: object

In [81]:
content_recommendations('The Avengers', cosine_sim2)

14                 Avengers: Age of Ultron
282                             Iron Man 2
95     Captain America: The Winter Soldier
455                               Iron Man
535     Captain America: The First Avenger
848                   Thor: The Dark World
17                                 Ant-Man
849                             Iron Man 3
539                                   Thor
521       Justice League: The New Frontier
dtype: object