In [None]:
# Mount Google Drive inside the Colab VM; the dataset is later read from
# "/content/drive/My Drive/...". force_remount=True asks Colab to mount again
# even if a mount is already present.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


# Importing libraries and data

In [None]:
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pylab import rcParams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the movie catalogue from the mounted Drive folder. The file is
# ';'-separated and its first line is the header.
# Every NaN is then swapped for None.
# NOTE(review): the `movies.dtypes` output further down lists numeric-looking
# columns (runtime, revenue, popularity, ...) as object - presumably a side
# effect of this NaN -> None replacement; confirm whether that is intended.
movies = (
    pd.read_csv(
        "/content/drive/My Drive/MovieRecommender/filtered.csv",
        sep=';',
        header=0,
    )
    .replace({np.nan: None})
)

In [None]:
# NOTE(review): dead cell - the earlier loader survives only inside a string
# literal and is never executed. Consider deleting it or moving the history
# into a markdown note.
''' With previously used data given by MadewithML

# Load movies
movies = pd.read_csv("/content/drive/My Drive/MovieRecommender/filtered.csv", header=0, parse_dates=['date'])
movies = movies.replace({np.nan: None}) # replace NaN with None 

'''

# User defined functions

In [None]:
def display_all(df):
    """Show ``df`` with pandas' row and column display limits raised to 1000.

    The limits are set through a temporary option context, so the global
    pandas display options are back to their previous values once the frame
    has been rendered. ``display`` is not imported in this notebook; it is
    presumably the rich-display function that IPython injects.
    """
    widened_limits = ("display.max_rows", 1000, "display.max_columns", 1000)
    with pd.option_context(*widened_limits):
        display(df)

# EDA

In [None]:
# Preview the first five rows to check that the CSV parsed as expected.
movies.head()

In [None]:
# NOTE(review): dead cell - a string-literal copy of display_all, which is
# already defined for real under "User defined functions". Because the string
# is the cell's last expression, the notebook echoes it as the cell output.
# Consider deleting this cell.
'''
# Function to display all rows and columns
def display_all(df):
    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): 
        display(df)
'''

'\n# Function to display all rows and columns\ndef display_all(df):\n    with pd.option_context("display.max_rows", 1000, "display.max_columns", 1000): \n        display(df)\n'

In [None]:
# Show the last five rows transposed (one column per movie, one row per field)
# so that every field is visible without horizontal truncation.
display_all(movies.tail().T)

Observations:
- Some cells contain empty values. Specifically:
    - Some movies do not have a tagline, keywords, date, collection, revenue, budget, or production companies.
- Values in the genres and keywords columns are stored as a single comma-separated string; they need to be split and stored as lists. (#Data cleaning1)

In [None]:
# Summary statistics for every column (include='all' also covers non-numeric
# columns), transposed so each original column becomes one row.
display_all(movies.describe(include='all').T)

In [None]:
# Column dtypes of the loaded frame (after the NaN -> None replacement).
movies.dtypes

id                       int64
title                   object
tagline                 object
description             object
genres                  object
keywords                object
date                    object
collection              object
runtime                 object
revenue                 object
budget                   int64
director                object
cast                    object
production_companies    object
production_countries    object
popularity              object
average_vote            object
num_votes               object
language                object
imdb_id                 object
poster_url              object
dtype: object

# User profile

In [None]:
# (rows, columns) of the full catalogue, for comparison with the subset below.
movies.shape

(45416, 21)

In [None]:
# Keep only the movies that have keywords (the text fed to TF-IDF below) and
# renumber the rows 0..n-1 so that row labels coincide with row positions.
movies_subset = (
    movies
    .dropna(subset=['keywords'])
    .reset_index(drop=True)
)

In [None]:
# First ten rows of the keyword-bearing subset.
movies_subset.head(10)

In [None]:
# Shapes before and after dropping the movies without keywords, one per line.
print(movies.shape, movies_subset.shape, sep='\n')

(45416, 21)
(31088, 21)


In [None]:
# Treat each movie's keyword string as one document: drop English stop words,
# learn the vocabulary and produce the TF-IDF matrix (one row per movie).
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vectorizer.fit_transform(movies_subset['keywords'])

In [None]:
# Convert the TF-IDF matrix to a dense NumPy array; recommendations() later
# indexes its rows with a list of positions and averages them.
# NOTE(review): at the shape reported below (31088 x 12595) a dense float64
# array is roughly 3 GB - confirm the runtime has the memory, or keep the
# matrix sparse.
tfidf_np_matrix = tfidf_matrix.toarray()

In [None]:
# (number of movies, vocabulary size) of the dense TF-IDF matrix.
tfidf_np_matrix.shape

(31088, 12595)

In [None]:
# TF-IDF weights of the first movie in movies_subset.
tfidf_np_matrix[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# Title lookup series: entry i is the title of row i of movies_subset, which
# is also row i of the TF-IDF matrix. Peek at the first five entries.
indices = pd.Series(movies_subset['title'])
indices.head(5)

0            LEGO DC Super Hero Girls: Brain Drain
1                        The Truth Is in the Stars
2                                         Firebase
3                        Chris D'Elia: Man on Fire
4    Rory Scovel Tries Stand-Up for the First Time
Name: title, dtype: object

In [None]:
def recommendations(titles, tfidf_np_matrix=tfidf_np_matrix, top_n=10):
  """Recommend movies whose keyword profile is closest to a set of liked titles.

  The TF-IDF rows of the given titles are averaged into one user-preference
  vector, every movie is scored by cosine similarity against that vector, and
  the best-scoring movies that are not among the inputs are returned.

  Parameters
  ----------
  titles : str or iterable of str
      Titles exactly as they appear in ``movies_subset.title``. A single
      string is treated as a one-element list. When a title occurs more than
      once in the catalogue, its first occurrence is used.
  tfidf_np_matrix : numpy.ndarray, optional
      Dense 2-D TF-IDF matrix whose row ``i`` describes
      ``movies_subset.iloc[i]``. Defaults to the notebook-level matrix. Both
      the profile vector and the similarity scores are computed from it.
  top_n : int, optional
      Number of recommendations to return (default 10).

  Returns
  -------
  pandas.DataFrame
      Columns ``movie_title``, ``genres``, ``keywords``, ``popularity``,
      ``average_vote``, ``num_votes`` and ``cosine_score``, ordered from most
      to least similar.

  Raises
  ------
  ValueError
      If ``titles`` is empty or ``top_n`` is negative.
  IndexError
      If a title is not in the catalogue. The exception type matches what the
      bare ``.index[0]`` lookup used to raise; the message now names the
      offending titles.
  """
  if isinstance(titles, str):
    # A bare string would otherwise be iterated character by character.
    titles = [titles]
  titles = list(titles)
  if not titles:
    raise ValueError("titles must contain at least one movie title")
  if top_n < 0:
    raise ValueError("top_n must be non-negative")

  # Resolve every title to its row position, collecting all misses so the
  # caller sees every unknown title at once.
  seed_rows = []
  unknown_titles = []
  for title in titles:
    hits = np.flatnonzero((indices == title).to_numpy())
    if hits.size:
      seed_rows.append(int(hits[0]))
    else:
      unknown_titles.append(title)
  if unknown_titles:
    raise IndexError(f"titles not found in movies_subset: {unknown_titles}")

  matrix = np.asarray(tfidf_np_matrix)
  user_pref_vector = matrix[seed_rows].mean(axis=0)

  # Cosine similarity of every row against the profile. einsum and the
  # matrix-vector product avoid allocating a second full-size copy of the
  # dense matrix. Zero-norm rows (or a zero profile) get a score of 0.
  row_norms = np.sqrt(np.einsum('ij,ij->i', matrix, matrix))
  denominators = row_norms * np.linalg.norm(user_pref_vector)
  denominators[denominators == 0] = 1.0
  sim_scores = (matrix @ user_pref_vector) / denominators

  # Best first; the stable sort keeps tied movies in catalogue order.
  ranking = np.argsort(-sim_scores, kind='stable')
  # Exclude the seed movies by identity rather than assuming they occupy the
  # first len(titles) ranks (they need not, e.g. when the profile is pulled
  # towards another cluster or a title is passed twice).
  ranking = ranking[~np.isin(ranking, seed_rows)][:top_n]

  recommended_movies = [
    [
      movies_subset.title.iloc[i],
      movies_subset.genres.iloc[i],
      movies_subset.keywords.iloc[i],
      movies_subset.popularity.iloc[i],
      movies_subset.average_vote.iloc[i],
      movies_subset.num_votes.iloc[i],
      sim_scores[i],
    ]
    for i in ranking
  ]

  return pd.DataFrame(
    recommended_movies,
    columns=['movie_title','genres','keywords','popularity','average_vote','num_votes','cosine_score'],
  )

In [None]:
# Example: build a preference profile from four liked movies and list the
# closest matches from the catalogue.
recommendations(['Toy Story', 'The Dark Knight','The Dark Knight Rises','Ace Ventura: When Nature Calls'])

[18031, 11244, 7704, 18387]
[0. 0. 0. ... 0. 0. 0.]
       sim_score
11244   0.652236
7704    0.652236
18031   0.452569
18387   0.452569
12964   0.378137
20256   0.359609
17518   0.347478
19399   0.328999
320     0.327274
900     0.308285
1237    0.307096
9886    0.303715
7179    0.302242
163     0.298729
1236    0.298729
6796    0.293774
1457    0.291654
7580    0.285195
2285    0.280284
1724    0.279583
[11244, 7704, 18031, 18387, 12964, 20256, 17518, 19399, 320, 900, 1237, 9886, 7179, 163]


Unnamed: 0,movie_title,genres,keywords,popularity,average_vote,num_votes,cosine_score
0,Batman Begins,"action, crime, drama","himalaya, martial arts, dc comics, crime fight...",28.505341,7.5,7511.0,0.378137
1,Batman,"fantasy, action","double life, dc comics, dual identity, chemica...",19.10673,7.0,2145.0,0.359609
2,Batman & Robin,"action, crime, fantasy","double life, dc comics, dual identity, crime f...",17.038824,4.2,1447.0,0.347478
3,Batman Returns,"action, fantasy","holiday, corruption, double life, dc comics, c...",15.001681,6.6,1706.0,0.328999
4,LEGO DC Comics Super Heroes: Justice League - ...,"adventure, animation, action","dc comics, gotham city, joker, super powers, lego",3.581416,7.0,22.0,0.327274
5,Batman Unlimited: Monster Mayhem,"action, animation, family","dc comics, joker, superhero, super powers",2.772324,6.0,40.0,0.308285
6,Batman vs. Robin,"action, adventure, animation","dc comics, gotham city, based on comic, robin,...",5.48942,6.8,218.0,0.307096
7,Justice League: The New Frontier,"action, adventure, animation, science fiction","dc comics, superhero, based on comic, super po...",7.031719,6.6,115.0,0.303715
8,Barbie and the Three Musketeers,"animation, family",based on toy,6.670165,6.2,81.0,0.302242
9,Batman: Return of the Caped Crusaders,"action, animation, comedy","dc comics, superhero, super powers",3.495883,7.2,37.0,0.298729


In [None]:
# Case-insensitive search for every title containing "Ace Ventura"; rows with
# a missing title count as non-matches.
movies_subset[movies_subset['title'].str.contains('Ace Ventura', case=False, na=False)]

Unnamed: 0,id,title,tagline,description,genres,keywords,date,collection,runtime,revenue,budget,director,cast,production_companies,production_countries,popularity,average_vote,num_votes,language,imdb_id,poster_url
17,9273,Ace Ventura: When Nature Calls,New animals. New adventures. Same hair.,"Summoned from an ashram in Tibet, Ace finds hi...","crime, comedy, adventure","africa, indigenous, human animal relationship,...",1995-11-10 00:00:00,Ace Ventura Collection,90,212386000.0,30000000,Steve Oedekerk,"Jim Carrey, Ian McNeice, Simon Callow, Maynard...","O Entertainment, Warner Bros., Morgan Creek Pr...",United States of America,8.20545,6.1,1128,en,tt0112281,/wRlGnJhEzcxBjvWtvbjhDSU1cIY.jpg
297,3049,Ace Ventura: Pet Detective,"He's the best there is! (Actually, he's the on...",He's Ace Ventura: Pet Detective. Jim Carrey is...,"comedy, mystery","dolphin, mascot, private detective, pets",1994-02-04 00:00:00,Ace Ventura Collection,86,107217000.0,15000000,Tom Shadyac,"Jim Carrey, Sean Young, Courteney Cox, Tone Lo...","Warner Bros., Morgan Creek Productions",United States of America,11.248,6.4,1684,en,tt0109040,/nZirljb8XYbKTWsRQTplDGhx39Q.jpg
