In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
#from surprise import Reader, Dataset, SVD, evaluate

import warnings; warnings.simplefilter('ignore')

In [2]:
md = pd.read_csv("the-movies-dataset/movies_metadata.csv")

In [3]:
md['genres'] = md['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [4]:
vote_counts = md[md['vote_count'].notnull()]['vote_count'].astype('int')
vote_averages = md[md['vote_average'].notnull()]['vote_average'].astype('int')
C = vote_averages.mean()
C

5.244896612406511

In [5]:
m = vote_counts.quantile(0.95)
m

434.0

In [6]:
md['year'] = pd.to_datetime(md['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x != np.nan else np.nan)

qualified = md[(md['vote_count'] >= m) & (md['vote_count'].notnull()) & (md['vote_average'].notnull())][['title', 'vote_count', 'year', 'vote_average', 'popularity', 'genres']]
qualified['vote_count'] = qualified['vote_count'].astype('int')
qualified['vote_average'] = qualified['vote_average'].astype('int')
qualified.shape

(2274, 6)

In [7]:
md.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'year'],
      dtype='object')

In [8]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [9]:
qualified['wr'] = qualified.apply(weighted_rating, axis=1)

In [10]:
links_small = pd.read_csv('the-movies-dataset/links_small.csv')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [11]:
md = md.drop([19730, 29503, 35587])
md['id'] = md['id'].astype('int')
smd = md[md['id'].isin(links_small)]
smd.shape

(9099, 25)

In [12]:
smd['wr'] = smd.apply(weighted_rating, axis = 1)

In [13]:
smd.sort_values('wr', ascending = False).head(10)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count,year,wr
314,False,,25000000,"[Drama, Crime]",,278,tt0111161,en,The Shawshank Redemption,Framed in the 1940s for the double murder of h...,...,142.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Fear can hold you prisoner. Hope can set you f...,The Shawshank Redemption,False,8.5,8358.0,1994,8.339318
834,False,"{'id': 230, 'name': 'The Godfather Collection'...",6000000,"[Drama, Crime]",http://www.thegodfather.com/,238,tt0068646,en,The Godfather,"Spanning the years 1945 to 1955, a chronicle o...",...,175.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,An offer you can't refuse.,The Godfather,False,8.5,6024.0,1972,8.281246
12481,False,"{'id': 263, 'name': 'The Dark Knight Collectio...",185000000,"[Drama, Action, Crime, Thriller]",http://thedarkknight.warnerbros.com/dvdsite/,155,tt0468569,en,The Dark Knight,Batman raises the stakes in his war on crime. ...,...,152.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Why So Serious?,The Dark Knight,False,8.3,12269.0,2008,8.195622
2843,False,,63000000,[Drama],http://www.foxmovies.com/movies/fight-club,550,tt0137523,en,Fight Club,A ticking-time-bomb insomniac and a slippery s...,...,139.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Mischief. Mayhem. Soap.,Fight Club,False,8.3,9678.0,1999,8.168877
292,False,,8000000,"[Thriller, Crime]",,680,tt0110912,en,Pulp Fiction,"A burger-loving hit man, his philosophical par...",...,154.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Just because you are a character doesn't mean ...,Pulp Fiction,False,8.3,8670.0,1994,8.154359
351,False,,55000000,"[Comedy, Drama, Romance]",,13,tt0109830,en,Forrest Gump,A man with a low IQ has accomplished great thi...,...,142.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,"The world will never be the same, once you've ...",Forrest Gump,False,8.2,8147.0,1994,8.05054
522,False,,22000000,"[Drama, History, War]",http://www.schindlerslist.com/,424,tt0108052,en,Schindler's List,The true story of how businessman Oskar Schind...,...,195.0,"[{'iso_639_1': 'de', 'name': 'Deutsch'}, {'iso...",Released,"Whoever saves one life, saves the world entire.",Schindler's List,False,8.3,4436.0,1993,8.027738
23673,False,,3300000,[Drama],http://sonyclassics.com/whiplash/,244786,tt2582802,en,Whiplash,"Under the direction of a ruthless instructor, ...",...,105.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The road to greatness can take you to the edge.,Whiplash,False,8.3,4376.0,2014,8.024342
15480,False,,160000000,"[Action, Thriller, Science Fiction, Mystery, A...",http://inceptionmovie.warnerbros.com/,27205,tt1375666,en,Inception,"Cobb, a skilled thief who commits corporate es...",...,148.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Your mind is the scene of the crime.,Inception,False,8.1,14075.0,2010,8.014597
1154,False,"{'id': 10, 'name': 'Star Wars Collection', 'po...",18000000,"[Adventure, Action, Science Fiction]",http://www.starwars.com/films/star-wars-episod...,1891,tt0080684,en,The Empire Strikes Back,"The epic saga continues as Luke Skywalker, in ...",...,124.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,The Adventure Continues...,The Empire Strikes Back,False,8.2,5998.0,1980,8.000604


In [14]:
credits = pd.read_csv('the-movies-dataset/credits.csv')
keywords = pd.read_csv('the-movies-dataset/keywords.csv')



In [15]:
keywords['id'] = keywords['id'].astype('int')
credits['id'] = credits['id'].astype('int')
md['id'] = md['id'].astype('int')

In [16]:
md = md.merge(credits, on='id')
md = md.merge(keywords, on='id')

In [17]:
smd = md[md['id'].isin(links_small)]
smd.shape

(9219, 28)

In [18]:
smd['cast'] = smd['cast'].apply(literal_eval)
smd['crew'] = smd['crew'].apply(literal_eval)
smd['keywords'] = smd['keywords'].apply(literal_eval)

In [19]:
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

In [20]:
smd['director'] = smd['crew'].apply(get_director)
smd['cast'] = smd['cast'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
smd['cast'] = smd['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
smd['keywords'] = smd['keywords'].apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])

In [21]:
smd['cast'] = smd['cast'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

s = smd.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'

In [22]:
s = s.value_counts()
s[:5]

independent film        610
woman director          550
murder                  399
duringcreditsstinger    327
based on novel          318
Name: keyword, dtype: int64

In [23]:
s = s[s > 1]

In [24]:
stemmer = SnowballStemmer('english')
stemmer.stem(' fucks')

' fuck'

In [25]:
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [26]:
smd['keywords'] = smd['keywords'].apply(filter_keywords)
smd['keywords'] = smd['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
smd['keywords'] = smd['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [27]:
smd['keywords'] = smd['keywords']  + smd['cast'] + smd['genres'] + smd['genres']

In [28]:
smd['director'] = smd['director'].apply(lambda x: [x])

In [29]:
smd['keywords'] = smd['keywords'] + smd['director'] + smd['cast']

In [30]:
smd['cast'][0]

['tomhanks', 'timallen', 'donrickles']

In [31]:
def list_to_str(x):
    list1 = x
    str1 = ''.join(str(e)+" " for e in list1)
    return str1

In [32]:
smd['soup'] = smd['keywords'].apply(list_to_str)

In [33]:
smd['soup']

0        jealousi toy boy friendship friend rivalri boy...
1        boardgam disappear basedonchildren'sbook newho...
2        fish bestfriend duringcreditssting waltermatth...
3        basedonnovel interracialrelationship singlemot...
4        babi midlifecrisi confid age daughter motherda...
5        robberi detect bank obsess chase shoot thief h...
6        pari brotherbrotherrelationship chauffeur long...
7        jonathantaylorthomas bradrenfro rachaelleighco...
8        terrorist hostag explos vicepresid jean-claude...
9        cuba falselyaccus secretident computervirus se...
10       whitehous usapresid newlov widow michaeldougla...
11       dracula spoof leslienielsen melbrooks amyyasbe...
12       wolf alaska dog goos bearattack kevinbacon bob...
13       usapresid presidentialelect watergatescand bio...
14       exoticisland treasur map ship scalp pirat geen...
15       poker drugabus 1970s overdos illegalprostitut ...
16       bowl basedonnovel servant countrylif janeauste.

In [34]:
list1 = [1, 2, 3]
str1 = ''.join(str(e) for e in list1)
str1

'123'

In [35]:
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
count_matrix = count.fit_transform(smd['soup'])

In [None]:
cosine_sim = cosine_similarity(count_matrix, count_matrix)

In [85]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [86]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [87]:
get_recommendations('The Dark Knight').head(10)

8031         The Dark Knight Rises
6218                 Batman Begins
7659    Batman: Under the Red Hood
8927       Kidnapping Mr. Heineken
1134                Batman Returns
5943                      Thursday
6623                  The Prestige
1260                Batman & Robin
2560                 Boiling Point
3597                    Off Limits
Name: title, dtype: object

In [88]:
smd.columns

Index(['index', 'adult', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'id', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count', 'year', 'cast', 'crew', 'keywords',
       'director', 'soup'],
      dtype='object')

In [92]:
smd = smd.drop([ 'adult', 'belongs_to_collection', 'budget', 'genres',
       'homepage', 'imdb_id', 'original_language', 'original_title',
       'overview', 'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'video', 'year', 'cast', 'crew', 'keywords',
       'director', 'soup'], axis = 1)

In [93]:
smd.to_csv("perc3.csv")

In [94]:
smd

Unnamed: 0,index,id,title
0,0,862,Toy Story
1,1,8844,Jumanji
2,2,15602,Grumpier Old Men
3,3,31357,Waiting to Exhale
4,4,11862,Father of the Bride Part II
5,5,949,Heat
6,6,11860,Sabrina
7,7,45325,Tom and Huck
8,8,9091,Sudden Death
9,9,710,GoldenEye


In [95]:
import pickle

file = open('model_updated.pickle', 'wb')
pickle.dump(cosine_sim, file)

In [97]:
pickled2 = open("model_updated.pickle", "rb")
cosine = pickle.load(pickled2)

In [98]:
cosine == cosine_sim

array([[ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       ...,
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True],
       [ True,  True,  True, ...,  True,  True,  True]])

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [103]:
smd[smd['title'] == 'Rush']

Unnamed: 0,index,id,title
5402,7713,20289,Rush
8493,21810,96721,Rush


In [104]:
smd = smd.drop(5402)

In [105]:
smd[smd['title'] == 'Rush']

Unnamed: 0,index,id,title
8493,21810,96721,Rush


In [107]:
smd = smd.reset_index()
titles = smd['title']
indices = pd.Series(smd.index, index=smd['title'])

In [108]:
def get_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

In [117]:
get_recommendations('').head(10)

3493                                        Scary Movie 2
6532                                     End of the Spear
3041                                          Scary Movie
3271                              I'm Gonna Git You Sucka
349                                A Low Down Dirty Shame
59      Don't Be a Menace to South Central While Drink...
7838                                                Super
334                   The Naked Gun 33⅓: The Final Insult
1390                                            Senseless
4054                                           Stir Crazy
Name: title, dtype: object

In [110]:
md[md['title'] == 'Rush']

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,status,tagline,title,video,vote_average,vote_count,year,cast,crew,keywords
7713,False,,0,"[Crime, Drama]",,20289,tt0102820,en,Rush,Undercover cop Jim Raynor (Jason Patric) is a ...,...,Released,How far do they go before they've gone too far?,Rush,False,6.9,43.0,1991,"[{'cast_id': 1, 'character': 'Jim Raynor', 'cr...","[{'credit_id': '52fe43dfc3a368484e003237', 'de...","[{'id': 572, 'name': 'sex'}, {'id': 1556, 'nam..."
21810,False,,38000000,"[Drama, Action]",,96721,tt1979320,en,Rush,A biographical drama centered on the rivalry b...,...,Released,Everyone's driven by something.,Rush,False,7.7,2310.0,2013,"[{'cast_id': 2, 'character': 'Niki Lauda', 'cr...","[{'credit_id': '52fe49bf9251416c750d20f1', 'de...","[{'id': 5378, 'name': 'world champion'}, {'id'..."
