In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
# Turn off pandas' chained-assignment check (SettingWithCopyWarning) for the
# whole kernel session. NOTE(review): this hides the warning everywhere, not
# only where the assignment is known to be safe.
pd.options.mode.chained_assignment = None


In [2]:
# Load the movie metadata table straight from the GitHub-hosted CSV
# (requires network access when the cell runs).
df = pd.read_csv(
    'https://raw.githubusercontent.com/hanhduyenjn/Movie-Recommendation/master/movies_metadata.csv'
)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Positional indices of the columns to discard from the loaded frame.
# NOTE(review): the positions are tied to the CSV's column order and the drop
# mutates `df` in place, so this cell is not safe to re-run without reloading
# `df` first.
cols = [1, 2, 4, 6, 7, 8, 11, 13, 14, 15, 16, 18, 19, 21]
df.drop(columns=df.columns[cols], inplace=True)

In [None]:
# Preview the first 50 rows of `df`.
df.head(50)

In [4]:
# Split movies that belong to more than one genre into separate rows, so that
# `df` ends up with one row per (movie, genre) pair in a new `genre` column.

# Each raw `genres` cell is parsed with literal_eval; when the parsed value is
# a list, the 'name' of every entry is kept, otherwise the cell becomes [].
df['genres'] = (
    df['genres']
    .fillna('[]')
    .apply(literal_eval)
    .apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
)

# Series.explode emits one row per list element and repeats the original index
# label, which is what the join below aligns on. It replaces building a
# pd.Series for every row and stacking the result, which is far slower.
# Movies with an empty genre list end up with NaN in `genre` (explode yields
# NaN for empty lists; the left join preserved those rows before as well).
s = df['genres'].explode()
s.name = 'genre'
df = df.drop('genres', axis=1).join(s)

  This is separate from the ipykernel package so we can avoid doing imports until


In [14]:
def qualified_by_genre(genre, percentile=0.05, movies=None, top_n=100):
    """Rank the most-voted movies of one genre by IMDB's weighted rating.

    For a movie with ``v`` votes and an average vote of ``R`` the score is

        wr = v / (v + m) * R + m / (v + m) * C

    where ``C`` is the mean ``vote_average`` over all movies of the genre and
    ``m`` is the ``1 - percentile`` quantile of the genre's ``vote_count``.
    Only movies with at least ``m`` votes and a non-null ``vote_average`` are
    ranked.

    Parameters
    ----------
    genre : str
        Value compared for equality against the ``genre`` column.
    percentile : float, default 0.05
        Upper tail of the vote-count distribution used to derive ``m``
        (0.05 means ``m`` is the 95th percentile of ``vote_count``).
    movies : pandas.DataFrame, optional
        Frame providing the ``genre``, ``vote_average`` and ``vote_count``
        columns. When omitted, the notebook-level ``df`` is used.
    top_n : int, default 100
        Maximum number of rows returned.

    Returns
    -------
    pandas.DataFrame
        A new frame (the input is not modified) holding the qualifying rows
        plus a ``wr`` column, sorted by ``wr`` in descending order and cut to
        ``top_n`` rows. It is empty when no row matches ``genre``.
    """
    source = df if movies is None else movies
    genre_df = source[source['genre'] == genre]

    # mean() and quantile() skip NaN on their own, so no explicit null
    # filtering is needed to compute C and m.
    C = genre_df['vote_average'].mean()
    m = genre_df['vote_count'].quantile(1 - percentile)

    # A NaN vote_count compares False against m, so only vote_average needs
    # an explicit null check.
    eligible = (genre_df['vote_count'] >= m) & genre_df['vote_average'].notna()

    # Copy before adding a column so the assignment never targets a slice of
    # the caller's frame.
    qualified = genre_df[eligible].copy()

    # IMDB's weighted rating formula, evaluated column-wise. This also works
    # on an empty selection, where a row-wise apply would not return a Series.
    votes = qualified['vote_count']
    qualified['wr'] = (votes / (votes + m) * qualified['vote_average']) + (m / (m + votes) * C)

    return qualified.sort_values('wr', ascending=False).head(top_n)

In [15]:
# Show the ten highest-ranked movies of a particular genre (here: Romance).
qualified_by_genre(genre='Romance').head(10)

Unnamed: 0,adult,id,overview,popularity,production_companies,spoken_languages,title,vote_average,vote_count,genre,wr
351,False,13,A man with a low IQ has accomplished great thi...,48.307194,"[{'name': 'Paramount Pictures', 'id': 4}]","[{'iso_639_1': 'en', 'name': 'English'}]",Forrest Gump,8.2,8147.0,Romance,8.079811
10309,False,19404,"Raj is a rich, carefree, happy-go-lucky second...",34.457024,"[{'name': 'Yash Raj Films', 'id': 1569}]","[{'iso_639_1': 'hi', 'name': 'हिन्दी'}]",Dilwale Dulhania Le Jayenge,9.1,661.0,Romance,7.800406
40882,False,313369,"Mia, an aspiring actress, serves lattes to mov...",19.681686,"[{'name': 'Summit Entertainment', 'id': 491}, ...","[{'iso_639_1': 'en', 'name': 'English'}]",La La Land,7.9,4745.0,Romance,7.725728
22168,False,152601,"In the not so distant future, Theodore, a lone...",13.829515,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_639_1': 'en', 'name': 'English'}]",Her,7.9,4215.0,Romance,7.705843
40251,False,372058,High schoolers Mitsuha and Taki are complete s...,34.461252,"[{'name': 'CoMix Wave Films', 'id': 10198}]","[{'iso_639_1': 'ja', 'name': '日本語'}]",Your Name.,8.5,1030.0,Romance,7.705577
7208,False,38,"Joel Barish, heartbroken that his girlfriend u...",12.906327,"[{'name': 'Anonymous Content', 'id': 10039}, {...","[{'iso_639_1': 'en', 'name': 'English'}]",Eternal Sunshine of the Spotless Mind,7.9,3758.0,Romance,7.684657
4843,False,194,"At a tiny Parisian café, the adorable yet pain...",12.879381,"[{'name': 'France 3 Cinéma', 'id': 591}, {'nam...","[{'iso_639_1': 'fr', 'name': 'Français'}, {'is...",Amélie,7.8,3403.0,Romance,7.575931
24982,False,266856,The Theory of Everything is the extraordinary ...,11.85302,"[{'name': 'Working Title Films', 'id': 10163}]","[{'iso_639_1': 'la', 'name': 'Latin'}, {'iso_6...",The Theory of Everything,7.8,3403.0,Romance,7.575931
7834,False,11036,An epic love story centered around an older ma...,15.239013,"[{'name': 'New Line Cinema', 'id': 12}]","[{'iso_639_1': 'en', 'name': 'English'}]",The Notebook,7.7,3163.0,Romance,7.472931
4865,False,453,"At Princeton University, John Nash struggles t...",11.93646,"[{'name': 'Imagine Entertainment', 'id': 23}, ...","[{'iso_639_1': 'en', 'name': 'English'}]",A Beautiful Mind,7.7,3087.0,Romance,7.468025
