In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_style('white', { 'axes.spines.right': False, 'axes.spines.top': False})
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.decomposition import TruncatedSVD
from nltk.corpus import stopwords

## Load Data

In [2]:
path = '../MLdata/'

In [3]:
# movie metadata
df_meta=pd.read_csv(path + 'movies_metadata.csv', low_memory=False, encoding='UTF-8') 
df_meta = df_meta.drop([19730, 29503, 35587])

In [4]:
df_meta = df_meta.set_index(df_meta['id'].str.strip().replace(',','').astype(int))
pd.set_option('display.max_colwidth', 20)
df_meta.head()

Unnamed: 0_level_0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
862,False,"{'id': 10194, 'n...",30000000,"[{'id': 16, 'nam...",http://toystory....,862,tt0114709,en,Toy Story,"Led by Woody, An...",...,1995-10-30,373554033.0,81.0,[{'iso_639_1': '...,Released,,Toy Story,False,7.7,5415.0
8844,False,,65000000,"[{'id': 12, 'nam...",,8844,tt0113497,en,Jumanji,When siblings Ju...,...,1995-12-15,262797249.0,104.0,[{'iso_639_1': '...,Released,Roll the dice an...,Jumanji,False,6.9,2413.0
15602,False,"{'id': 119050, '...",0,"[{'id': 10749, '...",,15602,tt0113228,en,Grumpier Old Men,A family wedding...,...,1995-12-22,0.0,101.0,[{'iso_639_1': '...,Released,Still Yelling. S...,Grumpier Old Men,False,6.5,92.0
31357,False,,16000000,"[{'id': 35, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mist...",...,1995-12-22,81452156.0,127.0,[{'iso_639_1': '...,Released,Friends are the ...,Waiting to Exhale,False,6.1,34.0
11862,False,"{'id': 96871, 'n...",0,"[{'id': 35, 'nam...",,11862,tt0113041,en,Father of the Br...,Just when George...,...,1995-02-10,76578911.0,106.0,[{'iso_639_1': '...,Released,Just When His Wo...,Father of the Br...,False,5.7,173.0


In [5]:
# load movie credits
df_credits = pd.read_csv(path + 'credits.csv', encoding='UTF-8')
df_credits = df_credits.set_index('id')

In [6]:
# load movie keywords
df_keywords=pd.read_csv(path + 'keywords.csv', low_memory=False, encoding='UTF-8') 
df_keywords = df_keywords.set_index('id')

In [7]:
# merge
df_k_c = df_keywords.merge(df_credits, left_index=True, right_on='id')
df = df_k_c.merge(df_meta[['release_date','genres','overview','title']], left_index=True, right_on='id')
df.head(3)

Unnamed: 0_level_0,keywords,cast,crew,release_date,genres,overview,title
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
862,"[{'id': 931, 'na...","[{'cast_id': 14,...",[{'credit_id': '...,1995-10-30,"[{'id': 16, 'nam...","Led by Woody, An...",Toy Story
8844,"[{'id': 10090, '...","[{'cast_id': 1, ...",[{'credit_id': '...,1995-12-15,"[{'id': 12, 'nam...",When siblings Ju...,Jumanji
15602,"[{'id': 1495, 'n...","[{'cast_id': 2, ...",[{'credit_id': '...,1995-12-22,"[{'id': 10749, '...",A family wedding...,Grumpier Old Men


## Data Cleaning

### Smaller subset

In [8]:
links_small = pd.read_csv(path + 'links_small.csv', encoding='UTF-8')
links_small = links_small[links_small['tmdbId'].notnull()]['tmdbId'].astype('int')

In [9]:
df = df[df.index.isin(links_small)]

### Bag-of-words Model

In [10]:
df_movies = pd.DataFrame()

In [11]:
df_movies['keywords'] = df['keywords'].apply(lambda x: [i['name'] for i in eval(x)])
df_movies['keywords'] = df_movies['keywords'].apply(lambda x: ' '.join([i.replace(" ", "") for i in x]))

In [12]:
df_movies['overview'] = df['overview'].fillna('')

In [13]:
df_movies['release_date'] = pd.to_datetime(df['release_date'], errors='coerce').apply(lambda x: str(x).split('-')[0] if x!= np.nan else np.nan)

In [14]:
df_movies['cast'] = df['cast'].apply(lambda x:[i['name'] for i in eval(x)])
df_movies['cast'] = df_movies['cast'].apply(lambda x: ' '.join([i.replace(" ", "") for i in x]))

In [15]:
df_movies['genres'] = df['genres'].apply(lambda x: [i['name'] for i in eval(x)])
df_movies['genres'] = df_movies['genres'].apply(lambda x: ' '.join([i.replace(" ", "") for i in x]))

In [16]:
df_movies['title'] = df['title']

In [17]:
# merge all fields into 'tag' column
df_movies['tags'] = df_movies['keywords'] + ' ' + df_movies['cast']+' '+df_movies['genres']+' '+df_movies['release_date']

In [18]:
df_movies.drop(df_movies[df_movies['tags']==''].index, inplace=True)
df_movies.drop_duplicates(inplace=True)

In [19]:
df_movies['new_id'] = range(0, len(df_movies))

In [20]:
df_movies = df_movies[['new_id', 'title', 'tags']]

In [21]:
pd.set_option('display.max_colwidth', 500)
pd.set_option('display.expand_frame_repr', False)

In [22]:
df_movies

Unnamed: 0_level_0,new_id,title,tags
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
862,0,Toy Story,jealousy toy boy friendship friends rivalry boynextdoor newtoy toycomestolife TomHanks TimAllen DonRickles JimVarney WallaceShawn JohnRatzenberger AnniePotts JohnMorris ErikvonDetten LaurieMetcalf R.LeeErmey SarahFreeman PennJillette Animation Comedy Family 1995
8844,1,Jumanji,boardgame disappearance basedonchildren'sbook newhome recluse giantinsect RobinWilliams JonathanHyde KirstenDunst BradleyPierce BonnieHunt BebeNeuwirth DavidAlanGrier PatriciaClarkson AdamHann-Byrd LauraBellBundy JamesHandy GillianBarber BrandonObray CyrusThiedeke GaryJosephThorup LeonardZola LloydBerry MalcolmStewart AnnabelKershaw DarrylHenriques RobynDriscoll PeterBryant SarahGilson FloricaVlad JuneLion BrendaLockmuller Adventure Fantasy Family 1995
15602,2,Grumpier Old Men,fishing bestfriend duringcreditsstinger oldmen WalterMatthau JackLemmon Ann-Margret SophiaLoren DarylHannah BurgessMeredith KevinPollak Romance Comedy 1995
31357,3,Waiting to Exhale,basedonnovel interracialrelationship singlemother divorce chickflick WhitneyHouston AngelaBassett LorettaDevine LelaRochon GregoryHines DennisHaysbert MichaelBeach MykeltiWilliamson LamontJohnson WesleySnipes Comedy Drama Romance 1995
11862,4,Father of the Bride Part II,baby midlifecrisis confidence aging daughter motherdaughterrelationship pregnancy contraception gynecologist SteveMartin DianeKeaton MartinShort KimberlyWilliams-Paisley GeorgeNewbern KieranCulkin BDWong PeterMichaelGoetz KateMcGregor-Stewart JaneAdams EugeneLevy LoriAlan Comedy 1995
...,...,...,...
159550,9077,The Last Brickmaker in America,friendship brickmaking SidneyPoitier WendyCrewson JayO.Sanders MaryAlice BernieCasey CodyNewton PiperLaurie Drama 2001
392572,9078,Rustom,bollywood AkshayKumar IleanaD'Cruz EshaGupta ArjanBajwa UshaNadkarni SachinKhedekar KumudMishra AnangDesai ParmeetSethi IndraneelBhattacharya KanwaljitSingh BrijendraKala GireeshSahedev NagrajManjule Thriller Romance 2016
402672,9079,Mohenjo Daro,bollywood HrithikRoshan PoojaHegde KabirBedi ArunodaySingh KishoriShahane CaseyFrank ManishChoudhary NarendraJha NitishBharadwaj DigantaHazarika SharadKelkar SuhasiniMulay Adventure Drama History Romance 2016
315011,9080,Shin Godzilla,monster godzilla giantmonster destruction kaiju toyko HirokiHasegawa YutakaTakenouchi SatomiIshihara KengoKora MatsuoSatoru MikakoIchikawa IsseiTakahashi KanjiTsuda ShinyaTsukamoto ToruNomaguchi DaisukeKuroda RenOsugi KimikoYo AkiraEmoto SeiHiraizumi ToruTezuka KenichiYajima AkiraHamada IkujiNakamura TetsuWatanabe JunKunimura ShingoTsurumi JunHashimoto PierreTaki TakumiSaito KREVA KenMitsuishi KyusakuShimada TaroSuwa YûKamio ArataFuruta MoroMorooka KôseiKatô ShoheiAbe KeisukeKoide HairiKatag...


In [23]:
df_movies.to_csv('movies.csv')

### Vectorization

In [24]:
stop = list(stopwords.words('english'))

# create the tfid vectorizer, alternatively you can also use countVectorizer
tfidf =  TfidfVectorizer(max_features=5000, analyzer = 'word', stop_words=stop)
vectorized_data = tfidf.fit_transform(df_movies['tags'])
count_matrix = pd.DataFrame(vectorized_data.toarray(), index=df_movies['tags'].index.tolist())

In [25]:
count_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4990,4991,4992,4993,4994,4995,4996,4997,4998,4999
862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8844,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15602,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
31357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
159550,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
392572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
402672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
315011,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
print(tfidf.get_feature_names_out()[940:990])

['code' 'codycameron' 'coffin' 'cohen' 'coldwar' 'cole' 'colehauser'
 'coleman' 'colinfarrell' 'colinfirth' 'colinhanks' 'colinkenny'
 'colinquinn' 'colinsalmon' 'colinstinton' 'colleencamp' 'colleendewhurst'
 'college' 'collegestudent' 'colmfeore' 'colmmeaney' 'coma' 'combat'
 'comedian' 'comedy' 'comet' 'comingofage' 'comingout' 'common'
 'communism' 'communist' 'company' 'competition' 'composer' 'computer'
 'computervirus' 'con' 'concentrationcamp' 'concert' 'conchataferrell'
 'confession' 'confidence' 'congress' 'conman' 'connell' 'conniebritton'
 'connienielsen' 'connieray' 'connor' 'conor']


## Dimensionality Reduction

In [27]:
svd = TruncatedSVD(n_components=3000)
reduced_data = svd.fit_transform(count_matrix)

## Text Similarity with Cosine Similarity Scores

In [28]:
similarity = cosine_similarity(reduced_data)

In [29]:
np.save('movies_similarity', similarity)

## Generate Recommendation

In [2]:
similarity = np.load('movies_similarity.npy')

In [3]:
df_movies = pd.read_csv('movies.csv')
df_movies.head()

Unnamed: 0,id,new_id,title,tags
0,862,0,Toy Story,jealousy toy boy friendship friends rivalry bo...
1,8844,1,Jumanji,boardgame disappearance basedonchildren'sbook ...
2,15602,2,Grumpier Old Men,fishing bestfriend duringcreditsstinger oldmen...
3,31357,3,Waiting to Exhale,basedonnovel interracialrelationship singlemot...
4,11862,4,Father of the Bride Part II,baby midlifecrisis confidence aging daughter m...


In [4]:
# create a function that takes in movie title as input and returns a list of the most similar movies
def get_recommendations(title, n, cosine_sim=similarity):
    
    # get the index of the movie that matches the title
    movie_index = df_movies[df_movies.title==title].new_id.values[0]
    
    # get the pairwsie similarity scores of all movies with that movie and sort the movies based on the similarity scores
    sim_scores_all = sorted(list(enumerate(cosine_sim[movie_index])), key=lambda x: x[1], reverse=True)
    
    # checks if recommendations are limited
    if n > 0:
        sim_scores_all = sim_scores_all[1:n+1]
        
    # get the movie indices of the top similar movies
    movie_indices = [i[0] for i in sim_scores_all]
    scores = [i[1] for i in sim_scores_all]
    
    # return the top n most similar movies from the movies df
    top_titles_df = pd.DataFrame(df_movies.iloc[movie_indices]['title'])
    top_titles_df['sim_scores'] = scores
    top_titles_df['ranking'] = range(1, len(top_titles_df) + 1)
    
    return top_titles_df, sim_scores_all


In [5]:
# # generate a list of recommendations for a specific movie title
# movie_name = 'The Matrix'
# number_of_recommendations = 15
# top_titles_df, _ = get_recommendations(movie_name, number_of_recommendations)

In [5]:
# list of movies a user has seen
movie_list = ['The Lion King', 'Se7en', 'Blade Runner', 'Quantum of Solace', 'Casino Royale', 'Skyfall']

# create a copy of the movie dataframe and add a column in which we aggregated the scores
user_scores = pd.DataFrame(df_movies['title'])
user_scores['sim_scores'] = 0.0

# top number of scores to be considered for each movie
number_of_recommendations = 10000
for movie_name in movie_list:
    try:
        top_titles_df, _ = get_recommendations(movie_name, number_of_recommendations)
    except:
        pass
    # aggregate the scores
    user_scores = pd.concat([user_scores, top_titles_df[['title', 'sim_scores']]]).groupby(['title'], as_index=False).sum({'sim_scores'})

In [6]:
user_scores = user_scores[~user_scores['title'].isin(movie_list)]
user_scores.sort_values(by='sim_scores', ascending=False)[:20]

Unnamed: 0,title,sim_scores
5831,Spectre,1.067911
8163,Total Recall,0.63391
3403,Johnny English,0.586215
5555,Shaft,0.577136
3533,King Solomon's Mines,0.562624
6084,Surviving the Game,0.548778
7818,The Three Musketeers,0.530187
4172,Mirage,0.526142
358,Alice in Wonderland,0.488152
2168,Everything or Nothing,0.469017


### Merge with Netflix Data [GET NON-NETFLIX MOVIES TOO]

In [7]:
netflix = pd.read_csv('../Netflix-Data/titles.csv')

In [8]:
netflix = netflix[['title', 'type', 'imdb_score', 'imdb_votes', 'tmdb_popularity', 'tmdb_score']]
netflix = netflix[netflix['type'] == 'MOVIE']

In [9]:
recs = netflix.merge(user_scores, how='inner', on='title')
recs = recs.sort_values(by='sim_scores', ascending=False)[:24]

In [10]:
recs

Unnamed: 0,title,type,imdb_score,imdb_votes,tmdb_popularity,tmdb_score,sim_scores
187,Total Recall,MOVIE,6.2,253352.0,51.651,5.967,0.63391
277,Mirage,MOVIE,7.4,54188.0,19.839,7.5,0.526142
296,Nothing to Lose,MOVIE,2.3,23031.0,45.267,5.5,0.461521
85,Wanted,MOVIE,6.7,384995.0,33.101,6.5,0.40677
304,Brother,MOVIE,6.7,1066.0,7.102,6.6,0.387138
65,Spawn,MOVIE,5.2,68184.0,16.989,5.343,0.360395
211,Oldboy,MOVIE,5.7,75728.0,20.164,5.9,0.355107
29,Mission: Impossible,MOVIE,7.1,420673.0,41.115,6.9,0.276662
66,Croupier,MOVIE,7.0,21599.0,8.381,6.8,0.260625
61,Johnny Mnemonic,MOVIE,5.6,71563.0,10.75,5.7,0.256105


In [None]:
def weighted_rating(x, m, C):
    v = x['imdb_votes']
    R = x['imdb_score']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [None]:
recs = recs[['title', 'imdb_score', 'imdb_votes']]
vote_counts = recs[recs['imdb_votes'].notnull()]['imdb_votes'].astype('int')
vote_averages = recs[recs['imdb_score'].notnull()]['imdb_score'].astype('int')
C = vote_averages.mean()
m = vote_counts.quantile(0.60)
qualified = recs[(recs['imdb_votes'] >= m) & (recs['imdb_votes'].notnull())
                & (recs['imdb_score'].notnull())]
qualified['imdb_votes'] = qualified['imdb_votes'].astype('int')
qualified['imdb_score'] = qualified['imdb_score'].astype('int')
qualified['wr'] = qualified.apply(weighted_rating, args=(m, C), axis=1)
qualified = qualified.sort_values('wr', ascending=False).head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['imdb_votes'] = qualified['imdb_votes'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['imdb_score'] = qualified['imdb_score'].astype('int')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  qualified['wr'] = qualified.apply(weighted_rating, args=(m, C), axis=1)


In [None]:
qualified

Unnamed: 0,title,imdb_score,imdb_votes,wr
0,Taxi Driver,8,808582,7.823347
216,Blue Jasmine,7,201749,6.794792
213,Begin Again,7,155249,6.757205
226,The Butler,7,114768,6.711146
224,Philomena,7,99647,6.689116
251,Home,6,101975,6.184366
227,Cloudy with a Chance of Meatballs 2,6,102071,6.184277
219,The Giver,6,118877,6.170038
222,The Call,6,121843,6.167751
215,Olympus Has Fallen,6,274270,6.099179


## Get non-Netflix movies

In [None]:
not_netflix_recs = user_scores[~user_scores['title'].isin(netflix['title'])]
not_netflix_recs = not_netflix_recs.sort_values(by='sim_scores', ascending=False)

In [None]:
not_netflix_recs[:20]

Unnamed: 0,title,sim_scores
7299,The Matrix Reloaded,3.567983
7300,The Matrix Revolutions,3.31344
2599,Ghost in the Shell,1.697667
6300,The Animatrix,1.473375
6211,Terminator 3: Rise of the Machines,1.415669
3513,Kill Command,1.358681
3972,Man of Tai Chi,1.349463
6088,Suspect Zero,1.275179
559,Appleseed,1.216043
4091,Memento,1.204914
