Preprocessing of data to create training and testing sets of data

In [2]:
import pandas as pd
import numpy as np
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD
from nltk.tokenize import word_tokenize
from nltk import everygrams
from nltk.stem.snowball import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import wordnet

import warnings; warnings.simplefilter('ignore')

In [3]:
#read in data from csv to dataframes

movies_metadata_df = pd.read_csv("TheMoviesDataset\movies_metadata.csv")
ratings_df = pd.read_csv("TheMoviesDataset/ratings_small.csv") #used / instead of \ due to '\r' representing carriage return in string casusing a error
credits_df = pd.read_csv("TheMoviesDataset/credits.csv")
keywords_df = pd.read_csv("TheMoviesDataset/keywords.csv")
links_small_df = pd.read_csv("TheMoviesDataset/links_small.csv")

In [4]:
print('movies: ', movies_metadata_df.shape)
print('credits: ', credits_df.shape)
print('keywords: ', keywords_df.shape)
print('ratings: ', ratings_df.shape)
print('links_small: ', links_small_df.shape)

movies:  (45466, 24)
credits:  (45476, 3)
keywords:  (46419, 2)
ratings:  (100004, 4)
links_small:  (9125, 3)


In [5]:
movies_metadata_df.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [6]:
credits_df.head(5)

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [7]:
keywords_df.head(5)

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [8]:
ratings_df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [9]:
links_small_df.head(5)

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [10]:
#rename 'id' to 'movieId' in movie_metadata, credits, and keywords to be consitant across data

movies_metadata_df = movies_metadata_df.rename(columns={'id': 'movieId'})
credits_df = credits_df.rename(columns={'id': 'movieId'})
keywords_df = keywords_df.rename(columns={'id': 'movieId'})

In [11]:
#incorrectly formated data found within dataset, easier to drop movies by index than fix

movies_metadata_df = movies_metadata_df.drop(35586)
movies_metadata_df = movies_metadata_df.drop(35587)
movies_metadata_df = movies_metadata_df.drop(29502)
movies_metadata_df = movies_metadata_df.drop(29503)
movies_metadata_df = movies_metadata_df.drop(19729)
movies_metadata_df = movies_metadata_df.drop(19730)

#reset index so that theres none missing
movies_metadata_df.reset_index(drop=True, inplace=True)

In [12]:
#change 'movieId' type to 'int'
links_small_df = links_small_df[links_small_df['tmdbId'].notnull()]['tmdbId'].astype('int')
movies_metadata_df['movieId'] = movies_metadata_df['movieId'].astype('int')
credits_df['movieId'] = credits_df['movieId'].astype('int')
keywords_df['movieId'] = keywords_df['movieId'].astype('int')

In [13]:
#reducing dataset to only ranked movies

movies_metadata_df = movies_metadata_df[movies_metadata_df['movieId'].isin(links_small_df)]
credits_df = credits_df[credits_df['movieId'].isin(links_small_df)]
keywords_df = keywords_df[keywords_df['movieId'].isin(links_small_df)]
ratings_df = ratings_df[ratings_df['movieId'].isin(links_small_df)]

print('movies: ', movies_metadata_df.shape)
print('credits: ', credits_df.shape)
print('keywords: ', keywords_df.shape)
print('ratings: ', ratings_df.shape)
print('links_small: ', links_small_df.shape)

movies:  (9099, 24)
credits:  (9099, 3)
keywords:  (9117, 2)
ratings:  (32146, 4)
links_small:  (9112,)


In [14]:
links_small_df.head(5)

0      862
1     8844
2    15602
3    31357
4    11862
Name: tmdbId, dtype: int32

In [1255]:
movies_metadata_df['genres'] = movies_metadata_df['genres'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
movies_metadata_df.head(5)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,movieId,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[Animation, Comedy, Family]",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[Adventure, Fantasy, Family]",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[Romance, Comedy]",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[Comedy, Drama, Romance]",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,[Comedy],,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [1256]:
#Transform keywords from [{'id': 0, 'name': 'x'}, {'id': 1, 'name': 'y'} to [x, y]

keywords_df['keywords'] = keywords_df['keywords'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
keywords_df.head(5)

Unnamed: 0,movieId,keywords
0,862,"[jealousy, toy, boy, friendship, friends, riva..."
1,8844,"[board game, disappearance, based on children'..."
2,15602,"[fishing, best friend, duringcreditsstinger, o..."
3,31357,"[based on novel, interracial relationship, sin..."
4,11862,"[baby, midlife crisis, confidence, aging, daug..."


In [1257]:
credits_df['cast'] = credits_df['cast'].fillna('[]').apply(literal_eval).apply(lambda x: [i['name'] for i in x] if isinstance(x, list) else [])
credits_df['cast'] = credits_df['cast'].apply(lambda x: x[:3] if len(x) >=3 else x)
credits_df['crew'] = credits_df['crew'].fillna('[]').apply(literal_eval)
credits_df.head(5)

Unnamed: 0,cast,crew,movieId
0,"[Tom Hanks, Tim Allen, Don Rickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[Whitney Houston, Angela Bassett, Loretta Devine]","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[Steve Martin, Diane Keaton, Martin Short]","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


In [1258]:
#getting director(s) from crew, if no director(s) return empty list

def get_directors(x):
    return [next((i['name'] for i in x if i['job'] in ['Director']), [])]

#getting writer(s) from crew, if no writer(s) return empty list

def get_writers(x):
    return [next((i['name'] for i in x if i['job'] in ['Screenplay', 'Writer']), [])]

#apply functions

credits_df['directors'] = credits_df['crew'].apply(get_directors)
credits_df['writers'] = credits_df['crew'].apply(get_writers)

credits_df.head(5)

Unnamed: 0,cast,crew,movieId,directors,writers
0,"[Tom Hanks, Tim Allen, Don Rickles]","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862,[John Lasseter],[Joss Whedon]
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844,[Joe Johnston],[Jonathan Hensleigh]
2,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602,[Howard Deutch],[Mark Steven Johnson]
3,"[Whitney Houston, Angela Bassett, Loretta Devine]","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357,[Forest Whitaker],[Ronald Bass]
4,"[Steve Martin, Diane Keaton, Martin Short]","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862,[Charles Shyer],[Nancy Meyers]


In [1259]:
credits_df = credits_df.drop('crew', axis=1)
credits_df.head(5)

Unnamed: 0,cast,movieId,directors,writers
0,"[Tom Hanks, Tim Allen, Don Rickles]",862,[John Lasseter],[Joss Whedon]
1,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]",8844,[Joe Johnston],[Jonathan Hensleigh]
2,"[Walter Matthau, Jack Lemmon, Ann-Margret]",15602,[Howard Deutch],[Mark Steven Johnson]
3,"[Whitney Houston, Angela Bassett, Loretta Devine]",31357,[Forest Whitaker],[Ronald Bass]
4,"[Steve Martin, Diane Keaton, Martin Short]",11862,[Charles Shyer],[Nancy Meyers]


In [1260]:
credits_df = credits_df.reindex(columns=['movieId','directors','writers','cast'])
credits_df.head(5)

Unnamed: 0,movieId,directors,writers,cast
0,862,[John Lasseter],[Joss Whedon],"[Tom Hanks, Tim Allen, Don Rickles]"
1,8844,[Joe Johnston],[Jonathan Hensleigh],"[Robin Williams, Jonathan Hyde, Kirsten Dunst]"
2,15602,[Howard Deutch],[Mark Steven Johnson],"[Walter Matthau, Jack Lemmon, Ann-Margret]"
3,31357,[Forest Whitaker],[Ronald Bass],"[Whitney Houston, Angela Bassett, Loretta Devine]"
4,11862,[Charles Shyer],[Nancy Meyers],"[Steve Martin, Diane Keaton, Martin Short]"


In [1261]:
movies_features_df = pd.merge(credits_df, movies_metadata_df[['movieId','genres']], on='movieId')
movies_features_df = pd.merge(movies_features_df, keywords_df[['movieId','keywords']], on='movieId')
movies_features_df.head(5)

Unnamed: 0,movieId,directors,writers,cast,genres,keywords
0,862,[John Lasseter],[Joss Whedon],"[Tom Hanks, Tim Allen, Don Rickles]","[Animation, Comedy, Family]","[jealousy, toy, boy, friendship, friends, riva..."
1,8844,[Joe Johnston],[Jonathan Hensleigh],"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[Adventure, Fantasy, Family]","[board game, disappearance, based on children'..."
2,15602,[Howard Deutch],[Mark Steven Johnson],"[Walter Matthau, Jack Lemmon, Ann-Margret]","[Romance, Comedy]","[fishing, best friend, duringcreditsstinger, o..."
3,31357,[Forest Whitaker],[Ronald Bass],"[Whitney Houston, Angela Bassett, Loretta Devine]","[Comedy, Drama, Romance]","[based on novel, interracial relationship, sin..."
4,11862,[Charles Shyer],[Nancy Meyers],"[Steve Martin, Diane Keaton, Martin Short]",[Comedy],"[baby, midlife crisis, confidence, aging, daug..."


In [1262]:
# function to convert all strings to lower case and strip names of spaces
def clean_data(x):
    cleaned_items = []
    for item in x:
        cleaned_item = str(item).lower().replace(' ', '')
        cleaned_items.append(cleaned_item)
    return cleaned_items

# list of features to clean
features = ['directors', 'writers', 'cast', 'genres']

# apply function to each feature using map() instead of apply()
movies_features_df[features] = movies_features_df[features].applymap(clean_data)

In [1263]:
s = movies_features_df.apply(lambda x: pd.Series(x['keywords']),axis=1).stack().reset_index(level=1, drop=True)
s.name = 'keyword'
s = s.value_counts()
s = s[s > 1]

stemmer = SnowballStemmer('english')
def filter_keywords(x):
    words = []
    for i in x:
        if i in s:
            words.append(i)
    return words

In [1268]:
movies_features_df['keywords'] = movies_features_df['keywords'].apply(filter_keywords)
movies_features_df['keywords'] = movies_features_df['keywords'].apply(lambda x: [stemmer.stem(i) for i in x])
movies_features_df['keywords'] = movies_features_df['keywords'].apply(lambda x: [str.lower(i.replace(" ", "")) for i in x])

In [1269]:
# merge the two DataFrames on the common column 'A'
movies_features_df = movies_features_df.join(movies_metadata_df.set_index('movieId')['title'], on='movieId')
movies_features_df = movies_features_df[['movieId', 'title','directors', 'writers', 'cast', 'genres', 'keywords']]
#movies_features_df.rename(columns={'original_title': 'title'}, inplace=True)
movies_features_df.head(5)

Unnamed: 0,movieId,title,directors,writers,cast,genres,keywords
0,862,Toy Story,[johnlasseter],[josswhedon],"[tomhanks, timallen, donrickles]","[animation, comedy, family]","[jealousi, toy, boy, friendship, friend, rival..."
1,8844,Jumanji,[joejohnston],[jonathanhensleigh],"[robinwilliams, jonathanhyde, kirstendunst]","[adventure, fantasy, family]","[boardgam, disappear, basedonchildren'sbook, n..."
2,15602,Grumpier Old Men,[howarddeutch],[markstevenjohnson],"[waltermatthau, jacklemmon, ann-margret]","[romance, comedy]","[fish, bestfriend, duringcreditssting]"
3,31357,Waiting to Exhale,[forestwhitaker],[ronaldbass],"[whitneyhouston, angelabassett, lorettadevine]","[comedy, drama, romance]","[basedonnovel, interracialrelationship, single..."
4,11862,Father of the Bride Part II,[charlesshyer],[nancymeyers],"[stevemartin, dianekeaton, martinshort]",[comedy],"[babi, midlifecrisi, confid, age, daughter, mo..."


In [1271]:
df = movies_features_df
df['soup'] = df['directors'] + df['writers'] + df['cast'] + df['genres'] + df['keywords']
df.head(5)

Unnamed: 0,movieId,title,directors,writers,cast,genres,keywords,soup
0,862,Toy Story,[johnlasseter],[josswhedon],"[tomhanks, timallen, donrickles]","[animation, comedy, family]","[jealousi, toy, boy, friendship, friend, rival...","[johnlasseter, josswhedon, tomhanks, timallen,..."
1,8844,Jumanji,[joejohnston],[jonathanhensleigh],"[robinwilliams, jonathanhyde, kirstendunst]","[adventure, fantasy, family]","[boardgam, disappear, basedonchildren'sbook, n...","[joejohnston, jonathanhensleigh, robinwilliams..."
2,15602,Grumpier Old Men,[howarddeutch],[markstevenjohnson],"[waltermatthau, jacklemmon, ann-margret]","[romance, comedy]","[fish, bestfriend, duringcreditssting]","[howarddeutch, markstevenjohnson, waltermattha..."
3,31357,Waiting to Exhale,[forestwhitaker],[ronaldbass],"[whitneyhouston, angelabassett, lorettadevine]","[comedy, drama, romance]","[basedonnovel, interracialrelationship, single...","[forestwhitaker, ronaldbass, whitneyhouston, a..."
4,11862,Father of the Bride Part II,[charlesshyer],[nancymeyers],"[stevemartin, dianekeaton, martinshort]",[comedy],"[babi, midlifecrisi, confid, age, daughter, mo...","[charlesshyer, nancymeyers, stevemartin, diane..."


In [1272]:
df =  pd.merge(df, movies_metadata_df[['movieId','vote_count']], on='movieId')
df =  pd.merge(df, movies_metadata_df[['movieId','vote_average']], on='movieId')

In [1273]:
count = CountVectorizer(analyzer='word',ngram_range=(1,2),min_df=0, stop_words='english')
df['soup_str'] = df['soup'].apply(lambda x: ' '.join(x))
count_matrix = count.fit_transform(df['soup_str'])

In [1274]:
count_matrix.shape

(10171, 110706)

In [1275]:
def get_recommendations(title):
    idx = indices[title]
    if type(idx) != np.int64:
        if len(idx)>1:
            print("ALERT: Multiple values")
            idx = idx[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:31]
    movie_indices = [i[0] for i in sim_scores]
    return titles.iloc[movie_indices]

cosine_sim = cosine_similarity(count_matrix, count_matrix)
df = df.reset_index()
titles = df['title']
indices = pd.Series(df.index, index=df['title'])

In [1276]:
get_recommendations('Avatar').head(10)

9297            Star Trek Into Darkness
1142                             Aliens
3384                 Dungeons & Dragons
9620                  Jupiter Ascending
1668                     Small Soldiers
7937               Dragonball Evolution
3228    Sinbad and the Eye of the Tiger
5414               Hercules in New York
6940      Left Behind III: World at War
1836         Return from Witch Mountain
Name: title, dtype: object

Adding IMDB weighted raitng

In [1286]:
def weighted_rating(x):
    v = x['vote_count']
    R = x['vote_average']
    return (v/(v+m) * R) + (m/(m+v) * C)

In [1287]:
def improved_recommendations(title):
    idx = indices[title]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:26]
    movie_indices = [i[0] for i in sim_scores]
    
    movies = df.iloc[movie_indices][['title', 'vote_count', 'vote_average']]
    vote_counts = movies[movies['vote_count'].notnull()]['vote_count'].astype('int')
    vote_averages = movies[movies['vote_average'].notnull()]['vote_average'].astype('int')
    C = vote_averages.mean()
    m = vote_counts.quantile(0.60)
    qualified = movies[(movies['vote_count'] >= m) & (movies['vote_count'].notnull()) & (movies['vote_average'].notnull())]
    qualified['vote_count'] = qualified['vote_count'].astype('int')
    qualified['vote_average'] = qualified['vote_average'].astype('int')
    qualified['wr'] = qualified.apply(weighted_rating, axis=1)
    qualified = qualified.sort_values('wr', ascending=False).head(10)
    return qualified

In [1292]:
improved_recommendations('Toy Story')

Unnamed: 0,title,vote_count,vote_average,wr
4113,"Monsters, Inc.",6150,7,6.726328
8413,Toy Story 3,4710,7,6.668281
2690,Toy Story 2,3914,7,6.624222
9491,The Lego Movie,3127,7,6.567417
7112,Cars,3991,6,5.971504
8188,Cloudy with a Chance of Meatballs,1799,6,5.955398
6746,Robots,1383,6,5.950038
3184,Chicken Run,1190,6,5.947089
7640,Horton Hears a Who!,927,6,5.942459
7150,Monster House,912,6,5.942171
