# Package Import

In [67]:
import pandas as pd
import functools as ft
import numpy as np
import matplotlib.pylab as plt
import string
import re
import spacy
import seaborn as sns

from pathlib import Path

from sklearn import preprocessing
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.neighbors import NearestNeighbors, KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier

import scikitplot as skplt

import dmba
from dmba import regressionSummary
from dmba import adjusted_r2_score, AIC_score, BIC_score
from dmba import backward_elimination, forward_selection, stepwise_selection
from dmba import plotDecisionTree, classificationSummary, regressionSummary
from dmba import gainsChart, liftChart

from ast import literal_eval


#import warnings
#warnings.filterwarnings('ignore')
#warnings.simplefilter(action='ignore', category=FutureWarning)

%matplotlib inline

# Reading the file

In [2]:
## Input file path where data folder is located
#FILEPATH = '../data/movie-set-data/'
FILEPATH = '/Users/sanjayregiphilip/OneDrive/DS/ADS505/Final/movie-set-data/'

## Load individual files
credits_df = pd.read_csv(f'{FILEPATH}credits.csv')
keywords_df = pd.read_csv(f'{FILEPATH}keywords.csv')
links_df = pd.read_csv(f'{FILEPATH}links.csv')
#links_small_df = pd.read_csv(f'{FILEPATH}links_small.csv')
movie_meta_df = pd.read_csv(f'{FILEPATH}movies_metadata.csv')
ratings_df = pd.read_csv(f'{FILEPATH}ratings.csv')
#ratings_small_df = pd.read_csv(f'{FILEPATH}ratings_small.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
movie_meta_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
# drop vote average and vote count
movie_meta_df=movie_meta_df.drop(columns=['vote_average', 'vote_count'])

## following not needed if vote count is removed
# dropping corrupted rows
#filtered_rows = movie_meta_df[movie_meta_df['vote_count'].isnull()].index
#movie_meta_df=movie_meta_df.drop(index=filtered_rows)

#movie_meta_df['id'] = movie_meta_df.id.astype('int')

In [5]:
# dropping adult rated movies
filtered_rows = movie_meta_df[movie_meta_df['adult']=='True'].index
movie_meta_df = movie_meta_df.drop(index=filtered_rows)

# dropping adult column no longer needed
movie_meta_df = movie_meta_df.drop(columns='adult')

In [6]:
# dropping movies that are not already Released
filtered_rows = movie_meta_df[movie_meta_df['status']!='Released'].index
movie_meta_df = movie_meta_df.drop(index=filtered_rows)

# dropping status column no longer needed with unreleased movies excluded
movie_meta_df = movie_meta_df.drop(columns='status')

In [7]:
movie_meta_df['video'].unique()

array([False, True], dtype=object)

In [8]:
# dropping movies that are not already Released
filtered_rows = movie_meta_df[movie_meta_df['video']==True].index
movie_meta_df = movie_meta_df.drop(index=filtered_rows)

In [9]:
# dropping video column no longer needed
movie_meta_df = movie_meta_df.drop(columns='video')

## Combine Dataframes

In [10]:
movie_meta_df['id']=movie_meta_df['id'].astype(int)
credits_df['id']=credits_df['id'].astype(int)
keywords_df['id']=keywords_df['id'].astype(int)


movies = pd.merge(movie_meta_df,credits_df, on='id', how='left')
movies = pd.merge(movies,keywords_df, on='id', how='left')


movies.head()

Unnamed: 0,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,production_countries,release_date,revenue,runtime,spoken_languages,tagline,title,cast,crew,keywords
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",,Toy Story,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Roll the dice and unleash the excitement!,Jumanji,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Friends are the people who let you be yourself...,Waiting to Exhale,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


## Get ratings

In [11]:
links_df['id'] = links_df['tmdbId']
movies = pd.merge(movies,links_df, on='id', how='left')

In [12]:
ratings_df.drop(columns=['userId', 'timestamp'], inplace=True)
ratings_df.head()

Unnamed: 0,movieId,rating
0,110,1.0
1,147,4.5
2,858,5.0
3,1221,5.0
4,1246,5.0


In [13]:
avg_ratings = round(ratings_df.groupby('movieId').mean(),2)

In [14]:
movies = pd.merge(movies,avg_ratings, on='movieId', how='left')

movies.head()

Unnamed: 0,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,spoken_languages,tagline,title,cast,crew,keywords,movieId,imdbId,tmdbId,rating
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,"[{'iso_639_1': 'en', 'name': 'English'}]",,Toy Story,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",1,114709,862.0,3.89
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Roll the dice and unleash the excitement!,Jumanji,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",2,113497,8844.0,3.24
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",3,113228,15602.0,3.18
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Friends are the people who let you be yourself...,Waiting to Exhale,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...",4,114885,31357.0,2.88
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",5,113041,11862.0,3.08


# Extract Features from JSON columns

In [15]:
features = ['cast', 'crew', 'genres', 'keywords']

movies = movies.dropna(subset=features)

for feature in features:
    movies[feature] = movies[feature].apply(literal_eval)

In [16]:
movies.head()

Unnamed: 0,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,...,spoken_languages,tagline,title,cast,crew,keywords,movieId,imdbId,tmdbId,rating
0,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.9469,...,"[{'iso_639_1': 'en', 'name': 'English'}]",,Toy Story,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...","[{'id': 931, 'name': 'jealousy'}, {'id': 4290,...",1,114709,862.0,3.89
1,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.0155,...,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Roll the dice and unleash the excitement!,Jumanji,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...","[{'id': 10090, 'name': 'board game'}, {'id': 1...",2,113497,8844.0,3.24
2,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...","[{'id': 1495, 'name': 'fishing'}, {'id': 12392...",3,113228,15602.0,3.18
3,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.85949,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Friends are the people who let you be yourself...,Waiting to Exhale,"[{'cast_id': 1, 'character': 'Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...","[{'id': 818, 'name': 'based on novel'}, {'id':...",4,114885,31357.0,2.88
4,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.38752,...,"[{'iso_639_1': 'en', 'name': 'English'}]",Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...","[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n...",5,113041,11862.0,3.08


In [17]:
## function to get name of director from the crew field
def get_director(x):
    for i in x:
        if i['job'] == 'Director':
            return i['name']
    return np.nan

# get director
movies['director'] = movies['crew'].apply(get_director)
movies = movies.drop(columns='crew')

In [18]:
# Returns the list of top 3 elements for genres and keywords
def get_list(x):
    if isinstance(x, list):
        names = [i['name'] for i in x]
        #Check if more than 3 elements exist. If yes, return only first three. If no, return entire list.
        if len(names) > 3:
            names = names[:3]
        return names

    #Return empty list in case of missing/malformed data
    return []

In [19]:
features = ['cast', 'keywords', 'genres']

for feature in features:
    movies[feature] = movies[feature].apply(get_list)

In [20]:
movies.columns

Index(['belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'tagline', 'title', 'cast', 'keywords', 'movieId',
       'imdbId', 'tmdbId', 'rating', 'director'],
      dtype='object')

In [21]:
movies = movies.drop(columns=['belongs_to_collection', # this looks like an id column to me so not necessary to add
                     'homepage', # weblink so not needed
                     'id', # simple ID column not helpful for predicting
                     'imdb_id', # again an ID column
                     'original_title', # just a name
                     'overview', # we already have the top keywords as a column
                     'poster_path', # 
                     'production_companies', # 
                     'production_countries', #
                     'spoken_languages', # we have original language as a column - high correlation
                     'tagline', # we have keywords
                     'title', # just a name    
                     'movieId' , 'imdbId', 'tmdbId']) # just more IDs     

In [22]:
movies.head()

Unnamed: 0,budget,genres,original_language,popularity,release_date,revenue,runtime,cast,keywords,rating,director
0,30000000,"[Animation, Comedy, Family]",en,21.9469,1995-10-30,373554033.0,81.0,"[Tom Hanks, Tim Allen, Don Rickles]","[jealousy, toy, boy]",3.89,John Lasseter
1,65000000,"[Adventure, Fantasy, Family]",en,17.0155,1995-12-15,262797249.0,104.0,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[board game, disappearance, based on children'...",3.24,Joe Johnston
2,0,"[Romance, Comedy]",en,11.7129,1995-12-22,0.0,101.0,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[fishing, best friend, duringcreditsstinger]",3.18,Howard Deutch
3,16000000,"[Comedy, Drama, Romance]",en,3.85949,1995-12-22,81452156.0,127.0,"[Whitney Houston, Angela Bassett, Loretta Devine]","[based on novel, interracial relationship, sin...",2.88,Forest Whitaker
4,0,[Comedy],en,8.38752,1995-02-10,76578911.0,106.0,"[Steve Martin, Diane Keaton, Martin Short]","[baby, midlife crisis, confidence]",3.08,Charles Shyer


## Missing Data

In [23]:
movies.isnull().sum()

budget                 0
genres                 0
original_language     10
popularity             0
release_date          74
revenue                0
runtime              257
cast                   0
keywords               0
rating               733
director             861
dtype: int64

In [24]:
movies = movies.dropna()
movies.isnull().sum()

budget               0
genres               0
original_language    0
popularity           0
release_date         0
revenue              0
runtime              0
cast                 0
keywords             0
rating               0
director             0
dtype: int64

In [25]:
movies.shape

(44518, 11)

## Encode Variables

In [26]:
genres_cat = movies.genres.str.join('|').str.get_dummies().add_prefix('genres_')

In [27]:
movies = pd.concat([movies, genres_cat], axis = 1)

#### These two features can be added as dummies but they take a lot of processing power

In [28]:
#cast_cat = movies.cast.str.join('|').str.get_dummies().add_prefix('cast_')
#cast_cat

In [29]:
#keywords_cat = movies.keywords.str.join('|').str.get_dummies().add_prefix('keywords_')
#keywords_cat

In [30]:
#movies = pd.get_dummies(movies, columns=['director'])
#movies

In [31]:
movies['Release_Year'] = movies['release_date'].str[:4]
movies['Release_Month'] = movies['release_date'].str[5:7]

In [32]:
movies = pd.get_dummies(movies, columns=['Release_Year', 'Release_Month', 'original_language'])
movies

Unnamed: 0,budget,genres,popularity,release_date,revenue,runtime,cast,keywords,rating,director,...,original_language_tl,original_language_tr,original_language_uk,original_language_ur,original_language_uz,original_language_vi,original_language_wo,original_language_xx,original_language_zh,original_language_zu
0,30000000,"[Animation, Comedy, Family]",21.9469,1995-10-30,373554033.0,81.0,"[Tom Hanks, Tim Allen, Don Rickles]","[jealousy, toy, boy]",3.89,John Lasseter,...,0,0,0,0,0,0,0,0,0,0
1,65000000,"[Adventure, Fantasy, Family]",17.0155,1995-12-15,262797249.0,104.0,"[Robin Williams, Jonathan Hyde, Kirsten Dunst]","[board game, disappearance, based on children'...",3.24,Joe Johnston,...,0,0,0,0,0,0,0,0,0,0
2,0,"[Romance, Comedy]",11.7129,1995-12-22,0.0,101.0,"[Walter Matthau, Jack Lemmon, Ann-Margret]","[fishing, best friend, duringcreditsstinger]",3.18,Howard Deutch,...,0,0,0,0,0,0,0,0,0,0
3,16000000,"[Comedy, Drama, Romance]",3.85949,1995-12-22,81452156.0,127.0,"[Whitney Houston, Angela Bassett, Loretta Devine]","[based on novel, interracial relationship, sin...",2.88,Forest Whitaker,...,0,0,0,0,0,0,0,0,0,0
4,0,[Comedy],8.38752,1995-02-10,76578911.0,106.0,"[Steve Martin, Diane Keaton, Martin Short]","[baby, midlife crisis, confidence]",3.08,Charles Shyer,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46329,0,[Science Fiction],0.661558,1995-01-01,0.0,85.0,"[Lisa Boyle, Kena Land, Zaneta Polard]",[],2.00,Aaron Osborne,...,0,0,0,0,0,0,0,0,0,0
46330,0,"[Drama, Action, Romance]",5.683753,1991-05-13,0.0,104.0,"[Patrick Bergin, Uma Thurman, David Morrissey]",[],4.00,John Irvin,...,0,0,0,0,0,0,0,0,0,0
46332,0,[Drama],0.178241,2011-11-17,0.0,360.0,"[Angel Aquino, Perry Dizon, Hazel Orencio]","[artist, play, pinoy]",5.00,Lav Diaz,...,1,0,0,0,0,0,0,0,0,0
46333,0,"[Action, Drama, Thriller]",0.903007,2003-08-01,0.0,90.0,"[Erika Eleniak, Adam Baldwin, Julie du Page]",[],1.00,Mark L. Lester,...,0,0,0,0,0,0,0,0,0,0


In [33]:
## remove variables that are not needed after encoding

movies = movies.drop(columns=['genres', 'release_date', 'cast', 'keywords', 'director'])

## Train Test Split

In [34]:
trainData, testData = train_test_split(movies, test_size=0.2, random_state=1)

In [35]:
trainData.reset_index(drop=True, inplace=True)
testData.reset_index(drop=True, inplace=True)

In [36]:
print(trainData.shape)
print(testData.shape)

(35614, 259)
(8904, 259)


## Scale Variables

In [37]:
scaler = MinMaxScaler()

to_be_scaled = ['budget', 'popularity', 'revenue', 'runtime']

In [38]:
# training scaler ONLY on training data

train_scaled = scaler.fit_transform(trainData[to_be_scaled])

train_scaled = pd.DataFrame(train_scaled)

train_scaled.columns = to_be_scaled

# add scaled data back to dataframe
trainData = trainData.drop(columns=to_be_scaled)
trainData = pd.concat([trainData, train_scaled], axis = 1)

trainData.head()

Unnamed: 0,rating,genres_Action,genres_Adventure,genres_Animation,genres_Comedy,genres_Crime,genres_Documentary,genres_Drama,genres_Family,genres_Fantasy,...,original_language_uz,original_language_vi,original_language_wo,original_language_xx,original_language_zh,original_language_zu,budget,popularity,revenue,runtime
0,2.5,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0.0,4e-06,0.0,0.06449
1,0.5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,2e-06,0.0,0.079618
2,2.0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.000208,0.0,0.075637
3,3.25,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5.3e-05,0.001045,0.0,0.027866
4,3.17,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.002123,0.0,0.0


In [39]:
# fitting scaler on test data

test_scaled = scaler.transform(testData[to_be_scaled])

test_scaled = pd.DataFrame(test_scaled)

test_scaled.columns = to_be_scaled

# add scaled data back to dataframe
testData = testData.drop(columns=to_be_scaled)
testData = pd.concat([testData, test_scaled], axis = 1)

testData.head()

Unnamed: 0,rating,genres_Action,genres_Adventure,genres_Animation,genres_Comedy,genres_Crime,genres_Documentary,genres_Drama,genres_Family,genres_Fantasy,...,original_language_uz,original_language_vi,original_language_wo,original_language_xx,original_language_zh,original_language_zu,budget,popularity,revenue,runtime
0,2.75,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0.0,0.000992,0.0,0.076433
1,3.4,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0.0,0.001789,0.0,0.071656
2,3.5,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0.0,0.001147,0.0,0.091561
3,2.42,1,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0.0,0.004349,0.0,0.121019
4,4.5,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0.0,0.011693,0.0,0.075637


## Model Building

In [75]:
classes = ['Low','High']

trainY =  pd.cut(trainData.rating,bins=[0,4, 5],labels= classes)
testY = pd.cut(testData.rating,bins=[0,4, 5],labels= classes)

trainX = trainData.drop(columns=['rating'])
testX = testData.drop(columns=['rating'])

### Logistic Regression

In [76]:
classes = ['High', 'Low'] ## I don't know why this is being flipped in the classification summary so reassigned

In [77]:
logit_movies = LogisticRegression(penalty="l2", solver='liblinear', class_weight = 'balanced')
logit_movies.fit(trainX, trainY)

classificationSummary(trainY, logit_movies.predict(trainX), class_names = classes)

Confusion Matrix (Accuracy 0.6353)

       Prediction
Actual  High   Low
  High   863   362
   Low 12626 21763


### Decision Trees

In [66]:
tree_movies = DecisionTreeClassifier(max_depth=7, min_samples_split=50, 
                                     min_impurity_decrease=0.01, random_state = 1, class_weight = 'balanced')
tree_movies.fit(trainX, trainY)

plotDecisionTree(tree_movies, feature_names=trainX.columns)
classificationSummary(trainY, tree_movies.predict(trainX), class_names = classes)

Confusion Matrix (Accuracy 0.4985)

       Prediction
Actual  High   Low
  High   915   310
   Low 17549 16840


### Random Forests

In [65]:
rf_movies = RandomForestClassifier(max_depth=3, random_state = 1, class_weight = 'balanced')

rf_movies.fit(trainX, trainY)

classificationSummary(trainY, rf_movies.predict(trainX), class_names = classes)

Confusion Matrix (Accuracy 0.6104)

       Prediction
Actual  High   Low
  High   856   369
   Low 13506 20883


### Stochastic Gradient Descent

In [69]:
sgd_movies = SGDClassifier(loss="hinge", penalty="l2", max_iter=5, class_weight = 'balanced')

sgd_movies.fit(trainX, trainY)

classificationSummary(trainY, sgd_movies.predict(trainX), class_names = classes)

Confusion Matrix (Accuracy 0.5615)

       Prediction
Actual  High   Low
  High   800   425
   Low 15192 19197




### Neural Network

In [None]:
#nn_movies = MLPClassifier(random_state = 1)

#nn_movies.fit(trainX, trainY)

#classificationSummary(trainY, clf.predict(trainX))