In [64]:
import numpy as np
import seaborn as sns
import pandas as pd
import scipy.stats
import matplotlib.pyplot as plt
import math
from sklearn.model_selection import train_test_split

In [2]:
movies = pd.read_csv("movie_metadata.csv")

In [3]:
movies

Unnamed: 0,color,director_name,num_critic_for_reviews,duration,director_facebook_likes,actor_3_facebook_likes,actor_2_name,actor_1_facebook_likes,gross,genres,...,num_user_for_reviews,language,country,content_rating,budget,title_year,actor_2_facebook_likes,imdb_score,aspect_ratio,movie_facebook_likes
0,Color,James Cameron,723.0,178.0,0.0,855.0,Joel David Moore,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,...,3054.0,English,USA,PG-13,237000000.0,2009.0,936.0,7.9,1.78,33000
1,Color,Gore Verbinski,302.0,169.0,563.0,1000.0,Orlando Bloom,40000.0,309404152.0,Action|Adventure|Fantasy,...,1238.0,English,USA,PG-13,300000000.0,2007.0,5000.0,7.1,2.35,0
2,Color,Sam Mendes,602.0,148.0,0.0,161.0,Rory Kinnear,11000.0,200074175.0,Action|Adventure|Thriller,...,994.0,English,UK,PG-13,245000000.0,2015.0,393.0,6.8,2.35,85000
3,Color,Christopher Nolan,813.0,164.0,22000.0,23000.0,Christian Bale,27000.0,448130642.0,Action|Thriller,...,2701.0,English,USA,PG-13,250000000.0,2012.0,23000.0,8.5,2.35,164000
4,,Doug Walker,,,131.0,,Rob Walker,131.0,,Documentary,...,,,,,,,12.0,7.1,,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5038,Color,Scott Smith,1.0,87.0,2.0,318.0,Daphne Zuniga,637.0,,Comedy|Drama,...,6.0,English,Canada,,,2013.0,470.0,7.7,,84
5039,Color,,43.0,43.0,,319.0,Valorie Curry,841.0,,Crime|Drama|Mystery|Thriller,...,359.0,English,USA,TV-14,,,593.0,7.5,16.00,32000
5040,Color,Benjamin Roberds,13.0,76.0,0.0,0.0,Maxwell Moody,0.0,,Drama|Horror|Thriller,...,3.0,English,USA,,1400.0,2013.0,0.0,6.3,,16
5041,Color,Daniel Hsia,14.0,100.0,0.0,489.0,Daniel Henney,946.0,10443.0,Comedy|Drama|Romance,...,9.0,English,USA,PG-13,,2012.0,719.0,6.3,2.35,660


# data cleaning and processing

## dropping nas

In [4]:
movies.dropna(inplace = True)

### consequences of dropping nas <-- LYDIA TO DO

What values are we losing when we drop nas? is this causing the dataset to be more imabalanced or less imbalanced? some visualizations like histograms may be helpful here like 

## identifying relevant columns to keep for analysis

In [5]:
string_columns = movies.select_dtypes(exclude=[np.number])
string_columns.columns

Index(['color', 'director_name', 'actor_2_name', 'genres', 'actor_1_name',
       'movie_title', 'actor_3_name', 'plot_keywords', 'movie_imdb_link',
       'language', 'country', 'content_rating'],
      dtype='object')

In [6]:
movies.columns

## not really sure what num_critic_for_review means so I'll be dropping it 

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')

In [7]:
# Dropping most string columns & other less relevant columns

movies_relevant_columns = movies.drop(['color', 'director_name', 'actor_2_name', 'actor_1_name',
       'movie_title', 'actor_3_name', 'movie_imdb_link',
       'language', 'country', 'content_rating', 'num_critic_for_reviews', 'facenumber_in_poster', 'aspect_ratio'], axis = 1)

In [8]:
movies_relevant_columns.columns

Index(['duration', 'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_1_facebook_likes', 'gross', 'genres', 'num_voted_users',
       'cast_total_facebook_likes', 'plot_keywords', 'num_user_for_reviews',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'movie_facebook_likes'],
      dtype='object')

In [9]:
movies = movies_relevant_columns

# model generation

## model with no genre or plot features (numeric columns only)

### data processing

In [10]:
numeric_columns_df = movies.drop(["genres", "plot_keywords"], axis = 1)

In [11]:
numeric_columns_df

Unnamed: 0,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,movie_facebook_likes
0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,3054.0,237000000.0,2009.0,936.0,7.9,33000
1,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,1238.0,300000000.0,2007.0,5000.0,7.1,0
2,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,994.0,245000000.0,2015.0,393.0,6.8,85000
3,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,2701.0,250000000.0,2012.0,23000.0,8.5,164000
5,132.0,475.0,530.0,640.0,73058679.0,212204,1873,738.0,263700000.0,2012.0,632.0,6.6,24000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5026,110.0,107.0,45.0,576.0,136007.0,3924,776,39.0,4500.0,2004.0,133.0,6.9,171
5027,90.0,397.0,0.0,5.0,673780.0,4555,5,26.0,10000.0,2000.0,0.0,7.5,697
5033,77.0,291.0,8.0,291.0,424760.0,72639,368,371.0,7000.0,2004.0,45.0,7.0,19000
5035,81.0,0.0,6.0,121.0,2040920.0,52055,147,130.0,7000.0,1992.0,20.0,6.9,0


### splitting into training and testing

In [67]:
X = numeric_columns_df

In [71]:
numeric_columns_df

Unnamed: 0,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,movie_facebook_likes
0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,3054.0,237000000.0,2009.0,936.0,7.9,33000
1,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,1238.0,300000000.0,2007.0,5000.0,7.1,0
2,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,994.0,245000000.0,2015.0,393.0,6.8,85000
3,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,2701.0,250000000.0,2012.0,23000.0,8.5,164000
5,132.0,475.0,530.0,640.0,73058679.0,212204,1873,738.0,263700000.0,2012.0,632.0,6.6,24000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5026,110.0,107.0,45.0,576.0,136007.0,3924,776,39.0,4500.0,2004.0,133.0,6.9,171
5027,90.0,397.0,0.0,5.0,673780.0,4555,5,26.0,10000.0,2000.0,0.0,7.5,697
5033,77.0,291.0,8.0,291.0,424760.0,72639,368,371.0,7000.0,2004.0,45.0,7.0,19000
5035,81.0,0.0,6.0,121.0,2040920.0,52055,147,130.0,7000.0,1992.0,20.0,6.9,0


In [75]:
np.random.seed(42)

#X = .drop("imdb_score", axis = 1)
#y = movies.imdb_score

In [77]:
y = movies.imdb_score

0       7.9
1       7.1
2       6.8
3       8.5
5       6.6
       ... 
5026    6.9
5027    7.5
5033    7.0
5035    6.9
5042    6.6
Name: imdb_score, Length: 3756, dtype: float64

In [80]:
X = numeric_columns_df

In [82]:
X

0       7.9
1       7.1
2       6.8
3       8.5
5       6.6
       ... 
5026    6.9
5027    7.5
5033    7.0
5035    6.9
5042    6.6
Name: imdb_score, Length: 3756, dtype: float64

In [84]:
X = X.drop("imdb_score", axis = 1)

KeyError: "['imdb_score'] not found in axis"

In [86]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

### standardizing columns

In [88]:
from sklearn.preprocessing import StandardScaler

In [90]:
scaler = StandardScaler()

# fit scaler on training data
norm = scaler.fit(X_train)

# transform training data
X_train_norm = pd.DataFrame(scaler.transform(X_train), columns = X_train.columns)

# transform testing dataabs
X_test_norm = pd.DataFrame(scaler.transform(X_test), columns = X_test.columns)

## model with genre one hot encoded (no plot col)

### data processing

In [12]:
genre_one_hot_df = movies.drop(["plot_keywords"], axis = 1)

In [13]:
genres = movies.genres

In [15]:
genres_lists = [x.split("|") for x in genres]

In [16]:
genres_lists

[['Action', 'Adventure', 'Fantasy', 'Sci-Fi'],
 ['Action', 'Adventure', 'Fantasy'],
 ['Action', 'Adventure', 'Thriller'],
 ['Action', 'Thriller'],
 ['Action', 'Adventure', 'Sci-Fi'],
 ['Action', 'Adventure', 'Romance'],
 ['Adventure',
  'Animation',
  'Comedy',
  'Family',
  'Fantasy',
  'Musical',
  'Romance'],
 ['Action', 'Adventure', 'Sci-Fi'],
 ['Adventure', 'Family', 'Fantasy', 'Mystery'],
 ['Action', 'Adventure', 'Sci-Fi'],
 ['Action', 'Adventure', 'Sci-Fi'],
 ['Action', 'Adventure'],
 ['Action', 'Adventure', 'Fantasy'],
 ['Action', 'Adventure', 'Western'],
 ['Action', 'Adventure', 'Fantasy', 'Sci-Fi'],
 ['Action', 'Adventure', 'Family', 'Fantasy'],
 ['Action', 'Adventure', 'Sci-Fi'],
 ['Action', 'Adventure', 'Fantasy'],
 ['Action', 'Adventure', 'Comedy', 'Family', 'Fantasy', 'Sci-Fi'],
 ['Adventure', 'Fantasy'],
 ['Action', 'Adventure', 'Fantasy'],
 ['Action', 'Adventure', 'Drama', 'History'],
 ['Adventure', 'Fantasy'],
 ['Adventure', 'Family', 'Fantasy'],
 ['Action', 'Adventure

In [17]:
genre_one_hot_df["genres"] = genres_lists

In [18]:
genre_one_hot_df

Unnamed: 0,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,genres,num_voted_users,cast_total_facebook_likes,num_user_for_reviews,budget,title_year,actor_2_facebook_likes,imdb_score,movie_facebook_likes
0,178.0,0.0,855.0,1000.0,760505847.0,"[Action, Adventure, Fantasy, Sci-Fi]",886204,4834,3054.0,237000000.0,2009.0,936.0,7.9,33000
1,169.0,563.0,1000.0,40000.0,309404152.0,"[Action, Adventure, Fantasy]",471220,48350,1238.0,300000000.0,2007.0,5000.0,7.1,0
2,148.0,0.0,161.0,11000.0,200074175.0,"[Action, Adventure, Thriller]",275868,11700,994.0,245000000.0,2015.0,393.0,6.8,85000
3,164.0,22000.0,23000.0,27000.0,448130642.0,"[Action, Thriller]",1144337,106759,2701.0,250000000.0,2012.0,23000.0,8.5,164000
5,132.0,475.0,530.0,640.0,73058679.0,"[Action, Adventure, Sci-Fi]",212204,1873,738.0,263700000.0,2012.0,632.0,6.6,24000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5026,110.0,107.0,45.0,576.0,136007.0,"[Drama, Music, Romance]",3924,776,39.0,4500.0,2004.0,133.0,6.9,171
5027,90.0,397.0,0.0,5.0,673780.0,[Drama],4555,5,26.0,10000.0,2000.0,0.0,7.5,697
5033,77.0,291.0,8.0,291.0,424760.0,"[Drama, Sci-Fi, Thriller]",72639,368,371.0,7000.0,2004.0,45.0,7.0,19000
5035,81.0,0.0,6.0,121.0,2040920.0,"[Action, Crime, Drama, Romance, Thriller]",52055,147,130.0,7000.0,1992.0,20.0,6.9,0


In [20]:
from sklearn.preprocessing import MultiLabelBinarizer

mlb = MultiLabelBinarizer(sparse_output=True)

In [21]:
genre_one_hot_df = genre_one_hot_df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(genre_one_hot_df.pop('genres')),
                index=movies.index,
                columns=mlb.classes_))

In [22]:
genre_one_hot_df

Unnamed: 0,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,num_user_for_reviews,budget,title_year,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,3054.0,237000000.0,2009.0,...,0,0,0,0,0,1,0,0,0,0
1,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,1238.0,300000000.0,2007.0,...,0,0,0,0,0,0,0,0,0,0
2,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,994.0,245000000.0,2015.0,...,0,0,0,0,0,0,0,1,0,0
3,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,2701.0,250000000.0,2012.0,...,0,0,0,0,0,0,0,1,0,0
5,132.0,475.0,530.0,640.0,73058679.0,212204,1873,738.0,263700000.0,2012.0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5026,110.0,107.0,45.0,576.0,136007.0,3924,776,39.0,4500.0,2004.0,...,0,1,0,0,1,0,0,0,0,0
5027,90.0,397.0,0.0,5.0,673780.0,4555,5,26.0,10000.0,2000.0,...,0,0,0,0,0,0,0,0,0,0
5033,77.0,291.0,8.0,291.0,424760.0,72639,368,371.0,7000.0,2004.0,...,0,0,0,0,0,1,0,1,0,0
5035,81.0,0.0,6.0,121.0,2040920.0,52055,147,130.0,7000.0,1992.0,...,0,0,0,0,1,0,0,1,0,0


### splitting into testing & training & standardizing

## model with plot one hot encoded (no genre col)

### data processing

In [23]:
plot_one_hot_df = movies.drop(["genres"], axis = 1)

In [24]:
plots = movies.plot_keywords

In [25]:
plots_lists = [x.split("|") for x in plots]

In [29]:
plot_one_hot_df["plot_keywords"] = plots_lists

In [30]:
mlb = MultiLabelBinarizer(sparse_output=True)

In [31]:
plot_one_hot_df = plot_one_hot_df.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(plot_one_hot_df.pop('plot_keywords')),
                index=plot_one_hot_df.index,
                columns=mlb.classes_))

In [34]:
plot_one_hot_df

Unnamed: 0,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,num_user_for_reviews,budget,title_year,...,zeus,zodiac,zodiac killer,zoloft,zombie,zombie apocalypse,zombie spoof,zoo,zoologist,zorro
0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,3054.0,237000000.0,2009.0,...,0,0,0,0,0,0,0,0,0,0
1,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,1238.0,300000000.0,2007.0,...,0,0,0,0,0,0,0,0,0,0
2,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,994.0,245000000.0,2015.0,...,0,0,0,0,0,0,0,0,0,0
3,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,2701.0,250000000.0,2012.0,...,0,0,0,0,0,0,0,0,0,0
5,132.0,475.0,530.0,640.0,73058679.0,212204,1873,738.0,263700000.0,2012.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5026,110.0,107.0,45.0,576.0,136007.0,3924,776,39.0,4500.0,2004.0,...,0,0,0,0,0,0,0,0,0,0
5027,90.0,397.0,0.0,5.0,673780.0,4555,5,26.0,10000.0,2000.0,...,0,0,0,0,0,0,0,0,0,0
5033,77.0,291.0,8.0,291.0,424760.0,72639,368,371.0,7000.0,2004.0,...,0,0,0,0,0,0,0,0,0,0
5035,81.0,0.0,6.0,121.0,2040920.0,52055,147,130.0,7000.0,1992.0,...,0,0,0,0,0,0,0,0,0,0


### splitting into testing & training & standardizing

## model w/ only genre tags

In [37]:
movies.columns

Index(['duration', 'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_1_facebook_likes', 'gross', 'genres', 'num_voted_users',
       'cast_total_facebook_likes', 'plot_keywords', 'num_user_for_reviews',
       'budget', 'title_year', 'actor_2_facebook_likes', 'imdb_score',
       'movie_facebook_likes'],
      dtype='object')

In [39]:
genres_df = genre_one_hot_df.drop(['duration', 'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_1_facebook_likes', 'gross', 'num_voted_users',
       'cast_total_facebook_likes', 'num_user_for_reviews',
       'budget', 'title_year', 'actor_2_facebook_likes',
       'movie_facebook_likes'], axis = 1)

In [40]:
genres_df

Unnamed: 0,imdb_score,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,7.9,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,7.1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6.8,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,8.5,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
5,6.6,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5026,6.9,0,0,0,0,0,0,0,1,0,...,0,1,0,0,1,0,0,0,0,0
5027,7.5,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
5033,7.0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,1,0,0
5035,6.9,1,0,0,0,0,1,0,1,0,...,0,0,0,0,1,0,0,1,0,0


### splitting into testing & training & standardizing

## model with only plot tags

In [42]:
plot_df = plot_one_hot_df.drop(['duration', 'director_facebook_likes', 'actor_3_facebook_likes',
       'actor_1_facebook_likes', 'gross', 'num_voted_users',
       'cast_total_facebook_likes', 'num_user_for_reviews',
       'budget', 'title_year', 'actor_2_facebook_likes',
       'movie_facebook_likes'], axis = 1)

In [43]:
plot_df

Unnamed: 0,imdb_score,10 year old,1000000 b.c.,1190s,12 step program,12 year old,12 year time span,12th century,13 year old,13 year olds,...,zeus,zodiac,zodiac killer,zoloft,zombie,zombie apocalypse,zombie spoof,zoo,zoologist,zorro
0,7.9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7.1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,6.8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8.5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,6.6,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5026,6.9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5027,7.5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5033,7.0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5035,6.9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### splitting into testing & training & standardizing

#

## model with all plot and genre features

In [51]:
movies_dupe = movies

In [52]:
movies_dupe["plot_keywords"] = plots_lists

In [53]:
mlb = MultiLabelBinarizer(sparse_output=True)

In [54]:
movies_dupe = movies_dupe.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(movies_dupe.pop('plot_keywords')),
                index=movies_dupe.index,
                columns=mlb.classes_))

In [55]:
movies_dupe

Unnamed: 0,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,genres,num_voted_users,cast_total_facebook_likes,num_user_for_reviews,budget,...,zeus,zodiac,zodiac killer,zoloft,zombie,zombie apocalypse,zombie spoof,zoo,zoologist,zorro
0,178.0,0.0,855.0,1000.0,760505847.0,Action|Adventure|Fantasy|Sci-Fi,886204,4834,3054.0,237000000.0,...,0,0,0,0,0,0,0,0,0,0
1,169.0,563.0,1000.0,40000.0,309404152.0,Action|Adventure|Fantasy,471220,48350,1238.0,300000000.0,...,0,0,0,0,0,0,0,0,0,0
2,148.0,0.0,161.0,11000.0,200074175.0,Action|Adventure|Thriller,275868,11700,994.0,245000000.0,...,0,0,0,0,0,0,0,0,0,0
3,164.0,22000.0,23000.0,27000.0,448130642.0,Action|Thriller,1144337,106759,2701.0,250000000.0,...,0,0,0,0,0,0,0,0,0,0
5,132.0,475.0,530.0,640.0,73058679.0,Action|Adventure|Sci-Fi,212204,1873,738.0,263700000.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5026,110.0,107.0,45.0,576.0,136007.0,Drama|Music|Romance,3924,776,39.0,4500.0,...,0,0,0,0,0,0,0,0,0,0
5027,90.0,397.0,0.0,5.0,673780.0,Drama,4555,5,26.0,10000.0,...,0,0,0,0,0,0,0,0,0,0
5033,77.0,291.0,8.0,291.0,424760.0,Drama|Sci-Fi|Thriller,72639,368,371.0,7000.0,...,0,0,0,0,0,0,0,0,0,0
5035,81.0,0.0,6.0,121.0,2040920.0,Action|Crime|Drama|Romance|Thriller,52055,147,130.0,7000.0,...,0,0,0,0,0,0,0,0,0,0


In [56]:
movies_dupe["plot_keywords"] = genres_lists

In [57]:
movies_dupe["genres"] = genres_lists

In [58]:
mlb = MultiLabelBinarizer(sparse_output=True)

In [59]:
movies_dupe = movies_dupe.join(
            pd.DataFrame.sparse.from_spmatrix(
                mlb.fit_transform(movies_dupe.pop('genres')),
                index=movies_dupe.index,
                columns=mlb.classes_))

In [60]:
movies_dupe

Unnamed: 0,duration,director_facebook_likes,actor_3_facebook_likes,actor_1_facebook_likes,gross,num_voted_users,cast_total_facebook_likes,num_user_for_reviews,budget,title_year,...,Horror,Music,Musical,Mystery,Romance,Sci-Fi,Sport,Thriller,War,Western
0,178.0,0.0,855.0,1000.0,760505847.0,886204,4834,3054.0,237000000.0,2009.0,...,0,0,0,0,0,1,0,0,0,0
1,169.0,563.0,1000.0,40000.0,309404152.0,471220,48350,1238.0,300000000.0,2007.0,...,0,0,0,0,0,0,0,0,0,0
2,148.0,0.0,161.0,11000.0,200074175.0,275868,11700,994.0,245000000.0,2015.0,...,0,0,0,0,0,0,0,1,0,0
3,164.0,22000.0,23000.0,27000.0,448130642.0,1144337,106759,2701.0,250000000.0,2012.0,...,0,0,0,0,0,0,0,1,0,0
5,132.0,475.0,530.0,640.0,73058679.0,212204,1873,738.0,263700000.0,2012.0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5026,110.0,107.0,45.0,576.0,136007.0,3924,776,39.0,4500.0,2004.0,...,0,1,0,0,1,0,0,0,0,0
5027,90.0,397.0,0.0,5.0,673780.0,4555,5,26.0,10000.0,2000.0,...,0,0,0,0,0,0,0,0,0,0
5033,77.0,291.0,8.0,291.0,424760.0,72639,368,371.0,7000.0,2004.0,...,0,0,0,0,0,1,0,1,0,0
5035,81.0,0.0,6.0,121.0,2040920.0,52055,147,130.0,7000.0,1992.0,...,0,0,0,0,1,0,0,1,0,0


In [62]:
all_features_df = movies_dupe

# previous code

### dropping nas

In [204]:
movies_no_na = movies_clean_genres.dropna()

In [205]:
#movies_no_na

In [206]:
string_columns = movies_no_na.select_dtypes(exclude=[np.number])
string_columns

Unnamed: 0,movie_color,director_name,actor_2_name,actor_1_name,movie_title,actor_3_name,movie_imdb_link,movie_language,country,content_rating
0,Color,James Cameron,Joel David Moore,CCH Pounder,Avatar,Wes Studi,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,English,USA,PG-13
1,Color,Gore Verbinski,Orlando Bloom,Johnny Depp,Pirates of the Caribbean: At World's End,Jack Davenport,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,English,USA,PG-13
2,Color,Sam Mendes,Rory Kinnear,Christoph Waltz,Spectre,Stephanie Sigman,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,English,UK,PG-13
3,Color,Christopher Nolan,Christian Bale,Tom Hardy,The Dark Knight Rises,Joseph Gordon-Levitt,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,English,USA,PG-13
5,Color,Andrew Stanton,Samantha Morton,Daryl Sabara,John Carter,Polly Walker,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,English,USA,PG-13
...,...,...,...,...,...,...,...,...,...,...
5026,Color,Olivier Assayas,Béatrice Dalle,Maggie Cheung,Clean,Don McKellar,http://www.imdb.com/title/tt0388838/?ref_=fn_t...,French,France,R
5027,Color,Jafar Panahi,Nargess Mamizadeh,Fereshteh Sadre Orafaiy,The Circle,Mojgan Faramarzi,http://www.imdb.com/title/tt0255094/?ref_=fn_t...,Persian,Iran,Not Rated
5033,Color,Shane Carruth,David Sullivan,Shane Carruth,Primer,Casey Gooden,http://www.imdb.com/title/tt0390384/?ref_=fn_t...,English,USA,PG-13
5035,Color,Robert Rodriguez,Peter Marquardt,Carlos Gallardo,El Mariachi,Consuelo Gómez,http://www.imdb.com/title/tt0104815/?ref_=fn_t...,Spanish,USA,R


### finding relevant columns to keep for analysis 

In [207]:
# Dropping string columns
movies_relevant_columns = movies_no_na.columns.drop(["facenumber_in_poster", "aspect_ratio", "movie_imdb_link","movie_color","actor_1_name","actor_2_name","director_name",
                                                    "movie_title","actor_3_name","movie_imdb_link","movie_language","country","content_rating"])
len(movies_relevant_columns)

8101

In [208]:
movies_clean = movies_no_na[movies_relevant_columns]

In [209]:
# Normalization
# x should not contain y
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
import numpy as np
X = movies_clean
scaler.fit(X)
X = pd.DataFrame(scaler.transform(X), columns = X.columns)



### splitting the data into training and testing

In [215]:
import numpy as np
from sklearn.model_selection import train_test_split
np.random.seed(42)

In [211]:
X = X.drop("imdb_score", axis = 1)
y = movies_clean.imdb_score

In [216]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

## Exploratory data analysis

In [None]:
# Check for correlation between predictors using a confusion matrix 
# Helpful- https://jazpeng.github.io/predict_movie_ratings/

## One hot encoding

## Model generation

In [None]:
# Logistic Regression
# Decision tree
# Lasso 
# Ridge 
# KNN classifier
# Random forest
# Compare to baseline accuracy score?
# Use GridSerach CV tool?

# action items

# questions

should we do more EDA to figure out how to clean this properly and / or to indicate what parts of the data we're losing when we drop NAs? (for example, if the NAs are mostly associated with older films or a particular genre of film)