In [1]:
# Import dependencies
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Data source: the raw movie records. Loading from SQL is the alternative path:
#pd.read_sql_table('table_name', 'postgres:///db_name')

# Load the CSV export into a DataFrame and preview the first rows
raw_movies_csv = "raw_movies.csv"
prime_data = pd.read_csv(raw_movies_csv)
prime_data.head()

Unnamed: 0,video_id,cvt_per_day,weighted_categorical_position,weighted_horizontal_poition,import_id,release_year,genres,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category
0,385504,307127.6056,1,3,lionsgate,2013,"Action,Thriller,Drama",69614,15000000,42930462,6.5,112.301017,51,other award,PG-13,1.71
1,300175,270338.4264,1,3,lionsgate,2013,"Comedy,Crime,Thriller",46705,15000000,3301046,6.5,94.98325,41,no award,R,3.25
2,361899,256165.8674,1,3,other,2012,"Crime,Drama",197596,26000000,37397291,7.3,115.763675,58,other award,R,2.646667
3,308314,196622.721,3,4,lionsgate,2008,"Thriller,Drama,War,Documentary,Mystery,Action",356339,15000000,15700000,7.6,130.703583,94,Oscar,R,1.666667
4,307201,159841.6521,1,3,lionsgate,2013,"Crime,Thriller,Mystery,Documentary",46720,27220000,8551228,6.4,105.545533,37,other award,R,3.066667


In [3]:
# Investigate datatypes: print each column's non-null count and dtype
prime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4226 entries, 0 to 4225
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   video_id                       4226 non-null   int64  
 1   cvt_per_day                    4226 non-null   float64
 2   weighted_categorical_position  4226 non-null   int64  
 3   weighted_horizontal_poition    4226 non-null   int64  
 4   import_id                      4226 non-null   object 
 5   release_year                   4226 non-null   int64  
 6   genres                         4226 non-null   object 
 7   imdb_votes                     4226 non-null   int64  
 8   budget                         4226 non-null   int64  
 9   boxoffice                      4226 non-null   int64  
 10  imdb_rating                    4226 non-null   float64
 11  duration_in_mins               4226 non-null   float64
 12  metacritic_score               4226 non-null   i

Starting Code for simple cleaning

In [4]:
# Drop columns that are not needed downstream.
# Note for readers: Sam already did a lot of cleaning; the data appears to be robust,
# uniform, and thorough, so the only cleaning left here is dropping columns.
#
# Keep an independent copy of the untouched frame so nothing has to be re-read if a
# column turns out to have been dropped by mistake. `.copy()` is used (rather than a
# plain assignment, which would only create a second name for the same object) so the
# backup cannot be affected by any later in-place change.
prime_data_original_df = prime_data.copy()

prime_data = prime_data.drop(columns=[
    'cvt_per_day',
    'weighted_categorical_position',
    'weighted_horizontal_poition',  # (sic) spelled this way in the dataset
])
prime_data.head()

Unnamed: 0,video_id,import_id,release_year,genres,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category
0,385504,lionsgate,2013,"Action,Thriller,Drama",69614,15000000,42930462,6.5,112.301017,51,other award,PG-13,1.71
1,300175,lionsgate,2013,"Comedy,Crime,Thriller",46705,15000000,3301046,6.5,94.98325,41,no award,R,3.25
2,361899,other,2012,"Crime,Drama",197596,26000000,37397291,7.3,115.763675,58,other award,R,2.646667
3,308314,lionsgate,2008,"Thriller,Drama,War,Documentary,Mystery,Action",356339,15000000,15700000,7.6,130.703583,94,Oscar,R,1.666667
4,307201,lionsgate,2013,"Crime,Thriller,Mystery,Documentary",46720,27220000,8551228,6.4,105.545533,37,other award,R,3.066667


Starting Code for OneHotEncoder

In [5]:
# Tally how often each distinct comma-separated genre combination occurs
genre_combination_counts = prime_data["genres"].value_counts()
genre_combination_counts

Documentary                            269
Comedy                                 265
Drama                                  253
Horror                                 149
Drama,Romance                           81
                                      ... 
Independent,Comedy                       1
Mystery,Drama,Thriller                   1
Drama,Crime,Mystery,Sci-Fi,Thriller      1
Action,Kids & Family,Independent         1
Action,Independent,Adventure,Crime       1
Name: genres, Length: 1165, dtype: int64

In [6]:
# Break the comma-separated genres string into one column per listed position.
# Rows with fewer genres than the widest row are padded with missing values.
genres_df = prime_data["genres"].str.split(",", expand=True)

# The split yields integer column labels 0..N-1; relabel them genre1..genreN
genres_df.columns = ["genre" + str(position + 1) for position in genres_df.columns]
genres_df.head()

Unnamed: 0,genre1,genre2,genre3,genre4,genre5,genre6,genre7
0,Action,Thriller,Drama,,,,
1,Comedy,Crime,Thriller,,,,
2,Crime,Drama,,,,,
3,Thriller,Drama,War,Documentary,Mystery,Action,
4,Crime,Thriller,Mystery,Documentary,,,


In [7]:
# LabelEncoder.fit_transform() first learns the set of labels in a column and then
# replaces each text label with its integer code.
le = LabelEncoder()
label_encoded_columns = ('genre1', 'genre2')

# Cast to str on genres_df itself (not on a copy), so missing entries become
# ordinary strings that the encoder can handle.
for genre_column in label_encoded_columns:
    genres_df[genre_column] = genres_df[genre_column].astype(str)

# Encode on a copy so genres_df keeps its text labels.
# The same encoder object is refit per column, so afterwards `le` holds the
# label set of the last column encoded (genre2).
genres2 = genres_df.copy()
for genre_column in label_encoded_columns:
    genres2[genre_column] = le.fit_transform(genres2[genre_column])

genres2.head()

Unnamed: 0,genre1,genre2,genre3,genre4,genre5,genre6,genre7
0,0,23,Drama,,,,
1,5,5,Thriller,,,,
2,6,7,,,,,
3,23,7,War,Documentary,Mystery,Action,
4,6,23,Mystery,Documentary,,,


In [8]:
# Create the OneHotEncoder instance, asking for a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` (and 1.4 removed `sparse`),
# so try the current keyword first and fall back for older installs.
try:
    enc = OneHotEncoder(sparse_output=False)
except TypeError:
    enc = OneHotEncoder(sparse=False)

# Fit the encoder on the first listed genre only (genre1) and build the encoded
# DataFrame. The index is taken from genres_df so the later index-based merge
# lines up row for row.
# NOTE(review): genre2..genreN are not encoded here - confirm that is intended.
encode_df = pd.DataFrame(
    enc.fit_transform(genres_df.genre1.values.reshape(-1, 1)),
    index=genres_df.index,
)

# Rename encoded columns to genres_<label>.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use whichever this install provides.
if hasattr(enc, "get_feature_names_out"):
    encode_df.columns = enc.get_feature_names_out(['genres'])
else:
    encode_df.columns = enc.get_feature_names(['genres'])
encode_df.head()

Unnamed: 0,genres_Action,genres_Adult,genres_Adventure,genres_Animation,genres_Anime,genres_Comedy,genres_Crime,genres_Documentary,genres_Drama,genres_Fantasy,...,genres_Music,genres_Musicals,genres_Mystery,genres_Reality,genres_Romance,genres_Sci-Fi,genres_Sport,genres_Thriller,genres_War,genres_Western
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# Merge the two DataFrames on their row index, then drop the raw genres column.
# `columns=` is used because passing the axis positionally (`.drop("genres", 1)`)
# is no longer accepted by pandas 2.x.
prime_data_encoded = (
    prime_data
    .merge(encode_df, left_index=True, right_index=True)
    .drop(columns="genres")
)

In [12]:
# Display the merged frame: the original columns minus "genres", plus the one-hot genre columns
prime_data_encoded

Unnamed: 0,video_id,import_id,release_year,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,...,genres_Music,genres_Musicals,genres_Mystery,genres_Reality,genres_Romance,genres_Sci-Fi,genres_Sport,genres_Thriller,genres_War,genres_Western
0,385504,lionsgate,2013,69614,15000000,42930462,6.5,112.301017,51,other award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,300175,lionsgate,2013,46705,15000000,3301046,6.5,94.983250,41,no award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,361899,other,2012,197596,26000000,37397291,7.3,115.763675,58,other award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,308314,lionsgate,2008,356339,15000000,15700000,7.6,130.703583,94,Oscar,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,307201,lionsgate,2013,46720,27220000,8551228,6.4,105.545533,37,other award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4221,160848,other,2010,14,0,0,7.8,4.311600,0,no award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4222,365178,other,2013,0,0,0,0.0,4.878900,0,no award,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4223,11615,other,2008,505,0,0,7.1,110.350000,0,other award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4224,26365,other,2003,5,100000,0,6.0,79.633333,0,no award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
# Integer code assigned to each studio label in the import_id column.
# The codes are set by hand instead of with LabelEncoder, which would number the
# labels alphabetically (lionsgate=0, mgm=1, other=2, paramount=3).
import_id = {
    "lionsgate": 1,
    "mgm": 2,
    "paramount": 3,
    "other": 0,
}

# Fail fast, with a readable message, on any label the mapping does not cover;
# Series.map would otherwise silently turn such labels into NaN. This also fires
# if the cell is re-run after the column has already been encoded.
unmapped_import_ids = set(prime_data_encoded["import_id"].unique()) - set(import_id)
if unmapped_import_ids:
    raise KeyError(f"Unmapped import_id values: {sorted(map(repr, unmapped_import_ids))}")

prime_data_encoded["import_id"] = prime_data_encoded["import_id"].map(import_id)


In [14]:
# Preview the frame after import_id has been mapped to integer codes
prime_data_encoded.head()

Unnamed: 0,video_id,import_id,release_year,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,...,genres_Music,genres_Musicals,genres_Mystery,genres_Reality,genres_Romance,genres_Sci-Fi,genres_Sport,genres_Thriller,genres_War,genres_Western
0,385504,1,2013,69614,15000000,42930462,6.5,112.301017,51,other award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,300175,1,2013,46705,15000000,3301046,6.5,94.98325,41,no award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,361899,0,2012,197596,26000000,37397291,7.3,115.763675,58,other award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,308314,1,2008,356339,15000000,15700000,7.6,130.703583,94,Oscar,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,307201,1,2013,46720,27220000,8551228,6.4,105.545533,37,other award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Relabel the encoded studio column from "import_id" to "producer"
column_renames = {"import_id": "producer"}
prime_data_encoded = prime_data_encoded.rename(columns=column_renames)
prime_data_encoded.head()

Unnamed: 0,video_id,producer,release_year,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,...,genres_Music,genres_Musicals,genres_Mystery,genres_Reality,genres_Romance,genres_Sci-Fi,genres_Sport,genres_Thriller,genres_War,genres_Western
0,385504,1,2013,69614,15000000,42930462,6.5,112.301017,51,other award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,300175,1,2013,46705,15000000,3301046,6.5,94.98325,41,no award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,361899,0,2012,197596,26000000,37397291,7.3,115.763675,58,other award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,308314,1,2008,356339,15000000,15700000,7.6,130.703583,94,Oscar,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,307201,1,2013,46720,27220000,8551228,6.4,105.545533,37,other award,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Integer code assigned to each label in the awards column.
# NOTE(review): the codes are identifiers, not a ranking ("other award" is 4 while
# "Oscar" is 3) - confirm downstream models treat this column as categorical.
awards = {
    "BAFTA": 1,
    "Golden Globe": 2,
    "Oscar": 3,
    "other award": 4,
    "no award": 0,
}

# Fail fast, with a readable message, on any label the mapping does not cover;
# Series.map would otherwise silently turn such labels into NaN. This also fires
# if the cell is re-run after the column has already been encoded.
unmapped_awards = set(prime_data_encoded["awards"].unique()) - set(awards)
if unmapped_awards:
    raise KeyError(f"Unmapped awards values: {sorted(map(repr, unmapped_awards))}")

prime_data_encoded["awards"] = prime_data_encoded["awards"].map(awards)

In [26]:
# Preview the frame after awards has been mapped to integer codes
prime_data_encoded.head()

Unnamed: 0,video_id,producer,release_year,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,...,genres_Music,genres_Musicals,genres_Mystery,genres_Reality,genres_Romance,genres_Sci-Fi,genres_Sport,genres_Thriller,genres_War,genres_Western
0,385504,1,2013,69614,15000000,42930462,6.5,112.301017,51,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,300175,1,2013,46705,15000000,3301046,6.5,94.98325,41,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,361899,0,2012,197596,26000000,37397291,7.3,115.763675,58,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,308314,1,2008,356339,15000000,15700000,7.6,130.703583,94,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,307201,1,2013,46720,27220000,8551228,6.4,105.545533,37,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# Integer code assigned to each label in the mpaa rating column.
# NOTE(review): the codes do not follow rating restrictiveness ("NC-17" is 4 while
# "R" is 5) - confirm downstream models treat this column as categorical.
mpaa = {
    "G": 1,
    "PG": 2,
    "PG-13": 3,
    "NC-17": 4,
    "R": 5,
    "NotRated": 0,
}

# Fail fast, with a readable message, on any label the mapping does not cover;
# Series.map would otherwise silently turn such labels into NaN. This also fires
# if the cell is re-run after the column has already been encoded.
unmapped_mpaa = set(prime_data_encoded["mpaa"].unique()) - set(mpaa)
if unmapped_mpaa:
    raise KeyError(f"Unmapped mpaa values: {sorted(map(repr, unmapped_mpaa))}")

prime_data_encoded["mpaa"] = prime_data_encoded["mpaa"].map(mpaa)

In [28]:
# Preview the frame after mpaa has been mapped to integer codes
prime_data_encoded.head()

Unnamed: 0,video_id,producer,release_year,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,...,genres_Music,genres_Musicals,genres_Mystery,genres_Reality,genres_Romance,genres_Sci-Fi,genres_Sport,genres_Thriller,genres_War,genres_Western
0,385504,1,2013,69614,15000000,42930462,6.5,112.301017,51,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,300175,1,2013,46705,15000000,3301046,6.5,94.98325,41,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,361899,0,2012,197596,26000000,37397291,7.3,115.763675,58,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,308314,1,2008,356339,15000000,15700000,7.6,130.703583,94,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,307201,1,2013,46720,27220000,8551228,6.4,105.545533,37,4,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
