# Import Data

In [38]:
# Import dependencies
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [32]:
# Read in dataset from SQL
#pd.read_sql_table('table_name', 'postgres:///db_name')

# Read in csv data file
prime_data = pd.read_csv("raw_movies.csv")
prime_data.head()

Unnamed: 0,video_id,cvt_per_day,weighted_categorical_position,weighted_horizontal_poition,import_id,release_year,genres,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category
0,385504,307127.6056,1,3,lionsgate,2013,"Action,Thriller,Drama",69614,15000000,42930462,6.5,112.301017,51,other award,PG-13,1.71
1,300175,270338.4264,1,3,lionsgate,2013,"Comedy,Crime,Thriller",46705,15000000,3301046,6.5,94.98325,41,no award,R,3.25
2,361899,256165.8674,1,3,other,2012,"Crime,Drama",197596,26000000,37397291,7.3,115.763675,58,other award,R,2.646667
3,308314,196622.721,3,4,lionsgate,2008,"Thriller,Drama,War,Documentary,Mystery,Action",356339,15000000,15700000,7.6,130.703583,94,Oscar,R,1.666667
4,307201,159841.6521,1,3,lionsgate,2013,"Crime,Thriller,Mystery,Documentary",46720,27220000,8551228,6.4,105.545533,37,other award,R,3.066667


In [33]:
# Investigate datatypes
prime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4226 entries, 0 to 4225
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   video_id                       4226 non-null   int64  
 1   cvt_per_day                    4226 non-null   float64
 2   weighted_categorical_position  4226 non-null   int64  
 3   weighted_horizontal_poition    4226 non-null   int64  
 4   import_id                      4226 non-null   object 
 5   release_year                   4226 non-null   int64  
 6   genres                         4226 non-null   object 
 7   imdb_votes                     4226 non-null   int64  
 8   budget                         4226 non-null   int64  
 9   boxoffice                      4226 non-null   int64  
 10  imdb_rating                    4226 non-null   float64
 11  duration_in_mins               4226 non-null   float64
 12  metacritic_score               4226 non-null   i

In [34]:
# Count unique values in genres column
prime_data['genres'].value_counts()

Documentary                           269
Comedy                                265
Drama                                 253
Horror                                149
Drama,Romance                          81
                                     ... 
Drama,Kids & Family,Musicals            1
Drama,Kids & Family,Action,Romance      1
Action,Adventure,Comedy,Thriller        1
Action,Adventure,Western,Drama          1
Fantasy,Sci-Fi,Drama                    1
Name: genres, Length: 1165, dtype: int64

In [35]:
# Split different genres into different columns
genres_df = prime_data['genres'].str.split(',', expand=True).rename(columns = lambda x: "genre"+str(x+1))
genres_df.head()

Unnamed: 0,genre1,genre2,genre3,genre4,genre5,genre6,genre7
0,Action,Thriller,Drama,,,,
1,Comedy,Crime,Thriller,,,,
2,Crime,Drama,,,,,
3,Thriller,Drama,War,Documentary,Mystery,Action,
4,Crime,Thriller,Mystery,Documentary,,,


In [41]:
#The label encoder’s fit_transform() method is used to first train the label encoder, 
#then convert the text data into numerical data.
le = LabelEncoder()
genres_df['genre1'] = genres_df['genre1'].astype(str)
genres_df['genre2'] = genres_df['genre2'].astype(str)

genres2 = genres_df.copy()
genres2['genre1'] = le.fit_transform(genres2['genre1'])
genres2['genre2'] = le.fit_transform(genres2['genre2'])

genres2.head()

Unnamed: 0,genre1,genre2,genre3,genre4,genre5,genre6,genre7
0,0,23,Drama,,,,
1,5,5,Thriller,,,,
2,6,7,,,,,
3,23,7,War,Documentary,Mystery,Action,
4,6,23,Mystery,Documentary,,,


# Encode, Split, & Standardize Data

In [36]:
# Create the OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit the encoder and produce encoded DataFrame
encode_df = pd.DataFrame(enc.fit_transform(genres_df.genre1.values.reshape(-1,1)))

# Rename encoded columns
#encode_df.columns = enc.get_feature_names(['genres'])
encode_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,16,17,18,19,20,21,22,23,24,25
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Merge the two DataFrames together and drop the genres column
prime_data.merge(encode_df,left_index=True,right_index=True).drop("genres",1)

In [4]:
# Split preprocessed data into features and target arrays

# Split the preprocessed data into a training and testing dataset


In [5]:
# Create a StandardScaler instance

# Fit the StandardScaler

# Scale the data


# Create Learning Model

In [6]:
# Define the model

# Add hidden layers

# Add the output layer that uses a probability activation function

# Check the structure of the Sequential model


In [7]:
# Compile the Sequential model together and customize metrics


# Train and Test Neural Network

In [8]:
# Fit / train the model to the training data


In [9]:
# Evaluate model performance using the test data
