# Import Data

In [1]:
# Import dependencies
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [2]:
# Read in dataset from SQL
#pd.read_sql_table('table_name', 'postgres:///db_name')

# Read in csv data file
prime_data = pd.read_csv("raw_movies.csv")
prime_data.head()

Unnamed: 0,video_id,cvt_per_day,weighted_categorical_position,weighted_horizontal_poition,import_id,release_year,genres,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category
0,385504,307127.6056,1,3,lionsgate,2013,"Action,Thriller,Drama",69614,15000000,42930462,6.5,112.301017,51,other award,PG-13,1.71
1,300175,270338.4264,1,3,lionsgate,2013,"Comedy,Crime,Thriller",46705,15000000,3301046,6.5,94.98325,41,no award,R,3.25
2,361899,256165.8674,1,3,other,2012,"Crime,Drama",197596,26000000,37397291,7.3,115.763675,58,other award,R,2.646667
3,308314,196622.721,3,4,lionsgate,2008,"Thriller,Drama,War,Documentary,Mystery,Action",356339,15000000,15700000,7.6,130.703583,94,Oscar,R,1.666667
4,307201,159841.6521,1,3,lionsgate,2013,"Crime,Thriller,Mystery,Documentary",46720,27220000,8551228,6.4,105.545533,37,other award,R,3.066667


In [3]:
# Investigate datatypes
prime_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4226 entries, 0 to 4225
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   video_id                       4226 non-null   int64  
 1   cvt_per_day                    4226 non-null   float64
 2   weighted_categorical_position  4226 non-null   int64  
 3   weighted_horizontal_poition    4226 non-null   int64  
 4   import_id                      4226 non-null   object 
 5   release_year                   4226 non-null   int64  
 6   genres                         4226 non-null   object 
 7   imdb_votes                     4226 non-null   int64  
 8   budget                         4226 non-null   int64  
 9   boxoffice                      4226 non-null   int64  
 10  imdb_rating                    4226 non-null   float64
 11  duration_in_mins               4226 non-null   float64
 12  metacritic_score               4226 non-null   i

In [4]:
# Count unique values in genres column
prime_data['genres'].value_counts()

Documentary                                         269
Comedy                                              265
Drama                                               253
Horror                                              149
Drama,Romance                                        81
                                                   ... 
Adventure,Fantasy,Horror                              1
Crime,Drama,Music,Romance                             1
Horror,Thriller,Comedy,Sci-Fi                         1
Sci-Fi,War,Thriller                                   1
Thriller,Foreign/International,Independent,Drama      1
Name: genres, Length: 1165, dtype: int64

In [5]:
# Convert genres column from string into a list
prime_data["genres"] = prime_data.genres.apply(lambda x: x.split(','))
prime_data.head()

Unnamed: 0,video_id,cvt_per_day,weighted_categorical_position,weighted_horizontal_poition,import_id,release_year,genres,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category
0,385504,307127.6056,1,3,lionsgate,2013,"[Action, Thriller, Drama]",69614,15000000,42930462,6.5,112.301017,51,other award,PG-13,1.71
1,300175,270338.4264,1,3,lionsgate,2013,"[Comedy, Crime, Thriller]",46705,15000000,3301046,6.5,94.98325,41,no award,R,3.25
2,361899,256165.8674,1,3,other,2012,"[Crime, Drama]",197596,26000000,37397291,7.3,115.763675,58,other award,R,2.646667
3,308314,196622.721,3,4,lionsgate,2008,"[Thriller, Drama, War, Documentary, Mystery, A...",356339,15000000,15700000,7.6,130.703583,94,Oscar,R,1.666667
4,307201,159841.6521,1,3,lionsgate,2013,"[Crime, Thriller, Mystery, Documentary]",46720,27220000,8551228,6.4,105.545533,37,other award,R,3.066667


# Encode, Split, & Standardize Data

In [6]:
# Use MultiLabelBinarizer() to break up and encode genre list 
mlb = MultiLabelBinarizer()
prime_data = prime_data.join(pd.DataFrame(mlb.fit_transform(prime_data.pop('genres')), columns=mlb.classes_, index=prime_data.index))
prime_data.head()

Unnamed: 0,video_id,cvt_per_day,weighted_categorical_position,weighted_horizontal_poition,import_id,release_year,imdb_votes,budget,boxoffice,imdb_rating,...,Music,Musicals,Mystery,Reality,Romance,Sci-Fi,Sport,Thriller,War,Western
0,385504,307127.6056,1,3,lionsgate,2013,69614,15000000,42930462,6.5,...,0,0,0,0,0,0,0,1,0,0
1,300175,270338.4264,1,3,lionsgate,2013,46705,15000000,3301046,6.5,...,0,0,0,0,0,0,0,1,0,0
2,361899,256165.8674,1,3,other,2012,197596,26000000,37397291,7.3,...,0,0,0,0,0,0,0,0,0,0
3,308314,196622.721,3,4,lionsgate,2008,356339,15000000,15700000,7.6,...,0,0,1,0,0,0,0,1,1,0
4,307201,159841.6521,1,3,lionsgate,2013,46720,27220000,8551228,6.4,...,0,0,1,0,0,0,0,1,0,0


In [7]:
# Split preprocessed data into features and target arrays
y = prime_data["star_category"].values
X = prime_data.drop(["star_category"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [8]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

ValueError: could not convert string to float: 'mgm'

# Create Learning Model

In [None]:
# Define the model

# Add hidden layers

# Add the output layer that uses a probability activation function

# Check the structure of the Sequential model


In [None]:
# Compile the Sequential model together and customize metrics


# Train and Test Neural Network

In [None]:
# Fit / train the model to the training data


In [None]:
# Evaluate model performance using the test data
