<a href="https://colab.research.google.com/github/hemsmalli5/Final-Project---Week1/blob/master/Project_Master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# ***Movie Data Analysis***

Through this project we plan to focus on following business predictions/questions:

Predict popular movie ratings and/or genres within certain release period and intricate genres relationships based on investment and release years.

Analysis specific to one genre, predict if highest budget action movies delivered better revenue?



In [1]:
# Import dependencies
#from google.colab import files
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer, StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

# **1. Import Data**

In [2]:
# Upload file
#uploaded = files.upload()

In [3]:
# read the data file
movie_data = pd.read_csv('raw_movies.csv', header= 0, sep= ',', lineterminator='\n')
movie_data.head()

Unnamed: 0,video_id,cvt_per_day,weighted_categorical_position,weighted_horizontal_poition,import_id,release_year,genres,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category\r
0,385504,307127.6056,1,3,lionsgate,2013,"Action,Thriller,Drama",69614,15000000,42930462,6.5,112.301017,51,other award,PG-13,1.71
1,300175,270338.4264,1,3,lionsgate,2013,"Comedy,Crime,Thriller",46705,15000000,3301046,6.5,94.98325,41,no award,R,3.25
2,361899,256165.8674,1,3,other,2012,"Crime,Drama",197596,26000000,37397291,7.3,115.763675,58,other award,R,2.646667
3,308314,196622.721,3,4,lionsgate,2008,"Thriller,Drama,War,Documentary,Mystery,Action",356339,15000000,15700000,7.6,130.703583,94,Oscar,R,1.666667
4,307201,159841.6521,1,3,lionsgate,2013,"Crime,Thriller,Mystery,Documentary",46720,27220000,8551228,6.4,105.545533,37,other award,R,3.066667


In [4]:
# For Step 2, will need to establish SQL Database connection and read in dataset from DB
# pd.read_sql_table('table_name', 'postgres:///db_name')

# **2. Explore data**

## Understand Numerical Features

In [5]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4226 entries, 0 to 4225
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   video_id                       4226 non-null   int64  
 1   cvt_per_day                    4226 non-null   float64
 2   weighted_categorical_position  4226 non-null   int64  
 3   weighted_horizontal_poition    4226 non-null   int64  
 4   import_id                      4226 non-null   object 
 5   release_year                   4226 non-null   int64  
 6   genres                         4226 non-null   object 
 7   imdb_votes                     4226 non-null   int64  
 8   budget                         4226 non-null   int64  
 9   boxoffice                      4226 non-null   int64  
 10  imdb_rating                    4226 non-null   float64
 11  duration_in_mins               4226 non-null   float64
 12  metacritic_score               4226 non-null   i

# **3. Clean and Prep Data**

In [6]:
#Make copy of df for editing
movie_data_2 = movie_data.copy()

## Exclude Erroneous Data

Each video should only appear once in the list, duplicated video will be removed.

In [7]:
if movie_data_2['video_id'].duplicated().sum()==0:
  print('no duplicated index')

no duplicated index


## Remove Columns

In [8]:
# Dropping unnecessary data
movie_data_2 = movie_data_2.drop(columns=['weighted_categorical_position', 'weighted_horizontal_poition'])
movie_data_2.head()

Unnamed: 0,video_id,cvt_per_day,import_id,release_year,genres,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category\r
0,385504,307127.6056,lionsgate,2013,"Action,Thriller,Drama",69614,15000000,42930462,6.5,112.301017,51,other award,PG-13,1.71
1,300175,270338.4264,lionsgate,2013,"Comedy,Crime,Thriller",46705,15000000,3301046,6.5,94.98325,41,no award,R,3.25
2,361899,256165.8674,other,2012,"Crime,Drama",197596,26000000,37397291,7.3,115.763675,58,other award,R,2.646667
3,308314,196622.721,lionsgate,2008,"Thriller,Drama,War,Documentary,Mystery,Action",356339,15000000,15700000,7.6,130.703583,94,Oscar,R,1.666667
4,307201,159841.6521,lionsgate,2013,"Crime,Thriller,Mystery,Documentary",46720,27220000,8551228,6.4,105.545533,37,other award,R,3.066667


## Encode Columns

In [9]:
# Encoded the import_id column alphabetically
import_id = {
    "lionsgate": 1,
    "mgm": 2,
    "paramount":3,
    "other": 0,}

movie_data_2["import_id"] = movie_data_2["import_id"].apply(lambda x: import_id[x])

#Other option is to use OneHotEncoding (1 or 0 column encoding) to reduce numerical bias

In [10]:
# Rename importr_id column
movie_data_2 = movie_data_2.rename(columns = {"import_id":"production_comp"})

In [11]:
# Encoded the awards column
awards = {
    "BAFTA": 1,
    "Golden Globe": 2,
    "Oscar":3,
    "other award": 4,
    "no award": 0}

movie_data_2["awards"] = movie_data_2["awards"].apply(lambda x: awards[x])

In [12]:
# Encoded the mpaa column
mpaa = {
    "G": 1,
    "PG": 2,
    "PG-13":3,
    "NC-17": 4,
    "R": 5,
    "NotRated": 0} 

movie_data_2["mpaa"] = movie_data_2["mpaa"].apply(lambda x: mpaa[x])

## Split Genres

Some videos belong to more than one genere. Splitting each genre into a new column will help the ML model.

In [13]:
# Count unique values in genres column
movie_data_2['genres'].value_counts()

Documentary                                                   269
Comedy                                                        265
Drama                                                         253
Horror                                                        149
Drama,Romance                                                  81
                                                             ... 
Documentary,Music,Comedy                                        1
Drama,Thriller,Action                                           1
Fantasy,Sci-Fi,Drama                                            1
Foreign/International,Documentary,Adventure,Comedy,Romance      1
Sci-Fi,Horror,Fantasy                                           1
Name: genres, Length: 1165, dtype: int64

In [14]:
# Convert genres column from string into a list
movie_data_2["genres"] = movie_data_2.genres.apply(lambda x: x.split(','))
movie_data_2.head()

Unnamed: 0,video_id,cvt_per_day,production_comp,release_year,genres,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category\r
0,385504,307127.6056,1,2013,"[Action, Thriller, Drama]",69614,15000000,42930462,6.5,112.301017,51,4,3,1.71
1,300175,270338.4264,1,2013,"[Comedy, Crime, Thriller]",46705,15000000,3301046,6.5,94.98325,41,0,5,3.25
2,361899,256165.8674,0,2012,"[Crime, Drama]",197596,26000000,37397291,7.3,115.763675,58,4,5,2.646667
3,308314,196622.721,1,2008,"[Thriller, Drama, War, Documentary, Mystery, A...",356339,15000000,15700000,7.6,130.703583,94,3,5,1.666667
4,307201,159841.6521,1,2013,"[Crime, Thriller, Mystery, Documentary]",46720,27220000,8551228,6.4,105.545533,37,4,5,3.066667


In [15]:
# Use MultiLabelBinarizer() to break up and encode genre list 
mlb = MultiLabelBinarizer()
movie_data_2 = movie_data_2.join(pd.DataFrame(mlb.fit_transform(movie_data_2.pop('genres')), columns=mlb.classes_, index=movie_data_2.index))
movie_data_2.head()

Unnamed: 0,video_id,cvt_per_day,production_comp,release_year,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,...,Music,Musicals,Mystery,Reality,Romance,Sci-Fi,Sport,Thriller,War,Western
0,385504,307127.6056,1,2013,69614,15000000,42930462,6.5,112.301017,51,...,0,0,0,0,0,0,0,1,0,0
1,300175,270338.4264,1,2013,46705,15000000,3301046,6.5,94.98325,41,...,0,0,0,0,0,0,0,1,0,0
2,361899,256165.8674,0,2012,197596,26000000,37397291,7.3,115.763675,58,...,0,0,0,0,0,0,0,0,0,0
3,308314,196622.721,1,2008,356339,15000000,15700000,7.6,130.703583,94,...,0,0,1,0,0,0,0,1,1,0
4,307201,159841.6521,1,2013,46720,27220000,8551228,6.4,105.545533,37,...,0,0,1,0,0,0,0,1,0,0


# **4. Pre-Processing for ML**
## Split & Standardize Data

In [16]:
# Split preprocessed data into features and target arrays
y = movie_data_2["star_category\r"].values
X = movie_data_2.drop(["star_category\r"],1).values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [17]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# **5. Create Learning Model**

In [18]:
# Define the model
number_input_features = len(X_train[0])
nn_model = tf.keras.models.Sequential()

# Add layer to the Sequential model using Keras’ Dense class
nn_model.add(tf.keras.layers.Dense(5, input_dim=number_input_features, activation="relu"))

# Add the output layer that uses a probability activation function
nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the Sequential model
nn_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 5)                 200       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 6         
Total params: 206
Trainable params: 206
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Compile the Sequential model together and customize metrics
nn_model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# **6. Train and Test Neural Network**

In [20]:
# Fit / train the keras model to the training data
fit_model = nn_model.fit(X_train_scaled, y_train, epochs=5)

Train on 3169 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [21]:
# Evaluate model performance using the test data
model_loss, model_accuracy = nn_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

1057/1 - 0s - loss: -1.6226e+00 - accuracy: 0.2469
Loss: -3.67988591991513, Accuracy: 0.2469252645969391


# **7. Precitions / Conclusion**

# **8. Summary**
