<a href="https://colab.research.google.com/github/hemsmalli5/Final-Project---Week1/blob/master/Project_Master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# ***Video Data Analysis***


In [None]:
# Import dependencies
from google.colab import files
import numpy as np
import pandas as pd
import sklearn as sl
import sklearn.preprocessing as preprocessing
from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
def _three_decimals(value):
    """Render a float with exactly three digits after the decimal point."""
    return '%.3f' % value


# Notebook-wide pandas display settings: fixed three-decimal floats, no truncation of
# columns or rows, and text cells cut off at 100 characters.
for _option_name, _option_value in (
    ('display.float_format', _three_decimals),
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('max_colwidth', 100),
):
    pd.set_option(_option_name, _option_value)

# **1. Import Data**

In [None]:
# Upload file
# Opens the Colab file picker; the result is kept in `uploaded` for later cells.
# NOTE(review): the captured output shows a repeated upload being saved under a
# different name ("raw_movies (2).csv"), so the on-disk name may not be the one expected.
uploaded = files.upload()

Saving raw_movies.csv to raw_movies (2).csv


In [None]:
# Read the data file.
# The bytes returned by files.upload() are parsed directly instead of re-opening the
# hard-coded 'raw_movies.csv': when a file with that name already exists, Colab stores
# the new upload under another name (e.g. 'raw_movies (2).csv'), so reading the fixed
# path would silently load a stale copy.
import io

if not uploaded:
    raise ValueError('No file was uploaded; re-run the upload cell and select the movie CSV.')

# Only one file is expected; take the first (and normally only) uploaded entry.
uploaded_name = next(iter(uploaded))

# NOTE(review): lineterminator='\n' presumably leaves the '\r' of CRLF line endings in
# the last header, which is why later cells address that column as 'star_category\r'.
# It is kept unchanged here so those cells keep working.
movie_data = pd.read_csv(io.BytesIO(uploaded[uploaded_name]), header=0, sep=',', lineterminator='\n')
movie_data.head()

Unnamed: 0,video_id,cvt_per_day,weighted_categorical_position,weighted_horizontal_poition,import_id,release_year,genres,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category\r
0,385504,307127.606,1,3,lionsgate,2013,"Action,Thriller,Drama",69614,15000000,42930462,6.5,112.301,51,other award,PG-13,1.71
1,300175,270338.426,1,3,lionsgate,2013,"Comedy,Crime,Thriller",46705,15000000,3301046,6.5,94.983,41,no award,R,3.25
2,361899,256165.867,1,3,other,2012,"Crime,Drama",197596,26000000,37397291,7.3,115.764,58,other award,R,2.647
3,308314,196622.721,3,4,lionsgate,2008,"Thriller,Drama,War,Documentary,Mystery,Action",356339,15000000,15700000,7.6,130.704,94,Oscar,R,1.667
4,307201,159841.652,1,3,lionsgate,2013,"Crime,Thriller,Mystery,Documentary",46720,27220000,8551228,6.4,105.546,37,other award,R,3.067


In [None]:
# For Step 2, will need to establish SQL Database connection and read in dataset from DB
# pd.read_sql_table('table_name', 'postgres:///db_name')

# **2. Explore data**

## Understand Numerical Features

In [None]:
# Print each column's dtype and non-null count for the raw frame.
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4226 entries, 0 to 4225
Data columns (total 16 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   video_id                       4226 non-null   int64  
 1   cvt_per_day                    4226 non-null   float64
 2   weighted_categorical_position  4226 non-null   int64  
 3   weighted_horizontal_poition    4226 non-null   int64  
 4   import_id                      4226 non-null   object 
 5   release_year                   4226 non-null   int64  
 6   genres                         4226 non-null   object 
 7   imdb_votes                     4226 non-null   int64  
 8   budget                         4226 non-null   int64  
 9   boxoffice                      4226 non-null   int64  
 10  imdb_rating                    4226 non-null   float64
 11  duration_in_mins               4226 non-null   float64
 12  metacritic_score               4226 non-null   i

# **3. Clean and Prep Data**

In [None]:
# Work on an independent (deep) copy so the raw frame loaded above stays untouched.
movie_data_2 = movie_data.copy(deep=True)

## Exclude Erroneous Data

Each video should appear only once in the list; duplicated videos will be removed.

In [None]:
# Each video_id must be unique. Report when that already holds; otherwise actually
# remove the repeats (keeping the first occurrence) instead of silently doing nothing.
duplicate_count = movie_data_2['video_id'].duplicated().sum()
if duplicate_count == 0:
    print('no duplicated index')
else:
    movie_data_2 = (
        movie_data_2
        .drop_duplicates(subset='video_id', keep='first')
        .reset_index(drop=True)  # keep a clean 0..n-1 index for the later index-aligned join
    )
    print('removed %d duplicated video_id rows' % duplicate_count)

no duplicated index


## Remove Columns

In [None]:
# Remove the two weighted position columns, which are not used further in this notebook.
# ('poition' is spelled as it appears in the data file's header.)
position_columns = ['weighted_categorical_position', 'weighted_horizontal_poition']
movie_data_2 = movie_data_2.drop(position_columns, axis=1)
movie_data_2.head()

Unnamed: 0,video_id,cvt_per_day,import_id,release_year,genres,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category\r
0,385504,307127.606,lionsgate,2013,"Action,Thriller,Drama",69614,15000000,42930462,6.5,112.301,51,other award,PG-13,1.71
1,300175,270338.426,lionsgate,2013,"Comedy,Crime,Thriller",46705,15000000,3301046,6.5,94.983,41,no award,R,3.25
2,361899,256165.867,other,2012,"Crime,Drama",197596,26000000,37397291,7.3,115.764,58,other award,R,2.647
3,308314,196622.721,lionsgate,2008,"Thriller,Drama,War,Documentary,Mystery,Action",356339,15000000,15700000,7.6,130.704,94,Oscar,R,1.667
4,307201,159841.652,lionsgate,2013,"Crime,Thriller,Mystery,Documentary",46720,27220000,8551228,6.4,105.546,37,other award,R,3.067


## Encode Columns

In [None]:
# Replace each import_id label with an integer code ("other" is the 0 bucket).
# A label missing from the table raises KeyError rather than being silently skipped.
import_id = dict(lionsgate=1, mgm=2, paramount=3, other=0)

movie_data_2["import_id"] = movie_data_2["import_id"].map(lambda label: import_id[label])

# Alternative: one-hot encoding (one 0/1 column per label) would avoid implying an
# order between the labels.

In [None]:
# Give the import_id column a more descriptive name.
movie_data_2 = movie_data_2.rename({"import_id": "production_comp"}, axis="columns")

In [None]:
# Replace each awards label with an integer code ("no award" is 0).
# A label missing from the table raises KeyError rather than being silently skipped.
awards = dict([
    ("BAFTA", 1),
    ("Golden Globe", 2),
    ("Oscar", 3),
    ("other award", 4),
    ("no award", 0),
])

movie_data_2["awards"] = movie_data_2["awards"].map(lambda label: awards[label])

In [None]:
# Replace each MPAA rating label with an integer code ("NotRated" is 0).
# A label missing from the table raises KeyError rather than being silently skipped.
mpaa = dict([
    ("G", 1),
    ("PG", 2),
    ("PG-13", 3),
    ("NC-17", 4),
    ("R", 5),
    ("NotRated", 0),
])

movie_data_2["mpaa"] = movie_data_2["mpaa"].map(lambda label: mpaa[label])

## Split Genres

Some videos belong to more than one genre. Splitting the genres so that each genre gets its own column will help the ML model.

In [None]:
# How often each distinct (comma-separated) genre string occurs, most frequent first.
movie_data_2.genres.value_counts()

Documentary                                                          269
Comedy                                                               265
Drama                                                                253
Horror                                                               149
Drama,Romance                                                         81
Comedy,Drama                                                          79
Thriller                                                              77
Comedy,Drama,Romance                                                  58
Horror,Thriller                                                       58
Comedy,Romance                                                        51
Documentary,Music                                                     48
Action                                                                46
Drama,Comedy                                                          42
Music                                              

In [None]:
def _split_genres(genre_text):
    """Turn a comma-separated genre string into a list of genre names."""
    return genre_text.split(',')


# Each genres cell becomes a Python list, the input format MultiLabelBinarizer expects.
movie_data_2["genres"] = movie_data_2["genres"].map(_split_genres)
movie_data_2.head()

Unnamed: 0,video_id,cvt_per_day,production_comp,release_year,genres,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category\r
0,385504,307127.606,1,2013,"[Action, Thriller, Drama]",69614,15000000,42930462,6.5,112.301,51,4,3,1.71
1,300175,270338.426,1,2013,"[Comedy, Crime, Thriller]",46705,15000000,3301046,6.5,94.983,41,0,5,3.25
2,361899,256165.867,0,2012,"[Crime, Drama]",197596,26000000,37397291,7.3,115.764,58,4,5,2.647
3,308314,196622.721,1,2008,"[Thriller, Drama, War, Documentary, Mystery, Action]",356339,15000000,15700000,7.6,130.704,94,3,5,1.667
4,307201,159841.652,1,2013,"[Crime, Thriller, Mystery, Documentary]",46720,27220000,8551228,6.4,105.546,37,4,5,3.067


In [None]:
# One-hot encode the genre lists: one 0/1 column per genre found in the data.
mlb = MultiLabelBinarizer()

# pop() removes the list column from the frame and hands it to the binarizer.
genre_lists = movie_data_2.pop('genres')
genre_flags = pd.DataFrame(
    mlb.fit_transform(genre_lists),
    columns=mlb.classes_,
    index=movie_data_2.index,  # same index so the join lines up row by row
)

movie_data_2 = movie_data_2.join(genre_flags)
movie_data_2.head()

Unnamed: 0,video_id,cvt_per_day,production_comp,release_year,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category\r,Action,Adult,Adventure,Animation,Anime,Comedy,Crime,Documentary,Drama,Fantasy,Foreign/International,Holiday,Horror,Independent,Kids & Family,LGBT,Lifestyle,Music,Musicals,Mystery,Reality,Romance,Sci-Fi,Sport,Thriller,War,Western
0,385504,307127.606,1,2013,69614,15000000,42930462,6.5,112.301,51,4,3,1.71,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,300175,270338.426,1,2013,46705,15000000,3301046,6.5,94.983,41,0,5,3.25,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,361899,256165.867,0,2012,197596,26000000,37397291,7.3,115.764,58,4,5,2.647,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,308314,196622.721,1,2008,356339,15000000,15700000,7.6,130.704,94,3,5,1.667,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0
4,307201,159841.652,1,2013,46720,27220000,8551228,6.4,105.546,37,4,5,3.067,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0


## Clean Up Missing Values in Numerical Features

In [None]:
# In these columns a 0 is treated as "value not available", so convert it to NaN
# and let the missing-value counts below reflect that.
zero_means_missing = ['budget', 'boxoffice', 'metacritic_score', 'star_category\r', 'imdb_votes', 'imdb_rating']
movie_data_2[zero_means_missing] = movie_data_2[zero_means_missing].replace(0, np.nan)
print(movie_data_2.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4226 entries, 0 to 4225
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   video_id               4226 non-null   int64  
 1   cvt_per_day            4226 non-null   float64
 2   production_comp        4226 non-null   int64  
 3   release_year           4226 non-null   int64  
 4   imdb_votes             3882 non-null   float64
 5   budget                 1772 non-null   float64
 6   boxoffice              1032 non-null   float64
 7   imdb_rating            3882 non-null   float64
 8   duration_in_mins       4226 non-null   float64
 9   metacritic_score       1214 non-null   float64
 10  awards                 4226 non-null   int64  
 11  mpaa                   4226 non-null   int64  
 12  star_category         2380 non-null   float64
 13  Action                 4226 non-null   int64  
 14  Adult                  4226 non-null   int64  
 15  Adve

In [None]:
# Share of missing values per column (the mean of a boolean mask is the fraction of True).
print(movie_data_2.isna().mean())

video_id                0.000
cvt_per_day             0.000
production_comp         0.000
release_year            0.000
imdb_votes              0.081
budget                  0.581
boxoffice               0.756
imdb_rating             0.081
duration_in_mins        0.000
metacritic_score        0.713
awards                  0.000
mpaa                    0.000
star_category\r         0.437
Action                  0.000
Adult                   0.000
Adventure               0.000
Animation               0.000
Anime                   0.000
Comedy                  0.000
Crime                   0.000
Documentary             0.000
Drama                   0.000
Fantasy                 0.000
Foreign/International   0.000
Holiday                 0.000
Horror                  0.000
Independent             0.000
Kids & Family           0.000
LGBT                    0.000
Lifestyle               0.000
Music                   0.000
Musicals                0.000
Mystery                 0.000
Reality   

Filling missing data with mean value.



In [None]:
# Fill every missing value with the mean of its OWN column.
# The previous version filled 'star_category\r' with the mean of 'imdb_votes'
# (copy-paste slip), which injected vote counts into the star-category feature.
mean_imputed_columns = ['boxoffice', 'metacritic_score', 'star_category\r', 'imdb_votes', 'imdb_rating', 'budget']

# DataFrame.fillna with a Series of per-column means fills each column by label.
column_means = movie_data_2[mean_imputed_columns].mean()
movie_data_2[mean_imputed_columns] = movie_data_2[mean_imputed_columns].fillna(column_means)
print(movie_data_2.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4226 entries, 0 to 4225
Data columns (total 40 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   video_id               4226 non-null   int64  
 1   cvt_per_day            4226 non-null   float64
 2   production_comp        4226 non-null   int64  
 3   release_year           4226 non-null   int64  
 4   imdb_votes             4226 non-null   float64
 5   budget                 4226 non-null   float64
 6   boxoffice              4226 non-null   float64
 7   imdb_rating            4226 non-null   float64
 8   duration_in_mins       4226 non-null   float64
 9   metacritic_score       4226 non-null   float64
 10  awards                 4226 non-null   int64  
 11  mpaa                   4226 non-null   int64  
 12  star_category         4226 non-null   float64
 13  Action                 4226 non-null   int64  
 14  Adult                  4226 non-null   int64  
 15  Adve

# **4. Pre-Processing for ML**
## Split & Standardize Data

In [None]:
# Split preprocessed data into features and target arrays

In [None]:
# Split the preprocessed data into a training and testing dataset

In [None]:
# Create a StandardScaler instance

In [None]:
# Fit the StandardScaler

In [None]:
# Scale the data

## Feature scaling

The impact of different scaling methods on model performance is small. In the model training and selection that follow, the standard-scaled (sc) data is used.

The two most commonly used scaling methods are normalization (min-max scaling) and standardization (StandardScaler). If there is no specific requirement on the range of the output, we choose StandardScaler.

In [None]:
# Standard scaling
# Continuous features to rescale; the '\r' in the last name matches the column label as loaded.
scale_lst = [
    'budget', 'boxoffice', 'imdb_votes', 'imdb_rating',
    'duration_in_mins', 'metacritic_score', 'star_category\r',
]

new_prime_data_sc = movie_data_2.copy()

# Fit and apply in one step; sc_scale stays available as the fitted scaler.
sc_scale = preprocessing.StandardScaler()
new_prime_data_sc[scale_lst] = sc_scale.fit_transform(new_prime_data_sc[scale_lst])
new_prime_data_sc.head()

Unnamed: 0,video_id,cvt_per_day,production_comp,release_year,imdb_votes,budget,boxoffice,imdb_rating,duration_in_mins,metacritic_score,awards,mpaa,star_category\r,Action,Adult,Adventure,Animation,Anime,Comedy,Crime,Documentary,Drama,Fantasy,Foreign/International,Holiday,Horror,Independent,Kids & Family,LGBT,Lifestyle,Music,Musicals,Mystery,Reality,Romance,Sci-Fi,Sport,Thriller,War,Western
0,385504,307127.606,1,2013,1.984,1.47,4.696,0.542,1.079,-0.63,4,3,-0.881,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,300175,270338.426,1,2013,1.258,1.47,-1.022,0.542,0.257,-1.997,0,5,-0.88,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
2,361899,256165.867,0,2012,6.043,3.108,3.898,1.1,1.243,0.327,4,5,-0.88,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,308314,196622.721,1,2008,11.077,1.47,0.767,1.309,1.952,5.25,3,5,-0.881,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0
4,307201,159841.652,1,2013,1.258,3.29,-0.265,0.472,0.758,-2.544,4,5,-0.88,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0


In [None]:
# MinMax scaling: rescale each listed feature using its observed min and max.
new_movie_data_mm = movie_data_2.copy()
mm_scale = preprocessing.MinMaxScaler().fit(new_movie_data_mm[scale_lst])
new_movie_data_mm[scale_lst] = mm_scale.transform(new_movie_data_mm[scale_lst])

# Robust scaling.
# The scaler must be fitted on the unscaled copy it is going to transform. It was
# previously fitted on the already min-max-scaled frame, so the learned statistics
# belonged to different data than the data being transformed.
new_movie_data_rs = movie_data_2.copy()
rs_scale = preprocessing.RobustScaler().fit(new_movie_data_rs[scale_lst])
new_movie_data_rs[scale_lst] = rs_scale.transform(new_movie_data_rs[scale_lst])

# **5. Create Learning Model**

Use multiple ML models (a Lasso linear regression model and a Random Forest model).

In [None]:
# Define the model
# ML sample on Lasso Linear regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt

# Columns that are not predictors: the row identifier and the regression target.
non_feature_cols = ['video_id', 'cvt_per_day']
target_col = 'cvt_per_day'

# Build the train / test split from the standard-scaled frame. `train`,
# `model_train_x` and `model_train_y` were referenced below and in the random-forest
# cell but never defined, so this cell failed with a NameError.
# The 15% hold-out mirrors the validation fraction already used in this cell.
train, test = train_test_split(new_prime_data_sc, test_size=0.15, random_state=0)
model_train_x = train.drop(non_feature_cols, axis=1)
model_train_y = train[target_col]
model_test_x = test.drop(non_feature_cols, axis=1)
model_test_y = test[target_col]

# Carve a validation set out of the training data to tune alpha on.
lr_train, lr_validate = train_test_split(train, test_size=0.15, random_state=0)

lr_train_x = lr_train.drop(non_feature_cols, axis=1)
lr_validate_x = lr_validate.drop(non_feature_cols, axis=1)
lr_train_y = lr_train[target_col]
lr_validate_y = lr_validate[target_col]

# Scan 150 log-spaced alphas and keep the one with the best validation R^2.
alphas = np.logspace(-0.3, 2.5, num=150)
scores = np.empty_like(alphas)
opt_a = float('-inf')
max_score = float('-inf')
for i, a in enumerate(alphas):
    lasso = Lasso(alpha=a)
    lasso.fit(lr_train_x, lr_train_y)
    scores[i] = lasso.score(lr_validate_x, lr_validate_y)
    if scores[i] > max_score:
        max_score = scores[i]
        opt_a = a
        lasso_save = lasso  # a fresh estimator is built each iteration, so this is the best fit

plt.plot(alphas, scores, color='b', linestyle='dashed', marker='o', markerfacecolor='blue', markersize=6)
plt.xlabel('alpha')
plt.ylabel('score')
plt.grid(True)
plt.title('score vs. alpha')
plt.show()
modell_para = opt_a
# Pass the values to print(); the old `print(...), opt_a, max_score` only built a
# discarded tuple, so the numbers were never shown.
print('The optimaized alpha and score of Lasso linear is: ', opt_a, max_score)

# Combine the validation data and training data (i.e. the full training split),
# use the optimal alpha, and re-train the model.
lasso_f = Lasso(alpha=opt_a)
lasso_f.fit(model_train_x, model_train_y)

NameError: ignored

In [None]:
# use Random Forest Model
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Single source of truth for the grid, so the search and the plot below cannot drift apart.
max_depth = list(range(15, 22))      # 15 .. 21
n_estimators = list(range(55, 66))   # 55 .. 65
param_grid = {'n_estimators': n_estimators, 'max_depth': max_depth}

rf = RandomForestRegressor(random_state=2, max_features='sqrt', n_jobs=-1)
clf = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, refit=True, n_jobs=-1, pre_dispatch='2*n_jobs')
clf.fit(model_train_x, model_train_y)

result = clf.cv_results_
print(result)

# The grid is enumerated with parameter names sorted alphabetically and the last name
# varying fastest, i.e. max_depth is the outer loop and n_estimators the inner one.
# Reshaping to (len(max_depth), len(n_estimators)) therefore gives one row per depth.
scores = clf.cv_results_['mean_test_score'].reshape(len(max_depth), len(n_estimators))
plt.figure(1)
plt.subplot(1, 1, 1)
for i, j in enumerate(max_depth):
    plt.plot(n_estimators, scores[i], '-o', label='max_depths is:' + str(j))
plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
# Axis labels were swapped: n_estimators is on the x axis, the CV score on the y axis.
plt.xlabel('n_estimators')
plt.ylabel('mean_test_score')
plt.show()
print('the best parameter for max_depth is: ' + str(clf.best_params_['max_depth']))
print('the best parameter for n_estimators is :' + str(clf.best_params_['n_estimators']))


In [None]:
# Model Evaluation and Comparison
# Compare the two models by the R^2 each achieved during model selection:
#   - Lasso: best validation score found in the alpha scan (max_score)
#   - Random forest: mean cross-validated score of the best grid point (clf.best_score_)
# The previous list held three undefined names (including a ridge score for a model
# that is never trained) against two labels, so the plot could not be drawn.
# NOTE: these are model-selection scores, not scores on an untouched test set.
lasso_score = max_score
rf_score = clf.best_score_

lst_score = [lasso_score, rf_score]
model_lst = ['Lasso_linear', 'Random forest']

plt.figure(1)
plt.plot(model_lst, lst_score, 'ro')
plt.legend(['r-squre / score'])
plt.xlabel('model names', fontsize=16)
plt.ylabel('score / r square', fontsize=16)
plt.grid(True)
plt.show()

In [None]:
# Try use another model, 

In [None]:
# Add hidden layers

In [None]:
# Add the output layer that uses a probability activation function

In [None]:
# Check the structure of the Sequential model

In [None]:
# Compile the Sequential model together and customize metrics


# **6. Train and Test Neural Network**

In [None]:
# Fit / train the model to the training data

In [None]:
# Evaluate model performance using the test data

# **7. Predictions / Conclusion**

# **8. Summary**
