<a href="https://colab.research.google.com/github/hemsmalli5/Final-Project---Week1/blob/master/Project_Master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# ***Movie Data Analysis***

Through this project we plan to focus on following business predictions/questions:

Predict popular movie ratings and/or genres within certain release period and intricate genres relationships based on investment and release years.

Analysis specific to one genre, predict if highest budget action movies delivered better revenue?



In [1]:
# Import dependencies

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# **1. Import Data**

In [2]:
# read the data file
movie_basics = pd.read_csv('Resources/title.basics.tsv', sep='\t')
movie_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [3]:
# read the data file
movie_ratings = pd.read_csv('Resources/title.ratings.tsv', sep='\t')
movie_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1648
1,tt0000002,6.1,198
2,tt0000003,6.5,1352
3,tt0000004,6.2,120
4,tt0000005,6.2,2139


In [4]:
# read the data file
movie_crew = pd.read_csv('Resources/title.crew.tsv', sep='\t')
movie_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


In [5]:
# Will need to establish SQL Database connection and read in dataset from DB
# pd.read_sql_table('table_name', 'postgres:///db_name')

# **2. Merge Datasets**

In [6]:
# Merge three datasets into one dataframe
movie_data = pd.merge(movie_basics, movie_crew, on=["tconst", "tconst"])
movie_data = pd.merge(movie_data, movie_ratings, on=["tconst", "tconst"])
movie_data.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,directors,writers,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",nm0005690,\N,5.6,1648
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",nm0721526,\N,6.1,198
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",nm0721526,\N,6.5,1352
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short",nm0721526,\N,6.2,120
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",nm0005690,\N,6.2,2139


# **3. Explore data**

## Understand Numerical Features

In [7]:
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1079292 entries, 0 to 1079291
Data columns (total 13 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   tconst          1079292 non-null  object 
 1   titleType       1079292 non-null  object 
 2   primaryTitle    1079291 non-null  object 
 3   originalTitle   1079291 non-null  object 
 4   isAdult         1079292 non-null  int64  
 5   startYear       1079292 non-null  object 
 6   endYear         1079292 non-null  object 
 7   runtimeMinutes  1079292 non-null  object 
 8   genres          1079290 non-null  object 
 9   directors       1079292 non-null  object 
 10  writers         1079292 non-null  object 
 11  averageRating   1079292 non-null  float64
 12  numVotes        1079292 non-null  int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 115.3+ MB


# **3. Clean and Prep Data**

In [27]:
#Make copy of df for editing
movie_data_2 = movie_data.copy()

In [28]:
# Inspect title types
movie_data_2['titleType'].value_counts()

tvEpisode       491504
movie           253638
short           125694
tvSeries         71436
video            58124
tvMovie          41867
videoGame        11277
tvMiniSeries     10626
tvSpecial         9220
tvShort           5906
Name: titleType, dtype: int64

In [29]:
# Drop all titleTypes that are not movies from the df
movie_data_2 = (movie_data_2.loc[movie_data_2['titleType'] == 'movie'])

In [30]:
# Drop all adult films from the df
movie_data_2 = (movie_data_2.loc[movie_data_2['isAdult'] == 0])

In [31]:
# Inspect start years
movie_data_2['startYear'].value_counts()

2017    9412
2016    9152
2018    9052
2015    8872
2014    8680
        ... 
1902       3
1901       2
1894       1
2021       1
1897       1
Name: startYear, Length: 127, dtype: int64

In [32]:
# Replace "\N" with date
movie_data_2 = movie_data_2.replace(r'\\N','1700', regex=True)

In [33]:
# Convert column to int
movie_data_2.startYear = movie_data_2.startYear.astype(int)

In [34]:
# Filter year column
movie_data_2 = (movie_data_2.loc[movie_data_2['startYear'] > 1950])

In [35]:
# Inspect start years
movie_data_2['startYear'].value_counts()

2017    9412
2016    9152
2018    9052
2015    8872
2014    8680
        ... 
1954    1174
1953    1157
1951    1136
1952    1077
2021       1
Name: startYear, Length: 71, dtype: int64

In [36]:
# Replace 1700 with Nan
movie_data_2 = movie_data_2.replace(r'1700','NaN', regex=True)

In [50]:
# Inspect writers
movie_data_2['writers'].value_counts()

#Keep or drop writers

KeyError: 'Writers'

In [37]:
# Drop unneeded columns
movie_data_2 = movie_data_2.drop(columns=['tconst', 'titleType', 'originalTitle', 'isAdult','endYear', 'writers'])
movie_data_2.head()

Unnamed: 0,primaryTitle,startYear,runtimeMinutes,genres,directors,averageRating,numVotes
4160,La tierra de los toros,2000,60,,nm0615736,5.4,12
4278,Dama de noche,1993,102,"Drama,Mystery,Romance",nm0529960,6.2,20
4731,Frivolinas,2014,80,"Comedy,Musical",nm0136068,5.6,15
7318,Lebbra bianca,1951,100,Drama,nm0871077,5.4,42
9818,El negro que tenía el alma blanca,1951,87,"Drama,Musical",nm0140459,6.8,30


In [39]:
# Reset Index
movie_data_2 = movie_data_2.reset_index(drop=True)

#Should I make the movie title the index?

Unnamed: 0,primaryTitle,startYear,runtimeMinutes,genres,directors,averageRating,numVotes
0,La tierra de los toros,2000,60,,nm0615736,5.4,12
1,Dama de noche,1993,102,"Drama,Mystery,Romance",nm0529960,6.2,20
2,Frivolinas,2014,80,"Comedy,Musical",nm0136068,5.6,15
3,Lebbra bianca,1951,100,Drama,nm0871077,5.4,42
4,El negro que tenía el alma blanca,1951,87,"Drama,Musical",nm0140459,6.8,30
...,...,...,...,...,...,...,...
225663,The Mystery of a Buryat Lama,2018,94,"Biography,Documentary,History",nm3308828,3.6,7
225664,Drømmeland,2019,72,Documentary,nm5684093,6.5,40
225665,Akelarre,2020,90,"Drama,History,Horror",nm1893148,7.3,35
225666,The Secret of China,2019,,"Adventure,History,War",nm0910951,4.1,11


In [41]:
# Rename columns
movie_data_2 = movie_data_2.rename(columns = {"primaryTitle":"Title", "startYear":"Year", "runtimeMinutes":"Runtime(Min)", "genres":"Genres", "directors":"Directors", "averageRating":"AverageRating", "numVotes":"NumVotes"})
movie_data_2.head()

## Split Genres & Director Columns

Some videos belong to more than one genere. Splitting each genre into a new column will help the ML model.

In [None]:
#Split genres and directors and writers

In [43]:
# Inspect genres
movie_data_2['Genres'].value_counts()

Drama                           39317
Documentary                     23398
Comedy                          19961
NaN                              8607
Comedy,Drama                     7665
                                ...  
Drama,Sci-Fi,Western                1
Comedy,Drama,News                   1
Adventure,History,Thriller          1
Adventure,Documentary,Sci-Fi        1
Drama,Film-Noir,Music               1
Name: Genres, Length: 1135, dtype: int64

In [45]:
# Convert genres column from string into a list
movie_data_2["Genres"] = movie_data_2.Genres.apply(lambda x: x.split(','))
movie_data_2.head()

Unnamed: 0,Title,Year,Runtime(Min),Genres,Directors,AverageRating,NumVotes
4160,La tierra de los toros,2000,60,[NaN],nm0615736,5.4,12
4278,Dama de noche,1993,102,"[Drama, Mystery, Romance]",nm0529960,6.2,20
4731,Frivolinas,2014,80,"[Comedy, Musical]",nm0136068,5.6,15
7318,Lebbra bianca,1951,100,[Drama],nm0871077,5.4,42
9818,El negro que tenía el alma blanca,1951,87,"[Drama, Musical]",nm0140459,6.8,30


In [46]:
# Use MultiLabelBinarizer() to break up and encode genre list 
mlb = MultiLabelBinarizer()
movie_data_2 = movie_data_2.join(pd.DataFrame(mlb.fit_transform(movie_data_2.pop('Genres')), columns=mlb.classes_, index=movie_data_2.index))
movie_data_2.head()

Unnamed: 0,Title,Year,Runtime(Min),Directors,AverageRating,NumVotes,Action,Adult,Adventure,Animation,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
4160,La tierra de los toros,2000,60,nm0615736,5.4,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4278,Dama de noche,1993,102,nm0529960,6.2,20,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4731,Frivolinas,2014,80,nm0136068,5.6,15,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7318,Lebbra bianca,1951,100,nm0871077,5.4,42,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9818,El negro que tenía el alma blanca,1951,87,nm0140459,6.8,30,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Inspect directors
movie_data_2['Directors'].value_counts()

NaN                                        2250
nm0001238                                   139
nm0781261                                   120
nm0947998                                   106
nm0554924                                   102
                                           ... 
nm0456978,nm0621183                           1
nm4161605                                     1
nm0949726,nm2089848,nm2060322,nm1622696       1
nm6458459                                     1
nm0665327                                     1
Name: Directors, Length: 100809, dtype: int64

## Encode Columns

## Clean Up Categorical Features

In [None]:
movie_data_2[['budget','boxoffice','metacritic_score','star_category\r','imdb_votes','imdb_rating']] = movie_data_2[['budget','boxoffice','metacritic_score','star_category\r','imdb_votes','imdb_rating']].replace(0, np.nan)
print(movie_data_2.info())

In [None]:
print(movie_data_2.isnull().sum()/movie_data_2.shape[0])

Filling missing data with mean value.



In [None]:
movie_data_2['boxoffice'] = movie_data_2['boxoffice'].fillna(movie_data_2['boxoffice'].mean())
movie_data_2['metacritic_score']=movie_data_2['metacritic_score'].fillna(movie_data_2['metacritic_score'].mean())
movie_data_2['star_category\r'] = movie_data_2['star_category\r'].fillna(movie_data_2['imdb_votes'].mean())
movie_data_2['imdb_votes'] = movie_data_2['imdb_votes'].fillna(movie_data_2['imdb_votes'].mean())
movie_data_2['imdb_rating']=movie_data_2['imdb_rating'].fillna(movie_data_2['imdb_rating'].mean())
movie_data_2['budget']= movie_data_2['budget'].fillna(movie_data_2['budget'].mean())
print(movie_data_2.info())

# **4. Pre-Processing for ML**
## Split & Standardize Data

In [None]:
# Split preprocessed data into features and target arrays

In [None]:
# Split the preprocessed data into a training and testing dataset

In [None]:
# Create a StandardScaler instance

In [None]:
# Fit the StandardScaler

In [None]:
# Scale the data

## Feature scaling

The impact of differnet scalling methods on the model performance is small. In the following model training and selections, the standard scalling sc data is used.

There are two most common used scaling method: normalization and standardscaler. If there are no specific requirement for the range of output, we choose to use standardscaler.

In [None]:
#Standard scaling
scale_lst= ['budget','boxoffice', 'imdb_votes','imdb_rating','duration_in_mins','metacritic_score','star_category\r']

new_prime_data_sc = movie_data_2.copy()

sc_scale = preprocessing.StandardScaler().fit(new_prime_data_sc[scale_lst])
new_prime_data_sc[scale_lst] = sc_scale.transform(new_prime_data_sc[scale_lst])
new_prime_data_sc.head()

In [None]:
#MinMax scaling 
new_movie_data_mm = movie_data_2.copy()
mm_scale = preprocessing.MinMaxScaler().fit(new_movie_data_mm[scale_lst])
new_movie_data_mm[scale_lst] = mm_scale.transform(new_movie_data_mm[scale_lst])

#Robust scaling 
new_movie_data_rs = movie_data_2.copy()
rs_scale = preprocessing.RobustScaler().fit(new_movie_data_mm[scale_lst])
new_movie_data_rs[scale_lst] = rs_scale.transform(new_movie_data_rs[scale_lst])

# **5. Create Learning Model**

use multiple ML model (Lasso Linear regressino model and Random Forest Model) 

In [None]:
# Define the model
# ML sample on Lasso Linear regression 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, r2_score
from math import sqrt 

lr_train, lr_validate = train_test_split(train, test_size = 0.15, random_state = 0)

lr_train_x = lr_train.drop(['video_id', 'cvt_per_day'], axis =1)
lr_validate_x = lr_validate.drop(['video_id','cvt_per_day'], axis=1)
lr_train_y = lr_train['cvt_per_day']
lr_validate_y = lr_validate['cvt_per_day']

alphas = np.logspace(-0.3, 2.5, num=150)
#alphas = [0.00000001]
scores = np.empty_like(alphas)
opt_a = float('-inf')
max_score = float('-inf')
for i, a in enumerate(alphas):
    lasso = Lasso()
    lasso.set_params(alpha = a)
    lasso.fit(lr_train_x, lr_train_y)
    scores[i] = lasso.score(lr_validate_x, lr_validate_y)
    if scores[i]> max_score:
        max_score = scores[i]
        opt_a = a
        lasso_save = lasso
plt.plot(alphas, scores, color = 'b', linestyle= 'dashed', marker='o', markerfacecolor = 'blue', markersize = 6)
plt.xlabel('alpha')
plt.ylabel('score')
plt.grid(True)
plt.title('score vs. alpha')
plt.show()
modell_para = opt_a
print('The optimaized alpha and score of Lasso linear is: '), opt_a, max_score

#combine the validate data and training data, use the optimal alpha, re-train the model 
lasso_f = Lasso()
lasso_f.set_params(alpha = opt_a)
lasso_f.fit(model_train_x, model_train_y)

In [None]:
# use Random Forest Model 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
rf=RandomForestRegressor(random_state=2,max_features='sqrt',n_jobs=-1)
param_grid={'n_estimators':[55,56,57,58,59,60,61,62,63,64,65],'max_depth':[15,16,17,18,19,20,21]}
clf=GridSearchCV(estimator=rf,param_grid=param_grid,cv=5,refit=True,n_jobs=-1,pre_dispatch='2*n_jobs')
clf.fit(model_train_x,model_train_y)

result= clf.cv_results_
print(result)
max_depth=[15,16,17,18,19,20,21]
n_estimators = [55,56,57,58,59,60,61,62,63,64,65]
scores = clf.cv_results_['mean_test_score'].reshape(len(max_depth), len(n_estimators))
plt.figure(1)
plt.subplot(1,1,1)
for i,j in enumerate(max_depth):
    plt.plot(n_estimators, scores[i], '-o', label= 'max_depths is:'+str(j))
plt.legend(bbox_to_anchor = (1.05, 1), loc='upper left', borderaxespad= 0.)
plt.xlabel('mean_test_score')
plt.ylabel('n_estimators')
plt.show()
print('the best parameter for max_depth is: '+str(clf.best_params_['max_depth']))
print('the best parameter for n_estimators is :' +str(clf.best_params_['n_estimators']))


In [None]:
# Model Evaluation and Comparison 
lst_score = [lasso_score, ridge_score, rf_score]
model_lst = ['Lasso_linear',  'Random forest']

plt.figure(1)
plt.plot(model_lst, lst_score, 'ro')
plt.legend(['r-squre / score'])
plt.xlabel('model names',fontsize =16)
plt.ylabel('score / r square', fontsize =16)
plt.grid(True)
plt.show()

In [None]:
# Try use another model, 

In [None]:
# Add hidden layers

In [None]:
# Add the output layer that uses a probability activation function

In [None]:
# Check the structure of the Sequential model

In [None]:
# Compile the Sequential model together and customize metrics

# **6. Train and Test Neural Network**

In [None]:
# Fit / train the model to the training data

In [None]:
# Evaluate model performance using the test data

# **7. Precitions / Conclusion**

# **8. Summary**
