<a href="https://colab.research.google.com/github/hemsmalli5/Final-Project---Week1/blob/master/Project_Master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# ***Movie Data Analysis***

Through this project we plan to focus on following business predictions/questions:

Predict popular movie ratings and/or genres within certain release period and intricate genres relationships based on investment and release years.

Analysis specific to one genre, predict if highest budget action movies delivered better revenue?



In [1]:
# Import dependencies

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# **Import Data**

### The Import Data and Merge Data sections will need to take place in SQL and then a single merged table will be pulled into the jupyter notebook / CoLab file for further editing

In [2]:
# read the data file
movie_basics = pd.read_csv('Resources/title.basics.tsv', sep='\t')
movie_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [3]:
# read the data file
movie_ratings = pd.read_csv('Resources/title.ratings.tsv', sep='\t')
movie_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1648
1,tt0000002,6.1,198
2,tt0000003,6.5,1352
3,tt0000004,6.2,120
4,tt0000005,6.2,2139


In [4]:
# read the data file
movie_crew = pd.read_csv('Resources/title.crew.tsv', sep='\t')
movie_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


In [5]:
# read the data file
movie_ratings = pd.read_csv('Resources/title.akas.tsv', sep='\t')
movie_ratings.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


### Due to the format of the akas file, all rows that do not have US as the region will need to be droped in SQL before it can be merged with the other datasets. All other ETL steps can take place using Python in the Jupter notebook / CoLab file

# **Merge Datasets**

In [6]:
# Merge three datasets into one dataframe
movie_data = pd.merge(movie_basics, movie_crew, on=["tconst", "tconst"])
movie_data = pd.merge(movie_data, movie_ratings, on=["tconst", "tconst"])

#Merge 4th dataset

movie_data.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,directors,writers,averageRating,numVotes
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",nm0005690,\N,5.6,1648
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",nm0721526,\N,6.1,198
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance",nm0721526,\N,6.5,1352
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short",nm0721526,\N,6.2,120
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",nm0005690,\N,6.2,2139


# **1. Import SQL Table**

In [5]:
# Will need to establish SQL Database connection and read in dataset from DB
# pd.read_sql_table('table_name', 'postgres:///db_name')

# **2. Clean and Prep Data**

In [7]:
# Understand Numerical Features
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1079292 entries, 0 to 1079291
Data columns (total 13 columns):
 #   Column          Non-Null Count    Dtype  
---  ------          --------------    -----  
 0   tconst          1079292 non-null  object 
 1   titleType       1079292 non-null  object 
 2   primaryTitle    1079291 non-null  object 
 3   originalTitle   1079291 non-null  object 
 4   isAdult         1079292 non-null  int64  
 5   startYear       1079292 non-null  object 
 6   endYear         1079292 non-null  object 
 7   runtimeMinutes  1079292 non-null  object 
 8   genres          1079290 non-null  object 
 9   directors       1079292 non-null  object 
 10  writers         1079292 non-null  object 
 11  averageRating   1079292 non-null  float64
 12  numVotes        1079292 non-null  int64  
dtypes: float64(1), int64(2), object(10)
memory usage: 115.3+ MB


In [94]:
#Make copy of df for editing
movie_data_2 = movie_data.copy()

In [95]:
# Inspect title types
movie_data_2['titleType'].value_counts()

tvEpisode       491504
movie           253638
short           125694
tvSeries         71436
video            58124
tvMovie          41867
videoGame        11277
tvMiniSeries     10626
tvSpecial         9220
tvShort           5906
Name: titleType, dtype: int64

In [96]:
# Drop all titleTypes that are not movies from the df
movie_data_2 = (movie_data_2.loc[movie_data_2['titleType'] == 'movie'])

In [97]:
# Drop all adult films from the df
movie_data_2 = (movie_data_2.loc[movie_data_2['isAdult'] == 0])

In [None]:
# inspect and drop all rows that are not in english


In [98]:
# Inspect start years
movie_data_2['startYear'].value_counts()

2017    9412
2016    9152
2018    9052
2015    8872
2014    8680
        ... 
1902       3
1901       2
1894       1
2021       1
1897       1
Name: startYear, Length: 127, dtype: int64

In [99]:
# Replace "\N" with date
movie_data_2 = movie_data_2.replace(r'\\N','1700', regex=True)

# Convert column to int
movie_data_2.startYear = movie_data_2.startYear.astype(int)

# Filter year column
movie_data_2 = (movie_data_2.loc[movie_data_2['startYear'] > 1950])

# Inspect start years
movie_data_2['startYear'].value_counts()

In [103]:
# Replace 1700 with Nan
movie_data_2 = movie_data_2.replace(r'1700','NaN', regex=True)

In [105]:
# Drop unneeded columns
movie_data_2 = movie_data_2.drop(columns=['tconst', 'titleType', 'originalTitle', 'isAdult','endYear', 'writers'])
movie_data_2.head()

Unnamed: 0,primaryTitle,startYear,runtimeMinutes,genres,directors,averageRating,numVotes
4160,La tierra de los toros,2000,60,,nm0615736,5.4,12
4278,Dama de noche,1993,102,"Drama,Mystery,Romance",nm0529960,6.2,20
4731,Frivolinas,2014,80,"Comedy,Musical",nm0136068,5.6,15
7318,Lebbra bianca,1951,100,Drama,nm0871077,5.4,42
9818,El negro que tenía el alma blanca,1951,87,"Drama,Musical",nm0140459,6.8,30


In [106]:
# Reset Index
movie_data_2 = movie_data_2.reset_index(drop=True)

#Should I make the movie title the index?

In [107]:
# Rename columns
movie_data_2 = movie_data_2.rename(columns = {"primaryTitle":"Title", "startYear":"Year", "runtimeMinutes":"Runtime(Min)", "genres":"Genres", "directors":"Directors", "averageRating":"AverageRating", "numVotes":"NumVotes"})
movie_data_2

Unnamed: 0,Title,Year,Runtime(Min),Genres,Directors,AverageRating,NumVotes
0,La tierra de los toros,2000,60,,nm0615736,5.4,12
1,Dama de noche,1993,102,"Drama,Mystery,Romance",nm0529960,6.2,20
2,Frivolinas,2014,80,"Comedy,Musical",nm0136068,5.6,15
3,Lebbra bianca,1951,100,Drama,nm0871077,5.4,42
4,El negro que tenía el alma blanca,1951,87,"Drama,Musical",nm0140459,6.8,30
...,...,...,...,...,...,...,...
225663,The Mystery of a Buryat Lama,2018,94,"Biography,Documentary,History",nm3308828,3.6,7
225664,Drømmeland,2019,72,Documentary,nm5684093,6.5,40
225665,Akelarre,2020,90,"Drama,History,Horror",nm1893148,7.3,35
225666,The Secret of China,2019,,"Adventure,History,War",nm0910951,4.1,11


In [None]:
jjj

## Split Genres & Director Columns

Some videos belong to more than one genere. Splitting each genre into a new column will help the ML model.

In [108]:
#Split genres and directors and writers

In [109]:
# Inspect genres
movie_data_2['Genres'].value_counts()

Drama                           39317
Documentary                     23398
Comedy                          19961
NaN                              8607
Comedy,Drama                     7665
                                ...  
Drama,Sci-Fi,Western                1
Comedy,Drama,News                   1
Adventure,History,Thriller          1
Adventure,Documentary,Sci-Fi        1
Drama,Film-Noir,Music               1
Name: Genres, Length: 1135, dtype: int64

In [117]:
# Inspect genres
movie_data_2['Runtime(Min)'].value_counts()

NaN    23807
90     14942
95      7001
100     6643
85      6484
       ...  
367        1
724        1
600        1
495        1
354        1
Name: Runtime(Min), Length: 357, dtype: int64

In [None]:
# Drop all NaN runtime

In [110]:
# Convert genres column from string into a list
movie_data_2["Genres"] = movie_data_2.Genres.apply(lambda x: x.split(','))
movie_data_2.head()

Unnamed: 0,Title,Year,Runtime(Min),Genres,Directors,AverageRating,NumVotes
0,La tierra de los toros,2000,60,[NaN],nm0615736,5.4,12
1,Dama de noche,1993,102,"[Drama, Mystery, Romance]",nm0529960,6.2,20
2,Frivolinas,2014,80,"[Comedy, Musical]",nm0136068,5.6,15
3,Lebbra bianca,1951,100,[Drama],nm0871077,5.4,42
4,El negro que tenía el alma blanca,1951,87,"[Drama, Musical]",nm0140459,6.8,30


In [111]:
# Use MultiLabelBinarizer() to break up and encode genre list 
mlb = MultiLabelBinarizer()
movie_data_2 = movie_data_2.join(pd.DataFrame(mlb.fit_transform(movie_data_2.pop('Genres')), columns=mlb.classes_, index=movie_data_2.index))
movie_data_2.head()

Unnamed: 0,Title,Year,Runtime(Min),Directors,AverageRating,NumVotes,Action,Adult,Adventure,Animation,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,La tierra de los toros,2000,60,nm0615736,5.4,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Dama de noche,1993,102,nm0529960,6.2,20,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,Frivolinas,2014,80,nm0136068,5.6,15,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Lebbra bianca,1951,100,nm0871077,5.4,42,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,El negro que tenía el alma blanca,1951,87,nm0140459,6.8,30,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [112]:
# Inspect directors
movie_data_2['Directors'].value_counts()

NaN                                        2250
nm0001238                                   139
nm0781261                                   120
nm0947998                                   106
nm0554924                                   102
                                           ... 
nm0456978,nm0621183                           1
nm4161605                                     1
nm0949726,nm2089848,nm2060322,nm1622696       1
nm6458459                                     1
nm0665327                                     1
Name: Directors, Length: 100809, dtype: int64

In [113]:
# Convert column to int
movie_data_2.Directors = movie_data_2.Directors.astype(object)

In [114]:
# Convert Directors column from string into a list
movie_data_2["Directors"] = movie_data_2.Directors.apply(lambda x: x.split(','))

In [115]:
movie_data_2

Unnamed: 0,Title,Year,Runtime(Min),Directors,AverageRating,NumVotes,Action,Adult,Adventure,Animation,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,La tierra de los toros,2000,60,[nm0615736],5.4,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Dama de noche,1993,102,[nm0529960],6.2,20,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,Frivolinas,2014,80,[nm0136068],5.6,15,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,Lebbra bianca,1951,100,[nm0871077],5.4,42,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,El negro que tenía el alma blanca,1951,87,[nm0140459],6.8,30,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225663,The Mystery of a Buryat Lama,2018,94,[nm3308828],3.6,7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
225664,Drømmeland,2019,72,[nm5684093],6.5,40,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
225665,Akelarre,2020,90,[nm1893148],7.3,35,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
225666,The Secret of China,2019,,[nm0910951],4.1,11,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0


In [116]:
# Use MultiLabelBinarizer() to break up and encode Directors list 
mlb = MultiLabelBinarizer()
movie_data_2 = movie_data_2.join(pd.DataFrame(mlb.fit_transform(movie_data_2.pop('Directors')), columns=mlb.classes_, index=movie_data_2.index))
movie_data_2.head()

ValueError: columns overlap but no suffix specified: Index(['NaN'], dtype='object')

In [None]:
movie_data_2

## Encode Columns

# **3. Pre-Processing for ML**
## Split & Standardize Data

In [None]:
# Drop title coumn as it is not needed for the ML secton

In [None]:
# Split preprocessed data into features and target arrays

In [None]:
# Split the preprocessed data into a training and testing dataset

In [None]:
# Create a StandardScaler instance

In [None]:
# Fit the StandardScaler

In [None]:
# Scale the data

# **4. Create Learning Model**

In [None]:
# Try use another model, 

In [None]:
# Add hidden layers

In [None]:
# Add the output layer that uses a probability activation function

In [None]:
# Check the structure of the Sequential model

In [None]:
# Compile the Sequential model together and customize metrics

# **5. Train and Test Neural Network**

In [None]:
# Fit / train the model to the training data

In [None]:
# Evaluate model performance using the test data

# **6. Precitions / Conclusion**

# **7. Summary**
