<a href="https://colab.research.google.com/github/hemsmalli5/Final-Project---Week1/blob/master/Project_Master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# ***Movie Data Analysis***

Through this project we plan to focus on following business predictions/questions:

Predict popular movie ratings and/or genres within certain release period and intricate genres relationships based on investment and release years.

Analysis specific to one genre, predict if highest budget action movies delivered better revenue?



In [1]:
# Import dependencies

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# **Import Data**

### The Import Data and Merge Data sections will need to take place in SQL and then a single merged table will be pulled into the jupyter notebook / CoLab file for further editing

In [2]:
# read the data file
movie_akas = pd.read_csv('Resources/title.akas.tsv', sep='\t')
movie_akas.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,tconst,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [3]:
# read the data file
movie_basics = pd.read_csv('Resources/title.basics.tsv', sep='\t')
movie_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [4]:
# read the data file
movie_ratings = pd.read_csv('Resources/title.ratings.tsv', sep='\t')
movie_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1648
1,tt0000002,6.1,198
2,tt0000003,6.5,1352
3,tt0000004,6.2,120
4,tt0000005,6.2,2139


In [5]:
# read the data file
movie_crew = pd.read_csv('Resources/title.crew.tsv', sep='\t')
movie_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


### Due to the format of the akas file, all rows that do not have US as the region and all movies that have duplicate rows will need to be dropped in SQL before it can be merged with the other datasets. All other ETL steps can take place using Python in the Jupter notebook / CoLab file

### -OR- 

### Save movie_akas_2 as tsv file and load that into SQL

In [6]:
#Make copy of df for editing
movie_akas_2 = movie_akas.copy()

In [7]:
# Drop all rows that do not have region as US
movie_akas_2 = (movie_akas_2.loc[movie_akas_2['region'] == 'US'])

# Drop all rows where types is alternative
movie_akas_2 = movie_akas_2[movie_akas_2.types != 'alternative']

# Drop duplicate rows if there is more then one row per movie (keep first row)
movie_akas_2 = movie_akas_2.drop_duplicates(subset=['tconst'], keep='first')
movie_akas_2

Unnamed: 0,tconst,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,\N,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
40,tt0000005,6,Blacksmithing,US,\N,\N,informal alternative title,0
46,tt0000006,3,Chinese Opium Den,US,\N,\N,\N,0
50,tt0000007,1,Corbett and Courtney Before the Kinetograph,US,\N,\N,\N,0
...,...,...,...,...,...,...,...,...
17173152,tt5678950,1,Not Alone: The Life Above,US,\N,\N,\N,0
17173265,tt5678986,1,Secrets and Sins,US,\N,\N,\N,0
17173266,tt5678990,1,A Child's Smile,US,\N,\N,\N,0
17173267,tt5678994,1,Watch What You Ask For,US,\N,\N,\N,0


# **Merge Datasets**

In [8]:
# Merge four datasets into one dataframe
movie_data = pd.merge(movie_akas_2, movie_basics, on=["tconst", "tconst"])
movie_data = pd.merge(movie_data, movie_crew, on=["tconst", "tconst"])
movie_data = pd.merge(movie_data, movie_ratings, on=["tconst", "tconst"])

movie_data.head()

Unnamed: 0,tconst,ordering,title,region,language,types,attributes,isOriginalTitle,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,directors,writers,averageRating,numVotes
0,tt0000001,6,Carmencita,US,\N,\N,\N,0,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short",nm0005690,\N,5.6,1648
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short",nm0721526,\N,6.1,198
2,tt0000005,6,Blacksmithing,US,\N,\N,informal alternative title,0,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short",nm0005690,\N,6.2,2139
3,tt0000006,3,Chinese Opium Den,US,\N,\N,\N,0,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short,nm0005690,\N,5.3,115
4,tt0000007,1,Corbett and Courtney Before the Kinetograph,US,\N,\N,\N,0,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport","nm0374658,nm0005690",\N,5.5,656


# **1. Import SQL Table**

In [9]:
# Will need to establish SQL Database connection and read in dataset from DB
# pd.read_sql_table('table_name', 'postgres:///db_name')

# **2. Clean and Prep Data**

In [10]:
# Understand Numerical Features
movie_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 327075 entries, 0 to 327074
Data columns (total 20 columns):
 #   Column           Non-Null Count   Dtype  
---  ------           --------------   -----  
 0   tconst           327075 non-null  object 
 1   ordering         327075 non-null  int64  
 2   title            327075 non-null  object 
 3   region           327075 non-null  object 
 4   language         327075 non-null  object 
 5   types            327075 non-null  object 
 6   attributes       327075 non-null  object 
 7   isOriginalTitle  327075 non-null  object 
 8   titleType        327075 non-null  object 
 9   primaryTitle     327075 non-null  object 
 10  originalTitle    327075 non-null  object 
 11  isAdult          327075 non-null  int64  
 12  startYear        327075 non-null  object 
 13  endYear          327075 non-null  object 
 14  runtimeMinutes   327075 non-null  object 
 15  genres           327075 non-null  object 
 16  directors        327075 non-null  obje

In [11]:
#Make copy of df for editing
movie_data_2 = movie_data.copy()

In [12]:
# Inspect title types
movie_data_2['titleType'].value_counts()

movie           96914
tvEpisode       89782
short           52993
video           32609
tvSeries        23392
tvMovie         14841
tvSpecial        5714
videoGame        5645
tvShort          2659
tvMiniSeries     2526
Name: titleType, dtype: int64

In [13]:
# Drop all rows with titleTypes that are not movies from the df
movie_data_2 = (movie_data_2.loc[movie_data_2['titleType'] == 'movie'])

In [14]:
# Drop all adult film rows from the df
movie_data_2 = (movie_data_2.loc[movie_data_2['isAdult'] == 0])

In [15]:
# Inspect start years
movie_data_2['startYear'].value_counts()

2014    3562
2013    3397
2015    3395
2012    3286
2011    3067
        ... 
1903       2
1904       1
1897       1
2021       1
1894       1
Name: startYear, Length: 126, dtype: int64

In [16]:
# Replace "\N" with date
movie_data_2 = movie_data_2.replace(r'\\N','1700', regex=True)

# Convert column to int
movie_data_2.startYear = movie_data_2.startYear.astype(int)

# Filter year column
movie_data_2 = (movie_data_2.loc[movie_data_2['startYear'] > 1950])

# Inspect start years
movie_data_2['startYear'].value_counts()

2014    3562
2013    3397
2015    3395
2012    3286
2011    3067
        ... 
1956     462
1954     456
1960     453
1955     447
2021       1
Name: startYear, Length: 71, dtype: int64

In [17]:
# Replace 1700 with Nan
movie_data_2 = movie_data_2.replace(r'1700','NaN', regex=True)

In [18]:
# Drop all rows wih a runtime of NaN
movie_data_2 = (movie_data_2.loc[movie_data_2['runtimeMinutes'] != "NaN"])

In [19]:
# Drop unneeded columns
movie_data_2 = movie_data_2.drop(columns=['title', 'region', 'types', 'tconst', 'titleType', 'originalTitle', 'isAdult','endYear', 'writers','ordering', 'language', 'attributes', 'isOriginalTitle'])
movie_data_2.head()

Unnamed: 0,primaryTitle,startYear,runtimeMinutes,genres,directors,averageRating,numVotes
11757,Made in Germany - Die dramatische Geschichte d...,1957,101,"Biography,Drama",nm0772191,6.5,12
14126,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",nm0003506,6.4,78045
14424,Elephant Fury,1953,100,"Drama,War",nm0682176,4.2,6
14756,Mystery of the Black Jungle,1954,80,"Action,Adventure,Mystery","nm0614634,nm0130397",5.4,34
14818,"Another Time, Another Place",1983,118,"Drama,War",nm0705535,6.5,256


In [20]:
# Reset Index
movie_data_2 = movie_data_2.reset_index(drop=True)

In [21]:
# Rename columns
movie_data_2 = movie_data_2.rename(columns = {"primaryTitle":"Title", "startYear":"Year", "runtimeMinutes":"Runtime(Min)", "genres":"Genres", "directors":"Directors", "averageRating":"AverageRating", "numVotes":"NumVotes"})
movie_data_2

Unnamed: 0,Title,Year,Runtime(Min),Genres,Directors,AverageRating,NumVotes
0,Made in Germany - Die dramatische Geschichte d...,1957,101,"Biography,Drama",nm0772191,6.5,12
1,Kate & Leopold,2001,118,"Comedy,Fantasy,Romance",nm0003506,6.4,78045
2,Elephant Fury,1953,100,"Drama,War",nm0682176,4.2,6
3,Mystery of the Black Jungle,1954,80,"Action,Adventure,Mystery","nm0614634,nm0130397",5.4,34
4,"Another Time, Another Place",1983,118,"Drama,War",nm0705535,6.5,256
...,...,...,...,...,...,...,...
74280,The Long Way Home: Making the Martian,2016,80,Documentary,nm1361273,7.5,13
74281,About Paul,2016,63,"Biography,Documentary",nm2648766,6.1,11
74282,Regionrat,2019,99,"Comedy,Drama",nm0721494,6.3,63
74283,"Jerico, the Infinite Flight of Days",2016,77,Documentary,nm7058489,7.9,123


## Split Genres & Director Columns

Some videos belong to more than one genere. Splitting each genre into a new column will help the ML model.

In [22]:
# Convert genres column from string into a list
movie_data_2["Genres"] = movie_data_2.Genres.apply(lambda x: x.split(','))
movie_data_2.head()

Unnamed: 0,Title,Year,Runtime(Min),Genres,Directors,AverageRating,NumVotes
0,Made in Germany - Die dramatische Geschichte d...,1957,101,"[Biography, Drama]",nm0772191,6.5,12
1,Kate & Leopold,2001,118,"[Comedy, Fantasy, Romance]",nm0003506,6.4,78045
2,Elephant Fury,1953,100,"[Drama, War]",nm0682176,4.2,6
3,Mystery of the Black Jungle,1954,80,"[Action, Adventure, Mystery]","nm0614634,nm0130397",5.4,34
4,"Another Time, Another Place",1983,118,"[Drama, War]",nm0705535,6.5,256


In [23]:
# Use MultiLabelBinarizer() to break up and encode genre list 
mlb = MultiLabelBinarizer()
movie_data_2 = movie_data_2.join(pd.DataFrame(mlb.fit_transform(movie_data_2.pop('Genres')), columns=mlb.classes_, index=movie_data_2.index))
movie_data_2.head()

Unnamed: 0,Title,Year,Runtime(Min),Directors,AverageRating,NumVotes,Action,Adult,Adventure,Animation,...,News,Reality-TV,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
0,Made in Germany - Die dramatische Geschichte d...,1957,101,nm0772191,6.5,12,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Kate & Leopold,2001,118,nm0003506,6.4,78045,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,Elephant Fury,1953,100,nm0682176,4.2,6,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,Mystery of the Black Jungle,1954,80,"nm0614634,nm0130397",5.4,34,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,"Another Time, Another Place",1983,118,nm0705535,6.5,256,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [24]:
# Inspect directors
movie_data_2['Directors'].value_counts()

NaN          395
nm0001238     79
nm0676248     56
nm0213983     53
nm0151653     48
            ... 
nm1070287      1
nm3264792      1
nm0182493      1
nm1072805      1
nm0907788      1
Name: Directors, Length: 40512, dtype: int64

In [25]:
# Convert column to int
movie_data_2.Directors = movie_data_2.Directors.astype(object)

In [26]:
# Convert Directors column from string into a list
movie_data_2["Directors"] = movie_data_2.Directors.apply(lambda x: x.split(','))

In [27]:
# Use MultiLabelBinarizer() to break up and encode Directors list 
mlb = MultiLabelBinarizer()
movie_data_2 = movie_data_2.join(pd.DataFrame(mlb.fit_transform(movie_data_2.pop('Directors')), columns=mlb.classes_, index=movie_data_2.index))
movie_data_2.head()

ValueError: columns overlap but no suffix specified: Index(['NaN'], dtype='object')

In [None]:
movie_data_2

In [None]:
ccc

## Encode Columns

# **3. Pre-Processing for ML**
## Split & Standardize Data

In [None]:
# Drop title coumn as it is not needed for the ML secton

In [None]:
# Split preprocessed data into features and target arrays

In [None]:
# Split the preprocessed data into a training and testing dataset

In [None]:
# Create a StandardScaler instance

In [None]:
# Fit the StandardScaler

In [None]:
# Scale the data

# **4. Create Learning Model**

In [None]:
# Try use another model, 

In [None]:
# Add hidden layers

In [None]:
# Add the output layer that uses a probability activation function

In [None]:
# Check the structure of the Sequential model

In [None]:
# Compile the Sequential model together and customize metrics

# **5. Train and Test Neural Network**

In [None]:
# Fit / train the model to the training data

In [None]:
# Evaluate model performance using the test data

# **6. Precitions / Conclusion**

# **7. Summary**
