<a href="https://colab.research.google.com/github/hemsmalli5/Final-Project---Week1/blob/master/Project_Master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# ***Movie Data Analysis***

In this project we focus on the following business predictions/questions:

Predict popular movie ratings and/or genres within a given release period, and explore the intricate relationships between genres based on investment and release year.

Analysis specific to one genre: did the highest-budget action movies deliver better revenue?



In [1]:
# Import dependencies

import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# **Import Data**

### The Import Data and Merge Data sections will need to take place in SQL; a single merged table will then be pulled into the Jupyter notebook / Colab file for further editing

In [5]:
# Read the alternate-titles (AKAs) table from its tab-separated file.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. The stderr tail left under this cell looks like the
# end of pandas' mixed-type DtypeWarning, whose message recommends this setting.
# Trade-off: the file is parsed in a single pass, which needs more memory.
movie_akas = pd.read_csv('Resources/title.akas.tsv', sep='\t', low_memory=False)
movie_akas.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [2]:
# Load the title basics table (tab-delimited) and preview its first rows
movie_basics = pd.read_csv('Resources/title.basics.tsv', delimiter='\t')
movie_basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


In [3]:
# Load the title ratings table (tab-delimited) and preview its first rows
movie_ratings = pd.read_csv('Resources/title.ratings.tsv', delimiter='\t')
movie_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.6,1648
1,tt0000002,6.1,198
2,tt0000003,6.5,1352
3,tt0000004,6.2,120
4,tt0000005,6.2,2139


In [4]:
# Load the title crew table (tab-delimited) and preview its first rows
movie_crew = pd.read_csv('Resources/title.crew.tsv', delimiter='\t')
movie_crew.head()

Unnamed: 0,tconst,directors,writers
0,tt0000001,nm0005690,\N
1,tt0000002,nm0721526,\N
2,tt0000003,nm0721526,\N
3,tt0000004,nm0721526,\N
4,tt0000005,nm0005690,\N


### Due to the format of the akas file, all rows that do not have US as the region and all movies that have duplicate rows will need to be dropped in SQL before it can be merged with the other datasets. All other ETL steps can take place using Python in the Jupyter notebook / Colab file

In [6]:
# Work on an independent (deep) copy so the frame loaded from disk stays untouched
movie_akas_2 = movie_akas.copy(deep=True)

In [7]:
# Keep only the rows whose region is US
movie_akas_2 = movie_akas_2.loc[movie_akas_2['region'] == 'US']

# Drop all rows where types is alternative
movie_akas_2 = movie_akas_2[movie_akas_2.types != 'alternative']

# Keep a single row per title (the first one encountered). This table's
# identifier column is `titleId`; it has no `tconst` column, so de-duplicating
# on `tconst` raises a KeyError.
movie_akas_2 = movie_akas_2.drop_duplicates(subset=['titleId'], keep='first')
movie_akas_2

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,\N,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
40,tt0000005,6,Blacksmithing,US,\N,\N,informal alternative title,0
46,tt0000006,3,Chinese Opium Den,US,\N,\N,\N,0
50,tt0000007,1,Corbett and Courtney Before the Kinetograph,US,\N,\N,\N,0
...,...,...,...,...,...,...,...,...
23666028,tt9916702,1,Loving London: The Playground,US,\N,\N,\N,0
23666065,tt9916720,10,The Demonic Nun,US,\N,tv,\N,0
23666081,tt9916734,1,Manca: Peleo,US,\N,\N,\N,0
23666083,tt9916756,1,Pretty Pretty Black Girl,US,\N,\N,\N,0


# **Merge Datasets**

In [None]:
# Merge the four datasets into one dataframe, keyed on the title id.
# The AKAs table names its key `titleId` while the other three tables use
# `tconst`, so align the name first (rename leaves the frame unchanged when
# there is no `titleId` column).
movie_data = movie_akas_2.rename(columns={"titleId": "tconst"})

# pd.merge defaults to an inner join: only titles present in every table survive.
for other_table in (movie_basics, movie_crew, movie_ratings):
    movie_data = pd.merge(movie_data, other_table, on="tconst")

movie_data.head()

# **1. Import SQL Table**

In [None]:
# Will need to establish SQL Database connection and read in dataset from DB
# pd.read_sql_table('table_name', 'postgresql:///db_name')

# **2. Clean and Prep Data**

In [None]:
# Summarize the merged frame (columns, non-null counts, dtypes) to see which
# features are numerical
movie_data.info()

In [None]:
# Clean a separate (deep) copy so the merged frame is preserved as-is
movie_data_2 = movie_data.copy(deep=True)

In [None]:
# Inspect title types: count the rows per titleType value
movie_data_2['titleType'].value_counts()

In [None]:
# Keep only rows whose titleType is 'movie'; every other title type is discarded
movie_data_2 = movie_data_2[movie_data_2['titleType'].eq('movie')]

In [None]:
# Exclude adult films by keeping only the rows where isAdult equals 0
movie_data_2 = movie_data_2[movie_data_2['isAdult'].eq(0)]

In [None]:
# Inspect start years: count the rows per startYear value (before any cleaning)
movie_data_2['startYear'].value_counts()

In [None]:
# Substitute the placeholder year '1700' for the "\N" marker.
# NOTE(review): replace() is called on the whole frame, so the marker is
# rewritten in every column that contains it, not only in startYear.
movie_data_2 = movie_data_2.replace(to_replace=r'\\N', value='1700', regex=True)

# Cast the year column to integers so it can be compared numerically
movie_data_2['startYear'] = movie_data_2['startYear'].astype(int)

# Keep only titles whose start year is later than 1950
# (rows carrying the 1700 placeholder year are removed by the same filter)
movie_data_2 = movie_data_2[movie_data_2['startYear'].gt(1950)]

# Look at the distribution of the remaining start years
movie_data_2['startYear'].value_counts()

In [None]:
# Turn the '1700' placeholder into the text 'NaN'.
# The pattern is anchored (^...$) so that only cells which are exactly '1700'
# change; an unanchored pattern also rewrites any longer string that merely
# contains "1700" (an id such as 'nm0170012' would become 'nm0NaN12').
# Note: the replacement is the string 'NaN', not a real missing value.
movie_data_2 = movie_data_2.replace(r'^1700$', 'NaN', regex=True)

In [None]:
# Remove the columns that are not used in the rest of the notebook
movie_data_2 = movie_data_2.drop(
    ['tconst', 'titleType', 'originalTitle', 'isAdult', 'endYear',
     'writers', 'ordering', 'language', 'attributes', 'isOriginalTitle'],
    axis='columns',
)
movie_data_2.head()

In [None]:
# Reset Index: the row filters above left gaps in the index, so renumber it;
# drop=True discards the old index instead of keeping it as a column
movie_data_2 = movie_data_2.reset_index(drop=True)

# TODO: decide whether the movie title should become the index

In [None]:
# Give the remaining columns presentation-friendly names
movie_data_2 = movie_data_2.rename(
    {
        "primaryTitle": "Title",
        "startYear": "Year",
        "runtimeMinutes": "Runtime(Min)",
        "genres": "Genres",
        "directors": "Directors",
        "averageRating": "AverageRating",
        "numVotes": "NumVotes",
    },
    axis="columns",
)
movie_data_2

In [None]:
# Intentionally empty: an undefined name in this cell raises NameError and
# stops "Restart & Run All" before the encoding steps below.

## Split Genres & Director Columns

Some movies belong to more than one genre. Splitting each genre into its own column will help the ML model.

In [None]:
#Split genres and directors and writers

In [None]:
# Inspect genres: count the rows per raw (comma-separated) Genres string
movie_data_2['Genres'].value_counts()

In [None]:
# Inspect runtimes: count the rows per Runtime(Min) value
movie_data_2['Runtime(Min)'].value_counts()

In [None]:
# Drop all NaN runtime

In [None]:
# Break each comma-separated Genres string into a list of genre names
movie_data_2["Genres"] = movie_data_2["Genres"].map(lambda genre_text: genre_text.split(','))
movie_data_2.head()

In [None]:
# One-hot encode the genre lists with MultiLabelBinarizer: one 0/1 column per genre
mlb = MultiLabelBinarizer()

# pop() removes the Genres column from the frame and hands back its lists
genre_lists = movie_data_2.pop('Genres')
genre_flags = pd.DataFrame(
    mlb.fit_transform(genre_lists),
    columns=mlb.classes_,
    index=movie_data_2.index,
)

# Attach the indicator columns to the remaining columns, aligned on the index
movie_data_2 = movie_data_2.join(genre_flags)
movie_data_2.head()

In [None]:
# Inspect directors: count the rows per raw (comma-separated) Directors string
movie_data_2['Directors'].value_counts()

In [None]:
# Cast the Directors column to the generic object dtype (not int)
movie_data_2.Directors = movie_data_2.Directors.astype(object)

In [None]:
# Break each comma-separated Directors string into a list of director ids
movie_data_2["Directors"] = movie_data_2["Directors"].map(lambda director_text: director_text.split(','))

In [None]:
# Display the frame to check that Directors now holds lists
movie_data_2

In [None]:
# One-hot encode the Directors lists: one indicator column per director id.
# sparse_output=True makes fit_transform return a SciPy sparse matrix, and
# DataFrame.sparse.from_spmatrix keeps it sparse inside pandas, so only the
# non-zero entries are stored. A dense matrix needs rows x distinct-directors
# cells, which is presumably far too large for this dataset.
# TODO(review): confirm the number of distinct directors.
mlb = MultiLabelBinarizer(sparse_output=True)

# pop() removes the Directors column from the frame and hands back its lists
director_lists = movie_data_2.pop('Directors')
director_flags = pd.DataFrame.sparse.from_spmatrix(
    mlb.fit_transform(director_lists),
    index=movie_data_2.index,
    columns=mlb.classes_,
)

# Attach the indicator columns to the remaining columns, aligned on the index
movie_data_2 = movie_data_2.join(director_flags)
movie_data_2.head()

In [None]:
# Display the frame after the Directors column has been encoded
movie_data_2

## Encode Columns

# **3. Pre-Processing for ML**
## Split & Standardize Data

In [None]:
# Drop the title column, as it is not needed for the ML section

In [None]:
# Split preprocessed data into features and target arrays

In [None]:
# Split the preprocessed data into a training and testing dataset

In [None]:
# Create a StandardScaler instance

In [None]:
# Fit the StandardScaler

In [None]:
# Scale the data

# **4. Create Learning Model**

In [None]:
# Try another model

In [None]:
# Add hidden layers

In [None]:
# Add the output layer that uses a probability activation function

In [None]:
# Check the structure of the Sequential model

In [None]:
# Compile the Sequential model together and customize metrics

# **5. Train and Test Neural Network**

In [None]:
# Fit / train the model to the training data

In [None]:
# Evaluate model performance using the test data

# **6. Predictions / Conclusion**

# **7. Summary**
