In [22]:
import pandas as pd
from sqlalchemy import create_engine


### Extract CSVs into DataFrames

In [23]:
# Read the IMDb movie listing (a CSV expected in the working directory)
# into a DataFrame.
imdb_file = "IMDb movies.csv"
imdb_df = pd.read_csv(filepath_or_buffer=imdb_file)

In [24]:
# Print the IMDb frame's column names, non-null counts and dtypes.
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81273 entries, 0 to 81272
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          81273 non-null  object 
 1   title                  81273 non-null  object 
 2   original_title         81273 non-null  object 
 3   year                   81273 non-null  int64  
 4   date_published         81273 non-null  object 
 5   genre                  81273 non-null  object 
 6   duration               81273 non-null  int64  
 7   country                81234 non-null  object 
 8   language               80518 non-null  object 
 9   director               81200 non-null  object 
 10  writer                 79780 non-null  object 
 11  production_company     76948 non-null  object 
 12  actors                 81207 non-null  object 
 13  description            78843 non-null  object 
 14  avg_vote               81273 non-null  float64
 15  vo

In [25]:
# Read the Netflix catalogue (a CSV expected in the working directory)
# into a DataFrame, then print its column summary.
netflix_file = "netflix_titles.csv"
netflix_df = pd.read_csv(filepath_or_buffer=netflix_file)
netflix_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       6234 non-null   int64 
 1   type          6234 non-null   object
 2   title         6234 non-null   object
 3   director      4265 non-null   object
 4   cast          5664 non-null   object
 5   country       5758 non-null   object
 6   date_added    6223 non-null   object
 7   release_year  6234 non-null   int64 
 8   rating        6224 non-null   object
 9   duration      6234 non-null   object
 10  listed_in     6234 non-null   object
 11  description   6234 non-null   object
dtypes: int64(2), object(10)
memory usage: 584.6+ KB


In [26]:
# Partition the Netflix catalogue by the value of its `type` column:
# rows tagged 'Movie' go to one frame, rows tagged 'TV Show' to the other.
# Each subset's column summary is printed right after it is built.
netflix_movies_df = netflix_df[netflix_df['type'].eq('Movie')]
netflix_movies_df.info()
netflix_TV_df = netflix_df[netflix_df['type'].eq('TV Show')]
netflix_TV_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4265 entries, 0 to 6231
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       4265 non-null   int64 
 1   type          4265 non-null   object
 2   title         4265 non-null   object
 3   director      4137 non-null   object
 4   cast          3905 non-null   object
 5   country       4070 non-null   object
 6   date_added    4264 non-null   object
 7   release_year  4265 non-null   int64 
 8   rating        4257 non-null   object
 9   duration      4265 non-null   object
 10  listed_in     4265 non-null   object
 11  description   4265 non-null   object
dtypes: int64(2), object(10)
memory usage: 433.2+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 1969 entries, 2 to 6233
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       1969 non-null   int64 
 1   type          

In [27]:
# The merge with IMDb will be keyed on title + director + year to avoid
# mismatches, so rows without a director are a concern. Count the null
# `director` values separately for movies and TV shows; the result is the
# reason the rest of the notebook focuses on movies.
movies_without_director = netflix_movies_df["director"].isna().sum()
print(f"There are {movies_without_director} movies with no director information")
tv_without_director = netflix_TV_df["director"].isna().sum()
print(f"There are {tv_without_director} TV Shows with no director information")
 

There are 128 movies with no director information
There are 1841 TV Shows with no director information


In [28]:
# Inner-join the Netflix movies to the IMDb listing on title, director and
# year. The year column is named `release_year` on the Netflix side and
# `year` on the IMDb side, hence the two separate key lists. Any other column
# name present in both frames is disambiguated with `_net` / `_imdb`.
combined_movie_df = netflix_movies_df.merge(
    imdb_df,
    how="inner",
    left_on=["title", "director", "release_year"],
    right_on=["title", "director", "year"],
    suffixes=("_net", "_imdb"),
)
combined_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1715 entries, 0 to 1714
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   show_id                1715 non-null   int64  
 1   type                   1715 non-null   object 
 2   title                  1715 non-null   object 
 3   director               1715 non-null   object 
 4   cast                   1714 non-null   object 
 5   country_net            1711 non-null   object 
 6   date_added             1715 non-null   object 
 7   release_year           1715 non-null   int64  
 8   rating                 1715 non-null   object 
 9   duration_net           1715 non-null   object 
 10  listed_in              1715 non-null   object 
 11  description_net        1715 non-null   object 
 12  imdb_title_id          1715 non-null   object 
 13  original_title         1715 non-null   object 
 14  year                   1715 non-null   int64  
 15  date

In [29]:
# Check for mismatches between the two data sets: a title that occurs more
# than once after the merge may be different movies sharing the same title.
# value_counts() lists each title with its number of occurrences,
# most frequent first.
combined_movie_df["title"].value_counts()


Benji                                           2
One Day                                         2
Zoom                                            2
Blood Money                                     2
Drive                                           2
                                               ..
Annabelle Hooper and the Ghosts of Nantucket    1
Imagine That                                    1
The Tuxedo                                      1
Kon-Tiki                                        1
The Charnel House                               1
Name: title, Length: 1704, dtype: int64

In [30]:
# Raise pandas' row display limit so the listing below prints in full.
# This is a session-wide setting and stays in effect for later cells.
pd.set_option("display.max_rows", 1000)

# Flag every row whose title has already appeared earlier in the frame
# (the first occurrence of each title is not flagged), then report how many
# such rows there are and which titles they carry. The recorded run found 11.
repeated_title_mask = combined_movie_df["title"].duplicated()
duplicateDFRow = combined_movie_df.loc[repeated_title_mask]
print(len(duplicateDFRow))
print(duplicateDFRow["title"])

11
377                   Drive
575            The Outsider
639                  Sarkar
653                   Benji
1053                   Solo
1215    People You May Know
1237            Blood Money
1412                One Day
1419                    Don
1439                   Zoom
1674            The Silence
Name: title, dtype: object


In [31]:
# Keep only the first row for each title and discard later repeats.
# NOTE(review): de-duplication is by title alone. Because the merge was keyed
# on title + director + year, repeated titles may be distinct films rather
# than true duplicates, in which case this drops real data - confirm intent.
combined_movie_df = combined_movie_df.drop_duplicates(subset="title", keep="first")

combined_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1704 entries, 0 to 1714
Data columns (total 32 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   show_id                1704 non-null   int64  
 1   type                   1704 non-null   object 
 2   title                  1704 non-null   object 
 3   director               1704 non-null   object 
 4   cast                   1703 non-null   object 
 5   country_net            1700 non-null   object 
 6   date_added             1704 non-null   object 
 7   release_year           1704 non-null   int64  
 8   rating                 1704 non-null   object 
 9   duration_net           1704 non-null   object 
 10  listed_in              1704 non-null   object 
 11  description_net        1704 non-null   object 
 12  imdb_title_id          1704 non-null   object 
 13  original_title         1704 non-null   object 
 14  year                   1704 non-null   int64  
 15  date

In [32]:
# Keep only the columns listed below; every other column is discarded.
# Per the original author's note, the discarded columns include budget,
# usa_gross_income, worlwide_gross_income and metascore, which are missing
# a lot of information.
combined_movie_df = combined_movie_df.loc[:, [
    # Netflix columns (title and director are the shared merge keys)
    'show_id', 'type', 'title', 'director', 'cast', 'country_net',
    'date_added', 'release_year', 'rating', 'duration_net', 'listed_in',
    'description_net',
    # IMDb columns
    'imdb_title_id', 'original_title', 'year', 'date_published', 'genre',
    'duration_imdb', 'country_imdb', 'language', 'actors',
    'description_imdb', 'avg_vote', 'votes',
]].copy()
combined_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1704 entries, 0 to 1714
Data columns (total 24 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   show_id           1704 non-null   int64  
 1   type              1704 non-null   object 
 2   title             1704 non-null   object 
 3   director          1704 non-null   object 
 4   cast              1703 non-null   object 
 5   country_net       1700 non-null   object 
 6   date_added        1704 non-null   object 
 7   release_year      1704 non-null   int64  
 8   rating            1704 non-null   object 
 9   duration_net      1704 non-null   object 
 10  listed_in         1704 non-null   object 
 11  description_net   1704 non-null   object 
 12  imdb_title_id     1704 non-null   object 
 13  original_title    1704 non-null   object 
 14  year              1704 non-null   int64  
 15  date_published    1704 non-null   object 
 16  genre             1704 non-null   object 


In [33]:
# Remove columns that repeat information available elsewhere in the frame:
#   * cast           - same information as `actors`, but with missing values
#   * duration_net   - same information as `duration_imdb`, which is already
#                      an integer column
#   * year, date_published - same information as `release_year`
# Note that `director` now comes before `title` in the column order.
combined_movie_df = combined_movie_df.loc[:, [
    'show_id', 'type', 'director', 'title', 'date_added', 'country_net',
    'release_year', 'rating', 'listed_in', 'description_net',
    'imdb_title_id', 'original_title', 'genre', 'duration_imdb',
    'country_imdb', 'language', 'actors', 'description_imdb',
    'avg_vote', 'votes',
]].copy()
combined_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1704 entries, 0 to 1714
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   show_id           1704 non-null   int64  
 1   type              1704 non-null   object 
 2   director          1704 non-null   object 
 3   title             1704 non-null   object 
 4   date_added        1704 non-null   object 
 5   country_net       1700 non-null   object 
 6   release_year      1704 non-null   int64  
 7   rating            1704 non-null   object 
 8   listed_in         1704 non-null   object 
 9   description_net   1704 non-null   object 
 10  imdb_title_id     1704 non-null   object 
 11  original_title    1704 non-null   object 
 12  genre             1704 non-null   object 
 13  duration_imdb     1704 non-null   int64  
 14  country_imdb      1703 non-null   object 
 15  language          1693 non-null   object 
 16  actors            1704 non-null   object 


In [34]:
# The two country columns carry the same kind of information and both have
# missing values. To see whether one source can fill the other's gaps, select
# every row whose IMDb country is null and display it in full (transposed,
# because the frame is wide). Reading `country_net` on the displayed rows
# shows whether the Netflix value is missing as well.
#
# The earlier version also built a mask for `country_net`, but overwrote it
# on the next line before using it; that dead assignment has been removed.
bool_series = pd.isnull(combined_movie_df['country_imdb'])
combined_movie_df[bool_series].T

Unnamed: 0,132
show_id,80211622
type,Movie
director,Tinge Krishnan
title,Been So Long
date_added,"October 26, 2018"
country_net,
release_year,2018
rating,TV-MA
listed_in,"Dramas, International Movies, Music & Musicals"
description_net,A single mother in London's Camden Town hears ...


In [35]:
# The row with no IMDb country has no Netflix country either, so the gap
# cannot be filled from the other source: mark it "Not Available".
# The filled column is assigned back explicitly instead of calling
# fillna(inplace=True) on a column selection; that chained in-place form acts
# on a temporary object under pandas Copy-on-Write and would leave the frame
# unchanged.
combined_movie_df["country_imdb"] = combined_movie_df["country_imdb"].fillna("Not Available")

# IMDb has the more complete country data, so `country_net` is dropped by
# leaving it out of the selection below.
combined_movie_df = combined_movie_df[[
    'show_id', 'type', 'title', 'director', 'date_added',
    'release_year', 'rating', 'listed_in', 'description_net',
    'imdb_title_id', 'original_title', 'genre', 'duration_imdb',
    'country_imdb', 'language', 'actors', 'description_imdb',
    'avg_vote', 'votes',
]].copy()
combined_movie_df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1704 entries, 0 to 1714
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   show_id           1704 non-null   int64  
 1   type              1704 non-null   object 
 2   title             1704 non-null   object 
 3   director          1704 non-null   object 
 4   date_added        1704 non-null   object 
 5   release_year      1704 non-null   int64  
 6   rating            1704 non-null   object 
 7   listed_in         1704 non-null   object 
 8   description_net   1704 non-null   object 
 9   imdb_title_id     1704 non-null   object 
 10  original_title    1704 non-null   object 
 11  genre             1704 non-null   object 
 12  duration_imdb     1704 non-null   int64  
 13  country_imdb      1704 non-null   object 
 14  language          1693 non-null   object 
 15  actors            1704 non-null   object 
 16  description_imdb  1704 non-null   object 


In [36]:
# Replace any remaining nulls in `rating` and `language` with "Not Available".
# fillna is given a per-column mapping and its result is re-bound to the
# frame, instead of calling fillna(inplace=True) on a column selection; that
# chained in-place form acts on a temporary object under pandas Copy-on-Write
# and would leave the frame unchanged.
combined_movie_df = combined_movie_df.fillna(
    {"rating": "Not Available", "language": "Not Available"}
)

In [37]:
# Decide whether `original_title` adds anything: build a boolean Series that
# is True wherever it differs from `title`, then sort it so the differing
# rows (True) are grouped at the end of the output.
different_title = combined_movie_df["title"].ne(combined_movie_df["original_title"])
different_title.sort_values()


0       False
1146    False
1145    False
1144    False
1143    False
        ...  
1714    False
1103     True
664      True
642      True
320      True
Length: 1704, dtype: bool

In [44]:
# Manual inspection of the rows flagged above showed that `original_title`
# holds nothing important, so it is dropped.
#   e.g. combined_movie_df.loc[664]   (664 is an index label; the index has
#   gaps after de-duplication, so positional .iloc would pick another row)
# `listed_in` carries the same information as `genre`, so it is dropped too.
combined_movie_df = combined_movie_df.loc[:, [
    'show_id', 'type', 'title', 'director', 'date_added', 'release_year',
    'rating', 'description_net', 'imdb_title_id', 'genre', 'duration_imdb',
    'country_imdb', 'language', 'actors', 'description_imdb',
    'avg_vote', 'votes',
]].copy()
combined_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1704 entries, 0 to 1714
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   show_id           1704 non-null   int64         
 1   type              1704 non-null   object        
 2   title             1704 non-null   object        
 3   director          1704 non-null   object        
 4   date_added        1704 non-null   datetime64[ns]
 5   release_year      1704 non-null   int64         
 6   rating            1704 non-null   object        
 7   description_net   1704 non-null   object        
 8   imdb_title_id     1704 non-null   object        
 9   genre             1704 non-null   object        
 10  duration_imdb     1704 non-null   int64         
 11  country_imdb      1704 non-null   object        
 12  language          1704 non-null   object        
 13  actors            1704 non-null   object        
 14  description_imdb  1704 n

In [45]:
# Parse `date_added` into pandas datetimes so it is stored as a datetime
# column rather than as text, then print the column summary to confirm.
parsed_date_added = pd.to_datetime(combined_movie_df['date_added'])
combined_movie_df = combined_movie_df.assign(date_added=parsed_date_added)
combined_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1704 entries, 0 to 1714
Data columns (total 17 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   show_id           1704 non-null   int64         
 1   type              1704 non-null   object        
 2   title             1704 non-null   object        
 3   director          1704 non-null   object        
 4   date_added        1704 non-null   datetime64[ns]
 5   release_year      1704 non-null   int64         
 6   rating            1704 non-null   object        
 7   description_net   1704 non-null   object        
 8   imdb_title_id     1704 non-null   object        
 9   genre             1704 non-null   object        
 10  duration_imdb     1704 non-null   int64         
 11  country_imdb      1704 non-null   object        
 12  language          1704 non-null   object        
 13  actors            1704 non-null   object        
 14  description_imdb  1704 n

In [52]:
# Reorder the columns so the identifying and most important fields come
# first. `type` is left out of the selection because every remaining row is
# a movie.
combined_movie_df = combined_movie_df[[
    'show_id', 'imdb_title_id', 'title', 'genre', 'duration_imdb',
    'director', 'actors', 'release_year', 'rating',
    'description_net', 'description_imdb',
    'country_imdb', 'language', 'date_added', 'avg_vote', 'votes',
]].copy()

# Preview the first rows explicitly. The cell previously ended with an
# assignment, which displays nothing, so its recorded preview could not be
# reproduced by re-running the cell.
combined_movie_df.head()


Unnamed: 0,show_id,imdb_title_id,title,genre,duration_imdb,director,actors,release_year,rating,description_net,description_imdb,country_imdb,language,date_added,avg_vote,votes
0,81145628,tt9428190,Norm of the North: King Sized Adventure,"Animation, Adventure, Comedy",90,"Richard Finn, Tim Maltby","Jennifer Cameron, Brian Dobson, Michael Dobson...",2019,TV-PG,Before planning an awesome wedding for his gra...,An ancient Chinese artifact has been stolen by...,"USA, India, South Korea, China",English,2019-09-09,3.2,185
1,70304990,tt1361318,Good People,"Action, Crime, Thriller",90,Henrik Ruben Genz,"Sam Spruell, Michael Fox, Diarmaid Murtagh, Fr...",2014,R,A struggling couple can't believe their luck w...,Discovering a stash of cash in their dead tena...,"USA, UK, Denmark, Sweden",English,2017-09-08,5.5,14655
2,70299204,tt2917388,Kidnapping Mr. Heineken,"Action, Crime, Drama",95,Daniel Alfredson,"Jim Sturgess, Sam Worthington, Ryan Kwanten, A...",2015,R,"When beer magnate Alfred ""Freddy"" Heineken is ...","The inside story of the planning, execution, r...","Netherlands, Belgium, UK, USA","English, Dutch, German",2017-09-08,6.1,18168
3,80057969,tt3774694,Love,"Drama, Romance",135,Gaspar Noé,"Aomi Muyock, Karl Glusman, Klara Kristin, Ugo ...",2015,NR,A man in an unsatisfying marriage recalls the ...,Murphy is an American living in Paris who ente...,"France, Belgium","English, French",2017-09-08,6.1,39385
4,80046728,tt2718440,Moonwalkers,"Action, Comedy",107,Antoine Bardou-Jacquet,"Rupert Grint, Ron Perlman, Robert Sheehan, Ste...",2015,R,"A brain-addled war vet, a failing band manager...",After failing to locate the legendary,"France, Belgium",English,2017-09-08,6.1,7963


In [54]:
# Give the source-specific columns neutral names for the final table.
# Columns not listed in the mapping keep their current names.
final_movie_df = combined_movie_df.rename(
    columns=dict(
        show_id="netflix_id",
        imdb_title_id="imdb_id",
        duration_imdb="duration",
        country_imdb="country",
    )
)
final_movie_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1704 entries, 0 to 1714
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   netflix_id        1704 non-null   int64         
 1   imdb_id           1704 non-null   object        
 2   title             1704 non-null   object        
 3   genre             1704 non-null   object        
 4   duration          1704 non-null   int64         
 5   director          1704 non-null   object        
 6   actors            1704 non-null   object        
 7   release_year      1704 non-null   int64         
 8   rating            1704 non-null   object        
 9   description_net   1704 non-null   object        
 10  description_imdb  1704 non-null   object        
 11  country           1704 non-null   object        
 12  language          1704 non-null   object        
 13  date_added        1704 non-null   datetime64[ns]
 14  avg_vote          1704 n

In [None]:
# if we need to find additional data we can use this package
# pip install imdbpy

### Transform premise DataFrame

### Transform county DataFrame

### Create database connection

### Load DataFrames into database