In [2]:
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

### Extract CSVs into DataFrames

In [3]:
# Load the IMDb movies CSV into a DataFrame and preview its first five rows.
imdb_file = "IMDb movies.csv"
imdb_df = pd.read_csv(filepath_or_buffer=imdb_file)
imdb_df.head(5)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0


In [4]:
# Print the column names, non-null counts and dtypes of the IMDb frame.
imdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81273 entries, 0 to 81272
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   imdb_title_id          81273 non-null  object 
 1   title                  81273 non-null  object 
 2   original_title         81273 non-null  object 
 3   year                   81273 non-null  int64  
 4   date_published         81273 non-null  object 
 5   genre                  81273 non-null  object 
 6   duration               81273 non-null  int64  
 7   country                81234 non-null  object 
 8   language               80518 non-null  object 
 9   director               81200 non-null  object 
 10  writer                 79780 non-null  object 
 11  production_company     76948 non-null  object 
 12  actors                 81207 non-null  object 
 13  description            78843 non-null  object 
 14  avg_vote               81273 non-null  float64
 15  vo

In [5]:
# Load the Netflix titles CSV into a DataFrame and preview its first five rows.
netflix_file = "netflix_titles.csv"
netflix_df = pd.read_csv(filepath_or_buffer=netflix_file)
netflix_df.head(5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,81145628,Movie,Norm of the North: King Sized Adventure,"Richard Finn, Tim Maltby","Alan Marriott, Andrew Toth, Brian Dobson, Cole...","United States, India, South Korea, China","September 9, 2019",2019,TV-PG,90 min,"Children & Family Movies, Comedies",Before planning an awesome wedding for his gra...
1,80117401,Movie,Jandino: Whatever it Takes,,Jandino Asporaat,United Kingdom,"September 9, 2016",2016,TV-MA,94 min,Stand-Up Comedy,Jandino Asporaat riffs on the challenges of ra...
2,70234439,TV Show,Transformers Prime,,"Peter Cullen, Sumalee Montano, Frank Welker, J...",United States,"September 8, 2018",2013,TV-Y7-FV,1 Season,Kids' TV,"With the help of three human allies, the Autob..."
3,80058654,TV Show,Transformers: Robots in Disguise,,"Will Friedle, Darren Criss, Constance Zimmer, ...",United States,"September 8, 2018",2016,TV-Y7,1 Season,Kids' TV,When a prison ship crash unleashes hundreds of...
4,80125979,Movie,#realityhigh,Fernando Lebrija,"Nesta Cooper, Kate Walsh, John Michael Higgins...",United States,"September 8, 2017",2017,TV-14,99 min,Comedies,When nerdy high schooler Dani finally attracts...


In [6]:
# netflix information
# How many titles fall under each `type` value.
print(netflix_df["type"].value_counts())
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
netflix_df.info()

Movie      4265
TV Show    1969
Name: type, dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6234 entries, 0 to 6233
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       6234 non-null   int64 
 1   type          6234 non-null   object
 2   title         6234 non-null   object
 3   director      4265 non-null   object
 4   cast          5664 non-null   object
 5   country       5758 non-null   object
 6   date_added    6223 non-null   object
 7   release_year  6234 non-null   int64 
 8   rating        6224 non-null   object
 9   duration      6234 non-null   object
 10  listed_in     6234 non-null   object
 11  description   6234 non-null   object
dtypes: int64(2), object(10)
memory usage: 584.6+ KB
None


In [9]:
# Inner-join the Netflix and IMDb frames on their shared `title` column.
# Columns that exist in both frames get the `_net` / `_imdb` suffixes.
# The merged frame has 3040 rows (netflix_df alone has 6234).
# NOTE(review): the original note here says "there are no duplicates", but
# titles are not unique in netflix_df (e.g. "Love" appears three times), so a
# title-only join can pair unrelated titles -- confirm before relying on it.
combined_df = netflix_df.merge(
    imdb_df,
    how='inner',
    on='title',
    suffixes=('_net', '_imdb'),
)
combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3040 entries, 0 to 3039
Data columns (total 33 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   show_id                3040 non-null   int64  
 1   type                   3040 non-null   object 
 2   title                  3040 non-null   object 
 3   director_net           2650 non-null   object 
 4   cast                   2974 non-null   object 
 5   country_net            2967 non-null   object 
 6   date_added             3037 non-null   object 
 7   release_year           3040 non-null   int64  
 8   rating                 3039 non-null   object 
 9   duration_net           3040 non-null   object 
 10  listed_in              3040 non-null   object 
 11  description_net        3040 non-null   object 
 12  imdb_title_id          3040 non-null   object 
 13  original_title         3040 non-null   object 
 14  year                   3040 non-null   int64  
 15  date

In [13]:
# Based on Ahmad's advice, leave out the columns that are missing a lot of
# information: budget, usa_gross_income, worlwide_gross_income and metascore
# (listed as #27-#30 in the original note). Only the columns named below are
# kept; .copy() makes the result independent of combined_df.
short_combined_df = combined_df.loc[:, [
    # columns coming from the Netflix frame
    'show_id',
    'type',
    'title',
    'director_net',
    'cast',
    'country_net',
    'date_added',
    'release_year',
    'rating',
    'duration_net',
    'listed_in',
    'description_net',
    # columns coming from the IMDb frame
    'imdb_title_id',
    'original_title',
    'year',
    'date_published',
    'genre',
    'duration_imdb',
    'country_imdb',
    'language',
    'director_imdb',
    'writer',
    'production_company',
    'actors',
    'description_imdb',
    'avg_vote',
    'votes',
]].copy()
short_combined_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3040 entries, 0 to 3039
Data columns (total 27 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   show_id             3040 non-null   int64  
 1   type                3040 non-null   object 
 2   title               3040 non-null   object 
 3   director_net        2650 non-null   object 
 4   cast                2974 non-null   object 
 5   country_net         2967 non-null   object 
 6   date_added          3037 non-null   object 
 7   release_year        3040 non-null   int64  
 8   rating              3039 non-null   object 
 9   duration_net        3040 non-null   object 
 10  listed_in           3040 non-null   object 
 11  description_net     3040 non-null   object 
 12  imdb_title_id       3040 non-null   object 
 13  original_title      3040 non-null   object 
 14  year                3040 non-null   int64  
 15  date_published      3040 non-null   object 
 16  genre 

In [None]:
# From here our problems are:
# Should we cut some more columns?
# There are some duplicated columns (director, cast/actors, country, release year/year, duration, description)
# There is some missing data (most of it can be fixed by combining data from the duplicated columns)
# There are some incorrect data types (release year/year, duration)

In [14]:
# to find null values
# The original cell referenced `large_df`, which is never defined in this
# notebook and fails on a fresh kernel.
# NOTE(review): `combined_df` (the full merged frame) is assumed to be what
# `large_df` referred to -- confirm; `short_combined_df` has the same
# `rating` column.
# Boolean mask that is True where `rating` is missing.
bool_series = combined_df["rating"].isna()
# Transpose so each column of the matching row(s) is shown on its own line.
combined_df.loc[bool_series].T


Unnamed: 0,1330
show_id,80144119
type,Movie
title,My Honor Was Loyalty
director_net,Alessandro Pepe
cast,"Leone Frisa, Paolo Vaccarino, Francesco Miglio..."
country_net,Italy
date_added,"March 1, 2017"
release_year,2015
rating,
duration_net,115 min


In [15]:
# to find repeated titles (make sure they are different movies not repeated data)
# Only the last expression of a cell is rendered, so the counts are stored and
# the repeated titles are printed explicitly instead of being discarded.
title_counts = netflix_df["title"].value_counts()
print(title_counts[title_counts > 1].head(10))
# Inspect every row of one repeated title to check whether the entries are
# different works or the same record repeated.
netflix_df.loc[netflix_df["title"] == "Love", :]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
19,80057969,Movie,Love,Gaspar Noé,"Karl Glusman, Klara Kristin, Aomi Muyock, Ugo ...","France, Belgium","September 8, 2017",2015,NR,135 min,"Cult Movies, Dramas, Independent Movies",A man in an unsatisfying marriage recalls the ...
1066,81033200,Movie,Love,"Kabir Bhatia, Titien Wattimena","Acha Septriasa, Darius Sinathrya, Fauzi Baadil...",Indonesia,"November 30, 2018",2008,TV-PG,120 min,"Dramas, International Movies, Romantic Movies","In Jakarta, five couples of varying ages and b..."
5825,80026506,TV Show,Love,,"Gillian Jacobs, Paul Rust, Claudia O'Doherty",United States,"March 9, 2018",2018,TV-MA,3 Seasons,"Romantic TV Shows, TV Comedies, TV Dramas",Rebellious Mickey and good-natured Gus navigat...


In [None]:
# if we need to find additional data we can use this package
# pip install imdbpy

In [None]:
# This is the code we used in class for cleaning data and exporting it

### Transform premise DataFrame

In [None]:
# Keep only the premise name and county id columns, as an independent copy,
# then preview the first five rows.
# NOTE(review): `premise_df` is not defined in the visible notebook -- this
# cell is carried over from the class example.
new_premise_df = premise_df.loc[:, ['Premises Name', 'County ID Code']].copy()
new_premise_df.head(5)

In [None]:
# Rename 'Premises Name' to 'premise_name', modifying new_premise_df in place.
new_premise_df.rename(columns = {'Premises Name':'premise_name'}, inplace = True) 

In [None]:
# Rename 'County ID Code' to 'county_id' and keep one row per premise name.
# drop_duplicates() returns a new frame, so the result is assigned back;
# otherwise the duplicates would still be present when new_premise_df is
# written to the database later in the notebook.
new_premise_df = (
    new_premise_df
    .rename(columns={'County ID Code': 'county_id'})
    .drop_duplicates(subset=['premise_name'])
)
new_premise_df

### Transform county DataFrame

In [None]:
# NOTE(review): `county_df` is not defined in the visible notebook -- this
# cell is carried over from the class example.
# This head() call is not the cell's last expression, so it is not rendered.
county_df.head()
# Keep only the county name, county id and license count, as an independent copy.
new_county_df = county_df.loc[
    :, ['County Name (Licensee)', 'County ID Code', 'License Count']
].copy()
# Non-null count per retained column.
new_county_df.count()



In [None]:
# Give the three county columns their database-style names, in place.
new_county_df.rename(
    columns={
        "County Name (Licensee)": "county_name",
        "County ID Code": "county_id",
        'License Count': 'license_count',
    },
    inplace=True,
)

# Target: a `county` table that contains the columns `id`, `county_name`,
#     `license_count` and `county_id`.
# NOTE(review): `id` is not a column of this frame -- presumably it is
# generated by the database; confirm against the table definition.
new_county_df

### Create database connection

In [None]:

# Build the PostgreSQL connection string from environment variables so the
# database password is not stored in the notebook. The variable names follow
# the libpq conventions (PGUSER, PGPASSWORD, PGHOST, PGPORT, PGDATABASE); the
# non-secret parts default to the values previously hardcoded here.
# PGPASSWORD has no default: a KeyError is raised if it is not set.
db_user = os.environ.get("PGUSER", "postgres")
# URL-escape the password so characters such as '@' or '/' cannot break the URL.
db_password = quote_plus(os.environ["PGPASSWORD"])
db_host = os.environ.get("PGHOST", "localhost")
db_port = os.environ.get("PGPORT", "5432")
db_name = os.environ.get("PGDATABASE", "customer_db")

rds_connection_string = f"{db_user}:{db_password}@{db_host}:{db_port}/{db_name}"
engine = create_engine(f'postgresql://{rds_connection_string}')

In [None]:
# Confirm tables
# Engine.table_names() is deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the inspector API returns the same list of table names and works on both.
inspect(engine).get_table_names()


### Load DataFrames into database

In [None]:
# Append the rows of new_county_df to the `county` table through `engine`;
# the DataFrame index is not written. Because the mode is 'append',
# re-running this cell inserts the same rows again.
new_county_df.to_sql(name='county', con=engine, if_exists='append', index=False)

In [None]:
# Append the rows of new_premise_df to the `premise` table through `engine`;
# the DataFrame index is not written. Because the mode is 'append',
# re-running this cell inserts the same rows again.
new_premise_df.to_sql(name='premise', con=engine, if_exists='append', index=False)