# Part 1:
---
# Python | Pandas for Data Clean-up

## Import Modules/ Libs

In [1]:
import pandas as pd
import csv
import datetime as dt
from datetime import datetime
import numpy as np

# for the progress bar (ONLY WORK in Jupyter Notebook)
from time import sleep
from tqdm.notebook import tqdm

# for making file path independently of OS
# for exporting Jupyter Notebook to Python at the end
import os

## Import SQL-Alchemy for SQL-Lite Data Load

In [2]:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Float, Date
from sqlalchemy.orm import Session
from sqlalchemy import func
import pathlib
import sqlite3

## Define local functions to use

In [3]:
# convert to interger
def to_int (df_name, col_name):
    
    """
    1. Converting string, etc. into integer
    2. Usage:
        to_int (data-frame-name, column-name):
    """
    
    print (f"\n>> Processing Column: '{col_name}'")
    
    # convert empty string to number, if no num available, put NaN 
#     df_name[col_name] = pd.to_numeric(df_name[col_name], errors='coerce')

    try:
        df_name[col_name] = df_name[col_name].astype(np.int64)
        type_col = df_name[col_name].dtype
        print("---->>> No issue observed")
        
    except ValueError as error:
        
        print("---->>> An Exception has occured ::", str(error))
        print("---->>> Proceed with alternative routes, please wait...")
        # replace all empty space with 0
        df_name[col_name] =  df_name[col_name].fillna(0)
        df_name[col_name] = df_name[col_name].replace('',0, regex=True)

        # convert df to numpy array
        np_of_df = df_name[col_name].values

        # comvert to type as float
        np_of_df_float = np_of_df.astype(float)  

        df_name[col_name] = np_of_df_float
        df_name[col_name] = df_name[col_name].astype(np.int64)
        type_col = df_name[col_name].dtype
        

    print(f">> DONE Coverting to {type_col}\n\n{50*('=')}\n{50*('=')}")
   
    
    return df_name

In [4]:
def conv_currency (df_name, col_name):
    
    """
    1. Converting "$" to 'USD'
    2. Split column contains "currency" and "number" into 2 columns
        a. <original_column_name>_currency : tag of currency like 'USD', 'EUR', 'INR', etc.
    3. Usage: 
        conv_currency (data-frame-name, column-name)
    """
    df_name[col_name].replace(np.NaN, np.int64(0), inplace=True)
    
    print (f"\n>> Processing Column: '{col_name}'")
    
    # replace $ for USD and remove all "blanks"
    df_name.loc[:,col_name] = df_name.loc[:, col_name].str.replace("$", "USD ")
    df_name.loc[:,col_name] = df_name.loc[:, col_name].str.replace(",", "")
    print(df_name.loc[:,col_name].dtype)
    print(f">> DONE Replacing '$' with 'USD'!")
    
    # split currency and value into two cols
    
    try:
        new_col = f'{col_name}_currency'
        print(f'    >>> Creating new column named: "{new_col}"')
        df_name[new_col], df_name[col_name] = df_name[col_name].str.split(' ', 1).str
        print(df_name.loc[:,col_name].dtype)
        print(df_name.loc[:,new_col].dtype)
        print(f">> DONE Splitting Columns!")
              
    except ValueError:
        pass
    
    df_name[col_name].replace("", np.int64(0), inplace=True)
    print(f">> PROCESS COMPLETED !\n\n{50*('=')}\n{50*('=')}")
    
    return print(f">> PROCESS COMPLETED !\n\n{50*('=')}\n{50*('=')}")

In [5]:
def replace_str (df_name, col_name, to_be_repl, repl_to):
    
    """
    1. Replace a character to another character
    2. Usage:
        replace_str (data-frame-name, column-name, string-to-replace, replace-to-string):
    """
    print (f"\n>> Processing Column: '{col_name}'")
    
    df_name[col_name] = df_name.loc[:, col_name].str.replace(to_be_repl, repl_to)
    

    print(f">> DONE Replacing Character!\n\n{50*('=')}\n{50*('=')}")
    return df_name.head(10)

In [6]:
def to_str (df_name, col_name):
    
    """
    1. convert to string
    2. Usage:
        to_str (data-frame-name, column-name)
    """
    print (f"\n>> Processing Column: '{col_name}'")
    
   
    df_name[col_name] = df_name[col_name].astype('str') 
       

    print(f">> DONE Converting to String!\n\n{50*('=')}\n{50*('=')}")
    return df_name.head(10)

## Data Clean-up Process

### DATABASE RAW_DF

In [7]:
# define path to csv file
path=os.path.join("Resources_hle","IMDb_movies.csv")

# convert csv to pandas df
raw_df = pd.read_csv(path, encoding="UTF-8", low_memory=False) #, dtype={'usa_gross_income': "string", "worldwide_gross_income" : "string", "budget": "string"})

In [8]:
# review what imported
raw_df.head(20)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0429277,Zyzzyx Rd,Zyzzyx Rd,2006,2/24/2006,"Crime, Drama, Thriller",90,USA,English,John Penney,...,"Leo Grillo, Katherine Heigl, Tom Sizemore, Ric...",The family man accountant Grant travels to Las...,4.0,930,"$2,000,000",$30,$30,,11.0,1.0
1,tt0070913,Che?,Che?,1972,12/8/1972,Comedy,114,"Italy, France, West Germany","English, Italian, French",Roman Polanski,...,"Marcello Mastroianni, Sydne Rome, Hugh Griffit...","During her Italian vacation, a young and beaut...",5.7,3256,,$64,,,23.0,37.0
2,tt1986953,Storage 24,Storage 24,2012,6/29/2012,"Action, Horror, Mystery",87,UK,English,Johannes Roberts,...,"Noel Clarke, Colin O'Donoghue, Antonia Campbel...","In London, a military plane crashes leaving it...",4.4,6309,,$72,"$646,175",52.0,65.0,88.0
3,tt1865335,Confession of a Child of the Century,Confession of a Child of the Century,2012,8/29/2012,Drama,120,"France, Germany, UK",English,Sylvie Verheyde,...,"Charlotte Gainsbourg, Pete Doherty, August Die...","Paris, 1830: Octave, betrayed by his mistress,...",4.4,514,EUR 4000000,$74,"$146,155",,4.0,23.0
4,tt4195920,Chicas paranoicas,Chicas paranoicas,2015,9/16/2016,Comedy,100,Spain,Spanish,Pedro del Santo,...,"Patricia Valley, Mairen Muñoz, Marta Mir Martí...",'Chicas Paranoicas' is the first Spanish comed...,8.0,169,,$78,$78,,,2.0
5,tt1157631,Perro come perro,Perro come perro,2008,4/18/2008,Thriller,106,Colombia,Spanish,Carlos Moreno,...,"Marlon Moreno, Óscar Borda, Álvaro Rodríguez, ...","In the crime world of Colombia, there is an un...",6.7,1546,,$80,$80,,14.0,10.0
6,tt0962711,The Objective,The Objective,2008,4/24/2008,"Horror, Sci-Fi, Thriller",90,"USA, Morocco","English, Pushto",Daniel Myrick,...,"Jonas Ball, Matthew R. Anderson, Jon Huertas, ...","A military special operations team, led by a C...",5.5,7634,"$4,000,000",$95,$95,26.0,67.0,37.0
7,tt3789946,Dixie y la rebelión zombi,Dixie y la rebelión zombi,2014,11/7/2014,Animation,82,Spain,"Catalan, English, Basque, Spanish","Beñat Beitia, Ricardo Ramón",...,"Paula Ribó, Núria Trifol, Ivan Labanda, Elisab...","In the sequel to DADDY, I'M A ZOMBIE, the fate...",4.7,176,EUR 1800000,$120,$120,,5.0,3.0
8,tt1934452,Realms,Realms,2017,7/26/2019,"Horror, Mystery",90,USA,,Daric Gates,...,"Ryan Kelley, Madison McKinley, Praya Lundberg,...",Supernatural/Horror set against the backdrop o...,4.0,525,"$2,000,000",$147,"$15,419",,20.0,2.0
9,tt0213690,Ground Zero,Ground Zero,2000,5/12/2000,"Action, Drama",92,USA,English,Richard Friedman,...,"Janet Gunn, Jack Scalia, Scott Terra, Martin H...",L.A. is struck by a series of minor earthquake...,3.5,254,,$150,,,2.0,2.0


In [9]:
# print out current columns with template to create a dictionary for columns rename
# un-comment to print out the template
# for col in raw_df.columns:
#     print(f'"{col}" : "__",')

In [10]:
# define what will be renamed
cols = {
"imdb_title_id" : "imdb_id",
"production_company" : "prod_co",
"avg_vote" : "user_rating",
"votes" : "vote_num",
"worlwide_gross_income" : "worldwide_gross_income",
"metascore" : "web_rating"
    }

# process new col names
raw_df.rename(columns=cols, inplace=True)
# raw_df.head(10)

In [11]:
# # fill empty space
# raw_df.fillna("0")
# raw_df.head(10)

In [12]:
# print out all current column names
# for col in raw_df.columns:
#     print(f'"{col}",')

### DATABASE PROCESSED_DF

In [13]:
# del processed_df

In [14]:
# if table exist, delete the table
try:
    del processed_df

except Exception:
    pass

processed_df = raw_df
processed_df

Unnamed: 0,imdb_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,user_rating,vote_num,budget,usa_gross_income,worldwide_gross_income,web_rating,reviews_from_users,reviews_from_critics
0,tt0429277,Zyzzyx Rd,Zyzzyx Rd,2006,2/24/2006,"Crime, Drama, Thriller",90,USA,English,John Penney,...,"Leo Grillo, Katherine Heigl, Tom Sizemore, Ric...",The family man accountant Grant travels to Las...,4.0,930,"$2,000,000",$30,$30,,11.0,1.0
1,tt0070913,Che?,Che?,1972,12/8/1972,Comedy,114,"Italy, France, West Germany","English, Italian, French",Roman Polanski,...,"Marcello Mastroianni, Sydne Rome, Hugh Griffit...","During her Italian vacation, a young and beaut...",5.7,3256,,$64,,,23.0,37.0
2,tt1986953,Storage 24,Storage 24,2012,6/29/2012,"Action, Horror, Mystery",87,UK,English,Johannes Roberts,...,"Noel Clarke, Colin O'Donoghue, Antonia Campbel...","In London, a military plane crashes leaving it...",4.4,6309,,$72,"$646,175",52.0,65.0,88.0
3,tt1865335,Confession of a Child of the Century,Confession of a Child of the Century,2012,8/29/2012,Drama,120,"France, Germany, UK",English,Sylvie Verheyde,...,"Charlotte Gainsbourg, Pete Doherty, August Die...","Paris, 1830: Octave, betrayed by his mistress,...",4.4,514,EUR 4000000,$74,"$146,155",,4.0,23.0
4,tt4195920,Chicas paranoicas,Chicas paranoicas,2015,9/16/2016,Comedy,100,Spain,Spanish,Pedro del Santo,...,"Patricia Valley, Mairen Muñoz, Marta Mir Martí...",'Chicas Paranoicas' is the first Spanish comed...,8.0,169,,$78,$78,,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81268,tt9894098,Sathru,Sathru,2019,3/7/2019,Thriller,129,India,,Naveen Nanjundan,...,"Srushti Dange, Kathir, Laguparan, Marimuthu, N...",A kidnapping gone wrong leads to mounting tens...,6.1,163,,,"$8,683",,7.0,1.0
81269,tt9899880,Columbus,Columbus,2018,12/5/2018,"Comedy, Drama",82,Iran,"Persian, English",Hatef Alimardani,...,"Farhad Aslani, Majid Salehi, Saeed Poursamimi,...",A rich family are deciding to immigrate to the...,4.0,130,,,,,,13.0
81270,tt9903716,Jessie,Jessie,2019,3/15/2019,"Horror, Thriller",106,India,Telugu,Aswani Kumar V.,...,"Sritha Chandana, Pavani Gangireddy, Abhinav Go...","Set in an abandoned house, the film follows a ...",7.2,219,,,,,21.0,
81271,tt9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,3/8/2019,Drama,130,India,Malayalam,Vineesh Aaradya,...,"Anoop Chandran, Indrans, Sona Nair, Simon Brit...",,8.4,369,,,,,,


In [15]:
# replacing "tt" character in id columns and convert id to integer
replace_str(processed_df, "imdb_id", "tt", "")


>> Processing Column: 'imdb_id'
>> DONE Replacing Character!



Unnamed: 0,imdb_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,user_rating,vote_num,budget,usa_gross_income,worldwide_gross_income,web_rating,reviews_from_users,reviews_from_critics
0,429277,Zyzzyx Rd,Zyzzyx Rd,2006,2/24/2006,"Crime, Drama, Thriller",90,USA,English,John Penney,...,"Leo Grillo, Katherine Heigl, Tom Sizemore, Ric...",The family man accountant Grant travels to Las...,4.0,930,"$2,000,000",$30,$30,,11.0,1.0
1,70913,Che?,Che?,1972,12/8/1972,Comedy,114,"Italy, France, West Germany","English, Italian, French",Roman Polanski,...,"Marcello Mastroianni, Sydne Rome, Hugh Griffit...","During her Italian vacation, a young and beaut...",5.7,3256,,$64,,,23.0,37.0
2,1986953,Storage 24,Storage 24,2012,6/29/2012,"Action, Horror, Mystery",87,UK,English,Johannes Roberts,...,"Noel Clarke, Colin O'Donoghue, Antonia Campbel...","In London, a military plane crashes leaving it...",4.4,6309,,$72,"$646,175",52.0,65.0,88.0
3,1865335,Confession of a Child of the Century,Confession of a Child of the Century,2012,8/29/2012,Drama,120,"France, Germany, UK",English,Sylvie Verheyde,...,"Charlotte Gainsbourg, Pete Doherty, August Die...","Paris, 1830: Octave, betrayed by his mistress,...",4.4,514,EUR 4000000,$74,"$146,155",,4.0,23.0
4,4195920,Chicas paranoicas,Chicas paranoicas,2015,9/16/2016,Comedy,100,Spain,Spanish,Pedro del Santo,...,"Patricia Valley, Mairen Muñoz, Marta Mir Martí...",'Chicas Paranoicas' is the first Spanish comed...,8.0,169,,$78,$78,,,2.0
5,1157631,Perro come perro,Perro come perro,2008,4/18/2008,Thriller,106,Colombia,Spanish,Carlos Moreno,...,"Marlon Moreno, Óscar Borda, Álvaro Rodríguez, ...","In the crime world of Colombia, there is an un...",6.7,1546,,$80,$80,,14.0,10.0
6,962711,The Objective,The Objective,2008,4/24/2008,"Horror, Sci-Fi, Thriller",90,"USA, Morocco","English, Pushto",Daniel Myrick,...,"Jonas Ball, Matthew R. Anderson, Jon Huertas, ...","A military special operations team, led by a C...",5.5,7634,"$4,000,000",$95,$95,26.0,67.0,37.0
7,3789946,Dixie y la rebelión zombi,Dixie y la rebelión zombi,2014,11/7/2014,Animation,82,Spain,"Catalan, English, Basque, Spanish","Beñat Beitia, Ricardo Ramón",...,"Paula Ribó, Núria Trifol, Ivan Labanda, Elisab...","In the sequel to DADDY, I'M A ZOMBIE, the fate...",4.7,176,EUR 1800000,$120,$120,,5.0,3.0
8,1934452,Realms,Realms,2017,7/26/2019,"Horror, Mystery",90,USA,,Daric Gates,...,"Ryan Kelley, Madison McKinley, Praya Lundberg,...",Supernatural/Horror set against the backdrop o...,4.0,525,"$2,000,000",$147,"$15,419",,20.0,2.0
9,213690,Ground Zero,Ground Zero,2000,5/12/2000,"Action, Drama",92,USA,English,Richard Friedman,...,"Janet Gunn, Jack Scalia, Scott Terra, Martin H...",L.A. is struck by a series of minor earthquake...,3.5,254,,$150,,,2.0,2.0


In [16]:


# extract day, month, year from the date_published columns
processed_df["date_published"] = pd.to_datetime(processed_df['date_published'])
processed_df['day'], processed_df['month'], processed_df['year']  = processed_df['date_published'].dt.day, processed_df['date_published'].dt.month, processed_df['date_published'].dt.year



In [17]:
# convert 'id' to numbers
to_int(processed_df, 'imdb_id')


>> Processing Column: 'imdb_id'
---->>> No issue observed
>> DONE Coverting to int64



Unnamed: 0,imdb_id,title,original_title,year,date_published,genre,duration,country,language,director,...,user_rating,vote_num,budget,usa_gross_income,worldwide_gross_income,web_rating,reviews_from_users,reviews_from_critics,day,month
0,429277,Zyzzyx Rd,Zyzzyx Rd,2006,2006-02-24,"Crime, Drama, Thriller",90,USA,English,John Penney,...,4.0,930,"$2,000,000",$30,$30,,11.0,1.0,24,2
1,70913,Che?,Che?,1972,1972-12-08,Comedy,114,"Italy, France, West Germany","English, Italian, French",Roman Polanski,...,5.7,3256,,$64,,,23.0,37.0,8,12
2,1986953,Storage 24,Storage 24,2012,2012-06-29,"Action, Horror, Mystery",87,UK,English,Johannes Roberts,...,4.4,6309,,$72,"$646,175",52.0,65.0,88.0,29,6
3,1865335,Confession of a Child of the Century,Confession of a Child of the Century,2012,2012-08-29,Drama,120,"France, Germany, UK",English,Sylvie Verheyde,...,4.4,514,EUR 4000000,$74,"$146,155",,4.0,23.0,29,8
4,4195920,Chicas paranoicas,Chicas paranoicas,2016,2016-09-16,Comedy,100,Spain,Spanish,Pedro del Santo,...,8.0,169,,$78,$78,,,2.0,16,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81268,9894098,Sathru,Sathru,2019,2019-03-07,Thriller,129,India,,Naveen Nanjundan,...,6.1,163,,,"$8,683",,7.0,1.0,7,3
81269,9899880,Columbus,Columbus,2018,2018-12-05,"Comedy, Drama",82,Iran,"Persian, English",Hatef Alimardani,...,4.0,130,,,,,,13.0,5,12
81270,9903716,Jessie,Jessie,2019,2019-03-15,"Horror, Thriller",106,India,Telugu,Aswani Kumar V.,...,7.2,219,,,,,21.0,,15,3
81271,9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,2019-03-08,Drama,130,India,Malayalam,Vineesh Aaradya,...,8.4,369,,,,,,,8,3


In [18]:
# for trouble shoot and debugging purposes
# break_point_here

In [19]:
col_to_covert = ['worldwide_gross_income', 'usa_gross_income', 'budget']

In [20]:
for ea_col in col_to_covert:
    conv_currency(processed_df, ea_col)
processed_df[['worldwide_gross_income', 'usa_gross_income', 'budget']]


>> Processing Column: 'worldwide_gross_income'
object
>> DONE Replacing '$' with 'USD'!
    >>> Creating new column named: "worldwide_gross_income_currency"
object
object
>> DONE Splitting Columns!
>> PROCESS COMPLETED !

>> PROCESS COMPLETED !


>> Processing Column: 'usa_gross_income'




object
>> DONE Replacing '$' with 'USD'!
    >>> Creating new column named: "usa_gross_income_currency"
object
object
>> DONE Splitting Columns!
>> PROCESS COMPLETED !

>> PROCESS COMPLETED !


>> Processing Column: 'budget'
object
>> DONE Replacing '$' with 'USD'!
    >>> Creating new column named: "budget_currency"
object
object
>> DONE Splitting Columns!
>> PROCESS COMPLETED !

>> PROCESS COMPLETED !



Unnamed: 0,worldwide_gross_income,usa_gross_income,budget
0,30,30,2000000
1,,64,
2,646175,72,
3,146155,74,4000000
4,78,78,
...,...,...,...
81268,8683,,
81269,,,
81270,,,
81271,,,


In [21]:
# create an folder to hole temporarily exported data of converted data
# folder name that will store the sql-lite database
fol_name = "archieve"

# if exist print a message for user
if pathlib.Path(fol_name).exists():
    print(f' >> Folder "{fol_name}" already exists!\n >> NO new folder was created ...')
    pass

# if not make a new one and let user know
else:
    os.mkdir(fol_name)
    print(f'Successfully created folder "{fol_name}"')

# export to csv for visual inspection or further process if needed
processed_df[["worldwide_gross_income_currency", 'worldwide_gross_income', "usa_gross_income_currency", 'usa_gross_income', "budget_currency", 'budget']].to_csv("archieve\exported_draft.csv")


 >> Folder "archieve" already exists!
 >> NO new folder was created ...


In [22]:

# print out the column names inside df
# useful as being able to copy and paste directly into cell without retyping all col names
# uncheck to use
# for col in processed_df.columns:
#     print (f'"{col}",')


In [23]:
print(processed_df.dtypes)

imdb_id                                     int64
title                                      object
original_title                             object
year                                        int64
date_published                     datetime64[ns]
genre                                      object
duration                                    int64
country                                    object
language                                   object
director                                   object
writer                                     object
prod_co                                    object
actors                                     object
description                                object
user_rating                               float64
vote_num                                    int64
budget                                     object
usa_gross_income                           object
worldwide_gross_income                     object
web_rating                                float64


In [24]:
# specify columns to be converted to string
col_to_str = [
    "imdb_id",
    "title",
    "usa_gross_income_currency",
    "worldwide_gross_income_currency",
    "budget_currency",
    "country",
    "language",
    "genre",
    "director",
    "writer",
    "prod_co",
    "actors",
    "description"    
]

# loop and replace one by one
for col in col_to_str:
    to_str(processed_df, col)


>> Processing Column: 'imdb_id'
>> DONE Converting to String!


>> Processing Column: 'title'
>> DONE Converting to String!


>> Processing Column: 'usa_gross_income_currency'
>> DONE Converting to String!


>> Processing Column: 'worldwide_gross_income_currency'
>> DONE Converting to String!


>> Processing Column: 'budget_currency'
>> DONE Converting to String!


>> Processing Column: 'country'
>> DONE Converting to String!


>> Processing Column: 'language'
>> DONE Converting to String!


>> Processing Column: 'genre'
>> DONE Converting to String!


>> Processing Column: 'director'
>> DONE Converting to String!


>> Processing Column: 'writer'
>> DONE Converting to String!


>> Processing Column: 'prod_co'
>> DONE Converting to String!


>> Processing Column: 'actors'
>> DONE Converting to String!


>> Processing Column: 'description'
>> DONE Converting to String!



In [25]:
# specify columns to be converted to integer
col_to_int = [
    "imdb_id",
    "day",
    "month",
    "year",
    "user_rating",
    "web_rating",
    "vote_num",
    "reviews_from_users",
    "reviews_from_critics",
    "usa_gross_income",
    "worldwide_gross_income",
    "budget",
    "duration",
]

# loop and replace one by one
for col in col_to_int:
    to_int(processed_df, col)


>> Processing Column: 'imdb_id'
---->>> No issue observed
>> DONE Coverting to int64


>> Processing Column: 'day'
---->>> No issue observed
>> DONE Coverting to int64


>> Processing Column: 'month'
---->>> No issue observed
>> DONE Coverting to int64


>> Processing Column: 'year'
---->>> No issue observed
>> DONE Coverting to int64


>> Processing Column: 'user_rating'
---->>> No issue observed
>> DONE Coverting to int64


>> Processing Column: 'web_rating'
---->>> An Exception has occured :: Cannot convert non-finite values (NA or inf) to integer
---->>> Proceed with alternative routes, please wait...
>> DONE Coverting to int64


>> Processing Column: 'vote_num'
---->>> No issue observed
>> DONE Coverting to int64


>> Processing Column: 'reviews_from_users'
---->>> An Exception has occured :: Cannot convert non-finite values (NA or inf) to integer
---->>> Proceed with alternative routes, please wait...
>> DONE Coverting to int64


>> Processing Column: 'reviews_from_critics'
----

In [26]:
processed_df.dtypes

imdb_id                                     int64
title                                      object
original_title                             object
year                                        int64
date_published                     datetime64[ns]
genre                                      object
duration                                    int64
country                                    object
language                                   object
director                                   object
writer                                     object
prod_co                                    object
actors                                     object
description                                object
user_rating                                 int64
vote_num                                    int64
budget                                      int64
usa_gross_income                            int64
worldwide_gross_income                      int64
web_rating                                  int64


In [27]:
# re-arrange columns in the df
# build a list of what need to be included and their positions
cols=[
"imdb_id",
"title",

"date_published",
"day",
"month",
"year",

"user_rating",
"web_rating",

"vote_num",
"reviews_from_users",
"reviews_from_critics",
    
"usa_gross_income_currency",
"usa_gross_income",

"worldwide_gross_income_currency",
"worldwide_gross_income",

"budget_currency",
"budget",


"duration",
"country",
"language",
"genre",
"director",
"writer",
"prod_co",
"actors",
"description"
]


# re-arrange:
processed_df = processed_df[cols]

In [28]:
processed_df.to_csv("archieve\processed_df.csv")

In [29]:
# template to print out all columns and get ready for CLASS creation
# for col in  cols:
#     print (f'{col} = Column(    )')

In [30]:
# Table export description:
t_shp = processed_df.shape
print(f"TABLE DESCRIPTIONS:\n{('-')*30}\n\
>>> Number of Rows: {'{:,.0f}'.format(t_shp[0])}\n\
>>> Number of Columns: {t_shp[1]}")

TABLE DESCRIPTIONS:
------------------------------
>>> Number of Rows: 81,273
>>> Number of Columns: 26


In [31]:
# brk_here

# Part 2: 
---
# SQLAlchemy -> SQL Lite

### Create Engine & Connection to SQL Lite DB

In [32]:
# create declarative base
Base = declarative_base()

# check current table available in the Base - should be nothing at this point
Base.metadata.tables


immutabledict({})

In [33]:
# folder name that will store the sql-lite database
fol_name = "SQLiteDB_Exported"

# if exist print a message for user
if pathlib.Path(fol_name).exists():
    print(f' >> Folder "{fol_name}" already exists!\n >> NO new folder was created ...')
    pass

# if not make a new one and let user know
else:
    os.mkdir(fol_name)
    print(f'Successfully created folder "{fol_name}"')

 >> Folder "SQLiteDB_Exported" already exists!
 >> NO new folder was created ...


In [34]:
sqlite_db_path = os.path.join(fol_name,"ETL_movies.db")
engine = create_engine(f"sqlite:///{sqlite_db_path}")
conn = engine.connect()

In [35]:
engine.execute("DROP TABLE IF EXISTS movie_imdb")

<sqlalchemy.engine.result.ResultProxy at 0x26c54b98c88>

In [36]:
# per docs found online, SQL Lite works up to int8
# while creating this file, I found lots of DataType Mismatch errors
# found this solutions as 2 lines belows to extend SQL to work with int64
# -------------------------------------------------------------
sqlite3.register_adapter(np.int64, lambda val: int(val))
sqlite3.register_adapter(np.int32, lambda val: int(val))

In [37]:
# print out list of cols for class creation
# i=0
# for col in processed_df.columns:
#     print(f'{col} = Column()')
#     i+=1
# print(f'\n{("-")*50}\n>> There are total {i} columns in the current data frame.')

### Create Template and Load Data from Python to SQL Lite DB

In [38]:
# class name == 'Movie' with table name 'movie'
# ------------------------------------------------------------------------------------

class Movie(Base):
    
    __tablename__ = 'imdb_movies'
    
    # leave this __table_args__ here will over ride all previous table made
    # new table with same name will be created with this class 
    __table_args__ = {'extend_existing': True} 
    
    id = Column(Integer, primary_key=True)
    imdb_id = Column(Integer)
    title = Column(String)
    date_published = Column(Date)
                   
    day = Column(Integer)
    month = Column(Integer)
    year = Column(Integer)
                   
    user_rating = Column(Integer)
    web_rating = Column(Integer)
    vote_num = Column(Integer)
    reviews_from_users = Column(Integer)
    reviews_from_critics = Column(Integer)
    
    usa_gross_income_currency = Column(String)
    usa_gross_income = Column(Integer)
    
    worldwide_gross_income_currency = Column(String)
    worldwide_gross_income = Column(Integer)
   
    budget_currency = Column(String)
    budget = Column(Integer)
       
    duration = Column(Integer)
                   
    country = Column(String)
    language = Column(String)
    genre = Column(String)
    director = Column(String)
    writer = Column(String)
    prod_co = Column(String)
    actors = Column(String)
    description = Column(String)

In [39]:
# Create a "Metadata" Layer That Abstracts our SQL Database
# this function upon executing will use the above class to make a schema
# and create a table in SQL Lite DB
# ----------------------------------
Base.metadata.create_all(engine)

In [40]:
# current in memory tables
# the table seeing here is what currently inside SQL Lite DB
Base.metadata.tables

immutabledict({'imdb_movies': Table('imdb_movies', MetaData(bind=None), Column('id', Integer(), table=<imdb_movies>, primary_key=True, nullable=False), Column('imdb_id', Integer(), table=<imdb_movies>), Column('title', String(), table=<imdb_movies>), Column('date_published', Date(), table=<imdb_movies>), Column('day', Integer(), table=<imdb_movies>), Column('month', Integer(), table=<imdb_movies>), Column('year', Integer(), table=<imdb_movies>), Column('user_rating', Integer(), table=<imdb_movies>), Column('web_rating', Integer(), table=<imdb_movies>), Column('vote_num', Integer(), table=<imdb_movies>), Column('reviews_from_users', Integer(), table=<imdb_movies>), Column('reviews_from_critics', Integer(), table=<imdb_movies>), Column('usa_gross_income_currency', String(), table=<imdb_movies>), Column('usa_gross_income', Integer(), table=<imdb_movies>), Column('worldwide_gross_income_currency', String(), table=<imdb_movies>), Column('worldwide_gross_income', Integer(), table=<imdb_movie

In [41]:
# redo again to make sure SQL lite register int64-variables
# -------------------------------------------------------------

sqlite3.register_adapter(np.int64, lambda val: int(val))
sqlite3.register_adapter(np.int32, lambda val: int(val))

In [42]:
# orm requires session so rollbacks can occur etc.
session = Session(bind=engine)

In [43]:
### Begin looping thru dataframe and load data into template

In [44]:
# looping over every row of the database and export data into SQL Lite

# ==========================================================

# specify how much data want to load, in fraction
# ---------------------------

print(f">>> There are total: \033[1;31m{'{:,.0f}'.format(t_shp[0])}\033[0m Records")
data_load_perc = int(input (f">>> How much data would you like to load?\n\
>>> HINT: if 20%, input whole number 20\n--->>>User input: "))# in percentage 

total_to_load = data_load_perc * t_shp[0] // 100 # use '//' to get the integer as the next function only accepts integer
print(f">>> Preparing to load \033[1;31m{'{:,.0f}'.format(total_to_load)}\033[0m ({data_load_perc}%) Records")

# ==========================================================

values = range(total_to_load)


>>> There are total: [1;31m81,273[0m Records
>>> How much data would you like to load?
>>> HINT: if 20%, input whole number 20
--->>>User input: 100
>>> Preparing to load [1;31m81,273[0m (100%) Records


In [45]:
# use progress bar to help user keep track of the process
# build the iter-row within this progress bar
# bar update code is inside the iter-row
# =========================================

i = 0
with tqdm(total=len(values)) as pbar:
    for index, row in processed_df.head(n=total_to_load).iterrows():
        
        # calculate the # of loaded data and percentage
       
        i +=1
        perc = round(i / total_to_load * 100, 2)
       
        
        # Print out message for percentage 
        print (f">>> Loading: \033[1;31m{'{:,.0f}'.format(i)}\033[0m Records | \033[1;32m{perc}%\033[0m Complete", "\r", end ='' , flush=True)
        
        # this is to update the progress bar
        pbar.update(1) 
        
               
        # get the data from cleaned df
        movie = Movie( 
        imdb_id = row['imdb_id'],
        title = row['title'],
        date_published = row['date_published'],
        day = row['day'],
        month = row['month'],
        year = row['year'],
        user_rating = row['user_rating'],
        web_rating = row['web_rating'],
        vote_num = row['vote_num'],
        reviews_from_users = row['reviews_from_users'],
        reviews_from_critics = row['reviews_from_critics'],
        usa_gross_income_currency = row['usa_gross_income_currency'],
        usa_gross_income = row['usa_gross_income'],
        worldwide_gross_income_currency = row['worldwide_gross_income_currency'],
        worldwide_gross_income = row['worldwide_gross_income'],
        budget_currency = row['budget_currency'],
        budget = row['budget'],
        duration = row['duration'],
        country = row['country'],
        language = row['language'],
        genre = row['genre'],
        director = row['director'],
        writer = row['writer'],
        prod_co = row['prod_co'],
        actors = row['actors'],
        description = row['description']
        )
        
        
        # add data to SQL lite session, DB
        session.add(movie)
print(">>> Finished loading all records into memory")

HBox(children=(FloatProgress(value=0.0, max=81273.0), HTML(value='')))

>>> Loading: [1;31m81,273[0m Records | [1;32m100.0%[0m Complete                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     

In [46]:
# commit to hard write onto DB
print(">>> Prepare to write records into SQL Lite DB")
try:
    session.commit()
    print(">>> Successfully wrote all records into SQL Lite DB")
except Exception as errmess:
    print(">>> An Exception has occured ::", str(error))

>>> Prepare to write records into SQL Lite DB
>>> Successfully wrote all records into SQL Lite DB


In [47]:
# check if records are there, uncomment out to run if desire
# engine.execute("select * from Movie").fetchall()

In [48]:
# close out session after done loading data into db
session.close()
print(">>> All session(s) closed")

>>> All session(s) closed


# Part 3:
---
# Conver Jupyter Notebook to Python File

In [49]:
# define file name
python_file_name = 'SQLite_HLE.py'

# if there is already old file, then delete and reprocess a new one
if os.path.exists(python_file_name):
    os.remove(python_file_name)

# if exception raises, just skip the export process
try:
    !jupyter nbconvert --to python hle_IMDb.ipynb
    os.rename("hle_IMDb.py", python_file_name)
except Exception:
    print(Exception)
    pass

[NbConvertApp] Converting notebook hle_IMDb.ipynb to python
[NbConvertApp] Writing 17012 bytes to hle_IMDb.py
