# Part 1:
---
# Python | Pandas for Data Clean-up

## Import Modules/ Libs

In [1]:
import pandas as pd
import csv
import datetime as dt
from datetime import datetime
import numpy as np

# for the progress bar (ONLY WORK in Jupyter Notebook)
from time import sleep
from tqdm.notebook import tqdm

# for making file path independently of OS
# for exporting Jupyter Notebook to Python at the end
import os

## Import SQL-Alchemy for SQL-Lite Data Load

In [2]:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Float, Date
from sqlalchemy.orm import Session
from sqlalchemy import func
import pathlib
import sqlite3

## Define local functions to use

In [3]:
# convert to interger
def to_int (df_name, col_name):
    
    """
    1. Converting string, etc. into integer
    2. Usage:
        to_int (data-frame-name, column-name):
    """
    
    print (f"\n>> Processing Column: '{col_name}'")
    
    # convert empty string to number, if no num available, put NaN 
#     df_name[col_name] = pd.to_numeric(df_name[col_name], errors='coerce')

    try:
        df_name[col_name] = df_name[col_name].astype(np.int64)
        type_col = df_name[col_name].dtype
        print("---->>> No issue observed")
        
    except ValueError as error:
        
        print("---->>> An Exception has occured ::", str(error))
        print("---->>> Proceed with alternative routes, please wait...")
        # replace all empty space with 0
        df_name[col_name] =  df_name[col_name].fillna(0)
        df_name[col_name] = df_name[col_name].replace('',0, regex=True)

        # convert df to numpy array
        np_of_df = df_name[col_name].values

        # comvert to type as float
        np_of_df_float = np_of_df.astype(float)  

        df_name[col_name] = np_of_df_float
        df_name[col_name] = df_name[col_name].astype(np.int64)
        type_col = df_name[col_name].dtype
        

    print(f">> DONE Coverting to {type_col}\n\n{50*('=')}\n{50*('=')}")
   
    
    return df_name

In [4]:
def conv_currency (df_name, col_name):
    
    """
    1. Converting "$" to 'USD'
    2. Split column contains "currency" and "number" into 2 columns
        a. <original_column_name>_currency : tag of currency like 'USD', 'EUR', 'INR', etc.
    3. Usage: 
        conv_currency (data-frame-name, column-name)
    """
    df_name[col_name].replace(np.NaN, np.int64(0), inplace=True)
    
    print (f"\n>> Processing Column: '{col_name}'")
    
    # replace $ for USD and remove all "blanks"
    df_name.loc[:,col_name] = df_name.loc[:, col_name].str.replace("$", "USD ")
    df_name.loc[:,col_name] = df_name.loc[:, col_name].str.replace(",", "")
    print(df_name.loc[:,col_name].dtype)
    print(f">> DONE Replacing '$' with 'USD'!")
    
    # split currency and value into two cols
    
    try:
        new_col = f'{col_name}_currency'
        print(f'    >>> Creating new column named: "{new_col}"')
        df_name[new_col], df_name[col_name] = df_name[col_name].str.split(' ', 1).str
        print(df_name.loc[:,col_name].dtype)
        print(df_name.loc[:,new_col].dtype)
        print(f">> DONE Splitting Columns!")
              
    except ValueError:
        pass
    
    df_name[col_name].replace("", np.int64(0), inplace=True)
    print(f">> PROCESS COMPLETED !\n\n{50*('=')}\n{50*('=')}")
    
    return print(f">> PROCESS COMPLETED !\n\n{50*('=')}\n{50*('=')}")

In [5]:
def replace_str (df_name, col_name, to_be_repl, repl_to):
    
    """
    1. Replace a character to another character
    2. Usage:
        replace_str (data-frame-name, column-name, string-to-replace, replace-to-string):
    """
    print (f"\n>> Processing Column: '{col_name}'")
    
    df_name[col_name] = df_name.loc[:, col_name].str.replace(to_be_repl, repl_to)
    

    print(f">> DONE Replacing Character!\n\n{50*('=')}\n{50*('=')}")
    return df_name.head(10)

In [6]:
def to_str (df_name, col_name):
    
    """
    1. convert to string
    2. Usage:
        to_str (data-frame-name, column-name)
    """
    print (f"\n>> Processing Column: '{col_name}'")
    
   
    df_name[col_name] = df_name[col_name].astype('str') 
       

    print(f">> DONE Converting to String!\n\n{50*('=')}\n{50*('=')}")
    return df_name.head(10)

## Data Clean-up Process

### DATABASE RAW_DF

In [7]:
# define path to csv file
path=os.path.join("Resources_hle","IMDb_movies.csv")

# convert csv to pandas df
raw_df = pd.read_csv(path, encoding="UTF-8", low_memory=False) #, dtype={'usa_gross_income': "string", "worldwide_gross_income" : "string", "budget": "string"})

In [8]:
# review what imported
raw_df.head(20)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0
5,tt0002423,Madame DuBarry,Madame DuBarry,1919,1919-11-26,"Biography, Drama, Romance",85,Germany,German,Ernst Lubitsch,...,"Pola Negri, Emil Jannings, Harry Liedtke, Edua...","The story of Madame DuBarry, the mistress of L...",6.8,709,,,,,11.0,9.0
6,tt0002445,Quo Vadis?,Quo Vadis?,1913,1913-03-01,"Drama, History",120,Italy,Italian,Enrico Guazzoni,...,"Amleto Novelli, Gustavo Serena, Carlo Cattaneo...","An epic Italian film ""Quo Vadis"" influenced ma...",6.2,241,ITL 45000,,,,6.0,4.0
7,tt0002452,Independenta Romaniei,Independenta Romaniei,1912,1912-09-01,"History, War",120,Romania,,Aristide Demetriade,...,"Aristide Demetriade, Constanta Demetriade, Con...",The movie depicts the Romanian War of Independ...,6.7,187,ROL 400000,,,,3.0,1.0
8,tt0002461,Richard III,Richard III,1912,1912-10-15,Drama,55,"France, USA",English,"André Calmettes, James Keane",...,"Robert Gemp, Frederick Warde, Albert Gardner, ...",Richard of Gloucester uses manipulation and mu...,5.5,211,$ 30000,,,,7.0,1.0
9,tt0002646,Atlantis,Atlantis,1913,1913-12-26,Drama,121,Denmark,Danish,August Blom,...,"Olaf Fønss, Ida Orloff, Ebba Thomsen, Carl Lau...",After Dr. Friedrich's wife becomes mentally un...,6.7,310,,,,,9.0,9.0


In [9]:
# print out current columns with template to create a dictionary for columns rename
# un-comment to print out the template
# for col in raw_df.columns:
#     print(f'"{col}" : "__",')

In [10]:
# define what will be renamed
cols = {
"imdb_title_id" : "imdb_id",
"production_company" : "prod_co",
"avg_vote" : "user_rating",
"votes" : "vote_num",
"worlwide_gross_income" : "worldwide_gross_income",
"metascore" : "web_rating"
    }

# process new col names
raw_df.rename(columns=cols, inplace=True)
# raw_df.head(10)

In [11]:
# # fill empty space
# raw_df.fillna("0")
# raw_df.head(10)

In [12]:
# print out all current column names
# for col in raw_df.columns:
#     print(f'"{col}",')

### DATABASE PROCESSED_DF

In [13]:
# del processed_df

In [14]:
# if table exist, delete the table
try:
    del processed_df

except Exception:
    pass

processed_df = raw_df
processed_df

Unnamed: 0,imdb_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,user_rating,vote_num,budget,usa_gross_income,worldwide_gross_income,web_rating,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81268,tt9903716,Jessie,Jessie,2019,2019-03-15,"Horror, Thriller",106,India,Telugu,Aswani Kumar V.,...,"Sritha Chandana, Pavani Gangireddy, Abhinav Go...","Set in an abandoned house, the film follows a ...",7.2,219,,,,,21.0,
81269,tt9905412,Ottam,Ottam,2019,2019-03-08,Drama,120,India,Malayalam,Zam,...,"Nandu Anand, Roshan Ullas, Manikandan R. Achar...","Set in Trivandrum, the story of Ottam unfolds ...",7.8,510,INR 4000000,,$ 4791,,,
81270,tt9905462,Pengalila,Pengalila,2019,2019-03-08,Drama,111,India,Malayalam,T.V. Chandran,...,"Lal, Akshara Kishor, Iniya, Narain, Renji Pani...",An unusual bond between a sixty year old Dalit...,8.4,604,INR 10000000,,,,,
81271,tt9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,2019-03-08,Drama,130,India,Malayalam,Vineesh Aaradya,...,"Anoop Chandran, Indrans, Sona Nair, Simon Brit...",,8.4,369,,,,,,


In [15]:
# replacing "tt" character in id columns and convert id to integer
replace_str(processed_df, "imdb_id", "tt", "")


>> Processing Column: 'imdb_id'
>> DONE Replacing Character!



Unnamed: 0,imdb_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,user_rating,vote_num,budget,usa_gross_income,worldwide_gross_income,web_rating,reviews_from_users,reviews_from_critics
0,574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,1892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,2101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0
3,2130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...,7.0,2019,,,,,28.0,14.0
4,2199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1912,1913,"Biography, Drama",60,USA,English,Sidney Olcott,...,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ...",5.7,438,,,,,12.0,5.0
5,2423,Madame DuBarry,Madame DuBarry,1919,1919-11-26,"Biography, Drama, Romance",85,Germany,German,Ernst Lubitsch,...,"Pola Negri, Emil Jannings, Harry Liedtke, Edua...","The story of Madame DuBarry, the mistress of L...",6.8,709,,,,,11.0,9.0
6,2445,Quo Vadis?,Quo Vadis?,1913,1913-03-01,"Drama, History",120,Italy,Italian,Enrico Guazzoni,...,"Amleto Novelli, Gustavo Serena, Carlo Cattaneo...","An epic Italian film ""Quo Vadis"" influenced ma...",6.2,241,ITL 45000,,,,6.0,4.0
7,2452,Independenta Romaniei,Independenta Romaniei,1912,1912-09-01,"History, War",120,Romania,,Aristide Demetriade,...,"Aristide Demetriade, Constanta Demetriade, Con...",The movie depicts the Romanian War of Independ...,6.7,187,ROL 400000,,,,3.0,1.0
8,2461,Richard III,Richard III,1912,1912-10-15,Drama,55,"France, USA",English,"André Calmettes, James Keane",...,"Robert Gemp, Frederick Warde, Albert Gardner, ...",Richard of Gloucester uses manipulation and mu...,5.5,211,$ 30000,,,,7.0,1.0
9,2646,Atlantis,Atlantis,1913,1913-12-26,Drama,121,Denmark,Danish,August Blom,...,"Olaf Fønss, Ida Orloff, Ebba Thomsen, Carl Lau...",After Dr. Friedrich's wife becomes mentally un...,6.7,310,,,,,9.0,9.0


In [16]:


# extract day, month, year from the date_published columns
processed_df["date_published"] = pd.to_datetime(processed_df['date_published'])
processed_df['day'], processed_df['month'], processed_df['year']  = processed_df['date_published'].dt.day, processed_df['date_published'].dt.month, processed_df['date_published'].dt.year



In [17]:
# convert 'id' to numbers
to_int(processed_df, 'imdb_id')


>> Processing Column: 'imdb_id'
---->>> No issue observed
>> DONE Coverting to int64



Unnamed: 0,imdb_id,title,original_title,year,date_published,genre,duration,country,language,director,...,user_rating,vote_num,budget,usa_gross_income,worldwide_gross_income,web_rating,reviews_from_users,reviews_from_critics,day,month
0,574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,6.1,537,$ 2250,,,,7.0,7.0,26,12
1,1892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,5.9,171,,,,,4.0,2.0,19,8
2,2101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,5.2,420,$ 45000,,,,24.0,3.0,13,11
3,2130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,7.0,2019,,,,,28.0,14.0,6,3
4,2199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1913,1913-01-01,"Biography, Drama",60,USA,English,Sidney Olcott,...,5.7,438,,,,,12.0,5.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81268,9903716,Jessie,Jessie,2019,2019-03-15,"Horror, Thriller",106,India,Telugu,Aswani Kumar V.,...,7.2,219,,,,,21.0,,15,3
81269,9905412,Ottam,Ottam,2019,2019-03-08,Drama,120,India,Malayalam,Zam,...,7.8,510,INR 4000000,,$ 4791,,,,8,3
81270,9905462,Pengalila,Pengalila,2019,2019-03-08,Drama,111,India,Malayalam,T.V. Chandran,...,8.4,604,INR 10000000,,,,,,8,3
81271,9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,2019-03-08,Drama,130,India,Malayalam,Vineesh Aaradya,...,8.4,369,,,,,,,8,3


In [18]:
# for trouble shoot and debugging purposes
# break_point_here

In [19]:
col_to_covert = ['worldwide_gross_income', 'usa_gross_income', 'budget']

In [20]:
for ea_col in col_to_covert:
    conv_currency(processed_df, ea_col)
processed_df[['worldwide_gross_income', 'usa_gross_income', 'budget']]


>> Processing Column: 'worldwide_gross_income'
object
>> DONE Replacing '$' with 'USD'!
    >>> Creating new column named: "worldwide_gross_income_currency"
object
object
>> DONE Splitting Columns!
>> PROCESS COMPLETED !

>> PROCESS COMPLETED !


>> Processing Column: 'usa_gross_income'




object
>> DONE Replacing '$' with 'USD'!
    >>> Creating new column named: "usa_gross_income_currency"
object
object
>> DONE Splitting Columns!
>> PROCESS COMPLETED !

>> PROCESS COMPLETED !


>> Processing Column: 'budget'
object
>> DONE Replacing '$' with 'USD'!
    >>> Creating new column named: "budget_currency"
object
object
>> DONE Splitting Columns!
>> PROCESS COMPLETED !

>> PROCESS COMPLETED !



Unnamed: 0,worldwide_gross_income,usa_gross_income,budget
0,,,2250
1,,,
2,,,45000
3,,,
4,,,
...,...,...,...
81268,,,
81269,4791,,4000000
81270,,,10000000
81271,,,


In [21]:
# create an folder to hole temporarily exported data of converted data
# folder name that will store the sql-lite database
fol_name = "archieve"

# if exist print a message for user
if pathlib.Path(fol_name).exists():
    print(f' >> Folder "{fol_name}" already exists!\n >> NO new folder was created ...')
    pass

# if not make a new one and let user know
else:
    os.mkdir(fol_name)
    print(f'Successfully created folder "{fol_name}"')

# export to csv for visual inspection or further process if needed
processed_df[["worldwide_gross_income_currency", 'worldwide_gross_income', "usa_gross_income_currency", 'usa_gross_income', "budget_currency", 'budget']].to_csv("archieve\exported_draft.csv")


 >> Folder "archieve" already exists!
 >> NO new folder was created ...


In [22]:

# print out the column names inside df
# useful as being able to copy and paste directly into cell without retyping all col names
# uncheck to use
# for col in processed_df.columns:
#     print (f'"{col}",')


In [23]:
print(processed_df.dtypes)

imdb_id                                     int64
title                                      object
original_title                             object
year                                        int64
date_published                     datetime64[ns]
genre                                      object
duration                                    int64
country                                    object
language                                   object
director                                   object
writer                                     object
prod_co                                    object
actors                                     object
description                                object
user_rating                               float64
vote_num                                    int64
budget                                     object
usa_gross_income                           object
worldwide_gross_income                     object
web_rating                                float64


In [24]:
# specify columns to be converted to string
col_to_str = [
    "imdb_id",
    "title",
    "usa_gross_income_currency",
    "worldwide_gross_income_currency",
    "budget_currency",
    "country",
    "language",
    "genre",
    "director",
    "writer",
    "prod_co",
    "actors",
    "description"    
]

# loop and replace one by one
for col in col_to_str:
    to_str(processed_df, col)


>> Processing Column: 'imdb_id'
>> DONE Converting to String!


>> Processing Column: 'title'
>> DONE Converting to String!


>> Processing Column: 'usa_gross_income_currency'
>> DONE Converting to String!


>> Processing Column: 'worldwide_gross_income_currency'
>> DONE Converting to String!


>> Processing Column: 'budget_currency'
>> DONE Converting to String!


>> Processing Column: 'country'
>> DONE Converting to String!


>> Processing Column: 'language'
>> DONE Converting to String!


>> Processing Column: 'genre'
>> DONE Converting to String!


>> Processing Column: 'director'
>> DONE Converting to String!


>> Processing Column: 'writer'
>> DONE Converting to String!


>> Processing Column: 'prod_co'
>> DONE Converting to String!


>> Processing Column: 'actors'
>> DONE Converting to String!


>> Processing Column: 'description'
>> DONE Converting to String!



In [25]:
# specify columns to be converted to integer
col_to_int = [
    "imdb_id",
    "day",
    "month",
    "year",
    "user_rating",
    "web_rating",
    "vote_num",
    "reviews_from_users",
    "reviews_from_critics",
    "usa_gross_income",
    "worldwide_gross_income",
    "budget",
    "duration",
]

# loop and replace one by one
for col in col_to_int:
    to_int(processed_df, col)


>> Processing Column: 'imdb_id'
---->>> No issue observed
>> DONE Coverting to int64


>> Processing Column: 'day'
---->>> No issue observed
>> DONE Coverting to int64


>> Processing Column: 'month'
---->>> No issue observed
>> DONE Coverting to int64


>> Processing Column: 'year'
---->>> No issue observed
>> DONE Coverting to int64


>> Processing Column: 'user_rating'
---->>> No issue observed
>> DONE Coverting to int64


>> Processing Column: 'web_rating'
---->>> An Exception has occured :: Cannot convert non-finite values (NA or inf) to integer
---->>> Proceed with alternative routes, please wait...
>> DONE Coverting to int64


>> Processing Column: 'vote_num'
---->>> No issue observed
>> DONE Coverting to int64


>> Processing Column: 'reviews_from_users'
---->>> An Exception has occured :: Cannot convert non-finite values (NA or inf) to integer
---->>> Proceed with alternative routes, please wait...
>> DONE Coverting to int64


>> Processing Column: 'reviews_from_critics'
----

In [26]:
processed_df.dtypes

imdb_id                                     int64
title                                      object
original_title                             object
year                                        int64
date_published                     datetime64[ns]
genre                                      object
duration                                    int64
country                                    object
language                                   object
director                                   object
writer                                     object
prod_co                                    object
actors                                     object
description                                object
user_rating                                 int64
vote_num                                    int64
budget                                      int64
usa_gross_income                            int64
worldwide_gross_income                      int64
web_rating                                  int64


In [27]:
# re-arrange columns in the df
# build a list of what need to be included and their positions
cols=[
"imdb_id",
"title",

"date_published",
"day",
"month",
"year",

"user_rating",
"web_rating",

"vote_num",
"reviews_from_users",
"reviews_from_critics",
    
"usa_gross_income_currency",
"usa_gross_income",

"worldwide_gross_income_currency",
"worldwide_gross_income",

"budget_currency",
"budget",


"duration",
"country",
"language",
"genre",
"director",
"writer",
"prod_co",
"actors",
"description"
]


# re-arrange:
processed_df = processed_df[cols]

In [28]:
processed_df.to_csv("archieve\processed_df.csv")

In [29]:
# template to print out all columns and get ready for CLASS creation
# for col in  cols:
#     print (f'{col} = Column(    )')

In [30]:
# Table export description:
t_shp = processed_df.shape
print(f"TABLE DESCRIPTIONS:\n{('-')*30}\n\
>>> Number of Rows: {'{:,.0f}'.format(t_shp[0])}\n\
>>> Number of Columns: {t_shp[1]}")

TABLE DESCRIPTIONS:
------------------------------
>>> Number of Rows: 81,273
>>> Number of Columns: 26


In [31]:
# brk_here

# Part 2: 
---
# SQLAlchemy -> SQL Lite

### Create Engine & Connection to SQL Lite DB

In [32]:
# create declarative base
Base = declarative_base()

# check current table available in the Base - should be nothing at this point
Base.metadata.tables


immutabledict({})

In [33]:
# folder name that will store the sql-lite database
fol_name = "SQLiteDB_Exported"

# if exist print a message for user
if pathlib.Path(fol_name).exists():
    print(f' >> Folder "{fol_name}" already exists!\n >> NO new folder was created ...')
    pass

# if not make a new one and let user know
else:
    os.mkdir(fol_name)
    print(f'Successfully created folder "{fol_name}"')

 >> Folder "SQLiteDB_Exported" already exists!
 >> NO new folder was created ...


In [34]:
sqlite_db_path = os.path.join(fol_name,"ETL_movies.db")
engine = create_engine(f"sqlite:///{sqlite_db_path}")
conn = engine.connect()

In [35]:
engine.execute("DROP TABLE IF EXISTS movie_imdb")

<sqlalchemy.engine.result.ResultProxy at 0x213bee0f708>

In [36]:
# per docs found online, SQL Lite works up to int8
# while creating this file, I found lots of DataType Mismatch errors
# found this solutions as 2 lines belows to extend SQL to work with int64
# -------------------------------------------------------------
sqlite3.register_adapter(np.int64, lambda val: int(val))
sqlite3.register_adapter(np.int32, lambda val: int(val))

In [37]:
# print out list of cols for class creation
# i=0
# for col in processed_df.columns:
#     print(f'{col} = Column()')
#     i+=1
# print(f'\n{("-")*50}\n>> There are total {i} columns in the current data frame.')

### Create Template and Load Data from Python to SQL Lite DB

In [38]:
# class name == 'Movie' with table name 'movie'
# ------------------------------------------------------------------------------------

class Movie(Base):
    
    __tablename__ = 'imdb_movies'
    
    # leave this __table_args__ here will over ride all previous table made
    # new table with same name will be created with this class 
    __table_args__ = {'extend_existing': True} 
    
    id = Column(Integer, primary_key=True)
    imdb_id = Column(Integer)
    title = Column(String)
    date_published = Column(Date)
                   
    day = Column(Integer)
    month = Column(Integer)
    year = Column(Integer)
                   
    user_rating = Column(Integer)
    web_rating = Column(Integer)
    vote_num = Column(Integer)
    reviews_from_users = Column(Integer)
    reviews_from_critics = Column(Integer)
    
    usa_gross_income_currency = Column(String)
    usa_gross_income = Column(Integer)
    
    worldwide_gross_income_currency = Column(String)
    worldwide_gross_income = Column(Integer)
   
    budget_currency = Column(String)
    budget = Column(Integer)
       
    duration = Column(Integer)
                   
    country = Column(String)
    language = Column(String)
    genre = Column(String)
    director = Column(String)
    writer = Column(String)
    prod_co = Column(String)
    actors = Column(String)
    description = Column(String)

In [39]:
# Create a "Metadata" Layer That Abstracts our SQL Database
# this function upon executing will use the above class to make a schema
# and create a table in SQL Lite DB
# ----------------------------------
Base.metadata.create_all(engine)

In [40]:
# current in memory tables
# the table seeing here is what currently inside SQL Lite DB
Base.metadata.tables

immutabledict({'imdb_movies': Table('imdb_movies', MetaData(bind=None), Column('id', Integer(), table=<imdb_movies>, primary_key=True, nullable=False), Column('imdb_id', Integer(), table=<imdb_movies>), Column('title', String(), table=<imdb_movies>), Column('date_published', Date(), table=<imdb_movies>), Column('day', Integer(), table=<imdb_movies>), Column('month', Integer(), table=<imdb_movies>), Column('year', Integer(), table=<imdb_movies>), Column('user_rating', Integer(), table=<imdb_movies>), Column('web_rating', Integer(), table=<imdb_movies>), Column('vote_num', Integer(), table=<imdb_movies>), Column('reviews_from_users', Integer(), table=<imdb_movies>), Column('reviews_from_critics', Integer(), table=<imdb_movies>), Column('usa_gross_income_currency', String(), table=<imdb_movies>), Column('usa_gross_income', Integer(), table=<imdb_movies>), Column('worldwide_gross_income_currency', String(), table=<imdb_movies>), Column('worldwide_gross_income', Integer(), table=<imdb_movie

In [41]:
# redo again to make sure SQL lite register int64-variables
# -------------------------------------------------------------

sqlite3.register_adapter(np.int64, lambda val: int(val))
sqlite3.register_adapter(np.int32, lambda val: int(val))

In [42]:
# orm requires session so rollbacks can occur etc.
session = Session(bind=engine)

In [43]:
### Begin looping thru dataframe and load data into template

In [55]:
# looping over every row of the database and export data into SQL Lite

# ==========================================================

# specify how much data want to load, in fraction
# ---------------------------

print(f">>> There are total: \033[1;31m{'{:,.0f}'.format(t_shp[0])}\033[0m Records")
data_load_perc = int(input (f">>> How much data would you like to load?\n\
>>> HINT: if 20%, input whole number 20\n--->>>User input: "))# in percentage 

total_to_load = data_load_perc * t_shp[0] // 100 # use '//' to get the integer as the next function only accepts integer
print(f">>> Preparing to load \033[1;31m{'{:,.0f}'.format(total_to_load)}\033[0m ({data_load_perc}%) Records")

# ==========================================================

values = range(total_to_load)


>>> There are total: [1;31m81,273[0m Records
>>> How much data would you like to load?
>>> HINT: if 20%, input whole number 20
--->>>User input: 100
>>> Preparing to load [1;31m81,273[0m (100%) Records


In [None]:
# use progress bar to help user keep track of the process
# build the iter-row within this progress bar
# bar update code is inside the iter-row
# =========================================

i = 0



import time
time.sleep(10)

with tqdm(total=len(values)) as pbar:
    for index, row in processed_df.head(n=total_to_load).iterrows():

        
        # calculate the # of loaded data and percentage
       
        i +=1
        perc = round(i / total_to_load * 100, 2)
       
        
        # Print out message for percentage 
        print (f">>> Loading: \033[1;31m{'{:,.0f}'.format(i)}\033[0m Records | \033[1;32m{perc}%\033[0m Complete", "\r", end ='' , flush=True)
        
        # this is to update the progress bar
        pbar.update(1) 
        
               
        # get the data from cleaned df
        movie = Movie( 
        imdb_id = row['imdb_id'],
        title = row['title'],
        date_published = row['date_published'],
        day = row['day'],
        month = row['month'],
        year = row['year'],
        user_rating = row['user_rating'],
        web_rating = row['web_rating'],
        vote_num = row['vote_num'],
        reviews_from_users = row['reviews_from_users'],
        reviews_from_critics = row['reviews_from_critics'],
        usa_gross_income_currency = row['usa_gross_income_currency'],
        usa_gross_income = row['usa_gross_income'],
        worldwide_gross_income_currency = row['worldwide_gross_income_currency'],
        worldwide_gross_income = row['worldwide_gross_income'],
        budget_currency = row['budget_currency'],
        budget = row['budget'],
        duration = row['duration'],
        country = row['country'],
        language = row['language'],
        genre = row['genre'],
        director = row['director'],
        writer = row['writer'],
        prod_co = row['prod_co'],
        actors = row['actors'],
        description = row['description']
        )
        
        
        # add data to SQL lite session, DB
        session.add(movie)
print(">>> Finished loading all records into memory")

HBox(children=(FloatProgress(value=0.0, max=81273.0), HTML(value='')))

>>> Loading: [1;31m13,096[0m Records | [1;32m16.11%[0m Complete                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [46]:
# commit to hard write onto DB
print(">>> Prepare to write records into SQL Lite DB")
try:
    session.commit()
    print(">>> Successfully wrote all records into SQL Lite DB")
except Exception as errmess:
    print(">>> An Exception has occured ::", str(error))

>>> Prepare to write records into SQL Lite DB
>>> Successfully wrote all records into SQL Lite DB


In [47]:
# check if records are there, uncomment out to run if desire
# engine.execute("select * from Movie").fetchall()

In [48]:
# close out session after done loading data into db
session.close()
print(">>> All session(s) closed")

>>> All session(s) closed


# Part 3:
---
# Conver Jupyter Notebook to Python File

In [49]:
# define file name
python_file_name = 'SQLite_HLE.py'

# if there is already old file, then delete and reprocess a new one
if os.path.exists(python_file_name):
    os.remove(python_file_name)

# if exception raises, just skip the export process
try:
    !jupyter nbconvert --to python hle_IMDb.ipynb
    os.rename("hle_IMDb.py", python_file_name)
except Exception:
    print(Exception)
    pass

[NbConvertApp] Converting notebook hle_IMDb.ipynb to python
[NbConvertApp] Writing 17045 bytes to hle_IMDb.py
