### Import Modules/ Libs for Data Clean-up

In [1]:
import pandas as pd
import csv
import datetime as dt
from datetime import datetime
import numpy as np

### Import SQL-Alchemy for SQL-Lite Data Load

In [2]:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Float
from sqlalchemy.orm import Session
from sqlalchemy import func
import pathlib

### Data Clean-up Process

In [3]:
# define path to csv file
path="Resources_hle\IMDb_movies.csv"

# convert csv to pandas df
raw_df = pd.read_csv(path)

In [4]:
# review what imported
raw_df.head(3)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0


In [5]:
# describe the data    
raw_df.describe()

Unnamed: 0,year,duration,avg_vote,votes,metascore,reviews_from_users,reviews_from_critics
count,81273.0,81273.0,81273.0,81273.0,12722.0,74196.0,70286.0
mean,1993.00721,100.565981,5.926587,9421.771,55.762695,43.753194,27.992758
std,23.992284,25.320189,1.243315,52202.45,17.757453,159.903568,58.708764
min,1906.0,40.0,1.0,99.0,1.0,1.0,1.0
25%,1979.0,88.0,5.2,206.0,43.0,4.0,3.0
50%,2002.0,96.0,6.1,495.0,56.0,9.0,8.0
75%,2012.0,108.0,6.8,1865.0,69.0,26.0,24.0
max,2019.0,3360.0,10.0,2159628.0,100.0,8302.0,987.0


In [6]:
# extract day, month, year from the date_published columns
raw_df["date_published"] = pd.to_datetime(raw_df['date_published'])
raw_df['day'], raw_df['month'], raw_df['year']  = raw_df['date_published'].dt.day, raw_df['date_published'].dt.month, raw_df['date_published'].dt.year

In [7]:
# print out current columns with template to create a dictionary for columns rename 
for col in raw_df.columns:
    print(f'"{col}" : "",')

"imdb_title_id" : "",
"title" : "",
"original_title" : "",
"year" : "",
"date_published" : "",
"genre" : "",
"duration" : "",
"country" : "",
"language" : "",
"director" : "",
"writer" : "",
"production_company" : "",
"actors" : "",
"description" : "",
"avg_vote" : "",
"votes" : "",
"budget" : "",
"usa_gross_income" : "",
"worlwide_gross_income" : "",
"metascore" : "",
"reviews_from_users" : "",
"reviews_from_critics" : "",
"day" : "",
"month" : "",


In [32]:
# define what will be renamed
cols = {
"imdb_title_id" : "id",
"production_company" : "prod_co",
"avg_vote" : "user_rating",
"votes" : "vote_num",
"metascore" : "web_rating",
    }

# process new col names
raw_df.rename(columns=cols, inplace=True)
raw_df

Unnamed: 0,id,title,original_title,year,date_published,genre,duration,country,language,director,...,vote_num,budget,usa_gross_income,worlwide_gross_income,web_rating,reviews_from_users,reviews_from_critics,day,month,total_reviews
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,537,$ 2250,,,,7.0,7.0,26,12,14.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,171,,,,,4.0,2.0,19,8,6.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,420,$ 45000,,,,24.0,3.0,13,11,27.0
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,2019,,,,,28.0,14.0,6,3,42.0
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1913,1913-01-01,"Biography, Drama",60,USA,English,Sidney Olcott,...,438,,,,,12.0,5.0,1,1,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81268,tt9903716,Jessie,Jessie,2019,2019-03-15,"Horror, Thriller",106,India,Telugu,Aswani Kumar V.,...,219,,,,,21.0,,15,3,
81269,tt9905412,Ottam,Ottam,2019,2019-03-08,Drama,120,India,Malayalam,Zam,...,510,INR 4000000,,$ 4791,,,,8,3,
81270,tt9905462,Pengalila,Pengalila,2019,2019-03-08,Drama,111,India,Malayalam,T.V. Chandran,...,604,INR 10000000,,,,,,8,3,
81271,tt9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,2019-03-08,Drama,130,India,Malayalam,Vineesh Aaradya,...,369,,,,,,,8,3,


In [9]:
# # replace all NaN with empty
raw_df.fillna('', inplace=True)

In [11]:
# convert to numbers
raw_df['reviews_from_users'] = pd.to_numeric(raw_df["reviews_from_users"], downcast="integer")
raw_df['reviews_from_critics'] = pd.to_numeric(raw_df["reviews_from_critics"],  downcast="integer")
raw_df['web_rating'] = pd.to_numeric(raw_df["web_rating"],  downcast="integer")

In [12]:
# create a new col combining reviews from users and critics
raw_df['total_reviews'] = raw_df['reviews_from_users'] + raw_df['reviews_from_critics']

In [13]:
# print out all current column names
for col in raw_df.columns:
    print(f'"{col}",')

"id",
"title",
"original_title",
"year",
"date_published",
"genre",
"duration",
"country",
"language",
"director",
"writer",
"prod co",
"actors",
"description",
"user_rating",
"vote_num",
"budget",
"usa_gross_income",
"worlwide_gross_income",
"web_rating",
"reviews_from_users",
"reviews_from_critics",
"day",
"month",
"total_reviews",


In [14]:
# filter out un-needed columns and re-arrange columns
processed_df = raw_df[[
"id",
"title",
"day",
"month",
"year",
"date_published",
"user_rating",
"web_rating",
"vote_num",
"reviews_from_users",
"reviews_from_critics",
"total_reviews",
"budget",
"usa_gross_income",
"worlwide_gross_income",
"duration",
"country",
"language",
"genre",
"director",
"writer",
"prod co",
"actors",
"description"
]]

In [15]:
# processed_df['worlwide_gross_income'].dtypes

dtype('O')

In [None]:
# for index, row in processed_df.iterrows():
#     row['id'] = row['id'].replace("tt", "") 
#     row['worlwide_gross_income'] =  row['worlwide_gross_income'].replace("$", "")
#     row['worlwide_gross_income'] =  row['worlwide_gross_income'].replace(" ", "")
# processed_df

In [None]:
# check the non-numeric of the worlwide_gross_income
a = []
b = []


for ea_row in np.arange(0, len(processed_df['worlwide_gross_income'])):
    
    # for $ sign - USD
    if processed_df['worlwide_gross_income'][ea_row][0:2] and processed_df['worlwide_gross_income'][ea_row][0:2] not in a:
            a.append(processed_df['worlwide_gross_income'][ea_row][0:2])

     # for other currency
    if processed_df['worlwide_gross_income'][ea_row][0:4] and processed_df['worlwide_gross_income'][ea_row][0:4] not in b:
            b.append(processed_df['worlwide_gross_income'][ea_row][0:4])

print (a, "\n", b)    

In [None]:
# check the non-numeric of the usa_gross_income
a = []
b = []
for ea_row in np.arange(0, len(processed_df['usa_gross_income'])):
    if processed_df['usa_gross_income'][ea_row][0:1] and processed_df['usa_gross_income'][ea_row][0:2] not in a:
            a.append(processed_df['usa_gross_income'][ea_row][0:2])

    if processed_df['usa_gross_income'][ea_row][0:4] and processed_df['usa_gross_income'][ea_row][0:4] not in b:
            b.append(processed_df['usa_gross_income'][ea_row][0:4])

print (a, "\n", b)   

In [None]:
# check the non-numeric of the budget 
a = []
b = []
for ea_row in np.arange(0, len(processed_df['budget'])):
    if processed_df['budget'][ea_row][0:1] and processed_df['budget'][ea_row][0:2] not in a:
            a.append(processed_df['budget'][ea_row][0:2])

    if processed_df['budget'][ea_row][0:4] and processed_df['budget'][ea_row][0:4] not in b:
            b.append(processed_df['budget'][ea_row][0:4])


In [None]:
for ea_row in np.arange(0, len(processed_df['worlwide_gross_income'])):
    processed_df['id'][ea_row] = processed_df['id'][ea_row].replace("tt", "")
    if "$ " in processed_df['worlwide_gross_income'][ea_row]:
        processed_df['worlwide_gross_income'][ea_row] = processed_df['worlwide_gross_income'][ea_row]
        processed_df['worlwide_gross_income'][ea_row] = processed_df['worlwide_gross_income'][ea_row].replace("$ ", "")
        processed_df['worlwide_gross_income'][ea_row] = processed_df['worlwide_gross_income'][ea_row].replace("$ ", "")
        processed_df['worlwide_gross_income'][ea_row] = int (processed_df['worlwide_gross_income'][ea_row])
  
    elif "$" in processed_df['worlwide_gross_income'][ea_row]:
        processed_df['worlwide_gross_income'][ea_row] = processed_df['worlwide_gross_income'][ea_row]
        processed_df['worlwide_gross_income'][ea_row] = processed_df['worlwide_gross_income'][ea_row].replace("$", "")
        processed_df['worlwide_gross_income'][ea_row] = processed_df['worlwide_gross_income'][ea_row].replace("$ ", "")
        processed_df['worlwide_gross_income'][ea_row] = int (processed_df['worlwide_gross_income'][ea_row])
        
processed_df.head(10)   

### SQL-Alchemy to load clean data from pandas DataFrame to SQL Lite DataBase

In [16]:
# create declarative base
Base = declarative_base()

# check current table available in the Base - should be nothing at this point
Base.metadata.tables


immutabledict({})

In [17]:
# folder name that will store the sql-lite database
fol_name = "Exp_SqlLiteDb"
if pathlib.Path(fol_name).exists():
    print(f' >> Folder "{fol_name}" already exists!\n >> No new folder was created ...')
    pass
else:
    ! mkdir Exp_SqlLiteDb
    print(f'Successfully created folder "{fol_name}"')

 >> Folder "Exp_SqlLiteDb" already exists!
 >> No new folder was created ...


In [18]:
# # folder name that will store the sql-lite database
# database_name = "Exp_SqlLiteDb"
# if pathlib.Path(fol_name).exists():
#     print(f' >> Folder "{fol_name}" already exists!\n >> No new folder was created ...')
#     pass
# else:
#     ! mkdir Exp_SqlLiteDb
#     print(f'Successfully created folder "{fol_name}"')

In [19]:
sqlLite_db_path = "Exp_SqlLiteDb\movie.db"
engine = create_engine(f"sqlite:///{sqlLite_db_path}")
conn = engine.connect()

In [20]:
# print out list of cols for class creation
i=0
for col in processed_df.columns:
    print(f'{col} = Column()')
    i+=1
print(f'\n{("-")*50}\n>> There are total {i} columns in the current data frame.')

id = Column()
title = Column()
day = Column()
month = Column()
year = Column()
date_published = Column()
user_rating = Column()
web_rating = Column()
vote_num = Column()
reviews_from_users = Column()
reviews_from_critics = Column()
total_reviews = Column()
budget = Column()
usa_gross_income = Column()
worlwide_gross_income = Column()
duration = Column()
country = Column()
language = Column()
genre = Column()
director = Column()
writer = Column()
prod co = Column()
actors = Column()
description = Column()

--------------------------------------------------
>> There are total 24 columns in the current data frame.


In [22]:
# column types in pd_df
processed_df.dtypes

id                               object
title                            object
day                               int64
month                             int64
year                              int64
date_published           datetime64[ns]
user_rating                     float64
web_rating                      float64
vote_num                          int64
reviews_from_users              float64
reviews_from_critics            float64
total_reviews                   float64
budget                           object
usa_gross_income                 object
worlwide_gross_income            object
duration                          int64
country                          object
language                         object
genre                            object
director                         object
writer                           object
prod co                          object
actors                           object
description                      object
dtype: object

In [23]:
id                               object
title                            object
day                               int64
month                             int64
year                              int64
date_published           datetime64[ns]
user_rating                     float64
web_rating                      float64
vote_num                          int64
reviews_from_users              float64
reviews_from_critics            float64
total_reviews                   float64
budget                           object
usa_gross_income                 object
worlwide_gross_income            object
duration                          int64
country                          object
language                         object
genre                            object
director                         object
writer                           object
prod co                          object
actors                           object
description                      object

SyntaxError: invalid syntax (<ipython-input-23-52e30a9d342f>, line 1)

In [39]:
class Movie(Base):
    table_name = input("Please name your table.")
    __tablename__ = table_name
    id = Column(Integer, primary_key=True)
    title = Column(String(255))
    day = Column(Integer)
    month = Column(Integer)
    year = Column(Integer)
    date_published = Column(String(15))
    user_rating = Column(REAL)
    web_rating = Column(REAL)
    vote_num = Column(Integer)
    reviews_from_users = Column(Integer)
    reviews_from_critics = Column(Integer)
    total_reviews = Column(Integer)
    budget = Column(String(255))
    usa_gross_income = Column(String(255))
    worlwide_gross_income = Column(Interger)
    duration = Column(Interger)
    country = Column(String(50)
    language = Column(String(50))
    genre = Column(String(50))
    director = Column(String(50))
    writer = Column(String(50))
    prod_co = Column(String(50))
    actors = Column(String(50))
    description = Column(String(500))


SyntaxError: invalid syntax (<ipython-input-39-e63db3454201>, line 24)

In [24]:
# Create a "Metadata" Layer That Abstracts our SQL Database
# ----------------------------------
Base.metadata.create_all(engine)

In [25]:
# current in memory tables
Base.metadata.tables

immutabledict({})

In [26]:
# orm requires session so rollbacks can occur etc.
session = Session(bind=engine)

In [29]:
# print out all current column names
for col in processed_df.columns:
    print(f'{col} = ,')

id = ,
title = ,
day = ,
month = ,
year = ,
date_published = ,
user_rating = ,
web_rating = ,
vote_num = ,
reviews_from_users = ,
reviews_from_critics = ,
total_reviews = ,
budget = ,
usa_gross_income = ,
worlwide_gross_income = ,
duration = ,
country = ,
language = ,
genre = ,
director = ,
writer = ,
prod co = ,
actors = ,
description = ,


In [34]:
test = Movie(
id = 34364256,
title = "testing while running",
day = 25,
month = 4,
year = 2020,
date_published = 9-5-2001 ,
user_rating = 5.7 ,
web_rating = 7.6,
vote_num = 345,
reviews_from_users = 456,
reviews_from_critics = 234,
total_reviews = 836,
budget = 987654321,
usa_gross_income = 123456789,
worlwide_gross_income = 1234567891011,
duration = 70,
country = 'USA',
language = "American",
genre = "thriller",
director = " Mr. Jon Dhoe",
writer = "warming",
prod_co = "Universal",
actors = "Brad Pitt, Olando Bloom",
description = "This is a test movie just to make sure all is running well")

NameError: name 'Movie' is not defined

In [None]:
# for index, row in processed_df.iterrows():
#     row['id'] = row['id'].replace("tt", "") 
#     row['worlwide_gross_income'] =  row['worlwide_gross_income'].replace("$", "")
#     row['worlwide_gross_income'] =  row['worlwide_gross_income'].replace(" ", "")
# processed_df

In [None]:
# add one item
session.add(patterson)

In [None]:
session.commit()

In [None]:
session.commit()
engine.execute("select * from cat").fetchall()

In [None]:
session.close()