### Import Modules/ Libs for Data Clean-up

In [1]:
import pandas as pd
import csv
import datetime as dt
from datetime import datetime
import numpy as np

### Import SQL-Alchemy for SQL-Lite Data Load

In [2]:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Float
from sqlalchemy.orm import Session
from sqlalchemy import func
import pathlib

### Data Clean-up Process

In [3]:
# define path to csv file
path="Resources_hle\IMDb_movies.csv"

# convert csv to pandas df
raw_df = pd.read_csv(path)

In [4]:
# review what imported
raw_df.head(3)

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...,6.1,537,$ 2250,,,,7.0,7.0
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...,5.9,171,,,,,4.0,2.0
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...,5.2,420,$ 45000,,,,24.0,3.0


In [5]:
# describe the data    
raw_df.describe()

Unnamed: 0,year,duration,avg_vote,votes,metascore,reviews_from_users,reviews_from_critics
count,81273.0,81273.0,81273.0,81273.0,12722.0,74196.0,70286.0
mean,1993.00721,100.565981,5.926587,9421.771,55.762695,43.753194,27.992758
std,23.992284,25.320189,1.243315,52202.45,17.757453,159.903568,58.708764
min,1906.0,40.0,1.0,99.0,1.0,1.0,1.0
25%,1979.0,88.0,5.2,206.0,43.0,4.0,3.0
50%,2002.0,96.0,6.1,495.0,56.0,9.0,8.0
75%,2012.0,108.0,6.8,1865.0,69.0,26.0,24.0
max,2019.0,3360.0,10.0,2159628.0,100.0,8302.0,987.0


In [6]:
# extract day, month, year from the date_published columns
raw_df["date_published"] = pd.to_datetime(raw_df['date_published'])
raw_df['day'], raw_df['month'], raw_df['year']  = raw_df['date_published'].dt.day, raw_df['date_published'].dt.month, raw_df['date_published'].dt.year

In [7]:
# print out current columns with template to create a dictionary for columns rename 
for col in raw_df.columns:
    print(f'"{col}" : "",')

"imdb_title_id" : "",
"title" : "",
"original_title" : "",
"year" : "",
"date_published" : "",
"genre" : "",
"duration" : "",
"country" : "",
"language" : "",
"director" : "",
"writer" : "",
"production_company" : "",
"actors" : "",
"description" : "",
"avg_vote" : "",
"votes" : "",
"budget" : "",
"usa_gross_income" : "",
"worlwide_gross_income" : "",
"metascore" : "",
"reviews_from_users" : "",
"reviews_from_critics" : "",
"day" : "",
"month" : "",


In [8]:
# define what will be renamed
cols = {
"imdb_title_id" : "id",
"production_company" : "prod co",
"avg_vote" : "user_rating",
"votes" : "vote_num",
"metascore" : "web_rating",
    }

# process new col names
raw_df.rename(columns=cols, inplace=True)
raw_df

Unnamed: 0,id,title,original_title,year,date_published,genre,duration,country,language,director,...,user_rating,vote_num,budget,usa_gross_income,worlwide_gross_income,web_rating,reviews_from_users,reviews_from_critics,day,month
0,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,1906-12-26,"Biography, Crime, Drama",70,Australia,,Charles Tait,...,6.1,537,$ 2250,,,,7.0,7.0,26,12
1,tt0001892,Den sorte drøm,Den sorte drøm,1911,1911-08-19,Drama,53,"Germany, Denmark",,Urban Gad,...,5.9,171,,,,,4.0,2.0,19,8
2,tt0002101,Cleopatra,Cleopatra,1912,1912-11-13,"Drama, History",100,USA,English,Charles L. Gaskill,...,5.2,420,$ 45000,,,,24.0,3.0,13,11
3,tt0002130,L'Inferno,L'Inferno,1911,1911-03-06,"Adventure, Drama, Fantasy",68,Italy,Italian,"Francesco Bertolini, Adolfo Padovan",...,7.0,2019,,,,,28.0,14.0,6,3
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...","From the Manger to the Cross; or, Jesus of Naz...",1913,1913-01-01,"Biography, Drama",60,USA,English,Sidney Olcott,...,5.7,438,,,,,12.0,5.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81268,tt9903716,Jessie,Jessie,2019,2019-03-15,"Horror, Thriller",106,India,Telugu,Aswani Kumar V.,...,7.2,219,,,,,21.0,,15,3
81269,tt9905412,Ottam,Ottam,2019,2019-03-08,Drama,120,India,Malayalam,Zam,...,7.8,510,INR 4000000,,$ 4791,,,,8,3
81270,tt9905462,Pengalila,Pengalila,2019,2019-03-08,Drama,111,India,Malayalam,T.V. Chandran,...,8.4,604,INR 10000000,,,,,,8,3
81271,tt9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,2019-03-08,Drama,130,India,Malayalam,Vineesh Aaradya,...,8.4,369,,,,,,,8,3


In [9]:
# create a new col combining reviews from users and critics
raw_df['total_reviews'] = raw_df['reviews_from_users'] + raw_df['reviews_from_critics']

In [10]:
# print out all current column names
for col in raw_df.columns:
    print(f'"{col}",')

"id",
"title",
"original_title",
"year",
"date_published",
"genre",
"duration",
"country",
"language",
"director",
"writer",
"prod co",
"actors",
"description",
"user_rating",
"vote_num",
"budget",
"usa_gross_income",
"worlwide_gross_income",
"web_rating",
"reviews_from_users",
"reviews_from_critics",
"day",
"month",
"total_reviews",


In [11]:
# filter out un-needed columns and re-arrange columns
processed_df = raw_df[[
"id",
"title",
"day",
"month",
"year",
"date_published",
"user_rating",
"web_rating",
"vote_num",
"reviews_from_users",
"reviews_from_critics",
"total_reviews",
"budget",
"usa_gross_income",
"worlwide_gross_income",
"duration",
"country",
"language",
"genre",
"director",
"writer",
"prod co",
"actors",
"description"
]]

In [12]:
# replace all NaN with empty
processed_df.fillna('', inplace=True)
processed_df.head(5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  downcast=downcast,


Unnamed: 0,id,title,day,month,year,date_published,user_rating,web_rating,vote_num,reviews_from_users,...,worlwide_gross_income,duration,country,language,genre,director,writer,prod co,actors,description
0,tt0000574,The Story of the Kelly Gang,26,12,1906,1906-12-26,6.1,,537,7,...,,70,Australia,,"Biography, Crime, Drama",Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...
1,tt0001892,Den sorte drøm,19,8,1911,1911-08-19,5.9,,171,4,...,,53,"Germany, Denmark",,Drama,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...
2,tt0002101,Cleopatra,13,11,1912,1912-11-13,5.2,,420,24,...,,100,USA,English,"Drama, History",Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...
3,tt0002130,L'Inferno,6,3,1911,1911-03-06,7.0,,2019,28,...,,68,Italy,Italian,"Adventure, Drama, Fantasy","Francesco Bertolini, Adolfo Padovan",Dante Alighieri,Milano Film,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...",1,1,1913,1913-01-01,5.7,,438,12,...,,60,USA,English,"Biography, Drama",Sidney Olcott,Gene Gauntier,Kalem Company,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ..."


In [13]:
# replace all NaN with empty space
a = processed_df['worlwide_gross_income'][81269].replace("$ ", "")
processed_df['worlwide_gross_income'][81269] = a
print (a)

processed_df['worlwide_gross_income'][81269]

# .str.replace("$ ","", regex=True)
# processed_df.str.replace("$","", regex=True)
# print(processed_df['worlwide_gross_income'][81269])

4791


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


'4791'

In [14]:
for ea_row in np.arange(0, len(processed_df['worlwide_gross_income'])):
    temp_val = processed_df['worlwide_gross_income'][ea_row]
    if "$ " in temp_val:
        temp_val = temp_val.replace("$ ", "")
  
    elif "$" in temp_val:
        temp_val = temp_val.replace("$", "")
    
    elif "INR" in temp_val:
        temp_val = temp_val.replace("INR ", "")
   
    elif "INR" in temp_val:
        temp_val = temp_val.replace("INR", "")
        temp_val = int ()
processed_df       

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,id,title,day,month,year,date_published,user_rating,web_rating,vote_num,reviews_from_users,...,worlwide_gross_income,duration,country,language,genre,director,writer,prod co,actors,description
0,tt0000574,The Story of the Kelly Gang,26,12,1906,1906-12-26,6.1,,537,7,...,,70,Australia,,"Biography, Crime, Drama",Charles Tait,Charles Tait,J. and N. Tait,"Elizabeth Tait, John Tait, Norman Campbell, Be...",True story of notorious Australian outlaw Ned ...
1,tt0001892,Den sorte drøm,19,8,1911,1911-08-19,5.9,,171,4,...,,53,"Germany, Denmark",,Drama,Urban Gad,"Urban Gad, Gebhard Schätzler-Perasini",Fotorama,"Asta Nielsen, Valdemar Psilander, Gunnar Helse...",Two men of high rank are both wooing the beaut...
2,tt0002101,Cleopatra,13,11,1912,1912-11-13,5.2,,420,24,...,,100,USA,English,"Drama, History",Charles L. Gaskill,Victorien Sardou,Helen Gardner Picture Players,"Helen Gardner, Pearl Sindelar, Miss Fielding, ...",The fabled queen of Egypt's affair with Roman ...
3,tt0002130,L'Inferno,6,3,1911,1911-03-06,7.0,,2019,28,...,,68,Italy,Italian,"Adventure, Drama, Fantasy","Francesco Bertolini, Adolfo Padovan",Dante Alighieri,Milano Film,"Salvatore Papa, Arturo Pirovano, Giuseppe de L...",Loosely adapted from Dante's Divine Comedy and...
4,tt0002199,"From the Manger to the Cross; or, Jesus of Naz...",1,1,1913,1913-01-01,5.7,,438,12,...,,60,USA,English,"Biography, Drama",Sidney Olcott,Gene Gauntier,Kalem Company,"R. Henderson Bland, Percy Dyer, Gene Gauntier,...","An account of the life of Jesus Christ, based ..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81268,tt9903716,Jessie,15,3,2019,2019-03-15,7.2,,219,21,...,,106,India,Telugu,"Horror, Thriller",Aswani Kumar V.,Aswani Kumar V.,Ekaa Art Production,"Sritha Chandana, Pavani Gangireddy, Abhinav Go...","Set in an abandoned house, the film follows a ..."
81269,tt9905412,Ottam,8,3,2019,2019-03-08,7.8,,510,,...,4791,120,India,Malayalam,Drama,Zam,Rajesh k Narayan,Thomas Thiruvalla Films,"Nandu Anand, Roshan Ullas, Manikandan R. Achar...","Set in Trivandrum, the story of Ottam unfolds ..."
81270,tt9905462,Pengalila,8,3,2019,2019-03-08,8.4,,604,,...,,111,India,Malayalam,Drama,T.V. Chandran,T.V. Chandran,Benzy Productions,"Lal, Akshara Kishor, Iniya, Narain, Renji Pani...",An unusual bond between a sixty year old Dalit...
81271,tt9911774,Padmavyuhathile Abhimanyu,8,3,2019,2019-03-08,8.4,,369,,...,,130,India,Malayalam,Drama,Vineesh Aaradya,"Vineesh Aaradya, Vineesh Aaradya",RMCC Productions,"Anoop Chandran, Indrans, Sona Nair, Simon Brit...",


In [15]:
processed_df['worlwide_gross_income'].replace("$", "", regex=True)

0            
1            
2            
3            
4            
         ... 
81268        
81269    4791
81270        
81271        
81272    2833
Name: worlwide_gross_income, Length: 81273, dtype: object

### SQL-Alchemy to load clean data from pandas DataFrame to SQL Lite DataBase

In [16]:
# create declarative base
Base = declarative_base()

# check current table available in the Base - should be nothing at this point
Base.metadata.tables


immutabledict({})

In [17]:
# folder name that will store the sql-lite database
fol_name = "Exp_SqlLiteDb"
if pathlib.Path(fol_name).exists():
    print(f' >> Folder "{fol_name}" already exists!\n >> No new folder was created ...')
    pass
else:
    ! mkdir Exp_SqlLiteDb
    print(f'Successfully created folder "{fol_name}"')

 >> Folder "Exp_SqlLiteDb" already exists!
 >> No new folder was created ...


In [22]:
# # folder name that will store the sql-lite database
# database_name = "Exp_SqlLiteDb"
# if pathlib.Path(fol_name).exists():
#     print(f' >> Folder "{fol_name}" already exists!\n >> No new folder was created ...')
#     pass
# else:
#     ! mkdir Exp_SqlLiteDb
#     print(f'Successfully created folder "{fol_name}"')

In [19]:
sqlLite_db_path = "Exp_SqlLiteDb\movie.db"
engine = create_engine(f"sqlite:///{sqlLite_db_path}")
conn = engine.connect()

In [20]:
# print out list of cols for class creation
i=0
for col in processed_df.columns:
    print(f'{col} = Column()')
    i+=1
print(f'\n{("-")*50}\n>> There are total {i} columns in the current data frame.')

id = Column()
title = Column()
day = Column()
month = Column()
year = Column()
date_published = Column()
user_rating = Column()
web_rating = Column()
vote_num = Column()
reviews_from_users = Column()
reviews_from_critics = Column()
total_reviews = Column()
budget = Column()
usa_gross_income = Column()
worlwide_gross_income = Column()
duration = Column()
country = Column()
language = Column()
genre = Column()
director = Column()
writer = Column()
prod co = Column()
actors = Column()
description = Column()

--------------------------------------------------
>> There are total 24 columns in the current data frame.


In [21]:
class Movie(Base):
    table_name = input("Please name your table.")
    __tablename__ = table_name
    id = Column(Integer, primary_key=True)
    name = Column(String(255))
    color = Column(String(255))
    age = Column(Integer)

    id = Column(Integer, primary_key=True)
    title = Column(String(255))
    day = Column()
    month = Column()
    year = Column()
    date_published = Column()
    user_rating = Column()
    web_rating = Column()
    vote_num = Column()
    reviews_from_users = Column()
    reviews_from_critics = Column()
    total_reviews = Column()
    budget = Column()
    usa_gross_income = Column()
    worlwide_gross_income = Column(Interger)
    duration = Column(Interger)
    country = Column(String(50)
    language = Column(String(50))
    genre = Column(String(50))
    director = Column(String(50))
    writer = Column(String(50))
    prod co = Column(String(50))
    actors = Column(String(50))
    description = Column(String(500))


SyntaxError: invalid syntax (<ipython-input-21-742719c869ef>, line 26)