## Import Modules/ Libs for Data Clean-up

In [1]:
import pandas as pd
import csv
import datetime as dt
from datetime import datetime
import numpy as np

## Import SQL-Alchemy for SQL-Lite Data Load

In [291]:
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy import Column, Integer, String, Float, Date
from sqlalchemy.orm import Session
from sqlalchemy import func
import pathlib
import sqlite3

## Define local functions to use

In [3]:
# convert to interger
def to_int (df_name, col_name):
    
    """
    1. Converting string, etc. into integer
    2. Usage:
        to_int (data-frame-name, column-name):
    """
    
    print (f"\n>> Processing Column: '{col_name}'")
    
    df_name[col_name] = pd.to_numeric(df_name[col_name], downcast="integer")
    
   
    print(f">> DONE Coverting to Integer | Float!\n\n{50*('=')}\n{50*('=')}")
   
    
    return df_name

In [4]:
def conv_currency (df_name, col_name):
    
    """
    1. Converting "$" to 'USD'
    2. Split column contains "currency" and "number" into 2 columns
        a. <original_column_name>_currency : tag of currency like 'USD', 'EUR', 'INR', etc.
    3. Usage: 
        conv_currency (data-frame-name, column-name)
    """
    
    print (f"\n>> Processing Column: '{col_name}'")
    
    # replace $ for USD and remove all "blanks"
    df_name.loc[:,col_name] = df_name.loc[:, col_name].str.replace("$", "USD ")
    df_name.loc[:,col_name] = df_name.loc[:, col_name].str.replace(",", "")
    print(f">> DONE Replacing '$' with 'USD'!")
    
    # split currency and value into two cols
    
    try:
        new_col = f'{col_name}_currency'
        print(f'    >>> Creating new column named: "{new_col}"')
        df_name[new_col], df_name[col_name] = df_name[col_name].str.split(' ', 1).str
        print(f">> DONE Splitting Columns!")
              
    except ValueError:
        pass
    
    df_name.fillna('', inplace=True)
    print(f">> PROCESS COMPLETED !\n\n{50*('=')}\n{50*('=')}")
    
    return df_name

In [5]:
def replace_str (df_name, col_name, to_be_repl, repl_to):
    
    """
    1. Replace a character to another character
    2. Usage:
        replace_str (data-frame-name, column-name, string-to-replace, replace-to-string):
    """
    print (f"\n>> Processing Column: '{col_name}'")
    
    df_name[col_name] = df_name.loc[:, col_name].str.replace(to_be_repl, repl_to)
    

    print(f">> DONE Replacing Character!\n\n{50*('=')}\n{50*('=')}")
    return df_name

In [6]:
def to_str (df_name, col_name):
    
    """
    1. convert to string
    2. Usage:
        to_str (data-frame-name, column-name)
    """
    print (f"\n>> Processing Column: '{col_name}'")
    
   
    df_name[col_name] = df_name[col_name].astype('str') 
       

    print(f">> DONE Converting to String!\n\n{50*('=')}\n{50*('=')}")
    return df_name

## Data Clean-up Process

### DATABASE RAW_DF

In [7]:
# define path to csv file
path="Resources_hle\IMDb_movies.csv"

# convert csv to pandas df
raw_df = pd.read_csv(path, encoding="UTF-8", dtype={'imdb_title_id': "string", 'production_company': "string", 'usa_gross_income': "string", "worldwide_gross_income" : "string", "budget": "string"})

1. choose all cols to keep (netflix id, imdb id, title, ddmmyy, dd, mo,yr,duration, avg_vote, votes, reviews_from_users	reviews_from_critics, budget,usa_gross_icome, ww income, coutry, type, genre, actor, description)
[[""]]
2. break the date published col into ye, mo, dd
3. table name messed up worlwide -->> worldwide
3. combine reviews
4. rename columns
5. load into SQL-pgadmin4
6. report
       a. where data found, how
       b. transforming process
       c. load SQL, pic of SQL

In [8]:
raw_df['production_company']

0                                Zyzzyx LLC
1        Compagnia Cinematografica Champion
2                           Medient Studios
3                      Les Films du Veyrier
4                                      <NA>
                        ...                
81268                      Milestone Movies
81269                                  <NA>
81270                   Ekaa Art Production
81271                      RMCC Productions
81272                           Gizem Ajans
Name: production_company, Length: 81273, dtype: string

In [9]:
# review what imported
raw_df.dtypes

imdb_title_id             string
title                     object
original_title            object
year                       int64
date_published            object
genre                     object
duration                   int64
country                   object
language                  object
director                  object
writer                    object
production_company        string
actors                    object
description               object
avg_vote                 float64
votes                      int64
budget                    string
usa_gross_income          string
worlwide_gross_income     object
metascore                float64
reviews_from_users       float64
reviews_from_critics     float64
dtype: object

In [10]:
# review what imported
raw_df

Unnamed: 0,imdb_title_id,title,original_title,year,date_published,genre,duration,country,language,director,...,actors,description,avg_vote,votes,budget,usa_gross_income,worlwide_gross_income,metascore,reviews_from_users,reviews_from_critics
0,tt0429277,Zyzzyx Rd,Zyzzyx Rd,2006,2/24/2006,"Crime, Drama, Thriller",90,USA,English,John Penney,...,"Leo Grillo, Katherine Heigl, Tom Sizemore, Ric...",The family man accountant Grant travels to Las...,4.0,930,"$2,000,000",$30,$30,,11.0,1.0
1,tt0070913,Che?,Che?,1972,12/8/1972,Comedy,114,"Italy, France, West Germany","English, Italian, French",Roman Polanski,...,"Marcello Mastroianni, Sydne Rome, Hugh Griffit...","During her Italian vacation, a young and beaut...",5.7,3256,,$64,,,23.0,37.0
2,tt1986953,Storage 24,Storage 24,2012,6/29/2012,"Action, Horror, Mystery",87,UK,English,Johannes Roberts,...,"Noel Clarke, Colin O'Donoghue, Antonia Campbel...","In London, a military plane crashes leaving it...",4.4,6309,,$72,"$646,175",52.0,65.0,88.0
3,tt1865335,Confession of a Child of the Century,Confession of a Child of the Century,2012,8/29/2012,Drama,120,"France, Germany, UK",English,Sylvie Verheyde,...,"Charlotte Gainsbourg, Pete Doherty, August Die...","Paris, 1830: Octave, betrayed by his mistress,...",4.4,514,EUR 4000000,$74,"$146,155",,4.0,23.0
4,tt4195920,Chicas paranoicas,Chicas paranoicas,2015,9/16/2016,Comedy,100,Spain,Spanish,Pedro del Santo,...,"Patricia Valley, Mairen Muñoz, Marta Mir Martí...",'Chicas Paranoicas' is the first Spanish comed...,8.0,169,,$78,$78,,,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81268,tt9894098,Sathru,Sathru,2019,3/7/2019,Thriller,129,India,,Naveen Nanjundan,...,"Srushti Dange, Kathir, Laguparan, Marimuthu, N...",A kidnapping gone wrong leads to mounting tens...,6.1,163,,,"$8,683",,7.0,1.0
81269,tt9899880,Columbus,Columbus,2018,12/5/2018,"Comedy, Drama",82,Iran,"Persian, English",Hatef Alimardani,...,"Farhad Aslani, Majid Salehi, Saeed Poursamimi,...",A rich family are deciding to immigrate to the...,4.0,130,,,,,,13.0
81270,tt9903716,Jessie,Jessie,2019,3/15/2019,"Horror, Thriller",106,India,Telugu,Aswani Kumar V.,...,"Sritha Chandana, Pavani Gangireddy, Abhinav Go...","Set in an abandoned house, the film follows a ...",7.2,219,,,,,21.0,
81271,tt9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,3/8/2019,Drama,130,India,Malayalam,Vineesh Aaradya,...,"Anoop Chandran, Indrans, Sona Nair, Simon Brit...",,8.4,369,,,,,,


In [11]:
# describe the data    
raw_df.describe()

Unnamed: 0,year,duration,avg_vote,votes,metascore,reviews_from_users,reviews_from_critics
count,81273.0,81273.0,81273.0,81273.0,12722.0,74196.0,70286.0
mean,1993.00721,100.565981,5.926587,9421.771,55.762695,43.753194,27.992758
std,23.992284,25.320189,1.243315,52202.45,17.757453,159.903568,58.708764
min,1906.0,40.0,1.0,99.0,1.0,1.0,1.0
25%,1979.0,88.0,5.2,206.0,43.0,4.0,3.0
50%,2002.0,96.0,6.1,495.0,56.0,9.0,8.0
75%,2012.0,108.0,6.8,1865.0,69.0,26.0,24.0
max,2019.0,3360.0,10.0,2159628.0,100.0,8302.0,987.0


In [12]:
# extract day, month, year from the date_published columns
raw_df["date_published"] = pd.to_datetime(raw_df['date_published'])
raw_df['day'], raw_df['month'], raw_df['year']  = raw_df['date_published'].dt.day, raw_df['date_published'].dt.month, raw_df['date_published'].dt.year


In [13]:
# print out current columns with template to create a dictionary for columns rename
# un-comment to print out the template
# for col in raw_df.columns:
#     print(f'"{col}" : "__",')

In [14]:
# define what will be renamed
cols = {
"imdb_title_id" : "id",
"production_company" : "prod_co",
"avg_vote" : "user_rating",
"votes" : "vote_num",
"worlwide_gross_income" : "worldwide_gross_income",
"metascore" : "web_rating",
    }

# process new col names
raw_df.rename(columns=cols, inplace=True)
# raw_df.head(10)

In [15]:
raw_df['prod_co']

0                                Zyzzyx LLC
1        Compagnia Cinematografica Champion
2                           Medient Studios
3                      Les Films du Veyrier
4                                      <NA>
                        ...                
81268                      Milestone Movies
81269                                  <NA>
81270                   Ekaa Art Production
81271                      RMCC Productions
81272                           Gizem Ajans
Name: prod_co, Length: 81273, dtype: string

In [16]:
# # replace all NaN with empty
raw_df.fillna('', inplace=True)
# raw_df.head(10)

In [17]:
raw_df['prod_co']

0                                Zyzzyx LLC
1        Compagnia Cinematografica Champion
2                           Medient Studios
3                      Les Films du Veyrier
4                                          
                        ...                
81268                      Milestone Movies
81269                                      
81270                   Ekaa Art Production
81271                      RMCC Productions
81272                           Gizem Ajans
Name: prod_co, Length: 81273, dtype: string

In [18]:
# print out all current column names
for col in raw_df.columns:
    print(f'"{col}",')

"id",
"title",
"original_title",
"year",
"date_published",
"genre",
"duration",
"country",
"language",
"director",
"writer",
"prod_co",
"actors",
"description",
"user_rating",
"vote_num",
"budget",
"usa_gross_income",
"worldwide_gross_income",
"web_rating",
"reviews_from_users",
"reviews_from_critics",
"day",
"month",


1. choose all cols to keep (netflix id, imdb id, title, ddmmyy, dd, mo,yr,duration, avg_vote, votes, reviews_from_users	reviews_from_critics, budget,usa_gross_icome, ww income, coutry, type, genre, actor, description)
[[""]]
2. break the date published col into ye, mo, dd
3. combine reviews
4. rename columns
5. load into SQL-pgadmin4
6. report
       a. where data found, how
       b. transforming process
       c. load SQL, pic of SQL

### DATABASE PROCESSED_DF

In [19]:
# del processed_df

In [20]:
# # filter out un-needed columns and re-arrange columns
try:
    del processed_df

except Exception:
    pass

processed_df = raw_df

In [21]:
# replacing "tt" character in id columns and convert id to integer
replace_str(processed_df, "id", "tt", "")



>> Processing Column: 'id'
>> DONE Replacing Character!



Unnamed: 0,id,title,original_title,year,date_published,genre,duration,country,language,director,...,user_rating,vote_num,budget,usa_gross_income,worldwide_gross_income,web_rating,reviews_from_users,reviews_from_critics,day,month
0,0429277,Zyzzyx Rd,Zyzzyx Rd,2006,2006-02-24,"Crime, Drama, Thriller",90,USA,English,John Penney,...,4.0,930,"$2,000,000",$30,$30,,11,1,24,2
1,0070913,Che?,Che?,1972,1972-12-08,Comedy,114,"Italy, France, West Germany","English, Italian, French",Roman Polanski,...,5.7,3256,,$64,,,23,37,8,12
2,1986953,Storage 24,Storage 24,2012,2012-06-29,"Action, Horror, Mystery",87,UK,English,Johannes Roberts,...,4.4,6309,,$72,"$646,175",52,65,88,29,6
3,1865335,Confession of a Child of the Century,Confession of a Child of the Century,2012,2012-08-29,Drama,120,"France, Germany, UK",English,Sylvie Verheyde,...,4.4,514,EUR 4000000,$74,"$146,155",,4,23,29,8
4,4195920,Chicas paranoicas,Chicas paranoicas,2016,2016-09-16,Comedy,100,Spain,Spanish,Pedro del Santo,...,8.0,169,,$78,$78,,,2,16,9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81268,9894098,Sathru,Sathru,2019,2019-03-07,Thriller,129,India,,Naveen Nanjundan,...,6.1,163,,,"$8,683",,7,1,7,3
81269,9899880,Columbus,Columbus,2018,2018-12-05,"Comedy, Drama",82,Iran,"Persian, English",Hatef Alimardani,...,4.0,130,,,,,,13,5,12
81270,9903716,Jessie,Jessie,2019,2019-03-15,"Horror, Thriller",106,India,Telugu,Aswani Kumar V.,...,7.2,219,,,,,21,,15,3
81271,9911774,Padmavyuhathile Abhimanyu,Padmavyuhathile Abhimanyu,2019,2019-03-08,Drama,130,India,Malayalam,Vineesh Aaradya,...,8.4,369,,,,,,,8,3


In [22]:
# convert 'id' to numbers
to_int(processed_df, 'id')
processed_df["id"]


>> Processing Column: 'id'
>> DONE Coverting to Integer | Float!



0         429277
1          70913
2        1986953
3        1865335
4        4195920
          ...   
81268    9894098
81269    9899880
81270    9903716
81271    9911774
81272    9914286
Name: id, Length: 81273, dtype: int32

In [23]:
# for trouble shoot and debugging purposes
# break_point_here

In [24]:
col_to_covert = ['worldwide_gross_income', 'usa_gross_income', 'budget']

In [25]:
for ea_col in col_to_covert:
    conv_currency(processed_df, ea_col)
processed_df[['worldwide_gross_income', 'usa_gross_income', 'budget']]


>> Processing Column: 'worldwide_gross_income'
>> DONE Replacing '$' with 'USD'!
    >>> Creating new column named: "worldwide_gross_income_currency"




>> DONE Splitting Columns!
>> PROCESS COMPLETED !


>> Processing Column: 'usa_gross_income'
>> DONE Replacing '$' with 'USD'!
    >>> Creating new column named: "usa_gross_income_currency"
>> DONE Splitting Columns!
>> PROCESS COMPLETED !


>> Processing Column: 'budget'
>> DONE Replacing '$' with 'USD'!
    >>> Creating new column named: "budget_currency"
>> DONE Splitting Columns!
>> PROCESS COMPLETED !



Unnamed: 0,worldwide_gross_income,usa_gross_income,budget
0,30,30,2000000
1,,64,
2,646175,72,
3,146155,74,4000000
4,78,78,
...,...,...,...
81268,8683,,
81269,,,
81270,,,
81271,,,


In [26]:

# create an folder to hole temporarily exported data of converted data
!mkdir archieve

# export to csv for visual inspection or further process if needed
processed_df[["worldwide_gross_income_currency", 'worldwide_gross_income', "usa_gross_income_currency", 'usa_gross_income', "budget_currency", 'budget']].to_csv("archieve\exported_draft.csv")


A subdirectory or file archieve already exists.


In [27]:

# print out the column names inside df
# useful as being able to copy and paste directly into cell without retyping all col names
# uncheck to use
# for col in processed_df.columns:
#     print (f'"{col}",')


In [28]:
print(processed_df.dtypes)

id                                          int32
title                                      object
original_title                             object
year                                        int64
date_published                     datetime64[ns]
genre                                      object
duration                                    int64
country                                    object
language                                   object
director                                   object
writer                                     object
prod_co                                    string
actors                                     object
description                                object
user_rating                               float64
vote_num                                    int64
budget                                     object
usa_gross_income                           object
worldwide_gross_income                     object
web_rating                                 object


In [29]:
processed_df.prod_co

0                                Zyzzyx LLC
1        Compagnia Cinematografica Champion
2                           Medient Studios
3                      Les Films du Veyrier
4                                          
                        ...                
81268                      Milestone Movies
81269                                      
81270                   Ekaa Art Production
81271                      RMCC Productions
81272                           Gizem Ajans
Name: prod_co, Length: 81273, dtype: string

In [30]:
# processed_df.prod_co = processed_df.prod_co.astype(str)

In [31]:
processed_df.title.dtype

dtype('O')

In [32]:
# # specify columns to be converted to string
# col_to_str = [
#     "id",
#     "title",
#     "usa_gross_income_currency",
#     "worldwide_gross_income_currency",
#     "budget_currency",
#     "country",
#     "language",
#     "genre",
#     "director",
#     "writer",
#     "prod_co",
#     "actors",
#     "description"    
# ]

# # loop and replace one by one
# for col in col_to_str:
#     to_str(processed_df, col)

In [33]:
# break_here

In [34]:
# specify columns to be converted to integer
col_to_int = [
    "id",
    "day",
    "month",
    "year",
    "user_rating",
    "web_rating",
    "vote_num",
    "reviews_from_users",
    "reviews_from_critics",
    "usa_gross_income",
    "worldwide_gross_income",
    "budget",
    "duration",
]

# loop and replace one by one
for col in col_to_int:
    to_int(processed_df, col)


>> Processing Column: 'id'
>> DONE Coverting to Integer | Float!


>> Processing Column: 'day'
>> DONE Coverting to Integer | Float!


>> Processing Column: 'month'
>> DONE Coverting to Integer | Float!


>> Processing Column: 'year'
>> DONE Coverting to Integer | Float!


>> Processing Column: 'user_rating'
>> DONE Coverting to Integer | Float!


>> Processing Column: 'web_rating'
>> DONE Coverting to Integer | Float!


>> Processing Column: 'vote_num'
>> DONE Coverting to Integer | Float!


>> Processing Column: 'reviews_from_users'
>> DONE Coverting to Integer | Float!


>> Processing Column: 'reviews_from_critics'
>> DONE Coverting to Integer | Float!


>> Processing Column: 'usa_gross_income'
>> DONE Coverting to Integer | Float!


>> Processing Column: 'worldwide_gross_income'
>> DONE Coverting to Integer | Float!


>> Processing Column: 'budget'
>> DONE Coverting to Integer | Float!


>> Processing Column: 'duration'
>> DONE Coverting to Integer | Float!




web_rating                                 object  - float
reviews_from_users                         object  -int
reviews_from_critics                       object    - int
total_reviews                              object     - int
usa_gross_income                           object     - int
worldwide_gross_income                     object     - int
budget                                     object      - int

In [35]:
# re-arrange columns in the df
# build a list of what need to be included and their positions
cols=[
"id",
"title",

"date_published",
"day",
"month",
"year",

"user_rating",
"web_rating",

"vote_num",
"reviews_from_users",
"reviews_from_critics",
    
"usa_gross_income_currency",
"usa_gross_income",

"worldwide_gross_income_currency",
"worldwide_gross_income",

"budget_currency",
"budget",


"duration",
"country",
"language",
"genre",
"director",
"writer",
"prod_co",
"actors",
"description"
]


# re-arrange:
processed_df = processed_df[cols]

In [36]:
processed_df.to_csv("archieve\processed_df.csv")

In [37]:
# template to print out all columns and get ready for CLASS creation
# for col in  cols:
#     print (f'{col} = Column(    )')

In [221]:
processed_df.index[0]

0

### SQL-Alchemy to load clean data from pandas DataFrame to SQL Lite DataBase

In [258]:
# create declarative base
Base = declarative_base()

# check current table available in the Base - should be nothing at this point
Base.metadata.tables


immutabledict({})

In [259]:
# folder name that will store the sql-lite database
fol_name = "Exp_SqlLiteDb"
if pathlib.Path(fol_name).exists():
    print(f' >> Folder "{fol_name}" already exists!\n >> No new folder was created ...')
    pass
else:
    ! mkdir Exp_SqlLiteDb
    print(f'Successfully created folder "{fol_name}"')

 >> Folder "Exp_SqlLiteDb" already exists!
 >> No new folder was created ...


In [260]:
sqlLite_db_path = "Exp_SqlLiteDb\movie.db"
engine = create_engine(f"sqlite:///{sqlLite_db_path}")
conn = engine.connect()

In [261]:
# print out list of cols for class creation
i=0
for col in processed_df.columns:
    print(f'{col} = Column()')
    i+=1
print(f'\n{("-")*50}\n>> There are total {i} columns in the current data frame.')

id = Column()
title = Column()
date_published = Column()
day = Column()
month = Column()
year = Column()
user_rating = Column()
web_rating = Column()
vote_num = Column()
reviews_from_users = Column()
reviews_from_critics = Column()
usa_gross_income_currency = Column()
usa_gross_income = Column()
worldwide_gross_income_currency = Column()
worldwide_gross_income = Column()
budget_currency = Column()
budget = Column()
duration = Column()
country = Column()
language = Column()
genre = Column()
director = Column()
writer = Column()
prod_co = Column()
actors = Column()
description = Column()

--------------------------------------------------
>> There are total 26 columns in the current data frame.


In [262]:
processed_df.dtypes

id                                          int32
title                                      object
date_published                     datetime64[ns]
day                                          int8
month                                        int8
year                                        int16
user_rating                               float64
web_rating                                float64
vote_num                                    int32
reviews_from_users                        float64
reviews_from_critics                      float64
usa_gross_income_currency                  object
usa_gross_income                          float64
worldwide_gross_income_currency            object
worldwide_gross_income                    float64
budget_currency                            object
budget                                    float64
duration                                    int16
country                                    object
language                                   object


In [263]:
class Movie(Base):
    
    __tablename__ = 'movie'
    __table_args__ = {'extend_existing': True} 
    
    id = Column(Integer, primary_key=True)
    title = Column(String)
                   
#     date_published = Column(String)
                   
    day = Column(Integer)
#     month = Column(Integer)
#     year = Column(Integer)
                   
#     user_rating = Column(Float)
#     web_rating = Column(Float)
#     vote_num = Column(Integer)
#     reviews_from_users = Column(Integer)
#     reviews_from_critics = Column(Integer)
    
#     usa_gross_income_currency = Column(String(10))
#     usa_gross_income = Column(Integer)
    
#     worldwide_gross_income_currency = Column(String(10))
#     worldwide_gross_income = Column(Integer)
   
#     budget_currency = Column(String(10))
#     budget = Column(Integer)
       
#     duration = Column(Integer)
                   
#     country = Column(String(50))
#     language = Column(String(50))
#     genre = Column(String(50))
#     director = Column(String(50))
#     writer = Column(String(50))
#     prod_co = Column(String(50))
#     actors = Column(String(50))
#     description = Column(String(2000))
    
#     [parameters: (429277, 
#                   'Zyzzyx Rd',
#                   '2006-02-24',
#                   24,
#                   2,
#                   2006,
#                   4.0,
#                   nan,
#                   930,
#                   11.0,
#                   1.0,
#                   'USD',
#                   30.0,
#                   'USD', 30.0, 'USD', 2000000.0, 90, 'USA', 'English', 'Crime, Drama, Thriller', 'John Penney', 'John Penney', 'Zyzzyx LLC', 'Leo Grillo, Katherine Heigl, Tom Sizemore, Rickey Medlocke, Yorlin Madera, Meguire Grillo, Di Koob, Nancy Linari', 'The family man accountant Grant travels to Las Vegas and meets the lascivious "Lolita" Marissa in a casino. While in the motel with Marissa, her violent ex-boyfriend Joey surprises them on ...')]

In [264]:
# Create a "Metadata" Layer That Abstracts our SQL Database
# ----------------------------------
Base.metadata.create_all(engine)

In [265]:
# current in memory tables
Base.metadata.tables

immutabledict({'movie': Table('movie', MetaData(bind=None), Column('id', Integer(), table=<movie>, primary_key=True, nullable=False), Column('title', String(), table=<movie>), Column('day', Integer(), table=<movie>), schema=None)})

In [292]:
sqlite3.register_adapter(np.int64, lambda val: int(val))
sqlite3.register_adapter(np.int32, lambda val: int(val))

In [266]:
# orm requires session so rollbacks can occur etc.
session = Session(bind=engine)

In [267]:
processed_df.dtypes



id                                          int32
title                                      object
date_published                     datetime64[ns]
day                                          int8
month                                        int8
year                                        int16
user_rating                               float64
web_rating                                float64
vote_num                                    int32
reviews_from_users                        float64
reviews_from_critics                      float64
usa_gross_income_currency                  object
usa_gross_income                          float64
worldwide_gross_income_currency            object
worldwide_gross_income                    float64
budget_currency                            object
budget                                    float64
duration                                    int16
country                                    object
language                                   object


In [268]:
# print out all current column names 
# for col in processed_df.columns:
#     print(f"{col} = processed_df['{col}'][0],")

In [293]:
# processed_df['id'][]
day = processed_df['day'][5]
day


18

In [294]:
movie1 = Movie(

    id = processed_df.index[5],
    title = processed_df['title'][5],
#     date_published = processed_df['date_published'][0],
    day = np.int64(processed_df['day'][5])
#     month = processed_df['month'][0],
#     year = processed_df['year'][0]
#     user_rating = processed_df['user_rating'][0],
#     web_rating = 5 
#     vote_num = processed_df['vote_num'][0],
#     reviews_from_users = processed_df['reviews_from_users'][0],
#     reviews_from_critics = processed_df['reviews_from_critics'][0],
#     usa_gross_income_currency = processed_df['usa_gross_income_currency'][0],
#     usa_gross_income = processed_df['usa_gross_income'][0],
#     worldwide_gross_income_currency = processed_df['worldwide_gross_income_currency'][0],
#     worldwide_gross_income = processed_df['worldwide_gross_income'][0],
#     budget_currency = processed_df['budget_currency'][0],
#     budget = processed_df['budget'][0],
#     duration = processed_df['duration'][0],
#     country = processed_df['country'][0],
#     language = processed_df['language'][0],
#     genre = processed_df['genre'][0],
#     director = processed_df['director'][0],
#     writer = processed_df['writer'][0],
#     prod_co = processed_df['prod_co'][0],
#     actors = processed_df['actors'][0],
#     description = processed_df['description'][0]  
                )



In [271]:
# movie3 = Movie(id=5, day=30)
# movie3.day

In [272]:
# movie1.web_rating

In [273]:
# for index, row in processed_df.iterrows():
#     row['id'] = row['id'].replace("tt", "") 
#     row['worldwide_gross_income'] =  row['worldwide_gross_income'].replace("$", "")
#     row['worldwide_gross_income'] =  row['worldwide_gross_income'].replace(" ", "")
# processed_df

In [295]:
# add one item
session.add(movie1)


In [285]:
# session.rollback()

In [296]:
session.commit()

In [297]:
session.commit()
engine.execute("select * from Movie").fetchall()

[(0, 'Zyzzyx Rd', b'\x18'),
 (2, 'Storage 24', b'\x1d\x00\x00\x00\x00\x00\x00\x00'),
 (5, 'Perro come perro', 18)]

In [239]:
session.close()