In [1]:
import pandas as pd
import json
from sqlalchemy import create_engine

In [2]:
# Function to process 1 to Many relationship
# Parameters: Original data frame and the col to be extracted/processed
# Return: a list of original values (no duplicates)
def rel_1M (original_df, col):
#     # extract the column into a working data frame
#     processing_df = original_df[[col]].copy()

    # Create a list that holds dictionaries with data from the complex column
    parent_data = []
    # Loop as many times as rows there are in the working data frame
    for idx in range(original_df.shape[0]):
        # Extract the data of the 1 to M column (parameter)
        col_str = original_df[col][idx]  
        # If column is evaluated as float, it means there is a NaN value and we skip it
        if type(col_str) != float:
            col_str = col_str.replace('None', '\'\'')
            # replace ' with " to and jsonify it     
            col_list = json.loads(col_str.replace('\'', '\"'))  
            # add the various values to a dictionary and the add then dictionary to a list
            parent_data.append({'id':col_list['id'], 'name':col_list['name'], 'poster_path':col_list['poster_path'], \
                             'backdrop_path':col_list['backdrop_path']})
            # The necessary entry in the original data (foreign key) is made
            original_df['collection_id'][idx] = col_list['id']

    # Deduplicate and turn the list into a dataframe
    parent_data_df = pd.DataFrame(parent_data).set_index('id').drop_duplicates().reset_index()
            
    # return the list (this will become the parent table)
    return parent_data_df

In [3]:
# Function to process M to Many relationship
# Parameters: Original data frame and the col to be extracted/processed
# Return: 
#    1) Parent table - this will use all the original information in the column. It will be dedupped
#    2) Association table - the id of the parent and the corresponding movie id (imdb_id)  
def rel_MM (original_df, col):
    # Create a list that holds dictionaries with data from the complex column
    parent_data = []
    # Create an association list with the id of the complex column and the movie id
    associate_data = []
    # Loop as many times as rows there are in the working data frame    
    for idx in range(original_df.shape[0]):
        # Extract the data of the M to M column (parameter)
        col_str = original_df[col][idx]
        # replace ' with " to and jsonify it 
        col_list = json.loads(col_str.replace('\'', '\"'))
        # The column contains a list of dictionaries. Iterate the list to get the infromation out
        for row in col_list:
            # add the various values to a dictionary and then add the dictionary to a list (parent table)
            parent_data.append({'id':row['id'], 'name':row['name']})
            # add the various values to a dictionary and then add the dictionary to a list (association table)
            associate_data.append({'genres_id':row['id'], 'imdb_id':original_df['imdb_id'][idx]})
    
    # Deduplicate and turn the list into a dataframe
    parent_data_df = pd.DataFrame(parent_data).set_index('id').drop_duplicates().reset_index()
    associate_data_df = pd.DataFrame(associate_data)
    
    # Return a dataframe with the parent and the association data
    return parent_data_df, associate_data_df

In [9]:
# Provide the path to the file
csv_file = "movies_metadata.csv"

# Read the file into a dataframe
movies_alldata_df = pd.read_csv(csv_file)

# Add column for collections, it will be popolated later
movies_alldata_df['collection_id'] = ""

# movies_alldata_df.head(2)

In [14]:
rds_connection_string = "postgres:Wel_Come#123@localhost:5432/movies_db"
engine = create_engine(f'postgresql://{rds_connection_string}')
connection = engine.connect

In [None]:
# Confirm tables
engine.table_names()

In [11]:
# Process 1 to M columns
cols_1M_lst = [{'col': 'belongs_to_collection', 'table': 'collection', 'primary': 'collection_id'}]

for col in cols_1M_lst:
    parent_df = rel_1M (movies_alldata_df, col['col'])
    # Write data frame to the table
    parent_df = parent_df.rename(columns={'id': col['primary']})
    parent_df.to_sql(name=col['table'], con=engine, if_exists='append', index=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


DataError: (psycopg2.DataError) value too long for type character varying(50)

[SQL: INSERT INTO collection (collection_id, backdrop_path, name, poster_path) VALUES (%(collection_id)s, %(backdrop_path)s, %(name)s, %(poster_path)s)]
[parameters: ({'collection_id': 10194, 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg', 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg'}, {'collection_id': 119050, 'backdrop_path': '/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg', 'name': 'Grumpy Old Men Collection', 'poster_path': '/nLvUdqgPgm3F85NMCii9gVFUcet.jpg'}, {'collection_id': 96871, 'backdrop_path': '/7qwE57OVZmMJChBpLEbJEmzUydk.jpg', 'name': 'Father of the Bride Collection', 'poster_path': '/nts4iOmNnq7GNicycMJ9pSAn204.jpg'}, {'collection_id': 645, 'backdrop_path': '/6VcVl48kNKvdXOZfJPdarlUGOsk.jpg', 'name': 'James Bond Collection', 'poster_path': '/HORpg5CSkmeQlAolx3bKMrKgfi.jpg'}, {'collection_id': 117693, 'backdrop_path': '/9VM5LiJV0bGb1st1KyHA3cVnO2G.jpg', 'name': 'Balto Collection', 'poster_path': '/w0ZgH6Lgxt2bQYnf1ss74UvYftm.jpg'}, {'collection_id': 3167, 'backdrop_path': '/bswWgdDsLu0fhWMYUzLF8XgiK4h.jpg', 'name': 'Ace Ventura Collection', 'poster_path': '/qCxH543pScFed1CycwJ1nVgrkOc.jpg'}, {'collection_id': 91698, 'backdrop_path': '/uWaANGQeoSs5vSP1CWtlkDrkqei.jpg', 'name': 'Chili Palmer Collection', 'poster_path': '/ae3smJDdWrMJ77tDpYOrpo4frKq.jpg'}, {'collection_id': 9435, 'backdrop_path': '/zRWuGdIlkWqXMQcs1t3sPy1XQ6x.jpg', 'name': 'Babe Collection', 'poster_path': '/jgnah0k4AumfcZE4HHMJywYHvG6.jpg'}  ... displaying 10 of 1695 total bound parameter sets ...  {'collection_id': 152918, 'backdrop_path': '/5uoPsNiFpUYNamSGqE8okN27VRK.jpg', 'name': 'Mister Blot Collection', 'poster_path': '/44PYEwwjGts8pAob59RHd6zlkKc.jpg'}, {'collection_id': 200641, 'backdrop_path': '/3fhHbLeO3DqdHvgHg5szs399eBb.jpg', 'name': 'Red Lotus Collection', 'poster_path': '/yf9Eod9ANXyHTzDpAxz9ezgvlL4.jpg'})]
(Background on this error at: http://sqlalche.me/e/9h9h)

In [16]:
parent_df.to_sql(name=col['table'], con=engine, if_exists='append', index=False)
trans.commit()
connection.close()

# try:
#     r1 = connection.execute(table1.select())
#     connection.execute(table1.insert(), col1=7, col2='this is some data')
#     trans.commit()
# except:
#     trans.rollback()
#     raise

DataError: (psycopg2.DataError) value too long for type character varying(50)

[SQL: INSERT INTO collection (collection_id, backdrop_path, name, poster_path) VALUES (%(collection_id)s, %(backdrop_path)s, %(name)s, %(poster_path)s)]
[parameters: ({'collection_id': 10194, 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg', 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg'}, {'collection_id': 119050, 'backdrop_path': '/hypTnLot2z8wpFS7qwsQHW1uV8u.jpg', 'name': 'Grumpy Old Men Collection', 'poster_path': '/nLvUdqgPgm3F85NMCii9gVFUcet.jpg'}, {'collection_id': 96871, 'backdrop_path': '/7qwE57OVZmMJChBpLEbJEmzUydk.jpg', 'name': 'Father of the Bride Collection', 'poster_path': '/nts4iOmNnq7GNicycMJ9pSAn204.jpg'}, {'collection_id': 645, 'backdrop_path': '/6VcVl48kNKvdXOZfJPdarlUGOsk.jpg', 'name': 'James Bond Collection', 'poster_path': '/HORpg5CSkmeQlAolx3bKMrKgfi.jpg'}, {'collection_id': 117693, 'backdrop_path': '/9VM5LiJV0bGb1st1KyHA3cVnO2G.jpg', 'name': 'Balto Collection', 'poster_path': '/w0ZgH6Lgxt2bQYnf1ss74UvYftm.jpg'}, {'collection_id': 3167, 'backdrop_path': '/bswWgdDsLu0fhWMYUzLF8XgiK4h.jpg', 'name': 'Ace Ventura Collection', 'poster_path': '/qCxH543pScFed1CycwJ1nVgrkOc.jpg'}, {'collection_id': 91698, 'backdrop_path': '/uWaANGQeoSs5vSP1CWtlkDrkqei.jpg', 'name': 'Chili Palmer Collection', 'poster_path': '/ae3smJDdWrMJ77tDpYOrpo4frKq.jpg'}, {'collection_id': 9435, 'backdrop_path': '/zRWuGdIlkWqXMQcs1t3sPy1XQ6x.jpg', 'name': 'Babe Collection', 'poster_path': '/jgnah0k4AumfcZE4HHMJywYHvG6.jpg'}  ... displaying 10 of 1695 total bound parameter sets ...  {'collection_id': 152918, 'backdrop_path': '/5uoPsNiFpUYNamSGqE8okN27VRK.jpg', 'name': 'Mister Blot Collection', 'poster_path': '/44PYEwwjGts8pAob59RHd6zlkKc.jpg'}, {'collection_id': 200641, 'backdrop_path': '/3fhHbLeO3DqdHvgHg5szs399eBb.jpg', 'name': 'Red Lotus Collection', 'poster_path': '/yf9Eod9ANXyHTzDpAxz9ezgvlL4.jpg'})]
(Background on this error at: http://sqlalche.me/e/9h9h)

In [13]:
connection.close()

NameError: name 'connection' is not defined

In [None]:
# Save movies information
# Extract the required columns
movies_df = movies_alldata_df[['imdb_id', 'adult', 'budget', 'homepage', 'id', 'original_language', 'original_title', \
                               'overview', 'popularity', 'poster_path', 'release_date', 'revenue', 'runtime', \
                               'status', 'tagline', 'title', 'video', 'vote_average', 'vote_count', 'collection_id']]
# Write data frame to the table
movies_df.to_sql(name='movie', con=engine, if_exists='append', index=False)

In [None]:
# Process M to M columns
cols_MM_lst = ['genres', 'production_companies', 'production_countries', 'spoken_languages']

for col in cols_MM_lst:
    print(col)
    parent_df, associate_df = rel_MM (movies_alldata_df, col)
    # Write to DB

In [None]:
parent_df.head()

In [None]:
associate_df.head()

In [None]:
!jupyter nbconvert --to script --output "Movies_ETL" Movies_ETL-MAG.ipynb