## Using pandas to ingest

In [2]:
import pandas as pd
from sqlalchemy import create_engine

# create a connection to postgres database
db_connection = create_engine(f"postgresql://postgres:abem1593574628@localhost:5432/trial_error")
display(db_connection.connect())

<sqlalchemy.engine.base.Connection at 0x28d8cc77fd0>

In [8]:
df_customers = pd.read_csv("./dataset/customers_dataset.csv")
display(df_customers.shape, df_customers.sample(3), df_customers.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  object
 1   customer_unique_id        99441 non-null  object
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(4)
memory usage: 3.8+ MB


(99441, 5)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
92119,6d59eb54b9df0cb6c0d67669819e02c5,2ad53a13954ae9d3d8e30a7e3ba1de18,31270,belo horizonte,MG
60663,5d9b63267a7c9e53f6cabd2a9a0bb708,0ae0a5af2c330de902fb04c10454ba81,78035,cuiaba,MT
27346,8ec31207d29ab8044ced7bfbc6595c3f,7ef15a4426641bd3343b717007cc01d0,22260,rio de janeiro,RJ


None

In [14]:
def read_dataframe(input_path):
    return pd.read_csv(input_path)

def table_header(df, table_name, db_connection, schema_name):
    df.head(n=0).to_sql(
        name=table_name, 
        con=db_connection, 
        schema=schema_name,
        if_exists='replace', 
        index=False
    )
    
def ingest_to_database(df, table_name, db_connection, schema_name):
    df.to_sql(
        name=table_name, 
        con=db_connection,
        schema=schema_name,
        if_exists='append', 
        index=False
    )

def main():
    db_connection = create_engine(f"postgresql://postgres:abem1593574628@localhost:5432/trial_error")
    
    dataset_name = "customers_dataset"
    table_name = dataset_name.split("_")[0]
    schema_name = 'ecommerce'
    
    df = read_dataframe(f"./dataset/{dataset_name}.csv")
    table_header(df, table_name, db_connection, schema_name)
    ingest_to_database(df, table_name, db_connection, schema_name)

main()

In [15]:
query = """
SELECT * FROM ecommerce.customers
LIMIT 10
"""
pd.read_sql(query, con=db_connection)

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP
5,879864dab9bc3047522c92c82e1212b8,4c93744516667ad3b8f1fb645a3116a4,89254,jaragua do sul,SC
6,fd826e7cf63160e536e0908c76c3f441,addec96d2e059c80c30fe6871d30d177,4534,sao paulo,SP
7,5e274e7a0c3809e14aba7ad5aae0d407,57b2a98a409812fe9618067b6b8ebe4f,35182,timoteo,MG
8,5adf08e34b2e993982a47070956c5c65,1175e95fb47ddff9de6b2b06188f7e0d,81560,curitiba,PR
9,4b7139f34592b3a31687243a302fa75b,9afe194fb833f79e300e37e580171f22,30575,belo horizonte,MG


In [18]:
dataset_name = "customers_dataset"
df1 = read_dataframe(f"./dataset/{dataset_name}.csv")
df1['customer_zip_code_prefix'].nunique()

14994

In [19]:
dataset_name = "geolocation_dataset"
df2 = read_dataframe(f"./dataset/{dataset_name}.csv")
df2['geolocation_zip_code_prefix'].nunique()

19015

## Upload dataset to database

In [None]:
# check all files name
!ls ./dataset

In [None]:
dataset_name = "geolocation_dataset"
df = pd.read_csv(f"./dataset/{dataset_name}.csv")
df.sample(3)

In [None]:
template = """
CREATE TABLE table_name (
    column_names
);
COPY table_name (column_names)
FROM 'C:/Users/Abrisyaf/Portofolio/business-performance-analysis/dataset/dataset_name.csv'
DELIMITER ','
CSV HEADER;
"""

In [None]:
draft = """
-- sellers data


-- product data



-- orders data
CREATE TABLE orders (
    "order_id" VARCHAR(32)
    "customer_id" VARCHAR(32),
    "order_status" VARCHAR,
    "order_purchase_timestamp" TIMESTAMP,
    "order_approved_at" TIMESTAMP,
    "order_delivered_carrier_date" TIMESTAMP,
    "order_delivered_customer_date" TIMESTAMP,
    "order_estimated_delivery_date" DATE
);

COPY orders ("order_id","customer_id","order_status","order_purchase_timestamp","order_approved_at",
             "order_delivered_carrier_date","order_delivered_customer_date","order_estimated_delivery_date")
FROM 'C:/Users/Abrisyaf/Portofolio/business-performance-analysis/dataset/orders_dataset.csv'
DELIMITER ','
CSV HEADER;

-- order reviews data
CREATE TABLE order_reviews (
    "review_id" VARCHAR(32),
    "order_id" VARCHAR(32),
    "review_score" INT,
    "review_comment_title" VARCHAR,
    "review_comment_message" VARCHAR,
    "review_creation_date" DATE,
    "review_answer_timestamp" TIMESTAMP
);

COPY order_reviews ("review_id","order_id","review_score","review_comment_title","review_comment_message","review_creation_date","review_answer_timestamp")
FROM 'C:/Users/Abrisyaf/Portofolio/business-performance-analysis/dataset/order_reviews_dataset.csv'
DELIMITER ','
CSV HEADER;


-- order payments data
CREATE TABLE order_payments (
    "order_id" VARCHAR(32),
    "payment_sequential" INT,
    "payment_type" VARCHAR(255),
    "payment_installments" INT,
    "payment_value" NUMERIC
);

COPY order_payments ("order_id","payment_sequential","payment_type","payment_installments","payment_value")
FROM 'C:/Users/Abrisyaf/Portofolio/business-performance-analysis/dataset/order_payments_dataset.csv'
DELIMITER ','
CSV HEADER;

-- order items data
CREATE TABLE order_items (
    "order_id" VARCHAR(32),
    "order_item_id" INT,
    "product_id" VARCHAR(32),
    "seller_id" VARCHAR(32),
    "shipping_limit_date" TIMESTAMP,
    "price" NUMERIC,
    "freight_value" NUMERIC
);

COPY order_items ("order_id","order_item_id","product_id","seller_id","shipping_limit_date","price","freight_value")
FROM 'C:/Users/Abrisyaf/Portofolio/business-performance-analysis/dataset/order_items_dataset.csv'
DELIMITER ','
CSV HEADER;

"""