# Initial exploratory analysis

In [1]:
import pandas as pd

In [2]:
# Load the customers file (1 of the 9 Olist CSV files) into a DataFrame
CUSTOMERS_CSV = 'olist_customers_dataset.csv'
customers = pd.read_csv(CUSTOMERS_CSV)

In [3]:
# Report the dataset dimensions: .shape is a (rows, columns) tuple
shape = customers.shape
n_rows, n_cols = shape
print(f'There are {n_rows} rows and {n_cols} columns.')

There are 99441 rows and 5 columns.


In [4]:
# Inspect the dtype pandas inferred for each column. Note that
# customer_zip_code_prefix is read as an integer, so any leading zeros in the
# zip code prefix are dropped (they are restored in a later cell).
customers.dtypes

customer_id                 object
customer_unique_id          object
customer_zip_code_prefix     int64
customer_city               object
customer_state              object
dtype: object

In [5]:
# Count fully duplicated rows: duplicated() flags every row that repeats an
# earlier row across all columns, and summing the boolean mask counts them
is_duplicate = customers.duplicated()
dup = is_duplicate.sum()
print(f'There are {dup} duplicated rows.')

There are 0 duplicated rows.


In [6]:
# Preview the first five rows to get a feel for the content of each column
customers.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


## Checking the number of orders for each unique customer

In [7]:
# Top 20 unique customers ranked by how many rows (customer_id values) they own.
# count() tallies non-null values per column within each customer_unique_id
# group; the result is sorted by the customer_id count, largest first.
# NOTE(review): presumably one customer_id is issued per order - confirm
# against the orders dataset.
(
    customers
    .groupby('customer_unique_id')
    .count()
    .sort_values('customer_id', ascending=False)
    .head(20)
)

Unnamed: 0_level_0,customer_id,customer_zip_code_prefix,customer_city,customer_state
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8d50f5eadf50201ccdcedfb9e2ac8455,17,17,17,17
3e43e6105506432c953e165fb2acf44c,9,9,9,9
6469f99c1f9dfae7733b25662e7f1782,7,7,7,7
ca77025e7201e3b30c44b472ff346268,7,7,7,7
1b6c7548a2a1f9037c1fd3ddfed95f33,7,7,7,7
12f5d6e1cbf93dafd9dcc19095df0b3d,6,6,6,6
de34b16117594161a6a89c50b289d35a,6,6,6,6
63cfc61cee11cbe306bff5857d00bfe4,6,6,6,6
f0e310a6839dce9de1638e0fe5ab282a,6,6,6,6
47c1a3033b8b77b3ab6e109eb4d5fdf3,6,6,6,6


## Restoring the zip codes' leading zeros, lost because the column was read as an integer datatype

In [8]:
# Convert the zip code prefix to text and left-pad it with '0' up to a width of
# 5 characters. Padding only adds characters: values that are already 5 or more
# characters long are left unchanged.
customers['customer_zip_code_prefix'] = (
    customers['customer_zip_code_prefix']
    .astype(str)
    .str.rjust(5, fillchar='0')
)

In [9]:
# Re-display the first five rows to check the padded zip code prefixes
customers.head()

Unnamed: 0,customer_id,customer_unique_id,customer_zip_code_prefix,customer_city,customer_state
0,06b8999e2fba1a1fbc88172c00ba8bc7,861eff4711a542e4b93843c6dd7febb0,14409,franca,SP
1,18955e83d337fd6b2def6b18a428ac77,290c77bc529b7ac935b93aa66c333dc3,9790,sao bernardo do campo,SP
2,4e7b3e00288586ebd08712fdd0374a03,060e732b5b29e8181a18229c7b0b2b5e,1151,sao paulo,SP
3,b2b6027bc5c5109e529d4dc6358b12c3,259dac757896d24d7702b9acbbff3f3c,8775,mogi das cruzes,SP
4,4f2d8ab171c80ec8364f7c12e35b23ad,345ecd01c38d18a9036ed96c73b8d066,13056,campinas,SP


# Load the DataFrame into the database as a SQL table

In [10]:
# Use SQLAlchemy to connect to the database and load the customers table
import os

import sqlalchemy
from sqlalchemy import create_engine

# Read the connection URL from the environment so that no credentials are
# stored in the notebook. Expected format:
#   postgresql+psycopg2://<user>:<password>@<host>:5432/<database>
# NOTE(review): the password that used to be hardcoded in this cell has been
# exposed wherever the notebook was shared and should be rotated.
database_url = os.environ.get('OLIST_DATABASE_URL')
if not database_url:
    raise RuntimeError(
        'Set the OLIST_DATABASE_URL environment variable to the SQLAlchemy '
        'connection URL before running this cell.'
    )
engine = create_engine(database_url)

# Write the DataFrame to the "customers" table. if_exists='replace' drops and
# recreates an existing table; index=False keeps the DataFrame index out of it.
customers.to_sql('customers', engine, index=False, if_exists='replace')

print('DataFrame loaded to table "customers" successfully!')

DataFrame loaded to table "customers" successfully!


## Drop some columns to reduce redundancy with other datasets

In [None]:
# Drops city and state columns
# customers = customers.drop(['customer_city', 'customer_state'], axis=1)

In [None]:
# Check the result to see the remaining columns
# customers.head()