In [152]:
import pandas as pd
import numpy as np

from display_config import enable_clean_display
from db_connector import fetch_data

In [153]:
# Configures horizontal scrolling for wide DataFrames; pandas' built-in display
# options alone were not producing it.
enable_clean_display()

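Sales tables and geographical data are in separate schemas (`sales` and `application`), and cross-schema joins are not available in this setup.<br>
(Note: PostgreSQL itself permits cross-schema joins within a single database; only cross-database joins are disallowed, so this restriction is presumably specific to this environment.)<br>
So run one query per schema into its own DataFrame, then merge the results in pandas.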

In [154]:
# Load invoice-line rows from the `sales` schema, joined to their invoice,
# order and customer so that each line carries the order dates, the confirmed
# delivery time and the customer's delivery city key (used later to attach
# geography).
sales_df = fetch_data('''
    SELECT
        il.invoice_line_id,
        il.invoice_id,
        il.stock_item_id,
        il.quantity,
        il.unit_price,
        il.extended_price,
        o.order_date,
        o.expected_delivery_date,
        i.confirmed_delivery_time,
        c.delivery_city_id
    FROM sales.invoice_lines il
        INNER JOIN sales.invoices i
            ON il.invoice_id = i.invoice_id
        INNER JOIN sales.orders o
            ON i.order_id = o.order_id
        INNER JOIN sales.customers c
            ON o.customer_id = c.customer_id;
''')

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228265 entries, 0 to 228264
Data columns (total 10 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   invoice_line_id          228265 non-null  int64         
 1   invoice_id               228265 non-null  int64         
 2   stock_item_id            228265 non-null  int64         
 3   quantity                 228265 non-null  int64         
 4   unit_price               228265 non-null  float64       
 5   extended_price           228265 non-null  float64       
 6   order_date               228265 non-null  object        
 7   expected_delivery_date   228265 non-null  object        
 8   confirmed_delivery_time  227981 non-null  datetime64[ns]
 9   delivery_city_id         228265 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(5), object(2)
memory usage: 17.4+ MB
None


In [155]:
# Load every city from the `application` schema together with its
# state/province code, name and sales territory, then shorten the two
# state/province column names for convenience.
geo_df = fetch_data('''
    SELECT
        c.city_id,
        c.city_name,
        sp.state_province_code,
        sp.state_province_name,
        sp.sales_territory
    FROM application.cities c
        JOIN application.state_provinces sp
            ON c.state_province_id = sp.state_province_id;
''').rename(columns={
    'state_province_code': 'state_code',
    'state_province_name': 'state_name'
})

# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() only appends a stray "None" line to the cell output.
geo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37940 entries, 0 to 37939
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   city_id          37940 non-null  int64 
 1   city_name        37940 non-null  object
 2   state_code       37940 non-null  object
 3   state_name       37940 non-null  object
 4   sales_territory  37940 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.4+ MB
None


In [156]:
# The join columns in sales_df and geo_df are the same dtype (int64), so proceed with the merge.
# validate='many_to_one' makes pandas raise MergeError if a city_id occurs more
# than once in geo_df, which would otherwise silently duplicate invoice lines.
df = (
    pd.merge(
        sales_df,
        geo_df,
        left_on='delivery_city_id',
        right_on='city_id',
        validate='many_to_one',
    )
    .drop(columns=['city_id'])  # redundant copy of delivery_city_id after the join
)

# The default inner join drops sales rows whose delivery city has no match in
# geo_df; fail loudly here rather than lose invoice lines unnoticed.
assert len(df) == len(sales_df), 'merge changed the number of invoice lines'

# DataFrame.info() prints its report itself and returns None, so it is not
# wrapped in print().
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228265 entries, 0 to 228264
Data columns (total 14 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   invoice_line_id          228265 non-null  int64         
 1   invoice_id               228265 non-null  int64         
 2   stock_item_id            228265 non-null  int64         
 3   quantity                 228265 non-null  int64         
 4   unit_price               228265 non-null  float64       
 5   extended_price           228265 non-null  float64       
 6   order_date               228265 non-null  object        
 7   expected_delivery_date   228265 non-null  object        
 8   confirmed_delivery_time  227981 non-null  datetime64[ns]
 9   delivery_city_id         228265 non-null  int64         
 10  city_name                228265 non-null  object        
 11  state_code               228265 non-null  object        
 12  state_name      

<br>Convert column dtypes where needed:
<br> * All dates and date-times should be 'datetime64'. Use pd.to_datetime() for the conversion to avoid parsing errors.
<br> * All ID columns should be the pandas nullable integer type 'Int64'.
<br> * All monetary amounts should be 'float64'.
<br> * Suitable categorical columns should be 'category'.
<br>
<br>Set the index to the unique row identifier, 'invoice_line_id'.

In [157]:
# Parse the date columns that were loaded as plain objects into datetime64.
for date_col in ('order_date', 'expected_delivery_date'):
    df[date_col] = pd.to_datetime(df[date_col])

# Target dtypes: nullable integers for the key columns and quantity,
# categoricals for the repeated geography labels.
nullable_int_cols = [
    'invoice_line_id',
    'invoice_id',
    'stock_item_id',
    'quantity',
    'delivery_city_id',
]
category_cols = ['city_name', 'state_code', 'state_name', 'sales_territory']

dtype_dict = {
    **dict.fromkeys(nullable_int_cols, 'Int64'),
    **dict.fromkeys(category_cols, 'category'),
}

# Apply the casts, then promote the invoice line key to the index.
df = df.astype(dtype_dict)
df = df.set_index('invoice_line_id')

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 228265 entries, 1 to 228265
Data columns (total 13 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   invoice_id               228265 non-null  Int64         
 1   stock_item_id            228265 non-null  Int64         
 2   quantity                 228265 non-null  Int64         
 3   unit_price               228265 non-null  float64       
 4   extended_price           228265 non-null  float64       
 5   order_date               228265 non-null  datetime64[ns]
 6   expected_delivery_date   228265 non-null  datetime64[ns]
 7   confirmed_delivery_time  227981 non-null  datetime64[ns]
 8   delivery_city_id         228265 non-null  Int64         
 9   city_name                228265 non-null  category      
 10  state_code               228265 non-null  category      
 11  state_name               228265 non-null  category      
 12  sales_territory      

In [158]:
# Count the missing values in each column:
missing_per_column = df.isna().sum()
missing_per_column

invoice_id                   0
stock_item_id                0
quantity                     0
unit_price                   0
extended_price               0
order_date                   0
expected_delivery_date       0
confirmed_delivery_time    284
delivery_city_id             0
city_name                    0
state_code                   0
state_name                   0
sales_territory              0
dtype: int64

Since I may analyze aspects of the data that don't involve deliveries,<br>
keep the rows with a missing `confirmed_delivery_time` in the base table and filter them out in the summary table(s) that need it.

In [159]:
# Count index labels that repeat an earlier one (0 means the index is unique):
duplicate_index_count = df.index.duplicated().sum()
print(duplicate_index_count)

0


In [160]:
# Count rows whose column values all repeat an earlier row (the index is not compared):
duplicate_row_count = df.duplicated().sum()
print(duplicate_row_count)

0
