In [378]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from display_config import enable_clean_display
from db_connector import fetch_data

In [379]:
# Turn on horizontal scrolling for wide DataFrames through the project's display helper;
# the built-in display options did not achieve this.
enable_clean_display()

Sales tables, geographical data, and UOM (unit of measure) names live in separate schemas, and cross-schema joins are not permitted in this PostgreSQL environment.<br>
So query each schema into its own DataFrame, then merge the DataFrames in pandas.

In [380]:
# Load one row per invoice line from the sales schema, carrying the order dates,
# the confirmed delivery time and the customer's delivery city id.
# fetch_data is a project helper; the info() report shows it yields a DataFrame.
sales_df = fetch_data('''
    SELECT
        il.invoice_line_id,
        il.invoice_id,
        il.stock_item_id,
        il.quantity,
        il.package_type_id,
        il.unit_price,
        il.extended_price,
        o.order_date,
        o.expected_delivery_date,
        i.confirmed_delivery_time,
        c.delivery_city_id
    FROM sales.invoice_lines il
        INNER JOIN sales.invoices i
            ON il.invoice_id = i.invoice_id
        INNER JOIN sales.orders o
            ON i.order_id = o.order_id
        INNER JOIN sales.customers c
            ON o.customer_id = c.customer_id;
''')

# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" to the cell output.
sales_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228265 entries, 0 to 228264
Data columns (total 11 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   invoice_line_id          228265 non-null  int64         
 1   invoice_id               228265 non-null  int64         
 2   stock_item_id            228265 non-null  int64         
 3   quantity                 228265 non-null  int64         
 4   package_type_id          228265 non-null  int64         
 5   unit_price               228265 non-null  float64       
 6   extended_price           228265 non-null  float64       
 7   order_date               228265 non-null  object        
 8   expected_delivery_date   228265 non-null  object        
 9   confirmed_delivery_time  227981 non-null  datetime64[ns]
 10  delivery_city_id         228265 non-null  int64         
dtypes: datetime64[ns](1), float64(2), int64(6), object(2)
memory usage: 19.2+ MB
N

In [381]:
# Load every city with its state/province code, name and sales territory from
# the application schema, then shorten the two state_province_* column names.
geo_df = fetch_data('''
    SELECT
        c.city_id,
        c.city_name,
        sp.state_province_code,
        sp.state_province_name,
        sp.sales_territory
    FROM application.cities c
        JOIN application.state_provinces sp
            ON c.state_province_id = sp.state_province_id;
''').rename(columns={
    'state_province_code': 'state_code',
    'state_province_name': 'state_name'
})

# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" to the cell output.
geo_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37940 entries, 0 to 37939
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   city_id          37940 non-null  int64 
 1   city_name        37940 non-null  object
 2   state_code       37940 non-null  object
 3   state_name       37940 non-null  object
 4   sales_territory  37940 non-null  object
dtypes: int64(1), object(4)
memory usage: 1.4+ MB
None


In [382]:
# Load the package type lookup (id -> name) from the warehouse schema.
uom_df = fetch_data('''
    SELECT
        package_type_id,
        package_type_name
    FROM warehouse.package_types;
''')

# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" to the cell output.
uom_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14 entries, 0 to 13
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   package_type_id    14 non-null     int64 
 1   package_type_name  14 non-null     object
dtypes: int64(1), object(1)
memory usage: 356.0+ bytes
None


In [383]:
# The join columns between all 3 tables are the same dtype (int64), so proceed with the merge.
# Both right-hand frames are lookup tables, so each sales row should match at most
# one lookup row. validate='many_to_one' makes pandas raise MergeError if a lookup
# key is duplicated, instead of silently multiplying sales rows.
df = (
    sales_df
    .merge(
        geo_df,
        left_on='delivery_city_id',
        right_on='city_id',
        how='inner',
        validate='many_to_one',
    )
    # city_id repeats delivery_city_id once the join is done
    .drop(columns=['city_id'])
    .merge(
        uom_df,
        on='package_type_id',
        how='inner',
        validate='many_to_one',
    )
)

# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 228265 entries, 0 to 228264
Data columns (total 16 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   invoice_line_id          228265 non-null  int64         
 1   invoice_id               228265 non-null  int64         
 2   stock_item_id            228265 non-null  int64         
 3   quantity                 228265 non-null  int64         
 4   package_type_id          228265 non-null  int64         
 5   unit_price               228265 non-null  float64       
 6   extended_price           228265 non-null  float64       
 7   order_date               228265 non-null  object        
 8   expected_delivery_date   228265 non-null  object        
 9   confirmed_delivery_time  227981 non-null  datetime64[ns]
 10  delivery_city_id         228265 non-null  int64         
 11  city_name                228265 non-null  object        
 12  state_code      

Convert column dtypes where needed:

* All dates and date-times should be `datetime64`; use `pd.to_datetime()` for the conversion to avoid parsing errors.
* All ID columns should be pandas nullable `Int64`.
* All monetary amounts should be `float64`.
* Suitable categorical columns should be `category`.

Set the index to the unique line identifier, `invoice_line_id`.

In [384]:
# Columns that become pandas nullable integers (the ids plus quantity)
int_cols = [
    'invoice_line_id',
    'invoice_id',
    'stock_item_id',
    'quantity',
    'package_type_id',
    'delivery_city_id',
]

# Repeating text columns that become categoricals
category_cols = [
    'city_name',
    'state_code',
    'state_name',
    'sales_territory',
    'package_type_name',
]

dtype_dict = {
    **dict.fromkeys(int_cols, 'Int64'),
    **dict.fromkeys(category_cols, 'category'),
}

# Parse the two object-typed date columns, cast the remaining columns, then
# move invoice_line_id into the index.
# NOTE(review): once invoice_line_id is the index it is no longer a column, so
# this cell presumably cannot be re-run without re-running the merge cell first.
df = (
    df
    .assign(
        order_date=lambda frame: pd.to_datetime(frame['order_date']),
        expected_delivery_date=lambda frame: pd.to_datetime(frame['expected_delivery_date']),
    )
    .astype(dtype_dict)
    .set_index('invoice_line_id')
)

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 228265 entries, 1 to 228265
Data columns (total 15 columns):
 #   Column                   Non-Null Count   Dtype         
---  ------                   --------------   -----         
 0   invoice_id               228265 non-null  Int64         
 1   stock_item_id            228265 non-null  Int64         
 2   quantity                 228265 non-null  Int64         
 3   package_type_id          228265 non-null  Int64         
 4   unit_price               228265 non-null  float64       
 5   extended_price           228265 non-null  float64       
 6   order_date               228265 non-null  datetime64[ns]
 7   expected_delivery_date   228265 non-null  datetime64[ns]
 8   confirmed_delivery_time  227981 non-null  datetime64[ns]
 9   delivery_city_id         228265 non-null  Int64         
 10  city_name                228265 non-null  category      
 11  state_code               228265 non-null  category      
 12  state_name           

In [385]:
# List the distinct package_type_name values -- the closest thing to a UOM column in this data
print(df.loc[:, 'package_type_name'].unique())

['Each', 'Packet', 'Pair', 'Bag']
Categories (4, object): ['Bag', 'Each', 'Packet', 'Pair']


The `package_type_name` column was the best available option for a unit-of-measure (UOM) column.<br>
For this analysis, however, it is purely descriptive, since all of its values indicate unit-level UOMs.<br>
I will therefore treat all items as individual units, suitable for aggregation.

In [386]:
# Count the null entries in each column
df.isna().sum(axis='index')

invoice_id                   0
stock_item_id                0
quantity                     0
package_type_id              0
unit_price                   0
extended_price               0
order_date                   0
expected_delivery_date       0
confirmed_delivery_time    284
delivery_city_id             0
city_name                    0
state_code                   0
state_name                   0
sales_territory              0
package_type_name            0
dtype: int64

Since I may analyze aspects of the data that don't involve deliveries,<br>
keep the rows with missing delivery datetimes in the base table and filter them out in the summary table(s).

In [387]:
# Count index labels that repeat an earlier label (0 means the index is unique)
print(df.index.duplicated(keep='first').sum())

0


In [388]:
# Count rows whose column values all repeat an earlier row; the index is not part of the comparison
print(df.duplicated(subset=None, keep='first').sum())

0


In [389]:
# Derive calendar and delivery-duration metrics from the date columns

order_dates = df['order_date']
delivered_at = df['confirmed_delivery_time']

# Month and year in which the order was placed
df['order_mo'] = order_dates.dt.month
df['order_yr'] = order_dates.dt.year

# Whole days from order placement to confirmed delivery.
# Cast to nullable Int64 so rows without a confirmed delivery time can hold <NA>.
df['days_to_deliver'] = delivered_at.sub(order_dates).dt.days.astype('Int64')

# Whole days between the expected delivery date and the confirmed delivery
df['days_delivery_lag'] = (
    delivered_at.sub(df['expected_delivery_date']).dt.days.astype('Int64')
)

In [390]:
# Order volume metrics: computed per invoice and broadcast back onto every line of that invoice

qty_by_invoice = df.groupby('invoice_id')['quantity']

# Order magnitude: total units across all lines of the invoice
df['total_order_qty'] = qty_by_invoice.transform('sum')

# Order complexity: number of lines on the invoice
df['line_item_count'] = qty_by_invoice.transform('count')

# Mean units per line within the invoice, rounded to 2 decimals
df['avg_qty_per_line'] = df['total_order_qty'].div(df['line_item_count']).round(2)

Since this is an extremely clean dataset, focus outlier flagging on meaningful business anomalies rather than on possible data errors.

In [391]:
# Create outlier-flagging columns, measured at the invoice/order level

# df holds one row per invoice line, and total_order_qty / avg_qty_per_line repeat
# on every line of an invoice. Take the quartiles from one row per invoice so each
# order counts once; computing them over all line rows would weight every order
# by its number of lines and shift the fences.
order_level = df.drop_duplicates(subset='invoice_id')

# Calculate components of high volume flag (upper fence: Q3 + 1.5 * IQR)
q1_a, q3_a = order_level['total_order_qty'].quantile([0.25, 0.75])
upper_bound = q3_a + (1.5 * (q3_a - q1_a))

# Flag every line of any order whose total_order_qty is above the upper bound as "volume anomaly"
df['is_volume_anomaly'] = df['total_order_qty'] > upper_bound

# Calculate components of inefficiency flag (lower fence: Q1 - 1.5 * IQR)
q1_b, q3_b = order_level['avg_qty_per_line'].quantile([0.25, 0.75])
lower_bound = q1_b - (1.5 * (q3_b - q1_b))

# Flag every line of any order whose avg_qty_per_line is below the lower bound as "inefficient"
df['is_low_efficiency_anomaly'] = df['avg_qty_per_line'] < lower_bound

In [392]:
# DataFrame.info() writes its report itself and returns None, so it is called
# bare; wrapping it in print() appends a stray "None" to the cell output.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 228265 entries, 1 to 228265
Data columns (total 24 columns):
 #   Column                     Non-Null Count   Dtype         
---  ------                     --------------   -----         
 0   invoice_id                 228265 non-null  Int64         
 1   stock_item_id              228265 non-null  Int64         
 2   quantity                   228265 non-null  Int64         
 3   package_type_id            228265 non-null  Int64         
 4   unit_price                 228265 non-null  float64       
 5   extended_price             228265 non-null  float64       
 6   order_date                 228265 non-null  datetime64[ns]
 7   expected_delivery_date     228265 non-null  datetime64[ns]
 8   confirmed_delivery_time    227981 non-null  datetime64[ns]
 9   delivery_city_id           228265 non-null  Int64         
 10  city_name                  228265 non-null  category      
 11  state_code                 228265 non-null  category     