In [None]:
import pandas as pd


In [20]:
# Read the raw transactions that the Quantity assessment works from
data = pd.read_csv(filepath_or_buffer='../Deliverable1Dataset.csv')


### Quantify missing Quantity

Determine the scale of missing entries in `Quantity` to size the remediation.

In [21]:
# Flag every row whose Quantity is absent, then report the count and share
missing_quantity = data['Quantity'].isnull()
print(
    f'Missing Quantity rows: {missing_quantity.sum()} '
    f'of {len(data)} '
    f'({missing_quantity.mean():.2%})'
)


Missing Quantity rows: 604 of 12575 (4.80%)


### Missingness mechanism

Quantifying how often `Quantity` is missing within each `Category` and payment method shows whether the gaps are random or tied to observable traits. A non-uniform pattern implies MAR, signalling we should leverage those fields during imputation.


In [22]:
# Attach the missing-Quantity flag once so both breakdowns share the same frame
flagged = data.assign(missing_quantity=missing_quantity)

# Share of missing Quantity within each Category, highest first
summary = (
    flagged
    .groupby('Category')['missing_quantity']
    .mean()
    .sort_values(ascending=False)
)
print('Share of Quantity missing by Category (top 5):')
print(summary.head())

# Same breakdown by Payment Method
payment_share = (
    flagged
    .groupby('Payment Method')['missing_quantity']
    .mean()
    .sort_values(ascending=False)
)
print('Share of Quantity missing by Payment Method:')
print(payment_share)

# Among rows lacking Quantity, count how many also lack Price Per Unit / Item
quantity_gap_rows = data.loc[missing_quantity, ['Price Per Unit', 'Item']]
price_overlap = quantity_gap_rows['Price Per Unit'].isna().sum()
item_overlap = quantity_gap_rows['Item'].isna().sum()
print(f'Rows with Quantity missing and Price available: {missing_quantity.sum() - price_overlap}')
print(f'Rows with Quantity missing and Item missing: {item_overlap}')


Share of Quantity missing by Category (top 5):
Category
Patisserie                            0.056937
Computers and electric accessories    0.051990
Food                                  0.051008
Electric household essentials         0.047140
Butchers                              0.045918
Name: missing_quantity, dtype: float64
Share of Quantity missing by Payment Method:
Payment Method
Digital Wallet    0.048986
Cash              0.048028
Credit Card       0.047076
Name: missing_quantity, dtype: float64
Rows with Quantity missing and Price available: 604
Rows with Quantity missing and Item missing: 604


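`Quantity` missingness varies somewhat by category (up to about 5.7% for Patisserie) but is nearly flat across payment methods (roughly 4.7%–4.9%). More importantly, all 604 rows with missing `Quantity` are also missing `Item` while still having `Price Per Unit`, so the gaps coincide with other observed fields. This is consistent with a Missing At Random (MAR) pattern driven by observable attributes, and it means `Item` must be rebuilt before any item-level imputation.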

### Imputation strategy

Median-based item profiles give stable quantity estimates and avoid the distortion a mean would suffer when order volumes are skewed.


In [23]:
# Rebuild Item from Category+Price for a complete mapping.
# Rows where all three fields are present supply one Item per
# (Category, Price Per Unit) pair; the first occurrence of each pair is kept.
known_items = data.dropna(subset=['Category', 'Price Per Unit', 'Item'])
unique_pairs = known_items.drop_duplicates(subset=['Category', 'Price Per Unit'])
item_lookup = unique_pairs.set_index(['Category', 'Price Per Unit'])['Item']

# One (Category, Price Per Unit) tuple per row, aligned to data's index
keys = pd.Series(
    list(zip(data['Category'], data['Price Per Unit'])),
    index=data.index,
)

# Keep the recorded Item where present; otherwise use the lookup result
recovered_items = keys.map(item_lookup)
data['Item_filled'] = data['Item'].fillna(recovered_items)
data.head(10)


Unnamed: 0,Transaction ID,Customer ID,Category,Item,Price Per Unit,Quantity,Total Spent,Payment Method,Location,Transaction Date,Discount Applied,Item_filled
0,TXN_1002182,CUST_01,Food,Item_5_FOOD,11.0,5.0,55.0,Digital Wallet,In-store,2024-10-08,True,Item_5_FOOD
1,TXN_1003865,CUST_15,Furniture,Item_2_FUR,6.5,5.0,32.5,Cash,Online,2022-03-12,False,Item_2_FUR
2,TXN_1003940,CUST_06,Furniture,Item_5_FUR,11.0,9.0,99.0,Digital Wallet,Online,2022-04-22,False,Item_5_FUR
3,TXN_1004091,CUST_04,Food,Item_25_FOOD,41.0,3.0,123.0,Cash,In-store,2023-11-09,False,Item_25_FOOD
4,TXN_1004124,CUST_08,Computers and electric accessories,Item_7_CEA,14.0,5.0,70.0,Credit Card,In-store,2022-03-02,,Item_7_CEA
5,TXN_1004284,CUST_15,Milk Products,Item_25_MILK,41.0,3.0,123.0,Cash,Online,2023-09-25,,Item_25_MILK
6,TXN_1005543,CUST_19,Food,,30.5,,,Cash,In-store,2022-12-03,True,Item_18_FOOD
7,TXN_1005750,CUST_04,Electric household essentials,Item_12_EHE,21.5,6.0,129.0,Credit Card,Online,2022-11-26,False,Item_12_EHE
8,TXN_1006123,CUST_04,Electric household essentials,Item_8_EHE,15.5,1.0,15.5,Cash,Online,2023-10-17,,Item_8_EHE
9,TXN_1006129,CUST_21,Milk Products,Item_17_MILK,29.0,8.0,232.0,Digital Wallet,Online,2024-11-05,,Item_17_MILK


- *Median advantages:* handles skew and returns typical basket counts.
- *Mean drawbacks:* inflated by occasional bulk orders; less representative.


In [25]:
# Per-item median of the observed quantities (median is robust to outliers vs mean)
observed_quantity = data.dropna(subset=['Quantity'])
item_quantity_median = observed_quantity.groupby('Item')['Quantity'].median()
print('Sample item medians:')
print(item_quantity_median.head())


Sample item medians:
Item
Item_10_BEV     8.0
Item_10_BUT     6.0
Item_10_CEA     5.0
Item_10_EHE     5.0
Item_10_FOOD    5.0
Name: Quantity, dtype: float64


In [26]:
# Impute missing Quantity using the mapped Item medians
data['Quantity_imputed'] = data['Quantity']
needs_quantity = data['Quantity_imputed'].isna()
# Look up the median for every row's rebuilt Item; only the gaps are filled from it
item_level_estimate = data['Item_filled'].map(item_quantity_median)
data['Quantity_imputed'] = data['Quantity_imputed'].fillna(item_level_estimate)
remaining = data['Quantity_imputed'].isna().sum()
print(f'Remaining missing Quantity after item-level imputation: {remaining}')


Remaining missing Quantity after item-level imputation: 0


If any rows remain (e.g., items lacking observed quantities), fall back to category-level medians.

In [27]:
# Fallback: category medians for any unresolved rows
unresolved = data['Quantity_imputed'].isna()
remaining = unresolved.sum()
if not remaining:
    print('Category fallback not required; all quantities imputed from item medians.')
else:
    # Medians are taken from rows whose Quantity column is populated
    observed_rows = data.loc[data['Quantity'].notna()]
    category_median = observed_rows.groupby('Category')['Quantity'].median()
    print(category_median)
    data.loc[unresolved, 'Quantity_imputed'] = data.loc[unresolved, 'Category'].map(category_median)
    print('Remaining missing Quantity after category fallback:', data['Quantity_imputed'].isna().sum())


Category fallback not required; all quantities imputed from item medians.


### Recalculate total spend

Once `Quantity` is available, restore `Total Spent` for the same rows using the unit price.

In [29]:
# Start from the recorded totals and fill the gaps with Quantity x unit price
data['Total Spent_imputed'] = data['Total Spent']
needs_total = data['Total Spent_imputed'].isna()
recomputed_total = data['Quantity_imputed'] * data['Price Per Unit']
data.loc[needs_total, 'Total Spent_imputed'] = recomputed_total[needs_total]
remaining_total = data['Total Spent_imputed'].isna().sum()
print(f"Remaining missing Total Spent values: {remaining_total}")
# Promote imputed values back into the main columns
for column in ('Quantity', 'Total Spent'):
    data[column] = data[f'{column}_imputed']


Remaining missing Total Spent values: 0


### Persist results

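Store the dataset for downstream use: the imputed values are kept in the main `Quantity` and `Total Spent` columns, and the helper `*_imputed` / `*_filled` columns are dropped before saving.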

In [32]:
# Persist dataset without helper imputation columns.
# The *_imputed and *_filled columns are working copies; only the remaining
# columns are written out.
output_path = 'quantity_imputed.csv'
imputed_columns = [col for col in data.columns if col.endswith('_imputed')]
item_filled_columns = [col for col in data.columns if col.endswith('_filled')]
dataset_to_save = data.drop(columns=imputed_columns + item_filled_columns)
dataset_to_save.to_csv(output_path, index=False)
# Report the path that was actually written (relative to the working directory)
# so the message cannot drift from the real destination.
print(f'Dataset with imputed values saved to {output_path}')


Dataset with imputed values saved to quantity/quantity_imputed.csv
