In [2]:
import pandas as pd
import numpy as np

### Load data

In [4]:
# Read the raw CSV once into `source` and leave it untouched; the cells below
# work on `data`, an independent copy, so the original values stay available.
raw_csv_path = '../Deliverable1Dataset.csv'
source = pd.read_csv(raw_csv_path)
data = source.copy()

### Item

- Find all the observations that have missing `Item` values

In [5]:
# Boolean mask of rows whose Item is null; reused for both the count and the
# row selection so the column is only scanned once.
item_null_mask = data['Item'].isnull()

total_missing_items_observations = data.loc[item_null_mask]
total_missing_items = item_null_mask.sum()
print(f'Total number of missing rows for Item: {total_missing_items}')

# Last expression of the cell: show up to the first 50 affected rows.
total_missing_items_observations.head(50)

Total number of missing rows for Item: 1213


Unnamed: 0,Transaction ID,Customer ID,Category,Item,Price Per Unit,Quantity,Total Spent,Payment Method,Location,Transaction Date,Discount Applied
6,TXN_1005543,CUST_19,Food,,30.5,,,Cash,In-store,2022-12-03,True
13,TXN_1007496,CUST_01,Butchers,,,10.0,155.0,Credit Card,Online,2024-02-05,True
51,TXN_1032287,CUST_16,Food,,,2.0,43.0,Cash,Online,2022-07-27,
64,TXN_1041483,CUST_19,Electric household essentials,,15.5,,,Cash,In-store,2024-03-19,
65,TXN_1041890,CUST_03,Furniture,,27.5,,,Digital Wallet,Online,2022-07-02,False
71,TXN_1044590,CUST_12,Electric household essentials,,,4.0,56.0,Cash,Online,2024-05-07,False
73,TXN_1046262,CUST_14,Milk Products,,,5.0,70.0,Cash,In-store,2022-11-19,False
74,TXN_1046367,CUST_21,Computers and electric accessories,,,10.0,185.0,Cash,In-store,2022-06-21,
79,TXN_1051223,CUST_08,Patisserie,,,9.0,45.0,Credit Card,Online,2023-10-25,
90,TXN_1058643,CUST_23,Food,,,2.0,19.0,Digital Wallet,In-store,2024-10-01,False


### Item missingness analysis

- Evaluate where `Item` values are absent to understand dependencies.

In [70]:
# True where Item is absent; its mean is the overall missing share.
item_missing = data['Item'].isna()

print(f'Missing Item count: {item_missing.sum()} of {len(data)} rows ({item_missing.mean():.2%})')

# Attach the flag once, then compute the share of missing Item per group,
# sorted from the highest share to the lowest.
data_with_flag = data.assign(item_missing=item_missing)
missing_share_by_category = (
    data_with_flag.groupby('Category')['item_missing']
    .mean()
    .sort_values(ascending=False)
)
missing_share_by_payment = (
    data_with_flag.groupby('Payment Method')['item_missing']
    .mean()
    .sort_values(ascending=False)
)

print('Share of Item missing by Category (top 5):')
print(missing_share_by_category.head())
print('Share of Item missing by Payment Method:')
print(missing_share_by_payment)


Missing Item count: 1213 of 12575 rows (9.65%)
Share of Item missing by Category (top 5):
Category
Patisserie                            0.104058
Computers and electric accessories    0.103338
Food                                  0.102015
Milk Products                         0.100379
Electric household essentials         0.096794
Name: item_missing, dtype: float64
Share of Item missing by Payment Method:
Payment Method
Digital Wallet    0.103282
Credit Card       0.095123
Cash              0.091183
Name: item_missing, dtype: float64


### Identify the missingness

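`Item` values are missing in association with `Price Per Unit` and `Category`, indicating a Missing At Random (MAR) mechanism: the missingness depends on other observed columns rather than on the missing `Item` value itself.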

### Handle MAR issue
- All missing values need to be filled in. First, check how the columns relate to each other in the observations that have complete values.
- Missing values will be filled in from the existing observations, but each existing (`Category`, `Price Per Unit`) combination must correspond to a unique `Item` so that a mapping can be built between them.


In [6]:

# Only rows where Category, Price Per Unit and Item are all present can say
# anything about the (Category, Price Per Unit) -> Item relationship.
fully_observed = data.dropna(subset=['Category', 'Price Per Unit', 'Item'])

# Number of distinct Items seen for every (Category, Price Per Unit) pair.
lookup_conflicts = fully_observed.groupby(['Category', 'Price Per Unit'])['Item'].nunique()

# A pair with more than one distinct Item cannot be used as a lookup key.
conflicts = lookup_conflicts.loc[lookup_conflicts.gt(1)]
print(f'Category+Price combinations without unique Item mapping: {len(conflicts)}')


Category+Price combinations without unique Item mapping: 0


### Fill in missing values for Item

Derive deterministic `Item` values from `Category` and `Price Per Unit`. When unit price is missing but `Quantity` and `Total Spent` are present, reconstruct the price before mapping.

In [7]:
# Flag rows where both Item and Price Per Unit are missing but the unit price
# can be reconstructed from Quantity and Total Spent.
# Quantity must be non-zero: pandas does not raise on float division by zero,
# it silently yields inf/NaN, which would otherwise be written into
# Price Per Unit and carried into every later step.
needs_price = (
    data['Item'].isna() &
    data['Price Per Unit'].isna() &
    data['Quantity'].notna() &
    data['Quantity'].ne(0) &
    data['Total Spent'].notna()
)
# Recompute unit prices for those rows before attempting the Item lookup.
# The result is rounded to one decimal, presumably so the reconstructed float
# compares equal to the prices already present in the data.
data.loc[needs_price, 'Price Per Unit'] = (
    data.loc[needs_price, 'Total Spent'] / data.loc[needs_price, 'Quantity']
).round(1)
print(data)
# Build the (Category, Price Per Unit) -> Item lookup table.
# Only rows where Category, Price Per Unit and Item are all present contribute;
# drop_duplicates keeps the first row per (Category, Price Per Unit) pair so the
# resulting index is unique, e.g. ('Food', 11.0) -> 'Item_5_FOOD'.
item_lookup = (
    data.dropna(subset=['Category', 'Price Per Unit', 'Item'])
    .drop_duplicates(subset=['Category', 'Price Per Unit'])
    .set_index(['Category', 'Price Per Unit'])['Item']
)

print(item_lookup)
# Apply the lookup to rows still missing Item values.
# One (Category, Price Per Unit) tuple is built per row, e.g. ('Food', 11.0),
# and mapped through the lookup; fillna only touches rows whose Item is null,
# and keys that are not in the lookup leave Item missing.
keys = list(zip(data['Category'], data['Price Per Unit']))
data['Item'] = data['Item'].fillna(pd.Series(keys, index=data.index).map(item_lookup))


      Transaction ID Customer ID                            Category  \
0        TXN_1002182     CUST_01                                Food   
1        TXN_1003865     CUST_15                           Furniture   
2        TXN_1003940     CUST_06                           Furniture   
3        TXN_1004091     CUST_04                                Food   
4        TXN_1004124     CUST_08  Computers and electric accessories   
...              ...         ...                                 ...   
12570    TXN_9996909     CUST_23                           Furniture   
12571    TXN_9997234     CUST_02                            Butchers   
12572    TXN_9998575     CUST_13                                Food   
12573    TXN_9999124     CUST_02                            Butchers   
12574    TXN_9999729     CUST_15                            Butchers   

               Item  Price Per Unit  Quantity  Total Spent  Payment Method  \
0       Item_5_FOOD            11.0       5.0         55.

In [8]:
# Count the Item values that are still missing after imputation and report it.
# Only Item is checked here; Price Per Unit is not verified by this cell.
still_missing_mask = data['Item'].isnull()
remaining = still_missing_mask.sum()
print(f'Remaining missing Item values: {remaining}')

# Persist the imputed frame (without the index column) for later steps.
data.to_csv("dataset_with_item_imputed.csv", index=False)

Remaining missing Item values: 0
