In [1]:
from pathlib import Path

import numpy as np
import pandas as pd


In [2]:

# Load the CSV written under the 3_item output folder into the working frame `data`
input_path = '../../output_data/3_item/item_imputed.csv'
data = pd.read_csv(input_path)


### Quantify missing Discount Applied

Determine the scale of missing entries in `Discount Applied` after STEP 3 completion. This is a binary categorical field (TRUE/FALSE) with substantial missingness (33%)


In [3]:
# Boolean mask: True where `Discount Applied` is NaN. It stays a notebook-level
# name because later cells reuse it to select the originally-missing rows.
missing_discount = data['Discount Applied'].isna()

# Count of missing rows, total rows, and share (the mean of a boolean mask is
# the proportion of True values)
n_missing = missing_discount.sum()
n_rows = len(data)
share_missing = missing_discount.mean()
print(f'Missing Discount Applied rows: {n_missing} of {n_rows} ({share_missing:.2%})')

# Frequency of every value; dropna=False keeps NaN as its own row
print(f'\nDiscount Applied value distribution:')
print(data['Discount Applied'].value_counts(dropna=False))

# Columns expected to be complete already, paired with the step credited for it
print(f'\nVerification of previous steps:')
completed_by_step = [
    ('Item', 3),
    ('Price Per Unit', 2),
    ('Quantity', 1),
    ('Total Spent', 1),
]
for column, step in completed_by_step:
    print(f'  {column} missing: {data[column].isna().sum()} (should be 0 from STEP {step})')


Missing Discount Applied rows: 3988 of 11971 (33.31%)

Discount Applied value distribution:
Discount Applied
True     4019
NaN      3988
False    3964
Name: count, dtype: int64

Verification of previous steps:
  Item missing: 0 (should be 0 from STEP 3)
  Price Per Unit missing: 0 (should be 0 from STEP 2)
  Quantity missing: 0 (should be 0 from STEP 1)
  Total Spent missing: 0 (should be 0 from STEP 1)


### Missingness mechanism

Quantifying how often Discount Applied is missing across different dimensions to determine if the pattern is random (MCAR) or systematic (MAR/MNAR)


In [4]:
# Heuristic cut-off (in percent) for the coefficient of variation of the
# per-group missingness rates. Below it the variation is labelled "low".
# This is a descriptive rule of thumb, not a statistical test.
CV_THRESHOLD_PCT = 10


def missing_share_by(column):
    """Return the share of missing `Discount Applied` per value of `column`.

    Reads the notebook-level `data` frame and `missing_discount` mask.

    Returns:
        A Series indexed by the group label and named `missing_discount`,
        sorted from the highest missing share to the lowest.
    """
    return (
        data.assign(missing_discount=missing_discount)
        .groupby(column)['missing_discount']
        .mean()
        .sort_values(ascending=False)
    )


def cv_percent(shares):
    """Return the coefficient of variation (std / mean) of `shares` in percent.

    Returns 0 when the mean is not positive, so the ratio never divides by zero.
    """
    mean_share = shares.mean()
    return (shares.std() / mean_share) * 100 if mean_share > 0 else 0


# Missing share per group for each of the three dimensions examined
summary = missing_share_by('Category')
print('Share of Discount Applied missing by Category:')
print(summary)

payment_share = missing_share_by('Payment Method')
print('\nShare of Discount Applied missing by Payment Method:')
print(payment_share)

location_share = missing_share_by('Location')
print('\nShare of Discount Applied missing by Location:')
print(location_share)

# Check variation to assess randomness
print('\n' + '=' * 80)
print('Variation Analysis (for MCAR assessment):')
print('=' * 80)
# Lower variation between groups is consistent with MCAR; higher variation
# points to a systematic pattern (MAR/MNAR)
category_std = summary.std()
category_mean = summary.mean()
cv_category = cv_percent(summary)
cv_payment = cv_percent(payment_share)
cv_location = cv_percent(location_share)

print(f'Category missingness - Mean: {category_mean:.4f}, Std: {category_std:.4f}, CV: {cv_category:.2f}%')
print(f'Payment missingness - Range: {payment_share.min():.4f} to {payment_share.max():.4f}, CV: {cv_payment:.2f}%')
print(f'Location missingness - Range: {location_share.min():.4f} to {location_share.max():.4f}, CV: {cv_location:.2f}%')

# Base the verdict on the most variable dimension, not on Category alone.
# Series.max() skips NaN, which is what std() yields for a single-group dimension.
max_cv = pd.Series([cv_category, cv_payment, cv_location]).max()

if max_cv < CV_THRESHOLD_PCT:
    print('\n✓ Low variation suggests MCAR (Missing Completely At Random)')
    print('  Missingness is relatively uniform across categories, payment methods, and locations')
else:
    print('\n⚠ High variation suggests MAR (Missing At Random) or systematic pattern')


Share of Discount Applied missing by Category:
Category
Furniture                             0.355410
Beverages                             0.352941
Electric household essentials         0.340369
Patisserie                            0.337266
Food                                  0.331785
Butchers                              0.325535
Milk Products                         0.314607
Computers and electric accessories    0.306703
Name: missing_discount, dtype: float64

Share of Discount Applied missing by Payment Method:
Payment Method
Cash              0.340726
Credit Card       0.330023
Digital Wallet    0.328343
Name: missing_discount, dtype: float64

Share of Discount Applied missing by Location:
Location
In-store    0.335423
Online      0.330916
Name: missing_discount, dtype: float64

Variation Analysis (for MCAR assessment):
Category missingness - Mean: 0.3331, Std: 0.0172, CV: 5.15%
Payment missingness - Range: 0.3283 to 0.3407
Location missingness - Range: 0.3309 to 0.3354

✓ Low

### Observed value distribution analysis

Examine the distribution of observed non-missing values to understand the balance between TRUE and FALSE


In [5]:
rule = '=' * 80

# Keep only the recorded entries of the column (NaN rows are dropped)
observed_values = data['Discount Applied'].dropna()

print('Distribution of observed (non-missing) Discount Applied values:')
print(rule)
# Frequencies ordered by the value itself, so False is listed before True
value_counts = observed_values.value_counts().sort_index()
print(value_counts)

# Share of each observed value, as a percentage of all observed rows
print('\n' + rule)
print('Proportions of observed values:')
print(rule)
total_observed = len(observed_values)
for value, count in value_counts.items():
    proportion = (count / total_observed) * 100
    print(f'{str(value):10s}: {count:5d} ({proportion:5.2f}%)')

# Ratio of True to False; falls back to 0 when no False value was observed
true_count = value_counts.get(True, 0)
false_count = value_counts.get(False, 0)
if false_count > 0:
    balance_ratio = true_count / false_count
else:
    balance_ratio = 0

print(f'\nTrue/False ratio: {balance_ratio:.3f}')

# Verdict bands, checked from the tightest to the loosest; the first band that
# contains the ratio wins, otherwise the distribution is reported as imbalanced
balance_bands = [
    (0.9, 1.1, '✓ Nearly balanced distribution (50/50 split)'),
    (0.8, 1.2, '✓ Reasonably balanced distribution'),
]
verdict = next(
    (message for low, high, message in balance_bands if low <= balance_ratio <= high),
    '⚠ Imbalanced distribution - one category is more frequent',
)
print(verdict)


Distribution of observed (non-missing) Discount Applied values:
Discount Applied
False    3964
True     4019
Name: count, dtype: int64

Proportions of observed values:
False     :  3964 (49.66%)
True      :  4019 (50.34%)

True/False ratio: 1.014
✓ Nearly balanced distribution (50/50 split)


### Missing data classification

**Classification: MCAR (Missing Completely At Random)**

**Rationale:**
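- Missingness is roughly uniform across all categories (30.7%–35.5%), payment methods (32.8%–34.1%), and locations (33.1%–33.5%), close to the overall rate of ~33%
- The distribution of TRUE/FALSE in observed data is nearly balanced (approximately 50/50); this describes the observed values and is not, by itself, evidence about the missingness mechanism
- The missing pattern shows no relationship with the other variables examined (Category, Payment Method, Location)
- Low coefficient of variation (5.15% across categories) indicates uniform missingness across groups
- This appears to be a data collection issue where the field was simply not recorded for 1/3 of transactions
- The missingness does not appear to depend on observed data; dependence on the unobserved values themselves (MNAR) cannot be tested from this dataset, so MCAR is a working assumption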

**Key finding:** The evidence is consistent with MCAR: the field appears to have been left unfilled at random during data entry, likely due to an optional field or a system issue that affected transactions randomly.


### Handling strategy: Create Unknown category

**Justification for Unknown category (not deletion or imputation):**

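1. **MCAR pattern:** Since the missingness appears to be completely random, deletion would not bias the results, so the choice of method is driven by data retention and transparency rather than by bias correction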
2. **Too much data to drop:** Deleting 33% of rows would lose 3,988 valuable transactions
3. **Preserves transparency:** Unknown category explicitly indicates missing information
4. **No false assumptions:** Avoids incorrectly imputing TRUE or FALSE when we do not know
5. **Maintains all other complete data:** All critical columns (Item, Price, Quantity, Total) are 100% complete

**Why Unknown (not other methods):**
- **Deletion:** Would lose 33% of dataset - too much valuable data
- **Random imputation:** Adds uncertainty and does not add information
- **Mode/Mean imputation:** Creates false certainty - we genuinely do not know the values
- **Predictive model:** Overly complex for MCAR data, no predictive signal available
- **Unknown category:** Most transparent and preserves all transaction data

**Alternative considered:** Random imputation based on observed distribution
- Could maintain 50/50 TRUE/FALSE ratio
- But adds false certainty to unknown values
- Unknown is more honest and allows analysts to handle missing data explicitly


In [6]:
# Columns shown in the preview: transaction identity plus the field being handled
preview_columns = ['Transaction ID', 'Category', 'Item', 'Total Spent', 'Discount Applied']

print('Sample of rows with missing Discount Applied (to be handled):')
print('=' * 80)
# First ten rows selected by the missing-value mask, for a visual check
# before any value is changed
print(data.loc[missing_discount, preview_columns].head(10))

print('\nObservations about rows with missing Discount Applied:')
for note in (
    '- All other columns are complete (Item, Price, Quantity, Total)',
    '- Missing values distributed randomly across all categories',
    '- No pattern or relationship with other variables',
    '- Will be labeled as "Unknown" to preserve data and maintain transparency',
):
    print(note)


Sample of rows with missing Discount Applied (to be handled):
   Transaction ID                            Category          Item  \
4     TXN_1004124  Computers and electric accessories    Item_7_CEA   
5     TXN_1004284                       Milk Products  Item_25_MILK   
7     TXN_1006123       Electric household essentials    Item_8_EHE   
8     TXN_1006129                       Milk Products  Item_17_MILK   
10    TXN_1007144                           Beverages    Item_2_BEV   
16    TXN_1010976  Computers and electric accessories   Item_12_CEA   
17    TXN_1011669                                Food  Item_13_FOOD   
18    TXN_1011882                          Patisserie   Item_21_PAT   
26    TXN_1015414                            Butchers   Item_23_BUT   
27    TXN_1016209                                Food  Item_25_FOOD   

    Total Spent Discount Applied  
4          70.0              NaN  
5         123.0              NaN  
7          15.5              NaN  
8         232.0 

In [7]:
# Number of originally missing entries, taken from the mask that was computed
# before any value was changed
discount_missing_before = missing_discount.sum()
print(f'Discount Applied missing before handling: {discount_missing_before}')

# Show distribution before handling
print('\nDistribution BEFORE handling:')
print(data['Discount Applied'].value_counts(dropna=False))

# Replace every NaN with the string "Unknown", which adds a third category
# next to True and False.
# NOTE(review): the column now mixes bool and str objects; after the CSV is
# written and read back the values will presumably all be strings - confirm
# that downstream consumers compare against 'True'/'False'/'Unknown'.
data['Discount Applied'] = data['Discount Applied'].fillna('Unknown')

# Recount NaN after filling and derive how many entries were handled
discount_missing_after = data['Discount Applied'].isna().sum()
values_handled = discount_missing_before - discount_missing_after

# Invariant: exactly the originally-missing rows carry the "Unknown" label
unknown_total = (data['Discount Applied'] == 'Unknown').sum()
assert unknown_total == discount_missing_before, (
    f'Expected {discount_missing_before} "Unknown" rows, found {unknown_total}'
)

print(f'\nDiscount Applied missing after handling: {discount_missing_after}')
print(f'Values handled (converted to "Unknown"): {values_handled}')
# Guard the ratio: with nothing to handle the rate is undefined, not 0/0
if discount_missing_before > 0:
    print(f'Handling success rate: {values_handled / discount_missing_before:.1%}')
else:
    print('Handling success rate: n/a (no missing values to handle)')

# Show distribution after handling
print('\nDistribution AFTER handling:')
print(data['Discount Applied'].value_counts())


Discount Applied missing before handling: 3988

Distribution BEFORE handling:
Discount Applied
True     4019
NaN      3988
False    3964
Name: count, dtype: int64

Discount Applied missing after handling: 0
Values handled (converted to "Unknown"): 3988
Handling success rate: 100.0%

Distribution AFTER handling:
Discount Applied
True       4019
Unknown    3988
False      3964
Name: count, dtype: int64


### Validation: Verify all missing values handled

Verify that all missing values have been addressed and the dataset is now 100% complete


In [8]:
rule = '=' * 80

print('Final missing value check across ALL columns:')
print(rule)

# NaN count per column, reduced to the columns that still have at least one
missing_summary = data.isnull().sum()
missing_cols = missing_summary[missing_summary > 0]

if missing_cols.empty:
    print('✓ NO MISSING VALUES IN ANY COLUMN')
    print('✓ Dataset is now 100% complete!')
else:
    print('⚠ Columns with remaining missing values:')
    for col, count in missing_cols.items():
        pct = (count / len(data)) * 100
        print(f'  {col:30s}: {count:5d} ({pct:5.2f}%)')

print('\n' + rule)
print('Verification of all critical columns:')
print(rule)
critical_columns = ['Item', 'Price Per Unit', 'Quantity', 'Total Spent', 'Discount Applied']
# NaN count for each critical column, in the order listed above
critical_missing = {col: data[col].isna().sum() for col in critical_columns}
for col, missing_count in critical_missing.items():
    status = '✗' if missing_count != 0 else '✓'
    print(f'{status} {col:30s}: {missing_count} missing')
# True only when every critical column has zero NaN
all_complete = all(missing_count == 0 for missing_count in critical_missing.values())

print('\n' + rule)
if not all_complete:
    print('⚠ Warning: Some columns still have missing values')
else:
    print('✓✓✓ SUCCESS: ALL CRITICAL COLUMNS ARE 100% COMPLETE ✓✓✓')


Final missing value check across ALL columns:
✓ NO MISSING VALUES IN ANY COLUMN
✓ Dataset is now 100% complete!

Verification of all critical columns:
✓ Item                          : 0 missing
✓ Price Per Unit                : 0 missing
✓ Quantity                      : 0 missing
✓ Total Spent                   : 0 missing
✓ Discount Applied              : 0 missing

✓✓✓ SUCCESS: ALL CRITICAL COLUMNS ARE 100% COMPLETE ✓✓✓


### Sample inspection: After handling

Display sample rows to verify the Unknown category was applied correctly


In [9]:
# Columns shown in the preview: transaction identity plus the handled field
preview_columns = ['Transaction ID', 'Category', 'Item', 'Total Spent', 'Discount Applied']

print('Sample of rows after Discount Applied handling:')
print('=' * 80)
# `missing_discount` still holds the mask computed before handling, so it
# selects the same rows as the earlier preview, now labelled "Unknown"
sample_handled = data.loc[missing_discount, preview_columns].head(10)
print(sample_handled)

print('\nVerification by Category:')
print('=' * 80)
# Per category: number and share of rows labelled "Unknown", printed for a
# visual check of how evenly the label is spread
is_unknown = data['Discount Applied'] == 'Unknown'
for category in data['Category'].unique():
    in_category = data['Category'] == category
    total_count = int(in_category.sum())
    unknown_count = (is_unknown & in_category).sum()
    unknown_pct = (unknown_count / total_count) * 100 if total_count > 0 else 0
    print(f'  {category:40s}: {unknown_count:4d} Unknown out of {total_count:5d} ({unknown_pct:5.2f}%)')


Sample of rows after Discount Applied handling:
   Transaction ID                            Category          Item  \
4     TXN_1004124  Computers and electric accessories    Item_7_CEA   
5     TXN_1004284                       Milk Products  Item_25_MILK   
7     TXN_1006123       Electric household essentials    Item_8_EHE   
8     TXN_1006129                       Milk Products  Item_17_MILK   
10    TXN_1007144                           Beverages    Item_2_BEV   
16    TXN_1010976  Computers and electric accessories   Item_12_CEA   
17    TXN_1011669                                Food  Item_13_FOOD   
18    TXN_1011882                          Patisserie   Item_21_PAT   
26    TXN_1015414                            Butchers   Item_23_BUT   
27    TXN_1016209                                Food  Item_25_FOOD   

    Total Spent Discount Applied  
4          70.0          Unknown  
5         123.0          Unknown  
7          15.5          Unknown  
8         232.0          Unkno

### Final dataset summary

Summarize the complete dataset after all 4 steps of missing data handling


In [10]:
# Horizontal rule reused for every section banner
RULE = '=' * 80

# Figures reported by the earlier steps of the pipeline. They cannot be
# recomputed from the frame loaded in this notebook, so they are kept as named
# constants instead of being buried inside the printed strings.
ORIGINAL_ROWS = 12_575
STEP1_ROWS_DROPPED = 604
STEP2_VALUES_RECONSTRUCTED = 609
STEP3_VALUES_IMPUTED = 609

# Values derived from the current state of `data`, so the summary cannot
# drift away from what is actually in the frame
step1_rows_retained = ORIGINAL_ROWS - STEP1_ROWS_DROPPED
unknown_total = int((data['Discount Applied'] == 'Unknown').sum())
total_missing = int(data.isnull().sum().sum())
# Row-wise check of Total = Price x Quantity; isclose tolerates float rounding
is_consistent = np.isclose(data['Total Spent'], data['Price Per Unit'] * data['Quantity'])
n_inconsistent = int((~is_consistent).sum())

print(RULE)
print('COMPLETE MISSING DATA HANDLING PIPELINE - FINAL SUMMARY')
print(RULE)

print('\nOriginal Dataset:')
print(f'  Rows: {ORIGINAL_ROWS:,}')
print('  Missing values: 5 columns affected')

print('\nSTEP 1 - Total Spent (Listwise Deletion):')
print(f'  Rows dropped: {STEP1_ROWS_DROPPED:,} ({STEP1_ROWS_DROPPED / ORIGINAL_ROWS:.1%})')
print(f'  Rows retained: {step1_rows_retained:,} ({step1_rows_retained / ORIGINAL_ROWS:.1%})')
print('  Side effect: Quantity also 100% complete')

print('\nSTEP 2 - Price Per Unit (Deterministic Reconstruction):')
print(f'  Values reconstructed: {STEP2_VALUES_RECONSTRUCTED:,} (using formula: Total ÷ Quantity)')
print('  Estimation error: 0% (deterministic)')
print('  Price Per Unit: 100% complete')

print('\nSTEP 3 - Item (Mode Imputation):')
print(f'  Values imputed: {STEP3_VALUES_IMPUTED:,} (mode by category)')
print('  Item: 100% complete')
print('  Category consistency: 100%')

print('\nSTEP 4 - Discount Applied (Unknown Category):')
print(f'  Values handled: {unknown_total:,} (converted to "Unknown")')
print('  Discount Applied: 100% complete')

print('\n' + RULE)
print('Final Dataset Status:')
print(RULE)
print(f'  Final rows: {len(data):,}')
print(f'  Data retention: {len(data) / ORIGINAL_ROWS:.1%}')
print(f'  Total missing values: {total_missing}')
# Only claim completeness when the count above is really zero
if total_missing == 0:
    print('  ✓ ALL columns 100% complete!')
else:
    print(f'  ⚠ {total_missing:,} missing values remain')

print('\n' + RULE)
print('Data Quality Metrics:')
print(RULE)
# Report the consistency check result instead of asserting it unconditionally
if n_inconsistent == 0:
    print('  ✓ Mathematical consistency: 100% (Total = Price × Quantity)')
else:
    print(f'  ⚠ Mathematical consistency: {n_inconsistent:,} rows violate Total = Price × Quantity')
# The next line is carried over from STEP 3 and is not re-verified in this cell
print('  ✓ Category-Item consistency: 100%')
print('  ✓ No estimation error in Price Per Unit')
print('  ✓ Transparent handling of unknowns')

print('\n' + RULE)
if total_missing == 0 and n_inconsistent == 0:
    print('✓✓✓ MISSING DATA HANDLING PIPELINE COMPLETE ✓✓✓')
else:
    print('⚠ PIPELINE FINISHED WITH OPEN ISSUES - see warnings above')
print(RULE)


COMPLETE MISSING DATA HANDLING PIPELINE - FINAL SUMMARY

Original Dataset:
  Rows: 12,575
  Missing values: 5 columns affected

STEP 1 - Total Spent (Listwise Deletion):
  Rows dropped: 604 (4.8%)
  Rows retained: 11,971 (95.2%)
  Side effect: Quantity also 100% complete

STEP 2 - Price Per Unit (Deterministic Reconstruction):
  Values reconstructed: 609 (using formula: Total ÷ Quantity)
  Estimation error: 0% (deterministic)
  Price Per Unit: 100% complete

STEP 3 - Item (Mode Imputation):
  Values imputed: 609 (mode by category)
  Item: 100% complete
  Category consistency: 100%

STEP 4 - Discount Applied (Unknown Category):
  Values handled: 3,988 (converted to "Unknown")
  Discount Applied: 100% complete

Final Dataset Status:
  Final rows: 11,971
  Data retention: 95.2%
  Total missing values: 0
  ✓ ALL columns 100% complete!

Data Quality Metrics:
  ✓ Mathematical consistency: 100% (Total = Price × Quantity)
  ✓ Category-Item consistency: 100%
  ✓ No estimation error in Price Per

### Persist final cleaned dataset

Save the completely cleaned dataset - this is the final output of the entire pipeline


In [12]:
# Save the final cleaned dataset to CSV - the FINAL output of the 4-step
# missing data handling pipeline.
# NOTE(review): the stored output of this cell shows a '../output_data/...'
# path, which differs from the path below - re-run the cell to confirm it.
output_path = '../../output_data/4_discount_applied/final_cleaned_dataset.csv'
# Create the target folder first so the save does not fail when it is missing
Path(output_path).parent.mkdir(parents=True, exist_ok=True)
# index=False prevents writing row numbers as a column
data.to_csv(output_path, index=False)
print(f'✓ FINAL cleaned dataset saved to {output_path}')
print(f'  Final row count: {len(data):,}')
print(f'  All critical columns: 100% complete')
print(f'  Ready for analysis!')


✓ FINAL cleaned dataset saved to ../output_data/4_discount_applied/final_cleaned_dataset.csv
  Final row count: 11,971
  All critical columns: 100% complete
  Ready for analysis!


### Summary

**Discount Applied Handling - STEP 4 Complete**

**Classification:** MCAR (Missing Completely At Random)
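- Missingness is roughly uniform (~33%, ranging from about 31% to 36%) across all categories, payment methods, and locations
- The distribution of TRUE/FALSE in observed data is balanced (approximately 50/50); this describes the observed values, not the missingness mechanism
- The missing pattern shows no relationship with the other variables examined (Category, Payment Method, Location)
- Most likely a data collection issue where the field was randomly not recorded; dependence on the unobserved values (MNAR) cannot be ruled out from the data alone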

**Method:** Create Unknown category
- Converted 3,988 missing values to "Unknown" string
- Preserves all transaction data (no rows dropped)
- Transparent handling - explicitly indicates unknown status

**Justification:**
- Under MCAR, deletion would not bias the results, so the method is chosen for data retention and transparency
- Too much data to drop (33% of dataset)
- Unknown category is most transparent and honest
- Avoids false certainty from imputation
- Maintains all other complete columns

**Validation results:**
- ✓ All 3,988 missing values converted to "Unknown"
- ✓ Discount Applied is now 100% complete
- ✓ NO missing values in entire dataset
- ✓ All critical columns complete

**Final Pipeline Results:**
- Original rows: 12,575
- Final rows: 11,971 (95.2% retention)
- STEP 1: Deleted 604 rows (Total Spent + Quantity)
- STEP 2: Reconstructed 609 values (Price Per Unit)
- STEP 3: Imputed 609 values (Item)
- STEP 4: Handled 3,988 values (Discount Applied)
- ✓✓✓ ALL COLUMNS 100% COMPLETE ✓✓✓
