In [2]:
import pandas as pd
from pathlib import Path

## Load All Encoded Datasets

Load each encoded dataset to extract the new encoded columns:

In [3]:
# Locations of the per-feature encoding outputs and of the merged result
BASE_PATH = "../../output_data"
OUTPUT_PATH = f"{BASE_PATH}/final_encoded_data/final_fully_encoded_dataset.csv"


def _read_encoded(relative_csv):
    """Read one encoded dataset CSV located under BASE_PATH."""
    return pd.read_csv(f"{BASE_PATH}/{relative_csv}")


# One DataFrame per encoding step; the numeric folder prefix is the step order
df_customer = _read_encoded("1_customer_id/encoded_customer_id_dataset.csv")
df_location = _read_encoded("2_location/location_binary_encoded.csv")
df_payment = _read_encoded("3_payment_method/encoded_payment_method_dataset.csv")
df_discount = _read_encoded("4_discount_applied/discount_applied_one_hot_encoded.csv")
df_category = _read_encoded("5_category/encoded_category_dataset.csv")
df_item = _read_encoded("6_item/encoded_item_dataset.csv")

## Merge Encoded Columns

Merge all encoded columns based on Transaction ID:

In [4]:
# Start with category dataset (has the most accumulated columns from previous steps)
final_df = df_category.copy()

# Key shared by the encoded datasets.
# NOTE(review): assumes every encoded CSV still carries this column — the
# helper below raises a clear error if one does not.
MERGE_KEY = 'Transaction ID'


def _add_encoded_columns(base, source, columns, key=MERGE_KEY):
    """Attach `columns` from `source` to `base`, matching rows on `key`.

    Plain column assignment (`base[col] = source[col]`) aligns on the
    positional index, so a different row order or row count between the two
    files would silently pair features with the wrong transaction. Joining
    on the key makes the alignment explicit and verifiable.

    Parameters:
        base: DataFrame that receives the new columns.
        source: DataFrame providing the encoded columns.
        columns: names of the encoded columns to copy from `source`.
        key: column identifying a row in both frames.

    Returns:
        A new DataFrame with the rows of `base` (same order) plus `columns`.

    Raises:
        KeyError: if `source` lacks `key` or any of `columns`.
        ValueError: if some `key` values of `base` are absent from `source`.
        pandas.errors.MergeError: if `key` is not unique in either frame.
    """
    required = [key, *columns]
    absent = [col for col in required if col not in source.columns]
    if absent:
        raise KeyError(f"Source dataset is missing expected columns: {absent}")

    unmatched = ~base[key].isin(source[key])
    if unmatched.any():
        raise ValueError(
            f"{int(unmatched.sum())} rows have no matching '{key}' in the source dataset"
        )

    # Drop any pre-existing copy so the merge cannot create _x/_y duplicates
    base = base.drop(columns=columns, errors='ignore')
    # how='left' preserves the row order of `base`; validate guards key uniqueness
    return base.merge(source[required], on=key, how='left', validate='one_to_one')


# (source dataset, encoded columns to take from it), in final column order
encoded_sources = [
    # Customer ID Target Encoding
    (df_customer, ['Customer ID Target Encoded']),
    # Item Target Encoding
    (df_item, ['Item Target Encoded']),
    # Location encoding
    (df_location, ['Location_Encoded']),
    # Payment Method encodings
    (df_payment, ['Payment_Cash', 'Payment_Credit Card', 'Payment_Digital Wallet']),
    # Discount Applied encodings
    (df_discount, ['Discount_False', 'Discount_True', 'Discount_Unknown']),
]

for source_df, encoded_columns in encoded_sources:
    final_df = _add_encoded_columns(final_df, source_df, encoded_columns)

print(f"\nCombined dataset shape: {final_df.shape}")
print(f"Column count: {len(final_df.columns)}")


Combined dataset shape: (11971, 28)
Column count: 28


## Drop Original Categorical Columns

Remove the original categorical columns, keeping only the identifier, numerical, date, and encoded features:

In [5]:
# Raw categorical columns that are now represented by encoded features
columns_to_drop = [
    'Customer ID',
    'Category',
    'Item',
    'Payment Method',
    'Location',
    'Discount Applied',
]

# Keep everything except the raw categoricals
final_df = final_df.drop(columns_to_drop, axis=1)

# Report the resulting shape and the surviving column names
remaining_columns = final_df.columns.tolist()
print(
    "After dropping original categorical columns:",
    f"Shape: {final_df.shape}",
    f"\nRemaining columns ({len(remaining_columns)}):",
    remaining_columns,
    sep="\n",
)

After dropping original categorical columns:
Shape: (11971, 22)

Remaining columns (22):
['Transaction ID', 'Price Per Unit', 'Quantity', 'Total Spent', 'Transaction Date', 'cat_Beverages', 'cat_Butchers', 'cat_Computers and electric accessories', 'cat_Electric household essentials', 'cat_Food', 'cat_Furniture', 'cat_Milk Products', 'cat_Patisserie', 'Customer ID Target Encoded', 'Item Target Encoded', 'Location_Encoded', 'Payment_Cash', 'Payment_Credit Card', 'Payment_Digital Wallet', 'Discount_False', 'Discount_True', 'Discount_Unknown']


## Validation

Verify the final encoded dataset:

In [6]:
# Validation of the final encoded dataset.
# Every check is printed for the reader AND enforced by an assertion at the
# end of the cell, so a broken dataset stops the notebook before it is saved.

# Row count stated as the expectation for this dataset
EXPECTED_ROW_COUNT = 11_971
# Columns carried over unchanged (identifier + numerical/date columns)
PASSTHROUGH_COLS = ['Transaction ID', 'Price Per Unit', 'Quantity', 'Total Spent', 'Transaction Date']


def _is_one_hot(frame):
    """Return True when every value is 0/1 and each row sums to exactly 1.

    Checking the row sum alone would accept rows such as (2, -1, 0).
    """
    only_binary = frame.isin([0, 1]).all().all()
    rows_sum_to_one = (frame.sum(axis=1) == 1).all()
    return bool(only_binary and rows_sum_to_one)


checks = {}

# 1. No missing values in encoded columns
encoded_cols = [col for col in final_df.columns if col not in PASSTHROUGH_COLS]
no_missing = final_df[encoded_cols].isna().sum().sum() == 0
checks['no missing values in encoded columns'] = bool(no_missing)
print(f"1. No missing values in encoded columns: {no_missing}")

# 2. One-hot columns sum to 1 per row (Payment Method)
payment_cols = ['Payment_Cash', 'Payment_Credit Card', 'Payment_Digital Wallet']
payment_sum_check = _is_one_hot(final_df[payment_cols])
checks['payment one-hot'] = payment_sum_check
print(f"2. Payment Method one-hot sums to 1 per row: {payment_sum_check}")

# 3. One-hot columns sum to 1 per row (Discount Applied)
discount_cols = ['Discount_False', 'Discount_True', 'Discount_Unknown']
discount_sum_check = _is_one_hot(final_df[discount_cols])
checks['discount one-hot'] = discount_sum_check
print(f"3. Discount Applied one-hot sums to 1 per row: {discount_sum_check}")

# 4. Category one-hot columns sum to 1 per row
category_cols = [col for col in final_df.columns if col.startswith('cat_')]
category_sum_check = _is_one_hot(final_df[category_cols])
checks['category one-hot'] = category_sum_check
print(f"4. Category one-hot sums to 1 per row: {category_sum_check}")

# 5. Binary encoding has only 0/1
binary_check = final_df['Location_Encoded'].isin([0, 1]).all()
checks['location encoding is 0/1'] = bool(binary_check)
print(f"5. Location encoding contains only 0/1: {binary_check}")

# 6. Row count preserved
checks['row count'] = len(final_df) == EXPECTED_ROW_COUNT
print(f"6. Row count: {len(final_df)} (expected: {EXPECTED_ROW_COUNT:,})")

# 7. Column count check: the breakdown below must add up to the real total
expected_column_count = (
    len(PASSTHROUGH_COLS)   # identifier + numerical
    + 1                     # Customer ID encoding
    + 1                     # Item encoding
    + len(category_cols)    # Category encoding
    + 1                     # Location encoding
    + len(payment_cols)     # Payment encoding
    + len(discount_cols)    # Discount encoding
)
checks['column count'] = len(final_df.columns) == expected_column_count
print("\n7. Final column breakdown:")
print("   - Identifiers: 1 (Transaction ID)")
print("   - Numerical: 4 (Price Per Unit, Quantity, Total Spent, Transaction Date)")
print("   - Customer ID encoding: 1")
print("   - Item encoding: 1")
print(f"   - Category encoding: {len(category_cols)}")
print("   - Location encoding: 1")
print("   - Payment encoding: 3")
print("   - Discount encoding: 3")
print(f"   - TOTAL: {len(final_df.columns)} columns")

# Fail loudly so an invalid dataset is never written to disk
failed_checks = [name for name, passed in checks.items() if not passed]
assert not failed_checks, f"Validation failed: {failed_checks}"

1. No missing values in encoded columns: True
2. Payment Method one-hot sums to 1 per row: True
3. Discount Applied one-hot sums to 1 per row: True
4. Category one-hot sums to 1 per row: True
5. Location encoding contains only 0/1: True
6. Row count: 11971 (expected: 11,971)

7. Final column breakdown:
   - Identifiers: 1 (Transaction ID)
   - Numerical: 4 (Price Per Unit, Quantity, Total Spent, Transaction Date)
   - Customer ID encoding: 1
   - Item encoding: 1
   - Category encoding: 8
   - Location encoding: 1
   - Payment encoding: 3
   - Discount encoding: 3
   - TOTAL: 22 columns


## Preview Final Dataset

Display the first few rows of the final dataset:

In [13]:
# A bare last-line expression uses the notebook's rich table rendering;
# print() falls back to wrapped plain text that is hard to read for wide frames.
final_df.head()

  Transaction ID  Price Per Unit  Quantity  Total Spent Transaction Date  \
0    TXN_1002182            11.0       5.0         55.0       2024-10-08   
1    TXN_1003865             6.5       5.0         32.5       2022-03-12   
2    TXN_1003940            11.0       9.0         99.0       2022-04-22   
3    TXN_1004091            41.0       3.0        123.0       2023-11-09   
4    TXN_1004124            14.0       5.0         70.0       2022-03-02   

   cat_Beverages  cat_Butchers  cat_Computers and electric accessories  \
0              0             0                                       0   
1              0             0                                       0   
2              0             0                                       0   
3              0             0                                       0   
4              0             0                                       1   

   cat_Electric household essentials  cat_Food  cat_Furniture  \
0                                

## Save Final Encoded Dataset

Save the fully encoded dataset to output directory:

In [7]:
# Resolve the destination once and make sure its folder exists
output_file = Path(OUTPUT_PATH)
output_file.parent.mkdir(parents=True, exist_ok=True)

# Write the final encoded dataset as CSV, without the DataFrame index column
final_df.to_csv(output_file, index=False)