In [82]:
# -----------------------------------------------------------------------------------
# Phase 3: Data Preprocessing
# -----------------------------------------------------------------------------------
# Objective:
# Prepare the synthetic financial transaction dataset for machine learning by
# cleaning, transforming, and encoding the data.

# Key Steps:
# 1. Handle missing values (if any).
# 2. Convert categorical columns (e.g., vendor, category, payment method) into
#    numerical representations suitable for ML models.
# 3. Parse and extract features from dates (e.g., day of week, month).
# 4. Log-transform the transaction amount column to reduce skew.
# 5. Finalize the feature set (X) and target labels (y).

# Output:
# A clean and ready-to-use dataset for training and evaluation in Phase 4.

# Note:
# This notebook assumes you already ran `01_generate_data.ipynb` and
# `02_data_exploration.ipynb`, and have the dataset saved as `data/synthetic_finory_transactions.csv`.
# -----------------------------------------------------------------------------------

In [84]:
# Read the raw synthetic Finory transactions (5,000 rows) that this notebook preprocesses
import pandas as pd
import numpy as np

# Relative path, resolved against the notebook's working directory
RAW_TRANSACTIONS_CSV = "../data/synthetic_finory_transactions.csv"

df = pd.read_csv(RAW_TRANSACTIONS_CSV)

# Preview the first rows as the cell's displayed result
df.head()

Unnamed: 0,transaction_id,vendor,amount,category,date,payment_method,note
0,e541e1ae-893d-4479-a25d-326c8f7b559e,NortonLifeLock,18.77,SecuritySystem,2025-06-12,Visa,Story do here.
1,23b8a680-a487-4131-ba03-3a191faf5232,Republic Services,120.4,Trash,2025-05-01,MasterCard,Black board dark toward data economic.
2,70dc4c45-ede7-4a89-afae-cb1949fe8992,Consolidated Edison,52.67,Utilities,2025-06-05,Visa,Act peace stock whether.
3,1c80e262-f04e-41e4-a138-ff16bb52efdd,Paychex,36.52,Payroll,2025-02-21,Visa,Cost receive contain hit.
4,00c48818-534c-41f7-a229-b4386cafac1a,Molson Coors Beverage Company,6.78,Alcohol,2025-03-24,Visa,Unit magazine ten.


In [86]:
# -----------------------------------------------------------------------------------
# Vendor Grouping: collapse the 330+ raw vendor names into a handful of broader
# groups so the model generalizes across vendors instead of memorizing names
# -----------------------------------------------------------------------------------
from vendor_mapping import vendor_groups  # project-local vendor -> group mapping (vendor_mapping.py)

# Look up the broader group for every vendor (e.g., Amazon → Retail).
# Vendors absent from the mapping come back as NaN from .map() ...
vendor_group_lookup = df['vendor'].map(vendor_groups)

# ... and are labelled 'Other' so the column has no missing values.
df['vendor_group'] = vendor_group_lookup.fillna('Other')

print("✅ Vendor grouping applied. Sample:")
print(df[['vendor', 'vendor_group']].head())

✅ Vendor grouping applied. Sample:
                          vendor         vendor_group
0                 NortonLifeLock           Technology
1              Republic Services   Energy & Utilities
2            Consolidated Edison   Energy & Utilities
3                        Paychex  Finance & Insurance
4  Molson Coors Beverage Company               Retail


In [88]:
# ✅ Collapse the detailed categories into ~10 broader groups for better model performance.
# Each broad group lists the detailed categories it absorbs.
category_groups = {
    'Food & Dining': ('Groceries', 'Alcohol', 'DiningOut'),
    'Shopping & Electronics': ('Electronics', 'Software', 'Hardware'),
    'Transport & Travel': ('Gas', 'Transportation', 'Travel'),
    'Housing & Bills': ('Rent', 'Utilities', 'Insurance'),
    'Entertainment': ('Entertainment', 'Subscriptions', 'Games'),
    'Financial Services': ('BankFees', 'Interest', 'LoanPayments'),
    'Healthcare': ('Healthcare', 'Pharmacy', 'Medical'),
    'Education': ('Education', 'Books', 'OnlineCourses'),
    'Gifts & Charity': ('Charity', 'Donations', 'Gifts'),
    'Other': ('Other',),
}

# Invert the table into the detailed -> broad lookup that Series.map() expects.
category_merge_map = {
    detailed: broad
    for broad, detailed_names in category_groups.items()
    for detailed in detailed_names
}

# Apply the merge. Any category missing from the lookup maps to NaN and is then
# labelled 'Other'.
# NOTE(review): the sample printed below shows SecuritySystem, Trash and Payroll all
# landing in 'Other' — confirm this catch-all is intended rather than a gap in the map.
merged_categories = df['category'].map(category_merge_map)
df['category_merged'] = merged_categories.fillna('Other')

print("✅ Category merging applied. Sample:")
print(df[['category', 'category_merged']].head())

✅ Category merging applied. Sample:
         category  category_merged
0  SecuritySystem            Other
1           Trash            Other
2       Utilities  Housing & Bills
3         Payroll            Other
4         Alcohol    Food & Dining


In [90]:
# Count missing/null entries per column; the Series is the cell's displayed result
missing_per_column = df.isnull().sum()
missing_per_column

transaction_id     0
vendor             0
amount             0
category           0
date               0
payment_method     0
note               0
vendor_group       0
category_merged    0
dtype: int64

In [92]:
# Show dtypes and non-null counts as loaded (this runs before the date conversion below)
df.info()

# Parse 'date' into datetime values; the .dt accessors used for feature extraction need this
parsed_dates = pd.to_datetime(df['date'])
df['date'] = parsed_dates

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   transaction_id   5000 non-null   object 
 1   vendor           5000 non-null   object 
 2   amount           5000 non-null   float64
 3   category         5000 non-null   object 
 4   date             5000 non-null   object 
 5   payment_method   5000 non-null   object 
 6   note             5000 non-null   object 
 7   vendor_group     5000 non-null   object 
 8   category_merged  5000 non-null   object 
dtypes: float64(1), object(8)
memory usage: 351.7+ KB


In [94]:
# Derive calendar features and a log-scaled amount for feature engineering
date_parts = df['date'].dt
df['day_of_week'] = date_parts.dayofweek  # 0=Monday
df['month'] = date_parts.month

# log1p(x) = log(1 + x)
# NOTE(review): assumes amounts are non-negative — TODO confirm; log1p is NaN below -1
df['amount_log'] = np.log1p(df['amount'])

In [96]:
# ✅ Turn grouped vendor, payment method, and merged category into integer labels
from sklearn.preprocessing import LabelEncoder

# One encoder per column, each kept under its own name so the fitted mapping
# stays available after this cell (they are saved to disk later in the notebook).
le_vendor, le_payment, le_category = LabelEncoder(), LabelEncoder(), LabelEncoder()

# (encoder, source column, encoded column) — the grouped/merged columns are encoded,
# not the raw vendor name or the original category.
# NOTE(review): LabelEncoder produces arbitrary integer codes; for the two feature
# columns this implies an ordering — confirm the downstream model tolerates that.
encoding_plan = [
    (le_vendor, 'vendor_group', 'vendor_encoded'),
    (le_payment, 'payment_method', 'payment_encoded'),
    (le_category, 'category_merged', 'category_encoded'),
]

for encoder, source_col, encoded_col in encoding_plan:
    df[encoded_col] = encoder.fit_transform(df[source_col])

print("✅ Vendor group, payment method, and merged category encoded successfully!")
print(df[['vendor_group', 'vendor_encoded', 'payment_method', 'payment_encoded',
          'category_merged', 'category_encoded']].head())

✅ Vendor group, payment method, and merged category encoded successfully!
          vendor_group  vendor_encoded payment_method  payment_encoded  \
0           Technology               9           Visa                4   
1   Energy & Utilities               0     MasterCard                2   
2   Energy & Utilities               0           Visa                4   
3  Finance & Insurance               2           Visa                4   
4               Retail               8           Visa                4   

   category_merged  category_encoded  
0            Other                 5  
1            Other                 5  
2  Housing & Bills                 4  
3            Other                 5  
4    Food & Dining                 2  


In [98]:

# Imported here because no earlier cell brings train_test_split into scope; without
# this line the cell raises NameError when the notebook is run on a fresh kernel.
from sklearn.model_selection import train_test_split

# Define feature columns (X) and target (y)
X = df[['vendor_encoded', 'amount_log', 'payment_encoded', 'day_of_week', 'month']]
y = df['category_encoded']

# 80% training, 20% testing; the fixed random_state makes the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("✅ Data split into training and test sets!")
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Test set size: {X_test.shape[0]} samples")

✅ Data split into training and test sets!
Training set size: 4000 samples
Test set size: 1000 samples


In [100]:
# ✅ Persist the fully preprocessed frame (original columns plus the engineered and
# encoded ones) for later model training
preprocessed_csv_path = "../data/synthetic_finory_preprocessed.csv"
df.to_csv(preprocessed_csv_path, index=False)  # index=False: do not write the row index
print("✅ Preprocessed dataset saved as synthetic_finory_preprocessed.csv")

✅ Preprocessed dataset saved as synthetic_finory_preprocessed.csv


In [102]:
# -----------------------------------------------------------------------------------
# Phase 3: Data Preprocessing
# -----------------------------------------------------------------------------------
# Objective:
# Prepare the synthetic transaction data for machine learning by engineering
# useful features, handling categorical variables, and splitting into training
# and testing sets.
#
# Key Steps:
# - Checked for missing values
# - Parsed and extracted date-related features (day of week, month)
# - Log-transformed skewed 'amount' field
# - Label-encoded categorical variables (grouped vendor, payment method, merged category)
# - Split dataset into train/test sets for modeling
# -----------------------------------------------------------------------------------

In [104]:
# Persist the fitted label encoders so the same label <-> integer mappings can be reused later
import joblib
import os

# Create the output directory if it is not there yet (no error when it already exists)
os.makedirs("../models", exist_ok=True)

# Bundle all three encoders into a single artifact, keyed by role
label_encoders = dict(
    vendor_encoder=le_vendor,
    payment_encoder=le_payment,
    category_encoder=le_category,
)

joblib.dump(label_encoders, "../models/finory_label_encoders.joblib")
print("✅ Label encoders saved successfully!")

✅ Label encoders saved successfully!
