In [3]:
# -----------------------------------------------------------------------------------
# Phase 3: Data Preprocessing
# -----------------------------------------------------------------------------------
# Objective:
# Prepare the synthetic financial transaction dataset for machine learning by
# cleaning, transforming, and encoding the data.

# Key Steps:
# 1. Handle missing values (if any).
# 2. Convert categorical columns (e.g., vendor, category, payment method) into
#    numerical representations suitable for ML models.
# 3. Parse and extract features from dates (e.g., day of week, month).
# 4. Normalize or scale the transaction amount column.
# 5. Finalize the feature set (X) and target labels (y).

# Output:
# A clean and ready-to-use dataset for training and evaluation in Phase 4.

# Note:
# This notebook assumes you already ran `01_generate_data.ipynb` and
# `02_data_exploration.ipynb`, and have the dataset saved as `data/synthetic_finory_transactions.csv`.
# -----------------------------------------------------------------------------------

In [5]:
# Load the synthetic Finory transaction dataset (5,000 rows) for preprocessing
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv("../data/synthetic_finory_transactions.csv")
df.head()

Unnamed: 0,transaction_id,vendor,amount,category,date,payment_method,note
0,e541e1ae-893d-4479-a25d-326c8f7b559e,NortonLifeLock,18.77,SecuritySystem,2025-06-12,Visa,Story do here.
1,23b8a680-a487-4131-ba03-3a191faf5232,Republic Services,120.4,Trash,2025-05-01,MasterCard,Black board dark toward data economic.
2,70dc4c45-ede7-4a89-afae-cb1949fe8992,Consolidated Edison,52.67,Utilities,2025-06-05,Visa,Act peace stock whether.
3,1c80e262-f04e-41e4-a138-ff16bb52efdd,Paychex,36.52,Payroll,2025-02-21,Visa,Cost receive contain hit.
4,00c48818-534c-41f7-a229-b4386cafac1a,Molson Coors Beverage Company,6.78,Alcohol,2025-03-24,Visa,Unit magazine ten.


In [7]:
# Check for any missing/null values in each column
df.isnull().sum()

transaction_id    0
vendor            0
amount            0
category          0
date              0
payment_method    0
note              0
dtype: int64

In [9]:
# View data types and non-null counts; ensure 'date' column is in datetime format
df.info()

# If needed, convert 'date' column to datetime
df['date'] = pd.to_datetime(df['date'])

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   transaction_id  5000 non-null   object 
 1   vendor          5000 non-null   object 
 2   amount          5000 non-null   float64
 3   category        5000 non-null   object 
 4   date            5000 non-null   object 
 5   payment_method  5000 non-null   object 
 6   note            5000 non-null   object 
dtypes: float64(1), object(6)
memory usage: 273.6+ KB


In [11]:
# Extract day of week, month, and log-transformed amount for feature engineering
df['day_of_week'] = df['date'].dt.dayofweek  # 0=Monday
df['month'] = df['date'].dt.month
df['amount_log'] = np.log1p(df['amount'])

In [21]:
# ✅ Save the fully preprocessed dataset for later model training
df.to_csv("../data/synthetic_finory_preprocessed.csv", index=False)
print("✅ Preprocessed dataset saved as synthetic_finory_preprocessed.csv")

✅ Preprocessed dataset saved as synthetic_finory_preprocessed.csv


In [15]:
# Encode categorical variables (vendor, payment_method, category) as numerical labels
from sklearn.preprocessing import LabelEncoder

le_vendor = LabelEncoder()
le_payment = LabelEncoder()
le_category = LabelEncoder()  # Target

df['vendor_encoded'] = le_vendor.fit_transform(df['vendor'])
df['payment_encoded'] = le_payment.fit_transform(df['payment_method'])
df['category_encoded'] = le_category.fit_transform(df['category'])

In [17]:
# Split the dataset into training and testing sets (80% train, 20% test)
from sklearn.model_selection import train_test_split

X = df[['vendor_encoded', 'amount_log', 'payment_encoded', 'day_of_week', 'month']]
y = df['category_encoded']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [19]:
# -----------------------------------------------------------------------------------
# Phase 3: Data Preprocessing
# -----------------------------------------------------------------------------------
# Objective:
# Prepare the synthetic transaction data for machine learning by engineering
# useful features, handling categorical variables, and splitting into training
# and testing sets.
#
# Key Steps:
# - Checked for missing values
# - Parsed and extracted date-related features (day of week, month)
# - Log-transformed skewed 'amount' field
# - Encoded categorical variables (vendor, payment method, category)
# - Split dataset into train/test sets for modeling
# -----------------------------------------------------------------------------------