<H1 align="center"> Automated Expense Categorization - Notebook 02: Baseline Model Development </H1>



## 1. Setup

In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

## 2. Load data

In [2]:
# Read the processed transactions CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='../data/processed/transactions_long.csv')

In [3]:
# -----------------------------------------------------------
# Step 2.1: Label Encoding for Categories
#   Map each category name to a fixed integer code and fail
#   fast when the data holds a category the map does not cover.
# -----------------------------------------------------------

category_map = {
    'Groceries': 1,
    'Eating_Out': 2,
    'Entertainment': 3,
    'Transport': 4,
    'Utilities': 5,
    'Healthcare': 6,
    'Education': 7,
    'Miscellaneous': 8,
}
df['category_encoded'] = df['category'].map(category_map)

# Series.map yields NaN for any value missing from the mapping; surface that
# here instead of letting NaNs flow into the feature matrices built below.
unmapped_categories = df.loc[df['category_encoded'].isna(), 'category'].unique()
if len(unmapped_categories) > 0:
    raise ValueError(
        f"Unmapped values in 'category': {unmapped_categories.tolist()}"
    )

## 3. Define features and target

In [4]:
# -----------------------------------------------------------
# Step 3.1: Predict Amount for Savings
#   Target:   Desired_Savings
#   Features: all other columns except entity_id and the
#             non-numeric columns named in the drop list
# -----------------------------------------------------------

y_savings = df["Desired_Savings"]
x_savings = df.drop(
    labels=["Desired_Savings", "entity_id", "Occupation", "City_Tier", "category"],
    axis="columns",
)

In [5]:
# -----------------------------------------------------------
# Step 3.2 Predict Overspending
#   Remove non-numeric columns and target variable
# -----------------------------------------------------------

# A row is flagged as overspending when its amount exceeds Disposable_Income.
df["Overspend_Flag"] = (df["amount"] > df["Disposable_Income"]).astype(int)

# NOTE(review): "amount" and "Disposable_Income" stay in the feature matrix even
# though the flag is computed directly from them, so a model can recover the
# label from its inputs. Confirm this is intended before trusting the scores.
x_spending = df.drop(columns=["Overspend_Flag", "entity_id", "category", "Occupation", "City_Tier"])
y_spending = df["Overspend_Flag"]


In [6]:
# # -----------------------------------------------------------
# # Step 3.3 Predict Spending Category
# # ----------------------------------------------------------
# future_spending = [
    
# ]
# y_future_spending = df[future_spending]
# x_future_spending = pd.get_dummies(df, columns=["category"], drop_first=True)


## 4. Split data into train and test

In [7]:
# -----------------------------------------------------------
# Step 4.1 Split data for savings category
#   80% training / 20% testing, seeded so the split is repeatable
# -----------------------------------------------------------

savings_split = train_test_split(x_savings, y_savings, test_size=0.2, random_state=42)
X_savings_train, X_savings_test, y_savings_train, y_savings_test = savings_split

In [8]:
# -----------------------------------------------------------
# Step 4.2 Split data for overspending category
#   80% training / 20% testing, seeded so the split is repeatable
# -----------------------------------------------------------

spending_split = train_test_split(x_spending, y_spending, test_size=0.2, random_state=42)
X_spending_train, X_spending_test, y_spending_train, y_spending_test = spending_split

## 5. Scale

In [9]:
# -----------------------------------------------------------
# Step 5.1 Scale features for savings model
#   The scaler is fitted on the training rows only and then
#   applied unchanged to the test rows.
# -----------------------------------------------------------

scaler_s = StandardScaler()

# fit_transform fits on the training data and returns it scaled in one call.
X_savings_train_scaled = scaler_s.fit_transform(X_savings_train)
X_savings_test_scaled = scaler_s.transform(X_savings_test)

In [10]:
# -----------------------------------------------------------
# Step 5.2 Scale features for overspending model
#   Uses its own scaler object so the scaler fitted for the
#   savings features (scaler_s) is not overwritten.
# -----------------------------------------------------------

scaler_spending = StandardScaler()
scaler_spending.fit(X_spending_train)  # statistics come from the training rows only

X_spending_train_scaled = scaler_spending.transform(X_spending_train)
X_spending_test_scaled = scaler_spending.transform(X_spending_test)

## 6. Baseline models
 - Linear Regression
 - Linear SVM
 - Random Forest

In [11]:
# -----------------------------------------------------------
# Step 6.1 Linear Regression for Savings Prediction
#   Fit on the scaled training features, then predict the
#   scaled test features.
# -----------------------------------------------------------
lr_savings = LinearRegression().fit(X_savings_train_scaled, y_savings_train)
y_savings_pred_lr = lr_savings.predict(X_savings_test_scaled)

**SVM model** <br>
Expensive to run: training time scales between O(n^2) and O(n^3) in the number of samples <br>
Observed runtime: \>7 min <br>
Slow training is a known constraint of this linear-kernel SVM

In [None]:
# -----------------------------------------------------------
# Step 6.2 Linear SVM for Savings Prediction
#   Support vector regressor with a linear kernel, trained on
#   the scaled savings features.
# -----------------------------------------------------------

svm_linear_savings = SVR(kernel='linear').fit(X_savings_train_scaled, y_savings_train)
svm_linear_savings  # show the fitted estimator as the cell output


Random Forest Configuration
 - Every constructor argument is listed explicitly, starting from scikit-learn's default values
 - Further investigation is needed into fine-tuning these hyperparameters

In [None]:
# -----------------------------------------------------------
# Step 6.3 Random Forest for Savings Prediction
#   Hyperparameters are spelled out so they are easy to tune;
#   only random_state departs from the library default so that
#   repeated runs produce the same forest.
# -----------------------------------------------------------

rf_savings = RandomForestRegressor(
    n_estimators=100,              # Number of trees in the forest
    criterion='squared_error',     # Function to measure the quality of a split ('squared_error' for regression)
    max_depth=None,                # No maximum depth; trees expand until all leaves are pure
    min_samples_split=2,           # Minimum samples required to split a node
    min_samples_leaf=1,            # Minimum samples required at a leaf node
    min_weight_fraction_leaf=0.0,  # Minimum weighted fraction of the sum total of weights required to be at a leaf node
    max_features=1.0,              # Number of features to consider when looking for best split
    max_leaf_nodes=None,           # Unlimited leaf nodes
    min_impurity_decrease=0.0,     # Minimum impurity decrease required to split a node
    bootstrap=True,                # Whether bootstrap samples are used when building trees
    oob_score=False,               # Whether to use out-of-bag samples to estimate R^2
    n_jobs=None,                   # Number of CPU cores to use (None = 1 core)
    random_state=42,               # Fixed seed for reproducibility (same seed as the train/test split)
    verbose=0,                     # Verbosity level (0 = silent)
    warm_start=False,              # Reuse previous solution to add more estimators
    ccp_alpha=0.0,                 # Complexity parameter for Minimal Cost-Complexity Pruning
    max_samples=None,              # Number (or fraction) of samples to draw for each tree if bootstrap=True
)


rf_savings.fit(X_savings_train_scaled, y_savings_train)
y_savings_pred_rf = rf_savings.predict(X_savings_test_scaled)

In [None]:
# -----------------------------------------------------------
# Step 6.4 Linear Regression for Spending Prediction
#   Fit on the scaled overspending training features, then
#   predict the scaled test features.
# -----------------------------------------------------------
lr_spending = LinearRegression().fit(X_spending_train_scaled, y_spending_train)
y_spending_pred_lr = lr_spending.predict(X_spending_test_scaled)

In [None]:
# -----------------------------------------------------------
# Step 6.5 Linear SVM for Spending Prediction
# -----------------------------------------------------------

# Cap the solver's iterations so this cell cannot run unbounded (SVR's default
# max_iter=-1 means no limit). scikit-learn warns if the cap is reached before
# convergence; raise the value if that happens.
SVM_SPENDING_MAX_ITER = 10_000

svm_spending = SVR(kernel='linear', max_iter=SVM_SPENDING_MAX_ITER)
svm_spending.fit(X_spending_train_scaled, y_spending_train)

In [None]:
# -----------------------------------------------------------
# Step 6.6 Random Forest for Spending Prediction
#   Same explicit configuration as the savings forest; only
#   random_state departs from the library default so that
#   repeated runs produce the same forest.
# -----------------------------------------------------------

rf_spending = RandomForestRegressor(
    n_estimators=100,              # Number of trees in the forest
    criterion='squared_error',     # Function to measure the quality of a split ('squared_error' for regression)
    max_depth=None,                # No maximum depth; trees expand until all leaves are pure
    min_samples_split=2,           # Minimum samples required to split a node
    min_samples_leaf=1,            # Minimum samples required at a leaf node
    min_weight_fraction_leaf=0.0,  # Minimum weighted fraction of the sum total of weights required to be at a leaf node
    max_features=1.0,              # Number of features to consider when looking for best split
    max_leaf_nodes=None,           # Unlimited leaf nodes
    min_impurity_decrease=0.0,     # Minimum impurity decrease required to split a node
    bootstrap=True,                # Whether bootstrap samples are used when building trees
    oob_score=False,               # Whether to use out-of-bag samples to estimate R^2
    n_jobs=None,                   # Number of CPU cores to use (None = 1 core)
    random_state=42,               # Fixed seed for reproducibility (same seed as the train/test split)
    verbose=0,                     # Verbosity level (0 = silent)
    warm_start=False,              # Reuse previous solution to add more estimators
    ccp_alpha=0.0,                 # Complexity parameter for Minimal Cost-Complexity Pruning
    max_samples=None,              # Number (or fraction) of samples to draw for each tree if bootstrap=True
)


rf_spending.fit(X_spending_train_scaled, y_spending_train)
y_spending_pred_rf = rf_spending.predict(X_spending_test_scaled)

## 7. Evaluate Models
1. Regression model evaluation
    - Mean Absolute Error
    - Mean Squared Error
    - Root Mean Squared Error
    - Coefficient of Determination
2. Classification model evaluation
    - Accuracy
    - Precision
    - Recall
    - F1

In [None]:
# Evaluation: Regression (Savings) and Classification (Overspend)
# numpy, LogisticRegression and RandomForestClassifier are imported in the
# Setup cell together with the rest of the notebook's dependencies.

def regression_metrics(y_true, y_pred):
    """Compute standard regression error metrics.

    Parameters
    ----------
    y_true : array-like
        Ground-truth target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_true``.

    Returns
    -------
    dict
        Keys ``'MAE'``, ``'MSE'``, ``'RMSE'`` and ``'R2'`` mapped to plain
        Python floats.
    """
    mse = mean_squared_error(y_true, y_pred)
    return {
        'MAE': float(mean_absolute_error(y_true, y_pred)),
        'MSE': float(mse),
        # np.sqrt returns a NumPy scalar, which prints as "np.float64(...)"
        # inside a dict under NumPy 2; cast so the printed metrics stay readable.
        'RMSE': float(np.sqrt(mse)),
        'R2': float(r2_score(y_true, y_pred)),
    }

# Score both savings regressors against the held-out targets, then print each
# result under its own heading.
metrics_lr = regression_metrics(y_savings_test, y_savings_pred_lr)
metrics_rf = regression_metrics(y_savings_test, y_savings_pred_rf)

for model_label, model_metrics in (
    ("Linear Regression", metrics_lr),
    ("Random Forest", metrics_rf),
):
    print(f"Savings prediction ({model_label}) metrics:")
    print(model_metrics)

# Alias the scaled overspending split under classifier-oriented names; a missing
# variable is reported as a RuntimeError.
try:
    X_train_cls, X_test_cls = X_spending_train_scaled, X_spending_test_scaled
    y_train_cls, y_test_cls = y_spending_train, y_spending_test
except NameError:
    raise RuntimeError("Spending train/test variables not found in notebook state")

# Regressor predictions are optional: each lookup yields None when the
# corresponding variable does not exist.
notebook_state = globals()
y_spend_pred_lr_cont = notebook_state.get('y_spending_pred_lr')
y_spend_pred_rf_cont = notebook_state.get('y_spending_pred_rf')

def classification_eval(y_true, y_pred, name='Model'):
    """Print a classification summary for one model's predictions.

    Reports accuracy, precision, recall, F1, the confusion matrix and the
    per-class report. Precision, recall, F1 and the report are computed with
    ``zero_division=0``.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predicted class labels.
    name : str, default 'Model'
        Label used in the printed heading.
    """
    zero_div = {'zero_division': 0}

    print(f"\n{name} Results:")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    for label, scorer in (
        ("Precision:", precision_score),
        ("Recall:", recall_score),
        ("F1-score:", f1_score),
    ):
        print(label, scorer(y_true, y_pred, **zero_div))
    print("\nConfusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print("\nDetailed Report:\n", classification_report(y_true, y_pred, **zero_div))

# The regressors emit continuous scores, so cut them at 0.5 to obtain 0/1
# labels before scoring them as classifiers. A model whose predictions are
# absent (None) is skipped.
if y_spend_pred_lr_cont is not None:
    y_pred_lr_bin = (np.asarray(y_spend_pred_lr_cont) >= 0.5).astype(int)
    classification_eval(
        y_true=y_test_cls,
        y_pred=y_pred_lr_bin,
        name='Spending Linear Regression (thresholded)',
    )

if y_spend_pred_rf_cont is not None:
    y_pred_rf_bin = (np.asarray(y_spend_pred_rf_cont) >= 0.5).astype(int)
    classification_eval(
        y_true=y_test_cls,
        y_pred=y_pred_rf_bin,
        name='Spending Random Forest (thresholded)',
    )

# Dedicated classifiers for the overspend flag, trained and scored on the same
# scaled split as the thresholded regressors.
log_clf = LogisticRegression(max_iter=1000).fit(X_train_cls, y_train_cls)
y_pred_log = log_clf.predict(X_test_cls)
classification_eval(y_test_cls, y_pred_log, name='Logistic Regression (classifier)')

rf_clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train_cls, y_train_cls)
y_pred_rf_clf = rf_clf.predict(X_test_cls)
classification_eval(y_test_cls, y_pred_rf_clf, name='Random Forest (classifier)')

# ROC AUC for the logistic regression classifier, using the predicted
# probability of the positive class (column 1 of predict_proba).
# LogisticRegression always provides predict_proba, so no capability check is needed.
y_proba = log_clf.predict_proba(X_test_cls)[:, 1]

# Only the metric call is guarded, so an error raised while predicting is not
# misreported as a single-class problem.
try:
    auc = roc_auc_score(y_test_cls, y_proba)
except ValueError:
    print("\nROC AUC could not be computed (single class present in y_test)")
else:
    print("\nLogistic Regression ROC AUC:", auc)


Savings prediction (Linear Regression) metrics:
{'MAE': 1367.3062541404454, 'MSE': 4752452.789736879, 'RMSE': np.float64(2180.012107704193), 'R2': 0.9268700960784603}
Savings prediction (Random Forest) metrics:
{'MAE': 1.3267414643144364, 'MSE': 537.7841964851087, 'RMSE': np.float64(23.190174567801527), 'R2': 0.9999917246717939}

Spending Linear Regression (thresholded) Results:
Accuracy: 0.9611068359625496
Precision: 0.6981132075471698
Recall: 0.029983792544570502
F1-score: 0.057498057498057496

Confusion Matrix:
 [[29938    16]
 [ 1197    37]]

Detailed Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98     29954
           1       0.70      0.03      0.06      1234

    accuracy                           0.96     31188
   macro avg       0.83      0.51      0.52     31188
weighted avg       0.95      0.96      0.94     31188


Spending Random Forest (thresholded) Results:
Accuracy: 0.9981403103757855
Precision: 0.9867549668874

## 8. Results