<H1 align="center"> Automated Expense Categorization - Notebook 02: Baseline Model Development </H1>



## 1. Setup

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.ensemble import RandomForestRegressor

## 2. Load data

In [2]:
# Read the processed transactions CSV; the path is relative to the
# notebook's working directory.
transactions_csv = '../data/processed/transactions_long.csv'
df = pd.read_csv(transactions_csv)

In [3]:
# -----------------------------------------------------------
# Step 2.1: Label Encoding for Categories
#   Maps each spending category name to an integer code and
#   stores the result in df['category_encoded'].
# -----------------------------------------------------------

category_map = {
    'Groceries': 1,
    'Eating_Out': 2,
    'Entertainment': 3,
    'Transport': 4,
    'Utilities': 5,
    'Healthcare': 6,
    'Education': 7,
    'Miscellaneous': 8,
}

category_codes = df['category'].map(category_map)

# Series.map leaves NaN for any value that is not a key of the dict. Fail
# here with the offending values instead of letting NaN reach the models.
unmapped_rows = category_codes.isna()
if unmapped_rows.any():
    unknown_values = df.loc[unmapped_rows, 'category'].unique().tolist()
    raise ValueError(
        f"{int(unmapped_rows.sum())} row(s) have a category missing from "
        f"category_map: {unknown_values}"
    )

df['category_encoded'] = category_codes

## 3. Define features and target

In [4]:
# Display every column name so the feature/target selection in the next
# cells can be checked against it.
list(df.columns)

['Income',
 'Age',
 'Dependents',
 'Occupation',
 'City_Tier',
 'Rent',
 'Loan_Repayment',
 'Insurance',
 'Desired_Savings_Percentage',
 'Desired_Savings',
 'Disposable_Income',
 'Potential_Savings_Groceries',
 'Potential_Savings_Transport',
 'Potential_Savings_Eating_Out',
 'Potential_Savings_Entertainment',
 'Potential_Savings_Utilities',
 'Potential_Savings_Healthcare',
 'Potential_Savings_Education',
 'Potential_Savings_Miscellaneous',
 'entity_id',
 'category',
 'amount',
 'category_encoded']

In [5]:
# -----------------------------------------------------------
# Step 3.1: Predict Amount for Savings
#   Target:   Desired_Savings
#   Features: every column except the target, the identifier
#             and the non-numeric columns
# -----------------------------------------------------------

savings_target = "Desired_Savings"
savings_excluded = [savings_target, "entity_id", "Occupation", "City_Tier", "category"]

# Step 3.2 adds an "Overspend_Flag" column to df in place. Dropping it here
# when present keeps the feature set identical whether this cell runs before
# or after Step 3.2; errors="ignore" is limited to that one optional column
# so a typo in the list above still raises.
x_savings = (
    df.drop(columns=savings_excluded)
      .drop(columns=["Overspend_Flag"], errors="ignore")
)
y_savings = df[savings_target]

# NOTE(review): Income and Desired_Savings_Percentage remain as features; if
# Desired_Savings is derived from them the target is trivially recoverable —
# confirm against the data preparation notebook.

In [6]:
# -----------------------------------------------------------
# Step 3.2 Predict Overspending
#   Target:   Overspend_Flag (1 when amount > Disposable_Income)
#   Features: every column except the target, the identifier,
#             the non-numeric columns and "amount"
# -----------------------------------------------------------

# The flag is stored on df as a new column (in-place mutation of df).
df["Overspend_Flag"] = (df["amount"] > df["Disposable_Income"]).astype(int)

# "amount" is excluded from the features: the label is defined directly as
# amount > Disposable_Income, so keeping both columns would let any model
# reconstruct the label instead of learning from the other attributes.
spending_excluded = [
    "Overspend_Flag",
    "amount",
    "entity_id",
    "category",
    "Occupation",
    "City_Tier",
]
x_spending = df.drop(columns=spending_excluded)
y_spending = df["Overspend_Flag"]


In [7]:
# -----------------------------------------------------------
# Step 3.3 Predict Spending Category
#   TODO: not implemented yet. The sketch below is disabled and
#   the list of target columns is still empty.
# -----------------------------------------------------------
# future_spending = [
    
# ]
# y_future_spending = df[future_spending]
# x_future_spending = pd.get_dummies(df, columns=["category"], drop_first=True)


# 4. Split data into train and test

In [8]:
# -----------------------------------------------------------
# Step 4.1 Split data for the savings model
# -----------------------------------------------------------
# 20% of the rows are held out for testing; the fixed seed makes the
# split repeatable.
# NOTE(review): rows are split independently of entity_id; if one entity
# spans several rows they can land on both sides of the split — confirm
# whether a group-wise split is needed.

savings_split = train_test_split(x_savings, y_savings, test_size=0.2, random_state=42)
X_savings_train, X_savings_test, y_savings_train, y_savings_test = savings_split

In [9]:
# -----------------------------------------------------------
# Step 4.2 Split data for the overspending model
# -----------------------------------------------------------
# 20% of the rows are held out for testing; the fixed seed makes the
# split repeatable.

spending_split = train_test_split(x_spending, y_spending, test_size=0.2, random_state=42)
X_spending_train, X_spending_test, y_spending_train, y_spending_test = spending_split

# 5. Scale

In [10]:
# -----------------------------------------------------------
# Step 5.1 Scale features for savings model
# -----------------------------------------------------------
# The scaling statistics come from the training rows only; the same
# fitted transform is then applied to the held-out rows.

scaler_s = StandardScaler()
X_savings_train_scaled = scaler_s.fit_transform(X_savings_train)
X_savings_test_scaled = scaler_s.transform(X_savings_test)

In [11]:
# -----------------------------------------------------------
# Step 5.2 Scale features for overspending model
# -----------------------------------------------------------
# A separately named scaler is used so the scaler fitted for the savings
# model in Step 5.1 (scaler_s) is not overwritten and stays usable.
# The scaling statistics come from the training rows only; the same
# fitted transform is then applied to the held-out rows.

scaler_spending = StandardScaler()
X_spending_train_scaled = scaler_spending.fit_transform(X_spending_train)
X_spending_test_scaled = scaler_spending.transform(X_spending_test)

# 6. Baseline models
 - Linear Regression
 - Linear SVM
 - Random Forest

In [12]:
# -----------------------------------------------------------
# Step 6.1 Linear Regression for Savings Prediction
# -----------------------------------------------------------
# fit() returns the estimator itself, so construction and training are chained.
lr_savings = LinearRegression().fit(X_savings_train_scaled, y_savings_train)

# Predictions on the held-out rows.
y_savings_pred_lr = lr_savings.predict(X_savings_test_scaled)

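SVM Model <br>
Expensive to run: `SVR(kernel='linear')` uses the libsvm kernel solver, whose training time grows between O(n^2) and O(n^3) with the number of samples <br>
This cell took more than 7 minutes to run
<br> Speed is a known constraint of this solver on large datasets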

In [13]:
# -----------------------------------------------------------
# Step 6.2 Linear SVM for Savings Prediction
# -----------------------------------------------------------
# The model gets its own name so the spending SVM trained in Step 6.5 does
# not overwrite it.

svm_savings = SVR(kernel='linear')
svm_savings.fit(X_savings_train_scaled, y_savings_train)

# Predictions on the held-out rows, matching the other savings models
# (y_savings_pred_lr, y_savings_pred_rf).
y_savings_pred_svm = svm_savings.predict(X_savings_test_scaled)


0,1,2
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


Random Forest Configuration
 - All constructor parameters are spelled out explicitly (starting from the scikit-learn defaults) so they are easy to tune
 - Hyperparameter tuning still needs to be investigated

In [14]:
# -----------------------------------------------------------
# Step 6.3 Random Forest for Savings Prediction
# -----------------------------------------------------------

rf_savings = RandomForestRegressor(
    n_estimators=100,              # Number of trees in the forest
    criterion='squared_error',     # Function to measure the quality of a split ('squared_error' for regression)
    max_depth=None,                # No maximum depth; trees expand until leaves are pure or too small to split
    min_samples_split=2,           # Minimum samples required to split a node
    min_samples_leaf=1,            # Minimum samples required at a leaf node
    min_weight_fraction_leaf=0.0,  # Minimum weighted fraction of the sum total of weights required to be at a leaf node
    max_features=1.0,              # Fraction of features to consider when looking for best split (1.0 = all)
    max_leaf_nodes=None,           # Unlimited leaf nodes
    min_impurity_decrease=0.0,     # Minimum impurity decrease required to split a node
    bootstrap=True,                # Whether bootstrap samples are used when building trees
    oob_score=False,               # Whether to use out-of-bag samples to estimate R^2
    n_jobs=None,                   # Number of CPU cores to use (None = 1 core)
    random_state=42,               # Fixed seed so the forest is reproducible (same seed as the train/test split)
    verbose=0,                     # Verbosity level (0 = silent)
    warm_start=False,              # Reuse previous solution to add more estimators
    ccp_alpha=0.0,                 # Complexity parameter for Minimal Cost-Complexity Pruning
    max_samples=None,              # Number (or fraction) of samples to draw for each tree if bootstrap=True
    monotonic_cst=None             # Monotonic constraints (rarely used)
)

# Train on the scaled training rows, then predict the held-out rows.
rf_savings.fit(X_savings_train_scaled, y_savings_train)
y_savings_pred_rf = rf_savings.predict(X_savings_test_scaled)

In [15]:
# -----------------------------------------------------------
# Step 6.4 Linear Regression for Spending Prediction
# -----------------------------------------------------------
lr_spending = LinearRegression()
lr_spending.fit(X_spending_train_scaled, y_spending_train)

# Continuous scores for the 0/1 Overspend_Flag target.
y_spending_pred_lr = lr_spending.predict(X_spending_test_scaled)

# The classification metrics planned in section 7 (accuracy, precision,
# recall, F1) need discrete class labels, so the scores are also turned
# into 0/1 labels with a 0.5 cut-off.
y_spending_pred_lr_label = (y_spending_pred_lr >= 0.5).astype(int)

In [16]:
# -----------------------------------------------------------
# Step 6.5 Linear SVM for Spending Prediction
# -----------------------------------------------------------

# max_iter caps the number of solver iterations to avoid a timeout; the
# solver may therefore stop before it has converged.
svm_spending = SVR(kernel='linear', max_iter=1000)
svm_spending.fit(X_spending_train_scaled, y_spending_train)

# Alias kept so any code that still refers to the old name keeps working.
svm_linear = svm_spending

# Continuous scores on the held-out rows, matching the other spending
# models (y_spending_pred_lr, y_spending_pred_rf).
y_spending_pred_svm = svm_spending.predict(X_spending_test_scaled)

# 0/1 labels (0.5 cut-off) for the classification metrics planned in section 7.
y_spending_pred_svm_label = (y_spending_pred_svm >= 0.5).astype(int)



0,1,2
,kernel,'linear'
,degree,3
,gamma,'scale'
,coef0,0.0
,tol,0.001
,C,1.0
,epsilon,0.1
,shrinking,True
,cache_size,200
,verbose,False


In [17]:
# -----------------------------------------------------------
# Step 6.6 Random Forest for Spending Prediction
# -----------------------------------------------------------

rf_spending = RandomForestRegressor(
    n_estimators=100,              # Number of trees in the forest
    criterion='squared_error',     # Function to measure the quality of a split ('squared_error' for regression)
    max_depth=None,                # No maximum depth; trees expand until leaves are pure or too small to split
    min_samples_split=2,           # Minimum samples required to split a node
    min_samples_leaf=1,            # Minimum samples required at a leaf node
    min_weight_fraction_leaf=0.0,  # Minimum weighted fraction of the sum total of weights required to be at a leaf node
    max_features=1.0,              # Fraction of features to consider when looking for best split (1.0 = all)
    max_leaf_nodes=None,           # Unlimited leaf nodes
    min_impurity_decrease=0.0,     # Minimum impurity decrease required to split a node
    bootstrap=True,                # Whether bootstrap samples are used when building trees
    oob_score=False,               # Whether to use out-of-bag samples to estimate R^2
    n_jobs=None,                   # Number of CPU cores to use (None = 1 core)
    random_state=42,               # Fixed seed so the forest is reproducible (same seed as the train/test split)
    verbose=0,                     # Verbosity level (0 = silent)
    warm_start=False,              # Reuse previous solution to add more estimators
    ccp_alpha=0.0,                 # Complexity parameter for Minimal Cost-Complexity Pruning
    max_samples=None,              # Number (or fraction) of samples to draw for each tree if bootstrap=True
    monotonic_cst=None             # Monotonic constraints (rarely used)
)

# Train on the scaled training rows, then predict the held-out rows.
rf_spending.fit(X_spending_train_scaled, y_spending_train)

# Continuous scores for the 0/1 Overspend_Flag target.
y_spending_pred_rf = rf_spending.predict(X_spending_test_scaled)

# 0/1 labels (0.5 cut-off) for the classification metrics planned in section 7.
y_spending_pred_rf_label = (y_spending_pred_rf >= 0.5).astype(int)

# 7. Evaluate Models
1. Regression model evaluation
    - Mean Absolute Error
    - Mean Squared Error
    - Root Mean Squared Error
    - Coefficient of Determination
2. Classification model evaluation
    - Accuracy
    - Precision
    - Recall
    - F1

# 8. Results