<H1 align="center"> Automated Expense Categorization - Notebook 02: Baseline Model Development</H1>



## 1. Setup

In [94]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

## 2. Load data

In [95]:
# Load the long-format transactions CSV from the processed-data folder into `df`.
TRANSACTIONS_CSV = '../data/processed/transactions_long.csv'
df = pd.read_csv(TRANSACTIONS_CSV)

In [96]:
# -----------------------------------------------------------
# Step 2.1: Label Encoding for Categories
#   Map each spending-category name to a fixed integer code (1-8)
#   and store the result in df['category_encoded'].
# -----------------------------------------------------------

category_map = {
    'Groceries': 1,
    'Eating_Out': 2,
    'Entertainment': 3,
    'Transport': 4,
    'Utilities': 5,
    'Healthcare': 6,
    'Education': 7,
    'Miscellaneous': 8,
}

# Series.map() silently produces NaN for any label that is missing from the
# mapping; those NaNs would then flow into scaling/modelling unnoticed.
# Fail fast with the offending labels instead. Rows whose category is already
# missing are left untouched (they stay NaN, as before).
unmapped_categories = set(df['category'].dropna().unique()) - set(category_map)
if unmapped_categories:
    raise ValueError(
        "Unmapped values in 'category': "
        f"{sorted(str(label) for label in unmapped_categories)}"
    )

df['category_encoded'] = df['category'].map(category_map)

## 3. Define features and target

In [97]:
# List every column name so features and targets can be chosen below.
list(df.columns)


['Income',
 'Age',
 'Dependents',
 'Occupation',
 'City_Tier',
 'Rent',
 'Loan_Repayment',
 'Insurance',
 'Desired_Savings_Percentage',
 'Desired_Savings',
 'Disposable_Income',
 'Potential_Savings_Groceries',
 'Potential_Savings_Transport',
 'Potential_Savings_Eating_Out',
 'Potential_Savings_Entertainment',
 'Potential_Savings_Utilities',
 'Potential_Savings_Healthcare',
 'Potential_Savings_Education',
 'Potential_Savings_Miscellaneous',
 'entity_id',
 'category',
 'amount',
 'category_encoded']

In [98]:
# -----------------------------------------------------------
# Step 3.1: Predict Amount for Savings
#   Target:   Desired_Savings
#   Features: every other column of df except the ones dropped below
#             (non-numeric columns and the target variable).
#
#   NOTE(review): Income and Desired_Savings_Percentage stay in the
#   features; if Desired_Savings is derived from them the target is
#   trivially recoverable -- confirm against the data dictionary.
# -----------------------------------------------------------

x_savings = (
    df.drop(columns=["Desired_Savings", "entity_id", "Occupation", "City_Tier", "category"])
    # Step 3.2 adds Overspend_Flag to df in place. If this cell is re-run
    # after that step, the flag must not silently join the savings features;
    # errors="ignore" keeps a fresh top-to-bottom run unchanged.
    .drop(columns=["Overspend_Flag"], errors="ignore")
)
y_savings = df["Desired_Savings"]

In [99]:
# -----------------------------------------------------------
# Step 3.2 Predict Overspending
#   Target:   Overspend_Flag (1 when amount > Disposable_Income, else 0)
#   Features: every other column of df except the ones dropped below
#             (non-numeric columns and the target variable).
#
#   NOTE(review): `amount` and `Disposable_Income` both remain in
#   x_spending, and the label is computed directly from them, so the
#   label is fully recoverable from the features (label leakage).
#   Decide which of the two should be removed before trusting any score.
# -----------------------------------------------------------

# Adds the label column to df in place (re-running simply overwrites it).
df["Overspend_Flag"] = (df["amount"] > df["Disposable_Income"]).astype(int)

# Each column is listed once; the original list repeated "category".
x_spending = df.drop(columns=["Overspend_Flag", "entity_id", "category", "Occupation", "City_Tier"])
y_spending = df["Overspend_Flag"]


In [100]:
# -----------------------------------------------------------
# Step 3.3 Predict Spending Category
#   TODO: not implemented yet. The draft below is intentionally left
#   commented out; `future_spending` still needs its list of target
#   column names before it can be enabled.
# -----------------------------------------------------------
# future_spending = [
    
# ]
# y_future_spending = df[future_spending]
# x_future_spending = pd.get_dummies(df, columns=["category"], drop_first=True)



## 4. Split data into train and test

In [101]:
# -----------------------------------------------------------
# Step 4.1 Split data for savings category
#   80% training / 20% testing, fixed seed for a repeatable split.
# -----------------------------------------------------------

savings_parts = train_test_split(x_savings, y_savings, test_size=0.2, random_state=42)
X_savings_train, X_savings_test, y_savings_train, y_savings_test = savings_parts


In [102]:
# -----------------------------------------------------------
# Step 4.2 Split data for overspending category
#   80% training / 20% testing, fixed seed for a repeatable split.
# -----------------------------------------------------------

spending_parts = train_test_split(x_spending, y_spending, test_size=0.2, random_state=42)
X_spending_train, X_spending_test, y_spending_train, y_spending_test = spending_parts

## 5. Scale

In [103]:
# -----------------------------------------------------------
# Step 5.1 Scale features for savings model
#   The scaler is fitted on the training split only and then applied
#   to both splits, so no test-set statistics enter the fit.
# -----------------------------------------------------------

scaler_s = StandardScaler().fit(X_savings_train)

X_savings_train_scaled, X_savings_test_scaled = (
    scaler_s.transform(split) for split in (X_savings_train, X_savings_test)
)

In [104]:
# -----------------------------------------------------------
# Step 5.2 Scale features for overspending model
#   The scaler is fitted on the training split only and then applied
#   to both splits, so no test-set statistics enter the fit.
# -----------------------------------------------------------

# Use a dedicated name: rebinding `scaler_s` here would discard the fitted
# savings scaler from Step 5.1, which is still needed to transform any new
# data for the savings model.
scaler_spending = StandardScaler()
scaler_spending.fit(X_spending_train)

X_spending_train_scaled = scaler_spending.transform(X_spending_train)
X_spending_test_scaled = scaler_spending.transform(X_spending_test)

## 6. Baseline models
 - Linear Regression
 - Linear SVM
 - Random Forest

## 7. Evaluate Models

## 8. Results