In [15]:
import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error, mean_squared_error
import lightgbm as lgb
from lightgbm import early_stopping, log_evaluation
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [11]:
df = pd.read_csv("expense_dataset.csv", parse_dates=["year_month"])

# Ensure sorted
df = df.sort_values(["company_id", "category", "year_month"]).reset_index(drop=True)


In [12]:
# 2. Feature Engineering
# ----------------------------
df["month"] = df["year_month"].dt.month
df["trend_index"] = (df["year_month"].dt.year - df["year_month"].dt.year.min())*12 + df["month"]

# Cyclical encoding for seasonality
df["month_sin"] = np.sin(2*np.pi*df["month"]/12)
df["month_cos"] = np.cos(2*np.pi*df["month"]/12)

# Missingness indicators
df["is_yoy_missing"] = df["yoy_pct_change"].isnull().astype(int)
df["is_pct_prev_missing"] = df["pct_change_prev"].isnull().astype(int)

# Fill NA values safely
for col in ["yoy_pct_change", "pct_change_prev", "roll_mean_3", "roll_std_3"]:
    df[col] = df[col].fillna(0)

# Ratios
eps = 1e-6
df["lag_ratio_1_2"] = df["lag_1"] / (df["lag_2"] + eps)
df["lag_ratio_1_rollmean"] = df["lag_1"] / (df["roll_mean_3"] + eps)

In [13]:
# ----------------------------
# 3. Define Features & Target
# ----------------------------
target = "monthly_total"
features = [
    "lag_1","lag_2","lag_3",
    "roll_mean_3","roll_std_3",
    "yoy_same_month","yoy_pct_change","pct_change_prev",
    "n_transactions",
    "is_sparse_category","fallback_used","seasonal_peak_flag","external_signal_index","confidence_flag",
    "trend_index","month_sin","month_cos",
    "lag_ratio_1_2","lag_ratio_1_rollmean"
]

# Drop any rows without target
df = df[df[target].notnull()]

In [16]:
# ----------------------------
# 4. Rolling Cross-Validation
# ----------------------------
def rolling_cv(df, features, target, n_folds=3):
    metrics = []
    unique_months = sorted(df["year_month"].unique())
    horizon = 1  # one-step ahead
    
    for i in range(n_folds):
        train_end = - (n_folds - i)
        train_months = unique_months[:train_end]
        val_month = unique_months[train_end]
        
        train = df[df["year_month"].isin(train_months)]
        val = df[df["year_month"]==val_month]
        
        X_train, y_train = train[features], train[target]
        X_val, y_val = val[features], val[target]
        
        # --- Baseline ---
        baseline = val["lag_1"].values
        mae_base = mean_absolute_error(y_val, baseline)
        
        # --- Ridge Regression ---
        ridge = Ridge(alpha=0.1)
        ridge.fit(X_train, y_train)
        preds_ridge = ridge.predict(X_val)
        mae_ridge = mean_absolute_error(y_val, preds_ridge)
        
        # --- LightGBM ---
        lgb_train = lgb.Dataset(X_train, label=y_train)
        lgb_val = lgb.Dataset(X_val, label=y_val, reference=lgb_train)
        
        params = {
            "objective": "regression",
            "metric": "mae",
            "learning_rate": 0.05,
            "num_leaves": 63,
            "max_depth": 10,
            "lambda_l2": 0.01,
            "feature_fraction": 0.9,
            "bagging_fraction": 0.9,
            "verbose": -1
        }
        
        model = lgb.train(
                    params,
                    lgb_train,
                    valid_sets=[lgb_train, lgb_val],
                    num_boost_round=1000,
                    callbacks=[early_stopping(50), log_evaluation(0)]  # 50 rounds patience
                )
            
        preds_lgb = model.predict(X_val, num_iteration=model.best_iteration)
        mae_lgb = mean_absolute_error(y_val, preds_lgb)
        
        metrics.append({
            "val_month": str(val_month)[:10],
            "Baseline_MAE": mae_base,
            "Ridge_MAE": mae_ridge,
            "LightGBM_MAE": mae_lgb
        })
    
    return pd.DataFrame(metrics)

cv_results = rolling_cv(df, features, target, n_folds=5)
print("\nCross-validation results:")
print(cv_results)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[370]	training's l1: 4184.11	valid_1's l1: 5222.4
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[154]	training's l1: 5741.68	valid_1's l1: 11676.9
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[111]	training's l1: 6522.54	valid_1's l1: 7662.14
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[122]	training's l1: 6031.29	valid_1's l1: 6823.68
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[125]	training's l1: 5853.81	valid_1's l1: 13952.6

Cross-validation results:
    val_month  Baseline_MAE     Ridge_MAE  LightGBM_MAE
0  2024-08-01  51027.189464  27204.273806   5222.397977
1  2024-09-01  73360.148600  45492.573298  11676.852124
2  2024-10-01  78914.143934  31229.751354   7662.142477
3  2024-11-01  

In [17]:
# ----------------------------
# 5. Train Final Model
# ----------------------------
X = df[features]
y = df[target]

final_model = lgb.LGBMRegressor(
    objective="regression",
    learning_rate=0.05,
    num_leaves=63,
    max_depth=10,
    n_estimators=1000,
    lambda_l2=0.01,
    feature_fraction=0.9,
    bagging_fraction=0.9
)
final_model.fit(X, y)


In [18]:
# ----------------------------
# 6. Predict Next Month
# ----------------------------
# Get last available month
last_month = df["year_month"].max()
next_month = last_month + pd.DateOffset(months=1)

# Prepare prediction rows (for each category in last month)
latest = df[df["year_month"]==last_month].copy()
latest["year_month"] = next_month

X_pred = latest[features]
preds = final_model.predict(X_pred)

latest["predicted_amount"] = preds
print("\nNext month forecast:")
print(latest[["company_id","category","year_month","predicted_amount"]].head())



Next month forecast:
     company_id    category year_month  predicted_amount
23            1  Consulting 2025-01-01     380058.714121
108           1   Marketing 2025-01-01     209406.236779
160           1    Training 2025-01-01      59204.253749
177           1      Travel 2025-01-01     314283.668252
198           1   Utilities 2025-01-01      62514.245362


In [19]:
# 7. Save Predictions
# ----------------------------
latest[["company_id","category","year_month","predicted_amount"]].to_csv("predicted_expenses.csv", index=False)
print("\n✅ Predictions saved to predicted_expenses.csv")


✅ Predictions saved to predicted_expenses.csv


In [20]:
import joblib

# Save the model
joblib.dump(final_model, "final_expense_model.pkl")
print("✅ Model saved as final_expense_model.pkl")


✅ Model saved as final_expense_model.pkl


In [22]:
import joblib

# Load the saved model
loaded_model = joblib.load("final_expense_model.pkl")

# Make predictions as usual
preds = loaded_model.predict(X_pred)
preds


array([380058.71412094, 209406.23677897,  59204.25374859, 314283.66825185,
        62514.24536178, 380875.54617521,  63074.43530787, 197636.37107558,
        14978.47649895, 191931.6573579 ,  37502.61022165, 241978.33767833,
       215935.88327295,  18537.90783226,  19518.70621035, 172085.83088725,
        40127.38370229, 251471.35422599, 215988.9396215 , 198326.26838963,
        63907.58536626,  62272.12523805,  52383.51263316,  99587.97743439,
       270729.92972827, 103875.3288225 ,  30702.80106508, 208749.68822307,
       414097.35044812,  50535.11753435, 447999.04120621, 538948.93772987,
        60174.05257727, 140168.36670106,  59972.60868185, 263273.6743737 ,
        42672.17572753, 486542.62117178, 238552.24569771,  99065.47044976,
        18848.0095712 , 111305.39880173, 326095.6501288 , 129290.80936291,
       149931.45495336,  24840.68142621,  57184.36530573,  45435.22692182])