In [1]:

import os, sys
from datetime import date

import joblib
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

# The local package lives in ../src; extend sys.path before importing from it.
sys.path.append(os.path.abspath("../src"))

from ml_price_model import (
    make_synthetic_training_data,
    train_baseline_model,
    predict_price_drop_probability,
    build_feature_pipeline,  # we'll use this directly for train/test split
)


In [2]:
# Generate the synthetic flight-price dataset used by every later cell.
# NOTE(review): no seed is passed, so whether this frame is reproducible across
# kernel restarts depends on make_synthetic_training_data — confirm.
# NOTE(review): in the output recorded with this notebook, days_until_departure
# does not match departure_date - search_date (first row: 2025-12-27 vs
# 2025-12-18 is 9 days, but the column shows 41) — verify the generator.
N_ROWS = 2000  # number of synthetic rows to request

df = make_synthetic_training_data(n_rows=N_ROWS)
df.head()


Unnamed: 0,origin,destination,airline,region,departure_date,search_date,days_until_departure,current_price_usd,future_min_price_usd,price_drops
0,MIA,SJU,Spirit,EU,2025-12-27,2025-12-18,41,243.888878,219.210136,0
1,SJU,MIA,Spirit,EU,2025-11-30,2025-11-19,14,230.413577,212.055301,0
2,MCO,SJU,Delta,EU,2025-12-28,2025-12-24,42,279.599626,220.221772,0
3,MIA,SJU,Spirit,LATAM,2025-12-20,2025-12-19,34,208.861441,191.38401,1
4,SJU,MCO,JetBlue,EU,2025-12-02,2025-11-21,16,285.166497,256.539497,1


In [3]:
# Relative frequency of each price_drops label, to see how balanced the target is.
class_balance = df["price_drops"].value_counts(normalize=True)
class_balance

price_drops
1    0.576
0    0.424
Name: proportion, dtype: float64

In [4]:
# Columns fed to the model. The date columns and future_min_price_usd are not
# used as features.
feature_cols = [
    "origin",
    "destination",
    "airline",
    "region",
    "days_until_departure",
    "current_price_usd",
]

X = df[feature_cols]
y = df["price_drops"].astype(int)  # binary target as plain ints

# Hold out 20% of the rows for evaluation. stratify=y keeps the label
# proportions the same in both splits; random_state fixes the shuffle.
# train_test_split is already imported in the notebook's import cell.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

X_train.shape, X_test.shape


((1600, 6), (400, 6))

In [5]:
# Baseline classifier: the project's feature preprocessing followed by
# logistic regression. Pipeline and LogisticRegression are already imported in
# the notebook's import cell.
preprocessor = build_feature_pipeline()

clf = LogisticRegression(max_iter=1000)

model = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("clf", clf),
    ]
)

model.fit(X_train, y_train)

# Score the baseline on the hold-out split so the random forest trained later
# in the notebook has a reference point to be compared against.
baseline_auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
print(f"Baseline ROC-AUC: {baseline_auc:.3f}")

model


In [6]:
# Random-forest variant of the classifier, trained and evaluated on the same
# train/test split as the logistic-regression baseline.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=None,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,  # fixed seed so the forest itself is reproducible
    n_jobs=-1,        # use all available cores
)

model_rf = Pipeline(
    steps=[
        # Build a fresh preprocessing object instead of reusing `preprocessor`:
        # Pipeline.fit fits its steps in place (no cloning unless `memory` is
        # set), so sharing the instance would re-fit the transformer that the
        # baseline `model` pipeline holds.
        ("preprocess", build_feature_pipeline()),
        ("clf", rf),
    ]
)

model_rf.fit(X_train, y_train)

# Hard labels for the classification report, class-1 probabilities for ROC-AUC.
y_pred_rf = model_rf.predict(X_test)
y_proba_rf = model_rf.predict_proba(X_test)[:, 1]

print("Classification report:\n")
print(classification_report(y_test, y_pred_rf))

roc = roc_auc_score(y_test, y_proba_rf)
print(f"ROC-AUC: {roc:.3f}")


Classification report:

              precision    recall  f1-score   support

           0       0.47      0.34      0.40       170
           1       0.60      0.72      0.65       230

    accuracy                           0.56       400
   macro avg       0.53      0.53      0.52       400
weighted avg       0.54      0.56      0.54       400

ROC-AUC: 0.551


In [8]:
# Persist the fitted random-forest pipeline (preprocessing + classifier) to disk.
# NOTE(review): the evaluation recorded in this notebook shows ROC-AUC 0.551 for
# model_rf, i.e. close to chance — confirm this is the model intended for saving.
MODEL_PATH = "../models/price_drop_model.pkl"

# Create the target directory if needed; derived from MODEL_PATH so the two
# cannot drift apart.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
joblib.dump(model_rf, MODEL_PATH)

['../models/price_drop_model.pkl']