## 1 — global setup: seed, imports, deterministic backend   

In [None]:
RANDOM_SEED = 42

# ── make project root importable ────────────────────────────────
import sys
import pathlib

# The parent of the current working directory is appended to the import
# search path (presumably the notebook runs from a sub-folder of the repo).
_repo_root = pathlib.Path.cwd().parent                   # …/twitter-airline-analysis
sys.path.append(str(_repo_root))

# ── stdlib ──────────────────────────────────────────────────────
import os
import random
from pathlib import Path

# ── third-party ────────────────────────────────────────────────
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt        # used for Optuna plots later
import optuna
from joblib                           import load, dump
from sklearn import set_config
from sklearn.pipeline                 import Pipeline
from sklearn.feature_extraction.text  import TfidfVectorizer
from sklearn.linear_model             import LogisticRegression
from sklearn.metrics                  import roc_auc_score, classification_report, RocCurveDisplay
from sklearn.model_selection          import train_test_split

# ── local project code ─────────────────────────────────────────
from twitter_airline_analysis.data_prep import load_prepared_data

# ── global sklearn setting ─────────────────────────────────────
# Request pandas containers from scikit-learn transformers that support it.
set_config(transform_output="pandas")

# ── project paths ──────────────────────────────────────────────
# Everything is anchored on the parent of the current working directory.
PROJECT_ROOT = Path.cwd().resolve().parent
PROCESSED_DIR = PROJECT_ROOT.joinpath("data", "processed")
ARTIFACT_DIR = PROJECT_ROOT.joinpath("artifacts")
BASELINE_PATH = PROJECT_ROOT.joinpath("models", "logreg_tfidf.joblib")
TUNED_MODEL_PATH = PROJECT_ROOT.joinpath("models", "logreg_tfidf_optuna.joblib")

# Only the artefact folder is created here; the others are created on demand.
ARTIFACT_DIR.mkdir(exist_ok=True)

# ── reproducibility ────────────────────────────────────────────
# NOTE(review): Python reads PYTHONHASHSEED at interpreter start-up, so this
# assignment presumably only influences processes launched afterwards — confirm.
os.environ["PYTHONHASHSEED"] = str(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

  from .autonotebook import tqdm as notebook_tqdm


## 2 - load and split data

In [None]:
def _load(name: str):
    """Read ``<name>.ftr`` from ``PROCESSED_DIR`` and return its ``name`` column.

    The Feather file is expected to contain a column with the same name as
    the file stem; that single column is what gets returned.
    """
    feather_file = PROCESSED_DIR / f"{name}.ftr"
    frame = pd.read_feather(feather_file)
    return frame[name]

# X, y splits already materialised during Module-4
# Feature splits (already materialised during Module-4); note that the
# validation split is stored on disk under the "val" suffix.
X_train, X_valid, X_test = (
    _load(f"X_{split}") for split in ("train", "val", "test")
)

# Matching target splits, loaded in the same order.
y_train, y_valid, y_test = (
    _load(f"y_{split}") for split in ("train", "val", "test")
)

_shape_summary = (
    f"train: {X_train.shape},  valid: {X_valid.shape},  test: {X_test.shape}"
)
print("Shapes —", _shape_summary)

## 3 - Baseline Reference (Pre‑Optuna)      

In [None]:
"""
Load the persisted baseline pipeline and score it on the validation split.

Falls back to training (and saving) a minimal TF-IDF + LogisticRegression
model if the file isn’t found, so the notebook remains runnable end-to-end.
"""

print(f"Looking for baseline at: {BASELINE_PATH}")

if BASELINE_PATH.exists():
    # NOTE(review): joblib.load unpickles the file; only load artefacts that
    # were produced by this project.
    baseline_pipe = load(BASELINE_PATH)
    print("✔ Loaded baseline artefact.")
else:
    print("✗ Baseline artefact not found – training quick default model...")
    baseline_pipe = Pipeline([
        ("tfidf", TfidfVectorizer(sublinear_tf=True)),
        # Use the notebook-wide seed instead of a second hard-coded literal.
        ("clf",   LogisticRegression(max_iter=500, n_jobs=-1, random_state=RANDOM_SEED)),
    ])
    baseline_pipe.fit(X_train, y_train)
    # parents=True keeps this working even if intermediate folders are missing.
    BASELINE_PATH.parent.mkdir(parents=True, exist_ok=True)
    dump(baseline_pipe, BASELINE_PATH)
    print(f"Saved new baseline to {BASELINE_PATH}")

# ── validation metric ──────────────────────────────────────────
# Scores come from the second predict_proba column
# (assumes a binary target — TODO confirm).
baseline_preds = baseline_pipe.predict_proba(X_valid)[:, 1]
baseline_auc   = roc_auc_score(y_valid, baseline_preds)

print(f"Baseline TF-IDF + LogReg AUC: {baseline_auc:.3f}")


## 3 - Optuna Setup

The objective is deliberately lightweight: feature engineering is confined to the `TfidfVectorizer` hyper-parameters so that the search time stays reasonable.

In [None]:

def objective(trial: optuna.Trial) -> float:
    """Return validation ROC-AUC for a single Optuna trial.

    Samples TF-IDF and logistic-regression hyper-parameters from ``trial``,
    fits the resulting pipeline on ``X_train`` / ``y_train`` and scores the
    second ``predict_proba`` column against ``y_valid``.

    Args:
        trial: Optuna trial used to sample the search space.

    Returns:
        ROC-AUC of the fitted pipeline on the validation split.
    """
    import warnings

    # ── search space (same names and sampling order as before) ──
    max_df    = trial.suggest_float("max_df", 0.7, 1.0)
    min_df    = trial.suggest_int("min_df", 1, 10)
    max_ngram = trial.suggest_int("max_ngram", 1, 3)
    # suggest_loguniform is deprecated; log=True samples the same range
    # on a log scale.
    inv_reg   = trial.suggest_float("C", 1e-3, 1e2, log=True)
    penalty   = trial.suggest_categorical("penalty", ["l2", "elasticnet"])
    # l1_ratio is only sampled when the elastic-net penalty is selected.
    l1_ratio  = (
        trial.suggest_float("l1_ratio", 0.0, 1.0)
        if penalty == "elasticnet"
        else None
    )

    # ── pipeline definition ─────────────────────────────────────
    pipeline = Pipeline([
        (
            "tfidf",
            TfidfVectorizer(
                max_df=max_df,
                min_df=min_df,
                ngram_range=(1, max_ngram),
                sublinear_tf=True,
            ),
        ),
        (
            "clf",
            LogisticRegression(
                C=inv_reg,
                penalty=penalty,
                solver="saga",
                l1_ratio=l1_ratio,
                max_iter=500,
                n_jobs=-1,
                random_state=RANDOM_SEED,
            ),
        ),
    ])

    # ── training & evaluation ───────────────────────────────────
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict_proba(X_valid)[:, 1]
    score = roc_auc_score(y_valid, preds)

    # Keep the fitted pipeline on the trial so the best one can be persisted
    # later. Optuna documents that user-attribute values should be JSON
    # serializable, so a persistent storage backend may reject the object;
    # treat that as best-effort instead of failing an otherwise valid trial.
    try:
        trial.set_user_attr("pipeline", pipeline)
    except TypeError as err:
        warnings.warn(
            f"Could not attach the fitted pipeline to trial {trial.number}: {err}",
            RuntimeWarning,
            stacklevel=2,
        )
    return score

# ── create / load study ────────────────────────────────────────
# The study is persisted in an SQLite file under ARTIFACT_DIR and reused on
# later runs (load_if_exists=True) instead of raising on a duplicate name.
# The sampler is seeded explicitly: global random / NumPy seeding does not
# make Optuna's own sampler reproducible.
# NOTE(review): a pruner only acts on intermediate values reported through
# trial.report(); the `objective` in this notebook does not appear to report
# any — confirm whether the pruner is needed.
study = optuna.create_study(
    direction      = "maximize",
    study_name     = "logreg_tfidf_auc",
    sampler        = optuna.samplers.TPESampler(seed=RANDOM_SEED),
    pruner         = optuna.pruners.MedianPruner(n_warmup_steps=10),
    storage        = f"sqlite:///{ARTIFACT_DIR/'optuna_study.db'}",
    load_if_exists = True,
)


## 4 - Run Study

In [None]:
# Run 100 trials of the objective with a progress bar.
study.optimize(
    func=objective,
    n_trials=100,
    show_progress_bar=True,
)

## 4 - Persist & reload the best model

In [None]:
_best_trial = study.best_trial
best_pipeline = _best_trial.user_attrs.get("pipeline")   # retrieved from objective

# The best trial may not carry a live fitted pipeline (for example when it was
# loaded from the study storage rather than produced in this session). In that
# case rebuild the pipeline from the recorded hyper-parameters and refit it.
if not hasattr(best_pipeline, "predict_proba"):
    _best_params = _best_trial.params
    best_pipeline = Pipeline([
        (
            "tfidf",
            TfidfVectorizer(
                max_df=_best_params["max_df"],
                min_df=_best_params["min_df"],
                ngram_range=(1, _best_params["max_ngram"]),
                sublinear_tf=True,
            ),
        ),
        (
            "clf",
            LogisticRegression(
                C=_best_params["C"],
                penalty=_best_params["penalty"],
                solver="saga",
                # Only present when the elastic-net penalty was sampled.
                l1_ratio=_best_params.get("l1_ratio"),
                max_iter=500,
                n_jobs=-1,
                random_state=RANDOM_SEED,
            ),
        ),
    ])
    best_pipeline.fit(X_train, y_train)

# Persist
TUNED_MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)
dump(best_pipeline, TUNED_MODEL_PATH)
print(f"✅ Saved tuned model → {TUNED_MODEL_PATH}")

# Reload to verify (optional sanity check)
best_pipeline = load(TUNED_MODEL_PATH)

## 5 - Final evaluation on the held‑out test set

In [None]:
# Scores from the second predict_proba column (presumably the positive class).
preds = best_pipeline.predict_proba(X_test)[:, 1]

# Threshold the scores at 0.5 for the text report.
_hard_labels = preds > 0.5
_report = classification_report(y_test, _hard_labels, digits=3)
print(_report)

# The ROC curve is drawn from the raw scores, not the thresholded labels.
RocCurveDisplay.from_predictions(y_test, preds)

# Export the full trial history next to the other artefacts.
_trials_csv = ARTIFACT_DIR / "optuna_trials.csv"
study.trials_dataframe().to_csv(_trials_csv, index=False)

## 6 - Persist artefacts