## 1 - Global Setup: Seed, Imports, Deterministic Backend

In [None]:
# ╔════════════════════════════════════════════════════════════════╗
# ║  Cell 1 — global setup: seed, imports, deterministic backend   ║
# ╚════════════════════════════════════════════════════════════════╝
# Seed constant; applied to `random` and NumPy in the reproducibility section.
RANDOM_SEED = 42

# -- make project root importable ---------------------------------
import pathlib
import sys

# The parent of the current working directory is treated as the project root
# (assumes the notebook is launched from a sub-folder of the project).
# Guard the append so that re-running this cell does not pile up duplicate
# entries on sys.path.
_project_root = str(pathlib.Path.cwd().parent)
if _project_root not in sys.path:
    sys.path.append(_project_root)

# -- std libs & third‑party ---------------------------------------
import os, random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import optuna
from sklearn import set_config
from twitter_airline_analysis.data_prep import load_prepared_data

import optuna
from sklearn.pipeline          import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model      import LogisticRegression
from sklearn.metrics           import roc_auc_score
from sklearn.model_selection   import train_test_split

# -- reproducibility ----------------------------------------------
# Seed NumPy's and Python's global random generators from the shared constant.
np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
# NOTE: the interpreter reads PYTHONHASHSEED at start-up, so assigning it here
# does not change string hashing in the already-running kernel; it is only
# visible to processes that inherit this environment afterwards.
os.environ["PYTHONHASHSEED"] = f"{RANDOM_SEED}"

# -- scikit-learn output format -----------------------------------
# Ask scikit-learn transformers to return pandas containers where supported.
set_config(transform_output="pandas")


  from .autonotebook import tqdm as notebook_tqdm


## 2 - Load and Split Data

In [2]:
# Load the prepared data and hold out 20 % as a stratified validation split.
X, y = load_prepared_data()  # presumably the pre-cleaned text and label; see data_prep

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    # Reuse the notebook-wide seed rather than a separate literal so that
    # changing RANDOM_SEED changes the split as well.
    random_state=RANDOM_SEED,
)

## 3 - Optuna Setup

The objective is deliberately lightweight: feature engineering is confined to `TfidfVectorizer` so that the search time stays reasonable.

In [None]:
def objective(trial: optuna.Trial) -> float:
    """Return validation ROC-AUC for a single trial.

    Samples TF-IDF and logistic-regression hyper-parameters from ``trial``,
    fits the pipeline on the module-level ``X_train``/``y_train`` and scores
    it on ``X_valid``/``y_valid``.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        ROC-AUC computed from column 1 of ``predict_proba`` on the
        validation split.
    """
    # Sample into locals, in the same order as before, so that the
    # conditional parameter below does not depend on keyword-argument
    # evaluation order or on peeking into ``trial.params``.
    max_df = trial.suggest_float("max_df", 0.7, 1.0)
    min_df = trial.suggest_int("min_df", 1, 10)
    max_ngram = trial.suggest_int("max_ngram", 1, 3)
    # ``suggest_float(..., log=True)`` is the replacement for the deprecated
    # ``suggest_loguniform`` and samples from the same log-uniform range.
    inverse_reg = trial.suggest_float("C", 1e-3, 1e2, log=True)
    penalty = trial.suggest_categorical("penalty", ["l2", "elasticnet"])
    # ``l1_ratio`` only applies to the elastic-net penalty, so it is only
    # sampled (and recorded on the trial) in that branch.
    l1_ratio = (
        trial.suggest_float("l1_ratio", 0.0, 1.0)
        if penalty == "elasticnet"
        else None
    )

    pipeline = Pipeline([
        ("tfidf", TfidfVectorizer(
            max_df=max_df,
            min_df=min_df,
            ngram_range=(1, max_ngram),
            sublinear_tf=True,
        )),
        ("clf", LogisticRegression(
            C=inverse_reg,
            penalty=penalty,
            solver="saga",
            l1_ratio=l1_ratio,
            max_iter=500,
            n_jobs=-1,
            random_state=RANDOM_SEED,  # notebook-wide seed instead of a literal
        )),
    ])
    pipeline.fit(X_train, y_train)
    # NOTE(review): taking column 1 assumes a binary target whose positive
    # class is the second entry of ``classes_`` -- confirm against the labels
    # returned by load_prepared_data().
    preds = pipeline.predict_proba(X_valid)[:, 1]
    return roc_auc_score(y_valid, preds)

# SQLite does not create missing parent directories, so make sure the folder
# behind the storage URL exists before Optuna opens the database file.
pathlib.Path("artifacts").mkdir(parents=True, exist_ok=True)

study = optuna.create_study(
    direction="maximize",
    study_name="logreg_tfidf_auc",
    # Seed the sampler so the sequence of suggested hyper-parameters is
    # repeatable for a fresh study (TPE is also Optuna's default sampler).
    sampler=optuna.samplers.TPESampler(seed=RANDOM_SEED),
    # NOTE: MedianPruner acts on intermediate values sent via trial.report();
    # the objective in this notebook never reports any, so nothing is pruned.
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
    storage="sqlite:///artifacts/optuna_study.db",
    # Re-running the cell resumes the stored study instead of raising.
    load_if_exists=True,
)

## 4 - Run Study

In [None]:
# Run the search: 100 calls to `objective`, with a progress bar.
study.optimize(
    objective,
    n_trials=100,
    show_progress_bar=True,
)