# 03 ‚Äî Crime: Violent vs Non‚ÄëViolent (With Optuna)

Adapted to your normalized SQLite DB: `data/crime.db`.

**Target:** `violent` (0/1)

**Runs in this notebook:** 8 tuned experiments = 4 models √ó (PCA off/on).

Saves artifacts per run to `models/experiments/<run_id>/`.


In [26]:
# If needed: !pip install optuna lightgbm "mlflow<3" scikit-learn pandas numpy joblib
from dotenv import load_dotenv
import os
import mlflow

load_dotenv()  # loads .env file

MLFLOW_OK = True
try:
    mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])
    mlflow.set_experiment("crime-violent-vs-nonviolent")
    print("‚úÖ MLflow connected to:", os.environ["MLFLOW_TRACKING_URI"])
except Exception as e:
    print("‚ö†Ô∏è MLflow disabled:", e)
    MLFLOW_OK = False

FAST_MODE = True

if FAST_MODE:
    OPTUNA_TRIALS = 8
    CV_SPLITS = 2
    TUNE_FRACTION = 0.5
else:
    OPTUNA_TRIALS = 25
    CV_SPLITS = 3
    TUNE_FRACTION = 1.0


‚úÖ MLflow connected to: https://dagshub.com/kt19-jpeg/ml_project.mlflow


In [27]:

import os, json, sqlite3
from pathlib import Path
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

try:
    import mlflow
    MLFLOW_OK = True
except Exception as e:
    print("‚ö†Ô∏è mlflow not installed; skipping MLflow logging.")
    MLFLOW_OK = False

SEED = 42

def find_project_root() -> Path:
    here = Path.cwd().resolve()
    for p in [here] + list(here.parents):
        if (p / "data" / "crime.db").exists():
            return p
    return here

ROOT = find_project_root()
DB_PATH = ROOT / "data" / "crime.db"
OUT_ROOT = ROOT / "models" / "experiments"
OUT_ROOT.mkdir(parents=True, exist_ok=True)

def _make_ohe():
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)

print("ROOT:", ROOT)
print("DB_PATH:", DB_PATH)
print("OUT_ROOT:", OUT_ROOT)


ROOT: /Users/kavyansh/IdeaProjects/ml_project/buffalo_crime
DB_PATH: /Users/kavyansh/IdeaProjects/ml_project/buffalo_crime/data/crime.db
OUT_ROOT: /Users/kavyansh/IdeaProjects/ml_project/buffalo_crime/models/experiments


In [28]:

SQL = '''
SELECT
  i.incident_id,
  i.hour_of_day,
  i.latitude,
  i.longitude,
  i.zip_code,
  i.council_district,
  dow.name AS day_of_week,
  pd.name  AS police_district,
  nb.name  AS neighborhood,
  l.violent
FROM incidents i
JOIN labels l ON l.incident_id = i.incident_id
LEFT JOIN day_of_week_dim dow ON dow.day_of_week_id = i.day_of_week_id
LEFT JOIN police_district_dim pd ON pd.police_district_id = i.police_district_id
LEFT JOIN neighborhood_dim nb ON nb.neighborhood_id = i.neighborhood_id;
'''

if not DB_PATH.exists():
    raise FileNotFoundError(f"Could not find DB at {DB_PATH}")

con = sqlite3.connect(DB_PATH)
df = pd.read_sql_query(SQL, con)
con.close()

print("Loaded:", df.shape)
df.head()


Loaded: (327558, 10)


Unnamed: 0,incident_id,hour_of_day,latitude,longitude,zip_code,council_district,day_of_week,police_district,neighborhood,violent
0,09-2820596,3,42.885,-78.879,14202,FILLMORE,Friday,District B,Central,0
1,06-1740516,0,42.939,-78.838,14214,MASTEN,Tuesday,District E,Fillmore-Leroy,0
2,08-1760604,15,42.935,-78.866,14216,NORTH,Tuesday,District D,Parkside,1
3,06-0210543,13,42.899,-78.81,14211,LOVEJOY,Saturday,District C,Genesee-Moselle,0
4,06-2120630,16,42.948,-78.906,14207,NORTH,Monday,District D,Riverside,0


In [29]:
import numpy as np
import pandas as pd

TARGET_COL = "violent"
FEATURE_COLS = [
    "hour_of_day", "latitude", "longitude",
    "zip_code", "council_district",
    "day_of_week", "police_district", "neighborhood",
]

# --- Clean + stabilize numeric inputs BEFORE split (prevents PCA matmul warning) ---
# Ensure required columns exist (helps catch SQL/schema changes early)
missing_cols = [c for c in FEATURE_COLS + [TARGET_COL] if c not in df.columns]
if missing_cols:
    raise ValueError(f"Missing columns in df: {missing_cols}")

# Coerce numeric cols to numeric
for c in ["hour_of_day", "latitude", "longitude"]:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Replace inf/-inf with NaN so imputers can handle them
df = df.replace([np.inf, -np.inf], np.nan)

# Keep only rows with target, and drop rows missing critical numeric fields
df = df[FEATURE_COLS + [TARGET_COL]].dropna(subset=[TARGET_COL]).copy()
df = df.dropna(subset=["hour_of_day", "latitude", "longitude"])

# Cast target to int
df[TARGET_COL] = df[TARGET_COL].astype(int)

X = df[FEATURE_COLS]
y = df[TARGET_COL]

print("Target counts:\n", y.value_counts())
print("\nTarget %:\n", (y.value_counts(normalize=True) * 100).round(2))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

NUM_COLS = ["hour_of_day", "latitude", "longitude"]
CAT_COLS = ["zip_code", "council_district", "day_of_week", "police_district", "neighborhood"]

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]), NUM_COLS),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", _make_ohe()),
        ]), CAT_COLS),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

print("Train:", X_train.shape, "Test:", X_test.shape)


Target counts:
 violent
0    229361
1     91038
Name: count, dtype: int64

Target %:
 violent
0    71.59
1    28.41
Name: proportion, dtype: float64
Train: (256319, 8) Test: (64080, 8)


In [30]:

import optuna
FAST_MODE = True

if FAST_MODE:
    OPTUNA_TRIALS = 8
    CV_SPLITS = 2
    TUNE_FRACTION = 0.5
else:
    OPTUNA_TRIALS = 25
    CV_SPLITS = 3
    TUNE_FRACTION = 1.0

skf = StratifiedKFold(n_splits=CV_SPLITS, shuffle=True, random_state=SEED)

MODEL_NAMES = ["logreg", "ridge", "gboost", "rf"]
PCA_FLAGS = [False, True]

def build_estimator(model_name: str, params: dict):
    if model_name == "logreg":
        # only C is tuned here
        C = params.get("C", 1.0)
        return LogisticRegression(C=C, solver="lbfgs", max_iter=3000, random_state=SEED)

    if model_name == "ridge":
        alpha = params.get("alpha", 1.0)
        return RidgeClassifier(alpha=alpha, random_state=SEED)

    if model_name == "gboost":
        return GradientBoostingClassifier(
            n_estimators=params.get("n_estimators", 50),
            learning_rate=params.get("learning_rate", 0.1),
            max_depth=params.get("max_depth", 3),
            random_state=SEED
        )

    if model_name == "rf":
        # IMPORTANT: don't pass random_state twice
        rf_params = dict(params)
        rf_params.pop("random_state", None)
        return RandomForestClassifier(**rf_params, random_state=SEED)

    raise ValueError(model_name)


def suggest_params(trial, model_name: str):
    if model_name == "logreg":
        return {"C": trial.suggest_float("C", 1e-3, 50.0, log=True)}

    if model_name == "ridge":
        return {"alpha": trial.suggest_float("alpha", 1e-3, 50.0, log=True)}

    if model_name == "gboost":
        return {
            "n_estimators": trial.suggest_int("n_estimators", 100, 600),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "max_depth": trial.suggest_int("max_depth", 2, 6),
        }

    if model_name == "rf":
        return {
            "n_estimators": trial.suggest_int("n_estimators", 80, 200),
            "max_depth": trial.suggest_int("max_depth", 3, 18),
            "min_samples_split": trial.suggest_int("min_samples_split", 2, 20),
            "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
            "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2", None]),
            "n_jobs": -1,
        }

    raise ValueError(model_name)


def build_pipeline(model_name: str, params: dict, use_pca: bool):
    steps = [("preprocess", preprocess)]
    if use_pca:
        steps.append(("pca", PCA(n_components=20, random_state=SEED)))
    steps.append(("clf", build_estimator(model_name, params)))
    return Pipeline(steps)


def save_run(run_id: str, pipe: Pipeline, metrics: dict, params: dict):
    out_dir = OUT_ROOT / run_id
    out_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipe, out_dir / "model.pkl")
    (out_dir / "metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8")
    (out_dir / "params.json").write_text(json.dumps(params, indent=2), encoding="utf-8")
    return out_dir


def maybe_log_mlflow(run_id: str, params: dict, metrics: dict, artifact_dir: Path):
    if not MLFLOW_OK:
        return
    mlflow.set_experiment("crime-violent-vs-nonviolent")
    with mlflow.start_run(run_name=run_id):
        mlflow.log_params(params)
        mlflow.log_metric("f1", float(metrics.get("f1", 0.0)))
        if "best_cv_f1" in metrics and metrics["best_cv_f1"] is not None:
            mlflow.log_metric("best_cv_f1", float(metrics["best_cv_f1"]))
        mlflow.log_artifacts(str(artifact_dir))



In [None]:

tuned_results = []

for model_name in MODEL_NAMES:
    for use_pca in PCA_FLAGS:
        run_id = f"{model_name}__pca{int(use_pca)}__tuned1"

        def objective(trial):
            params = suggest_params(trial, model_name)
            pipe = build_pipeline(model_name, params, use_pca)

            scores = []
            for tr_idx, va_idx in skf.split(X_train, y_train):
                X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
                y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]
                pipe.fit(X_tr, y_tr)
                pred = pipe.predict(X_va)
                scores.append(f1_score(y_va, pred, average="binary"))
            return float(np.mean(scores))

        study = optuna.create_study(direction="maximize")
        study.optimize(objective, n_trials=OPTUNA_TRIALS)

        best_params = dict(study.best_params)
        best_cv_f1 = float(study.best_value)

        pipe = build_pipeline(model_name, best_params, use_pca)
        pipe.fit(X_train, y_train)

        pred = pipe.predict(X_test)
        f1 = float(f1_score(y_test, pred, average="binary"))

        metrics = {
            "f1": f1,
            "best_cv_f1": best_cv_f1,
            "optuna_trials": OPTUNA_TRIALS,
            "confusion_matrix": confusion_matrix(y_test, pred).tolist(),
            "classification_report": classification_report(y_test, pred, output_dict=True),
        }

        params_out = {
            "model": model_name,
            "use_pca": use_pca,
            "tuned": True,
            "pca_components": 20 if use_pca else None,
            **best_params,
        }

        out_dir = save_run(run_id, pipe, metrics, params_out)
        maybe_log_mlflow(run_id, params_out, metrics, out_dir)

        tuned_results.append({
            "run_id": run_id,
            "model": model_name,
            "use_pca": use_pca,
            "tuned": True,
            "f1": f1,
            "best_cv_f1": best_cv_f1,
            "artifact_dir": str(out_dir),
        })

        print(f"‚úÖ {run_id}: F1={f1:.4f} (CV best={best_cv_f1:.4f})")

tuned_leaderboard = pd.DataFrame(tuned_results).sort_values("f1", ascending=False)
tuned_leaderboard


[I 2025-12-19 11:13:44,993] A new study created in memory with name: no-name-cb32cc47-c7fb-4bd4-ac4c-c3c0d1310335
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b
[I 2025-12-19 11:13:46,700] Trial 0 finished with value: 0.0008771689372550095 and parameters: {'C': 0.068255285780164

üèÉ View run logreg__pca0__tuned1 at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0/runs/58c76e5c093440aa9696eec036f63e64
üß™ View experiment at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0
‚úÖ logreg__pca0__tuned1: F1=0.0003 (CV best=0.0018)


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transf

üèÉ View run logreg__pca1__tuned1 at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0/runs/2ab74f2e193c4653b90ce03a6c7e24c3
üß™ View experiment at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0


[I 2025-12-19 11:14:13,301] A new study created in memory with name: no-name-a7b6a384-b1ef-4b00-ad8e-8fa8c25573ba


‚úÖ logreg__pca1__tuned1: F1=0.0000 (CV best=0.0000)


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
[I 2025-12-19 11:14:14,413] Trial 0 finished with value: 2.7456687076137395e-05 and parameters: {'alpha': 0.031405979509513945}. Best is trial 0 with value: 2.7456687076137395e-05.
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
[I 2025-12-19 11:14:15,501] Trial 1 finished with value: 2.7456687076137395e-05 and parameters: {'alpha': 0.04216148144076629}. Best is trial 0 with value: 2.7456687076137395e-05.
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
[I 2025-12-19 11:14:16,573] Trial 2 finished with value: 2.7456687076137395e-05 and parameters: {'alpha': 2.373555070318159}. Best is t

üèÉ View run ridge__pca0__tuned1 at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0/runs/da541423ae3649f7add965ccca8f65af
üß™ View experiment at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0
‚úÖ ridge__pca0__tuned1: F1=0.0001 (CV best=0.0000)


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  ret = a @ b
  ret = a @ b
  ret = a @ b
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  ret = a @ b
  ret = a @ b
  ret = a @ b
  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(s

üèÉ View run ridge__pca1__tuned1 at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0/runs/47a97ece58b1435889b8e7d7382d68e3
üß™ View experiment at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0
‚úÖ ridge__pca1__tuned1: F1=0.0000 (CV best=0.0000)


[I 2025-12-19 11:18:08,812] Trial 0 finished with value: 0.14990741506117217 and parameters: {'n_estimators': 434, 'learning_rate': 0.030799689039374672, 'max_depth': 5}. Best is trial 0 with value: 0.14990741506117217.
[I 2025-12-19 11:18:55,660] Trial 1 finished with value: 0.04474330580040832 and parameters: {'n_estimators': 148, 'learning_rate': 0.03574224140083477, 'max_depth': 3}. Best is trial 0 with value: 0.14990741506117217.
[I 2025-12-19 11:19:28,463] Trial 2 finished with value: 0.003832762254432961 and parameters: {'n_estimators': 108, 'learning_rate': 0.03010479846585591, 'max_depth': 3}. Best is trial 0 with value: 0.14990741506117217.


In [None]:

lb_path = OUT_ROOT / "leaderboard_optuna.csv"
tuned_leaderboard.to_csv(lb_path, index=False)
print("Saved leaderboard:", lb_path.resolve())

best_run = tuned_leaderboard.iloc[0]
best_model_path = Path(best_run["artifact_dir"]) / "model.pkl"
best_out = ROOT / "models" / "best_model_optuna.pkl"
best_out.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(joblib.load(best_model_path), best_out)

print("Best tuned model:", best_run["run_id"], "F1=", best_run["f1"])
print("Saved best tuned model to:", best_out.resolve())
