# 02 ‚Äî Crime: Violent vs Non‚ÄëViolent (Without Optuna)

Adapted to your normalized SQLite DB: `data/crime.db`.

**Target:** `violent` (0/1)

**Runs in this notebook:** 8 baseline experiments = 4 models √ó (PCA off/on).

Saves artifacts per run to `models/experiments/<run_id>/`.


In [7]:
# If needed: !pip install lightgbm "mlflow<3" scikit-learn pandas numpy joblib
from dotenv import load_dotenv
import os
import mlflow

load_dotenv()  # loads .env file

MLFLOW_OK = True
try:
    mlflow.set_tracking_uri(os.environ["MLFLOW_TRACKING_URI"])
    mlflow.set_experiment("crime-violent-vs-nonviolent")
    print("‚úÖ MLflow connected to:", os.environ["MLFLOW_TRACKING_URI"])
except Exception as e:
    print("‚ö†Ô∏è MLflow disabled:", e)
    MLFLOW_OK = False

‚úÖ MLflow connected to: https://dagshub.com/kt19-jpeg/ml_project.mlflow


In [8]:

import os, json, sqlite3
from pathlib import Path
import numpy as np
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import f1_score, confusion_matrix, classification_report

from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier

try:
    import mlflow
    MLFLOW_OK = True
except Exception as e:
    print("‚ö†Ô∏è mlflow not installed; skipping MLflow logging.")
    MLFLOW_OK = False

SEED = 42

def find_project_root() -> Path:
    here = Path.cwd().resolve()
    for p in [here] + list(here.parents):
        if (p / "data" / "crime.db").exists():
            return p
    return here

ROOT = find_project_root()
DB_PATH = ROOT / "data" / "crime.db"
OUT_ROOT = ROOT / "models" / "experiments"
OUT_ROOT.mkdir(parents=True, exist_ok=True)

def _make_ohe():
    try:
        return OneHotEncoder(handle_unknown="ignore", sparse_output=False)
    except TypeError:
        return OneHotEncoder(handle_unknown="ignore", sparse=False)

print("ROOT:", ROOT)
print("DB_PATH:", DB_PATH)
print("OUT_ROOT:", OUT_ROOT)


ROOT: /Users/kavyansh/IdeaProjects/ml_project/buffalo_crime
DB_PATH: /Users/kavyansh/IdeaProjects/ml_project/buffalo_crime/data/crime.db
OUT_ROOT: /Users/kavyansh/IdeaProjects/ml_project/buffalo_crime/models/experiments


In [9]:

SQL = '''
SELECT
  i.incident_id,
  i.hour_of_day,
  i.latitude,
  i.longitude,
  i.zip_code,
  i.council_district,
  dow.name AS day_of_week,
  pd.name  AS police_district,
  nb.name  AS neighborhood,
  l.violent
FROM incidents i
JOIN labels l ON l.incident_id = i.incident_id
LEFT JOIN day_of_week_dim dow ON dow.day_of_week_id = i.day_of_week_id
LEFT JOIN police_district_dim pd ON pd.police_district_id = i.police_district_id
LEFT JOIN neighborhood_dim nb ON nb.neighborhood_id = i.neighborhood_id;
'''

if not DB_PATH.exists():
    raise FileNotFoundError(f"Could not find DB at {DB_PATH}")

con = sqlite3.connect(DB_PATH)
df = pd.read_sql_query(SQL, con)
con.close()

print("Loaded:", df.shape)
df.head()


Loaded: (327558, 10)


Unnamed: 0,incident_id,hour_of_day,latitude,longitude,zip_code,council_district,day_of_week,police_district,neighborhood,violent
0,09-2820596,3,42.885,-78.879,14202,FILLMORE,Friday,District B,Central,0
1,06-1740516,0,42.939,-78.838,14214,MASTEN,Tuesday,District E,Fillmore-Leroy,0
2,08-1760604,15,42.935,-78.866,14216,NORTH,Tuesday,District D,Parkside,1
3,06-0210543,13,42.899,-78.81,14211,LOVEJOY,Saturday,District C,Genesee-Moselle,0
4,06-2120630,16,42.948,-78.906,14207,NORTH,Monday,District D,Riverside,0


In [10]:

TARGET_COL = "violent"
FEATURE_COLS = [
    "hour_of_day", "latitude", "longitude",
    "zip_code", "council_district",
    "day_of_week", "police_district", "neighborhood",
]

df = df[FEATURE_COLS + [TARGET_COL]].dropna(subset=[TARGET_COL]).copy()
df[TARGET_COL] = df[TARGET_COL].astype(int)

X = df[FEATURE_COLS]
y = df[TARGET_COL]

print("Target counts:\n", y.value_counts())
print("\nTarget %:\n", (y.value_counts(normalize=True) * 100).round(2))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

NUM_COLS = ["hour_of_day", "latitude", "longitude"]
CAT_COLS = ["zip_code", "council_district", "day_of_week", "police_district", "neighborhood"]

preprocess = ColumnTransformer(
    transformers=[
        ("num", Pipeline([
            ("imputer", SimpleImputer(strategy="median")),
            ("scaler", StandardScaler()),
        ]), NUM_COLS),
        ("cat", Pipeline([
            ("imputer", SimpleImputer(strategy="most_frequent")),
            ("ohe", _make_ohe()),
        ]), CAT_COLS),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)

print("Train:", X_train.shape, "Test:", X_test.shape)


Target counts:
 violent
0    234988
1     92570
Name: count, dtype: int64

Target %:
 violent
0    71.74
1    28.26
Name: proportion, dtype: float64
Train: (262046, 8) Test: (65512, 8)


In [11]:

# Baseline models (no tuning). Using LightGBM instead of XGBoost to avoid macOS OpenMP issues.
MODELS = {
    "logreg": LogisticRegression(C=1.0, solver="lbfgs", max_iter=3000, random_state=SEED),
    "ridge": RidgeClassifier(alpha=1.0, random_state=SEED),
    "gboost": GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=3, random_state=SEED),
    "et": ExtraTreesClassifier(
        n_estimators=100,
        max_depth=18,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features="sqrt",
        n_jobs=-1,
        random_state=SEED
    ),
}
PCA_FLAGS = [False, True]  # PCA off/on

def build_pipeline(model_name: str, use_pca: bool):
    steps = [("preprocess", preprocess)]
    if use_pca:
        steps.append(("pca", PCA(n_components=20, random_state=SEED)))
    steps.append(("clf", MODELS[model_name]))
    return Pipeline(steps)

def save_run(run_id: str, pipe: Pipeline, metrics: dict, params: dict):
    out_dir = OUT_ROOT / run_id
    out_dir.mkdir(parents=True, exist_ok=True)
    joblib.dump(pipe, out_dir / "model.pkl")
    (out_dir / "metrics.json").write_text(json.dumps(metrics, indent=2), encoding="utf-8")
    (out_dir / "params.json").write_text(json.dumps(params, indent=2), encoding="utf-8")
    return out_dir

def maybe_log_mlflow(run_id: str, params: dict, metrics: dict, artifact_dir: Path):
    if not MLFLOW_OK:
        return
    mlflow.set_experiment("crime-violent-vs-nonviolent")
    with mlflow.start_run(run_name=run_id):
        mlflow.log_params(params)
        mlflow.log_metric("f1", metrics["f1"])
        mlflow.log_artifacts(str(artifact_dir))


In [12]:

# Run 8 experiments: 4 models √ó (PCA off/on), tuned=0
results = []
for model_name in MODELS.keys():
    for use_pca in PCA_FLAGS:
        run_id = f"{model_name}__pca{int(use_pca)}__tuned0"

        pipe = build_pipeline(model_name, use_pca)
        pipe.fit(X_train, y_train)

        pred = pipe.predict(X_test)
        f1 = float(f1_score(y_test, pred, average="binary"))

        metrics = {
            "f1": f1,
            "confusion_matrix": confusion_matrix(y_test, pred).tolist(),
            "classification_report": classification_report(y_test, pred, output_dict=True),
        }
        params = {
            "model": model_name,
            "use_pca": use_pca,
            "tuned": False,
            "pca_components": 20 if use_pca else None,
        }

        out_dir = save_run(run_id, pipe, metrics, params)
        maybe_log_mlflow(run_id, params, metrics, out_dir)

        results.append({
            "run_id": run_id,
            "model": model_name,
            "use_pca": use_pca,
            "tuned": False,
            "f1": f1,
            "artifact_dir": str(out_dir),
        })

        print(f"‚úÖ {run_id}: F1={f1:.4f}")

leaderboard = pd.DataFrame(results).sort_values("f1", ascending=False)
leaderboard


  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  ret = a @ b
  ret = a @ b
  ret = a @ b


üèÉ View run logreg__pca0__tuned0 at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0/runs/3248da041b08492abdc8abfb3d73bde8
üß™ View experiment at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0
‚úÖ logreg__pca0__tuned0: F1=0.0001


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  raw_prediction = X @ weights + intercept
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  grad[:n_features] = X.T @ grad_pointwise + l2_reg_strength * weights
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transf

üèÉ View run logreg__pca1__tuned0 at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0/runs/f49e91a7753a4f0f8cbfb84f0dcb159e
üß™ View experiment at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0
‚úÖ logreg__pca1__tuned0: F1=0.0000


  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b


üèÉ View run ridge__pca0__tuned0 at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0/runs/0ec5c366387545e293308c4376bffa65
üß™ View experiment at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0
‚úÖ ridge__pca0__tuned0: F1=0.0000


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  ret = a @ b
  ret = a @ b
  ret = a @ b
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  ret = a @ b
  ret = a @ b
  ret = a @ b
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitaliz

üèÉ View run ridge__pca1__tuned0 at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0/runs/0c6e32a4f037434b9cb94c0cd477c11b
üß™ View experiment at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0
‚úÖ ridge__pca1__tuned0: F1=0.0000
üèÉ View run gboost__pca0__tuned0 at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0/runs/dfd8d2707562469db7cea1744f58465d
üß™ View experiment at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0
‚úÖ gboost__pca0__tuned0: F1=0.1302


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T


üèÉ View run gboost__pca1__tuned0 at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0/runs/a15d2ec7c4f94cc9bb13ba8fb83616d0
üß™ View experiment at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0
‚úÖ gboost__pca1__tuned0: F1=0.1193
üèÉ View run et__pca0__tuned0 at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0/runs/329d537490f44e39b8150cfd456aad48
üß™ View experiment at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0
‚úÖ et__pca0__tuned0: F1=0.0138


  C = X.T @ X
  C = X.T @ X
  C = X.T @ X
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed = X @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T
  X_transformed -= xp.reshape(self.mean_, (1, -1)) @ self.components_.T


üèÉ View run et__pca1__tuned0 at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0/runs/eadfdd87ff634e888e5c53adbfd50aec
üß™ View experiment at: https://dagshub.com/kt19-jpeg/ml_project.mlflow/#/experiments/0
‚úÖ et__pca1__tuned0: F1=0.1172


Unnamed: 0,run_id,model,use_pca,tuned,f1,artifact_dir
4,gboost__pca0__tuned0,gboost,False,False,0.130206,/Users/kavyansh/IdeaProjects/ml_project/buffal...
5,gboost__pca1__tuned0,gboost,True,False,0.119332,/Users/kavyansh/IdeaProjects/ml_project/buffal...
7,et__pca1__tuned0,et,True,False,0.117239,/Users/kavyansh/IdeaProjects/ml_project/buffal...
6,et__pca0__tuned0,et,False,False,0.013772,/Users/kavyansh/IdeaProjects/ml_project/buffal...
0,logreg__pca0__tuned0,logreg,False,False,0.000108,/Users/kavyansh/IdeaProjects/ml_project/buffal...
1,logreg__pca1__tuned0,logreg,True,False,0.0,/Users/kavyansh/IdeaProjects/ml_project/buffal...
2,ridge__pca0__tuned0,ridge,False,False,0.0,/Users/kavyansh/IdeaProjects/ml_project/buffal...
3,ridge__pca1__tuned0,ridge,True,False,0.0,/Users/kavyansh/IdeaProjects/ml_project/buffal...


In [13]:

# Save leaderboard + best model
lb_path = OUT_ROOT / "leaderboard_no_optuna.csv"
leaderboard.to_csv(lb_path, index=False)
print("Saved leaderboard:", lb_path.resolve())

best_run = leaderboard.iloc[0]
best_model_path = Path(best_run["artifact_dir"]) / "model.pkl"
best_out = ROOT / "models" / "best_model_no_optuna.pkl"
best_out.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(joblib.load(best_model_path), best_out)

print("Best baseline model:", best_run["run_id"], "F1=", best_run["f1"])
print("Saved best baseline model to:", best_out.resolve())


Saved leaderboard: /Users/kavyansh/IdeaProjects/ml_project/buffalo_crime/models/experiments/leaderboard_no_optuna.csv
Best baseline model: gboost__pca0__tuned0 F1= 0.13020561280212736
Saved best baseline model to: /Users/kavyansh/IdeaProjects/ml_project/buffalo_crime/models/best_model_no_optuna.pkl
