# Week 4: Logistic Regression and Feature Scaling

In [None]:

import pandas as pd
import numpy as np

# Paths to uploaded datasets (hard-coded absolute locations; edit these if the
# files live somewhere else).
DATA_SMALL = "/mnt/data/T_ONTIME_REPORTING.csv"
# Read in chunks by load_sample_from_large() instead of being loaded in one go.
DATA_LARGE = "/mnt/data/DelayData.csv"

# Utility: fast preview of a CSV
def fast_preview(path, n=5):
    """Show and return the first ``n`` rows of the CSV at ``path``.

    Only ``n`` rows are parsed (``nrows=n``), so the call stays cheap even
    for very large files.

    Args:
        path: Location of the CSV file to peek at.
        n: Number of leading data rows to read and show.

    Returns:
        The DataFrame holding the rows that were read.
    """
    print(f"Previewing {path}")
    preview = pd.read_csv(path, nrows=n)
    # NOTE(review): `display` is not imported in this file; presumably the
    # IPython/Jupyter builtin, so this only runs inside a notebook kernel.
    head_rows = preview.head(n)
    display(head_rows)
    return preview

# Utility: chunked iterator for large CSV
def chunk_reader(path, chunksize=100_000, usecols=None, dtype=None):
    """Open the CSV at ``path`` for chunk-by-chunk reading.

    Args:
        path: Location of the CSV file.
        chunksize: Number of rows per yielded DataFrame.
        usecols: Optional subset of columns to parse, forwarded to pandas.
        dtype: Optional dtype specification, forwarded to pandas.

    Returns:
        The iterator returned by ``pd.read_csv`` when ``chunksize`` is set;
        each iteration yields a DataFrame of up to ``chunksize`` rows.
    """
    reader_options = {
        "chunksize": chunksize,
        "usecols": usecols,
        "dtype": dtype,
    }
    return pd.read_csv(path, **reader_options)

# Utility: downsample large dataset for experiments
def load_sample_from_large(n_rows=200_000, usecols=None):
    """Load the first ``n_rows`` rows of ``DATA_LARGE`` by streaming chunks.

    The file is read in 100,000-row chunks and reading stops as soon as
    enough rows have been collected. The final chunk is trimmed so that at
    most ``n_rows`` rows are returned (fewer only if the file is shorter).

    NOTE: this is the *head* of the file, not a random sample. If the file is
    ordered (for example chronologically), the result is not representative
    of the whole dataset.

    Args:
        n_rows: Maximum number of rows to return. Values <= 0 yield an empty
            frame that still carries the requested columns.
        usecols: Optional subset of columns to parse, forwarded to
            ``chunk_reader``.

    Returns:
        A DataFrame with a fresh 0..N-1 index.
    """
    remaining = max(int(n_rows), 0)
    pieces = []
    if remaining:
        for chunk in chunk_reader(DATA_LARGE, chunksize=100_000, usecols=usecols):
            # Trim the last chunk so we never return more than was asked for.
            piece = chunk.iloc[:remaining]
            pieces.append(piece)
            remaining -= len(piece)
            if remaining <= 0:
                break

    if pieces:
        df = pd.concat(pieces, ignore_index=True)
    else:
        # pd.concat([]) raises; read just the header so callers still get a
        # frame with the expected columns.
        df = pd.read_csv(DATA_LARGE, nrows=0, usecols=usecols)
    print(f"Loaded sample of {len(df):,} rows from large file")
    return df

# Quick sanity check previews: read and show the first 5 rows of each file.
# fast_preview() returns the frame it read; binding it to `_` discards it
# (presumably to keep the notebook from echoing the frame a second time).
_ = fast_preview(DATA_SMALL, n=5)
_ = fast_preview(DATA_LARGE, n=5)


In [None]:

# Build a binary classification target: arrival delay greater than 15 minutes,
# then fit a logistic regression on standardized features and evaluate it.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score, RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

usecols = [
    "arrdelay","depdelay","scheduledhour","month","dayofmonth",
    "temperature","windspeed","raindummy","snowdummy",
    "marketshareorigin","marketsharedest"
]
# Rows with a missing value in any selected column are dropped.
df = load_sample_from_large(n_rows=250_000, usecols=usecols).dropna()

df["target_delay15"] = (df["arrdelay"] > 15).astype(int)
y = df["target_delay15"].values

# arrdelay defines the target, so it must not appear among the features.
features = df.drop(columns=["arrdelay","target_delay15"])
# Take the names from the frame, not from `usecols`: read_csv does not
# preserve the order of the `usecols` list. These names line up with the
# columns of X (and therefore with clf.coef_ below).
feature_names = features.columns.tolist()
X = features.values

# Report class balance so precision/recall at the 0.5 threshold below can be
# interpreted against the base rate.
positive_rate = y.mean()
print(
    f"Class balance: {positive_rate:.1%} delayed > 15 min, "
    f"{1 - positive_rate:.1%} not delayed ({len(y):,} rows)"
)

# Stratify so train and test keep the same share of delayed flights.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Fit the scaler on the training split only and reuse its statistics on the
# test split, so no information from the test data leaks into preprocessing.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

clf = LogisticRegression(max_iter=2000, solver="lbfgs")
clf.fit(X_train_s, y_train)
# Column 1 of predict_proba is the probability of the positive class (1).
proba = clf.predict_proba(X_test_s)[:,1]
pred = (proba >= 0.5).astype(int)

print(classification_report(y_test, pred))
print("ROC AUC:", roc_auc_score(y_test, proba))
RocCurveDisplay.from_predictions(y_test, proba)


### Notes
Explain class imbalance if present, and report ROC AUC, precision, and recall. Describe the feature scaling that was applied and why it is needed.