# Week 6: Decision Trees and Random Forests

In [None]:

import pandas as pd
import numpy as np

# Paths to uploaded datasets
# DATA_SMALL is only previewed in the code visible here; DATA_LARGE is both
# previewed and streamed in chunks by load_sample_from_large.
DATA_SMALL = "/mnt/data/T_ONTIME_REPORTING.csv"
DATA_LARGE = "/mnt/data/DelayData.csv"

# Utility: fast preview of a CSV
def fast_preview(path, n=5):
    """Print the path, show the first ``n`` rows of the CSV, and return them.

    Only ``n`` rows are parsed (``nrows=n``), so previewing stays cheap even
    when the file is large.

    Args:
        path: Location of the CSV file to preview.
        n: Number of leading data rows to read and display.

    Returns:
        The DataFrame holding the rows that were read.
    """
    print(f"Previewing {path}")
    preview = pd.read_csv(path, nrows=n)
    shown = preview.head(n)
    # `display` is the rich-output helper available inside the notebook.
    display(shown)
    return preview

# Utility: chunked iterator for large CSV
def chunk_reader(path, chunksize=100_000, usecols=None, dtype=None):
    """Open a CSV for chunk-by-chunk reading instead of loading it at once.

    Args:
        path: Location of the CSV file.
        chunksize: Number of rows per yielded chunk.
        usecols: Optional column selection forwarded to ``pd.read_csv``.
        dtype: Optional dtype specification forwarded to ``pd.read_csv``.

    Returns:
        Whatever ``pd.read_csv`` returns when ``chunksize`` is given: an
        iterable that yields one DataFrame per chunk.
    """
    read_options = {
        "chunksize": chunksize,
        "usecols": usecols,
        "dtype": dtype,
    }
    return pd.read_csv(path, **read_options)

# Utility: downsample large dataset for experiments
def load_sample_from_large(n_rows=200_000, usecols=None):
    """Load the first ``n_rows`` rows of ``DATA_LARGE`` by streaming chunks.

    The file is read in 100,000-row chunks and reading stops as soon as
    enough rows have been collected, so the whole file is never held in
    memory. The rows are the leading rows of the file, not a random sample.

    Args:
        n_rows: Maximum number of rows to return. Fewer rows come back only
            when the file itself is shorter.
        usecols: Optional column selection forwarded to ``chunk_reader``.

    Returns:
        A DataFrame with at most ``n_rows`` rows and a fresh 0..N-1 index.

    Raises:
        ValueError: If ``n_rows`` is negative.
    """
    if n_rows < 0:
        raise ValueError(f"n_rows must be non-negative, got {n_rows}")

    rows = []
    total = 0
    # Using the reader as a context manager closes the underlying file handle
    # even though the loop normally exits early via `break`.
    with chunk_reader(DATA_LARGE, chunksize=100_000, usecols=usecols) as reader:
        for chunk in reader:
            # Trim the last chunk so the result has exactly n_rows rows rather
            # than being rounded up to a whole number of chunks.
            chunk = chunk.iloc[: n_rows - total]
            rows.append(chunk)
            total += len(chunk)
            if total >= n_rows:
                break
    df = pd.concat(rows, ignore_index=True)
    print(f"Loaded sample of {len(df):,} rows from large file")
    return df

# Quick sanity check previews
# Each call prints the path and displays the first 5 rows of the file.
# The returned frame is bound to the throwaway name `_`; presumably this keeps
# the notebook from echoing it a second time as the cell's output.
_ = fast_preview(DATA_SMALL, n=5)
_ = fast_preview(DATA_LARGE, n=5)


In [None]:

# Use the same binary target setup
# Columns read from the large file: the arrival delay (used only to build the
# label) plus the candidate predictors.
usecols = [
    "arrdelay",
    "depdelay",
    "scheduledhour",
    "month",
    "dayofmonth",
    "temperature",
    "windspeed",
    "raindummy",
    "snowdummy",
    "marketshareorigin",
    "marketsharedest",
    "hhiorigin",
    "hhidest",
]

# Load the sample, then discard every row that has a missing value in any of
# the selected columns.
df = load_sample_from_large(n_rows=250_000, usecols=usecols)
df = df.dropna()

# Label is 1 when arrdelay is strictly greater than 15, otherwise 0.
df["target_delay15"] = df["arrdelay"].gt(15).astype(int)

# Features are every remaining column; arrdelay is removed because the label
# is derived directly from it.
X = df.drop(columns=["arrdelay", "target_delay15"]).values
y = df["target_delay15"].values

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_auc_score
import pandas as pd

# Hold out 20% of the rows for testing. Stratifying on y keeps the share of
# each class the same in both parts; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Decision tree with simple tuning
# 4 depths x 3 split thresholds = 12 candidates, each scored by ROC AUC with
# 3-fold cross-validation on the training split.
param_tree = dict(
    max_depth=[5, 10, 15, 20],
    min_samples_split=[2, 10, 50],
)
grid_tree = GridSearchCV(
    estimator=DecisionTreeClassifier(random_state=42),
    param_grid=param_tree,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
)
grid_tree.fit(X_train, y_train)
tree_best = grid_tree.best_estimator_
print("Tree best params:", grid_tree.best_params_)

# Random forest
# 2 x 2 x 2 = 8 candidates, each scored by ROC AUC with 3-fold
# cross-validation on the training split.
# NOTE(review): n_jobs=-1 is set on both the forest and the grid search; this
# may oversubscribe the CPU. Confirm whether one level should be limited.
param_rf = dict(
    n_estimators=[100, 200],
    max_depth=[10, None],
    min_samples_split=[2, 20],
)
grid_rf = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_grid=param_rf,
    scoring="roc_auc",
    cv=3,
    n_jobs=-1,
)
grid_rf.fit(X_train, y_train)
rf_best = grid_rf.best_estimator_
print("RF best params:", grid_rf.best_params_)

# Evaluate
# Score both tuned models on the held-out split. Class predictions use a 0.5
# probability cut-off; ROC AUC is computed from the raw probabilities.
for name, model in {"Decision Tree": tree_best, "Random Forest": rf_best}.items():
    # Column 1 of predict_proba is the second class, i.e. label 1 here,
    # assuming both labels occur in the training data.
    proba = model.predict_proba(X_test)[:, 1]
    pred = (proba >= 0.5).astype(int)
    print(f"\n{name}")
    print(classification_report(y_test, pred))
    print("ROC AUC:", roc_auc_score(y_test, proba))

# Feature importance from Random Forest
importances = rf_best.feature_importances_
# Same columns, in the same order, as those used to build X, so each
# importance lines up with its feature name.
feat_names = df.columns.drop(["arrdelay", "target_delay15"])
fi = (
    pd.Series(data=importances, index=feat_names)
    .sort_values(ascending=False)
    .head(15)
)
# Left as a bare expression so the notebook renders the table.
fi.to_frame(name="importance")


### Notes
Record the top features and how they align with domain expectations. Discuss overfitting controls such as maximum depth, minimum samples per split, and out-of-bag (OOB) evaluation, if used.