# Week 2: Linear Regression, Part 2 (Lasso, Ridge, Elastic Net)

In [None]:

import os

import numpy as np
import pandas as pd

# Paths to uploaded datasets.
# The directory defaults to the original upload location but can be redirected
# by setting the DELAY_DATA_DIR environment variable, so the notebook also runs
# on machines where the CSVs live elsewhere.
DATA_DIR = (os.environ.get("DELAY_DATA_DIR") or "/mnt/data").rstrip("/")
DATA_SMALL = f"{DATA_DIR}/T_ONTIME_REPORTING.csv"
DATA_LARGE = f"{DATA_DIR}/DelayData.csv"

# Utility: fast preview of a CSV
def fast_preview(path, n=5):
    """Read and show the first ``n`` rows of a CSV without loading the whole file.

    Parameters
    ----------
    path : str or path-like
        Location of the CSV file.
    n : int, default 5
        Number of data rows to read and show.

    Returns
    -------
    pandas.DataFrame
        The rows that were read (at most ``n``).
    """
    # `display` is only available when running under an IPython/Jupyter kernel.
    # Fall back to plain printing so the helper also works when the notebook is
    # exported to a script or imported as a module.
    try:
        show = display  # noqa: F821 - provided by the notebook kernel
    except NameError:
        show = print
    print(f"Previewing {path}")
    # nrows=n stops parsing after n rows, which keeps this cheap on huge files.
    df = pd.read_csv(path, nrows=n)
    show(df.head(n))
    return df

# Utility: chunked iterator for large CSV
def chunk_reader(path, chunksize=100_000, usecols=None, dtype=None):
    """Open a CSV for chunk-wise reading via ``pd.read_csv``.

    Parameters
    ----------
    path : str or path-like
        CSV file to read.
    chunksize : int, default 100_000
        Number of rows per chunk, forwarded to ``pd.read_csv``.
    usecols : list-like or None
        Optional subset of columns to parse.
    dtype : dict, type or None
        Optional dtype specification forwarded to ``pd.read_csv``.

    Returns
    -------
    Whatever ``pd.read_csv`` returns for these options; with an integer
    ``chunksize`` that is an iterator yielding one DataFrame per chunk.
    """
    read_options = {
        "chunksize": chunksize,
        "usecols": usecols,
        "dtype": dtype,
    }
    return pd.read_csv(path, **read_options)

# Utility: downsample large dataset for experiments
def load_sample_from_large(n_rows=200_000, usecols=None, path=None, chunksize=100_000):
    """Load the first ``n_rows`` rows of the large CSV by streaming it in chunks.

    The file is read chunk by chunk and reading stops as soon as enough rows
    have been collected, so the full file is never held in memory. Note that
    this takes the *leading* rows of the file; it is not a random sample.

    Parameters
    ----------
    n_rows : int, default 200_000
        Maximum number of rows to return. Must be at least 1.
    usecols : list-like or None
        Optional subset of columns to parse.
    path : str, path-like or None
        CSV to read. ``None`` (the default) means ``DATA_LARGE``.
    chunksize : int, default 100_000
        Upper bound on the number of rows parsed per chunk.

    Returns
    -------
    pandas.DataFrame
        At most ``n_rows`` rows with a fresh 0..k-1 index. Fewer rows are
        returned only when the file itself is shorter than ``n_rows``.

    Raises
    ------
    ValueError
        If ``n_rows`` is smaller than 1 or no rows could be read.
    """
    if n_rows < 1:
        raise ValueError(f"n_rows must be a positive integer, got {n_rows!r}")

    source = DATA_LARGE if path is None else path
    # No point parsing chunks larger than the sample we are asked for.
    reader = chunk_reader(source, chunksize=min(chunksize, n_rows), usecols=usecols)

    rows = []
    total = 0
    try:
        # Stream chunks until we accumulate n_rows
        for chunk in reader:
            rows.append(chunk)
            total += len(chunk)
            if total >= n_rows:
                break
    finally:
        # Leaving the loop early does not exhaust the reader, so release the
        # underlying file handle explicitly.
        reader.close()

    if not rows:
        raise ValueError(f"No rows could be read from {source}")

    # The last chunk usually overshoots the target; trim so the caller gets
    # exactly the number of rows requested.
    df = pd.concat(rows, ignore_index=True).iloc[:n_rows]
    print(f"Loaded sample of {len(df):,} rows from large file")
    return df

# Sanity check: peek at the first five rows of each dataset, small file first.
# The result is bound to `_` so the returned frame is not echoed a second time.
for _preview_path in (DATA_SMALL, DATA_LARGE):
    _ = fast_preview(_preview_path, n=5)


In [None]:

# Build the modelling table: arrival delay is the target, every other
# selected column is a predictor.
usecols = [
    # target
    "arrdelay",
    # schedule / calendar
    "depdelay",
    "scheduledhour",
    "month",
    "dayofmonth",
    # weather
    "temperature",
    "windspeed",
    "raindummy",
    "snowdummy",
    # market structure
    "marketshareorigin",
    "marketsharedest",
    "hhiorigin",
    "hhidest",
]

# Stream a sample from the large file, then discard rows with any missing value.
df = load_sample_from_large(n_rows=250_000, usecols=usecols)
df = df.dropna()

y = df.loc[:, "arrdelay"]
X = df.drop(columns="arrdelay")

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.metrics import mean_squared_error

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# One representative setting per regularizer; keys become the labels in `results`.
models = {
    "Ridge_1.0": Ridge(alpha=1.0, random_state=42),
    "Lasso_0.01": Lasso(alpha=0.01, random_state=42, max_iter=5000),
    "Elastic_0.1": ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42, max_iter=5000),
}

# Test-set RMSE per model name.
results = {}
for name, model in models.items():
    # Scaling lives inside the pipeline so it is fitted on the training rows
    # only; penalised models need features on a comparable scale.
    pipe = make_pipeline(StandardScaler(with_mean=True), model)
    pipe.fit(X_train, y_train)
    pred = pipe.predict(X_test)
    # Take the square root ourselves: the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas sqrt(MSE) works on every version.
    rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
    results[name] = rmse

results


In [None]:

# Hyperparameter tuning example with cross validation for Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import Ridge

# Candidate penalties on a log scale. The "ridge__" prefix is the step name
# make_pipeline derives from the Ridge class.
param_grid = {"ridge__alpha":[0.01, 0.1, 1.0, 10.0, 100.0]}
pipe = make_pipeline(StandardScaler(), Ridge())
# 3-fold CV on the training split only; the scorer is negated RMSE, so values
# closer to zero are better.
grid = GridSearchCV(pipe, param_grid=param_grid, cv=3, scoring="neg_root_mean_squared_error", n_jobs=-1)
grid.fit(X_train, y_train)

print("Best params:", grid.best_params_)
print("Best CV score (neg RMSE):", grid.best_score_)
# best_estimator_ is the pipeline refitted on all of X_train with the best alpha.
best_model = grid.best_estimator_
# sqrt(MSE) instead of `squared=False`: that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6.
test_rmse = float(np.sqrt(mean_squared_error(y_test, best_model.predict(X_test))))
print("Test RMSE:", test_rmse)


### Notes
Use regularization to control variance and reduce overfitting. Report the tuned alpha selected by cross-validation. Record the test RMSE and any change relative to the Week 1 result.