# Week 3: Linear Regression, Part 3 — Forward/Backward Selection, PCR, and PLSR

In [None]:

import os

import numpy as np
import pandas as pd

# Paths to uploaded datasets.
# The directory defaults to the original upload location but can be overridden
# with the DELAY_DATA_DIR environment variable, so the notebook is not tied to
# one machine's absolute path.
DATA_DIR = os.environ.get("DELAY_DATA_DIR", "/mnt/data")
DATA_SMALL = os.path.join(DATA_DIR, "T_ONTIME_REPORTING.csv")
DATA_LARGE = os.path.join(DATA_DIR, "DelayData.csv")

# Utility: fast preview of a CSV
def fast_preview(path, n=5):
    """Show and return the first ``n`` rows of the CSV file at ``path``.

    Only ``n`` data rows are requested from pandas (``nrows=n``). The rows are
    rendered with ``display`` and the loaded DataFrame is returned to the caller.
    """
    print(f"Previewing {path}")
    preview = pd.read_csv(path, nrows=n)
    head_rows = preview.head(n)
    display(head_rows)
    return preview

# Utility: chunked iterator for large CSV
def chunk_reader(path, chunksize=100_000, usecols=None, dtype=None):
    """Open the CSV at ``path`` for chunked reading.

    ``chunksize``, ``usecols`` and ``dtype`` are forwarded unchanged to
    ``pd.read_csv`` and its result is returned as-is. With a non-None
    ``chunksize`` pandas returns an iterator that yields DataFrames of up to
    ``chunksize`` rows each.
    """
    read_options = {"chunksize": chunksize, "usecols": usecols, "dtype": dtype}
    return pd.read_csv(path, **read_options)

# Utility: downsample large dataset for experiments
def load_sample_from_large(n_rows=200_000, usecols=None):
    """Load the first ``n_rows`` rows of ``DATA_LARGE`` by streaming it in chunks.

    Rows are taken from the head of the file in file order; no random sampling
    is performed.

    Parameters
    ----------
    n_rows : int
        Maximum number of rows to return. Must be non-negative.
    usecols : list-like or None
        Column subset forwarded to ``chunk_reader``.

    Returns
    -------
    pandas.DataFrame
        At most ``n_rows`` rows (fewer if the file is shorter), with a fresh
        0..N-1 index.

    Raises
    ------
    ValueError
        If ``n_rows`` is negative or the reader yields no chunks at all.
    """
    if n_rows < 0:
        raise ValueError(f"n_rows must be non-negative, got {n_rows}")

    pieces = []
    remaining = n_rows
    # The context manager closes the underlying file handle even though we
    # normally stop iterating before the file is exhausted.
    with chunk_reader(DATA_LARGE, chunksize=100_000, usecols=usecols) as reader:
        for chunk in reader:
            # Trim the last chunk so the result never exceeds n_rows, even when
            # n_rows is not a multiple of the chunk size.
            piece = chunk.iloc[:remaining]
            pieces.append(piece)
            remaining -= len(piece)
            if remaining <= 0:
                break

    if not pieces:
        raise ValueError(f"No rows could be read from {DATA_LARGE}")

    df = pd.concat(pieces, ignore_index=True)
    print(f"Loaded sample of {len(df):,} rows from large file")
    return df

# Quick sanity check previews.
# Looping discards the returned frames without assigning to `_`, which IPython
# uses for the most recent cell output.
for preview_path in (DATA_SMALL, DATA_LARGE):
    fast_preview(preview_path, n=5)


In [None]:

# PCR cell: load a moderate feature set, then fit PCA followed by linear regression.
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Load a moderate feature set
usecols = [
    "arrdelay","depdelay","scheduledhour","month","dayofmonth",
    "temperature","windspeed","raindummy","snowdummy",
    "marketshareorigin","marketsharedest","hhiorigin","hhidest"
]
# Rows with any missing value in the selected columns are dropped.
df = load_sample_from_large(n_rows=200_000, usecols=usecols).dropna()

# Target is arrival delay; every other selected column is a predictor.
y = df["arrdelay"].values
X = df.drop(columns=["arrdelay"]).values

# Fixed random_state keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# PCR: PCA followed by Linear Regression.
# The scaler is fit on the training split only and then applied to the test
# split, so no test-set statistics leak into the transformation.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# A fractional n_components (with svd_solver="full") keeps as many components
# as are needed to explain that share of the variance.
pca = PCA(n_components=0.95, svd_solver="full")
X_train_p = pca.fit_transform(X_train_s)
X_test_p = pca.transform(X_test_s)

lin = LinearRegression()
lin.fit(X_train_p, y_train)
pred = lin.predict(X_test_p)

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse = np.sqrt(mean_squared_error(y_test, pred))
r2 = r2_score(y_test, pred)

print("PCR RMSE:", rmse)
print("PCR R2:", r2)
print("Num components:", pca.n_components_)
# Total share of variance captured by the retained components.
print("PCA variance explained:", pca.explained_variance_ratio_.sum())


In [None]:

# PLSR using sklearn cross decomposition
from sklearn.cross_decomposition import PLSRegression

# Use up to 10 latent components, capped at the number of predictor columns.
pls = PLSRegression(n_components=min(10, X_train.shape[1]))
pls.fit(X_train_s, y_train)
# ravel() flattens the predictions to 1-D so they line up with y_test.
pred_pls = pls.predict(X_test_s).ravel()

# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_pls = np.sqrt(mean_squared_error(y_test, pred_pls))
r2_pls = r2_score(y_test, pred_pls)
print("PLSR RMSE:", rmse_pls)
print("PLSR R2:", r2_pls)


### Notes
Discuss your selection strategy. For forward or backward selection, you can compare candidate feature subsets using an information criterion (e.g., AIC or BIC) or cross-validation. For PCR, record the variance explained by the PCA components that are retained.