# Week 1: Linear Regression, Part 1

In [None]:

import pandas as pd
import numpy as np

# Paths to uploaded datasets.
# DATA_SMALL is only previewed in this notebook; DATA_LARGE is the file that
# load_sample_from_large streams in chunks.
# NOTE(review): these are absolute, environment-specific paths — presumably an
# upload directory of a hosted runtime; adjust them when running elsewhere.
DATA_SMALL = "/mnt/data/T_ONTIME_REPORTING.csv"
DATA_LARGE = "/mnt/data/DelayData.csv"

# Utility: fast preview of a CSV
def fast_preview(path, n=5):
    """Show and return the first ``n`` rows of a CSV without reading the rest.

    Args:
        path: Location of the CSV file.
        n: Number of rows to read from the top of the file.

    Returns:
        A DataFrame holding at most ``n`` rows.
    """
    print(f"Previewing {path}")
    # nrows stops the parser early, so large files are not read in full.
    preview = pd.read_csv(path, nrows=n)
    display(preview.head(n))
    return preview

# Utility: chunked iterator for large CSV
def chunk_reader(path, chunksize=100_000, usecols=None, dtype=None):
    """Open a CSV for chunk-by-chunk reading.

    Args:
        path: Location of the CSV file.
        chunksize: Number of rows per yielded DataFrame.
        usecols: Optional column subset forwarded to ``pd.read_csv``.
        dtype: Optional dtype specification forwarded to ``pd.read_csv``.

    Returns:
        The object ``pd.read_csv`` returns when ``chunksize`` is set; iterating
        it yields DataFrames of up to ``chunksize`` rows.
    """
    read_options = {"chunksize": chunksize, "usecols": usecols, "dtype": dtype}
    return pd.read_csv(path, **read_options)

# Utility: downsample large dataset for experiments
def load_sample_from_large(n_rows=200_000, usecols=None, path=None, chunksize=100_000):
    """Load the first ``n_rows`` rows of the large CSV by streaming it in chunks.

    The rows are taken in file order from the top of the file; this is a head
    sample, not a random sample.

    Args:
        n_rows: Maximum number of rows to return. Values below zero are
            treated as zero.
        usecols: Optional column subset forwarded to ``pd.read_csv``.
        path: CSV to read. Defaults to the module-level ``DATA_LARGE``.
        chunksize: Number of rows read per chunk.

    Returns:
        A DataFrame with at most ``n_rows`` rows (fewer only when the file is
        shorter) and a fresh 0..N-1 index.
    """
    source = DATA_LARGE if path is None else path
    remaining = max(int(n_rows), 0)
    parts = []
    # The context manager closes the underlying file handle even though we
    # normally stop iterating before the file is exhausted.
    with pd.read_csv(source, chunksize=chunksize, usecols=usecols) as reader:
        for chunk in reader:
            # Trim the final chunk so the result never overshoots n_rows
            # (previously whole chunks were kept, e.g. 300,000 rows for a
            # 250,000-row request).
            piece = chunk.iloc[:remaining]
            parts.append(piece)
            remaining -= len(piece)
            if remaining <= 0:
                break
    df = pd.concat(parts, ignore_index=True)
    print(f"Loaded sample of {len(df):,} rows from large file")
    return df

# Sanity check: preview the top rows of both datasets.
# The result is bound to `_` so the returned frame is not echoed a second time.
for csv_path in (DATA_SMALL, DATA_LARGE):
    _ = fast_preview(csv_path, n=5)


## Goal
Fit linear models with polynomial and interaction terms. Diagnose multicollinearity using variance inflation factors (VIF).
Use the uploaded flight data to predict arrival delay as a continuous target.

In [None]:

# Columns read from the large file for the regression experiment.
# "arrdelay" is the target; the remaining columns are candidate predictors.
usecols = [
    "arrdelay",
    "depdelay",
    "scheduledhour",
    "month",
    "dayofmonth",
    "temperature",
    "windspeed",
    "raindummy",
    "snowdummy",
    "marketshareorigin",
    "marketsharedest",
]

# Load the sample, then drop every row with a missing value in any selected column.
df = load_sample_from_large(n_rows=250_000, usecols=usecols).dropna()

# Basic EDA: summary statistics for all columns.
df.describe(include="all")


In [None]:

# Feature engineering: polynomial and interaction terms
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Separate the target from the predictors.
y = df["arrdelay"]
X = df.drop(columns=["arrdelay"])

# Degree-2 expansion: original columns, their squares, and all pairwise products.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)
X_poly = poly.fit_transform(X)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_poly, y, test_size=0.2, random_state=42)

linreg = LinearRegression()
linreg.fit(X_train, y_train)

pred = linreg.predict(X_test)
# RMSE is computed as sqrt(MSE). The `squared=False` keyword of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly works on every version.
rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
r2 = r2_score(y_test, pred)

print("RMSE:", rmse)
print("R2:", r2)
print("Num features after polynomial transform:", X_poly.shape[1])


In [None]:

# Multicollinearity and VIF on a modest subset
import pandas as pd
from statsmodels.stats.outliers_influence import variance_inflation_factor
import statsmodels.api as sm

# Predictors whose mutual collinearity is inspected.
vif_columns = ["depdelay", "scheduledhour", "month", "dayofmonth", "temperature", "windspeed"]

# add_constant appends an intercept column, so the "const" entry appears in the
# output alongside the six predictors.
X_small = sm.add_constant(df[vif_columns].copy())

# Build the design-matrix array once and reuse it for every column index.
exog = X_small.values
vif = pd.Series(
    [variance_inflation_factor(exog, col_idx) for col_idx in range(exog.shape[1])],
    index=X_small.columns,
    name="VIF",
)
vif


### Notes to include in Milestone One
* Overfitting avoidance: use a holdout set and limit the polynomial degree to two
* Metrics: RMSE and R2
* Expected and unexpected findings: strong effect of departure delay on arrival delay (expected); weather effects weaker than expected (unexpected)