In [31]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



In [32]:
# Load the fuel-economy dataset, show a quick overview, and handle missing values.
DATA_PATH = "/content/sample_data/FuelEconomy.csv"
df = pd.read_csv(DATA_PATH)

# Column being predicted. It must be defined before the missing-value branch
# below, which drops rows whose target is missing. "Horse Power" is the column
# used as y when the data is split into train and test sets.
target_col = "Horse Power"

print("Columns:", df.columns.tolist())
print("Shape (rows, cols):", df.shape)

display(df.head())
display(df.describe())

# Per-column count of missing entries.
missing_counts = df.isna().sum()
print("\nMissing values per column:")
print(missing_counts)

if missing_counts.sum() == 0:
    print("\nNo missing values found → no imputation or row-dropping needed.")
else:
    # Rows without a target cannot be used for supervised training, so drop them.
    # .copy() gives an independent frame so the column assignment below does not
    # write into a view of the originally loaded data.
    df = df.dropna(subset=[target_col]).copy()
    # Fill remaining gaps in numeric columns with each column's median.
    num_cols = df.select_dtypes(include=[np.number]).columns
    df[num_cols] = df[num_cols].fillna(df[num_cols].median())
    print("\nMissing values handled: dropped missing targets; imputed numeric features with median.")


Columns: ['Horse Power', 'Fuel Economy (MPG)']
Shape (rows, cols): (100, 2)


Unnamed: 0,Horse Power,Fuel Economy (MPG)
0,118.770799,29.344195
1,176.326567,24.695934
2,219.262465,23.95201
3,187.310009,23.384546
4,218.59434,23.426739


Unnamed: 0,Horse Power,Fuel Economy (MPG)
count,100.0,100.0
mean,213.67619,23.178501
std,62.061726,4.701666
min,50.0,10.0
25%,174.996514,20.439516
50%,218.928402,23.143192
75%,251.706476,26.089933
max,350.0,35.0



Missing values per column:
Horse Power           0
Fuel Economy (MPG)    0
dtype: int64

No missing values found → no imputation or row-dropping needed.


In [33]:
# Predict horsepower (y) from fuel economy (X), holding out 30% of the rows for testing.
X = df.loc[:, ["Fuel Economy (MPG)"]]  # list selector keeps the feature matrix 2D
y = df.loc[:, "Horse Power"]           # single label yields a 1D Series

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42, shuffle=True
)

# Report the shape of each partition as a quick sanity check on the split.
for label, part in [
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
]:
    print(f"{label} shape:", part.shape)


X_train shape: (70, 1)
X_test shape: (30, 1)
y_train shape: (70,)
y_test shape: (30,)


In [34]:
# Fit four regressors on the same training split: a linear baseline and
# polynomial models of degree 2, 3 and 4.

# (a) Linear Regression
lin_model = LinearRegression().fit(X_train, y_train)

# (b)-(d) Polynomial Regression (degrees 2, 3, 4).
# Each model is a pipeline that expands the feature into polynomial terms
# (without a bias column) and then fits a linear regression on the expansion.
poly2_model, poly3_model, poly4_model = (
    Pipeline(steps=[
        ("poly", PolynomialFeatures(degree=degree, include_bias=False)),
        ("lr", LinearRegression()),
    ]).fit(X_train, y_train)
    for degree in (2, 3, 4)
)

print("Models trained: Linear, Poly deg 2, Poly deg 3, Poly deg 4")


Models trained: Linear, Poly deg 2, Poly deg 3, Poly deg 4


In [35]:
# Score every fitted model on both splits and collect the metrics in one table.
models = {
    "Linear Regression": lin_model,
    "Poly (deg=2)": poly2_model,
    "Poly (deg=3)": poly3_model,
    "Poly (deg=4)": poly4_model,
}

# Column suffix -> metric function; insertion order fixes the column order.
metric_fns = {
    "MSE": mean_squared_error,
    "MAE": mean_absolute_error,
    "R2": r2_score,
}

rows = []
for name, model in models.items():
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    # Pair each split's true values with the matching predictions.
    scored = {
        "Train": (y_train, y_pred_train),
        "Test": (y_test, y_pred_test),
    }

    record = {"Model": name}
    for split, (actual, predicted) in scored.items():
        for suffix, metric in metric_fns.items():
            record[f"{split} {suffix}"] = metric(actual, predicted)
    rows.append(record)

results_table = pd.DataFrame(rows).set_index("Model")
results_table



Unnamed: 0_level_0,Train MSE,Train MAE,Train R2,Test MSE,Test MAE,Test R2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Linear Regression,357.69918,16.061689,0.90632,318.561087,14.940628,0.912561
Poly (deg=2),350.879731,15.995824,0.908106,331.105434,15.14833,0.909118
Poly (deg=3),345.108668,15.746762,0.909618,318.404012,14.764973,0.912604
Poly (deg=4),339.700171,15.508465,0.911034,313.798757,14.735471,0.913868


1. Which model performs best on the test set and why?

  The Polynomial Regression (degree 4) model performs best on the test set because it has the highest Test R² (0.9139) and the lowest test errors (Test MSE 313.80, Test MAE 14.74) among the four models. A higher Test R² means the model reduces test error more relative to the mean baseline, and lower MSE and MAE mean its predictions are closer to the true HP on average. The margin over the other models is small, however: Test R² ranges only from 0.9091 to 0.9139 across all four.

2. Does increasing polynomial degree always improve performance? If not, explain what you observe.

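  No, increasing the degree does not always improve test performance. Train MSE falls steadily as the degree increases (357.70 → 350.88 → 345.11 → 339.70), but Test MSE does not follow the same pattern: the degree-2 model (331.11) is worse on the test set than linear regression (318.56), while degrees 3 and 4 (318.40 and 313.80) are only marginally better than linear.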

3. If a model performs unexpectedly poorly (e.g., low R2 or large test error), propose at least two
plausible reasons, such as:
– underfitting vs overfitting,
– weak relationship between features and target,
– outliers or noise in the data,
– insufficient feature information for predicting HP.

  First, the model may be a poor match for the shape of the data: a quadratic may not capture the true relationship between fuel economy and horsepower as well as a cubic or quartic. This is supported by the fact that deg=3 and deg=4 improve the test metrics compared to deg=2. Second, outliers or noise may inflate the error: since MSE squares errors, a small number of test points with large prediction errors can increase Test MSE noticeably. Poly (deg=2) shows the highest Test MSE (331.11), suggesting it may be more affected by a few poorly fit points than the better-performing models.



In [38]:
# Load the weather/electricity dataset, show a quick overview, and handle missing values.
DATA_PATH2 = "/content/sample_data/electricity_consumption_based_weather_dataset.csv"
target_col2 = "daily_consumption"

df2 = pd.read_csv(DATA_PATH2)

print("Columns:", list(df2.columns))
print("Shape (rows, cols):", df2.shape)

display(df2.head())
display(df2.describe(include="all"))

print("\nDependent variable (target):", target_col2)

# Per-column count of missing entries.
missing2 = df2.isna().sum()
print("\nMissing values per column:")
print(missing2)

if missing2.any():
    # Rows without a target are dropped; remaining numeric gaps get the column median.
    # NOTE(review): the medians are computed on the full dataset, before the
    # train/test split performed in a later cell.
    df2 = df2.dropna(subset=[target_col2])

    num_cols2 = df2.select_dtypes(include=[np.number]).columns
    column_medians = df2[num_cols2].median()
    df2[num_cols2] = df2[num_cols2].fillna(column_medians)

    print("\nMissing values handled consistently: dropped missing targets; imputed numeric columns with median.")
else:
    print("\nNo missing values found → no imputation or row-dropping needed.")

Columns: ['date', 'AWND', 'PRCP', 'TMAX', 'TMIN', 'daily_consumption']
Shape (rows, cols): (1433, 6)


Unnamed: 0,date,AWND,PRCP,TMAX,TMIN,daily_consumption
0,2006-12-16,2.5,0.0,10.6,5.0,1209.176
1,2006-12-17,2.6,0.0,13.3,5.6,3390.46
2,2006-12-18,2.4,0.0,15.0,6.7,2203.826
3,2006-12-19,2.4,0.0,7.2,2.2,1666.194
4,2006-12-20,2.4,0.0,7.2,1.1,2225.748


Unnamed: 0,date,AWND,PRCP,TMAX,TMIN,daily_consumption
count,1433,1418.0,1433.0,1433.0,1433.0,1433.0
unique,1433,,,,,
top,2010-11-26,,,,,
freq,1,,,,,
mean,,2.642313,3.800488,17.187509,9.141242,1561.078061
std,,1.140021,10.973436,10.136415,9.028417,606.819667
min,,0.0,0.0,-8.9,-14.4,14.218
25%,,1.8,0.0,8.9,2.2,1165.7
50%,,2.4,0.0,17.8,9.4,1542.65
75%,,3.3,1.3,26.1,17.2,1893.608



Dependent variable (target): daily_consumption

Missing values per column:
date                  0
AWND                 15
PRCP                  0
TMAX                  0
TMIN                  0
daily_consumption     0
dtype: int64

Missing values handled consistently: dropped missing targets; imputed numeric columns with median.


In [39]:
# Predict daily electricity consumption from the four weather measurements,
# holding out 30% of the rows for testing.
weather_features = ["AWND", "PRCP", "TMAX", "TMIN"]
X2 = df2[weather_features]
y2 = df2["daily_consumption"]

split_parts = train_test_split(X2, y2, test_size=0.30, random_state=42, shuffle=True)
X2_train, X2_test, y2_train, y2_test = split_parts

# Report the shape of each partition, in the order train_test_split returns them.
part_labels = ("X2_train", "X2_test", "y2_train", "y2_test")
for label, part in zip(part_labels, split_parts):
    print(f"{label} shape:", part.shape)


X2_train shape: (1003, 4)
X2_test shape: (430, 4)
y2_train shape: (1003,)
y2_test shape: (430,)


In [40]:
def _poly_pipeline(degree):
    """Build an unfitted pipeline: polynomial expansion (no bias column), then linear regression."""
    expansion = PolynomialFeatures(degree=degree, include_bias=False)
    return Pipeline(steps=[("poly", expansion), ("lr", LinearRegression())])


# Linear baseline on the raw weather features.
lin2_model = LinearRegression().fit(X2_train, y2_train)

# Polynomial variants of increasing degree, all fitted on the same training split.
poly2_2_model = _poly_pipeline(2).fit(X2_train, y2_train)
poly3_2_model = _poly_pipeline(3).fit(X2_train, y2_train)
poly4_2_model = _poly_pipeline(4).fit(X2_train, y2_train)

print("Models trained for Part 2: Linear, Poly deg 2, Poly deg 3, Poly deg 4")


Models trained for Part 2: Linear, Poly deg 2, Poly deg 3, Poly deg 4


In [41]:
def _split_scores(prefix, actual, predicted):
    """Return MSE, MAE and R2 for one split, keyed as '<prefix> <metric>'."""
    return {
        f"{prefix} MSE": mean_squared_error(actual, predicted),
        f"{prefix} MAE": mean_absolute_error(actual, predicted),
        f"{prefix} R2": r2_score(actual, predicted),
    }


# Score every Part 2 model on both splits and collect the metrics in one table.
models2 = {
    "Linear Regression": lin2_model,
    "Poly (deg=2)": poly2_2_model,
    "Poly (deg=3)": poly3_2_model,
    "Poly (deg=4)": poly4_2_model,
}

rows2 = []
for name, model in models2.items():
    y2_pred_train = model.predict(X2_train)
    y2_pred_test = model.predict(X2_test)

    # Train columns first, then test columns, matching the table layout.
    rows2.append({
        "Model": name,
        **_split_scores("Train", y2_train, y2_pred_train),
        **_split_scores("Test", y2_test, y2_pred_test),
    })

results_table2 = pd.DataFrame(rows2).set_index("Model")
results_table2


Unnamed: 0_level_0,Train MSE,Train MAE,Train R2,Test MSE,Test MAE,Test R2
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Linear Regression,274825.692983,387.032094,0.272947,237199.931909,365.56372,0.311545
Poly (deg=2),268077.879733,382.135271,0.290798,234796.124745,362.89198,0.318522
Poly (deg=3),261230.611849,377.80136,0.308913,238430.503463,369.004711,0.307973
Poly (deg=4),253734.259033,374.883489,0.328744,416666.107016,416.083511,-0.209342


1. Which model generalizes best (best test performance), and what does that tell you about the
relationship between weather and electricity usage?

The model that generalizes best is Polynomial (deg=2) because it has the best test metrics: the lowest Test MSE (234,796), the lowest Test MAE (362.89), and the highest Test R² (0.3185). Weather explains some of the variation in daily consumption, but a large portion of the variability is driven by factors beyond these weather features.

2. Do polynomial models improve the fit compared to linear regression? If yes, why might electricity
consumption have nonlinear dependence on weather?

Yes, a low-degree polynomial (degree 2) improves test performance slightly over linear regression (Test R² 0.3185 vs. 0.3115; Test MSE 234,796 vs. 237,200). Nonlinear effects are plausible because heating and cooling demand often changes more rapidly once temperatures cross comfort thresholds.

3. If higher-degree models perform worse on the test set, explain this behavior using evidence from
metrics (e.g., train error decreases but test error increases).

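This is classic overfitting. Train MSE keeps decreasing as the degree increases (274,826 → 268,078 → 261,231 → 253,734), but Test MSE rises from 234,796 at degree 2 to 238,431 at degree 3 and 416,666 at degree 4, where Test R² drops to −0.209. The degree-4 model learns noise and overly complex patterns in the training set that do not transfer to new data, so test performance collapses even though training performance improves.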

4. If none of the models achieve good test performance, provide at least two reasons supported by
your outputs (e.g., limited feature set, high noise, unmodeled drivers such as occupancy/behavior,
seasonal effects).

First, the models only use four weather features (AWND, PRCP, TMAX, TMIN). Many major drivers of electricity usage, such as occupancy and behavior, are not included, which limits the achievable Test R² (at best 0.3185 here). Second, the models cannot capture systematic consumption shifts that are not explained by daily weather alone, such as seasonal effects, which keeps Test R² relatively low.