In [12]:
# ============================================================
# Imports
# ============================================================

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import matplotlib.pyplot as plt

# Part 1: Fuel Consumption → Horsepower Prediction

**1.1 Load and inspect the dataset**
- Load the CSV into a pandas DataFrame.
- Display column names, shape, and summary statistics (describe()).
- Identify missing values (if any) and clearly state how you handle them.

In [13]:
# ============================================================
# Load dataset
# ============================================================

DATA_PATH_1 = "FuelEconomy.csv"
df1 = pd.read_csv(DATA_PATH_1)

print("Shape:", df1.shape)
print("\nColumns:")
print(df1.columns.tolist())

display(df1.head())

print("\nSummary statistics:")
display(df1.describe(include="all"))

print("\nMissing values per column:")
display(df1.isna().sum())

Shape: (100, 2)

Columns:
['Horse Power', 'Fuel Economy (MPG)']


Unnamed: 0,Horse Power,Fuel Economy (MPG)
0,118.770799,29.344195
1,176.326567,24.695934
2,219.262465,23.95201
3,187.310009,23.384546
4,218.59434,23.426739



Summary statistics:


Unnamed: 0,Horse Power,Fuel Economy (MPG)
count,100.0,100.0
mean,213.67619,23.178501
std,62.061726,4.701666
min,50.0,10.0
25%,174.996514,20.439516
50%,218.928402,23.143192
75%,251.706476,26.089933
max,350.0,35.0



Missing values per column:


Horse Power           0
Fuel Economy (MPG)    0
dtype: int64

In [14]:
# ============================================================
# Utility functions
# ============================================================

def prepare_xy(df_in, target_col):
    """Drop missing rows, split into X and y."""
    df_clean = df_in.dropna().copy()
    X = df_clean.drop(columns=[target_col])
    y = df_clean[target_col]
    return X, y

def split_data(X, y, test_size=0.30, random_state=42):
    """70/30 random train-test split."""
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def compute_metrics(y_true, y_pred):
    """Return MSE, MAE, R^2."""
    return {
        "MSE": mean_squared_error(y_true, y_pred),
        "MAE": mean_absolute_error(y_true, y_pred),
        "R^2": r2_score(y_true, y_pred),
    }

def _get_linear_parts(model, input_feature_names):
    """Extract (intercept, coefficients, feature_names) from either:
       - LinearRegression
       - Pipeline(PolynomialFeatures -> LinearRegression)
    """
    # Plain LinearRegression
    if isinstance(model, LinearRegression):
        intercept = float(model.intercept_)
        coefs = np.array(model.coef_).ravel()
        feat_names = np.array(list(input_feature_names))
        return intercept, coefs, feat_names

    # Polynomial pipeline
    if hasattr(model, "named_steps") and "poly" in model.named_steps and "lr" in model.named_steps:
        poly = model.named_steps["poly"]
        lr = model.named_steps["lr"]

        feat_names = poly.get_feature_names_out(input_features=list(input_feature_names))
        intercept = float(lr.intercept_)
        coefs = np.array(lr.coef_).ravel()
        return intercept, coefs, np.array(feat_names)

    raise ValueError("Unsupported model type for equation printing.")

def print_fitted_equation(model, input_feature_names, target_name, top_k_terms=15):
    """Print a readable fitted equation.

    For polynomial models, the number of terms can become very large,
    so we print only the TOP-K terms by absolute coefficient magnitude.
    """
    intercept, coefs, feat_names = _get_linear_parts(model, input_feature_names)

    # Sort by absolute coefficient magnitude
    order = np.argsort(np.abs(coefs))[::-1]
    order = order[:min(top_k_terms, len(coefs))]

    terms = []
    for idx in order:
        terms.append(f"({coefs[idx]:+.4f}) * {feat_names[idx]}")

    eq = f"{target_name} = {intercept:.4f} " + " ".join(terms)

    print("\n--- Fitted Model Equation (Top Terms) ---")
    print(eq)
    if len(coefs) > top_k_terms:
        print(f"(Showing top {top_k_terms} terms out of {len(coefs)} total terms.)")


def run_models_and_evaluate(df_in, target_col, degrees=(1, 2, 3, 4),
                            test_size=0.30, random_state=42,
                            show_equation=True, top_k_terms=15):
    """Train/evaluate linear (deg=1) + polynomial regression models.

    Returns a DataFrame of metrics.
    Also prints fitted equations and scatter plots (test set) for each model.
    """
    X, y = prepare_xy(df_in, target_col=target_col)
    X_train, X_test, y_train, y_test = split_data(X, y, test_size=test_size, random_state=random_state)

    rows = []

    for deg in degrees:
        if deg == 1:
            model = LinearRegression()
            model_name = "Linear Regression"
        else:
            model = Pipeline([
                ("poly", PolynomialFeatures(degree=deg, include_bias=False)),
                ("lr", LinearRegression())
            ])
            model_name = f"Polynomial Regression (degree={deg})"

        # Fit model
        model.fit(X_train, y_train)

        # Predict
        yhat_train = model.predict(X_train)
        yhat_test  = model.predict(X_test)

        # Metrics
        train_m = compute_metrics(y_train, yhat_train)
        test_m  = compute_metrics(y_test, yhat_test)

        # Report equation + plot (TEST set)
        print("\n============================================================")
        print(f"Model: {model_name}")
        print("============================================================")

        if show_equation:
            print_fitted_equation(
                model=model,
                input_feature_names=X_train.columns,
                target_name=target_col,
                top_k_terms=top_k_terms
            )

        rows.append({
            "Model": model_name,
            "Train MSE": train_m["MSE"],
            "Train MAE": train_m["MAE"],
            "Train R^2": train_m["R^2"],
            "Test MSE": test_m["MSE"],
            "Test MAE": test_m["MAE"],
            "Test R^2": test_m["R^2"],
            "Train size": len(X_train),
            "Test size": len(X_test),
        })

    return pd.DataFrame(rows)

**1.2 Train/Test split (70% / 30% random)**
- Randomly split the dataset into 70% training and 30% testing.
- Use a fixed random state for reproducibility.

**1.3 Model training: Linear + Polynomial regression**
- Train the following models to predict HP:

(a) Linear Regression

(b) Polynomial Regression (degree 2)

(c) Polynomial Regression (degree 3)

(d) Polynomial Regression (degree 4)
- Do NOT use any regularization (no Ridge/Lasso/ElasticNet).
- Use PolynomialFeatures + LinearRegression for polynomial models.

**1.4 Model evaluation (train and test)**
- For each model, report metrics on both train and test sets:
MSE, MAE, R2
- Present your results in a clean table (recommended).

In [15]:
target_col_1 = "Fuel Economy (MPG)"

results_1 = run_models_and_evaluate(
    df1,
    target_col_1,
    degrees=(1, 2, 3, 4),
    show_equation=True,
    top_k_terms=15
)

display(results_1)


Model: Linear Regression

--- Fitted Model Equation (Top Terms) ---
Fuel Economy (MPG) = 38.8390 (-0.0732) * Horse Power

Model: Polynomial Regression (degree=2)

--- Fitted Model Equation (Top Terms) ---
Fuel Economy (MPG) = 39.0405 (-0.0754) * Horse Power (+0.0000) * Horse Power^2

Model: Polynomial Regression (degree=3)

--- Fitted Model Equation (Top Terms) ---
Fuel Economy (MPG) = 42.5506 (-0.1394) * Horse Power (+0.0004) * Horse Power^2 (-0.0000) * Horse Power^3

Model: Polynomial Regression (degree=4)

--- Fitted Model Equation (Top Terms) ---
Fuel Economy (MPG) = 32.4089 (+0.1406) * Horse Power (-0.0022) * Horse Power^2 (+0.0000) * Horse Power^3 (-0.0000) * Horse Power^4


Unnamed: 0,Model,Train MSE,Train MAE,Train R^2,Test MSE,Test MAE,Test R^2,Train size,Test size
0,Linear Regression,2.115741,1.209978,0.90632,1.67495,1.031271,0.913315,70,30
1,Polynomial Regression (degree=2),2.11507,1.210303,0.90635,1.657031,1.025411,0.914243,70,30
2,Polynomial Regression (degree=3),2.06055,1.211527,0.908764,1.903743,1.087196,0.901475,70,30
3,Polynomial Regression (degree=4),1.917714,1.168259,0.915088,2.54846,1.203406,0.868108,70,30


**1.5 Discussion and interpretation**

Use your results to answer the following questions with a data-driven explanation:
- Which model performs best on the test set and why?
- Does increasing polynomial degree always improve performance? If not, explain what you observe.
- If a model performs unexpectedly poorly (e.g., low R2 or large test error), propose at least two plausible reasons, such as:

  - underfitting vs overfitting,
  - weak relationship between features and target,
  - outliers or noise in the data,
  - insufficient feature information for predicting HP.
- Support your claims using your reported metrics (not intuition only).

# Part 2: Weather → Daily Electricity Consumption Prediction

**2.1 Load and inspect the dataset**

- Load the dataset into pandas.
- Print column names, shape, summary statistics.
- Clearly identify the dependent variable: daily consumption.
- Identify missing values (if any) and handle them consistently.

In [16]:
# ============================================================
# Load dataset
# ============================================================

DATA_PATH_2 = "electricity_consumption_based_weather_dataset.csv"
df2 = pd.read_csv(DATA_PATH_2)

print("Shape:", df2.shape)
print("\nColumns:")
print(df2.columns.tolist())

display(df2.head())

print("\nSummary statistics:")
display(df2.describe(include="all"))

print("\nMissing values per column:")
display(df2.isna().sum())

Shape: (1433, 6)

Columns:
['date', 'AWND', 'PRCP', 'TMAX', 'TMIN', 'daily_consumption']


Unnamed: 0,date,AWND,PRCP,TMAX,TMIN,daily_consumption
0,2006-12-16,2.5,0.0,10.6,5.0,1209.176
1,2006-12-17,2.6,0.0,13.3,5.6,3390.46
2,2006-12-18,2.4,0.0,15.0,6.7,2203.826
3,2006-12-19,2.4,0.0,7.2,2.2,1666.194
4,2006-12-20,2.4,0.0,7.2,1.1,2225.748



Summary statistics:


Unnamed: 0,date,AWND,PRCP,TMAX,TMIN,daily_consumption
count,1433,1418.0,1433.0,1433.0,1433.0,1433.0
unique,1433,,,,,
top,2006-12-16,,,,,
freq,1,,,,,
mean,,2.642313,3.800488,17.187509,9.141242,1561.078061
std,,1.140021,10.973436,10.136415,9.028417,606.819667
min,,0.0,0.0,-8.9,-14.4,14.218
25%,,1.8,0.0,8.9,2.2,1165.7
50%,,2.4,0.0,17.8,9.4,1542.65
75%,,3.3,1.3,26.1,17.2,1893.608



Missing values per column:


date                  0
AWND                 15
PRCP                  0
TMAX                  0
TMIN                  0
daily_consumption     0
dtype: int64

**2.2 Train/Test split (70% / 30% random)**
- Randomly split into 70% training and 30% testing.
- Use a fixed random state so results are reproducible.

**2.3 Model training: Linear + Polynomial regression**
- Train the following models to predict daily consumption:
  
(a) Linear Regression

(b) Polynomial Regression (degree 2)

(c) Polynomial Regression (degree 3)

(d) Polynomial Regression (degree 4)
- Do NOT use regularization.
- Ensure your features are correctly separated from the target.
  
**1.4 Model evaluation (train and test)**
- For each model, report metrics on both train and test sets:
MSE, MAE, R2
- Present your results in a clean table (recommended).

In [18]:
target_col_2 = "daily_consumption"
drop_col = "date"
df2_clean = df2.drop(columns=[drop_col]).copy()

results_2 = run_models_and_evaluate(
    df2_clean,
    target_col_2,
    degrees=(1, 2, 3, 4),
    show_equation=True,
    top_k_terms=15
)

display(results_2)


Model: Linear Regression

--- Fitted Model Equation (Top Terms) ---
daily_consumption = 2045.7562 (-17.2734) * TMAX (-17.0839) * TMIN (-7.5783) * AWND (-3.3603) * PRCP

Model: Polynomial Regression (degree=2)

--- Fitted Model Equation (Top Terms) ---
daily_consumption = 2037.4142 (+165.5267) * AWND (-60.9740) * TMAX (-27.8962) * AWND^2 (+26.8325) * TMIN (-10.4820) * PRCP (-2.6130) * TMAX TMIN (-1.9368) * PRCP TMIN (+1.8505) * TMAX^2 (+1.5725) * PRCP TMAX (+1.2309) * AWND PRCP (+1.1693) * AWND TMIN (-1.0760) * AWND TMAX (+0.2939) * TMIN^2 (-0.0545) * PRCP^2

Model: Polynomial Regression (degree=3)

--- Fitted Model Equation (Top Terms) ---
daily_consumption = 1405.7630 (+509.9634) * AWND (-140.8094) * AWND^2 (+67.4216) * TMAX (-63.7983) * TMIN (+25.0565) * TMAX TMIN (-20.2281) * AWND TMIN (-13.9648) * TMIN^2 (-13.1044) * TMAX^2 (+9.3010) * PRCP (+7.8406) * AWND^3 (-4.9294) * AWND PRCP (+3.8559) * AWND^2 TMAX (+3.4853) * AWND TMAX TMIN (+2.2711) * AWND TMAX (-1.8669) * TMAX^2 TMIN
(Sho

Unnamed: 0,Model,Train MSE,Train MAE,Train R^2,Test MSE,Test MAE,Test R^2,Train size,Test size
0,Linear Regression,272403.396174,384.465016,0.276,248125.8,375.404537,0.299333,992,426
1,Polynomial Regression (degree=2),264765.769932,379.648753,0.2963,255268.5,379.039083,0.279163,992,426
2,Polynomial Regression (degree=3),259249.53487,375.952901,0.310961,265623.7,385.235167,0.249922,992,426
3,Polynomial Regression (degree=4),251909.339001,372.116566,0.33047,12151490.0,578.642201,-33.313844,992,426


**2.5 Discussion and interpretation**

Write a short, technical discussion that uses your results to answer:

- Which model generalizes best (best test performance), and what does that tell you about the relationship between weather and electricity usage?
- Do polynomial models improve the fit compared to linear regression? If yes, why might electricity consumption have nonlinear dependence on weather?
- If higher-degree models perform worse on the test set, explain this behavior using evidence from metrics (e.g., train error decreases but test error increases).
- If none of the models achieve good test performance, provide at least two reasons supported by your outputs (e.g., limited feature set, high noise, unmodeled drivers such as occupancy/behavior, seasonal effects).