In [67]:
# Install or upgrade kagglehub in the running kernel's environment. The upgrade
# is unpinned, so the installed version can differ from run to run; pip's own
# output notes that a kernel restart may be needed to pick up an updated package.
%pip install --upgrade kagglehub
import kagglehub

# Download the latest version of the Kaggle dataset. `path` receives the
# location of the downloaded files (a kagglehub cache directory in the
# recorded output of this cell).
path = kagglehub.dataset_download("ohiedulhaquemdasad/fuel-consumption-based-on-hp-linear-regression")

# Echo the download location so the dataset files can be found on disk.
print("Path to dataset files:", path)



import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import matplotlib.pyplot as plt

Note: you may need to restart the kernel to use updated packages.
Path to dataset files: C:\Users\hanis\.cache\kagglehub\datasets\ohiedulhaquemdasad\fuel-consumption-based-on-hp-linear-regression\versions\1


In [49]:
from pathlib import Path

# File name of the CSV to analyse.
DATA_FILE = "FuelEconomy.csv"

# Resolve where to read the CSV from:
#   1. a copy sitting next to the notebook (the original behaviour), else
#   2. the directory kagglehub downloaded the dataset into (`path`, set in the
#      download cell), so a fresh Restart & Run All does not rely on a file
#      that was copied into the working directory by hand.
# NOTE(review): assumes the dataset stores the CSV at the top level of `path`
# under this exact name -- confirm against the downloaded files.
DATA_PATH = DATA_FILE if Path(DATA_FILE).exists() else str(Path(path) / DATA_FILE)
df = pd.read_csv(DATA_PATH)

# Quick structural overview: dimensions and column names.
print("Shape:", df.shape)
print("\nColumns:")
print(df.columns.tolist())

# First rows, rendered with the notebook's rich display.
display(df.head())

# Descriptive statistics for every column (include="all" also covers
# non-numeric columns, if any).
print("\nSummary statistics:")
display(df.describe(include="all"))

# Count of missing values in each column.
print("\nMissing values per column:")
display(df.isna().sum())


Shape: (100, 2)

Columns:
['Horse Power', 'Fuel Economy (MPG)']


Unnamed: 0,Horse Power,Fuel Economy (MPG)
0,118.770799,29.344195
1,176.326567,24.695934
2,219.262465,23.95201
3,187.310009,23.384546
4,218.59434,23.426739



Summary statistics:


Unnamed: 0,Horse Power,Fuel Economy (MPG)
count,100.0,100.0
mean,213.67619,23.178501
std,62.061726,4.701666
min,50.0,10.0
25%,174.996514,20.439516
50%,218.928402,23.143192
75%,251.706476,26.089933
max,350.0,35.0



Missing values per column:


Horse Power           0
Fuel Economy (MPG)    0
dtype: int64

In [74]:
# Name of the column the models are trained to predict.
TARGET_COL = "Horse Power"

def prepare_xy(df_in, target_col=TARGET_COL):
    """Separate a table into a feature frame and a target series.

    Rows with a missing value in any column are discarded first. The work is
    done on a copy, so ``df_in`` itself is not modified.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Input table; must contain ``target_col``.
    target_col : str, default TARGET_COL
        Name of the column to use as the prediction target.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the cleaned frame without ``target_col``
        and ``y`` is the ``target_col`` column of the cleaned frame.
    """
    complete_rows = df_in.dropna().copy()
    target = complete_rows[target_col]
    features = complete_rows.drop(target_col, axis=1)
    return features, target
    
def split_data(X, y, test_size=0.30, random_state=42):
    """Randomly partition features and target into train and test subsets.

    With the default ``test_size`` of 0.30 this is a 70/30 split. Both
    ``test_size`` and ``random_state`` are forwarded unchanged to
    ``train_test_split``.

    Returns whatever ``train_test_split`` returns, which callers unpack as
    ``X_train, X_test, y_train, y_test``.
    """
    split_options = {"test_size": test_size, "random_state": random_state}
    partitions = train_test_split(X, y, **split_options)
    return partitions

def compute_metrics(y_true, y_pred):
    """Score predictions against the true values.

    Returns a dict with the keys ``"MSE"``, ``"MAE"`` and ``"R^2"`` (in that
    order), holding the results of ``mean_squared_error``,
    ``mean_absolute_error`` and ``r2_score`` respectively.
    """
    # (label, scoring function) pairs; the tuple order fixes the key order.
    scorers = (
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("R^2", r2_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

# Build the feature/target pair and hold out 30% of the rows for testing.
X, y = prepare_xy(df, TARGET_COL)
X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.30, random_state=42)

# Candidate models: a plain linear fit, plus polynomial feature expansions of
# degree 2, 3 and 4 that each feed a linear regression.
models = {
    "Linear Regression": LinearRegression(),
    **{
        f"Poly Degree {degree}": Pipeline([
            ("poly", PolynomialFeatures(degree=degree, include_bias=False)),
            ("lr", LinearRegression()),
        ])
        for degree in (2, 3, 4)
    },
}

results = []
for model_name, model in models.items():
    # fit() returns the estimator itself, so `fitted` is the same object as `model`.
    fitted = model.fit(X_train, y_train)

    # Score the fitted model on the data it saw and on the held-out rows.
    scores_by_split = (
        ("Train", compute_metrics(y_train, fitted.predict(X_train))),
        ("Test", compute_metrics(y_test, fitted.predict(X_test))),
    )

    # One flat record per model: "Model", then "<split> <metric>" columns
    # (train metrics first, then test metrics).
    row = {"Model": model_name}
    for split_label, scores in scores_by_split:
        for metric_name, value in scores.items():
            row[f"{split_label} {metric_name}"] = value
    results.append(row)

# Assemble the comparison table; the bare last expression displays it.
results_df = pd.DataFrame(results)
results_df

Unnamed: 0,Model,Train MSE,Train MAE,Train R^2,Test MSE,Test MAE,Test R^2
0,Linear Regression,357.69918,16.061689,0.90632,318.561087,14.940628,0.912561
1,Poly Degree 2,350.879731,15.995824,0.908106,331.105434,15.14833,0.909118
2,Poly Degree 3,345.108668,15.746762,0.909618,318.404012,14.764973,0.912604
3,Poly Degree 4,339.700171,15.508465,0.911034,313.798757,14.735471,0.913868


1.5 Discussion and interpretation (10 points)
Use your results to answer the following questions with a data-driven explanation:
• Which model performs best on the test set and why?
• Does increasing polynomial degree always improve performance? If not, explain what you observe.
• If a model performs unexpectedly poorly (e.g., low R^2 or large test error), propose at least two
plausible reasons, such as:
– underfitting vs overfitting,
– weak relationship between features and target,
– outliers or noise in the data,
– insufficient feature information for predicting HP.
• Support your claims using your reported metrics (not intuition only).

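The best model on the test set is Poly Degree 4: it has the highest test R^2 (0.9139) and the lowest test MSE (313.80) and test MAE (14.74) of the four models. The margin is small, however: Linear Regression (test R^2 0.9126, test MSE 318.56) and Poly Degree 3 (test R^2 0.9126, test MSE 318.40) are nearly identical to it, so with only 30 test rows the differences are within noise and the simpler linear model is an equally defensible choice. No, increasing the polynomial degree does not always improve performance. Training MSE falls steadily with degree (357.70 → 350.88 → 345.11 → 339.70), but test performance does not follow the same pattern: going from Linear Regression to Poly Degree 2 raises test MSE from 318.56 to 331.11 and lowers test R^2 from 0.9126 to 0.9091, before it recovers at degrees 3 and 4. Poly Degree 2 is therefore the model that performs unexpectedly poorly relative to the others. The first plausible reason is mild overfitting (added variance): the quadratic term lowers training error (357.70 → 350.88) while raising test error (318.56 → 331.11), which suggests it is fitting noise in the training sample rather than the true relationship. The second plausible reason is noise combined with insufficient feature information: with a single feature (Fuel Economy) and only 100 rows, all four models plateau at a test R^2 of about 0.91 and a test MAE of about 15 HP, so the remaining error cannot be removed by adding polynomial terms, and the ranking between degrees is sensitive to the particular train/test split.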