# Data-Driven Modeling And Machine Learning Assignment 1

## Part 1: Fuel compsumption -> Horsepower prediction 
### 1.1 Load and inspect the dataset

In [132]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

import matplotlib.pyplot as plt

In [171]:
dataframe = pd.read_csv("FuelEconomy.csv")
print("Shape:", dataframe.shape)
print("\nColumns:")
print(dataframe.columns.tolist())
print("\nSummary Statistics")
print(dataframe.describe())
print("\nMissing values per column:")
display(dataframe.isna().sum())

Shape: (100, 2)

Columns:
['Horse Power', 'Fuel Economy (MPG)']

Summary Statistics
       Horse Power  Fuel Economy (MPG)
count   100.000000          100.000000
mean    213.676190           23.178501
std      62.061726            4.701666
min      50.000000           10.000000
25%     174.996514           20.439516
50%     218.928402           23.143192
75%     251.706476           26.089933
max     350.000000           35.000000

Missing values per column:


Horse Power           0
Fuel Economy (MPG)    0
dtype: int64

### 1.2 Train/Test split (70% / 30% random)

In [137]:
train_test_split(dataframe[["Horse Power"]], dataframe["Fuel Economy (MPG)"], test_size=0.3, random_state=42)

[    Horse Power
 11   120.484236
 47   244.358343
 85   238.836490
 28    91.440264
 93   131.542116
 ..          ...
 60   230.462677
 71   299.530458
 14   211.729109
 92   227.380124
 51   290.887200
 
 [70 rows x 1 columns],
     Horse Power
 83   232.216607
 53   250.709289
 70   223.179649
 45   132.669569
 44   227.670465
 39   252.552386
 22   122.040161
 80   198.958315
 10   321.840752
 0    118.770799
 18   123.885698
 30   350.000000
 73   261.852110
 33   282.604246
 90   237.673085
 4    218.594340
 76   214.484493
 77   320.951358
 12   155.415368
 31   175.979219
 55   199.147638
 88   203.571654
 26   258.424223
 42   243.737242
 69   315.817498
 15   259.183192
 40   218.107081
 96   266.869640
 9    163.350335
 72   175.348913,
 11    29.678637
 47    19.536770
 85    19.674094
 28    31.806706
 93    29.397567
         ...    
 60    20.715722
 71    16.693369
 14    25.341892
 92    24.181109
 51    17.873266
 Name: Fuel Economy (MPG), Length: 70, dtype: float64,


### 1.3 Model training: Linear + Polynomial regression & 1.4 Train/Test split (70% / 30% random)

In [140]:
TARGET_COL = "Horse Power" 
 
def prepare_xy(df_in, target_col=TARGET_COL):
    """Drop missing rows, split into X and y."""
    df_clean = df_in.dropna().copy()
    X = df_clean.drop(columns=[target_col])
    y = df_clean[target_col]
    return X, y

def split_data(X, y, test_size=0.30, random_state=42):
    """70/30 random train-test split."""
    return train_test_split(X, y, test_size=test_size, random_state=random_state)

def compute_metrics(y_true, y_pred):
    """Return MSE, MAE, R^2."""
    return {
        "MSE": mean_squared_error(y_true, y_pred),
        "MAE": mean_absolute_error(y_true, y_pred),
        "R^2": r2_score(y_true, y_pred),
    }

def run_models_and_evaluate(df_in, scenario_name, degrees=(1, 2, 3, 4),
                            target_col=TARGET_COL, test_size=0.30, random_state=42,
                            show_equation=True, show_plots=True, top_k_terms=15):
    """Train/evaluate linear (deg=1) + polynomial regression models.

    Returns a DataFrame of metrics.
    Also prints fitted equations and scatter plots (test set) for each model.
    """
    X, y = prepare_xy(df_in, target_col=target_col)
    X_train, X_test, y_train, y_test = split_data(X, y, test_size=test_size, random_state=random_state)

    rows = []

    for deg in degrees:
        if deg == 1:
            model = LinearRegression()
            model_name = "Linear Regression"
        else:
            model = Pipeline([
                ("poly", PolynomialFeatures(degree=deg, include_bias=False)),
                ("lr", LinearRegression())
            ])
            model_name = f"Polynomial Regression (degree={deg})"

        # Fit model
        model.fit(X_train, y_train)

        # Predict
        yhat_train = model.predict(X_train)
        yhat_test  = model.predict(X_test)

        # Metrics
        train_m = compute_metrics(y_train, yhat_train)
        test_m  = compute_metrics(y_test, yhat_test)

        if show_equation:
            print_fitted_equation(
                model=model,
                input_feature_names=X_train.columns,
                target_name=target_col,
                top_k_terms=top_k_terms
            )

        if show_plots:
            plot_actual_vs_predicted_test(
                y_test=y_test,
                y_pred=yhat_test,
                title=f"{scenario_name} — {model_name} (Test Set: Actual vs Predicted)"
            )

        rows.append({
            "Model": model_name,
            "Train MSE": train_m["MSE"],
            "Train MAE": train_m["MAE"],
            "Train R^2": train_m["R^2"],
            "Test MSE": test_m["MSE"],
            "Test MAE": test_m["MAE"],
            "Test R^2": test_m["R^2"],
        })

    return pd.DataFrame(rows)

results = run_models_and_evaluate(
    dataframe,
    scenario_name="Horse Power vs Fuel Economy",
    degrees=(1, 2, 3, 4),
    target_col="Horse Power",
    test_size=0.30,
    random_state=42,
    show_equation=False,
    show_plots=False
)

display(results)


Unnamed: 0,Model,Train MSE,Train MAE,Train R^2,Test MSE,Test MAE,Test R^2
0,Linear Regression,357.69918,16.061689,0.90632,318.561087,14.940628,0.912561
1,Polynomial Regression (degree=2),350.879731,15.995824,0.908106,331.105434,15.14833,0.909118
2,Polynomial Regression (degree=3),345.108668,15.746762,0.909618,318.404012,14.764973,0.912604
3,Polynomial Regression (degree=4),339.700171,15.508465,0.911034,313.798757,14.735471,0.913868


### 1.5 Discussion and interpretation  

The model that has performed the best on the test set is polynomial regression by degree 4.It has the lowest test MSE (313.799), lowest test MAE (14.74), and highest test R² (0.9139). Which shows a strong relationship between horse power and fuel economy. Increasing polynomial degree doesn't improve performance in every case. However, it does improve in this test case. The R² does increase slightly from 0.909 in degree 2 to 0.914 in degree 4. The error also decreases, from test MSE in degree 2 being 331 and dropping to 314 in degree 4. So polynomial degree imrpove performance in this test case. No models in this test has performed poorly. 
 

## Part 2: Weather → Daily Electricity Consumption Prediction
### 2.1 Load and inspect the dataset

In [169]:
dataframe = pd.read_csv("electricity_consumption_based_weather_dataset.csv")
print("Shape:", dataframe.shape)
print("\nColumns:")
print(dataframe.columns.tolist())
print("\nSummary Statistics")
print(dataframe.describe())
print("\nMissing values per column:")
display(dataframe.isna().sum())



Shape: (1433, 6)

Columns:
['date', 'AWND', 'PRCP', 'TMAX', 'TMIN', 'daily_consumption']

Summary Statistics
              AWND         PRCP         TMAX         TMIN  daily_consumption
count  1418.000000  1433.000000  1433.000000  1433.000000        1433.000000
mean      2.642313     3.800488    17.187509     9.141242        1561.078061
std       1.140021    10.973436    10.136415     9.028417         606.819667
min       0.000000     0.000000    -8.900000   -14.400000          14.218000
25%       1.800000     0.000000     8.900000     2.200000        1165.700000
50%       2.400000     0.000000    17.800000     9.400000        1542.650000
75%       3.300000     1.300000    26.100000    17.200000        1893.608000
max      10.200000   192.300000    39.400000    27.200000        4773.386000

Missing values per column:


date                  0
AWND                 15
PRCP                  0
TMAX                  0
TMIN                  0
daily_consumption     0
dtype: int64

### 2.2 Train/Test split (70% / 30% random)

In [148]:
def prepare_xy(df_in, target_col="daily_consumption"):
    df_clean = df_in.dropna().copy()
    X = df_clean.drop(columns=[target_col, "date"])
    y = df_clean[target_col]
    return X, y

X, y = prepare_xy(dataframe)


train_test_split(X, y, test_size=0.3, random_state=42)

[      AWND  PRCP  TMAX  TMIN
 178    3.1   0.0  18.9  13.9
 287    1.6   0.0  22.2  13.3
 1381   2.5   0.0  17.8  12.2
 1187   2.2   0.0  23.3  12.2
 294    1.7   0.0  27.8  19.4
 ...    ...   ...   ...   ...
 1104   5.5  37.6  12.2   2.8
 1142   2.3   1.5   3.9  -1.7
 1306   2.4   0.0  34.4  24.4
 860    2.0   0.0  31.1  11.1
 1138   4.0   0.0  -6.7 -10.6
 
 [992 rows x 4 columns],
       AWND  PRCP  TMAX  TMIN
 51     5.0   0.0  -7.8 -13.3
 481    3.0   2.0  16.1   9.4
 806    4.9  12.4  -2.2  -8.9
 1273   1.1   0.3  26.1  17.8
 297    1.7   4.1  20.6  16.1
 ...    ...   ...   ...   ...
 720    2.2   0.5   2.2  -2.2
 705    3.1   0.0   3.3  -3.9
 352    4.4   0.0   1.1  -2.2
 666    1.6   0.0  23.3  14.4
 124    2.3   0.3  16.1   7.2
 
 [426 rows x 4 columns],
 178      927.272
 287     1516.290
 1381    1315.456
 1187     131.732
 294     2106.114
           ...   
 1104    2183.618
 1142    2903.014
 1306     806.352
 860     2022.284
 1138    2842.728
 Name: daily_consumption, Le

### 2.3 Model training: Linear + Polynomial regression & 2.4 Train/Test split (70% / 30% random)

In [151]:
results = run_models_and_evaluate(
    dataframe,
    scenario_name="Weather vs Daily Electricity Consumption Prediction",
    degrees=(1, 2, 3, 4),
    target_col="daily_consumption",
    test_size=0.30,
    random_state=42,
    show_equation=False,
    show_plots=False
)

display(results)

Unnamed: 0,Model,Train MSE,Train MAE,Train R^2,Test MSE,Test MAE,Test R^2
0,Linear Regression,272403.396174,384.465016,0.276,248125.8,375.404537,0.299333
1,Polynomial Regression (degree=2),264765.769932,379.648753,0.2963,255268.5,379.039083,0.279163
2,Polynomial Regression (degree=3),259249.53487,375.952901,0.310961,265623.7,385.235167,0.249922
3,Polynomial Regression (degree=4),251909.339001,372.116566,0.33047,12151490.0,578.642201,-33.313844


### 2.5 Discussion and interpretation 

The model that generalize the best is the linear regression model, with the highest  Test R² (0.299) and lower Test MSE (2.48×10⁵). Which shows that the relationship between weather and electricity usage is weakly linearly correlated. The polynomial models improve in train R^2 compared to linear regression, but does worst in test R^2 as it gradually decreases. It shows that it has a nonlinear relationship between weather and electricity consumption. Higher-degree models performs worst on the test sets as the mse decreases from 272 to 251 in train sets, but increases sharply in test sets to 1.21×10⁷. Also in degree 4, the test R² (−33.31) shows instability. None of the models have a good test performance, it's because of high noise and dates being ignored, also weather can not explain explain electricity usage. 