# Imports

In [50]:
import warnings

warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
import sklearn.cluster as cluster
from scipy.linalg import svd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures

np.random.seed(1234)

In [51]:
# Notebook display configuration: show every column when rendering wide frames.
# (Warnings are already silenced once in the imports cell.)
pd.set_option("display.max_columns", None)

# 8.1 Parabolic Fitting

## Data Load - Cars

In [52]:
# Read auto-mpg.csv from the local data folder and preview the first rows.
vehicle_data = pd.read_csv("./data/auto-mpg.csv")
vehicle_data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [53]:
# Coerce horsepower to numeric (entries that cannot be parsed become NaN), then
# drop every row that contains a missing value. Built as one chained expression
# that rebinds vehicle_data instead of mutating the frame in place.
vehicle_data = vehicle_data.assign(
    horsepower=pd.to_numeric(vehicle_data["horsepower"], errors="coerce")
).dropna()

In [54]:
# Check dtypes and non-null counts after cleaning (horsepower should now be numeric).
vehicle_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car name      392 non-null    object 
dtypes: float64(4), int64(4), object(1)
memory usage: 30.6+ KB


## Scatter Horsepower vs. MPG

In [55]:
# Scatter of mpg against horsepower with a straight-line OLS trendline drawn in red.
# Title and axis labels are set so the figure can be read on its own.
px.scatter(
    vehicle_data,
    x="horsepower",
    y="mpg",
    trendline="ols",
    trendline_color_override="red",
    title="MPG vs. horsepower with linear OLS trendline",
    labels={"horsepower": "Horsepower", "mpg": "Miles per gallon (mpg)"},
)

# 8.2 Nonlinear Features

`LinearRegression` only fits models that are linear in their input features, so to capture curvature we generate our own non-linear features (e.g. horsepower squared) and then do a linear fit that includes those columns

In [56]:
# Add "hp" (a short alias for horsepower) and "hp2" (its square) so that a
# linear fit on these two columns is a parabola in horsepower.
vehicle_data = vehicle_data.assign(
    hp=vehicle_data["horsepower"],
    hp2=vehicle_data["horsepower"] ** 2,
)
vehicle_data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,hp,hp2
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,130.0,16900.0
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,165.0,27225.0
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,150.0,22500.0
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,150.0,22500.0
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,140.0,19600.0


In [57]:
# Quadratic fit of mpg on horsepower: an ordinary linear regression on the
# hp and hp2 columns. The cell displays [intercept, coefficients].
model = LinearRegression()
model.fit(vehicle_data[["hp", "hp2"]], vehicle_data["mpg"])
[model.intercept_, model.coef_]

[56.90009970211294, array([-0.46618963,  0.00123054])]

In [58]:
# Same straight-line fit twice: once with a 1-D target (Series) and once with a
# 2-D target (single-column DataFrame). The slope is identical; only the shape
# of coef_ differs (1-D array vs. 2-D array).
[
    LinearRegression().fit(vehicle_data[["hp"]], vehicle_data["mpg"]).coef_,
    LinearRegression().fit(vehicle_data[["hp"]], vehicle_data[["mpg"]]).coef_,
]

[array([-0.15784473]), array([[-0.15784473]])]

In [59]:
# Mean squared error of the quadratic model, evaluated on the same rows it was
# fitted on. mean_squared_error is imported in the notebook's imports cell.
mean_squared_error(vehicle_data["mpg"], model.predict(vehicle_data[["hp", "hp2"]]))

18.98476890761722

# 8.3 Prediction vs. Inference

2 distinct goals of modeling - prediction and inference
Prediction
- Model predicts accurate outcomes from real-world data
- For example, given a new, previously unseen data point, the model can make a reasonable prediction of its outcome

Inference
- Model helps understand the true relationship
- Answers key questions
- A much harder task

# 8.4 Scikit Transformers

In [60]:
# Expand hp into the polynomial features hp, hp^2 and hp^3 (no constant column).
# PolynomialFeatures is imported in the notebook's imports cell.
poly_transform = PolynomialFeatures(degree=3, include_bias=False)

# fit_transform returns a bare array, so the frame is rebuilt with
# vehicle_data's own index. That index has gaps after dropna, and reusing it
# keeps feature rows aligned with the target rows in any pandas operation.
df_cubic = pd.DataFrame(
    poly_transform.fit_transform(vehicle_data[["hp"]]),
    columns=poly_transform.get_feature_names_out(),
    index=vehicle_data.index,
)
df_cubic.head()

Unnamed: 0,hp,hp^2,hp^3
0,130.0,16900.0,2197000.0
1,165.0,27225.0,4492125.0
2,150.0,22500.0,3375000.0
3,150.0,22500.0,3375000.0
4,140.0,19600.0,2744000.0


In [61]:
# Show the source frame again: the transformer returned a new array and left it unchanged.
vehicle_data.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,hp,hp2
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,130.0,16900.0
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,165.0,27225.0
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,150.0,22500.0
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,150.0,22500.0
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,140.0,19600.0


# 8.5 Scikit Transformers

In [62]:
# Cubic model: linear regression of mpg on the three polynomial feature columns.
cu_model = LinearRegression(fit_intercept=True).fit(df_cubic, vehicle_data["mpg"])

In [63]:
# Predict mpg for hp = 130 by writing out the three cubic features by hand.
cu_model.predict([[130, 130**2, 130**3]])

array([17.15342144])

## Predicting Values from New Data

The model thusly defined needs cubic features, but a new data point would just be the horsepower.  

You could build the features for the new point yourself, for example by passing it through the transformer, or...

In [64]:
# Build the cubic features for a new horsepower value with the transformer that
# was already fitted. transform() is used rather than fit_transform() so that
# new data never re-fits (and overwrites the state of) the transformer.
cu_model.predict(poly_transform.transform([[100]]))

array([22.44325879])

In [65]:
# MSE of the cubic model over the full data set. The features are produced with
# transform() so the existing transformer is reused instead of being re-fitted.
mean_squared_error(
    vehicle_data["mpg"],
    cu_model.predict(poly_transform.transform(vehicle_data[["hp"]])),
)

18.94498981448592

## Scikit Pipeline

The above workarounds are a bit clunky  
Instead, use a scikit-learn `Pipeline`. Consider the above workflow a 2-stage pipeline:
- Generate features
- Do the linear regression fit

### Build the Pipeline

In [66]:
# Two-stage pipeline: (1) expand hp into cubic polynomial features without a
# constant column, (2) fit a linear regression on those features.
# Pipeline is imported in the notebook's imports cell.
pipelined_model = Pipeline(
    [
        ("transform", PolynomialFeatures(degree=3, include_bias=False)),
        ("regression", LinearRegression()),
    ]
)

# The pipeline takes the raw hp column; fit() returns the fitted pipeline,
# which is displayed as the cell output.
pipelined_model.fit(vehicle_data[["hp"]], vehicle_data["mpg"])

### Use to Make a New Prediction

In [67]:
# Raw horsepower goes in; the pipeline generates the cubic features itself.
pipelined_model.predict([[100]])

array([22.44325879])

### Predict All and Compute MSE

In [68]:
# Predict mpg for every row and score against the observed values; the result
# matches the MSE obtained from the separate transformer + regression objects.
mpg_fit = pipelined_model.predict(vehicle_data[["hp"]])
mean_squared_error(vehicle_data["mpg"], mpg_fit)

18.94498981448592

### Pipeline Model Attributes

Advantages
- Can be applied to new data directly
- Avoids need to explicitly create separate dataframe with transformer features
- Avoids need to keep track of various separate variables for transformer and regression objects

Downsides
- A bit more work to get at something for a part of the pipeline, for example the model coefs

In [69]:
# Look up the regression stage by its step name to read the fitted coefficients.
pipelined_model.named_steps["regression"].coef_

array([-5.68850128e-01,  2.07901126e-03, -2.14662591e-06])

# 8.6 Order 0 - 6 Models on Vehicle Data

Variance in machine learning refers to sensitivity to the training data

Adding just a single outlier to a data set causes the higher-order models to change quite significantly, whereas the lower-order models do not change much
- So the lower-order model has lower variance

Showing plot of model performance vs. complexity

![alt text](./data/performance_vs_complexity.png "Performance vs. Complexity")


# 8.7 Dangers of Overfitting

Given N data points (with distinct x values), a polynomial of order N - 1 can pass through every point exactly, so its MSE on those points is 0

![alt text](./data/fitting_as_linear_system.png "Fitting as Linear System")

Overfitting is basically memorization: the model just memorizes the existing data and cannot respond sensibly to new data


# 8.8 Detecting Overfitting with Simple Cross Validation

## To keep things simple and easy to visualize, choose just 35 data points from the car set

In [70]:
# Draw 35 rows; the fixed random_state makes the same rows come back on every run.
vehicle_data_35 = vehicle_data.sample(35, random_state=42)

## Define function to generate MSE given order

In [93]:
def get_mse_for_degree_k(
    k,
    df_fit: pd.DataFrame = vehicle_data_35,
    df_evaluate: pd.DataFrame = vehicle_data_35,
):
    """Return the MSE of a degree-``k`` polynomial model of mpg on hp.

    A ``PolynomialFeatures(degree=k)`` -> ``LinearRegression`` pipeline is
    fitted on ``df_fit`` and then scored with ``mean_squared_error`` on
    ``df_evaluate``. Both frames are read through their "hp" and "mpg" columns.

    Parameters
    ----------
    k
        Polynomial degree passed to ``PolynomialFeatures``.
    df_fit
        Frame the pipeline is fitted on. Defaults to ``vehicle_data_35``.
    df_evaluate
        Frame the fitted pipeline is scored on. Defaults to ``vehicle_data_35``.

    Returns
    -------
    The mean squared error between ``df_evaluate["mpg"]`` and the predictions.

    Notes
    -----
    The default frames are bound when this ``def`` runs, so re-run this cell
    if ``vehicle_data_35`` is re-sampled.
    """
    # NOTE(review): PolynomialFeatures keeps its default bias column here even
    # though the regression also fits an intercept; presumably this is so that
    # k = 0 still yields a non-empty feature matrix -- confirm before changing.
    steps = [
        ("transform", PolynomialFeatures(degree=k)),
        ("regression", LinearRegression(fit_intercept=True)),
    ]
    degree_k_model = Pipeline(steps).fit(df_fit[["hp"]], df_fit["mpg"])

    predicted_mpg = degree_k_model.predict(df_evaluate[["hp"]])
    return mean_squared_error(df_evaluate["mpg"], predicted_mpg)

## Exercise for degrees 0,...,6

In [94]:
# Sweep polynomial degrees 0..6, fitting and scoring each on the default
# 35-row sample, and tabulate the resulting MSE per degree.
ks = [*range(7)]
mses = list(map(get_mse_for_degree_k, ks))
df_mses = pd.DataFrame({"k": ks, "MSE": mses})
df_mses

Unnamed: 0,k,MSE
0,0,50.630727
1,1,19.542525
2,2,17.580688
3,3,17.114635
4,4,16.48965
5,5,15.438443
6,6,14.918056


In [73]:
# Plot MSE against polynomial degree, joining the markers with lines. A title
# is set so the figure can be read on its own.
fig = px.scatter(
    df_mses,
    x="k",
    y="MSE",
    title="MSE vs. polynomial degree k",
).update_traces(mode="lines+markers")
fig.show()

## Exercise model using new points

In [92]:
# Score each degree-k model on 9 rows drawn from the full data set. Each model
# is still fitted on get_mse_for_degree_k's default fitting frame; only the
# evaluation frame is overridden. The keyword must be `df_evaluate`, because
# the function has no `df_xy` parameter.
# NOTE(review): the 9 rows are sampled from all of vehicle_data, so some may
# also be among the rows used for fitting -- confirm whether that is intended.
vehicle_data_9 = vehicle_data.sample(9, random_state=6)  # drawn once, reused for every k
df_mses["MSE 9 Points"] = [
    get_mse_for_degree_k(k, df_evaluate=vehicle_data_9) for k in ks
]
df_mses

Unnamed: 0,k,MSE,MSE 9 Points
0,0,50.630727,70.044988
1,1,19.542525,33.174596
2,2,17.580688,22.587845
3,3,17.114635,36.360989
4,4,16.48965,63.417542
5,5,15.438443,1041.812626
6,6,14.918056,2520.528413
