# Imports

In [28]:
import warnings

warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn.cluster as cluster
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import PolynomialFeatures
from scipy.linalg import svd
import plotly.express as px
import plotly.graph_objects as go

# Seed NumPy's legacy global random state so that any np.random draws in this
# notebook repeat from run to run.
np.random.seed(1234)

In [3]:
# Notebook-wide configuration.
#
# The blanket "ignore" filter is applied again here, after the third-party
# imports: a filter added later takes precedence over earlier ones, so this
# puts "ignore" back in front of any filters libraries may have registered
# while they were being imported.
warnings.filterwarnings("ignore")

# None removes the column limit, so wide frames are displayed in full.
pd.set_option("display.max_columns", None)

# 8.1 Parabolic Fitting

## Data Load - Cars

In [7]:
# Read the cars data set from the local data directory and preview the first rows.
auto_mpg_csv = "./data/auto-mpg.csv"
df = pd.read_csv(auto_mpg_csv)
df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140,3449,10.5,70,1,ford torino


In [18]:
# Convert horsepower to a numeric dtype; any entry that cannot be parsed as a
# number becomes NaN instead of raising.
horsepower_numeric = pd.to_numeric(df["horsepower"], errors="coerce")
df["horsepower"] = horsepower_numeric

# Remove, in place, every row that has a missing value in any column. This
# includes the rows whose horsepower could not be parsed above. The surviving
# rows keep their original index labels.
df.dropna(axis=0, how="any", inplace=True)

In [45]:
# Print the index range, column dtypes and non-null counts of df.
# NOTE(review): the saved output below lists `hp` and `hp2`, which are only
# created in section 8.2, so this cell was last executed out of order. Re-run
# the notebook top to bottom to refresh it.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           392 non-null    float64
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   car name      392 non-null    object 
 9   hp2           392 non-null    float64
 10  hp            392 non-null    float64
dtypes: float64(6), int64(4), object(1)
memory usage: 36.8+ KB


## Scatter Horsepower vs. MPG

In [21]:
# Scatter mpg against horsepower and overlay an ordinary-least-squares
# trendline, drawn in red so it stands out from the points.
hp_vs_mpg_fig = px.scatter(
    data_frame=df,
    x="horsepower",
    y="mpg",
    trendline="ols",
    trendline_color_override="red",
)
hp_vs_mpg_fig

# 8.2 Nonlinear Features

`LinearRegression` only fits models that are linear in the columns it is given. To fit a curve, we can generate our own non-linear features (e.g. horsepower squared) and then run a linear fit that includes those columns.

In [25]:
# Add two feature columns to df:
#   hp  - a shorter alias of the horsepower column
#   hp2 - horsepower squared, the explicit quadratic term for the linear fit
hp_values = df["horsepower"]
df["hp"] = hp_values
df["hp2"] = hp_values**2
df.head(5)

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,hp2,hp
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,16900.0,130.0
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,27225.0,165.0
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,22500.0,150.0
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,22500.0,150.0
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,19600.0,140.0


In [58]:
# Fit mpg as a linear function of hp and hp2 (i.e. a parabola in horsepower),
# with an intercept term.
quadratic_features = df[["hp", "hp2"]]
mpg_target = df["mpg"]
model = LinearRegression(fit_intercept=True)
model.fit(quadratic_features, mpg_target)

In [59]:
# Show the fitted intercept followed by the coefficient array; the coefficients
# are in the column order passed to fit, i.e. hp first and hp2 second.
[model.intercept_, model.coef_]

[56.90009970211294, array([-0.46618963,  0.00123054])]

In [60]:
# Mean squared error of the quadratic fit. The predictions are made on the same
# rows the model was fitted on, so this is an in-sample (training) error.
# `mean_squared_error` is imported in the imports cell at the top of the notebook.
mean_squared_error(df["mpg"], model.predict(df[["hp", "hp2"]]))

18.98476890761722

# 8.3 Prediction vs. Inference

There are two distinct goals of modeling: prediction and inference.

**Prediction**
- The model predicts accurate outcomes from real-world data
- For example, given a new, previously unseen data point, the model can make a reasonable prediction about how well it (here, a car) might perform

**Inference**
- The model helps us understand the true relationship between the inputs and the outcome
- It answers key questions about that relationship
- This is a much harder task than prediction

# 8.4 Scikit Transformers

In [74]:
# Let scikit-learn generate the polynomial terms instead of building them by
# hand: degree=3 expands hp into hp, hp^2 and hp^3, and include_bias=False
# leaves out the constant column.
# `PolynomialFeatures` is imported in the imports cell at the top of the notebook.
poly_transform = PolynomialFeatures(degree=3, include_bias=False)
df_pf = pd.DataFrame(
    poly_transform.fit_transform(df[["hp"]]),
    columns=poly_transform.get_feature_names_out(),
    # Reuse df's index. Rows were dropped from df during cleaning, so its labels
    # have gaps; a fresh 0..n-1 index would pair the wrong rows in any later
    # index-aligned join or column assignment between df and df_pf.
    index=df.index,
)
df_pf.head()

Unnamed: 0,hp,hp^2,hp^3
0,130.0,16900.0,2197000.0
1,165.0,27225.0,4492125.0
2,150.0,22500.0,3375000.0
3,150.0,22500.0,3375000.0
4,140.0,19600.0,2744000.0


In [68]:
# Preview df again; the polynomial expansion was stored in a separate frame
# (df_pf), so df itself still has only the hand-made hp and hp2 columns.
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name,hp2,hp
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu,16900.0,130.0
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320,27225.0,165.0
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite,22500.0,150.0
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst,22500.0,150.0
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino,19600.0,140.0
