### Models for Car Prices

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import OneHotEncoder, PolynomialFeatures
from sklearn.pipeline import Pipeline

In [2]:
#load the cars data, specify car name as index
cars = pd.read_csv('data/mtcars.csv', index_col = 0)
#inspect
cars.head()

Unnamed: 0,mpg,cyl,disp,hp,drat,wt,qsec,vs,am,gear,carb
Mazda RX4,21.0,6,160.0,110,3.9,2.62,16.46,0,1,4,4
Mazda RX4 Wag,21.0,6,160.0,110,3.9,2.875,17.02,0,1,4,4
Datsun 710,22.8,4,108.0,93,3.85,2.32,18.61,1,1,4,1
Hornet 4 Drive,21.4,6,258.0,110,3.08,3.215,19.44,1,0,3,1
Hornet Sportabout,18.7,8,360.0,175,3.15,3.44,17.02,0,0,3,2


In [3]:
#mpg vs. hp -- X and y (X as DataFrame)
X = cars[['mpg']]
y = cars['hp']

In [4]:
#Linear model instantiate
lr = LinearRegression()

In [5]:
#cross validate with neg_mean_squared_error as scorer
# insert model, X, y, cv, and scoring
# this splits data into 5 sections - trains on 4 sections and tests/validates on one
# the scores are for each training section
# notice that the last score is quite large
lin_perf = cross_val_score(lr, X, y, cv=5, scoring='neg_mean_squared_error')

In [6]:
lin_perf

array([-1488.37123279, -1158.31066907, -2477.86137832, -1166.38301226,
       -6395.24822953])

In [7]:
lin_perf.mean()

-2537.2349043959885

In [8]:
#add a quadratic feature
#this build a polynomial model
X['mpg^2']=cars['mpg']**2
X.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['mpg^2']=cars['mpg']**2


Unnamed: 0,mpg,mpg^2
Mazda RX4,21.0,441.0
Mazda RX4 Wag,21.0,441.0
Datsun 710,22.8,519.84
Hornet 4 Drive,21.4,457.96
Hornet Sportabout,18.7,349.69


In [9]:
lr_quad = LinearRegression()

In [10]:
#cross validate linear model
quad_perf = cross_val_score(lr_quad, X, y, cv=5, scoring='neg_mean_squared_error')

In [13]:
quad_perf

array([-1017.45026951,  -754.50829529, -8854.77987111, -1005.40693089,
       -6247.55292699])

In [14]:
quad_perf.mean()

-3575.939658756365

In [11]:
#Which is better???
#still seems to be an issue of variability in the last section
#can take the mean of the neg mean square error to eval perf
#note: if using r-squared as the metric- it will get better as the number of features increase.
# adjusted r-squared will try to account for this scale of features

array([-1017.45026951,  -754.50829529, -8854.77987111, -1005.40693089,
       -6247.55292699])

-3575.939658756365

In [None]:
#plot histograms of residuals


### `PolynomialFeatures`

Redo the above the `scikitlearn` way.

In [33]:
import sklearn
sklearn.__version__
# this won't have get feature names out, just jas f

'0.24.2'

In [35]:

# note - it may not always make sense to square all of your features? ex with mult features
X2=cars[['mpg', 'disp']]

pd.DataFrame(poly_features.fit_transform(X2), columns = poly_features.get_feature_names())



Unnamed: 0,x0,x1,x0^2,x0 x1,x1^2
0,21.0,160.0,441.0,3360.0,25600.0
1,21.0,160.0,441.0,3360.0,25600.0
2,22.8,108.0,519.84,2462.4,11664.0
3,21.4,258.0,457.96,5521.2,66564.0
4,18.7,360.0,349.69,6732.0,129600.0
5,18.1,225.0,327.61,4072.5,50625.0
6,14.3,360.0,204.49,5148.0,129600.0
7,24.4,146.7,595.36,3579.48,21520.89
8,22.8,140.8,519.84,3210.24,19824.64
9,19.2,167.6,368.64,3217.92,28089.76


In [15]:
#train and test spli
X_train, X_test, y_train, y_test = train_test_split(cars[['mpg']], cars['hp'], random_state=42)
# this is our starting train set
X_train.head()

Unnamed: 0,mpg
Merc 450SL,17.3
Mazda RX4,21.0
Hornet Sportabout,18.7
Chrysler Imperial,14.7
Valiant,18.1


In [17]:
#instantiate
# default is degree set to 2, which will generate a quadratic feature
# include_bias is to include an intercept term, which we don't need, so we'll set it to default
# this object works similarly to regression esitmator
poly_features = PolynomialFeatures(degree = 2, include_bias=False)


In [19]:
#fit and transform train
# this adds a second column with mpg squared 
# the test data is meant to be unseen,the fit method conducts some sort of learning on the data
# and we dont want to learn anything from the test data 
X_train_quad = poly_features.fit_transform(X_train)

In [21]:
#transform test
#transform() just transforms the test, it doesn't learn from it at all. 
X_test_quad = poly_features.transform(X_test)
X_test_quad

array([[  19.7 ,  388.09],
       [  10.4 ,  108.16],
       [  19.2 ,  368.64],
       [  32.4 , 1049.76],
       [  22.8 ,  519.84],
       [  19.2 ,  368.64],
       [  15.  ,  225.  ],
       [  27.3 ,  745.29]])

In [23]:
#fit the model
quad = LinearRegression().fit(X_train_quad, y_train)

LinearRegression()

In [24]:
quad.coef_

array([-27.14643408,   0.42488024])

In [25]:
quad.intercept_

501.1285838922789

In [26]:
#make predictions
quad.predict(X_test_quad)

array([131.23560303, 264.76071571, 136.54489949,  67.60639551,
       103.05962838, 136.54489949, 189.53012563,  76.68992407])

In [27]:
from sklearn.metrics import mean_squared_error

In [28]:
mean_squared_error(quad.predict(X_test_quad), y_test, squared=False)
# so on avg, the quadratic estimator was off by about 58 HP. 

58.520303388705614

### Using a `Pipeline`

Combine transformer with preprocessing.

In [36]:
from sklearn.pipeline import Pipeline

In [40]:
#pipeline to poly then regressor
# this pipeline will do the polynomial feature step for us
# this allows us to consume data, transform and feed to an estimator in one step
quad_pipe = Pipeline([('polynomial_features', PolynomialFeatures(include_bias=False)),
                    ('regressor', LinearRegression())])


In [41]:
#cross validate the pipeline
cross_val_score(quad_pipe, X_train, y_train, scoring= 'neg_mean_squared_error')

array([-1038.73412944,  -335.21266277, -1428.57434733,  -926.28033774,
       -4783.35272794])

In [43]:
#evaluate on test
quad_pipe.fit(X_train, y_train)
preds = quad_pipe.predict(X_test)
mean_squared_error(y_test, preds)

3424.62590870615