# James Jones
## 12-07-2022

### Steps to Perform:
- Perform initial preprocessing of data
- Perform preprocessing for statsmodels
- Run the model in statsmodels and produce a results summary
- Evaluate the model on the test set

In [1]:
# Import Libraries
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer, make_column_selector

# Fix random seed to reproducibility
np.random.seed(91)

### Perform initial preprocessing of data

In [2]:
# Load data
df = pd.read_csv('CarPrice_Assignment.csv')
df.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,alfa-romero giulia,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,alfa-romero stelvio,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,alfa-romero Quadrifoglio,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,audi 100 ls,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950.0
4,5,2,audi 100ls,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450.0


In [3]:
print(df.info())
print('\n')
print(df.isna().sum())
print('\n')
print(df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   car_ID            205 non-null    int64  
 1   symboling         205 non-null    int64  
 2   CarName           205 non-null    object 
 3   fueltype          205 non-null    object 
 4   aspiration        205 non-null    object 
 5   doornumber        205 non-null    object 
 6   carbody           205 non-null    object 
 7   drivewheel        205 non-null    object 
 8   enginelocation    205 non-null    object 
 9   wheelbase         205 non-null    float64
 10  carlength         205 non-null    float64
 11  carwidth          205 non-null    float64
 12  carheight         205 non-null    float64
 13  curbweight        205 non-null    int64  
 14  enginetype        205 non-null    object 
 15  cylindernumber    205 non-null    object 
 16  enginesize        205 non-null    int64  
 1

- So far, our data looks nice and clean

### Perform preprocessing for statsmodels

In [4]:
# Make X and y variables
y = df['price'].copy()
X = df.drop(columns = 'price')
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 91)
X_train.head()

Unnamed: 0,car_ID,symboling,CarName,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,...,cylindernumber,enginesize,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
111,112,0,peugeot 504,gas,std,four,sedan,rwd,front,107.9,...,four,120,mpfi,3.46,2.19,8.4,95,5000,19,24
69,70,0,buick century,diesel,turbo,two,hardtop,rwd,front,106.7,...,five,183,idi,3.58,3.64,21.5,123,4350,22,25
157,158,0,toyota corolla 1200,gas,std,four,hatchback,fwd,front,95.7,...,four,98,2bbl,3.19,3.03,9.0,70,4800,30,37
180,181,-1,toyota starlet,gas,std,four,sedan,rwd,front,104.5,...,six,171,mpfi,3.27,3.35,9.2,156,5200,20,24
95,96,1,nissan juke,gas,std,two,hatchback,fwd,front,94.5,...,four,97,2bbl,3.15,3.29,9.4,69,5200,31,37


### Create Preprocessing pipeline


In [5]:
# Start with categorical columns
cat_select = make_column_selector(dtype_include = 'object')
cat_cols = cat_select(X_train)
print(cat_cols)
print(len(cat_cols))

['CarName', 'fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel', 'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem']
10


In [6]:
# Now numeric
num_select = make_column_selector(dtype_include = 'number')
num_cols = num_select(X_train)
print(num_cols)
print(len(num_cols))

['car_ID', 'symboling', 'wheelbase', 'carlength', 'carwidth', 'carheight', 'curbweight', 'enginesize', 'boreratio', 'stroke', 'compressionratio', 'horsepower', 'peakrpm', 'citympg', 'highwaympg']
15


In [7]:
# Create the column transformer and add a constant
cat_pipe = make_pipeline(SimpleImputer(strategy = 'constant',
                                       fill_value = 'MISSING'),
                        OneHotEncoder(handle_unknown = 'ignore',
                                      sparse = False))

num_pipe = make_pipeline(SimpleImputer(strategy = 'mean')) # Don't scale numeric values

preprocessor = make_column_transformer((cat_pipe, cat_cols),
                                       (num_pipe, num_cols),
                                       remainder = 'passthrough')

In [8]:
# We now need to re-make our DF after our Column Transformer
preprocessor.fit(X_train)

#### Now we will get our original feature names, match them to our processed values and rebuild our DF

In [9]:
# Transform 
pd.DataFrame(preprocessor.transform(X_train).round(3))

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,158,159,160,161,162,163,164,165,166,167
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,56.7,3075.0,120.0,3.46,2.19,8.4,95.0,5000.0,19.0,24.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.9,3495.0,183.0,3.58,3.64,21.5,123.0,4350.0,22.0,25.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,52.8,2109.0,98.0,3.19,3.03,9.0,70.0,4800.0,30.0,37.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.1,3131.0,171.0,3.27,3.35,9.2,156.0,5200.0,20.0,24.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,53.3,2028.0,97.0,3.15,3.29,9.4,69.0,5200.0,31.0,37.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
148,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,56.7,3252.0,152.0,3.70,3.52,21.0,95.0,4150.0,28.0,33.0
149,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,53.5,2024.0,97.0,3.15,3.29,9.4,69.0,5200.0,31.0,37.0
150,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,49.7,3139.0,181.0,3.43,3.27,9.0,160.0,5200.0,19.0,25.0
151,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.9,2480.0,110.0,3.27,3.35,22.5,73.0,4500.0,30.0,33.0


In [10]:
# Create an empty list for feature names
final_features = []

In [11]:
# Start with categorical features
preprocessor.named_transformers_['pipeline-1'].named_steps

{'simpleimputer': SimpleImputer(fill_value='MISSING', strategy='constant'),
 'onehotencoder': OneHotEncoder(handle_unknown='ignore', sparse=False)}

In [12]:
# Locating OHE 
ohe_step = preprocessor.named_transformers_['pipeline-1'].named_steps['onehotencoder']
ohe_step

In [13]:
# From here, get our cat features
cat_features = ohe_step.get_feature_names_out(cat_cols)
cat_features

array(['CarName_alfa-romero Quadrifoglio', 'CarName_alfa-romero giulia',
       'CarName_alfa-romero stelvio', 'CarName_audi 100ls',
       'CarName_audi 4000', 'CarName_audi 5000',
       'CarName_audi 5000s (diesel)', 'CarName_audi fox',
       'CarName_bmw 320i', 'CarName_bmw x1', 'CarName_bmw x3',
       'CarName_bmw x5', 'CarName_bmw z4', 'CarName_buick century',
       'CarName_buick century luxus (sw)',
       'CarName_buick electra 225 custom',
       'CarName_buick opel isuzu deluxe',
       'CarName_buick regal sport coupe (turbo)', 'CarName_buick skyhawk',
       'CarName_buick skylark', 'CarName_chevrolet impala',
       'CarName_chevrolet monte carlo', 'CarName_dodge challenger se',
       'CarName_dodge colt (sw)', 'CarName_dodge colt hardtop',
       'CarName_dodge coronet custom (sw)', 'CarName_dodge dart custom',
       'CarName_dodge rampage', 'CarName_honda accord',
       'CarName_honda civic', 'CarName_honda civic (auto)',
       'CarName_honda civic 1300', 'CarNam

In [14]:
# Add the categorical feature names to our final_features list
final_features.extend(cat_features)
final_features

['CarName_alfa-romero Quadrifoglio',
 'CarName_alfa-romero giulia',
 'CarName_alfa-romero stelvio',
 'CarName_audi 100ls',
 'CarName_audi 4000',
 'CarName_audi 5000',
 'CarName_audi 5000s (diesel)',
 'CarName_audi fox',
 'CarName_bmw 320i',
 'CarName_bmw x1',
 'CarName_bmw x3',
 'CarName_bmw x5',
 'CarName_bmw z4',
 'CarName_buick century',
 'CarName_buick century luxus (sw)',
 'CarName_buick electra 225 custom',
 'CarName_buick opel isuzu deluxe',
 'CarName_buick regal sport coupe (turbo)',
 'CarName_buick skyhawk',
 'CarName_buick skylark',
 'CarName_chevrolet impala',
 'CarName_chevrolet monte carlo',
 'CarName_dodge challenger se',
 'CarName_dodge colt (sw)',
 'CarName_dodge colt hardtop',
 'CarName_dodge coronet custom (sw)',
 'CarName_dodge dart custom',
 'CarName_dodge rampage',
 'CarName_honda accord',
 'CarName_honda civic',
 'CarName_honda civic (auto)',
 'CarName_honda civic 1300',
 'CarName_honda civic 1500 gl',
 'CarName_honda civic cvcc',
 'CarName_honda prelude',
 'CarNa

In [15]:
# Much less code for numeric features
num_cols

['car_ID',
 'symboling',
 'wheelbase',
 'carlength',
 'carwidth',
 'carheight',
 'curbweight',
 'enginesize',
 'boreratio',
 'stroke',
 'compressionratio',
 'horsepower',
 'peakrpm',
 'citympg',
 'highwaympg']

In [16]:
# Add numeric features which were passed through the model
final_features.extend(num_cols)
final_features

['CarName_alfa-romero Quadrifoglio',
 'CarName_alfa-romero giulia',
 'CarName_alfa-romero stelvio',
 'CarName_audi 100ls',
 'CarName_audi 4000',
 'CarName_audi 5000',
 'CarName_audi 5000s (diesel)',
 'CarName_audi fox',
 'CarName_bmw 320i',
 'CarName_bmw x1',
 'CarName_bmw x3',
 'CarName_bmw x5',
 'CarName_bmw z4',
 'CarName_buick century',
 'CarName_buick century luxus (sw)',
 'CarName_buick electra 225 custom',
 'CarName_buick opel isuzu deluxe',
 'CarName_buick regal sport coupe (turbo)',
 'CarName_buick skyhawk',
 'CarName_buick skylark',
 'CarName_chevrolet impala',
 'CarName_chevrolet monte carlo',
 'CarName_dodge challenger se',
 'CarName_dodge colt (sw)',
 'CarName_dodge colt hardtop',
 'CarName_dodge coronet custom (sw)',
 'CarName_dodge dart custom',
 'CarName_dodge rampage',
 'CarName_honda accord',
 'CarName_honda civic',
 'CarName_honda civic (auto)',
 'CarName_honda civic 1300',
 'CarName_honda civic 1500 gl',
 'CarName_honda civic cvcc',
 'CarName_honda prelude',
 'CarNa

#### Transforming X_train and X_test and making our final DF
- NOTE: 'columns = final_features', 'index = X_train.index'

In [17]:
X_train_df = pd.DataFrame(preprocessor.transform(X_train),
                          columns = final_features, index = X_train.index)
X_train_df.head()

Unnamed: 0,CarName_alfa-romero Quadrifoglio,CarName_alfa-romero giulia,CarName_alfa-romero stelvio,CarName_audi 100ls,CarName_audi 4000,CarName_audi 5000,CarName_audi 5000s (diesel),CarName_audi fox,CarName_bmw 320i,CarName_bmw x1,...,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,56.7,3075.0,120.0,3.46,2.19,8.4,95.0,5000.0,19.0,24.0
69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.9,3495.0,183.0,3.58,3.64,21.5,123.0,4350.0,22.0,25.0
157,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,52.8,2109.0,98.0,3.19,3.03,9.0,70.0,4800.0,30.0,37.0
180,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,54.1,3131.0,171.0,3.27,3.35,9.2,156.0,5200.0,20.0,24.0
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,53.3,2028.0,97.0,3.15,3.29,9.4,69.0,5200.0,31.0,37.0


In [18]:
# Repeat with testing data (X)
X_test_df = pd.DataFrame(preprocessor.transform(X_test), 
                         columns = final_features, index = X_test.index)
X_test_df.head()

Unnamed: 0,CarName_alfa-romero Quadrifoglio,CarName_alfa-romero giulia,CarName_alfa-romero stelvio,CarName_audi 100ls,CarName_audi 4000,CarName_audi 5000,CarName_audi 5000s (diesel),CarName_audi fox,CarName_bmw 320i,CarName_bmw x1,...,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg
133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,56.1,2695.0,121.0,3.54,3.07,9.3,110.0,5250.0,21.0,28.0
125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.2,2778.0,151.0,3.94,3.11,9.5,143.0,5500.0,19.0,27.0
23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.8,2128.0,98.0,3.03,3.39,7.6,102.0,5500.0,24.0,30.0
30,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,50.8,1713.0,92.0,2.91,3.41,9.6,58.0,4800.0,49.0,54.0
140,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,55.7,2240.0,108.0,3.62,2.64,8.7,73.0,4400.0,26.0,31.0


In [19]:
# Add a constant column to X_train and X_test
X_train_df = sm.add_constant(X_train_df, has_constant = 'add', prepend = False)
X_test_df = sm.add_constant(X_test_df, has_constant = 'add', prepend = False)
display(X_train_df.head(2), X_test_df.head(2))

Unnamed: 0,CarName_alfa-romero Quadrifoglio,CarName_alfa-romero giulia,CarName_alfa-romero stelvio,CarName_audi 100ls,CarName_audi 4000,CarName_audi 5000,CarName_audi 5000s (diesel),CarName_audi fox,CarName_bmw 320i,CarName_bmw x1,...,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,const
111,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3075.0,120.0,3.46,2.19,8.4,95.0,5000.0,19.0,24.0,1.0
69,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3495.0,183.0,3.58,3.64,21.5,123.0,4350.0,22.0,25.0,1.0


Unnamed: 0,CarName_alfa-romero Quadrifoglio,CarName_alfa-romero giulia,CarName_alfa-romero stelvio,CarName_audi 100ls,CarName_audi 4000,CarName_audi 5000,CarName_audi 5000s (diesel),CarName_audi fox,CarName_bmw 320i,CarName_bmw x1,...,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,const
133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2695.0,121.0,3.54,3.07,9.3,110.0,5250.0,21.0,28.0,1.0
125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2778.0,151.0,3.94,3.11,9.5,143.0,5500.0,19.0,27.0,1.0


### Run the model in statsmodels and produce a results summary

In [21]:
# Instantiate model
model_train = sm.OLS(y_train, X_train_df, hasconst = True)

# Fit model
result_train = model_train.fit()

# Print results summary
result_train.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,1.0
Model:,OLS,Adj. R-squared:,0.994
Method:,Least Squares,F-statistic:,160.4
Date:,"Wed, 07 Dec 2022",Prob (F-statistic):,1.12e-06
Time:,17:46:39,Log-Likelihood:,-954.72
No. Observations:,153,AIC:,2203.0
Df Residuals:,6,BIC:,2649.0
Df Model:,146,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
CarName_alfa-romero Quadrifoglio,1.729e+04,9177.328,1.884,0.109,-5168.718,3.97e+04
CarName_alfa-romero giulia,1.802e+04,7305.488,2.467,0.049,147.879,3.59e+04
CarName_alfa-romero stelvio,2.078e+04,7240.897,2.870,0.028,3064.706,3.85e+04
CarName_audi 100ls,9080.8340,4022.385,2.258,0.065,-761.588,1.89e+04
CarName_audi 4000,9524.6261,4922.360,1.935,0.101,-2519.954,2.16e+04
CarName_audi 5000,1.1e+04,3443.168,3.196,0.019,2578.397,1.94e+04
CarName_audi 5000s (diesel),2702.3184,4876.311,0.554,0.599,-9229.584,1.46e+04
CarName_audi fox,7711.2239,4563.752,1.690,0.142,-3455.875,1.89e+04
CarName_bmw 320i,3.278e+04,7343.593,4.463,0.004,1.48e+04,5.07e+04

0,1,2,3
Omnibus:,16.961,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,66.087
Skew:,-0.004,Prob(JB):,4.46e-15
Kurtosis:,6.22,Cond. No.,9.46e+17


### Evaluate the model on the test set

In [22]:
# Instantiate model
model_test = sm.OLS(y_test, X_test_df, hasconst = True)

# Fit model
result_test = model_test.fit()

# Print results summary
result_test.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,1.0
Model:,OLS,Adj. R-squared:,0.999
Method:,Least Squares,F-statistic:,825.8
Date:,"Wed, 07 Dec 2022",Prob (F-statistic):,0.00121
Time:,17:47:23,Log-Likelihood:,-286.86
No. Observations:,52,AIC:,673.7
Df Residuals:,2,BIC:,771.3
Df Model:,49,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
CarName_alfa-romero Quadrifoglio,-1.781e-05,2.06e-06,-8.631,0.013,-2.67e-05,-8.93e-06
CarName_alfa-romero giulia,2.61e-06,3.28e-07,7.959,0.015,1.2e-06,4.02e-06
CarName_alfa-romero stelvio,1.241e-06,1.51e-07,8.239,0.014,5.93e-07,1.89e-06
CarName_audi 100ls,2.433e+04,2905.694,8.373,0.014,1.18e+04,3.68e+04
CarName_audi 4000,-5.882e-07,6.89e-08,-8.536,0.013,-8.85e-07,-2.92e-07
CarName_audi 5000,-4.693e-07,5.5e-08,-8.532,0.013,-7.06e-07,-2.33e-07
CarName_audi 5000s (diesel),-1.237e-07,1.4e-08,-8.851,0.013,-1.84e-07,-6.35e-08
CarName_audi fox,-2.082e-07,2.36e-08,-8.813,0.013,-3.1e-07,-1.07e-07
CarName_bmw 320i,1.292e+04,2352.888,5.493,0.032,2800.593,2.3e+04

0,1,2,3
Omnibus:,7.478,Durbin-Watson:,2.165
Prob(Omnibus):,0.024,Jarque-Bera (JB):,10.455
Skew:,-0.373,Prob(JB):,0.00537
Kurtosis:,5.066,Cond. No.,1e+16
