# 3. FEATURE ENGINEERING AND MODEL TRAINING
---

In [1]:
import pandas as pd
import numpy as np
# Notebook-wide pandas display settings: show up to 99 columns / 999 rows and
# render floats with 3 decimal places.
# The fully qualified option names are used on purpose: the abbreviated
# 'precision' key is ambiguous in recent pandas releases and raises OptionError.
pd.set_option("display.max_columns", 99)
pd.set_option("display.max_rows", 999)
pd.set_option("display.precision", 3)

# Read the car dataset from disk, report its dimensions, then preview the first rows.
cars_path = 'data/imports-85.cars2'
cars = pd.read_csv(filepath_or_buffer=cars_path)
print(cars.shape)
cars.head()

(204, 26)


Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,length,width,height,curb_weight,engine_type,num_cylinders,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,138.0,alfa-romero,gas,std,2,convertible,rwd,front,88.6,168.8,64.1,48.8,2548.0,dohc,4,130.0,mpfi,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,16500.0
1,1,138.5,alfa-romero,gas,std,2,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823.0,ohcv,6,152.0,mpfi,2.68,3.47,9.0,154.0,5000.0,19.0,26.0,16500.0
2,2,164.0,audi,gas,std,4,sedan,fwd,front,99.8,176.6,66.2,54.3,2337.0,ohc,4,109.0,mpfi,3.19,3.4,10.0,102.0,5500.0,24.0,30.0,13950.0
3,2,164.0,audi,gas,std,4,sedan,4wd,front,99.4,176.6,66.4,54.3,2824.0,ohc,5,136.0,mpfi,3.19,3.4,8.0,115.0,5500.0,18.0,22.0,17450.0
4,2,143.545,audi,gas,std,2,sedan,fwd,front,99.8,177.3,66.3,53.1,2507.0,ohc,5,136.0,mpfi,3.19,3.4,8.5,110.0,5500.0,19.0,25.0,15250.0


## 1. Encoding Categorical Attributes

We will one-hot encode the categorical attributes with pandas' `get_dummies`, since it also generates the new column names for us.

In [2]:
# Attributes to be one-hot encoded. `symboling` holds integer codes but is
# listed here, so it is treated as a category rather than as a number.
cat_cols = [
    'symboling',
    'make',
    'fuel_type',
    'aspiration',
    'body_style',
    'drive_wheels',
    'engine_location',
    'engine_type',
    'fuel_system',
]
# Preview of the categorical columns only.
cars_cat = cars.loc[:, cat_cols]
cars_cat.head()

Unnamed: 0,symboling,make,fuel_type,aspiration,body_style,drive_wheels,engine_location,engine_type,fuel_system
0,3,alfa-romero,gas,std,convertible,rwd,front,dohc,mpfi
1,1,alfa-romero,gas,std,hatchback,rwd,front,ohcv,mpfi
2,2,audi,gas,std,sedan,fwd,front,ohc,mpfi
3,2,audi,gas,std,sedan,4wd,front,ohc,mpfi
4,2,audi,gas,std,sedan,fwd,front,ohc,mpfi


In [5]:
def make_dummies(df, column):
    """Replace one categorical column with its one-hot encoded indicator columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``column``. It is not modified; a new frame is returned.
    column : str
        Name of the column to encode.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``column``, followed by one ``<column>_<value>`` indicator
        column for each distinct non-missing value of ``column``.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    # Pin the indicator dtype explicitly: newer pandas releases default to bool
    # (True/False), while older ones produced uint8. Fixing it to uint8 keeps the
    # encoded features as 0/1 integers regardless of the installed version.
    dummies = pd.get_dummies(df[column], prefix=column, dtype='uint8')
    remaining = df.drop(columns=[column])
    # Keep the untouched columns first and append the indicators at the end.
    return pd.concat([remaining, dummies], axis=1)
# One-hot encode every categorical attribute, rebinding `cars` after each pass.
# A column is skipped only when it is gone AND indicator columns carrying its
# prefix already exist, so re-running this cell is a no-op instead of a KeyError,
# while a misspelled name in `cat_cols` still fails loudly inside `make_dummies`.
for column in cat_cols:
    prefix = column + '_'
    already_encoded = column not in cars.columns and any(
        str(name).startswith(prefix) for name in cars.columns
    )
    if not already_encoded:
        cars = make_dummies(cars, column)

print(cars.shape)
cars.head()

(204, 74)


Unnamed: 0,normalized_losses,num_doors,wheel_base,length,width,height,curb_weight,num_cylinders,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price,symboling_-2,symboling_-1,symboling_0,symboling_1,symboling_2,symboling_3,make_alfa-romero,make_audi,make_bmw,make_chevrolet,make_dodge,make_honda,make_isuzu,make_jaguar,make_mazda,make_mercedes-benz,make_mercury,make_mitsubishi,make_nissan,make_peugot,make_plymouth,make_porsche,make_renault,make_saab,make_subaru,make_toyota,make_volkswagen,make_volvo,fuel_type_diesel,fuel_type_gas,aspiration_std,aspiration_turbo,body_style_convertible,body_style_hardtop,body_style_hatchback,body_style_sedan,body_style_wagon,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd,engine_location_front,engine_location_rear,engine_type_dohc,engine_type_dohcv,engine_type_l,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_rotor,fuel_system_1bbl,fuel_system_2bbl,fuel_system_4bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi,fuel_system_spfi
0,138.0,2,88.6,168.8,64.1,48.8,2548.0,4,130.0,3.47,2.68,9.0,111.0,5000.0,21.0,27.0,16500.0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,138.5,2,94.5,171.2,65.5,52.4,2823.0,6,152.0,2.68,3.47,9.0,154.0,5000.0,19.0,26.0,16500.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
2,164.0,4,99.8,176.6,66.2,54.3,2337.0,4,109.0,3.19,3.4,10.0,102.0,5500.0,24.0,30.0,13950.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
3,164.0,4,99.4,176.6,66.4,54.3,2824.0,5,136.0,3.19,3.4,8.0,115.0,5500.0,18.0,22.0,17450.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
4,143.545,2,99.8,177.3,66.3,53.1,2507.0,5,136.0,3.19,3.4,8.5,110.0,5500.0,19.0,25.0,15250.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0


## 2. Scaling Numerical Attributes
Here we will use `MinMaxScaler` to scale each feature to a range between 0 and 1, which will fit in nicely with our one-hot-encoded categorical features whose values are either 0 or 1.

In [6]:
# Attributes to be rescaled as numbers; `price` is not in this list.
num_cols = [
    'normalized_losses', 'num_doors',
    'wheel_base', 'length', 'width', 'height', 'curb_weight',
    'num_cylinders', 'engine_size', 'bore', 'stroke',
    'compression_ratio', 'horsepower', 'peak_rpm',
    'city_mpg', 'highway_mpg',
]
# Preview of the numerical columns only.
cars_num = cars.loc[:, num_cols]
cars_num.head()

Unnamed: 0,normalized_losses,num_doors,wheel_base,length,width,height,curb_weight,num_cylinders,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg
0,138.0,2,88.6,168.8,64.1,48.8,2548.0,4,130.0,3.47,2.68,9.0,111.0,5000.0,21.0,27.0
1,138.5,2,94.5,171.2,65.5,52.4,2823.0,6,152.0,2.68,3.47,9.0,154.0,5000.0,19.0,26.0
2,164.0,4,99.8,176.6,66.2,54.3,2337.0,4,109.0,3.19,3.4,10.0,102.0,5500.0,24.0,30.0
3,164.0,4,99.4,176.6,66.4,54.3,2824.0,5,136.0,3.19,3.4,8.0,115.0,5500.0,18.0,22.0
4,143.545,2,99.8,177.3,66.3,53.1,2507.0,5,136.0,3.19,3.4,8.5,110.0,5500.0,19.0,25.0


In [10]:
from sklearn.preprocessing import MinMaxScaler

# Rescale every numerical attribute with MinMaxScaler's default settings.
# NOTE(review): the scaler is fitted on all rows of `cars_num`; if a train/test
# split happens later, consider fitting on the training rows only.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(cars_num)

# `fit_transform` hands back a plain array, so rebuild a DataFrame with the
# original column names AND the original row index. Keeping the index matters
# because assigning a DataFrame into `cars[num_cols]` aligns rows by index
# label; a fresh 0..n-1 index would silently produce NaNs whenever `cars` does
# not carry the default index.
cars_scaled = pd.DataFrame(scaled_values, columns=num_cols, index=cars_num.index)
cars_scaled.head()

Unnamed: 0,normalized_losses,num_doors,wheel_base,length,width,height,curb_weight,num_cylinders,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg
0,0.382,0.0,0.058,0.413,0.317,0.083,0.411,0.2,0.26,0.664,0.29,0.125,0.262,0.347,0.222,0.289
1,0.385,0.0,0.23,0.449,0.433,0.383,0.518,0.4,0.343,0.1,0.667,0.125,0.442,0.347,0.167,0.263
2,0.518,1.0,0.385,0.53,0.492,0.542,0.329,0.2,0.181,0.464,0.633,0.188,0.225,0.551,0.306,0.368
3,0.518,1.0,0.373,0.53,0.508,0.542,0.518,0.3,0.283,0.464,0.633,0.062,0.279,0.551,0.139,0.158
4,0.411,0.0,0.385,0.54,0.5,0.442,0.395,0.3,0.283,0.464,0.633,0.094,0.258,0.551,0.167,0.237


In [12]:
# Summary statistics of the scaled features, one row per feature.
cars_scaled.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
normalized_losses,204.0,0.298,0.172,0.0,0.152,0.273,0.411,1.0
num_doors,204.0,0.569,0.496,0.0,0.0,1.0,1.0,1.0
wheel_base,204.0,0.356,0.175,0.0,0.23,0.303,0.461,1.0
length,204.0,0.492,0.185,0.0,0.376,0.479,0.628,1.0
width,204.0,0.468,0.179,0.0,0.315,0.433,0.55,1.0
height,204.0,0.496,0.202,0.0,0.35,0.525,0.642,1.0
curb_weight,204.0,0.414,0.202,0.0,0.255,0.359,0.563,1.0
num_cylinders,204.0,0.238,0.108,0.0,0.2,0.2,0.2,1.0
engine_size,204.0,0.249,0.158,0.0,0.136,0.221,0.306,1.0
bore,204.0,0.563,0.194,0.0,0.436,0.55,0.745,1.0


Let's now assign the scaled numerical features back to our `cars` dataframe. 

In [14]:
# Overwrite the raw numerical columns of `cars` with their scaled counterparts.
# pandas aligns the rows of the assigned frame by index label, so this relies on
# `cars_scaled` and `cars` sharing the same index.
cars[num_cols] = cars_scaled
# Existing columns are replaced rather than added, so the shape stays the same.
print(cars.shape)
cars.head()

(204, 74)


Unnamed: 0,normalized_losses,num_doors,wheel_base,length,width,height,curb_weight,num_cylinders,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price,symboling_-2,symboling_-1,symboling_0,symboling_1,symboling_2,symboling_3,make_alfa-romero,make_audi,make_bmw,make_chevrolet,make_dodge,make_honda,make_isuzu,make_jaguar,make_mazda,make_mercedes-benz,make_mercury,make_mitsubishi,make_nissan,make_peugot,make_plymouth,make_porsche,make_renault,make_saab,make_subaru,make_toyota,make_volkswagen,make_volvo,fuel_type_diesel,fuel_type_gas,aspiration_std,aspiration_turbo,body_style_convertible,body_style_hardtop,body_style_hatchback,body_style_sedan,body_style_wagon,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd,engine_location_front,engine_location_rear,engine_type_dohc,engine_type_dohcv,engine_type_l,engine_type_ohc,engine_type_ohcf,engine_type_ohcv,engine_type_rotor,fuel_system_1bbl,fuel_system_2bbl,fuel_system_4bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi,fuel_system_spfi
0,0.382,0.0,0.058,0.413,0.317,0.083,0.411,0.2,0.26,0.664,0.29,0.125,0.262,0.347,0.222,0.289,16500.0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
1,0.385,0.0,0.23,0.449,0.433,0.383,0.518,0.4,0.343,0.1,0.667,0.125,0.442,0.347,0.167,0.263,16500.0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
2,0.518,1.0,0.385,0.53,0.492,0.542,0.329,0.2,0.181,0.464,0.633,0.188,0.225,0.551,0.306,0.368,13950.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
3,0.518,1.0,0.373,0.53,0.508,0.542,0.518,0.3,0.283,0.464,0.633,0.062,0.279,0.551,0.139,0.158,17450.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
4,0.411,0.0,0.385,0.54,0.5,0.442,0.395,0.3,0.283,0.464,0.633,0.094,0.258,0.551,0.167,0.237,15250.0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0


In [20]:
# Sanity check: per-feature spread (max minus min) after min-max scaling.
cars_scaled.max().sub(cars_scaled.min())

normalized_losses    1.0
num_doors            1.0
wheel_base           1.0
length               1.0
width                1.0
height               1.0
curb_weight          1.0
num_cylinders        1.0
engine_size          1.0
bore                 1.0
stroke               1.0
compression_ratio    1.0
horsepower           1.0
peak_rpm             1.0
city_mpg             1.0
highway_mpg          1.0
dtype: float64