# Exercise 13

This particular Automobile Data Set includes a good mix of categorical values as well as continuous values and serves as a useful example that is relatively easy to understand. Since domain understanding is an important aspect when deciding how to encode various categorical values - this data set makes a good case study.

Read the data into Pandas

In [1]:
import pandas as pd

# Define the headers since the data does not have any
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read in the CSV file and convert "?" to NaN
df = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
                  header=None, names=headers, na_values="?" )
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [2]:
df.shape

(205, 26)

In [3]:
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [4]:
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [5]:
# Check is there's any missing value
df.isnull().sum()

symboling             0
normalized_losses    41
make                  0
fuel_type             0
aspiration            0
num_doors             2
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_cylinders         0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

In [6]:
df.loc[df.price.isnull()]

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
9,0,,audi,gas,turbo,two,hatchback,4wd,front,99.5,...,131,mpfi,3.13,3.4,7.0,160.0,5500.0,16,22,
44,1,,isuzu,gas,std,two,sedan,fwd,front,94.5,...,90,2bbl,3.03,3.11,9.6,70.0,5400.0,38,43,
45,0,,isuzu,gas,std,four,sedan,fwd,front,94.5,...,90,2bbl,3.03,3.11,9.6,70.0,5400.0,38,43,
129,1,,porsche,gas,std,two,hatchback,rwd,front,98.4,...,203,mpfi,3.94,3.11,10.0,288.0,5750.0,17,28,


In [7]:
obj_df.num_doors.value_counts().idxmax()

'four'

# Exercise 13.1

Does the database contain missing values? If so, replace them using one of the methods explained in class

In [8]:
# Check is there's any missing value
df.isnull().sum()

symboling             0
normalized_losses    41
make                  0
fuel_type             0
aspiration            0
num_doors             2
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_cylinders         0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

In [9]:
# Replace the normalized_losses with the mean
df.normalized_losses.fillna(df.normalized_losses.mean(), inplace=True)
# Replace the num_doors with the most common value
df.num_doors.fillna(df.num_doors.value_counts().idxmax, inplace=True)
# Replace the bore with the mean
df.bore.fillna(df.bore.mean(), inplace=True)
# Replace the stroke with the mean
df.stroke.fillna(df.stroke.mean(), inplace=True)
# Replace the peak_rpm with the mean
df.peak_rpm.fillna(df.peak_rpm.mean(), inplace=True)
# Replace the horsepower with the mean
df.horsepower.fillna(df.horsepower.mean(), inplace=True)
# Replace the price with the mean
df.price.fillna(df.price.mean(), inplace=True)

# Exercise 13.2

Split the data into training and testing sets

Train a Random Forest Regressor to predict the price of a car using the nominal features

In [10]:
from sklearn.model_selection import train_test_split
# Create X and Y variables
y= df["price"]
X = df[["symboling", "wheel_base", "length", "width", "height", "engine_size", "bore", "stroke",
       "compression_ratio", "horsepower", "peak_rpm", "city_mpg", "highway_mpg"]]

# Split in train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=123)

In [11]:
# Ramdom forest regressor
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()
clf.fit(X_train, y_train)
y_pred=clf.predict(X_test)
y_pred

  from numpy.core.umath_tests import inner1d


array([10175.        , 30082.42587065, 19539.3       ,  7974.23333333,
        8605.        ,  9878.35174129,  8653.2       ,  7506.23333333,
        7154.83333333, 11840.30348259,  9497.65174129, 14398.375     ,
        6075.9       ,  6682.7       , 13923.825     , 10616.5       ,
        9775.        , 10256.4       ,  6767.4       , 11239.39054726,
       14318.5       , 11156.975     ,  8431.5       ,  5704.75      ,
       16027.8       , 19129.2       , 10415.575     ,  8817.3       ,
       34961.7       ,  7276.81293532,  9739.2       , 11567.6       ,
        6529.3       , 11680.5       , 14641.43333333, 10247.41293532,
        7154.83333333,  6417.425     , 11637.3       , 15442.        ,
        6598.3       , 13991.5       , 16027.8       ,  7262.33333333,
       10425.17380952,  8172.41293532, 16318.31293532,  8651.7       ,
        8652.1       , 11567.6       , 15442.        ,  9060.7       ,
        6417.425     , 19539.3       , 15012.275     ,  7678.21293532,
      

In [12]:
#  Evaluate the model
from sklearn import metrics
import numpy as np
from sklearn.metrics import r2_score
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2:', metrics.r2_score(y_test, y_pred))

MAE: 1557.0934892240477
MSE: 4946602.95887069
RMSE: 2224.095986883365
R2: 0.8781652951563608


# Exercise 13.3

Create dummy variables for the categorical features

Train a Random Forest Regressor and compare

In [13]:
# Create the dummy variables
X2 = df[["make", "fuel_type", "aspiration", "num_doors", "body_style", "drive_wheels", 
         "engine_location", "engine_type", "fuel_system"]]
X2 = X2.applymap(str)
X2 = pd.get_dummies(X2)
X2.head()

Unnamed: 0,make_alfa-romero,make_audi,make_bmw,make_chevrolet,make_dodge,make_honda,make_isuzu,make_jaguar,make_mazda,make_mercedes-benz,...,engine_type_ohcv,engine_type_rotor,fuel_system_1bbl,fuel_system_2bbl,fuel_system_4bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi,fuel_system_spfi
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [14]:
#Merge the two df into a single one
Z = pd.concat([X.reset_index(drop=True),X2.reset_index(drop=True)], axis=1)

# Split in train and test sets
Z_train, Z_test, y_train, y_test = train_test_split(Z, y, test_size=0.33, random_state=123)

# Ramdom forest regressor
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()
clf.fit(Z_train, y_train)
y_pred=clf.predict(Z_test)
y_pred

array([ 9639.8       , 35232.2       , 20089.3       ,  8367.21293532,
        8245.7       ,  8976.12587065,  9594.8       ,  7702.43333333,
        7300.66666667,  9889.26467662,  8739.12587065, 14471.66666667,
        6123.2       ,  6984.8       , 14666.5       ,  9755.6       ,
        8805.        , 10416.3       ,  6805.        ,  9839.36467662,
       14988.51293532, 10224.1       ,  8571.4       ,  5678.1       ,
       17130.1       , 17300.1       ,  8044.3       ,  9087.        ,
       37438.8       ,  6835.4       ,  9509.8       , 11109.5       ,
        6272.1       , 11461.1       , 15361.5       , 10426.91293532,
        7300.66666667,  6614.2       , 12231.8       , 16545.5       ,
        6678.7       , 15572.5       , 17130.1       ,  7555.66666667,
       11193.175     ,  7947.6       , 16899.71293532,  9126.9       ,
        8350.8       , 11180.5       , 16545.5       ,  9398.        ,
        6366.        , 20089.3       , 14701.67587065,  7162.2       ,
      

In [15]:
#  Evaluate the model
from sklearn import metrics
import numpy as np
from sklearn.metrics import r2_score
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2:', metrics.r2_score(y_test, y_pred))

MAE: 1344.7529210564824
MSE: 4025577.991327115
RMSE: 2006.3843079846679
R2: 0.9008501166403782


# Exercise 13.4

Apply two other methods of categorical encoding

compare the results

## Binary Encoder

In [16]:
import category_encoders as ce
X2_ = ce.BinaryEncoder().fit_transform(X2, axis=1)

#Merge the two df into a single one
Z_ = pd.concat([X.reset_index(drop=True),X2_.reset_index(drop=True)], axis=1)

# Split in train and test sets
Z_train, Z_test, y_train, y_test = train_test_split(Z_, y, test_size=0.33, random_state=123)

# Ramdom forest regressor
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()
clf.fit(Z_train, y_train)
y_pred=clf.predict(Z_test)
y_pred

#  Evaluate the model
from sklearn import metrics
import numpy as np
from sklearn.metrics import r2_score
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2:', metrics.r2_score(y_test, y_pred))

MAE: 1526.075917157908
MSE: 5112973.765296178
RMSE: 2261.1885735816413
R2: 0.8740675864330241


## Herlmert Encoding

In [18]:
X3_ = ce.HelmertEncoder().fit_transform(X2, axis=1)
#Merge the two df into a single one
Z_ = pd.concat([X.reset_index(drop=True),X3_.reset_index(drop=True)], axis=1)

# Split in train and test sets
Z_train, Z_test, y_train, y_test = train_test_split(Z_, y, test_size=0.33, random_state=123)

# Ramdom forest regressor
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()
clf.fit(Z_train, y_train)
y_pred=clf.predict(Z_test)
y_pred

#  Evaluate the model
from sklearn import metrics
import numpy as np
from sklearn.metrics import r2_score
print('MAE:', metrics.mean_absolute_error(y_test, y_pred))
print('MSE:', metrics.mean_squared_error(y_test, y_pred))
print('RMSE:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))
print('R2:', metrics.r2_score(y_test, y_pred))

MAE: 1246.854488586479
MSE: 3211584.4624820687
RMSE: 1792.0894125244054
R2: 0.92089875651628


El mejor modelo es el que usó el Herlmet Encoding con un RMSE de 1792.08, el cual es mucho mejor que el modelo que utiliza variables dummy (RMSE = 1883) o el binary encoder (RMSE = 2261)