# Exercise 13

This particular Automobile Data Set includes a good mix of categorical values as well as continuous values and serves as a useful example that is relatively easy to understand. Since domain understanding is an important aspect when deciding how to encode various categorical values - this data set makes a good case study.

Read the data into Pandas

In [1]:
import pandas as pd

# Define the headers since the data does not have any
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read in the CSV file and convert "?" to NaN
df = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
                  header=None, names=headers, na_values="?" )
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [2]:
df.shape

(205, 26)

In [3]:
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [4]:
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


# Exercise 13.1

Does the database contain missing values? If so, replace them using one of the methods explained in class

In [5]:
# de los 205 registros 41 datos están en nulo para el campo Normalized_losses
def quick_analysis(df):
 print('Data Types:')
 print(df.dtypes)
 print('Rows and Columns:')
 print(df.shape)
 print('Column Names:')
 print(df.columns)
 print('Null Values:')
 print(df.apply(lambda x: sum(x.isnull()) / len(df)))

quick_analysis(df)

Data Types:
symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object
Rows and Columns:
(205, 26)
Column Names:
Index(['symboling', 'normalized_losses', 'make', 'fuel_type', 'aspiration',
       'num_doors', 'body_style', 'drive_wheels', 'engine_location',
       'wheel_base', 'length', 'w

In [6]:
#se usará la media de esta variable para completar los missing values
df.normalized_losses.describe()

count    164.000000
mean     122.000000
std       35.442168
min       65.000000
25%       94.000000
50%      115.000000
75%      150.000000
max      256.000000
Name: normalized_losses, dtype: float64

In [7]:
df.normalized_losses.fillna(df.normalized_losses.median(), inplace=True)
df.normalized_losses.describe()

count    205.000000
mean     120.600000
std       31.805105
min       65.000000
25%      101.000000
50%      115.000000
75%      137.000000
max      256.000000
Name: normalized_losses, dtype: float64

In [8]:
#los demás reigistros nulos se borraran del df original

df.dropna(inplace=True)
df.isnull().sum()
df.shape

(193, 26)

# Exercise 13.2

Split the data into training and testing sets

Train a Random Forest Regressor to predict the price of a car using the nominal features

In [9]:
#defining X and y
X = df.select_dtypes(exclude=object)
feature_columns = X.columns
feature_columns = feature_columns[:-1]
X = X[feature_columns]
#X = df['make']
y = df.price

#train and test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)

#train random forest regressor
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state=1)

rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)

from sklearn.metrics import mean_squared_error
import numpy as np
print(np.sqrt(mean_squared_error(y_predict,y_test)))


2113.720067199543


  from numpy.core.umath_tests import inner1d


# Exercise 13.3

Create dummy variables for the categorical features

Train a Random Forest Regressor and compare

In [10]:
#print(df.select_dtypes(include=object).head())
 
df_categorical = df.select_dtypes(include=object)
df_categorical.head()
columns = df_categorical.columns

#for column in columns:
 # print(df_categorical[column].unique())

In [11]:
df_dummies = pd.get_dummies(df_categorical)
df_dummies.shape

(193, 55)

In [12]:
X_dummies = df_dummies
y = df.price
X_dummies.head()

Unnamed: 0,make_alfa-romero,make_audi,make_bmw,make_chevrolet,make_dodge,make_honda,make_isuzu,make_jaguar,make_mazda,make_mercedes-benz,...,num_cylinders_six,num_cylinders_three,num_cylinders_twelve,fuel_system_1bbl,fuel_system_2bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi,fuel_system_spfi
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
3,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [13]:
from sklearn.decomposition import PCA
X =  pd.concat([X, X_dummies], axis=1)
X = PCA(n_components=8).fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)

#Train random regressor for categoricals + dummies df

rf = RandomForestRegressor(random_state=1)

rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)

print(np.sqrt(mean_squared_error(y_predict,y_test)))

3256.055328248496


# Exercise 13.4

Apply two other methods of categorical encoding

compare the results

## BinaryEncoder
Se usa el Binary Enconder en conjunto con las variable numéricas 

In [14]:
import category_encoders as ce
X_ = df.select_dtypes(include=object)
X_ = ce.BinaryEncoder().fit_transform(X_)
X_continuos = df.select_dtypes(exclude=object)
X_continuos = X_continuos.drop(['price'], axis = 1)
X = pd.concat([X_continuos ,X_],axis =1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)

#Train random regressor for categoricasl encoded using Binary

rf = RandomForestRegressor(random_state=1)

rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)

print("RMSE codificando las variables categóricas con BinaryEncoder en conjunto con las numéricas")
print(np.sqrt(mean_squared_error(y_predict,y_test)))

RMSE codificando las variables categóricas con BinaryEncoder en conjunto con las numéricas
2243.1446252780474


Se usa el mismo encoder pero se entrena utilizando únicamente las variables categóricas

In [15]:
import category_encoders as ce
X_ = df.select_dtypes(include=object)
X_ = ce.BinaryEncoder().fit_transform(X_)

X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.3,random_state=1)

#Train random regressor for categoricasl encoded using Binary

rf = RandomForestRegressor(random_state=1)

rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)

print("RMSE usando variables categóricas y BinaryEncoder")
print(np.sqrt(mean_squared_error(y_predict,y_test)))

RMSE usando variables categóricas y BinaryEncoder
3764.957384143284


## HashingEncoder

In [16]:
X_ = df.select_dtypes(include=object)
X_ = ce.HashingEncoder(n_components=8).fit_transform(X_)

X_continuos = df.select_dtypes(exclude=object)
X_continuos = X_continuos.drop(['price'], axis = 1)
X = pd.concat([X_continuos ,X_],axis =1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,random_state=1)

#Train random regressor for categoricals using HashingEncoder

rf = RandomForestRegressor(random_state=1)

rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)

print("RMSE codificando las variables categóricas con HashingEncoder en conjunto con las variables numéricas")
print(np.sqrt(mean_squared_error(y_predict,y_test)))

RMSE codificando las variables categóricas con HashingEncoder en conjunto con las variables numéricas
2387.9515060731032


In [17]:
X_ = df.select_dtypes(include=object)
X_ = ce.HashingEncoder(n_components=8).fit_transform(X_)


X_train, X_test, y_train, y_test = train_test_split(X_, y, test_size=0.3,random_state=1)

#Train random regressor for categoricals using HashingEncoder

rf = RandomForestRegressor(random_state=1)

rf.fit(X_train,y_train)
y_predict = rf.predict(X_test)

print("RMSE codificando las variables categóricas con HashingEncoder en conjunto con las variables numéricas")
print(np.sqrt(mean_squared_error(y_predict,y_test)))

RMSE codificando las variables categóricas con HashingEncoder en conjunto con las variables numéricas
4107.035056359665
