In [2]:
import pandas as pd
import numpy as np

# Pre-processing

In [3]:
# The raw data file has no header row, so the 26 column names are listed here
# in file order, grouped by theme for readability.
headers = [
    # risk rating and identity
    "symboling", "normalized_losses", "make",
    # configuration
    "fuel_type", "aspiration", "num_doors", "body_style", "drive_wheels",
    "engine_location",
    # dimensions and weight
    "wheel_base", "length", "width", "height", "curb_weight",
    # engine
    "engine_type", "num_cylinders", "engine_size", "fuel_system",
    "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
    # economy and price
    "city_mpg", "highway_mpg", "price",
]

In [4]:
# Load the header-less CSV straight from the UCI archive: column names come
# from `headers`, and every "?" placeholder is parsed as NaN.
df = pd.read_csv(
    "https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
    names=headers,
    header=None,
    na_values="?",
)

In [5]:
# Inspect the dtype pandas inferred for every column of the loaded frame.
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [6]:
# Work on an independent copy holding only the object (string) columns,
# so the encoding experiments below never touch `df`.
obj_df = df.select_dtypes(include="object").copy()
obj_df.head(n=5)

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [7]:
# Show only the rows that have a missing value in at least one column.
obj_df.loc[obj_df.isna().any(axis="columns")]

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
27,dodge,gas,turbo,,sedan,fwd,front,ohc,four,mpfi
63,mazda,diesel,std,,sedan,fwd,front,ohc,four,idi


In [8]:
# Frequency of each num_doors value; used to choose a fill value for the missing rows.
obj_df["num_doors"].value_counts()

four    114
two      89
Name: num_doors, dtype: int64

In [9]:
# Fill the missing num_doors entries with "four", the most frequent value
# in the counts above; only that column is targeted.
obj_df = obj_df.fillna(value={"num_doors": "four"})

In [10]:
# Display the frame after the fill (the rendered output is abbreviated by pandas).
obj_df

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi
...,...,...,...,...,...,...,...,...,...,...
200,volvo,gas,std,four,sedan,rwd,front,ohc,four,mpfi
201,volvo,gas,turbo,four,sedan,rwd,front,ohc,four,mpfi
202,volvo,gas,std,four,sedan,rwd,front,ohcv,six,mpfi
203,volvo,diesel,turbo,four,sedan,rwd,front,ohc,six,idi


# Find and replace

In [11]:
# List the spelled-out cylinder counts so the word-to-number mapping can cover each one.
obj_df["num_cylinders"].value_counts()

four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: num_cylinders, dtype: int64

In [12]:
# Per-column lookup from the spelled-out count to its integer value.
cleanup_nums = {
    "num_doors": {
        "two": 2,
        "four": 4,
    },
    "num_cylinders": {
        "two": 2,
        "three": 3,
        "four": 4,
        "five": 5,
        "six": 6,
        "eight": 8,
        "twelve": 12,
    },
}

In [13]:
# Swap the spelled-out counts for integers using the per-column lookup.
# The result is assigned back instead of using inplace=True, and the integer
# dtype is pinned explicitly: relying on replace() to silently downcast the
# object columns is deprecated in recent pandas releases, so without astype
# the columns could stay `object` even though they hold numbers.
# astype raises if a mapped column still contains an unmapped word or NaN,
# which surfaces an incomplete mapping instead of hiding it.
obj_df = obj_df.replace(cleanup_nums).astype({col: "int64" for col in cleanup_nums})

In [14]:
# Confirm that num_doors and num_cylinders are now integer columns.
obj_df.dtypes

make               object
fuel_type          object
aspiration         object
num_doors           int64
body_style         object
drive_wheels       object
engine_location    object
engine_type        object
num_cylinders       int64
fuel_system        object
dtype: object

# Label encoding

Convert each value in a column to a number.

Shortcut: Convert column to 'category' then do the label encoding to it

In [None]:
# Cast body_style to the 'category' dtype so its values can be label-encoded via the .cat accessor.
obj_df['body_style'] = obj_df['body_style'].astype('category')

In [None]:
# Check that body_style now reports the category dtype.
obj_df.dtypes

make                 object
fuel_type            object
aspiration           object
num_doors            object
body_style         category
drive_wheels         object
engine_location      object
engine_type          object
num_cylinders        object
fuel_system          object
dtype: object

Assign encoded variable to a new column using cat.codes

In [None]:
# Store the integer code of each body_style category in a new column.
# This uses the .cat accessor, so it relies on body_style having been cast
# to 'category' in the earlier cell.
obj_df['body_style_cat'] = obj_df['body_style'].cat.codes
# Preview the first rows to compare body_style with its code.
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system,body_style_cat
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi,0
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi,0
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi,2
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi,3
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi,3


Disadvantage: The numeric codes can be misinterpreted as having an order that follows their values (i.e., 0 < 1 < 2, etc.), even though the categories themselves are unordered.

# One-hot coding

get_dummies()

In [15]:
# One-hot encode drive_wheels: the column is replaced by one indicator column
# per distinct value (drive_wheels_4wd / _fwd / _rwd).
# dtype=int keeps the indicators as 0/1 integers; pandas 2.0+ would otherwise
# return True/False booleans.
pd.get_dummies(obj_df, columns=["drive_wheels"], dtype=int).head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd
0,alfa-romero,gas,std,2,convertible,front,dohc,4,mpfi,0,0,1
1,alfa-romero,gas,std,2,convertible,front,dohc,4,mpfi,0,0,1
2,alfa-romero,gas,std,2,hatchback,front,ohcv,6,mpfi,0,0,1
3,audi,gas,std,4,sedan,front,ohc,4,mpfi,0,1,0
4,audi,gas,std,4,sedan,front,ohc,5,mpfi,1,0,0


Disadvantage: The number of columns increases in proportion to the number of unique values in the encoded column.

# Custom binary coding

If we need to only filter for a broad category of input values in a column based on a common pattern:

In [16]:
# Tally how often each engine_type value occurs before deriving a binary flag from it.
obj_df['engine_type'].value_counts()

ohc      148
ohcf      15
ohcv      13
l         12
dohc      12
rotor      4
dohcv      1
Name: engine_type, dtype: int64

In [17]:
# Binary flag: 1 when engine_type contains the substring "ohc", else 0.
# na=False makes a missing engine_type count as "no match"; without it
# str.contains yields NaN for that row, which np.where treats as truthy and
# would code as 1. regex=False states that "ohc" is a plain substring.
obj_df['OHC_Code'] = np.where(
    obj_df['engine_type'].str.contains('ohc', na=False, regex=False), 1, 0
)

In [18]:
# Preview the new flag next to the columns it was derived from.
obj_df.loc[:, ["make", "engine_type", "OHC_Code"]].head(n=5)

Unnamed: 0,make,engine_type,OHC_Code
0,alfa-romero,dohc,1
1,alfa-romero,dohc,1
2,alfa-romero,ohcv,1
3,audi,ohc,1
4,audi,ohc,1


# Using Scikit-learn

LabelEncoder

In [19]:
from sklearn.preprocessing import LabelEncoder

In [20]:
# Fit the encoder on the make column, then store the integer label of each
# make in a new column.
lb_make = LabelEncoder().fit(obj_df["make"])
obj_df["make_code"] = lb_make.transform(obj_df["make"])
# Show each make beside its assigned code.
obj_df.loc[:, ["make", "make_code"]].head(n=5)

Unnamed: 0,make,make_code
0,alfa-romero,0
1,alfa-romero,0
2,alfa-romero,0
3,audi,1
4,audi,1


LabelBinarizer

In [21]:
from sklearn.preprocessing import LabelBinarizer

In [22]:
# Fit the binarizer on body_style, then build one indicator column per class.
lb_style = LabelBinarizer().fit(obj_df["body_style"])
lb_results = lb_style.transform(obj_df["body_style"])
# Wrap the indicator matrix in a frame labelled with the learned class names.
pd.DataFrame(data=lb_results, columns=lb_style.classes_).head(n=5)

Unnamed: 0,convertible,hardtop,hatchback,sedan,wagon
0,1,0,0,0,0
1,1,0,0,0,0
2,0,0,1,0,0
3,0,0,0,1,0
4,0,0,0,1,0
