In [1]:
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype

In [2]:
# Column names for the UCI "Automobile" (imports-85) data set; the raw file has no header row
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read the file from the canonical UCI archive over HTTPS (instead of a third-party
# HTTP mirror), convert "?" to NaN, and keep only the two columns used below
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
                 header=None, names=headers, na_values="?").loc[:, ["symboling", "body_style"]]
df.head()

Unnamed: 0,symboling,body_style
0,3,convertible
1,3,convertible
2,1,hatchback
3,2,sedan
4,2,sedan


In [3]:
df['body_style'].unique()

array(['convertible', 'hatchback', 'sedan', 'wagon', 'hardtop'],
      dtype=object)

Convert the object column __body_style__ to an ordered categorical type. For label encoding, one trick we can use in pandas is to convert a column to a category and then use the category codes as the labels.

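When encoding to an ordered type, make sure that every category present in the column appears in the ordered list; any value missing from the list is silently converted to `NaN` (code `-1`).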

In [4]:
# Explicit category order: the position in this list becomes the integer code
cat_type = CategoricalDtype(['convertible', 'hardtop', 'hatchback', 'sedan', 'wagon'], ordered=True)

# astype() silently turns any value that is not listed above into NaN (code -1),
# so fail loudly if the column holds a body style we forgot to list
unlisted = set(df['body_style'].dropna().unique()) - set(cat_type.categories)
assert not unlisted, "body_style values missing from the ordered list: %s" % sorted(unlisted)

df['body_style'] = df['body_style'].astype(cat_type)
df['body_style'].unique()

[convertible, hatchback, sedan, wagon, hardtop]
Categories (5, object): [convertible < hardtop < hatchback < sedan < wagon]

Then we can assign the encoded variable to a new column using the `cat.codes` accessor, which gives us a number mapped to each of the categories.

In [5]:
# .cat.codes gives each row the integer position of its value in the category
# order defined above (convertible=0, hardtop=1, hatchback=2, sedan=3, wagon=4)
df['body_style_cat'] = df['body_style'].cat.codes

In [6]:
df['body_style_cat'].unique()

array([0, 2, 3, 4, 1], dtype=int8)

In [7]:
# One row per distinct (category, code) pair, i.e. the label-encoding lookup table
df[['body_style', 'body_style_cat']].drop_duplicates()

Unnamed: 0,body_style,body_style_cat
0,convertible,0
2,hatchback,2
3,sedan,3
7,wagon,4
69,hardtop,1


Observe that the integer mapping follows the order of the ordinal categories.

Alternatively, we can do the conversion to an ordinal category and the label encoding in a single step.

In [8]:
# Cast to the ordered categorical dtype and read off the integer codes in one chained expression
df['body_Style_label_encoded'] = (
    df['body_style']
    .astype(cat_type)
    .cat.codes
)
# Distinct (category, code) pairs produced by the one-step encoding
df.loc[:, ['body_style', 'body_Style_label_encoded']].drop_duplicates()

Unnamed: 0,body_style,body_Style_label_encoded
0,convertible,0
2,hatchback,2
3,sedan,3
7,wagon,4
69,hardtop,1


Using [OrdinalEncoder()](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html#sklearn.preprocessing.OrdinalEncoder).

In [9]:
from sklearn.preprocessing import OrdinalEncoder

In [10]:
# First ten rows of the column as a one-column DataFrame (the 2-D shape scikit-learn expects)
df[['body_style']].head(10)

Unnamed: 0,body_style
0,convertible
1,convertible
2,hatchback
3,sedan
4,sedan
5,sedan
6,sedan
7,wagon
8,sedan
9,hatchback


In [11]:
# OrdinalEncoder sorts the categories alphabetically by default, which matches the
# intended ranking here only by coincidence. Pass the order explicitly (one list per
# encoded column) so the codes follow the ordinal ranking rather than the spelling.
oe = OrdinalEncoder(categories=[list(cat_type.categories)])
# Rows 1-9 of the encoded column (the preview in the previous cell starts at row 0)
oe.fit_transform(df[['body_style']])[1:10]

array([[0.],
       [2.],
       [3.],
       [3.],
       [3.],
       [3.],
       [4.],
       [3.],
       [2.]])

Reference:
1. [Guide to Encoding Categorical Values in Python](https://pbpython.com/categorical-encoding.html)
2. [Using The Pandas Category Data Type](https://pbpython.com/pandas_dtypes_cat.html)

In [12]:
# Drop the objects from the encoding demo before the pipeline section reuses the name `df`
del df, cat_type, oe, headers

### Reasons for using pipeline

1. It allows us to properly cross-validate an entire process (pre-processing plus model) instead of just a model.
2. We can run a grid search or randomized search over a pipeline to tune the parameters of both the model and the pre-processing steps, either individually or together. E.g., along with the model parameters we can search for the imputation method that gives the best result.

In [13]:
# NOTE(review): presumably the Kaggle Titanic training set (the preview shows
# PassengerId, Survived, Pclass, ...). It is fetched through a URL shortener over
# plain HTTP; consider pinning the final address or a local copy.
df = pd.read_csv('http://bit.ly/kaggletrain')
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [14]:
df.shape

(891, 12)

In [15]:
# Discard the rows without an embarkation port, then keep the target and the three features used below
df = df.dropna(subset=['Embarked'])[['Survived', 'Pclass', 'Sex', 'Embarked']]

In [16]:
df.shape

(889, 4)

In [17]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,male,S
1,1,1,female,C
2,1,3,female,S
3,1,1,female,S
4,0,3,male,S


In [18]:
# Baseline: a single feature kept as a one-column DataFrame (2-D), target as a Series (1-D)
X = df[['Pclass']]
y = df['Survived']

In [19]:
X.shape

(889, 1)

In [20]:
y.shape

(889,)

In [21]:
from sklearn.linear_model import LogisticRegression

In [22]:
# NOTE(review): the solver is presumably named explicitly to pin the optimizer
# regardless of the installed scikit-learn's default; confirm.
logreg = LogisticRegression(solver = 'lbfgs')

In [23]:
from sklearn.model_selection import cross_val_score

In [24]:
# Mean accuracy over the 5 cross-validation folds for the single-feature model
np.mean(cross_val_score(logreg, X, y, cv=5, scoring='accuracy'))

0.6783406335301212

In [25]:
# Class proportions of the target: the majority-class share is the accuracy obtained
# by always predicting that class, i.e. the baseline for the score in the previous cell
y.value_counts(normalize = True)

0    0.617548
1    0.382452
Name: Survived, dtype: float64

If we need to add more features to our model and still cross-validate it properly, we can do that using pipelines.

In [26]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Embarked
0,0,3,male,S
1,1,1,female,C
2,1,3,female,S
3,1,1,female,S
4,0,3,male,S


In [27]:
from sklearn.preprocessing import OneHotEncoder
# Ask for a dense array so the encoded result is readable when displayed.
# The keyword was renamed: `sparse_output` exists from scikit-learn 1.2 on, and the
# old `sparse` keyword was removed in 1.4. Try the new name first and fall back so
# the cell runs on both old and new releases.
try:
    ohe = OneHotEncoder(sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(sparse=False)

In [28]:
ohe.fit_transform(df[['Embarked']])

array([[0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       ...,
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.]])

In [29]:
ohe.categories_

[array(['C', 'Q', 'S'], dtype=object)]

Defining X as 3 features here

In [30]:
# Every column except the target becomes a feature: Pclass, Sex and Embarked
X = df.drop(columns='Survived')
X.head()

Unnamed: 0,Pclass,Sex,Embarked
0,3,male,S
1,1,female,C
2,3,female,S
3,1,female,S
4,3,male,S


In [31]:
from sklearn.compose import make_column_transformer

Use `make_column_transformer` when your dataframe has features that require different pre-processing (such as one-hot encoding here).

In [32]:
columns_trans = make_column_transformer(
    # one-hot encode the two text columns
    (OneHotEncoder(), ['Sex', 'Embarked']),
    # forward every remaining column (here Pclass) untouched
    remainder='passthrough',
)

In [33]:
columns_trans.fit_transform(X)

array([[0., 1., 0., 0., 1., 3.],
       [1., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 1., 3.],
       ...,
       [1., 0., 0., 0., 1., 3.],
       [0., 1., 1., 0., 0., 1.],
       [0., 1., 0., 1., 0., 3.]])

In [34]:
from sklearn.pipeline import make_pipeline

In [35]:
# Bundle the column transformer and the classifier into a single estimator so that
# the whole process (pre-processing + model) can be passed to cross_val_score
pipe = make_pipeline(columns_trans, logreg)

Pass entire pipeline to cross_val_score

In [36]:
# Mean 5-fold accuracy of the full pipeline (encoding + logistic regression)
np.mean(cross_val_score(pipe, X, y, cv=5, scoring='accuracy'))

0.7727924839713071

So our accuracy improved from 0.678 to 0.773.

Now, to avoid the dummy variable trap (one dummy column being fully determined by the others), we drop one column from the set of dummy variables created for each categorical column.
#### Avoiding the dummy variable trap

In [37]:
# drop='first' removes the first level of each encoded feature, leaving k-1 dummy
# columns per categorical column so that no dummy is implied by the others
columns_trans_2 = make_column_transformer(
    (OneHotEncoder(drop='first'), ['Sex', 'Embarked']),
    remainder='passthrough',
)
# make_pipeline is already imported earlier in the notebook, so it is not re-imported here
pipe2 = make_pipeline(columns_trans_2, logreg)
# Mean 5-fold accuracy of the reduced-dummy pipeline
cross_val_score(pipe2, X, y, cv=5, scoring='accuracy').mean()

0.7727924839713071

In [38]:
columns_trans_2.fit_transform(X)

array([[1., 0., 1., 3.],
       [0., 0., 0., 1.],
       [0., 0., 1., 3.],
       ...,
       [0., 0., 1., 3.],
       [1., 0., 0., 1.],
       [1., 1., 0., 3.]])

1. What happens when we run this line of code `cross_val_score(pipe2, X, y, cv = 5, scoring = 'accuracy').mean()`?

We will be cross-validating a pipeline of steps that includes both the pre-processing of the data and the model building.

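`cross_val_score` will split X and y into 5 folds, fit the whole pipeline (pre-processing and model) on the training part of each split, score it on the held-out part, and we then take the mean of the 5 accuracy scores.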

In [39]:
# Five reproducibly sampled passengers that stand in for "new" data to predict on
X_new = X.sample(n=5, random_state=99)
X_new

Unnamed: 0,Pclass,Sex,Embarked
599,1,male,C
512,1,male,S
273,1,male,C
215,1,female,C
790,3,male,Q


In [40]:
# Fit the encoder and the classifier on the full data set in one call.
# NOTE(review): pipe2 was built from the same `logreg` object as `pipe`, so this
# presumably also refits the estimator held by `pipe`; confirm if `pipe` is reused.
pipe2.fit(X,y)

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='passthrough',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories='auto',
                                                                drop='first',
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['Sex', 'Embarked'])],
                                   verbose=False)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                      

`pipe.fit()` is like `model.fit()`, except that it runs the pre-processing steps (such as standardizing and encoding) as well as the modelling step.

In [41]:
pipe2.predict(X_new)

array([1, 0, 1, 1, 0], dtype=int64)

In [42]:
df.isna().sum()

Survived    0
Pclass      0
Sex         0
Embarked    0
dtype: int64

Reference:

- [Smarter Ways to Encode Categorical Data for Machine Learning](https://towardsdatascience.com/smarter-ways-to-encode-categorical-data-for-machine-learning-part-1-of-3-6dca2f71b159)
- [All about Categorical Variable Encoding](https://towardsdatascience.com/all-about-categorical-variable-encoding-305f3361fd02)
