### OneHotEncoder vs. OrdinalEncoder

In [3]:
import pandas as pd
import numpy as np

In [67]:
# Read the training data from the local CSV file and preview the first rows.
df_train = pd.read_csv('data/train.csv')
df_train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [84]:
# Separate the target column ('Survived') from the feature columns.
y = df_train['Survived']
X = df_train.drop(columns=['Survived'])
X.shape, y.shape

((891, 11), (891,))

In [85]:
# Count the missing values in each feature column.
X.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [86]:
# Impute missing values: 'Age' with the column mean, 'Embarked' with the most
# frequent value (mode).
# The filled columns are assigned back instead of calling
# X['col'].fillna(..., inplace=True): that chained in-place call operates on a
# temporary Series, which raises a FutureWarning in pandas 2.x and leaves X
# unchanged once Copy-on-Write is active.
X['Age'] = X['Age'].fillna(X['Age'].mean())
X['Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0])

In [87]:
# Re-check the per-column missing-value counts after the imputation.
X.isna().sum()

PassengerId      0
Pclass           0
Name             0
Sex              0
Age              0
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         0
dtype: int64

#### OneHotEncoding

In [88]:
# Columns passed through unchanged as numeric features.
numerical_variables = [
    'Age',
    'Fare',
    'SibSp',
    'Parch',
]

# Columns that are handed to the encoders.
categorial_variables = [
    'Sex',
    'Embarked',
    'Pclass',
]

In [92]:
from sklearn.preprocessing import OneHotEncoder

# Learn the categories of every categorical column first, then encode the same
# columns (two explicit steps instead of a single fit_transform call).
encoder = OneHotEncoder().fit(X[categorial_variables])
X_categories = encoder.transform(X[categorial_variables])

In [120]:
# Convert the encoded matrix to a dense array and preview its first rows.
pd.DataFrame(X_categories.toarray()).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
3,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0


OneHotEncoder creates a new column for every distinct categorical value. Ordinal information is not taken into account.

In [104]:
# Compare the shapes of the numeric feature values and the encoded categories.
X[numerical_variables].values.shape, X_categories.shape

((891, 4), (891, 8))

In [110]:
# Join the numeric columns and the dense one-hot columns side by side.
# NOTE: this rebinds X from the DataFrame to a plain ndarray, so this cell
# cannot be run a second time without re-creating X first.
X = np.concatenate(
    [X[numerical_variables].values, X_categories.toarray()],
    axis=1,
)
X.shape

(891, 12)

In [115]:
from sklearn.ensemble import GradientBoostingClassifier
# cross_val_score was used without ever being imported, which fails with a
# NameError on a fresh kernel.
from sklearn.model_selection import cross_val_score

# Default hyper-parameters, apart from a fixed random_state so that repeated
# runs (and the later encoder comparisons that reuse `clf`) are reproducible.
clf = GradientBoostingClassifier(random_state=0)

# Score with cross validation; the result holds one score per fold.
score = cross_val_score(clf, X, y)
score, score.mean()

(array([0.80446927, 0.79775281, 0.84269663, 0.79213483, 0.84831461]),
 0.8170736300295023)

#### OrdinalEncoder

In [122]:
# Rebuild the feature frame and the target from the untouched df_train.
X = df_train.drop('Survived', axis=1)
y = df_train['Survived']

# Apply the same imputation as before: mean for 'Age', mode for 'Embarked'.
# The filled columns are assigned back instead of calling
# X['col'].fillna(..., inplace=True): that chained in-place call operates on a
# temporary Series, which raises a FutureWarning in pandas 2.x and leaves X
# unchanged once Copy-on-Write is active.
X['Age'] = X['Age'].fillna(X['Age'].mean())
X['Embarked'] = X['Embarked'].fillna(X['Embarked'].mode()[0])

In [129]:
from sklearn.preprocessing import OrdinalEncoder

# Fit the encoder on the categorical columns, then map each category to its
# integer code (two explicit steps instead of a single fit_transform call).
encoder = OrdinalEncoder().fit(X[categorial_variables])
X_categories = encoder.transform(X[categorial_variables])

In [133]:
# Preview the first rows of the ordinal-encoded categorical columns.
pd.DataFrame(X_categories).head()

Unnamed: 0,0,1,2
0,1.0,2.0,2.0
1,0.0,0.0,0.0
2,0.0,2.0,2.0
3,0.0,2.0,0.0
4,1.0,2.0,2.0


Without passing in an explicit category order, all ordinal information is lost. Let's see how far we get.

In [135]:
# Join the numeric columns and the ordinal codes into one feature matrix.
X_encoded = np.concatenate(
    [X[numerical_variables].values, X_categories],
    axis=1,
)

In [137]:
# Cross-validate the same classifier on the ordinal-encoded feature matrix.
score = cross_val_score(estimator=clf, X=X_encoded, y=y)
score, score.mean()

(array([0.79888268, 0.82022472, 0.83146067, 0.79775281, 0.85393258]),
 0.82045069361622)

The performance is just a little bit better. Let's try to improve this by passing ordinal information.

In [151]:
# Look at the raw (not yet encoded) categorical columns.
X[categorial_variables].head()

Unnamed: 0,Sex,Embarked,Pclass
0,male,S,3
1,female,C,1
2,female,S,3
3,female,S,1
4,male,S,3


In [152]:
# List the distinct values that occur in 'Embarked' and 'Pclass'.
X['Embarked'].unique(), X['Pclass'].unique()

(array(['S', 'C', 'Q'], dtype=object), array([3, 1, 2]))

In [154]:
# Pass an explicit category order for each column (Sex, Embarked, Pclass).
# 'Pclass' holds integers, so its categories are listed as integers too instead
# of the strings '1', '2', '3', which only matched through an implicit dtype
# cast inside the encoder.
encoder = OrdinalEncoder(categories=[['female', 'male'], ['C', 'S', 'Q'], [1, 2, 3]])
X_categories = encoder.fit_transform(X[categorial_variables])

In [155]:
# Rebuild the feature matrix from the numeric columns and the new ordinal codes.
X_encoded = np.concatenate(
    [X[numerical_variables].values, X_categories],
    axis=1,
)

# Cross-validate the same classifier on the re-encoded features.
score = cross_val_score(estimator=clf, X=X_encoded, y=y)
score, score.mean()

(array([0.81564246, 0.81460674, 0.85393258, 0.80337079, 0.83146067]),
 0.8238026489234824)

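A small improvement has been made. We leave open whether this is due to the ordinal information in 'Pclass'. Note that the explicit ordering only changes the codes of 'Embarked' ('Sex' and 'Pclass' already received the same codes from the default encoder), and that the difference is small compared with the spread between the folds.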