# One-hot encoding using different libraries in Python

## Using Pandas

In [1]:
# Importing required libraries
import pandas as pd # For dataframe handling and operations.
import numpy as np # For numpy array operations.

from sklearn.model_selection import train_test_split # For Dataset split.
from sklearn.preprocessing import OneHotEncoder # Object for One-hot Encoding

In [23]:
# Load the dataset
def load_data(path="../../data/Titanic-Dataset.csv"):
    """Read the Titanic passenger CSV into a DataFrame.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the CSV file. The default is the relative path this
        notebook has always used, so ``load_data()`` behaves as before.

    Returns
    -------
    pandas.DataFrame
        The file contents as parsed by ``pd.read_csv``.
    """
    return pd.read_csv(path)

data = load_data() # Titanic dataset used throughout the examples.

# Keep only the input feature ('sex') and the target ('survived').
data = data[['sex', 'survived']]

print(data.head())

# Split the dataset. A fixed random_state makes the train/test membership,
# and therefore every output derived from it, identical on each re-run.
X_train, X_test, y_train, y_test = train_test_split(
    data['sex'],
    data['survived'],
    test_size=0.3,
    random_state=42)

X_train.unique() # unique categories of the sex feature

      sex  survived
0    male         0
1  female         1
2  female         1
3  female         1
4    male         0


array(['female', 'male'], dtype=object)

In [6]:
# One-hot encoding with pandas: get_dummies builds one indicator column per
# category found in the training feature.
onehot1 = pd.get_dummies(X_train)

# Show the original values next to their indicator columns.
pd.concat((X_train, onehot1), axis="columns").head()

Unnamed: 0,sex,female,male
684,male,False,True
57,male,False,True
589,male,False,True
862,female,True,False
4,male,False,True


In [7]:
# k-1 dummy encoding: drop_first=True omits the indicator of the first
# category, leaving k-1 columns for k categories.
onehot2 = pd.get_dummies(X_train, drop_first=True)

pd.concat((X_train, onehot2), axis="columns").head()

Unnamed: 0,sex,male
684,male,True
57,male,True
589,male,True
862,female,False
4,male,True


## Using Scikit-Learn

In [14]:
# One-hot encoding using scikit-learn
onehot_encoder = OneHotEncoder(
    drop=None,            # keep all k indicator columns; drop='first' gives k-1
    sparse_output=False,  # return a dense NumPy array
)

# The feature is reshaped into a single column before fitting on the
# training set.
onehot_encoder.fit(X_train.to_numpy().reshape(-1, 1))

In [15]:
print('categories:', onehot_encoder.categories_)
print('Untransformed :')
print(X_test.head(6))
# scikit-learn's one-hot encoder returns a NumPy array (or a sparse matrix),
# not a DataFrame.
print('Transformed:')
test_column = X_test.to_numpy().reshape(-1, 1)
onehot_encoder.transform(test_column)[:6]

categories: [array(['female', 'male'], dtype=object)]
Untransformed :
756      male
759    female
422      male
210      male
259    female
676      male
Name: sex, dtype: object
Transformed:


array([[0., 1.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [0., 1.]])

## Label encoding with pandas and scikit-learn

In [26]:
# Reload the raw file and keep the two categorical features plus the target.
data = pd.read_csv("../../data/Titanic-Dataset.csv")[['cabin', 'embarked', 'survived']]

data.head()

Unnamed: 0,cabin,embarked,survived
0,,S,0
1,C85,C,1
2,,S,1
3,C123,S,1
4,,S,0


In [27]:
# Reduce every cabin value to its first character.
data['cabin'] = data['cabin'].str.get(0)
data.head()

Unnamed: 0,cabin,embarked,survived
0,,S,0
1,C,C,1
2,,S,1
3,C,S,1
4,,S,0


In [28]:
# Distinct values of each categorical column, one array per line.
print(data['cabin'].unique(), data['embarked'].unique(), sep='\n')

[nan 'C' 'E' 'G' 'D' 'A' 'B' 'F' 'T']
['S' 'C' 'Q' nan]


In [29]:
# Keep only the rows that have no missing value in any column.
data = data.dropna()

# The NaN entries should no longer appear among the distinct values.
print(data['cabin'].unique(), data['embarked'].unique(), sep='\n')

['C' 'E' 'G' 'D' 'A' 'B' 'F' 'T']
['C' 'S' 'Q']


In [30]:
# Split the cleaned data. A fixed random_state keeps the train/test
# membership, and therefore the outputs of the cells that use it, the same
# on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data[['cabin','embarked']],
    data['survived'],
    test_size=0.3,
    random_state=42)

X_train.head()

Unnamed: 0,cabin,embarked
209,A,C
297,C,S
193,F,S
435,B,S
327,D,S


In [31]:
# Number the cabin letters in the order they first appear in the training set.
label_mapping = dict(
    (letter, position)
    for position, letter in enumerate(X_train['cabin'].unique())
)
print(label_mapping)

{'A': 0, 'C': 1, 'F': 2, 'B': 3, 'D': 4, 'E': 5, 'T': 6, 'G': 7}


In [32]:
# function to compute label mapping

def compute_category_mappings(df, variable):
    """Map each distinct value of ``df[variable]`` to an integer code.

    Codes start at 0 and follow the order in which the values first occur
    in the column.
    """
    categories = df[variable].unique()
    return dict(zip(categories, range(len(categories))))

#function to replace the variables with label encoding

def label_encoding(train, test, variable, label_mapping):
    """Replace a categorical column by its integer codes in both splits.

    The frames passed as ``train`` and ``test`` are modified in place and
    nothing is returned. The function previously ignored these parameters
    and wrote to the notebook globals ``X_train`` / ``X_test`` instead.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames that both contain the column ``variable``.
    variable : str
        Name of the column to encode.
    label_mapping : dict
        Category -> integer code, e.g. from ``compute_category_mappings``.
        Per ``Series.map``, values missing from the mapping become NaN.
    """
    train[variable] = train[variable].map(label_mapping)
    test[variable] = test[variable].map(label_mapping)

In [33]:
# Encode every categorical column: the mapping is learned from the training
# split only and then applied to both splits.
for variable in ('cabin', 'embarked'):
    mapping = compute_category_mappings(df=X_train, variable=variable)
    label_encoding(
        train=X_train,
        test=X_test,
        variable=variable,
        label_mapping=mapping,
    )

In [34]:
# First rows of the training features after label encoding.
X_train.head(5)

Unnamed: 0,cabin,embarked
209,0,0
297,1,1
193,2,1
435,3,1
327,4,1


In [35]:
# Label encoding with scikit-learn
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()

# Fresh split of the string-valued columns.
# NOTE(review): this split is unseeded. If a cabin letter ends up only in
# X_test, transform() below is expected to raise for the unseen label;
# confirm, and consider a random_state that keeps every letter in X_train.
X_train, X_test, y_train, y_test = train_test_split(
    data[['cabin', 'embarked']], data['survived'], test_size=0.3
)

# LabelEncoder handles a single feature per call.
# The training-set codes returned here are not displayed; only the cell's
# final expression (the test-set codes) is echoed.
label_encoder.fit_transform(X_train['cabin'])
label_encoder.transform(X_test['cabin'])

array([2, 2, 2, 2, 2, 2, 1, 2, 1, 4, 1, 2, 3, 4, 2, 5, 3, 3, 3, 1, 4, 4,
       2, 2, 1, 2, 3, 4, 1, 4, 0, 1, 1, 2, 1, 4, 1, 2, 6, 5, 3, 2, 1, 5,
       2, 1, 0, 3, 1, 3, 2, 4, 1, 6, 2, 2, 5, 2, 1, 3, 1])

In [36]:
from collections import defaultdict

# Fresh split of the string-valued columns.
# NOTE(review): unseeded split; results differ between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data[['cabin', 'embarked']], data['survived'], test_size=0.3
)

# A defaultdict creates a new LabelEncoder the first time a key is looked up,
# so each column name gets its own encoder on first use.
dic = defaultdict(LabelEncoder)

In [37]:
# Training set: fit the encoder stored under each column's name and encode
# that column.
train_label_encoded = X_train.apply(
    lambda column: dic[column.name].fit_transform(column)
)

# Test set: reuse the encoders already fitted on the training columns.
test_label_encoded = X_test.apply(
    lambda column: dic[column.name].transform(column)
)

In [38]:
# Side-by-side comparison: original categories first, then their integer codes.
print(X_train.head(), train_label_encoded.head(), sep='\n')

    cabin embarked
698     C        C
523     B        C
248     D        S
291     B        C
148     F        S
     cabin  embarked
698      2         0
523      1         0
248      3         2
291      1         0
148      5         2
