## OneHotEncoder

One-hot encoding converts a categorical column into numeric data: for every unique value in the column we create a new binary (0/1) column that is 1 for the rows holding that value and 0 for all other rows.

### Sources

- https://lifewithdata.com/2022/03/09/onehotencoder-how-to-do-one-hot-encoding-in-sklearn/

In [1]:
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split

In [2]:
# Load the "penguins" example dataset bundled with seaborn's online data
# repository (downloaded on first use, then served from the local cache).
df = sns.load_dataset(name='penguins')

In [3]:
# Preview the first five rows; note that some rows contain missing values.
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [4]:
# `island` is used as the target; every other column is kept as a feature.
y = df['island']
X = df.drop(columns=['island'])

# Hold out 30% of the rows for testing. The fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [5]:
# Split the feature columns by dtype: object-dtype columns are treated as
# categorical, everything else as numeric.
# num_cols is not consumed by the transformer below (numeric columns reach the
# output through remainder='passthrough'); it is kept for reference.
num_cols = X_train.select_dtypes(exclude=['object']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object']).columns.tolist()

# One-hot encode the categorical columns and pass all other columns through
# unchanged.
#   handle_unknown='ignore' -> a category seen only in the test split is
#       encoded as all zeros instead of raising a ValueError in transform().
#   sparse_threshold=0      -> the combined output is always a dense array, so
#       it can be wrapped in a DataFrame regardless of how sparse the
#       encoded columns are.
# Missing values in a categorical column get their own indicator column.
transformer = make_column_transformer(
    (OneHotEncoder(handle_unknown='ignore'), cat_cols),
    remainder='passthrough',
    sparse_threshold=0)

# Learn the categories from the training split only, then apply the same
# fitted encoding to the test split so both share an identical column layout.
X_train_transformed = transformer.fit_transform(X_train)
X_test_transformed = transformer.transform(X_test)

In [6]:
# Column labels produced by the fitted transformer; identical for both splits.
feature_names = transformer.get_feature_names_out()

# Wrap the transformed arrays in labelled DataFrames for easier inspection.
# NOTE: no index is passed, so both frames get a fresh 0..n-1 index and no
# longer share row labels with X_train / X_test (or y_train / y_test).
X_train_transformed_df = pd.DataFrame(X_train_transformed, columns=feature_names)
X_test_transformed_df = pd.DataFrame(X_test_transformed, columns=feature_names)

In [7]:
# First five rows of the encoded training features: one indicator column per
# category, followed by the untouched numeric columns.
X_train_transformed_df.head(n=5)

Unnamed: 0,onehotencoder__species_Adelie,onehotencoder__species_Chinstrap,onehotencoder__species_Gentoo,onehotencoder__sex_Female,onehotencoder__sex_Male,onehotencoder__sex_nan,remainder__bill_length_mm,remainder__bill_depth_mm,remainder__flipper_length_mm,remainder__body_mass_g
0,0.0,1.0,0.0,1.0,0.0,0.0,50.2,18.7,198.0,3775.0
1,0.0,0.0,1.0,0.0,1.0,0.0,48.5,14.1,220.0,5300.0
2,0.0,0.0,1.0,1.0,0.0,0.0,45.5,13.9,210.0,4200.0
3,0.0,0.0,1.0,0.0,1.0,0.0,55.1,16.0,230.0,5850.0
4,0.0,1.0,0.0,0.0,1.0,0.0,49.6,18.2,193.0,3775.0


In [8]:
# NOTE(review): this shows the training frame again, exactly like the previous
# cell; possibly X_test_transformed_df was meant to be inspected here - confirm.
X_train_transformed_df.head(n=5)

Unnamed: 0,onehotencoder__species_Adelie,onehotencoder__species_Chinstrap,onehotencoder__species_Gentoo,onehotencoder__sex_Female,onehotencoder__sex_Male,onehotencoder__sex_nan,remainder__bill_length_mm,remainder__bill_depth_mm,remainder__flipper_length_mm,remainder__body_mass_g
0,0.0,1.0,0.0,1.0,0.0,0.0,50.2,18.7,198.0,3775.0
1,0.0,0.0,1.0,0.0,1.0,0.0,48.5,14.1,220.0,5300.0
2,0.0,0.0,1.0,1.0,0.0,0.0,45.5,13.9,210.0,4200.0
3,0.0,0.0,1.0,0.0,1.0,0.0,55.1,16.0,230.0,5850.0
4,0.0,1.0,0.0,0.0,1.0,0.0,49.6,18.2,193.0,3775.0


In [11]:
# Show the raw array returned by the transformer (before it was wrapped in a
# DataFrame); NumPy abbreviates the middle of large arrays with "...".
X_train_transformed

array([[0.000e+00, 1.000e+00, 0.000e+00, ..., 1.870e+01, 1.980e+02,
        3.775e+03],
       [0.000e+00, 0.000e+00, 1.000e+00, ..., 1.410e+01, 2.200e+02,
        5.300e+03],
       [0.000e+00, 0.000e+00, 1.000e+00, ..., 1.390e+01, 2.100e+02,
        4.200e+03],
       ...,
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 2.050e+01, 1.990e+02,
        3.775e+03],
       [1.000e+00, 0.000e+00, 0.000e+00, ..., 1.890e+01, 1.790e+02,
        2.975e+03],
       [0.000e+00, 1.000e+00, 0.000e+00, ..., 1.730e+01, 1.810e+02,
        3.600e+03]])