In [1]:
# Import libraries
import numpy as np
import pandas as pd

# Never truncate the column list when a DataFrame is rendered
pd.options.display.max_columns = None

# Load the house-price dataset directly from its GitHub location
houseprice_url = 'https://github.com/jurand71/datasets/raw/master/HouseSalePriceCompetition/houseprice.csv'
df = pd.read_csv(houseprice_url)

In [2]:
# Collect the names of all columns stored with the object dtype ('O');
# these are treated as the categorical variables of the dataset
categorical_variables = [
    column_name
    for column_name, column_dtype in df.dtypes.items()
    if column_dtype == 'O'
]

In [3]:
# Cardinality check: map each categorical column to the list of
# distinct values that occur in it
categories = {
    column_name: list(df[column_name].unique())
    for column_name in categorical_variables
}

In [4]:
# The three categorical variables that will be passed to OneHotEncoder
usecols = [
    'HeatingQC',
    'KitchenQual',
    'CentralAir',
]
# Keep only those columns (all rows) in the working frame
df = df.loc[:, usecols]

In [5]:
# Report the distinct categories recorded for each selected column
report_line = 'In {} are the following categories: {}'
for column_name in usecols:
    print(report_line.format(column_name, categories[column_name]))

In HeatingQC are the following categories: ['Ex', 'Gd', 'TA', 'Fa', 'Po']
In KitchenQual are the following categories: ['Gd', 'TA', 'Ex', 'Fa']
In CentralAir are the following categories: ['Y', 'N']


In [6]:
# Import OneHotEncoder class
from sklearn.preprocessing import OneHotEncoder

# Settings understood by every scikit-learn version:
#   categories='auto'      -> learn the category list of each column from the data
#   drop='first'           -> return k-1 dummies per column; use drop=None for all k
#   handle_unknown='error' -> transform() raises on a category not seen during fit;
#                             'ignore' is the setting that tolerates rare/unseen labels
encoder_options = dict(categories='auto', drop='first', handle_unknown='error')

# The dense-output flag was renamed: `sparse` was deprecated in scikit-learn 1.2
# and removed in 1.4 in favour of `sparse_output`. Try the current name first and
# fall back to the old one so the cell runs on both old and new installations.
try:
    enc = OneHotEncoder(sparse_output=False, **encoder_options)
except TypeError:
    enc = OneHotEncoder(sparse=False, **encoder_options)

# Missing values are replaced by the explicit label 'Missing' before fitting
enc.fit(df.fillna('Missing'))

OneHotEncoder(drop='first', sparse=False)

In [7]:
# Categories learned by fit(): one array per input column, in column order
enc.categories_

[array(['Ex', 'Fa', 'Gd', 'Po', 'TA'], dtype=object),
 array(['Ex', 'Fa', 'Gd', 'TA'], dtype=object),
 array(['N', 'Y'], dtype=object)]

In [8]:
# Encode the data, applying the same 'Missing' fill that was used when fitting
enc_data = enc.transform(df.fillna('Missing'))

In [9]:
# Preview the first encoded rows; the columns are positional here — their
# names are available from enc.get_feature_names_out()
pd.DataFrame(enc_data).head()

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0
2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0


In [10]:
# Names of the encoded output columns, built as <input column>_<category>
enc.get_feature_names_out()

array(['HeatingQC_Fa', 'HeatingQC_Gd', 'HeatingQC_Po', 'HeatingQC_TA',
       'KitchenQual_Fa', 'KitchenQual_Gd', 'KitchenQual_TA',
       'CentralAir_Y'], dtype=object)