# Preprocessing Categorical Features
### Preliminaries

In [2]:
from sklearn import preprocessing
from sklearn.pipeline import Pipeline
import pandas as pd

### Create Data

In [3]:
# Toy records for five people, stored column-wise (one list per field)
raw_data = dict(
    first_name=['Jason', 'Molly', 'Tina', 'Jake', 'Amy'],
    last_name=['Miller', 'Jacobson', 'Ali', 'Milner', 'Cooze'],
    age=[42, 52, 36, 24, 73],
    city=['San Francisco', 'Baltimore', 'Miami', 'Douglas', 'Boston'],
)

# Build the frame; the column order is listed explicitly so the layout
# does not depend on the dictionary's iteration order
df = pd.DataFrame(
    data=raw_data,
    columns=['first_name', 'last_name', 'age', 'city'],
)

# Display the frame
df

Unnamed: 0,first_name,last_name,age,city
0,Jason,Miller,42,San Francisco
1,Molly,Jacobson,52,Baltimore
2,Tina,Ali,36,Miami
3,Jake,Milner,24,Douglas
4,Amy,Cooze,73,Boston


### Convert Nominal Categorical Feature Into Dummy Variables Using Pandas

In [5]:
# Create one indicator (dummy) column for every unique category in df.city.
# dtype=int keeps the indicators as 0/1 integers on every pandas version:
# pandas >= 2.0 would otherwise return True/False booleans, which would not
# match the 0/1 table shown as this cell's output.
pd.get_dummies(df['city'], dtype=int)

Unnamed: 0,Baltimore,Boston,Douglas,Miami,San Francisco
0,0,0,0,0,1
1,1,0,0,0,0
2,0,0,0,1,0
3,0,0,1,0,0
4,0,1,0,0,0


### Convert Nominal Categorical Data Into Dummy (OneHot) Features Using Scikit-Learn

In [7]:
# Convert the string category names in df.city to integer codes
city_encoder = preprocessing.LabelEncoder()
integerized_data = city_encoder.fit_transform(df['city'])

# Show the encoded values
integerized_data

array([4, 0, 3, 2, 1])

In [9]:
# Convert the integer category codes to OneHot encodings:
# reshape the codes into a single-column 2-D array, encode them,
# and call .toarray() on the result so it displays as a plain array
integer_column = integerized_data.reshape(-1, 1)
onehot_encoder = preprocessing.OneHotEncoder()
onehot_encoder.fit_transform(integer_column).toarray()

array([[ 0.,  0.,  0.,  0.,  1.],
       [ 1.,  0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  1.,  0.],
       [ 0.,  0.,  1.,  0.,  0.],
       [ 0.,  1.,  0.,  0.,  0.]])

Source: https://chrisalbon.com/machine-learning/preprocessing_categorical_features.html