# Different Strategies for Dealing with Categorical Data
### Ordinal Attributes

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the Pokemon dataset from a CSV file (relative path) and display it.
poke_df = pd.read_csv('Pokemon_1.csv', encoding='utf-8')
poke_df

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,Gen 1,False
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,Gen 1,False
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,Gen 1,False
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,Gen 1,False
4,4,Charmander,Fire,,309,39,52,43,60,50,65,Gen 1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...
795,719,Diancie,Rock,Fairy,600,50,100,150,100,150,50,Gen 6,True
796,719,DiancieMega Diancie,Rock,Fairy,700,50,160,110,160,110,110,Gen 6,True
797,720,HoopaHoopa Confined,Psychic,Ghost,600,80,110,60,150,130,70,Gen 6,True
798,720,HoopaHoopa Unbound,Psychic,Dark,680,80,160,60,170,130,80,Gen 6,True


In [3]:
# List the distinct values of the 'Generation' column.
np.unique(poke_df['Generation'])

array(['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6'], dtype=object)

In [4]:
# Ordinal encoding: map each generation string to an integer rank so the
# ordering Gen 1 < Gen 2 < ... < Gen 6 is kept in the numeric feature.
gen_ord_map = {
    'Gen 1': 1,
    'Gen 2': 2,
    'Gen 3': 3,
    'Gen 4': 4,
    'Gen 5': 5,
    'Gen 6': 6,
}

# Look every row's generation up in the map and store the rank as a new column.
generation_rank = poke_df['Generation'].map(gen_ord_map)
poke_df['GenerationLabel'] = generation_rank
poke_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,GenerationLabel
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,Gen 1,False,1
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,Gen 1,False,1
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,Gen 1,False,1
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,Gen 1,False,1
4,4,Charmander,Fire,,309,39,52,43,60,50,65,Gen 1,False,1


## OHE & Label Encoder

In [5]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One label encoder per categorical column, so each can be reused later to
# transform new data with the same class-to-integer assignment.
gen_le, leg_le = LabelEncoder(), LabelEncoder()

# Fit on the full column and get the integer code for every row.
gen_labels = gen_le.fit_transform(poke_df['Generation'])
leg_labels = leg_le.fit_transform(poke_df['Legendary'])

# Store the codes next to the original columns.
poke_df['Gen_label'] = gen_labels
poke_df['Lgnd_Label'] = leg_labels

poke_df.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,Speed,Generation,Legendary,GenerationLabel,Gen_label,Lgnd_Label
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,45,Gen 1,False,1,0,0
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,60,Gen 1,False,1,0,0
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,80,Gen 1,False,1,0,0
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,80,Gen 1,False,1,0,0
4,4,Charmander,Fire,,309,39,52,43,60,50,65,Gen 1,False,1,0,0


In [6]:
# Classes learned by the generation label encoder; a class's position in this
# array is the integer code the encoder assigns to it.
gen_le.classes_

array(['Gen 1', 'Gen 2', 'Gen 3', 'Gen 4', 'Gen 5', 'Gen 6'], dtype=object)

In [8]:
# One-hot encode the integer generation codes. The indicator columns are
# named after the label encoder's classes.
gen_ohe = OneHotEncoder()
gen_encoded = gen_ohe.fit_transform(poke_df[['Gen_label']])
gen_feature_arr = gen_encoded.toarray()
gen_feature_labels = list(gen_le.classes_)
gen_features = pd.DataFrame(gen_feature_arr, columns=gen_feature_labels)

# Same scheme for the legendary flag; column names get a 'Legendary_' prefix
# followed by the class value.
leg_ohe = OneHotEncoder()
leg_encoded = leg_ohe.fit_transform(poke_df[['Lgnd_Label']])
leg_feature_arr = leg_encoded.toarray()
leg_feature_labels = ['Legendary_' + cls_name
                      for cls_name in map(str, leg_le.classes_)]
leg_features = pd.DataFrame(leg_feature_arr, columns=leg_feature_labels)

In [13]:
# Append the one-hot generation and legendary columns to the original frame.
poke_df_ohe = pd.concat([poke_df, gen_features, leg_features], axis=1)

# Columns relevant to the encoding walkthrough, usable as poke_df_ohe[columns].
# The generation code column is named 'Gen_label' (lower-case 'l'), which is
# the name it was created with; 'Gen_Label' does not exist in this frame and
# would raise a KeyError when used for selection.
columns = (['Name', 'Generation', 'Gen_label'] + gen_feature_labels
           + ['Legendary', 'Lgnd_Label'] + leg_feature_labels)


In [14]:
# Preview the combined frame: original columns plus the one-hot columns.
poke_df_ohe.head()

Unnamed: 0,#,Name,Type 1,Type 2,Total,HP,Attack,Defense,Sp. Atk,Sp. Def,...,Gen_label,Lgnd_Label,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6,Legendary_False,Legendary_True
0,1,Bulbasaur,Grass,Poison,318,45,49,49,65,65,...,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2,Ivysaur,Grass,Poison,405,60,62,63,80,80,...,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,3,Venusaur,Grass,Poison,525,80,82,83,100,100,...,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,3,VenusaurMega Venusaur,Grass,Poison,625,80,100,123,122,120,...,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,4,Charmander,Fire,,309,39,52,43,60,50,...,0,0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### Transformation
Use the fitted label encoders to transform unseen data
#### Transform Label Encoder

In [15]:
# Two made-up rows that stand in for unseen data to run through the
# already-fitted encoders.
new_poke_rows = [
    ['PikaZoom', 'Gen 3', True],
    ['CharMyToast', 'Gen 4', False],
]
new_poke_columns = ['Name', 'Generation', 'Legendary']

new_poke_df = pd.DataFrame(new_poke_rows, columns=new_poke_columns)
new_poke_df

Unnamed: 0,Name,Generation,Legendary
0,PikaZoom,Gen 3,True
1,CharMyToast,Gen 4,False


#### Transform OHE

In [16]:
# Encode the new rows with the already-fitted label encoders
# (transform only -- the encoders are not refitted).
new_gen_labels = gen_le.transform(new_poke_df['Generation'])
new_leg_labels = leg_le.transform(new_poke_df['Legendary'])

# Store the integer codes alongside the original columns.
new_poke_df['Gen_Label'] = new_gen_labels
new_poke_df['Lgnd_Label'] = new_leg_labels

new_poke_df

Unnamed: 0,Name,Generation,Legendary,Gen_Label,Lgnd_Label
0,PikaZoom,Gen 3,True,2,1
1,CharMyToast,Gen 4,False,3,0


In [17]:
# gen_ohe was fitted on a frame whose column is named 'Gen_label', while the
# new data stores the generation codes under 'Gen_Label'. Recent scikit-learn
# versions reject a transform input whose column names differ from the ones
# seen during fit, so align the name on a copy before transforming.
new_gen_input = new_poke_df[['Gen_Label']].rename(columns={'Gen_Label': 'Gen_label'})
new_gen_feature_arr = gen_ohe.transform(new_gen_input).toarray()
new_gen_features = pd.DataFrame(new_gen_feature_arr, columns=gen_feature_labels)

# 'Lgnd_Label' has the same name in both frames, so it can be passed as is.
new_leg_feature_arr = leg_ohe.transform(new_poke_df[['Lgnd_Label']]).toarray()
new_leg_features = pd.DataFrame(new_leg_feature_arr, columns=leg_feature_labels)

# Put the one-hot columns next to the new rows and display the result.
new_poke_ohe = pd.concat([new_poke_df, new_gen_features, new_leg_features], axis=1)
new_poke_ohe

Unnamed: 0,Name,Generation,Legendary,Gen_Label,Lgnd_Label,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6,Legendary_False,Legendary_True
0,PikaZoom,Gen 3,True,2,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
1,CharMyToast,Gen 4,False,3,0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


### Dummies
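`pd.get_dummies` returns only the indicator columns; the original feature is not part of its result, so it is concatenated back below for comparison.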

In [18]:
# One-hot encode 'Generation' directly with pandas. The dtype is pinned to
# uint8 so the indicators are 0/1 on every pandas version (pandas 2.0 changed
# the default indicator dtype to bool).
gen_onehot_features = pd.get_dummies(poke_df['Generation'], dtype='uint8')

# Show the indicators next to the original columns for a few rows.
pd.concat([poke_df[['Name', 'Generation']], gen_onehot_features],
          axis=1).iloc[4:10]

Unnamed: 0,Name,Generation,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6
4,Charmander,Gen 1,1,0,0,0,0,0
5,Charmeleon,Gen 1,1,0,0,0,0,0
6,Charizard,Gen 1,1,0,0,0,0,0
7,CharizardMega Charizard X,Gen 1,1,0,0,0,0,0
8,CharizardMega Charizard Y,Gen 1,1,0,0,0,0,0
9,Squirtle,Gen 1,1,0,0,0,0,0


In [19]:
# Display the indicator frame produced by pd.get_dummies.
gen_onehot_features

Unnamed: 0,Gen 1,Gen 2,Gen 3,Gen 4,Gen 5,Gen 6
0,1,0,0,0,0,0
1,1,0,0,0,0,0
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0
...,...,...,...,...,...,...
795,0,0,0,0,0,1
796,0,0,0,0,0,1
797,0,0,0,0,0,1
798,0,0,0,0,0,1
