In [1]:
'''
%%sh
pip install numpy --upgrade
pip install pandas --upgrade
pip install sklearn --upgrade
'''

'\n%%sh\npip install numpy --upgrade\npip install pandas --upgrade\npip install sklearn --upgrade\n'

In [2]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

# Cretaing Data to Be Used as Example

In [3]:
data = np.array([['',     'Gender'],
                ['Luis',  'M'],
                ['Mari',  'F'],
                ['Lucas', 'M'], 
                ['Filipe','O'],
                ['Aline', 'F'],
                ['Lari',  'O'],
                ['Lele',  'O']])
                
data = pd.DataFrame(data=data[1:,1:],
                  index=data[1:,0],
                  columns=data[0,1:])

data_train = data[(data.index != 'Aline') & (data.index != 'Lari') & (data.index != 'Lele')]
data_test = data[(data.index == 'Aline') | (data.index == 'Lari') | (data.index == 'Lele')]
feat = data.columns[0]

print('\nTrain Data: \n\n', data_train)
print('\nTest Data: \n\n', data_test)
print('\nNote: The test data has no all labels of the train data.')


Train Data: 

        Gender
Luis        M
Mari        F
Lucas       M
Filipe      O

Test Data: 

       Gender
Aline      F
Lari       O
Lele       O

Note: The test data has no all labels of the train data.


# Encoding Labels for the Train Data

In [4]:
print("Processing feature {} for the train data.\n".format(feat))

le = LabelEncoder()
le.fit(data_train[feat])

[print("Value {} was encoded.".format(value)) for value in le.classes_]

Processing feature Gender for the train data.

Value F was encoded.
Value M was encoded.
Value O was encoded.


[None, None, None]

# Aplying the One Hot Encorder for the Train Data

In [5]:
_label_enc_data_train = le.transform(data_train[feat])
data_train['Label'] = _label_enc_data_train
categoryLabels_train = np.sort(np.unique(_label_enc_data_train))

oh = OneHotEncoder(sparse=False)
oh.fit(_label_enc_data_train.reshape(-1,1))
_one_hot_enc_data = oh.transform(_label_enc_data_train.reshape(-1,1))

feat_oh = []
data_oh = data_train.copy()
for i in range(len(oh.categories_[0])):    
    col_name = feat+"_oh_{}_{}".format(oh.categories_[0][i],le.classes_[i])
    data_oh[col_name] = _one_hot_enc_data[:,i]
    feat_oh.append(col_name)

data_oh

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Gender,Label,Gender_oh_0_F,Gender_oh_1_M,Gender_oh_2_O
Luis,M,1,0.0,1.0,0.0
Mari,F,0,1.0,0.0,0.0
Lucas,M,1,0.0,1.0,0.0
Filipe,O,2,0.0,0.0,1.0


# Aplying the One Hot Encoder for the Test Data: ` Wrong way`
Do not define the categories (labels) to be encoded (they are extracted from test data)

In [6]:
_label_enc_data_test = le.transform(data_test[feat])
data_test['Label'] = _label_enc_data_test

oh = OneHotEncoder(sparse=False)
oh.fit(_label_enc_data_test.reshape(-1,1))
_one_hot_enc_data = oh.transform(_label_enc_data_test.reshape(-1,1))

feat_oh = []
data_oh = data_test.copy()
for i in range(len(oh.categories_[0])):    
    col_name = feat+"_oh_{}_{}".format(oh.categories_[0][i],le.classes_[i])
    data_oh[col_name] = _one_hot_enc_data[:,i]
    feat_oh.append(col_name)
    
print('''Note: There are NOT columns for all labels, only {}'''.format(feat_oh))
data_oh

Note: There are NOT columns for all labels, only ['Gender_oh_0_F', 'Gender_oh_2_M']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Gender,Label,Gender_oh_0_F,Gender_oh_2_M
Aline,F,0,1.0,0.0
Lari,O,2,0.0,1.0
Lele,O,2,0.0,1.0


# Aplying the One Hot Encoder for the Test Data: `Correct way`
Define the categories (labels) to be encoded as those of the train data

In [7]:
_label_enc_data_test = le.transform(data_test[feat])
data_test['Label'] = _label_enc_data_test

oh = OneHotEncoder(categories=[categoryLabels_train], sparse=False)
oh.fit(_label_enc_data_test.reshape(-1,1))
_one_hot_enc_data = oh.transform(_label_enc_data_test.reshape(-1,1))

feat_oh = []
data_oh = data_test.copy()
for i in range(len(oh.categories_[0])):    
    col_name = feat+"_oh_{}_{}".format(oh.categories_[0][i],le.classes_[i])
    data_oh[col_name] = _one_hot_enc_data[:,i]
    feat_oh.append(col_name)
    
print('''Note: There are columns for all labels, that is {}'''.format(feat_oh))
data_oh

Note: There are columns for all labels, that is ['Gender_oh_0_F', 'Gender_oh_1_M', 'Gender_oh_2_O']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,Gender,Label,Gender_oh_0_F,Gender_oh_1_M,Gender_oh_2_O
Aline,F,0,1.0,0.0,0.0
Lari,O,2,0.0,0.0,1.0
Lele,O,2,0.0,0.0,1.0
