In [1]:
import pandas as pd

# Label Encoder

In [2]:
from sklearn.preprocessing import LabelEncoder

In [16]:
data = {'Cor': ['Vermelho', 'Verde', 'Azul', 'Verde', 'Vermelho', 'Preto','Preto', 'Rosa', 'Cyan']}
df = pd.DataFrame(data)                
df.head(10)                
            

Unnamed: 0,Cor
0,Vermelho
1,Verde
2,Azul
3,Verde
4,Vermelho
5,Preto
6,Preto
7,Rosa
8,Cyan


In [18]:
label_encoder = LabelEncoder()

In [19]:
df['Cor_Encoded'] = label_encoder.fit_transform(df['Cor'])  

In [20]:
df.head(10)

Unnamed: 0,Cor,Cor_Encoded
0,Vermelho,5
1,Verde,4
2,Azul,0
3,Verde,4
4,Vermelho,5
5,Preto,2
6,Preto,2
7,Rosa,3
8,Cyan,1


# One Hot Encoder

In [23]:
from sklearn.preprocessing import OneHotEncoder

In [26]:
oh = OneHotEncoder(sparse_output=False).set_output(transform="pandas") 

In [27]:
data = {'Cor': ['Vermelho', 'Verde', 'Azul', 'Verde', 'Vermelho', 'Preto','Preto', 'Rosa', 'Cyan']}
df = pd.DataFrame(data)    

In [28]:
oh.fit(df[['Cor']])

In [29]:
oh.categories_

[array(['Azul', 'Cyan', 'Preto', 'Rosa', 'Verde', 'Vermelho'], dtype=object)]

In [31]:
one_hot = oh.fit_transform(df[['Cor']])
one_hot

Unnamed: 0,Cor_Azul,Cor_Cyan,Cor_Preto,Cor_Rosa,Cor_Verde,Cor_Vermelho
0,0.0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,0.0,0.0,1.0
5,0.0,0.0,1.0,0.0,0.0,0.0
6,0.0,0.0,1.0,0.0,0.0,0.0
7,0.0,0.0,0.0,1.0,0.0,0.0
8,0.0,1.0,0.0,0.0,0.0,0.0


In [34]:
df_concatenado = pd.concat([df,one_hot], ignore_index=True, axis=1)

In [35]:
df_concatenado.head()

Unnamed: 0,0,1,2,3,4,5,6
0,Vermelho,0.0,0.0,0.0,0.0,0.0,1.0
1,Verde,0.0,0.0,0.0,0.0,1.0,0.0
2,Azul,1.0,0.0,0.0,0.0,0.0,0.0
3,Verde,0.0,0.0,0.0,0.0,1.0,0.0
4,Vermelho,0.0,0.0,0.0,0.0,0.0,1.0


# Dummy Encoder

In [45]:
data = {
    'Cor': ['Vermelho', 'Verde', 'Azul', 'Verde', 'Vermelho', 'Rosa', 'Rosa']
}
df = pd.DataFrame(data)

In [46]:
dummy = OneHotEncoder(sparse_output=False,drop='first').set_output(transform="pandas")

In [47]:
df_encoded = dummy.fit_transform(df[['Cor']])

In [48]:
df_encoded

Unnamed: 0,Cor_Rosa,Cor_Verde,Cor_Vermelho
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,0.0,0.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0
6,1.0,0.0,0.0


## Comparando OneHot X Dummy

In [49]:
data = {
    'Cor': ['Vermelho', 'Verde', 'Azul', 'Verde', 'Vermelho', 'Rosa', 'Rosa']
}
df = pd.DataFrame(data)

In [50]:
oh = OneHotEncoder(sparse_output=False).set_output(transform="pandas") 

In [51]:
dummy = OneHotEncoder(sparse_output=False,drop='first').set_output(transform="pandas")

In [52]:
one_hot = oh.fit_transform(df[['Cor']])
one_hot

Unnamed: 0,Cor_Azul,Cor_Rosa,Cor_Verde,Cor_Vermelho
0,0.0,0.0,0.0,1.0
1,0.0,0.0,1.0,0.0
2,1.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0
5,0.0,1.0,0.0,0.0
6,0.0,1.0,0.0,0.0


In [54]:
dummy_df = dummy.fit_transform(df[['Cor']])
dummy_df

Unnamed: 0,Cor_Rosa,Cor_Verde,Cor_Vermelho
0,0.0,0.0,1.0
1,0.0,1.0,0.0
2,0.0,0.0,0.0
3,0.0,1.0,0.0
4,0.0,0.0,1.0
5,1.0,0.0,0.0
6,1.0,0.0,0.0


# Effect Encoding

In [55]:
data = {
    'Cor': ['Vermelho', 'Verde', 'Azul', 'Verde', 'Vermelho'],
    'Valor': [10, 20, 30, 40, 80]
}

In [56]:
df = pd.DataFrame(data)
df

Unnamed: 0,Cor,Valor
0,Vermelho,10
1,Verde,20
2,Azul,30
3,Verde,40
4,Vermelho,80


In [75]:
mean_value = df['Valor'].mean()
mean_value

36.0

In [76]:
effects = df.groupby('Cor')['Valor'].mean() - mean_value
effects

Cor
Azul       -6.0
Verde      -6.0
Vermelho    9.0
Name: Valor, dtype: float64

In [79]:
#Effect De Cada Cor
df['Effect_Vermelho'] = (df['Cor'] == 'Vermelho').astype(int) * effects['Vermelho']
df['Effect_Verde'] = (df['Cor'] == 'Verde').astype(int) * effects['Verde']
df['Effect_Azul'] = (df['Cor'] == 'Azul').astype(int) * effects['Azul']


In [73]:
df = df.fillna(mean_value)

In [80]:
df.head()

Unnamed: 0,Cor,Valor,Effect_Vermelho,Effect_Verde,Effect_Azul
0,Vermelho,10,9.0,-0.0,-0.0
1,Verde,20,0.0,-6.0,-0.0
2,Azul,30,0.0,-0.0,-6.0
3,Verde,40,0.0,-6.0,-0.0
4,Vermelho,80,9.0,-0.0,-0.0


# Bin Counting 

In [91]:
data = {
    'Categoria': ['A', 'B', 'C', 'D', 'A', 'B', 'A', 'C', 'D', 'D']
}

In [92]:
df = pd.DataFrame(data)
df.head(3)

Unnamed: 0,Categoria
0,A
1,B
2,C


In [93]:
category_counts = df['Categoria'].value_counts()

In [94]:
df = df.join(df['Categoria'].map(category_counts).rename('Count'))
            

In [96]:
df

Unnamed: 0,Categoria,Count
0,A,3
1,B,2
2,C,2
3,D,3
4,A,3
5,B,2
6,A,3
7,C,2
8,D,3
9,D,3


# Feature Hashing

In [99]:
from sklearn.feature_extraction import FeatureHasher

In [100]:
data = {
    'Categoria': ['A', 'B', 'C', 'D', 'A', 'B']
}

In [101]:
df = pd.DataFrame(data)


In [114]:
hasher = FeatureHasher(input_type='string', n_features=6) 

In [118]:
hashed_features = hasher.transform(df['Categoria'].apply(lambda x: [x])) 

In [117]:
hashed_df = pd.DataFrame(hashed_features.toarray(), columns = [f'feature_{i}' for i in range(6) ])

In [119]:
hashed_df

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5
0,0.0,0.0,0.0,0.0,1.0,0.0
1,0.0,0.0,-1.0,0.0,0.0,0.0
2,0.0,0.0,0.0,-1.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0
5,0.0,0.0,-1.0,0.0,0.0,0.0
