<a href="https://colab.research.google.com/github/lauroPereira/data-wrangling-lessons/blob/master/data_wrangling_categorical.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setup

In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px

from sklearn import datasets
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Global seaborn theme: "talk" context, "ticks" style, fonts scaled to 80 %,
# and a default 12x8 figure size for every plot in this notebook.
sns.set_theme(context='talk', style='ticks', font_scale=0.8, rc={'figure.figsize': (12, 8)})

In [3]:
# Load seaborn's built-in "tips" example dataset into a DataFrame.
df = sns.load_dataset('tips')

In [4]:
# Preview the first rows to see which columns are numeric and which are categorical.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


# Utilizando Pandas dummies
Dessa forma, cada possível categoria se torna uma coluna de verdadeiro ou falso. Por exemplo, **uma coluna "período do dia"** com as possibilidades manhã, tarde e noite se tornará **3 colunas booleanas: manhã, tarde e noite.**

In [5]:
# One-hot encode the frame: numeric columns pass through unchanged and each
# category of sex/smoker/day/time becomes its own indicator column.
pd.get_dummies(df)

Unnamed: 0,total_bill,tip,size,sex_Male,sex_Female,smoker_Yes,smoker_No,day_Thur,day_Fri,day_Sat,day_Sun,time_Lunch,time_Dinner
0,16.99,1.01,2,0,1,0,1,0,0,0,1,0,1
1,10.34,1.66,3,1,0,0,1,0,0,0,1,0,1
2,21.01,3.50,3,1,0,0,1,0,0,0,1,0,1
3,23.68,3.31,2,1,0,0,1,0,0,0,1,0,1
4,24.59,3.61,4,0,1,0,1,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,3,1,0,0,1,0,0,1,0,0,1
240,27.18,2.00,2,0,1,1,0,0,0,1,0,0,1
241,22.67,2.00,2,1,0,1,0,0,0,1,0,0,1
242,17.82,1.75,2,1,0,0,1,0,0,1,0,0,1


## Utilizando Pandas e Categorical/Ordinal Encoder

**Encoding é o processo de transformar categorias em representações numéricas**
Dessa forma, cada possível categoria se torna uma coluna de verdadeiro ou falso. Por exemplo, **uma coluna "período do dia"** com as possibilidades manhã, tarde e noite se tornará **3 colunas booleanas: manhã, tarde e noite.**

In [6]:
# drop='if_binary' makes the encoder emit a single indicator column, instead of
# two complementary ones, for features that have exactly two categories
# (here: sex, smoker and time).
encoder = OneHotEncoder(drop='if_binary')
encoder

In [7]:
# Names of every non-numeric column; these are the features to encode.
cat_cols_ = df.select_dtypes(exclude='number').columns

In [8]:
# Learn the categories of each categorical column (nothing is transformed yet).
encoder.fit(df[cat_cols_])

In [16]:
# Inspect what the fitted encoder learned. Plain print() is used on purpose:
# print() returns None, so display(print(...)) would render a stray "None"
# after every printed line.
print('nomes: ', encoder.feature_names_in_)
print('categorias: ', encoder.categories_)
print('indices: ', encoder.drop_idx_)
print('result features: ', encoder.get_feature_names_out())

nomes:  ['sex' 'smoker' 'day' 'time']


None

categorias:  [array(['Female', 'Male'], dtype=object), array(['No', 'Yes'], dtype=object), array(['Fri', 'Sat', 'Sun', 'Thur'], dtype=object), array(['Dinner', 'Lunch'], dtype=object)]


None

indices:  [0 0 None 0]


None

colunas dummies:  ['sex_Male' 'smoker_Yes' 'day_Fri' 'day_Sat' 'day_Sun' 'day_Thur'
 'time_Lunch']


None

In [21]:
# Show the original frame again for comparison with the encoded output below.
df.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


In [29]:
# Encode the categorical columns and wrap the result in a labelled DataFrame.
# The transform() result is densified with .toarray(); index=df.index keeps the
# encoded rows aligned with df even when df does not have a default RangeIndex.
pd.DataFrame(
  encoder.transform(df[cat_cols_]).toarray(),
  columns=encoder.get_feature_names_out(),
  index=df.index
)

Unnamed: 0,sex_Male,smoker_Yes,day_Fri,day_Sat,day_Sun,day_Thur,time_Lunch
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
239,1.0,0.0,0.0,1.0,0.0,0.0,0.0
240,0.0,1.0,0.0,1.0,0.0,0.0,0.0
241,1.0,1.0,0.0,1.0,0.0,0.0,0.0
242,1.0,0.0,0.0,1.0,0.0,0.0,0.0


# Usando Factorize

## Convertendo para categoria

In [31]:
# Recompute the list of non-numeric (categorical) columns and display it.
cat_cols_ = df.select_dtypes(exclude='number').columns
cat_cols_

Index(['sex', 'smoker', 'day', 'time'], dtype='object')

In [33]:
# Build a separate frame whose categorical columns use pandas' `category` dtype;
# astype() returns a new object, so `df` itself is left untouched.
df_cat = df.astype({col: 'category' for col in cat_cols_})
df_cat.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


## Pegando as categorias

In [34]:
# For each categorical column, map integer code -> category label.
{col: dict(enumerate(values.cat.categories)) for col, values in df_cat[cat_cols_].items()}

{'sex': {0: 'Male', 1: 'Female'},
 'smoker': {0: 'Yes', 1: 'No'},
 'day': {0: 'Thur', 1: 'Fri', 2: 'Sat', 3: 'Sun'},
 'time': {0: 'Lunch', 1: 'Dinner'}}

## Modificando as colunas

In [35]:
# Replace each categorical column with its integer category codes.
# The codes are derived from the untouched `df`, not from `df_cat` itself, so the
# cell can be re-run safely: once `df_cat` holds integers it no longer has a
# `.cat` accessor and reading from it again would raise AttributeError.
df_cat[cat_cols_] = df[cat_cols_].astype('category').apply(lambda col: col.cat.codes)
df_cat.head()

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,1,1,3,1,2
1,10.34,1.66,0,1,3,1,3
2,21.01,3.5,0,1,3,1,3
3,23.68,3.31,0,1,3,1,2
4,24.59,3.61,1,1,3,1,4


## Aplicando factorize

In [38]:
# factorize() returns (codes, uniques); codes follow the order of first
# appearance, not the order of the categorical's declared categories.
df['day'].factorize()

(array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        3, 3, 3, 3, 3, 3, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 2]),
 CategoricalIndex(['Sun', 'Sat', 'Thur', 'Fri'], categories=['Thur', 'Fri', 'Sat', 'Sun'], ordered=False, dtype='category'))

# Usando sklearn para aplicar Encoding

In [39]:
from sklearn.preprocessing import OrdinalEncoder

In [43]:
# Instantiate an OrdinalEncoder and fit it on the categorical columns.
# NOTE(review): this rebinds `encoder`, replacing the OneHotEncoder fitted earlier.
encoder = OrdinalEncoder().fit(df[cat_cols_])

In [45]:
# Categories learned per column; an entry's position is the code it is encoded to.
encoder.categories_

[array(['Female', 'Male'], dtype=object),
 array(['No', 'Yes'], dtype=object),
 array(['Fri', 'Sat', 'Sun', 'Thur'], dtype=object),
 array(['Dinner', 'Lunch'], dtype=object)]

In [52]:
  # Overwrite the categorical columns of df_cat with the encoder's codes
  # computed from the original df, then display the result.
  df_cat[cat_cols_] = encoder.transform(df[cat_cols_])
  df_cat

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0.0,0.0,2.0,0.0,2
1,10.34,1.66,1.0,0.0,2.0,0.0,3
2,21.01,3.50,1.0,0.0,2.0,0.0,3
3,23.68,3.31,1.0,0.0,2.0,0.0,2
4,24.59,3.61,0.0,0.0,2.0,0.0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,1.0,0.0,1.0,0.0,3
240,27.18,2.00,0.0,1.0,1.0,0.0,2
241,22.67,2.00,1.0,1.0,1.0,0.0,2
242,17.82,1.75,1.0,0.0,1.0,0.0,2
