#### Pré-processamento

In [2]:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

df_power_base = pd.read_csv('C:/Users/assis/Desktop/dataset_open/openpowerlifting.csv', low_memory=False)

colunas_categoricas = ['Sex', 'Event', 'Equipment']

print(df_power_base.head())

                     Name Sex Event Equipment   Age AgeClass BirthYearClass  \
0             Alona Vladi   F   SBD       Raw  33.0    24-34          24-39   
1      Galina Solovyanova   F   SBD       Raw  43.0    40-44          40-49   
2          Daniil Voronin   M   SBD       Raw  15.5    16-17          14-18   
3          Aleksey Krasov   M   SBD       Raw  35.0    35-39          24-39   
4  Margarita Pleschenkova   M   SBD       Raw  26.5    24-34          24-39   

  Division  BodyweightKg WeightClassKg  ...  Tested  Country  State  \
0        O         58.30            60  ...     Yes   Russia    NaN   
1       M1         73.10            75  ...     Yes   Russia    NaN   
2        T         67.40            75  ...     Yes   Russia    NaN   
3        O         66.65            75  ...     Yes   Russia    NaN   
4        O         72.45            75  ...     Yes   Russia    NaN   

   Federation  ParentFederation        Date  MeetCountry  MeetState  MeetTown  \
0         GFP    

#### One-hot

In [None]:
df_onehot = pd.get_dummies(df_power_base, columns=colunas_categoricas, drop_first=False)

print("One-Hot Encoding:")
print(df_onehot.head())

One-Hot Encoding:
                     Name   Age AgeClass BirthYearClass Division  \
0             Alona Vladi  33.0    24-34          24-39        O   
1      Galina Solovyanova  43.0    40-44          40-49       M1   
2          Daniil Voronin  15.5    16-17          14-18        T   
3          Aleksey Krasov  35.0    35-39          24-39        O   
4  Margarita Pleschenkova  26.5    24-34          24-39        O   

   BodyweightKg WeightClassKg  Squat1Kg  Squat2Kg  Squat3Kg  ...  Event_S  \
0         58.30            60      75.0      80.0     -90.0  ...    False   
1         73.10            75      95.0     100.0     105.0  ...    False   
2         67.40            75      85.0      90.0     100.0  ...    False   
3         66.65            75     125.0     132.0     137.5  ...    False   
4         72.45            75      80.0      85.0      90.0  ...    False   

   Event_SB  Event_SBD  Event_SD  Equipment_Multi-ply  Equipment_Raw  \
0     False       True     False      

#### Dummy Coding

In [None]:
df_dummy = pd.get_dummies(df_power_base, columns=colunas_categoricas, drop_first=True)

#Comparação com One-hot
#Enquanto o one-hot mantém todas as categorias, o dummy coding reduz uma coluna por variável -> drop_first=True
print("Dummy Coding:")
print(df_dummy.head())

Dummy Coding:
                     Name   Age AgeClass BirthYearClass Division  \
0             Alona Vladi  33.0    24-34          24-39        O   
1      Galina Solovyanova  43.0    40-44          40-49       M1   
2          Daniil Voronin  15.5    16-17          14-18        T   
3          Aleksey Krasov  35.0    35-39          24-39        O   
4  Margarita Pleschenkova  26.5    24-34          24-39        O   

   BodyweightKg WeightClassKg  Squat1Kg  Squat2Kg  Squat3Kg  ...  Event_D  \
0         58.30            60      75.0      80.0     -90.0  ...    False   
1         73.10            75      95.0     100.0     105.0  ...    False   
2         67.40            75      85.0      90.0     100.0  ...    False   
3         66.65            75     125.0     132.0     137.5  ...    False   
4         72.45            75      80.0      85.0      90.0  ...    False   

   Event_S  Event_SB  Event_SBD  Event_SD  Equipment_Raw  \
0    False     False       True     False           Tr

#### Effect Coding

In [None]:
def effect_coding(dataset):
    dummy = pd.get_dummies(dataset)
    coluna_referencia = dummy.columns[-1]
    dummy[coluna_referencia] = dummy[coluna_referencia].replace(1, -1)
    return dummy.drop(columns=[coluna_referencia])

effect = []

for col in colunas_categoricas:
    effect_df = effect_coding(df_power_base[col])
    effect.append(effect_df)

df_effect_coding = pd.concat(effect, axis=1)

print(df_effect_coding.head())

       F      M      B     BD      D      S     SB   SBD  Multi-ply   Raw  \
0   True  False  False  False  False  False  False  True      False  True   
1   True  False  False  False  False  False  False  True      False  True   
2  False   True  False  False  False  False  False  True      False  True   
3  False   True  False  False  False  False  False  True      False  True   
4  False   True  False  False  False  False  False  True      False  True   

   Single-ply  Straps  Unlimited  
0       False   False      False  
1       False   False      False  
2       False   False      False  
3       False   False      False  
4       False   False      False  


#### FeatureHasher

In [None]:
df_categ = df_power_base[colunas_categoricas]
cat_dici = df_categ.to_dict(orient='records')

hasher = FeatureHasher(n_features=32, input_type='dict')
hasher_matrix = hasher.transform(cat_dici)

df_hasher = pd.DataFrame(hasher_matrix.toarray())

print(df_hasher.head())


    0    1    2    3    4    5    6    7    8    9   ...   22   23   24   25  \
0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
1  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
3  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
4  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

    26   27   28   29   30   31  
0  0.0  0.0  0.0  0.0  0.0  1.0  
1  0.0  0.0  0.0  0.0  0.0  1.0  
2  0.0  0.0  0.0  0.0  0.0  0.0  
3  0.0  0.0  0.0  0.0  0.0  0.0  
4  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 32 columns]


#### Bin-Couting

In [None]:
df_bin_count = pd.get_dummies(df_power_base[colunas_categoricas], drop_first=False)

print("Bin Counting:")
print(df_bin_count.head())

Bin Counting:
   Sex_F  Sex_M  Sex_Mx  Event_B  Event_BD  Event_D  Event_S  Event_SB  \
0   True  False   False    False     False    False    False     False   
1   True  False   False    False     False    False    False     False   
2  False   True   False    False     False    False    False     False   
3  False   True   False    False     False    False    False     False   
4  False   True   False    False     False    False    False     False   

   Event_SBD  Event_SD  Equipment_Multi-ply  Equipment_Raw  \
0       True     False                False           True   
1       True     False                False           True   
2       True     False                False           True   
3       True     False                False           True   
4       True     False                False           True   

   Equipment_Single-ply  Equipment_Straps  Equipment_Unlimited  \
0                 False             False                False   
1                 False             