In [None]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder

In [None]:
# Keep only rows whose `sex` is present and is not the literal string "."
# (presumably a placeholder for an unknown value -- confirm against the raw data).
has_valid_sex = df["sex"].notna() & (df["sex"] != ".")
df = df[has_valid_sex]

In [None]:
# Discard every remaining row that has a missing value in any column.
df = df.dropna(axis=0, how="any")

In [None]:
# Separate the target column (`species`) from the feature columns (everything else).
y = df["species"]
X = df.drop(columns="species")

In [None]:
# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Encode the species labels as integers for the classifier.
le = LabelEncoder()

# The label -> integer mapping is learned from the training labels only and
# then reused unchanged for the test labels.
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

In [None]:
# Column groups consumed by the ColumnTransformer: categorical columns are
# one-hot encoded, numerical columns are standardized.
categorical_atributes = [
    'island',
    'sex',
]
numerical_atributes = [
    'culmen_length_mm',
    'culmen_depth_mm',
    'flipper_length_mm',
    'body_mass_g',
]

In [None]:
# Standardize the numeric features (StandardScaler with its default settings).
numeric_transformer = StandardScaler()
# One-hot encode the categorical features. handle_unknown='ignore' makes a
# category that was absent when the encoder was fit encode as an all-zero row
# instead of raising, so transform() cannot fail on a held-out split or a
# cross-validation fold that contains a level the fit data did not.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

In [None]:
# Route each column group to its transformer: numeric columns are scaled,
# categorical columns are one-hot encoded. Any column not named in either
# group is passed through untouched (remainder='passthrough').
column_steps = [
    ('num', numeric_transformer, numerical_atributes),
    ('cat', categorical_transformer, categorical_atributes),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='passthrough')

In [None]:
# Fit the preprocessing on the training split only, then apply the same
# fitted transformation to both splits so no test information is used in fitting.
preprocessor.fit(X_train)
X_train_processed = preprocessor.transform(X_train)
X_test_processed = preprocessor.transform(X_test)

In [None]:
# 5-fold stratified cross-validation splitter; rows are shuffled with a fixed
# seed so the fold assignment is repeatable.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

In [None]:
# Logistic regression classifier; every hyperparameter is left at the library
# default except the seed, which is fixed for repeatability.
regressao = LogisticRegression(random_state=42)

In [None]:
# Cross-validate the preprocessing and the classifier together on the RAW
# training features. Scoring already-preprocessed data would let each
# validation fold influence the scaler/encoder statistics (they would have been
# fit on the whole training split), which makes the CV estimate optimistic.
# cross_val_score clones the pipeline for every fold, so the `preprocessor`
# and `regressao` objects themselves are not fitted or modified here.
cv_pipeline = make_pipeline(preprocessor, regressao)
cv_scores = cross_val_score(cv_pipeline, X_train, y_train_encoded, cv=cv, scoring='accuracy')

# Report the result so the cross-validation estimate is actually visible.
print(f"CV accuracy: {cv_scores.mean():.4f} +/- {cv_scores.std():.4f}")

In [None]:
# Fit the final model on the whole preprocessed training split.
regressao.fit(X_train_processed, y_train_encoded)

In [None]:
# Predict the (integer-encoded) class for each row of the held-out test features.
y_pred = regressao.predict(X_test_processed)

In [None]:
# I was worried that accuracy could be misleading and overlook the minority
# class, so I report F1 instead. Macro averaging computes F1 per class and
# takes the unweighted mean, so every class counts equally regardless of size.
f1_macro = f1_score(y_test_encoded, y_pred, average='macro')
print(f"f1 score: {f1_macro:.4f}")