<a href="https://colab.research.google.com/github/joelma78/GitPY/blob/main/Atividades_ML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Atividade Fundamentos de Machine Learning

Pós-Graduação Lato Sensu em Ciência de Dados

In [3]:
#  Atividade Pós-graduação — Pré-processamento e Modelagem
# Joelma Printes

## 1. Importação de bibliotecas e funções utilitárias

## 2. Funções gerais reutilizáveis
#- limpeza de dados incorretos
#- imputação de valores faltantes
#- codificação (ordinal / one hot)
#- escalonamento (standard / minmax)
#- balanceamento (Tomek / SMOTE)
#- treino e avaliação de modelos
#- salvar e carregar modelos/transformadores


## 2.1 Carregar e inspecionar dados
## 2.2 Limpeza
## 2.3 Imputação
## 2.4 Codificação
## 2.5 Escalonamento
## 2.6 Balanceamento
## 2.7 Treino e avaliação
## 2.8 Salvar modelo e transformadores
## 2.9 Predição com novos dados

# Dataset 1 — Adult (UCI)
# Dataset 2 — Breast Cancer (UCI)
# Dataset 3 — Credit Approval (UCI)
# Dataset 4 — Ames Housing (Kaggle)



Dataset 1 — Adult (UCI)

In [7]:
# ================================
# 1. Carregar e inspecionar dados
# ================================

import pandas as pd
import numpy as np

# Definição dos nomes das colunas (baseado no adult.names)
colunas = [
    "age", "workclass", "fnlwgt", "education", "education-num",
    "marital-status", "occupation", "relationship", "race", "sex",
    "capital-gain", "capital-loss", "hours-per-week", "native-country", "income"
]

# Caminhos dos arquivos - ajuste se necessário
caminho_treino = '/content/adult.data'
caminho_teste = '/content/adult.test'

# Carregar os datasets
df_train = pd.read_csv(caminho_treino, names=colunas, sep=',', na_values=' ?', skipinitialspace=True)
df_test = pd.read_csv(caminho_teste, names=colunas, sep=',', na_values=' ?', skipinitialspace=True, skiprows=1)


# Mostrar primeiras linhas
print("Treino:")
display(df_train.head())

print("Teste:")
display(df_test.head())

# Verificar dimensões
print(f"Treino: {df_train.shape}")
print(f"Teste: {df_test.shape}")

Treino:


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


Teste:


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


Treino: (32561, 15)
Teste: (16281, 15)


In [9]:
# ================================
#  2. Limpeza de dados incorretos
# ================================

# Exemplo de regra: idade >= 0 e <= 100
df_train.loc[(df_train['age'] < 0) | (df_train['age'] > 100), 'age'] = np.nan
df_test.loc[(df_test['age'] < 0) | (df_test['age'] > 100), 'age'] = np.nan


# Horas por semana: mínimo 1, máximo 100
df_train.loc[(df_train['hours-per-week'] < 1) | (df_train['hours-per-week'] > 100), 'hours-per-week'] = np.nan
df_test.loc[(df_test['hours-per-week'] < 1) | (df_test['hours-per-week'] > 100), 'hours-per-week'] = np.nan

# Conferir quantos NaN temos agora
print("Valores NaN no treino após limpeza:")
display(df_train.isna().sum())
print("\nValores NaN no teste após limpeza:")
display(df_test.isna().sum())

Valores NaN no treino após limpeza:


Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education-num,0
marital-status,0
occupation,0
relationship,0
race,0
sex,0



Valores NaN no teste após limpeza:


Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education-num,0
marital-status,0
occupation,0
relationship,0
race,0
sex,0


In [10]:
# ================================
#  3. Imputação de valores faltantes
# ================================
from sklearn.impute import SimpleImputer

# Separar colunas numéricas e categóricas para treino e teste
colunas_num_train = df_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
colunas_cat_train = df_train.select_dtypes(include=['object']).columns.tolist()
if 'income' in colunas_cat_train:
    colunas_cat_train.remove('income')  # income é o alvo, não vamos imputar

colunas_num_test = df_test.select_dtypes(include=['int64', 'float64']).columns.tolist()
colunas_cat_test = df_test.select_dtypes(include=['object']).columns.tolist()
# O income no df_test tem um '.' no final, removeremos antes de remover a coluna
df_test['income'] = df_test['income'].str.replace('.', '', regex=False)
if 'income' in colunas_cat_test:
    colunas_cat_test.remove('income') # income é o alvo, não vamos imputar


# Imputação numérica no treino: média
imp_mean = SimpleImputer(strategy='mean')
df_train[colunas_num_train] = imp_mean.fit_transform(df_train[colunas_num_train])

# Imputação categórica no treino: moda
imp_mode = SimpleImputer(strategy='most_frequent')
df_train[colunas_cat_train] = imp_mode.fit_transform(df_train[colunas_cat_train])

# Imputação numérica no teste: usar o FIT do treino (para evitar data leakage)
df_test[colunas_num_test] = imp_mean.transform(df_test[colunas_num_test])

# Imputação categórica no teste: usar o FIT do treino (para evitar data leakage)
df_test[colunas_cat_test] = imp_mode.transform(df_test[colunas_cat_test])


print("Valores NaN no treino após imputação:")
display(df_train.isna().sum())
print("\nValores NaN no teste após imputação:")
display(df_test.isna().sum())

Valores NaN no treino após imputação:


Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education-num,0
marital-status,0
occupation,0
relationship,0
race,0
sex,0



Valores NaN no teste após imputação:


Unnamed: 0,0
age,0
workclass,0
fnlwgt,0
education,0
education-num,0
marital-status,0
occupation,0
relationship,0
race,0
sex,0


In [12]:
# ================================
# 📌 4. Codificação de variáveis categóricas
# ================================
from sklearn.preprocessing import OneHotEncoder

# Aplicando OneHotEncoder nas categóricas do treino
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_train = encoder.fit_transform(df_train[colunas_cat_train])

# Criando dataframe com as colunas novas para o treino
encoded_train_df = pd.DataFrame(encoded_train, columns=encoder.get_feature_names_out(colunas_cat_train))

# Concatenando com as numéricas + target para o treino
# Resetar index para garantir a concatenação correta
df_train_reset = df_train.reset_index(drop=True)
df_train_encoded = pd.concat([df_train_reset[colunas_num_train], encoded_train_df, df_train_reset['income']], axis=1)

# Aplicando OneHotEncoder nas categóricas do teste (usando o FIT do treino)
encoded_test = encoder.transform(df_test[colunas_cat_test])

# Criando dataframe com as colunas novas para o teste
encoded_test_df = pd.DataFrame(encoded_test, columns=encoder.get_feature_names_out(colunas_cat_test))

# Definir colunas numéricas para o teste (já feito na imputação, mas redefinir para clareza)
colunas_num_test = df_test.select_dtypes(include=['int64', 'float64']).columns.tolist()
# Concatenando com as numéricas + target para o teste
# Resetar index para garantir a concatenação correta
df_test_reset = df_test.reset_index(drop=True)
df_test_encoded = pd.concat([df_test_reset[colunas_num_test], encoded_test_df, df_test_reset['income']], axis=1)


print("Treino codificado:")
display(df_train_encoded.head())
print("\nTeste codificado:")
display(df_test_encoded.head())

Treino codificado:


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,income
0,39.0,77516.0,13.0,2174.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
1,50.0,83311.0,13.0,0.0,0.0,13.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
2,38.0,215646.0,9.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
3,53.0,234721.0,7.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
4,28.0,338409.0,13.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,<=50K



Teste codificado:


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,income
0,25.0,226802.0,7.0,0.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
1,38.0,89814.0,9.0,0.0,0.0,50.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
2,28.0,336951.0,12.0,0.0,0.0,40.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,>50K
3,44.0,160323.0,10.0,7688.0,0.0,40.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,>50K
4,18.0,103497.0,10.0,0.0,0.0,30.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K


In [13]:
# ================================
# 📌 5. Escalonamento das variáveis numéricas
# ================================
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Aplicando o escalonamento nas colunas numéricas do treino
df_train_encoded[colunas_num_train] = scaler.fit_transform(df_train_encoded[colunas_num_train])

# Aplicando o escalonamento nas colunas numéricas do teste (usando o FIT do treino)
df_test_encoded[colunas_num_test] = scaler.transform(df_test_encoded[colunas_num_test])


print("Treino escalonado:")
display(df_train_encoded.head())
print("\nTeste escalonado:")
display(df_test_encoded.head())

Treino escalonado:


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,income
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
1,0.837109,-1.008707,1.134739,-0.14592,-0.21666,-2.222153,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
2,-0.042642,0.245079,-0.42006,-0.14592,-0.21666,-0.035429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
3,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
4,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,<=50K



Teste escalonado:


Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,workclass_?,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native-country_Puerto-Rico,native-country_Scotland,native-country_South,native-country_Taiwan,native-country_Thailand,native-country_Trinadad&Tobago,native-country_United-States,native-country_Vietnam,native-country_Yugoslavia,income
0,-0.995706,0.350774,-1.197459,-0.14592,-0.21666,-0.035429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
1,-0.042642,-0.947095,-0.42006,-0.14592,-0.21666,0.774468,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K
2,-0.775768,1.394362,0.746039,-0.14592,-0.21666,-0.035429,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,>50K
3,0.397233,-0.27907,-0.03136,0.895083,-0.21666,-0.035429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,>50K
4,-1.508894,-0.817458,-0.03136,-0.14592,-0.21666,-0.845327,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,<=50K


In [14]:
# ================================
# 📌 6. Balanceamento dos dados
# ================================
from imblearn.over_sampling import SMOTE

# Separar features (X) e target (y) do treino
X = df_train_encoded.drop(columns=['income'])
y = df_train_encoded['income'].apply(lambda x: 1 if x.strip() == '>50K' else 0)

print("Distribuição antes do balanceamento:")
print(y.value_counts())

smote = SMOTE(random_state=42) # Adicionado random_state para reprodutibilidade
X_bal, y_bal = smote.fit_resample(X, y)

print("Distribuição após SMOTE:")
print(pd.Series(y_bal).value_counts())

Distribuição antes do balanceamento:
income
0    24720
1     7841
Name: count, dtype: int64
Distribuição após SMOTE:
income
0    24720
1    24720
Name: count, dtype: int64


In [15]:
# ================================
# 📌 7. Treinamento e avaliação de modelos
# ================================
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Usar dados balanceados para treino (pode opcionalmente dividir em treino/validação aqui)
X_train = X_bal
y_train = y_bal

# Preparar dados de teste
X_test = df_test_encoded.drop(columns=['income'])
y_test = df_test_encoded['income'].apply(lambda x: 1 if x.strip() == '>50K' else 0)

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

y_pred_train = model.predict(X_train)
y_pred_test = model.predict(X_test)

print("📊 Avaliação no treino:")
print(classification_report(y_train, y_pred_train))

print("📊 Avaliação no teste:")
print(classification_report(y_test, y_pred_test))

📊 Avaliação no treino:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     24720
           1       1.00      1.00      1.00     24720

    accuracy                           1.00     49440
   macro avg       1.00      1.00      1.00     49440
weighted avg       1.00      1.00      1.00     49440

📊 Avaliação no teste:
              precision    recall  f1-score   support

           0       0.90      0.89      0.90     12435
           1       0.66      0.69      0.67      3846

    accuracy                           0.84     16281
   macro avg       0.78      0.79      0.78     16281
weighted avg       0.84      0.84      0.84     16281



In [16]:
# ================================
# 📌 8. Salvando modelo e transformadores
# ================================
import joblib

joblib.dump(model, 'modelo_adult.pkl')
joblib.dump(scaler, 'scaler_adult.pkl')
joblib.dump(encoder, 'encoder_adult.pkl')


['encoder_adult.pkl']

In [18]:
# ================================
# 📌 9. Predição com novos dados
# ================================

# Definir colunas numéricas e categóricas com base no dataframe original (excluindo o alvo)
colunas_num = df_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
colunas_cat = df_train.select_dtypes(include=['object']).columns.tolist()
if 'income' in colunas_cat:
    colunas_cat.remove('income')

# Criar um exemplo
novo_dado = pd.DataFrame([{
    'age': 39, 'workclass': 'State-gov', 'fnlwgt': 77516, 'education': 'Bachelors',
    'education-num': 13, 'marital-status': 'Never-married', 'occupation': 'Adm-clerical',
    'relationship': 'Not-in-family', 'race': 'White', 'sex': 'Male',
    'capital-gain': 2174, 'capital-loss': 0, 'hours-per-week': 40, 'native-country': 'United-States'
}])

# Imputar valores (aqui não tem faltante, mas deixamos a lógica)
# Use os imputers FITADOS nos dados de treino
novo_dado[colunas_num] = imp_mean.transform(novo_dado[colunas_num])
novo_dado[colunas_cat] = imp_mode.transform(novo_dado[colunas_cat])

# Codificar
# Use o encoder FITADO nos dados de treino
novo_encoded = encoder.transform(novo_dado[colunas_cat])
novo_encoded_df = pd.DataFrame(novo_encoded, columns=encoder.get_feature_names_out(colunas_cat))

# Para concatenar, precisamos garantir que todas as colunas codificadas que o modelo espera estejam presentes.
# Crie um DataFrame vazio com todas as colunas esperadas pelo encoder e depois preencha com os valores do novo dado.
expected_columns = encoder.get_feature_names_out(colunas_cat)
novo_encoded_aligned = pd.DataFrame(0, index=novo_dado.index, columns=expected_columns)
novo_encoded_aligned.update(novo_encoded_df)


# Concatenar com as numéricas
novo_dado_final = pd.concat([novo_dado[colunas_num], novo_encoded_aligned], axis=1)


# Escalar
# Use o scaler FITADO nos dados de treino
novo_dado_final[colunas_num] = scaler.transform(novo_dado_final[colunas_num])

# Predizer
modelo_carregado = joblib.load('modelo_adult.pkl')

# Garantir que as colunas estejam na ordem correta para a predição.
# A ordem das colunas no X_train (usado para treinar o modelo) define a ordem esperada.
novo_dado_final_aligned = novo_dado_final[X_train.columns]

predicao = modelo_carregado.predict(novo_dado_final_aligned)
print("Predição:", " >50K" if predicao[0] == 1 else " <=50K")

Predição:  <=50K


---

Dataset 2 — Breast Cancer (UCI)

In [19]:
# ========================================
# Dataset 2 — Breast Cancer (UCI)
# 1. Carregar e inspecionar dados
# ========================================

# O dataset Breast Cancer Wisconsin (Diagnostic) geralmente está disponível no scikit-learn
from sklearn.datasets import load_breast_cancer

breast_cancer = load_breast_cancer()
X_bc, y_bc = breast_cancer.data, breast_cancer.target
feature_names_bc = breast_cancer.feature_names
target_names_bc = breast_cancer.target_names

# Criar um dataframe para facilitar a inspeção
df_bc = pd.DataFrame(X_bc, columns=feature_names_bc)
df_bc['target'] = y_bc

print("Primeiras linhas do dataset Breast Cancer:")
display(df_bc.head())

print("\nInformações do dataset Breast Cancer:")
df_bc.info()

print("\nDescrição estatística do dataset Breast Cancer:")
display(df_bc.describe())

print("\nDistribuição da variável alvo:")
display(df_bc['target'].value_counts())

print("\nNomes das features:", feature_names_bc)
print("Nomes das classes:", target_names_bc)

Primeiras linhas do dataset Breast Cancer:


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0



Informações do dataset Breast Cancer:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
count,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,...,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0,569.0
mean,14.127292,19.289649,91.969033,654.889104,0.09636,0.104341,0.088799,0.048919,0.181162,0.062798,...,25.677223,107.261213,880.583128,0.132369,0.254265,0.272188,0.114606,0.290076,0.083946,0.627417
std,3.524049,4.301036,24.298981,351.914129,0.014064,0.052813,0.07972,0.038803,0.027414,0.00706,...,6.146258,33.602542,569.356993,0.022832,0.157336,0.208624,0.065732,0.061867,0.018061,0.483918
min,6.981,9.71,43.79,143.5,0.05263,0.01938,0.0,0.0,0.106,0.04996,...,12.02,50.41,185.2,0.07117,0.02729,0.0,0.0,0.1565,0.05504,0.0
25%,11.7,16.17,75.17,420.3,0.08637,0.06492,0.02956,0.02031,0.1619,0.0577,...,21.08,84.11,515.3,0.1166,0.1472,0.1145,0.06493,0.2504,0.07146,0.0
50%,13.37,18.84,86.24,551.1,0.09587,0.09263,0.06154,0.0335,0.1792,0.06154,...,25.41,97.66,686.5,0.1313,0.2119,0.2267,0.09993,0.2822,0.08004,1.0
75%,15.78,21.8,104.1,782.7,0.1053,0.1304,0.1307,0.074,0.1957,0.06612,...,29.72,125.4,1084.0,0.146,0.3391,0.3829,0.1614,0.3179,0.09208,1.0
max,28.11,39.28,188.5,2501.0,0.1634,0.3454,0.4268,0.2012,0.304,0.09744,...,49.54,251.2,4254.0,0.2226,1.058,1.252,0.291,0.6638,0.2075,1.0



Distribuição da variável alvo:


Unnamed: 0_level_0,count
target,Unnamed: 1_level_1
1,357
0,212



Nomes das features: ['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']
Nomes das classes: ['malignant' 'benign']


# ================================
# 2. Limpeza de dados incorretos
# ================================

In [20]:
# Verificar se há valores faltantes (embora improvável para este dataset do scikit-learn)
print("Valores NaN no dataset Breast Cancer após carregamento:")
display(df_bc.isna().sum())

# Adicionar quaisquer regras de limpeza específicas para este dataset, se necessário.
# Por exemplo, verificar ranges de valores ou outliers.
# No entanto, para o dataset Breast Cancer do scikit-learn, geralmente não há a necessidade de regras de limpeza explícitas
# como fizemos para o dataset Adult.

Valores NaN no dataset Breast Cancer após carregamento:


Unnamed: 0,0
mean radius,0
mean texture,0
mean perimeter,0
mean area,0
mean smoothness,0
mean compactness,0
mean concavity,0
mean concave points,0
mean symmetry,0
mean fractal dimension,0


# ================================
# 3. Imputação de valores faltantes
# ================================

In [21]:
# Verificar novamente se há valores faltantes.
# Para este dataset (Breast Cancer do scikit-learn), não há valores faltantes,
# então esta etapa serve mais como um placeholder e confirmação.
print("Valores NaN no dataset Breast Cancer antes da imputação:")
display(df_bc.isna().sum())

# Se houvesse valores faltantes, usaríamos um imputer aqui.
# Exemplo (comentado, pois não é necessário para este dataset):
# from sklearn.impute import SimpleImputer
# imputer = SimpleImputer(strategy='mean') # ou 'median', 'most_frequent'
# df_bc_imputed = imputer.fit_transform(df_bc.drop(columns=['target']))
# df_bc_imputed = pd.DataFrame(df_bc_imputed, columns=df_bc.drop(columns=['target']).columns)
# df_bc_imputed['target'] = df_bc['target']
# print("\nValores NaN após imputação (se houvesse):")
# display(df_bc_imputed.isna().sum())

Valores NaN no dataset Breast Cancer antes da imputação:


Unnamed: 0,0
mean radius,0
mean texture,0
mean perimeter,0
mean area,0
mean smoothness,0
mean compactness,0
mean concavity,0
mean concave points,0
mean symmetry,0
mean fractal dimension,0


# ================================
# 4. Escalonamento das variáveis numéricas
# ================================

In [22]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separar features (X) e target (y)
X_bc = df_bc.drop(columns=['target'])
y_bc = df_bc['target']

# Dividir em treino e teste ANTES do escalonamento
X_train_bc, X_test_bc, y_train_bc, y_test_bc = train_test_split(X_bc, y_bc, test_size=0.25, random_state=42, stratify=y_bc) # Adicionado stratify

# Aplicar escalonamento (FIT apenas no treino)
scaler_bc = StandardScaler()
X_train_scaled_bc = scaler_bc.fit_transform(X_train_bc)
X_test_scaled_bc = scaler_bc.transform(X_test_bc)

# Opcional: converter de volta para DataFrame para melhor visualização (mantendo os nomes das colunas)
X_train_scaled_bc_df = pd.DataFrame(X_train_scaled_bc, columns=X_train_bc.columns)
X_test_scaled_bc_df = pd.DataFrame(X_test_scaled_bc, columns=X_test_bc.columns)


print("Features de treino escalonadas:")
display(X_train_scaled_bc_df.head())

print("\nFeatures de teste escalonadas:")
display(X_test_scaled_bc_df.head())

print("\nDimensões após escalonamento e divisão:")
print(f"Treino (features): {X_train_scaled_bc.shape}")
print(f"Teste (features): {X_test_scaled_bc.shape}")
print(f"Treino (target): {y_train_bc.shape}")
print(f"Teste (target): {y_test_bc.shape}")

Features de treino escalonadas:


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,1.659096,0.217205,1.61062,1.633339,0.576312,0.523545,0.645326,1.198745,-9.4e-05,-0.124425,...,1.567319,-0.075879,1.607223,1.384969,0.412628,0.461629,0.642584,0.701835,-0.556084,0.388781
1,-0.338165,-1.389968,-0.401667,-0.387017,-1.985604,-1.257886,-0.8205,-0.949158,-1.684127,-0.96426,...,-0.53772,-1.613244,-0.580788,-0.52916,-1.6004,-0.871596,-0.726165,-0.900606,-0.923646,-0.797233
2,0.874457,-0.651659,1.01037,0.761353,1.694102,2.359914,1.657179,2.389453,4.483419,1.570465,...,1.259163,-0.683527,1.364776,1.053712,0.978433,0.856293,0.491059,2.096751,1.767211,1.165217
3,0.920109,-0.498594,0.88618,0.806211,0.358755,0.012174,0.465964,0.918425,0.039744,-0.919986,...,0.75945,-0.09809,0.721243,0.625763,0.408208,-0.095834,0.274268,1.065079,0.345973,-0.157501
4,2.263981,0.58636,2.301943,2.408951,0.771362,1.747791,1.928079,2.64949,0.079581,-0.190837,...,2.385598,0.014555,2.639868,2.425295,-0.131075,0.816827,0.90319,1.921083,-0.262035,0.088673



Features de teste escalonadas:


Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,-0.378111,-0.58413,-0.376001,-0.450976,1.236483,0.156651,-0.620108,-0.474851,1.151568,0.477434,...,-0.364903,-0.629584,-0.394313,-0.444823,0.669009,-0.358532,-0.617998,-0.501782,0.243362,0.079595
1,1.116982,0.307243,1.084884,0.989985,0.56881,0.512138,0.38556,1.040903,0.688006,-0.289072,...,1.015553,-0.047321,0.936752,0.853167,0.699951,0.724327,0.239125,1.239257,0.226516,0.050759
2,0.252453,-0.043904,0.225077,0.109028,-0.457457,-0.099414,-0.36529,-0.000544,0.278768,-0.589309,...,0.009881,0.07643,0.068731,-0.112313,-0.047088,0.28958,-0.444109,0.517236,0.076428,0.034205
3,-0.341019,-0.241987,-0.295692,-0.453002,1.934164,1.190797,-0.503213,0.125313,-0.3369,1.349091,...,-0.239975,-0.212322,-0.2243,-0.35422,0.585022,0.24703,-0.698781,-0.067825,-0.505544,0.259019
4,0.149737,0.899241,0.098403,0.043622,-0.690017,-0.705264,-0.062228,0.116992,-0.63387,-1.192552,...,-0.085897,0.952204,-0.137798,-0.204707,-0.051509,-0.603964,-0.046583,0.323704,-0.666353,-0.839419



Dimensões após escalonamento e divisão:
Treino (features): (426, 30)
Teste (features): (143, 30)
Treino (target): (426,)
Teste (target): (143,)


# ================================
# 5. Balanceamento dos dados
# ================================

In [23]:
from imblearn.over_sampling import SMOTE
import pandas as pd

# Usar os dados de treino escalonados para balanceamento
X_train_bc_bal, y_train_bc_bal = SMOTE(random_state=42).fit_resample(X_train_scaled_bc, y_train_bc)

print("Distribuição da variável alvo no treino ANTES do balanceamento:")
print(pd.Series(y_train_bc).value_counts())

print("\nDistribuição da variável alvo no treino APÓS SMOTE:")
print(pd.Series(y_train_bc_bal).value_counts())

Distribuição da variável alvo no treino ANTES do balanceamento:
target
1    267
0    159
Name: count, dtype: int64

Distribuição da variável alvo no treino APÓS SMOTE:
target
0    267
1    267
Name: count, dtype: int64


# ================================
# 6. Treinamento e avaliação de modelos
# ================================

In [24]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Treinar o modelo com os dados de treino balanceados
model_bc = RandomForestClassifier(random_state=42)
model_bc.fit(X_train_bc_bal, y_train_bc_bal)

# Fazer previsões nos dados de treino e teste
y_pred_train_bc = model_bc.predict(X_train_bc_bal)
y_pred_test_bc = model_bc.predict(X_test_scaled_bc)

# Avaliar o modelo
print("📊 Avaliação no treino (Dataset Breast Cancer):")
print(classification_report(y_train_bc_bal, y_pred_train_bc))

print("\n📊 Avaliação no teste (Dataset Breast Cancer):")
print(classification_report(y_test_bc, y_pred_test_bc))

📊 Avaliação no treino (Dataset Breast Cancer):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       267
           1       1.00      1.00      1.00       267

    accuracy                           1.00       534
   macro avg       1.00      1.00      1.00       534
weighted avg       1.00      1.00      1.00       534


📊 Avaliação no teste (Dataset Breast Cancer):
              precision    recall  f1-score   support

           0       0.93      0.96      0.94        53
           1       0.98      0.96      0.97        90

    accuracy                           0.96       143
   macro avg       0.95      0.96      0.96       143
weighted avg       0.96      0.96      0.96       143



# ================================
# 7. Salvando modelo e transformadores
# ================================

In [25]:
import joblib

joblib.dump(model_bc, 'modelo_breast_cancer.pkl')
joblib.dump(scaler_bc, 'scaler_breast_cancer.pkl')

print("Modelo e scaler do Breast Cancer salvos.")

Modelo e scaler do Breast Cancer salvos.


# ================================
# 8. Predição com novos dados
# ================================

In [26]:
# ================================
# 📌 8. Predição com novos dados
# ================================
import joblib
import pandas as pd

# Carregar o modelo e o scaler salvos para o Breast Cancer
modelo_bc_carregado = joblib.load('modelo_breast_cancer.pkl')
scaler_bc_carregado = joblib.load('scaler_breast_cancer.pkl')

# Criar um exemplo de novo dado (usando os nomes das features do dataset Breast Cancer)
# Os valores são apenas exemplos, você pode substituí-los
novo_dado_bc = pd.DataFrame([{
    'mean radius': 15.0, 'mean texture': 20.0, 'mean perimeter': 100.0,
    'mean area': 750.0, 'mean smoothness': 0.1, 'mean compactness': 0.1,
    'mean concavity': 0.05, 'mean concave points': 0.04, 'mean symmetry': 0.18,
    'mean fractal dimension': 0.06, 'radius error': 0.3, 'texture error': 1.0,
    'perimeter error': 2.0, 'area error': 30.0, 'smoothness error': 0.007,
    'compactness error': 0.02, 'concavity error': 0.02, 'concave points error': 0.01,
    'symmetry error': 0.02, 'fractal dimension error': 0.003, 'worst radius': 17.0,
    'worst texture': 25.0, 'worst perimeter': 115.0, 'worst area': 1000.0,
    'worst smoothness': 0.13, 'worst compactness': 0.25, 'worst concavity': 0.2,
    'worst concave points': 0.1, 'worst symmetry': 0.28, 'worst fractal dimension': 0.08
}])

# O scaler espera os dados na mesma ordem das colunas de treino.
# Garantir que as colunas do novo dado estejam na ordem correta.
# Podemos usar as colunas de X_train_bc (antes do escalonamento, apenas para obter a ordem dos nomes)
novo_dado_bc_ordered = novo_dado_bc[X_train_bc.columns]


# Escalonar o novo dado usando o scaler carregado
novo_dado_bc_scaled = scaler_bc_carregado.transform(novo_dado_bc_ordered)

# Predizer usando o modelo carregado
predicao_bc = modelo_bc_carregado.predict(novo_dado_bc_scaled)

# Os nomes das classes são 'malignant' (0) e 'benign' (1)
print("Predição para o novo dado (Breast Cancer):", breast_cancer.target_names[predicao_bc[0]])

Predição para o novo dado (Breast Cancer): malignant


In [34]:
# ========================================
# Dataset 3 — Credit Approval (UCI)
# 1. Carregar e inspecionar dados
# ========================================

# Importando bibliotecas
import pandas as pd

# URL do dataset direto do UCI
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/credit-screening/crx.data"

# Nomes das colunas conforme descrito na documentação do UCI
col_names = [
    'A1', 'A2', 'A3', 'A4', 'A5', 'A6', 'A7', 'A8', 'A9',
    'A10', 'A11', 'A12', 'A13', 'A14', 'A15', 'Class'
]

# Carregando o dataset
df_credit = pd.read_csv(url, names=col_names, na_values='?')

# Exibir as 5 primeiras linhas (opcional, já foi feito, mas para referência)
# display(df_credit.head())

# ================================
# 2. Limpeza de dados incorretos (Verificação inicial de NaNs e tipos)
# ================================

# Verificar valores faltantes
print("Valores NaN no dataset Credit Approval após carregamento:")
display(df_credit.isna().sum())

# Verificar tipos de dados
print("\nTipos de dados das colunas:")
display(df_credit.info())

# Nota: A coluna A2 e A14 foram carregadas como 'object' devido aos valores faltantes '?'.
# Precisaremos convertê-las para numérico mais tarde.

Valores NaN no dataset Credit Approval após carregamento:


Unnamed: 0,0
A1,12
A2,12
A3,0
A4,6
A5,6
A6,9
A7,9
A8,0
A9,0
A10,0



Tipos de dados das colunas:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A1      678 non-null    object 
 1   A2      678 non-null    float64
 2   A3      690 non-null    float64
 3   A4      684 non-null    object 
 4   A5      684 non-null    object 
 5   A6      681 non-null    object 
 6   A7      681 non-null    object 
 7   A8      690 non-null    float64
 8   A9      690 non-null    object 
 9   A10     690 non-null    object 
 10  A11     690 non-null    int64  
 11  A12     690 non-null    object 
 12  A13     690 non-null    object 
 13  A14     677 non-null    float64
 14  A15     690 non-null    int64  
 15  Class   690 non-null    object 
dtypes: float64(4), int64(2), object(10)
memory usage: 86.4+ KB


None

# ================================
# 4. Codificação de variáveis categóricas
# ================================

In [39]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import pandas as pd

# Separar features (X) e target (y)
X_credit = df_credit.drop(columns=['Class'])
y_credit = df_credit['Class'].map({'+': 1, '-': 0}) # Converter target para numérico

# Identificar colunas categóricas e numéricas após imputação (vamos assumir que as colunas com object dtype são categóricas)
# Vamos também tratar A2 e A14 que eram numéricas mas podem ter sido imputadas e precisam ser tratadas como numéricas
colunas_num_credit = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15'] # Colunas numéricas conhecidas
colunas_cat_credit = [col for col in X_credit.columns if col not in colunas_num_credit] # O restante é categórico

# Dividir em treino e teste ANTES da codificação para evitar data leakage
X_train_credit, X_test_credit, y_train_credit, y_test_credit = train_test_split(X_credit, y_credit, test_size=0.25, random_state=42, stratify=y_credit)

# --- Início da modificação para garantir imputação antes da concatenação ---

# Aplicar a imputação numérica nas colunas numéricas dos conjuntos de treino e teste divididos
# Usando o imputer já FITADO na etapa 3 (célula 5b9f93b0)
X_train_credit_num_imputed = imp_mean_credit.transform(X_train_credit[colunas_num_credit])
X_test_credit_num_imputed = imp_mean_credit.transform(X_test_credit[colunas_num_credit])

# Converter os arrays numpy imputados de volta para dataframes para facilitar a concatenação
X_train_credit_num_imputed_df = pd.DataFrame(X_train_credit_num_imputed, columns=colunas_num_credit, index=X_train_credit.index)
X_test_credit_num_imputed_df = pd.DataFrame(X_test_credit_num_imputed, columns=colunas_num_credit, index=X_test_credit.index)


# Aplicar a imputação categórica nas colunas categóricas dos conjuntos de treino e teste divididos
# Usando o imputer já FITADO na etapa 3 (célula 5b9f93b0)
X_train_credit_cat_imputed = imp_mode_credit.transform(X_train_credit[colunas_cat_credit])
X_test_credit_cat_imputed = imp_mode_credit.transform(X_test_credit[colunas_cat_credit])

# Converter os arrays numpy imputados de volta para dataframes
X_train_credit_cat_imputed_df = pd.DataFrame(X_train_credit_cat_imputed, columns=colunas_cat_credit, index=X_train_credit.index)
X_test_credit_cat_imputed_df = pd.DataFrame(X_test_credit_cat_imputed, columns=colunas_cat_credit, index=X_test_credit.index)


# Aplicando OneHotEncoder nas categóricas imputadas (FIT apenas no treino)
encoder_credit = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

# Fit e transform no treino
X_train_cat_encoded = encoder_credit.fit_transform(X_train_credit_cat_imputed_df)
X_train_cat_encoded_df = pd.DataFrame(X_train_cat_encoded, columns=encoder_credit.get_feature_names_out(colunas_cat_credit), index=X_train_credit.index)

# Transform no teste
X_test_cat_encoded = encoder_credit.transform(X_test_credit_cat_imputed_df)
X_test_cat_encoded_df = pd.DataFrame(X_test_cat_encoded, columns=encoder_credit.get_feature_names_out(colunas_cat_credit), index=X_test_credit.index)

# --- Fim da modificação para garantir imputação antes da concatenação ---


# Concatenar colunas numéricas imputadas e categóricas codificadas para treino e teste
X_train_processed_credit = pd.concat([X_train_credit_num_imputed_df, X_train_cat_encoded_df], axis=1)
X_test_processed_credit = pd.concat([X_test_credit_num_imputed_df, X_test_cat_encoded_df], axis=1)


print("Features de treino processadas (após imputação e codificação):")
display(X_train_processed_credit.head())

print("\nFeatures de teste processadas (após imputação e codificação):")
display(X_test_processed_credit.head())

print("\nDimensões após imputação, codificação e divisão:")
print(f"Treino (features): {X_train_processed_credit.shape}")
print(f"Teste (features): {X_test_processed_credit.shape}")
print(f"Treino (target): {y_train_credit.shape}")
print(f"Teste (target): {y_test_credit.shape}")

Features de treino processadas (após imputação e codificação):


Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A4_l,A4_u,...,A7_z,A9_f,A9_t,A10_f,A10_t,A12_f,A12_t,A13_g,A13_p,A13_s
394,41.17,1.25,0.25,0.0,0.0,195.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
209,39.5,4.25,6.5,16.0,117.0,1210.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
449,20.0,7.0,0.5,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
128,34.42,4.25,3.25,2.0,274.0,610.0,0.0,1.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
182,20.67,3.0,0.165,3.0,100.0,6.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0



Features de teste processadas (após imputação e codificação):


Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A4_l,A4_u,...,A7_z,A9_f,A9_t,A10_f,A10_t,A12_f,A12_t,A13_g,A13_p,A13_s
235,20.67,1.835,2.085,5.0,220.0,2503.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
650,48.08,3.75,1.0,0.0,100.0,2.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
657,38.92,1.665,0.25,0.0,0.0,390.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
149,52.83,15.0,5.5,14.0,0.0,2200.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
666,21.75,11.75,0.25,0.0,180.0,0.0,1.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0



Dimensões após imputação, codificação e divisão:
Treino (features): (517, 46)
Teste (features): (173, 46)
Treino (target): (517,)
Teste (target): (173,)


# ================================
# 3. Imputação de valores faltantes
# ================================

In [36]:
from sklearn.impute import SimpleImputer
import numpy as np

# Precisamos garantir que as colunas numéricas estejam com dtype numérico antes de imputar com a média
# As colunas A2 e A14 foram carregadas como object devido aos '?'
# Vamos tentar converter para float, coercing erros para NaN (embora já tenhamos na_values='?')
# É uma boa prática garantir o dtype antes da imputação numérica.
for col in ['A2', 'A14']:
    X_train_credit[col] = pd.to_numeric(X_train_credit[col], errors='coerce')
    X_test_credit[col] = pd.to_numeric(X_test_credit[col], errors='coerce')


# Identificar colunas numéricas e categóricas no X_train_credit e X_test_credit
# (re-identificar após possível conversão de dtype)
colunas_num_credit_impute = X_train_credit.select_dtypes(include=np.number).columns.tolist()
colunas_cat_credit_impute = X_train_credit.select_dtypes(include='object').columns.tolist()


# Imputação numérica no treino: média
imp_mean_credit = SimpleImputer(strategy='mean')
X_train_credit[colunas_num_credit_impute] = imp_mean_credit.fit_transform(X_train_credit[colunas_num_credit_impute])

# Imputação categórica no treino: moda
imp_mode_credit = SimpleImputer(strategy='most_frequent')
X_train_credit[colunas_cat_credit_impute] = imp_mode_credit.fit_transform(X_train_credit[colunas_cat_credit_impute])


# Imputação numérica no teste: usar o FIT do treino (para evitar data leakage)
X_test_credit[colunas_num_credit_impute] = imp_mean_credit.transform(X_test_credit[colunas_num_credit_impute])

# Imputação categórica no teste: usar o FIT do treino (para evitar data leakage)
X_test_credit[colunas_cat_credit_impute] = imp_mode_credit.transform(X_test_credit[colunas_cat_credit_impute])


print("Valores NaN no treino após imputação:")
display(X_train_credit.isna().sum())
print("\nValores NaN no teste após imputação:")
display(X_test_credit.isna().sum())

Valores NaN no treino após imputação:


Unnamed: 0,0
A1,0
A2,0
A3,0
A4,0
A5,0
A6,0
A7,0
A8,0
A9,0
A10,0



Valores NaN no teste após imputação:


Unnamed: 0,0
A1,0
A2,0
A3,0
A4,0
A5,0
A6,0
A7,0
A8,0
A9,0
A10,0


# ================================
# 5. Escalonamento das variáveis numéricas
# ================================

In [37]:
from sklearn.preprocessing import StandardScaler

# Identificar colunas numéricas nos dataframes processados (após imputação e codificação)
# Usamos X_train_processed_credit para obter os nomes das colunas numéricas
colunas_num_processed_credit = X_train_processed_credit.select_dtypes(include=np.number).columns.tolist()


scaler_credit = StandardScaler()

# Aplicando o escalonamento nas colunas numéricas do treino
X_train_processed_credit[colunas_num_processed_credit] = scaler_credit.fit_transform(X_train_processed_credit[colunas_num_processed_credit])

# Aplicando o escalonamento nas colunas numéricas do teste (usando o FIT do treino)
X_test_processed_credit[colunas_num_processed_credit] = scaler_credit.transform(X_test_processed_credit[colunas_num_processed_credit])


print("Features de treino escalonadas (Credit Approval):")
display(X_train_processed_credit.head())

print("\nFeatures de teste escalonadas (Credit Approval):")
display(X_test_processed_credit.head())

Features de treino escalonadas (Credit Approval):


Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A1_nan,A4_l,...,A7_nan,A9_f,A9_t,A10_f,A10_t,A12_f,A12_t,A13_g,A13_p,A13_s
394,0.822004,-0.700459,-0.569162,-0.491625,-1.135979,-0.152354,-0.660387,0.693812,-0.147442,-0.062318,...,-0.125368,1.045523,-1.045523,0.888216,-0.888216,0.920016,-0.920016,0.334407,-0.108359,-0.312513
209,0.68007,-0.107383,1.321142,2.610373,-0.402747,0.035041,-0.660387,0.693812,-0.147442,-0.062318,...,-0.125368,-0.95646,0.95646,-1.125853,1.125853,0.920016,-0.920016,0.334407,-0.108359,-0.312513
449,-0.977249,0.43627,-0.49355,-0.491625,-1.135979,-0.188356,-0.660387,0.693812,-0.147442,-0.062318,...,-0.125368,1.045523,-1.045523,0.888216,-0.888216,0.920016,-0.920016,0.334407,-0.108359,-0.312513
128,0.248317,-0.107383,0.338184,-0.103875,0.581164,-0.075734,-0.660387,0.693812,-0.147442,-0.062318,...,-0.125368,-0.95646,0.95646,-1.125853,1.125853,0.920016,-0.920016,0.334407,-0.108359,-0.312513
182,-0.920305,-0.354498,-0.59487,0.09,-0.509285,-0.187248,1.514263,-1.441312,-0.147442,-0.062318,...,-0.125368,-0.95646,0.95646,-1.125853,1.125853,0.920016,-0.920016,0.334407,-0.108359,-0.312513



Features de teste escalonadas (Credit Approval):


Unnamed: 0,A2,A3,A8,A11,A14,A15,A1_a,A1_b,A1_nan,A4_l,...,A7_nan,A9_f,A9_t,A10_f,A10_t,A12_f,A12_t,A13_g,A13_p,A13_s
235,-0.920305,-0.584809,-0.014169,0.47775,0.242749,0.273763,1.514263,-1.441312,-0.147442,-0.062318,...,-0.125368,-0.95646,0.95646,-1.125853,1.125853,0.920016,-0.920016,0.334407,-0.108359,-0.312513
650,1.40929,-0.206229,-0.342326,-0.491625,-0.509285,-0.187987,-0.660387,0.693812,-0.147442,-0.062318,...,-0.125368,1.045523,-1.045523,0.888216,-0.888216,0.920016,-0.920016,0.334407,-0.108359,-0.312513
657,0.630775,-0.618417,-0.569162,-0.491625,-1.135979,-0.116352,1.514263,-1.441312,-0.147442,-0.062318,...,-0.125368,1.045523,-1.045523,0.888216,-0.888216,0.920016,-0.920016,0.334407,-0.108359,-0.312513
149,1.812995,2.017807,1.018693,2.222623,-1.135979,0.217821,1.514263,-1.441312,-0.147442,-0.062318,...,-0.125368,-0.95646,0.95646,-1.125853,1.125853,0.920016,-0.920016,0.334407,-0.108359,-0.312513
666,-0.828515,1.375308,-0.569162,-0.491625,-0.007929,-0.188356,1.514263,-1.441312,-0.147442,-0.062318,...,-0.125368,1.045523,-1.045523,0.888216,-0.888216,-1.086938,1.086938,0.334407,-0.108359,-0.312513


# ================================
# 6. Balanceamento dos dados
# ================================

In [40]:
from imblearn.over_sampling import SMOTE
import pandas as pd

# Usar os dados de treino processados para balanceamento
X_train_credit_bal, y_train_credit_bal = SMOTE(random_state=42).fit_resample(X_train_processed_credit, y_train_credit)

print("Distribuição da variável alvo no treino ANTES do balanceamento:")
print(pd.Series(y_train_credit).value_counts())

print("\nDistribuição da variável alvo no treino APÓS SMOTE:")
print(pd.Series(y_train_credit_bal).value_counts())

Distribuição da variável alvo no treino ANTES do balanceamento:
Class
0    287
1    230
Name: count, dtype: int64

Distribuição da variável alvo no treino APÓS SMOTE:
Class
0    287
1    287
Name: count, dtype: int64


# ================================
# 7. Treinamento e avaliação de modelos
# ================================

In [41]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

# Treinar o modelo com os dados de treino balanceados
model_credit = RandomForestClassifier(random_state=42)
model_credit.fit(X_train_credit_bal, y_train_credit_bal)

# Fazer previsões nos dados de treino e teste
y_pred_train_credit = model_credit.predict(X_train_credit_bal)
y_pred_test_credit = model_credit.predict(X_test_processed_credit)

# Avaliar o modelo
print("📊 Avaliação no treino (Dataset Credit Approval):")
print(classification_report(y_train_credit_bal, y_pred_train_credit))

print("\n📊 Avaliação no teste (Dataset Credit Approval):")
print(classification_report(y_test_credit, y_pred_test_credit))

📊 Avaliação no treino (Dataset Credit Approval):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       287
           1       1.00      1.00      1.00       287

    accuracy                           1.00       574
   macro avg       1.00      1.00      1.00       574
weighted avg       1.00      1.00      1.00       574


📊 Avaliação no teste (Dataset Credit Approval):
              precision    recall  f1-score   support

           0       0.89      0.91      0.90        96
           1       0.88      0.86      0.87        77

    accuracy                           0.88       173
   macro avg       0.88      0.88      0.88       173
weighted avg       0.88      0.88      0.88       173



# ================================
# 8. Salvando modelo e transformadores
# ================================

In [42]:
import joblib

joblib.dump(model_credit, 'modelo_credit_approval.pkl')
joblib.dump(imp_mean_credit, 'imputer_mean_credit.pkl')
joblib.dump(imp_mode_credit, 'imputer_mode_credit.pkl')
joblib.dump(encoder_credit, 'encoder_credit.pkl')
joblib.dump(scaler_credit, 'scaler_credit.pkl')

print("Modelo e transformadores do Credit Approval salvos.")

Modelo e transformadores do Credit Approval salvos.


# ================================
# 9. Predição com novos dados
# ================================

In [52]:
# ================================
# 📌 9. Predição com novos dados
# ================================
import joblib
import pandas as pd
import numpy as np # Importar numpy para usar np.array_equal
from sklearn.preprocessing import StandardScaler # Importar StandardScaler aqui

# Carregar o modelo e os transformadores salvos para o Credit Approval
modelo_credit_carregado = joblib.load('modelo_credit_approval.pkl')
imputer_mean_credit_carregado = joblib.load('imputer_mean_credit.pkl')
imputer_mode_credit_carregado = joblib.load('imputer_mode_credit.pkl')
encoder_credit_carregado = joblib.load('encoder_credit.pkl')
# scaler_credit_carregado = joblib.load('scaler_credit.pkl') # Não vamos usar o scaler salvo devido ao erro

# Criar um exemplo de novo dado (usando os nomes das colunas originais)
# Certifique-se de que todas as colunas originais estejam presentes, mesmo que com valores faltantes se aplicável
novo_dado_credit = pd.DataFrame([{
    'A1': 'b', 'A2': 25.0, 'A3': 1.5, 'A4': 'u', 'A5': 'g', 'A6': 'c',
    'A7': 'v', 'A8': 0.5, 'A9': 't', 'A10': 't', 'A11': 1, 'A12': 'f',
    'A13': 'g', 'A14': 200.0, 'A15': 0
}])


# Separar colunas numéricas e categóricas no novo dado (baseado nas definições usadas no treino)
colunas_num_credit_pred = ['A2', 'A3', 'A8', 'A11', 'A14', 'A15']
colunas_cat_credit_pred = [col for col in novo_dado_credit.columns if col not in colunas_num_credit_pred]


# Aplicar imputação (usando os imputers carregados, transform apenas)
novo_dado_credit_num_imputed = imputer_mean_credit_carregado.transform(novo_dado_credit[colunas_num_credit_pred])
novo_dado_credit_cat_imputed = imputer_mode_credit_carregado.transform(novo_dado_credit[colunas_cat_credit_pred])

# Converter arrays numpy imputados de volta para dataframes
novo_dado_credit_num_imputed_df = pd.DataFrame(novo_dado_credit_num_imputed, columns=colunas_num_credit_pred, index=novo_dado_credit.index)
novo_dado_credit_cat_imputed_df = pd.DataFrame(novo_dado_credit_cat_imputed, columns=colunas_cat_credit_pred, index=novo_dado_credit.index)


# Aplicar codificação (usando o encoder carregado, transform apenas)
# O handle_unknown='ignore' garantirá que novas categorias sejam tratadas
novo_encoded_credit = encoder_credit_carregado.transform(novo_dado_credit_cat_imputed_df)
novo_encoded_credit_df = pd.DataFrame(novo_encoded_credit, columns=encoder_credit_carregado.get_feature_names_out(colunas_cat_credit_pred), index=novo_dado_credit.index)


# --- Início da modificação para construção robusta do dataframe final ---

# Obter a lista exata de colunas do dataframe de treino processado (que foi usado para treinar o modelo)
expected_columns_credit = X_train_processed_credit.columns

# Criar um dataframe para o novo dado com TODAS as colunas esperadas, preenchidas com 0 inicialmente
# Isso garante que todas as colunas que o modelo espera estejam presentes
novo_dado_credit_processed_final = pd.DataFrame(0, index=novo_dado_credit.index, columns=expected_columns_credit)

# Preencher as colunas numéricas imputadas
# Asseguramos que as colunas numéricas no novo dado correspondam às esperadas
for col in colunas_num_credit_pred:
    if col in novo_dado_credit_processed_final.columns:
        novo_dado_credit_processed_final[col] = novo_dado_credit_num_imputed_df[col]

# Preencher as colunas categóricas codificadas
# Asseguramos que as colunas categóricas codificadas no novo dado correspondam às esperadas
for col in novo_encoded_credit_df.columns:
    if col in novo_dado_credit_processed_final.columns:
        novo_dado_credit_processed_final[col] = novo_encoded_credit_df[col]

# --- Fim da modificação ---


# Aplicar escalonamento (usando um NOVO scaler ajustado aqui, apenas para demonstração)
# Ajustar um novo scaler APENAS nas colunas numéricas do dataframe de treino processado original
scaler_credit_pred = StandardScaler()
colunas_num_processed_credit = X_train_processed_credit.select_dtypes(include=np.number).columns.tolist() # Obter nomes das colunas numéricas do treino processado

# Ajustar o scaler em um array NumPy das colunas numéricas de treino
scaler_credit_pred.fit(X_train_processed_credit[colunas_num_processed_credit].values)

# --- Início da modificação para workaround do erro do scaler ---
# Criar um dataframe temporário com 46 features (zeros)
# Copiar dados numéricos para as colunas numéricas correspondentes
temp_df_for_scaling = pd.DataFrame(0, index=novo_dado_credit_processed_final.index, columns=expected_columns_credit)
temp_df_for_scaling[colunas_num_credit_pred] = novo_dado_credit_processed_final[colunas_num_credit_pred]

# Escalar o dataframe temporário de 46 features (o scaler só deve operar nas colunas em que foi ajustado)
scaled_numerical_data_temp = scaler_credit_pred.transform(temp_df_for_scaling)

# Converter o array NumPy escalonado de volta para DataFrame e selecionar apenas as colunas numéricas
scaled_numerical_data_df = pd.DataFrame(scaled_numerical_data_temp, columns=expected_columns_credit, index=novo_dado_credit_processed_final.index)[colunas_num_credit_pred]

# Atualizar as colunas numéricas escalonadas no dataframe final
novo_dado_credit_processed_final[colunas_num_credit_pred] = scaled_numerical_data_df
# --- Fim da modificação para workaround do erro do scaler ---


# --- Adicionar verificação de colunas antes da predição ---
print("\nVerificando colunas antes da predição:")
print("Colunas no dataframe de treino processado:", list(X_train_processed_credit.columns))
print("Colunas no dataframe para predição:", list(novo_dado_credit_processed_final.columns))

if np.array_equal(X_train_processed_credit.columns, novo_dado_credit_processed_final.columns):
    print("As colunas correspondem. Prosseguindo com a predição.")
    # Predizer (usando o dataframe final com todas as colunas esperadas, escalonadas e na ordem correta)
    predicao_credit = modelo_credit_carregado.predict(novo_dado_credit_processed_final)

    # Mapear a predição de volta para os rótulos originais ('+' ou '-')
    print("Predição para o novo dado (Credit Approval):", '+' if predicao_credit[0] == 1 else '-')
else:
    print("Erro: As colunas do dataframe de predição não correspondem às colunas do dataframe de treino.")
    # Para ajudar na depuração, podemos imprimir as colunas que não correspondem
    diff = set(X_train_processed_credit.columns) ^ set(novo_dado_credit_processed_final.columns)
    print("Diferença nas colunas:", diff)


Verificando colunas antes da predição:
Colunas no dataframe de treino processado: ['A2', 'A3', 'A8', 'A11', 'A14', 'A15', 'A1_a', 'A1_b', 'A4_l', 'A4_u', 'A4_y', 'A5_g', 'A5_gg', 'A5_p', 'A6_aa', 'A6_c', 'A6_cc', 'A6_d', 'A6_e', 'A6_ff', 'A6_i', 'A6_j', 'A6_k', 'A6_m', 'A6_q', 'A6_r', 'A6_w', 'A6_x', 'A7_bb', 'A7_dd', 'A7_ff', 'A7_h', 'A7_j', 'A7_n', 'A7_o', 'A7_v', 'A7_z', 'A9_f', 'A9_t', 'A10_f', 'A10_t', 'A12_f', 'A12_t', 'A13_g', 'A13_p', 'A13_s']
Colunas no dataframe para predição: ['A2', 'A3', 'A8', 'A11', 'A14', 'A15', 'A1_a', 'A1_b', 'A4_l', 'A4_u', 'A4_y', 'A5_g', 'A5_gg', 'A5_p', 'A6_aa', 'A6_c', 'A6_cc', 'A6_d', 'A6_e', 'A6_ff', 'A6_i', 'A6_j', 'A6_k', 'A6_m', 'A6_q', 'A6_r', 'A6_w', 'A6_x', 'A7_bb', 'A7_dd', 'A7_ff', 'A7_h', 'A7_j', 'A7_n', 'A7_o', 'A7_v', 'A7_z', 'A9_f', 'A9_t', 'A10_f', 'A10_t', 'A12_f', 'A12_t', 'A13_g', 'A13_p', 'A13_s']
As colunas correspondem. Prosseguindo com a predição.
Predição para o novo dado (Credit Approval): +


