# Processamento/tratamento dos dados

In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

In [27]:
df = pd.read_csv(r'C:\Users\João Pedro\Documents\UFG\AMS\AS2\data\train.csv')
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [28]:
df_copy = df.copy()

## Tratando os nulos da coluna *Name* e criação de novas features

In [29]:
def processar_nomes(df):
    df_proc = df.copy()
    
    # 1. Extrair Grupo e Sobrenome inicial
    df_proc['Group'] = df_proc['PassengerId'].str.split('_', expand=True)[0]
    df_proc['Surname'] = df_proc['Name'].str.split().str[-1]
    
    # 2. Tentar preencher Sobrenome faltante usando o Grupo
    # Cria um dicionário: {Grupo: Sobrenome_Mais_Comum}
    group_surname_map = df_proc.groupby('Group')['Surname'].apply(lambda x: x.mode()[0] if not x.mode().empty else None)
    
    # Preenche onde Surname é nulo usando o mapa
    df_proc['Surname'] = df_proc['Surname'].fillna(df_proc['Group'].map(group_surname_map))
    
    # 3. Quem ainda está nulo vira "Unknown"
    df_proc['Surname'] = df_proc['Surname'].fillna('Unknown')
    
    # 4. Criar Features de Tamanho
    # Tamanho da Família (baseado em Sobrenome)
    df_proc['FamilySize'] = df_proc['Surname'].map(df_proc['Surname'].value_counts())
    
    # Tamanho do Grupo (baseado no ID - esse não tem nulos!)
    df_proc['GroupSize'] = df_proc['Group'].map(df_proc['Group'].value_counts())
    
    # 5. Limpeza Final: Remover Name (já extraímos o útil)
    # Nota: Mantemos Surname por enquanto pois ele ajuda a preencher HomePlanet/CryoSleep
    df_proc = df_proc.drop(columns=['Name'])
    
    return df_proc

# Aplicar
df_copy = processar_nomes(df_copy)

In [30]:
df_copy.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group,Surname,FamilySize,GroupSize
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,Ofracculy,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,Vines,4,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,Susent,6,2
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,Susent,6,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,Santantines,6,1


In [33]:
df_copy.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Group           0
Surname         0
FamilySize      0
GroupSize       0
dtype: int64

## Tratamento do resto das colunas nulas

In [32]:
def tratar_nulos(df):
    df_clean = df.copy()
    
    if 'Group' not in df_clean.columns:
        df_clean['Group'] = df_clean['PassengerId'].str.split('_', expand=True)[0]
    if 'Surname' not in df_clean.columns and 'Name' in df_clean.columns:
        df_clean['Surname'] = df_clean['Name'].str.split().str[-1]

    # --- 1. RELAÇÃO CRYOSLEEP x GASTOS ---
    gastos_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    
    # Regra A: Se CryoSleep=True, Gastos devem ser 0
    df_clean.loc[df_clean['CryoSleep'] == True, gastos_cols] = 0.0
    
    # Regra B: Se Gastos > 0, CryoSleep deve ser False
    total_gastos = df_clean[gastos_cols].sum(axis=1)
    df_clean.loc[(df_clean['CryoSleep'].isna()) & (total_gastos > 0), 'CryoSleep'] = False

    # --- 2. IMPUTAÇÃO INTELIGENTE (Grupo e Família) ---
    # Colunas alvo para preencher usando Grupo/Sobrenome
    target_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep'] 
    
    for col in target_cols:
        # Estratégia 1: Tentar preencher com a moda do GRUPO (PassengerId)
        # Ex: Se alguém do meu grupo é de 'Europa', eu provavelmente também sou.
        df_clean[col] = df_clean[col].fillna(
            df_clean.groupby('Group')[col].transform(lambda x: x.mode()[0] if not x.mode().empty else np.nan)
        )
        
        # Estratégia 2: Tentar preencher com a moda do SOBRENOME (Família)
        # Ex: Se meu grupo não ajudou, vejo minha família.
        if 'Surname' in df_clean.columns:
            df_clean[col] = df_clean[col].fillna(
                df_clean.groupby('Surname')[col].transform(lambda x: x.mode()[0] if not x.mode().empty else np.nan)
            )

    # --- 3. PREENCHIMENTO FINAL (Valores Globais) ---
    
    # Numéricas -> Mediana (incluindo gastos que sobraram como NA e Idade)
    num_cols = ['Age'] + gastos_cols
    for col in num_cols:
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())
    
    # Categóricas Restantes -> Moda Global
    cat_cols = ['HomePlanet', 'CryoSleep', 'Destination', 'VIP']
    for col in cat_cols:
        df_clean[col] = df_clean[col].fillna(df_clean[col].mode()[0])

    # Cabin -> Valor Genérico (Placeholder)
    df_clean['Cabin'] = df_clean['Cabin'].fillna('Z/9999/Z')
    
    return df_clean

# Aplicação
df_copy = tratar_nulos(df_copy)
# df_test_clean = tratar_nulos(df_test)

print("Nulos restantes no treino após tratamento avançado:")
print(df_copy.isnull().sum().sum())

Nulos restantes no treino após tratamento avançado:
0


In [34]:
df_copy.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group,Surname,FamilySize,GroupSize
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,Ofracculy,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,Vines,4,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,Susent,6,2
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,Susent,6,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,Santantines,6,1


In [35]:
def clean_bool(x):
    # Converte tudo para string primeiro, depois padroniza
    x = str(x).lower()
    if x in ['true', '1.0', '1']:
        return True
    elif x in ['false', '0.0', '0']:
        return False
    return False # Valor padrão caso sobre algo estranho

# Aplicar correção
df_copy['CryoSleep'] = df_copy['CryoSleep'].apply(clean_bool)
df_copy['VIP'] = df_copy['VIP'].apply(clean_bool)

# Verificar novamente
print(f"CryoSleep únicos: {df_copy['CryoSleep'].unique()}")
print(f"VIP únicos: {df_copy['VIP'].unique()}")

CryoSleep únicos: [False  True]
VIP únicos: [False  True]


In [25]:
df_copy.to_csv(r'C:\Users\João Pedro\Documents\UFG\AMS\AS2\data\test_cleaned.csv', index=False)

# Engenharia de features

In [42]:
df_cleaned = pd.read_csv(r'C:\Users\João Pedro\Documents\UFG\AMS\AS2\data\test_cleaned.csv')
df_cleaned.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Group,Surname,FamilySize,GroupSize
0,0013_01,Earth,True,G/3/S,TRAPPIST-1e,27.0,False,0.0,0.0,0.0,0.0,0.0,13,Carsoning,4,1
1,0018_01,Earth,False,F/4/S,TRAPPIST-1e,19.0,False,0.0,9.0,0.0,2823.0,0.0,18,Peckers,1,1
2,0019_01,Europa,True,C/0/S,55 Cancri e,31.0,False,0.0,0.0,0.0,0.0,0.0,19,Unhearfus,1,1
3,0021_01,Europa,False,C/1/S,TRAPPIST-1e,38.0,False,0.0,6652.0,0.0,181.0,585.0,21,Caltilter,1,1
4,0023_01,Earth,False,F/5/S,TRAPPIST-1e,20.0,False,10.0,0.0,635.0,0.0,0.0,23,Harperez,3,1


In [40]:
def feature_engineering(df):
    df_feat = df.copy()
    
    # 1. Cabine
    if 'Cabin' in df_feat.columns:
        df_feat[['Deck', 'Num', 'Side']] = df_feat['Cabin'].str.split('/', expand=True)
        df_feat['Num'] = pd.to_numeric(df_feat['Num'], errors='coerce').fillna(0) # Correção segura
    
    # 2. Idade
    if 'Age' in df_feat.columns:
        df_feat['AgeGroup'] = pd.cut(df_feat['Age'], 
                                     bins=[-1, 12, 17, 60, 200], 
                                     labels=['Child', 'Teen', 'Adult', 'Senior'])

    # 3. Gastos
    cols_gastos = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    # Garante que as colunas existam (no teste as vezes pode falhar se não conferir)
    existing_gastos = [c for c in cols_gastos if c in df_feat.columns]
    
    if existing_gastos:
        df_feat['TotalSpend'] = df_feat[existing_gastos].sum(axis=1)
        df_feat['NoSpending'] = (df_feat['TotalSpend'] == 0).astype(int)
    
    # 4. Grupo
    if 'GroupSize' in df_feat.columns:
        df_feat['IsAlone'] = (df_feat['GroupSize'] == 1).astype(int)

    # Nota: NÃO dropamos colunas aqui ainda, deixamos para o preprocessador decidir
    # para evitar erros de "coluna não encontrada".
    
    return df_feat

# Aplicar a engenharia
df_cleaned = feature_engineering(df_cleaned)

# Visualizar as novas colunas
print(df_cleaned[['Deck', 'Side', 'Num', 'AgeGroup', 'TotalSpend', 'IsAlone']].head())

  Deck Side  Num AgeGroup  TotalSpend  IsAlone
0    B    P    0    Adult         0.0        1
1    F    S    0    Adult       736.0        1
2    A    S    0    Adult     10383.0        0
3    A    S    0    Adult      5176.0        0
4    F    S    1     Teen      1091.0        1


In [38]:
df_cleaned.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,...,Surname,FamilySize,GroupSize,Deck,Num,Side,AgeGroup,TotalSpend,NoSpending,IsAlone
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,...,Ofracculy,1,1,B,0,P,Adult,0.0,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,...,Vines,4,1,F,0,S,Adult,736.0,0,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,...,Susent,6,2,A,0,S,Adult,10383.0,0,0
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,...,Susent,6,2,A,0,S,Adult,5176.0,0,0
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,...,Santantines,6,1,F,1,S,Teen,1091.0,0,1


In [5]:
df_cleaned.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8693 entries, 0 to 8692
Data columns (total 20 columns):
 #   Column        Non-Null Count  Dtype   
---  ------        --------------  -----   
 0   HomePlanet    8693 non-null   object  
 1   CryoSleep     8693 non-null   bool    
 2   Destination   8693 non-null   object  
 3   Age           8693 non-null   float64 
 4   VIP           8693 non-null   bool    
 5   RoomService   8693 non-null   float64 
 6   FoodCourt     8693 non-null   float64 
 7   ShoppingMall  8693 non-null   float64 
 8   Spa           8693 non-null   float64 
 9   VRDeck        8693 non-null   float64 
 10  Transported   8693 non-null   bool    
 11  FamilySize    8693 non-null   int64   
 12  GroupSize     8693 non-null   int64   
 13  Deck          8693 non-null   object  
 14  Num           8693 non-null   int64   
 15  Side          8693 non-null   object  
 16  AgeGroup      8693 non-null   category
 17  TotalSpend    8693 non-null   float64 
 18  NoSpendi

In [41]:
def preparar_dados_treino(df_train_cleaned):
    # 1. Aplicar Engenharia
    df_proc = feature_engineering(df_train_cleaned)

    # 2. Log Transform (Crucial replicar isso no teste depois)
    cols_to_log = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend', 'Num']
    for col in cols_to_log:
        if col in df_proc.columns:
            df_proc[col] = np.log1p(df_proc[col])

    # 3. Split X e y
    cols_drop = ['PassengerId', 'Name', 'Cabin', 'Group', 'Surname', 'Transported']
    # Mantemos apenas features que existem no dataframe
    features = [c for c in df_proc.columns if c not in cols_drop]
    
    X = df_proc[features]
    y = df_proc['Transported'].astype(int)
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # 4. Preprocessor (Fit no Treino)
    num_cols = [c for c in X_train.columns if X_train[c].dtype in ['int64', 'float64', 'int32', 'float32']]
    cat_cols = [c for c in X_train.columns if X_train[c].dtype == 'object' or X_train[c].dtype.name == 'category']

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), num_cols),
            ('cat', OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False), cat_cols)
        ],
        verbose_feature_names_out=False
    )

    # FIT + TRANSFORM (Aprende com o treino)
    X_train_final = preprocessor.fit_transform(X_train)
    X_val_final = preprocessor.transform(X_val) # Apenas Transform na validação
    
    feature_names = preprocessor.get_feature_names_out()
    
    # RETORNAMOS O PREPROCESSOR AGORA
    return X_train_final, y_train, X_val_final, y_val, feature_names, preprocessor

# --- EXECUÇÃO NO TREINO ---
# df_clean = pd.read_csv('train_cleaned.csv') # Seu dataframe de treino tratado
X_train, y_train, X_val, y_val, col_names, preprocessor_treinado = preparar_dados_treino(df_cleaned)
print("Treino processado e Preprocessor treinado com sucesso!")

Treino processado e Preprocessor treinado com sucesso!


In [14]:
import pandas as pd
import numpy as np

# --- SALVANDO EM 4 ARQUIVOS DISTINTOS ---

# 1. Salvar X_train (Features de Treino)
# Convertendo o array numpy de volta para DataFrame para ter cabeçalho
df_X_train = pd.DataFrame(X_train, columns=col_names)
df_X_train.to_csv('X_train.csv', index=False)

# 2. Salvar y_train (Alvo de Treino)
# O y_train original é uma Series, convertemos para DataFrame
df_y_train = pd.DataFrame(y_train).reset_index(drop=True)
df_y_train.columns = ['Transported'] # Garante o nome da coluna
df_y_train.to_csv('y_train.csv', index=False)

# 3. Salvar X_val (Features de Validação)
df_X_val = pd.DataFrame(X_val, columns=col_names)
df_X_val.to_csv('X_val.csv', index=False)

# 4. Salvar y_val (Alvo de Validação)
df_y_val = pd.DataFrame(y_val).reset_index(drop=True)
df_y_val.columns = ['Transported']
df_y_val.to_csv('y_val.csv', index=False)

print("Arquivos gerados com sucesso:")
print(f"- X_train_final.csv: {df_X_train.shape}")
print(f"- y_train_final.csv: {df_y_train.shape}")
print(f"- X_val.csv:         {df_X_val.shape}")
print(f"- y_val.csv:         {df_y_val.shape}")

Arquivos gerados com sucesso:
- X_train_final.csv: (6954, 26)
- y_train_final.csv: (6954, 1)
- X_val.csv:         (1739, 26)
- y_val.csv:         (1739, 1)


# Teste

In [None]:
def processar_teste_para_kaggle(df_test_cleaned, preprocessor_treinado):
    # 1. Salvar o PassengerId (Obrigatório para o Kaggle)
    submission_ids = df_test_cleaned['PassengerId'].copy()
    
    # 2. Aplicar a MESMA engenharia de features
    df_test_proc = feature_engineering(df_test_cleaned)
    
    # 3. Aplicar a MESMA transformação de Log (Manual)
    # O preprocessor do sklearn não faz o log, nós fizemos na mão, então repetimos aqui
    cols_to_log = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck', 'TotalSpend', 'Num']
    for col in cols_to_log:
        if col in df_test_proc.columns:
            df_test_proc[col] = np.log1p(df_test_proc[col])
            
    # 4. Selecionar as colunas (Garantir que são as mesmas do treino)
    # O preprocessor vai reclamar se tiver colunas extras ou faltando, 
    # mas o ColumnTransformer com 'ignore' ajuda.
    # Vamos garantir removendo as colunas que dropamos no treino.
    cols_drop = ['PassengerId', 'Name', 'Cabin', 'Group', 'Surname', 'Transported'] # Transported não existe no teste, mas ok
    features_test = [c for c in df_test_proc.columns if c not in cols_drop]
    
    X_test = df_test_proc[features_test]
    
    # 5. TRANSFORM (NUNCA FIT) NO TESTE
    # Usamos o cérebro treinado no passo anterior
    X_test_final = preprocessor_treinado.transform(X_test)
    
    return X_test_final, submission_ids

# --- EXECUÇÃO NO TESTE ---
# Carregue seu teste limpo (com nulos tratados e Surname/Group criados)
# df_test_clean = pd.read_csv('test.csv') -> trate os nulos antes!
# Exemplo assumindo que você já tratou nulos do teste em 'df_test_cleaned':

X_test_kaggle, ids_kaggle = processar_teste_para_kaggle(df_cleaned, preprocessor_treinado)

print(f"Dados de teste prontos: {X_test_kaggle.shape}")

# --- PREVISÃO E SALVAMENTO ---
# model = ... (seu modelo treinado carregado)
# previsoes = model.predict(X_test_kaggle)
# previsoes_bool = (previsoes >= 0.5).astype(bool).flatten()

# submission = pd.DataFrame({'PassengerId': ids_kaggle, 'Transported': previsoes_bool})
# submission.to_csv('submission.csv', index=False)