# Processamento/tratamento dos dados

In [13]:
import pandas as pd
import numpy as np

In [2]:
# Location of the training CSV.
# NOTE(review): this is an absolute, machine-specific path; adjust TRAIN_CSV_PATH
# when running the notebook on another machine.
TRAIN_CSV_PATH = r'C:\Users\João Pedro\Documents\UFG\AMS\AS2\data\train.csv'

# Load the raw training data and preview the first rows.
df = pd.read_csv(TRAIN_CSV_PATH)
df.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True


In [3]:
# Work on a copy so the frame loaded from the CSV (`df`) is not modified by later steps.
df_copy = df.copy()

## Tratamento dos nulos da coluna *Name* e criação de novas features

In [4]:
def processar_nomes(df):
    """Derive group/family features from ``PassengerId`` and ``Name``.

    Steps:
      1. ``Group`` is the part of ``PassengerId`` before the underscore.
      2. ``Surname`` is the last whitespace-separated token of ``Name``.
      3. A missing surname is inferred from the most frequent surname among
         passengers of the same ``Group``.
      4. ``FamilySize`` is the number of rows sharing a (real) surname and
         ``GroupSize`` the number of rows sharing a ``Group``.
      5. Surnames that are still missing become the placeholder ``'Unknown'``
         and the ``Name`` column is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``PassengerId`` and ``Name``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` without ``Name`` and with the extra columns
        ``Group``, ``Surname``, ``FamilySize`` and ``GroupSize``.
        The input frame is not modified.
    """
    df_proc = df.copy()

    # 1. Group id and raw surname.
    df_proc['Group'] = df_proc['PassengerId'].str.split('_', expand=True)[0]
    df_proc['Surname'] = df_proc['Name'].str.split().str[-1]

    # 2. Infer missing surnames from the most common surname in the same group.
    #    Only rows with a known surname feed the lookup, so every group present
    #    in it has a non-empty mode; groups with no known surname are simply
    #    absent and map to NaN.
    com_sobrenome = df_proc.dropna(subset=['Surname'])
    if not com_sobrenome.empty:
        sobrenome_por_grupo = (
            com_sobrenome
            .groupby('Group')['Surname']
            .agg(lambda nomes: nomes.mode().iat[0])
        )
        df_proc['Surname'] = df_proc['Surname'].fillna(
            df_proc['Group'].map(sobrenome_por_grupo)
        )

    # 3. Family size is counted BEFORE the placeholder is written: otherwise
    #    every passenger with an unknown surname would be counted as a member
    #    of one big "Unknown" family. Unknown surnames fall back to 1 (the
    #    passenger alone).
    contagem_sobrenomes = df_proc['Surname'].value_counts()  # NaN is not counted
    df_proc['FamilySize'] = (
        df_proc['Surname'].map(contagem_sobrenomes).fillna(1).astype('int64')
    )

    # 4. Whoever is still missing a surname gets the placeholder.
    df_proc['Surname'] = df_proc['Surname'].fillna('Unknown')

    # 5. Group size (PassengerId-based).
    df_proc['GroupSize'] = df_proc['Group'].map(df_proc['Group'].value_counts())

    # 6. Name is no longer needed; Surname is kept because later imputation
    #    steps can use it as a family key.
    df_proc = df_proc.drop(columns=['Name'])

    return df_proc

# Apply the name processing.
# The untouched frame `df` is used as input (instead of `df_copy` itself) so this
# cell can be re-run safely: processar_nomes drops the `Name` column, and feeding
# its own output back in would raise KeyError: 'Name' on a second execution.
df_copy = processar_nomes(df)

In [12]:
# Preview the first rows of the working frame, including the derived columns
# (Group, Surname, FamilySize, GroupSize).
df_copy.head()

Unnamed: 0,PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Transported,Group,Surname,FamilySize,GroupSize
0,0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,False,1,Ofracculy,1,1
1,0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,True,2,Vines,4,1
2,0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,False,3,Susent,6,2
3,0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,False,3,Susent,6,2
4,0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,True,4,Santantines,6,1


In [15]:
# Count missing values per column.
# NOTE(review): this cell's execution count (15) is higher than that of the
# null-treatment cell further down (14), so the all-zero output shown here was
# probably produced after cleaning; re-run the notebook top to bottom to confirm
# the counts before cleaning.
df_copy.isna().sum()

PassengerId     0
HomePlanet      0
CryoSleep       0
Cabin           0
Destination     0
Age             0
VIP             0
RoomService     0
FoodCourt       0
ShoppingMall    0
Spa             0
VRDeck          0
Transported     0
Group           0
Surname         0
FamilySize      0
GroupSize       0
dtype: int64

## Tratamento dos nulos das demais colunas

In [14]:
def _preencher_com_moda_por_chave(valores, chaves):
    """Fill missing entries of ``valores`` with the mode of the rows sharing the same key.

    Rows whose key is missing are never filled and never contribute to a mode.
    Ties are resolved by taking the first value returned by ``Series.mode``.
    """
    conhecidos = valores.notna() & chaves.notna()
    if not conhecidos.any():
        return valores

    # Only known values feed the lookup, so every key in it has a non-empty mode.
    moda_por_chave = (
        valores[conhecidos]
        .groupby(chaves[conhecidos])
        .agg(lambda grupo: grupo.mode().iat[0])
    )
    return valores.fillna(chaves.map(moda_por_chave))


def tratar_nulos(df, placeholder_sobrenome='Unknown'):
    """Impute the missing values of the passenger table.

    Order of operations:
      1. CryoSleep/spending consistency: passengers with ``CryoSleep == True``
         get all spending columns set to 0.0; passengers with missing
         ``CryoSleep`` and positive total spending get ``CryoSleep = False``.
      2. ``HomePlanet``, ``Destination``, ``VIP`` and ``CryoSleep`` are filled
         with the mode of the passenger's ``Group``, then with the mode of the
         passenger's ``Surname`` (if that column is available), and finally
         with the column's global mode.
      3. The CryoSleep/spending rule is applied again, now that ``CryoSleep``
         has been imputed.
      4. ``Age`` and the spending columns are filled with the column median.
      5. Missing ``Cabin`` values become the placeholder ``'Z/9999/Z'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``PassengerId`` (or ``Group``), ``CryoSleep``,
        ``HomePlanet``, ``Destination``, ``VIP``, ``Age``, ``Cabin`` and the
        five spending columns. ``Surname`` (or ``Name``) is optional.
    placeholder_sobrenome : str, default 'Unknown'
        Surname value that marks "surname not known". Rows carrying it are
        not treated as one family during the surname-based imputation.

    Returns
    -------
    pandas.DataFrame
        A cleaned copy of ``df``; the input frame is not modified.
    """
    df_clean = df.copy()

    # Derive the grouping keys when they were not prepared beforehand.
    if 'Group' not in df_clean.columns:
        df_clean['Group'] = df_clean['PassengerId'].str.split('_', expand=True)[0]
    if 'Surname' not in df_clean.columns and 'Name' in df_clean.columns:
        df_clean['Surname'] = df_clean['Name'].str.split().str[-1]

    gastos_cols = ['RoomService', 'FoodCourt', 'ShoppingMall', 'Spa', 'VRDeck']
    cat_cols = ['HomePlanet', 'Destination', 'VIP', 'CryoSleep']

    # --- 1. CRYOSLEEP x SPENDING ---
    # Rule A: CryoSleep == True -> all spending is 0.
    # (`== True` on purpose: the column may hold True/False/NaN as objects.)
    df_clean.loc[df_clean['CryoSleep'] == True, gastos_cols] = 0.0  # noqa: E712

    # Rule B: positive spending -> the passenger cannot be in CryoSleep.
    total_gastos = df_clean[gastos_cols].sum(axis=1)
    df_clean.loc[df_clean['CryoSleep'].isna() & (total_gastos > 0), 'CryoSleep'] = False

    # --- 2. CATEGORICAL IMPUTATION (group -> family -> global) ---
    # The surname placeholder is masked out so that unrelated passengers with
    # an unknown surname are not imputed from each other as if they were kin.
    chave_familia = None
    if 'Surname' in df_clean.columns:
        chave_familia = df_clean['Surname'].where(
            df_clean['Surname'] != placeholder_sobrenome
        )

    for col in cat_cols:
        # Strategy 1: mode within the travelling group (PassengerId prefix).
        df_clean[col] = _preencher_com_moda_por_chave(df_clean[col], df_clean['Group'])

        # Strategy 2: mode within the family (same real surname).
        if chave_familia is not None:
            df_clean[col] = _preencher_com_moda_por_chave(df_clean[col], chave_familia)

        # Strategy 3: global mode; skipped when the column has no known value
        # at all (mode() is empty then and indexing it would raise).
        moda_global = df_clean[col].mode()
        if not moda_global.empty:
            df_clean[col] = df_clean[col].fillna(moda_global.iat[0])

    # --- 3. RE-APPLY RULE A ---
    # CryoSleep may have just been imputed as True. Such rows had no positive
    # spending (Rule B would have set them to False), so this only turns their
    # missing spending values into 0 instead of letting them receive the median.
    df_clean.loc[df_clean['CryoSleep'] == True, gastos_cols] = 0.0  # noqa: E712

    # --- 4. NUMERIC COLUMNS -> MEDIAN ---
    for col in ['Age'] + gastos_cols:
        df_clean[col] = df_clean[col].fillna(df_clean[col].median())

    # --- 5. CABIN -> GENERIC PLACEHOLDER ---
    df_clean['Cabin'] = df_clean['Cabin'].fillna('Z/9999/Z')

    return df_clean

# Run the null treatment on the training frame.
df_copy = tratar_nulos(df_copy)
# TODO: apply the same treatment to the test set once it is loaded
# (`df_test` is not defined in the cells shown here).

# Report how many missing values are left across the whole frame.
nulos_restantes = df_copy.isnull().sum().sum()
print("Nulos restantes no treino após tratamento avançado:")
print(nulos_restantes)

Nulos restantes no treino após tratamento avançado:
0
