## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# Extra notebook settings.
# Use the fully qualified option key: the bare 'precision' pattern is ambiguous in
# recent pandas (it also matches 'styler.format.precision') and raises OptionError.
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', 100)

# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation warnings from pandas itself.
import warnings
warnings.filterwarnings("ignore")

## Carga dos dados

In [3]:
# Read the training CSV, using its 'person' column as the row index
train_csv = 'titanic-train.csv'
data = pd.read_csv(train_csv, index_col='person')

# Preview the first few records
data.head()

Unnamed: 0_level_0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home_destination
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
416,2,no,"Gaskell, Mr. Alfred",male,16.0,0,0,239865,26.0,,S,"Liverpool / Montreal, PQ"
194,1,no,"Maguire, Mr. John Edward",male,30.0,0,0,110469,26.0,C106,S,"Brockton, MA"
600,3,no,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S,
1112,3,no,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.775,,S,
878,3,no,"Ilmakangas, Miss. Pieta Sofia",female,25.0,1,0,STON/O2. 3101271,7.925,,S,


In [6]:
# Encode the textual survival flag numerically: 'yes' -> 1, 'no' -> 0.
# Values that are not keys of the lookup become NaN, so re-running this cell
# on the already-encoded column would turn every entry into NaN.
survived_codes = {'yes': 1, 'no': 0}
data['survived'] = data['survived'].map(survived_codes)

In [5]:
# Extract each person's title from the name: the first run of letters that is
# preceded by a space and followed by a period (e.g. " Mr." -> "Mr").
# The pattern is a raw string so that '\.' reaches the regex engine untouched
# instead of being parsed as an (invalid) Python string escape.
data['title'] = data['name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# Show how titles are distributed across sexes (sexes as rows, titles as columns)
pd.crosstab(data['title'], data['sex']).T

title,Col,Countess,Don,Dr,Jonkheer,Master,Miss,Mlle,Mr,Mrs,Ms,Rev,Sir
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
female,0,1,0,1,0,0,179,1,0,125,1,0,0
male,4,0,1,4,1,40,0,0,508,0,0,5,1


In [12]:
# Collapse uncommon titles into broader groups.
# Each key is the canonical title; its list holds the variants folded into it.
replacements = {
    'Miss': ['Mlle', 'Ms'],
    'Mrs': ['Mme'],
    'Rare': [
        'Lady', 'Countess', 'Capt', 'Col', 'Don',
        'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona',
    ],
}

# Invert the grouping into a flat variant -> canonical lookup and apply it in one pass.
# No canonical title appears among the variants, so this matches replacing group by group.
title_aliases = {
    variant: canonical
    for canonical, variants in replacements.items()
    for variant in variants
}
data['title'] = data['title'].replace(title_aliases)

# Show the title-by-sex contingency table again (sexes as rows, titles as columns)
pd.crosstab(data['title'], data['sex']).T

title,Master,Miss,Mr,Mrs,Rare
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female,0,181,0,125,2
male,40,0,508,0,16


In [14]:
# Turn the grouped titles into integer codes; anything not listed in the
# lookup (including missing titles) comes out of map() as NaN and is set to 0.
# Re-running this cell on already-encoded values would therefore yield 0 everywhere.
title_mapping = {
    'Mr': 1,
    'Miss': 2,
    'Mrs': 3,
    'Master': 4,
    'Rare': 5,
}

data['title'] = data['title'].map(title_mapping).fillna(0)

In [16]:
# Encode sex numerically: 'female' -> 1, 'male' -> 0.
# The int cast fails if any value is not one of the two keys, since map() leaves NaN there.
sex_codes = {'female': 1, 'male': 0}
data['sex'] = data['sex'].map(sex_codes).astype(int)

In [17]:
# Preview the first five rows after the encoding steps above
data.head(n=5)

Unnamed: 0_level_0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home_destination,title
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
416,2,0,"Gaskell, Mr. Alfred",0,16.0,0,0,239865,26.0,,S,"Liverpool / Montreal, PQ",1
194,1,0,"Maguire, Mr. John Edward",0,30.0,0,0,110469,26.0,C106,S,"Brockton, MA",1
600,3,0,"Abbing, Mr. Anthony",0,42.0,0,0,C.A. 5547,7.55,,S,,1
1112,3,0,"Peacock, Miss. Treasteall",1,3.0,1,1,SOTON/O.Q. 3101315,13.775,,S,,2
878,3,0,"Ilmakangas, Miss. Pieta Sofia",1,25.0,1,0,STON/O2. 3101271,7.925,,S,,2
