## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# set extra display parameters
# use the fully-qualified key: the bare 'precision' pattern is ambiguous on
# newer pandas releases (it also matches the Styler precision option) and raises
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', 100)

## Carga dos dados

In [3]:
# read the training CSV, using the 'person' column as the row index
data = pd.read_csv(
    'titanic-train.csv',
    index_col='person',
)

# preview the first few records
data.head()

Unnamed: 0_level_0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home_destination
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
416,2,no,"Gaskell, Mr. Alfred",male,16.0,0,0,239865,26.0,,S,"Liverpool / Montreal, PQ"
194,1,no,"Maguire, Mr. John Edward",male,30.0,0,0,110469,26.0,C106,S,"Brockton, MA"
600,3,no,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S,
1112,3,no,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.775,,S,
878,3,no,"Ilmakangas, Miss. Pieta Sofia",female,25.0,1,0,STON/O2. 3101271,7.925,,S,


In [4]:
# how many rows and columns are there?
data.shape

(872, 12)

## Análise dos dados

In [5]:
# what are the columns and their respective data types?
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 872 entries, 416 to 1125
Data columns (total 12 columns):
pclass              872 non-null int64
survived            872 non-null object
name                872 non-null object
sex                 872 non-null object
age                 704 non-null float64
sibsp               872 non-null int64
parch               872 non-null int64
ticket              872 non-null object
fare                872 non-null float64
cabin               208 non-null object
embarked            870 non-null object
home_destination    494 non-null object
dtypes: float64(2), int64(3), object(7)
memory usage: 64.7+ KB


In [6]:
# are there columns with null values? count nulls per column and keep
# only the columns that have at least one
data.isnull().sum().loc[lambda null_counts: null_counts > 0]

age                 168
cabin               664
embarked              2
home_destination    378
dtype: int64

In [7]:
# statistical summary of the numeric features (one row per feature)
data.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pclass,872.0,2.281,0.8438,1.0,1.0,3.0,3.0,3.0
age,704.0,29.4871,14.3915,0.17,21.0,28.0,38.0,80.0
sibsp,872.0,0.4908,1.0131,0.0,0.0,0.0,1.0,8.0
parch,872.0,0.3773,0.8347,0.0,0.0,0.0,0.0,9.0
fare,872.0,31.9279,46.2903,0.0,7.8958,14.4542,30.7719,512.3292


In [8]:
# summary of the text (object-dtype) features, one row per feature
data.describe(include=['O']).transpose()

Unnamed: 0,count,unique,top,freq
survived,872,2,no,550
name,872,872,"Daly, Miss. Margaret Marcella ""Maggie""",1
sex,872,2,male,564
ticket,872,674,1601,7
cabin,208,154,G6,5
embarked,870,3,S,617
home_destination,494,289,"New York, NY",41


In [9]:
# what are the correlations between the numeric features?
# select the numeric columns explicitly: recent pandas no longer drops
# string columns silently in corr() and raises on them instead
data.select_dtypes(include='number').corr()

Unnamed: 0,pclass,age,sibsp,parch,fare
pclass,1.0,-0.4396,0.0695,0.0156,-0.5721
age,-0.4396,1.0,-0.2487,-0.1694,0.1838
sibsp,0.0695,-0.2487,1.0,0.3849,0.1865
parch,0.0156,-0.1694,0.3849,1.0,0.2559
fare,-0.5721,0.1838,0.1865,0.2559,1.0


In [10]:
# show variable pairs whose correlation magnitude exceeds 0.6 (positive or negative)
# restrict to numeric columns first: recent pandas raises on string columns in corr()
corr = data.select_dtypes(include='number').corr()
(
    corr
    # hide each variable's self-correlation by position, not by comparing floats to 1,
    # so a genuinely perfect correlation between two different variables still shows
    .mask(np.eye(len(corr), dtype=bool))
    .where(corr.abs() > 0.6)
    # drop variables left with no strong correlation at all
    .dropna(how='all', axis=1)
    .dropna(how='all', axis=0)
)

In [11]:
# mean of each numeric feature, grouped by the 'survived' value
# numeric_only=True keeps this working on recent pandas, where averaging
# the string columns raises instead of silently skipping them
data.groupby('survived').mean(numeric_only=True)

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,2.4709,30.0396,0.5109,0.3273,23.4128
yes,1.9565,28.6148,0.4565,0.4627,46.4723


In [12]:
# names of the numeric feature columns
# select by numeric dtype rather than "anything that is not object", so that
# non-numeric, non-object columns cannot end up in the list
numeric_feats = data.select_dtypes(include='number').columns
numeric_feats

Index(['pclass', 'age', 'sibsp', 'parch', 'fare'], dtype='object')

In [13]:
# first ten records, transposed so each record reads as a column
data.iloc[:10].transpose()

person,416,194,600,1112,878,912,1305,1061,1019,1231
pclass,2,1,3,3,3,3,3,3,3,3
survived,no,no,no,no,no,no,no,yes,no,no
name,"Gaskell, Mr. Alfred","Maguire, Mr. John Edward","Abbing, Mr. Anthony","Peacock, Miss. Treasteall","Ilmakangas, Miss. Pieta Sofia","Karaic, Mr. Milan","Zabour, Miss. Thamine","Nilsson, Miss. Helmina Josefina","Mineff, Mr. Ivan","Strom, Mrs. Wilhelm (Elna Matilda Persson)"
sex,male,male,male,female,female,male,female,female,male,female
age,16,30,42,3,25,30,,26,24,29
sibsp,0,0,0,1,1,0,1,0,0,1
parch,0,0,0,1,0,0,0,0,0,1
ticket,239865,110469,C.A. 5547,SOTON/O.Q. 3101315,STON/O2. 3101271,349246,2665,347470,349233,347054
fare,26,26,7.55,13.78,7.925,7.896,14.45,7.854,7.896,10.46
cabin,,C106,,,,,,,,G6


In [14]:
# number of missing values in every column (including columns with none)
data.isnull().sum()

pclass                0
survived              0
name                  0
sex                   0
age                 168
sibsp                 0
parch                 0
ticket                0
fare                  0
cabin               664
embarked              2
home_destination    378
dtype: int64

In [15]:
# summary of the 'sex' column: count, distinct values, most frequent value
data['sex'].describe()

count      872
unique       2
top       male
freq       564
Name: sex, dtype: object

In [16]:
# preview the first five records again
data.head(n=5)

Unnamed: 0_level_0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home_destination
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
416,2,no,"Gaskell, Mr. Alfred",male,16.0,0,0,239865,26.0,,S,"Liverpool / Montreal, PQ"
194,1,no,"Maguire, Mr. John Edward",male,30.0,0,0,110469,26.0,C106,S,"Brockton, MA"
600,3,no,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S,
1112,3,no,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.775,,S,
878,3,no,"Ilmakangas, Miss. Pieta Sofia",female,25.0,1,0,STON/O2. 3101271,7.925,,S,
