## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# set extra display options
# Use the fully qualified key 'display.precision': the abbreviated 'precision'
# is matched as a pattern and becomes ambiguous on newer pandas releases
# (it also matches 'styler.format.precision'), which raises an OptionError.
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', 100)

## Carga dos dados

In [3]:
# load the training data file, using the 'person' column as the row index
data = pd.read_csv('titanic-train.csv', index_col='person')

# show a few example records
data.head()

Unnamed: 0_level_0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,boat,body,home_destination
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
810,3,no,"Ford, Mr. William Neal",male,16.0,1,3,W./C. 6608,34.375,,S,,,"Rotherfield, Sussex, England Essex Co, MA"
350,2,yes,"Brown, Miss. Edith Eileen",female,15.0,0,2,29750,39.0,,S,14,,"Cape Town, South Africa / Seattle, WA"
1286,3,yes,"Whabee, Mrs. George Joseph (Shawneene Abi-Saab)",female,38.0,0,0,2688,7.2292,,C,C,,
860,3,yes,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,,,
91,1,yes,"Dick, Mr. Albert Adrian",male,31.0,1,0,17474,57.0,B20,S,3,,"Calgary, AB"


In [4]:
# how many rows and columns are there?
data.shape

(872, 14)

## Análise dos dados

In [5]:
# what are the columns and their respective data types?
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 872 entries, 810 to 1211
Data columns (total 14 columns):
pclass              872 non-null int64
survived            872 non-null object
name                872 non-null object
sex                 872 non-null object
age                 697 non-null float64
sibsp               872 non-null int64
parch               872 non-null int64
ticket              872 non-null object
fare                871 non-null float64
cabin               184 non-null object
embarked            871 non-null object
boat                325 non-null object
body                73 non-null float64
home_destination    506 non-null object
dtypes: float64(3), int64(3), object(8)
memory usage: 102.2+ KB


In [6]:
# which columns contain missing values, and how many in each?
# count nulls per column once, then keep only the columns with at least one
data.isnull().sum().loc[lambda counts: counts > 0]

age                 175
fare                  1
cabin               688
embarked              1
boat                547
body                799
home_destination    366
dtype: int64

In [7]:
# statistical summary of the numeric features (transposed: one row per feature)
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pclass,872.0,2.297,0.8328,1.0,2.0,3.0,3.0,3.0
age,697.0,30.1423,14.2318,0.17,21.0,29.0,39.0,76.0
sibsp,872.0,0.4954,1.0086,0.0,0.0,0.0,1.0,8.0
parch,872.0,0.4174,0.9329,0.0,0.0,0.0,0.0,9.0
fare,871.0,32.7281,51.9179,0.0,7.925,14.4542,30.0708,512.3292
body,73.0,164.0548,98.7107,7.0,70.0,165.0,259.0,327.0


In [12]:
# summary of the text (object-dtype) features (transposed: one row per feature)
data.describe(include=['O']).T

Unnamed: 0,count,unique,top,freq
survived,872,2,no,536
name,872,870,"Kelly, Mr. James",2
sex,872,2,male,554
ticket,872,666,3101295,7
cabin,184,139,C23 C25 C27,5
embarked,871,3,S,612
boat,325,25,15,27
home_destination,506,284,"New York, NY",46


In [8]:
# what are the pairwise correlations between the numeric features?
# Select the numeric columns explicitly: from pandas 2.0 onward `DataFrame.corr()`
# no longer drops non-numeric columns silently and raises on the text columns.
data.select_dtypes(include='number').corr()

Unnamed: 0,pclass,age,sibsp,parch,fare,body
pclass,1.0,-0.4289,0.0584,0.0442,-0.5453,0.061
age,-0.4289,1.0,-0.2759,-0.124,0.1835,0.0792
sibsp,0.0584,-0.2759,1.0,0.351,0.1651,-0.1547
parch,0.0442,-0.124,0.351,1.0,0.2083,0.0865
fare,-0.5453,0.1835,0.1651,0.2083,1.0,-0.1567
body,0.061,0.0792,-0.1547,0.0865,-0.1567,1.0


In [9]:
# show feature pairs whose correlation magnitude exceeds 0.6 (positive or negative)
# Numeric columns are selected explicitly so this also runs on pandas >= 2.0,
# where `DataFrame.corr()` raises on text columns instead of dropping them.
corr = data.select_dtypes(include='number').corr()

# Mask the diagonal by position rather than by comparing values to 1, so a
# perfect correlation between two different features is still reported.
off_diagonal = ~np.eye(len(corr), dtype=bool)
strong = corr.where((corr.abs() > 0.6) & off_diagonal)

# drop the rows and columns that have no strong correlation at all
strong.dropna(how='all', axis=1).dropna(how='all', axis=0)

In [10]:
# mean of each numeric feature, split by the value of `survived`
# numeric_only=True keeps this working on pandas >= 2.0, where text columns
# are no longer dropped automatically and make the aggregation raise.
data.groupby('survived').mean(numeric_only=True)

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare,body
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
no,2.5093,30.1689,0.5168,0.3787,22.556,164.0548
yes,1.9583,30.1049,0.4613,0.4792,48.9247,


In [11]:
# names of every column whose dtype is not `object`
# NOTE(review): the saved output of this cell lists columns such as 'length'
# and 'rings', not the columns of the CSV loaded earlier in the notebook —
# confirm which dataset `data` held when this cell was run.
numeric_feats = data.columns[data.dtypes != "object"]
numeric_feats

Index(['length', 'diameter', 'height', 'whole_weight', 'shucked_weight',
       'viscera_weight', 'shell_weight', 'rings'],
      dtype='object')

In [12]:
# first ten records, transposed so that each record is shown as a column
data.head(10).T

id,2758,1384,1131,3726,3445,817,2742,3757,2609,2728
sex,M,F,M,I,I,I,I,I,F,I
length,0.535,0.63,0.565,0.5,0.495,0.35,0.46,0.52,0.63,0.405
diameter,0.43,0.485,0.435,0.395,0.4,0.27,0.345,0.41,0.495,0.31
height,0.155,0.17,0.15,0.145,0.145,0.09,0.105,0.14,0.2,0.11
whole_weight,0.7845,1.321,0.99,0.7865,0.578,0.2055,0.415,0.699,1.425,0.91
shucked_weight,0.3285,0.5945,0.5795,0.332,0.2545,0.075,0.187,0.3395,0.659,0.416
viscera_weight,0.169,0.345,0.1825,0.1815,0.1305,0.0575,0.087,0.129,0.336,0.2075
shell_weight,0.245,0.345,0.206,0.2455,0.1645,0.062,0.11,0.1945,0.38,0.0995
rings,10,9,8,8,8,6,8,10,11,8


In [13]:
# number of missing values in each column
data.isna().sum()

sex               0
length            0
diameter          0
height            0
whole_weight      0
shucked_weight    0
viscera_weight    0
shell_weight      0
rings             0
dtype: int64

In [14]:
# summary of the `sex` column (count, unique values, most frequent value and its frequency)
data['sex'].describe()

count     2784
unique       3
top          M
freq       992
Name: sex, dtype: object

In [15]:
# show the first few records
data.head()

Unnamed: 0_level_0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2758,M,0.535,0.43,0.155,0.784,0.329,0.169,0.245,10
1384,F,0.63,0.485,0.17,1.321,0.595,0.345,0.345,9
1131,M,0.565,0.435,0.15,0.99,0.58,0.182,0.206,8
3726,I,0.5,0.395,0.145,0.786,0.332,0.181,0.245,8
3445,I,0.495,0.4,0.145,0.578,0.255,0.131,0.165,8
