## Importação dos pacotes

In [2]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [3]:
# definir parâmetros extras
pd.set_option('precision', 4)
pd.set_option('display.max_columns', 100)

## Carga dos dados

In [23]:
# carregar arquivo de dados de treino
data = pd.read_csv('titanic-train.csv', index_col='person')

# mostrar alguns exemplos de registros
data.head()

Unnamed: 0_level_0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home_destination
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
416,2,no,"Gaskell, Mr. Alfred",male,16.0,0,0,239865,26.0,,S,"Liverpool / Montreal, PQ"
194,1,no,"Maguire, Mr. John Edward",male,30.0,0,0,110469,26.0,C106,S,"Brockton, MA"
600,3,no,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S,
1112,3,no,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.775,,S,
878,3,no,"Ilmakangas, Miss. Pieta Sofia",female,25.0,1,0,STON/O2. 3101271,7.925,,S,


In [24]:
# quantas linhas e colunas existem?
data.shape

(872, 12)

## Análise dos dados

In [5]:
# quais são as colunas e respectivos tipos de dados?
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 872 entries, 416 to 1125
Data columns (total 12 columns):
pclass              872 non-null int64
survived            872 non-null object
name                872 non-null object
sex                 872 non-null object
age                 704 non-null float64
sibsp               872 non-null int64
parch               872 non-null int64
ticket              872 non-null object
fare                872 non-null float64
cabin               208 non-null object
embarked            870 non-null object
home_destination    494 non-null object
dtypes: float64(2), int64(3), object(7)
memory usage: 64.7+ KB


In [6]:
# existem colunas com dados nulos?
data[data.columns[data.isnull().any()]].isnull().sum()

age                 168
cabin               664
embarked              2
home_destination    378
dtype: int64

In [7]:
# sumário estatístico das características numéricas
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
pclass,872.0,2.281,0.8438,1.0,1.0,3.0,3.0,3.0
age,704.0,29.4871,14.3915,0.17,21.0,28.0,38.0,80.0
sibsp,872.0,0.4908,1.0131,0.0,0.0,0.0,1.0,8.0
parch,872.0,0.3773,0.8347,0.0,0.0,0.0,0.0,9.0
fare,872.0,31.9279,46.2903,0.0,7.8958,14.4542,30.7719,512.3292


In [8]:
# sumário das características textuais
data.describe(include=['O']).T

Unnamed: 0,count,unique,top,freq
survived,872,2,no,550
name,872,872,"Najib, Miss. Adele Kiamie ""Jane""",1
sex,872,2,male,564
ticket,872,674,1601,7
cabin,208,154,G6,5
embarked,870,3,S,617
home_destination,494,289,"New York, NY",41


In [14]:
# quais as correlações entre as características numéricas?
corr = pd.get_dummies(data, columns=['survived', 'sex', 'embarked']).corr()
corr

Unnamed: 0,pclass,age,sibsp,parch,fare,survived_no,survived_yes,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
pclass,1.0,-0.4396,0.0695,0.0156,-0.5721,0.2944,-0.2944,-0.0897,0.0897,-0.2335,0.226,0.0677
age,-0.4396,1.0,-0.2487,-0.1694,0.1838,0.0483,-0.0483,-0.0871,0.0871,0.1214,-0.0035,-0.119
sibsp,0.0695,-0.2487,1.0,0.3849,0.1865,0.0259,-0.0259,0.1015,-0.1015,-0.0467,-0.0342,0.0651
parch,0.0156,-0.1694,0.3849,1.0,0.2559,-0.0784,0.0784,0.2209,-0.2209,-0.0169,-0.0926,0.0762
fare,-0.5721,0.1838,0.1865,0.2559,1.0,-0.2405,0.2405,0.1734,-0.1734,0.2415,-0.1323,-0.132
survived_no,0.2944,0.0483,0.0259,-0.0784,-0.2405,1.0,-1.0,-0.5134,0.5134,-0.1403,-0.0007,0.1297
survived_yes,-0.2944,-0.0483,-0.0259,0.0784,0.2405,-1.0,1.0,0.5134,-0.5134,0.1403,0.0007,-0.1297
sex_female,-0.0897,-0.0871,0.1015,0.2209,0.1734,-0.5134,0.5134,1.0,-1.0,0.0437,0.0859,-0.0999
sex_male,0.0897,0.0871,-0.1015,-0.2209,-0.1734,0.5134,-0.5134,-1.0,1.0,-0.0437,-0.0859,0.0999
embarked_C,-0.2335,0.1214,-0.0467,-0.0169,0.2415,-0.1403,0.1403,0.0437,-0.0437,1.0,-0.1586,-0.7711


In [20]:
# quais as correlações mais expressivas entre as variáveis?
corr[corr != 1][abs(corr) > 0.05].dropna(how='all', axis=1).dropna(how='all', axis=0)

Unnamed: 0,pclass,age,sibsp,parch,fare,survived_no,survived_yes,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
pclass,,-0.4396,0.0695,,-0.5721,0.2944,-0.2944,-0.0897,0.0897,-0.2335,0.226,0.0677
age,-0.4396,,-0.2487,-0.1694,0.1838,,,-0.0871,0.0871,0.1214,,-0.119
sibsp,0.0695,-0.2487,,0.3849,0.1865,,,0.1015,-0.1015,,,0.0651
parch,,-0.1694,0.3849,,0.2559,-0.0784,0.0784,0.2209,-0.2209,,-0.0926,0.0762
fare,-0.5721,0.1838,0.1865,0.2559,,-0.2405,0.2405,0.1734,-0.1734,0.2415,-0.1323,-0.132
survived_no,0.2944,,,-0.0784,-0.2405,,-1.0,-0.5134,0.5134,-0.1403,,0.1297
survived_yes,-0.2944,,,0.0784,0.2405,-1.0,,0.5134,-0.5134,0.1403,,-0.1297
sex_female,-0.0897,-0.0871,0.1015,0.2209,0.1734,-0.5134,0.5134,,-1.0,,0.0859,-0.0999
sex_male,0.0897,0.0871,-0.1015,-0.2209,-0.1734,0.5134,-0.5134,-1.0,,,-0.0859,0.0999
embarked_C,-0.2335,0.1214,,,0.2415,-0.1403,0.1403,,,,-0.1586,-0.7711


In [21]:
data.groupby('survived').mean()

Unnamed: 0_level_0,pclass,age,sibsp,parch,fare
survived,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
no,2.4709,30.0396,0.5109,0.3273,23.4128
yes,1.9565,28.6148,0.4565,0.4627,46.4723


In [29]:
data2 = data[['pclass', 'survived', 'sex', 'sibsp', 'parch']]
data2.survived = data2.survived.map({'yes': 1, 'no': 0})
data2.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[name] = value


Unnamed: 0_level_0,pclass,survived,sex,sibsp,parch
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
416,2,0,male,0,0
194,1,0,male,0,0
600,3,0,male,0,0
1112,3,0,female,1,1
878,3,0,female,1,0


In [30]:
# existe correlação entre sobrevivência e classe social?
data2[['pclass', 'survived']].groupby(['pclass'], as_index=False).mean().\
  sort_values(by='survived', ascending=False)

Unnamed: 0,pclass,survived
0,1,0.5946
1,2,0.3934
2,3,0.2527


In [32]:
# existe correlação entre sobrevivência e sexo?
data2[['sex', 'survived']].groupby(['sex'], as_index=False).mean().\
  sort_values(by='survived', ascending=False)

Unnamed: 0,sex,survived
0,female,0.7045
1,male,0.1862


In [33]:
# existe correlação entre sobrevivência e número de irmãos?
data2[['sibsp', 'survived']].groupby(['sibsp'], as_index=False).mean().\
  sort_values(by='survived', ascending=False)

Unnamed: 0,sibsp,survived
1,1,0.4977
2,2,0.4783
0,0,0.3328
3,3,0.3077
4,4,0.0625
5,5,0.0
6,8,0.0


In [34]:
# existe correlação entre sobrevivência e número de pais/filhos?
data2[['parch', 'survived']].groupby(['parch'], as_index=False).mean().\
  sort_values(by='survived', ascending=False)

Unnamed: 0,parch,survived
1,1,0.5856
2,2,0.5
3,3,0.5
0,0,0.3229
4,4,0.0
5,5,0.0
6,6,0.0
7,9,0.0


In [41]:
data[data.cabin.isnull() == False][['cabin', 'survived']].head(20).\
    sort_values(by='cabin', ascending=True)

Unnamed: 0_level_0,cabin,survived
person,Unnamed: 1_level_1,Unnamed: 2_level_1
239,A24,no
267,A7,no
300,B19,no
272,B45,yes
0,B5,yes
97,B58 B60,yes
54,B96 B98,yes
194,C106,no
268,C31,no
133,C92,yes


In [42]:
data['cabin'].value_counts()

C23 C25 C27        5
G6                 5
F2                 3
B96 B98            3
B58 B60            3
D                  3
C101               3
D17                2
B28                2
C93                2
B71                2
F G63              2
B18                2
F33                2
E44                2
A34                2
E121               2
B51 B53 B55        2
D19                2
E101               2
B78                2
B20                2
C78                2
C126               2
B77                2
B57 B59 B63 B66    2
B41                2
E46                2
C124               2
C31                2
                  ..
D15                1
F E69              1
B69                1
C46                1
T                  1
E40                1
E17                1
C89                1
D36                1
C55 C57            1
D56                1
B80                1
D49                1
C39                1
D28                1
D6                 1
E58          

In [47]:
data['deck'] = data['cabin'].str[:1]
data['room'] = data['cabin'].str.extract("([0-9]+)", expand=False)

In [48]:
data.head(20)

Unnamed: 0_level_0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home_destination,deck,room
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
416,2,no,"Gaskell, Mr. Alfred",male,16.0,0,0,239865,26.0,,S,"Liverpool / Montreal, PQ",,
194,1,no,"Maguire, Mr. John Edward",male,30.0,0,0,110469,26.0,C106,S,"Brockton, MA",C,106.0
600,3,no,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S,,,
1112,3,no,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.775,,S,,,
878,3,no,"Ilmakangas, Miss. Pieta Sofia",female,25.0,1,0,STON/O2. 3101271,7.925,,S,,,
912,3,no,"Karaic, Mr. Milan",male,30.0,0,0,349246,7.8958,,S,,,
1305,3,no,"Zabour, Miss. Thamine",female,,1,0,2665,14.4542,,C,,,
1061,3,yes,"Nilsson, Miss. Helmina Josefina",female,26.0,0,0,347470,7.8542,,S,,,
1019,3,no,"Mineff, Mr. Ivan",male,24.0,0,0,349233,7.8958,,S,,,
1231,3,no,"Strom, Mrs. Wilhelm (Elna Matilda Persson)",female,29.0,1,1,347054,10.4625,G6,S,,G,6.0


In [50]:
corr = pd.get_dummies(data, columns=['survived', 'deck']).corr()
corr

Unnamed: 0,pclass,age,sibsp,parch,fare,survived_no,survived_yes,deck_A,deck_B,deck_C,deck_D,deck_E,deck_F,deck_G,deck_T
pclass,1.0,-0.4396,0.0695,0.0156,-0.5721,0.2944,-0.2944,-0.201,-0.3543,-0.4239,-0.2656,-0.2255,0.0396,0.0647,-0.0515
age,-0.4396,1.0,-0.2487,-0.1694,0.1838,0.0483,-0.0483,0.1161,0.1288,0.1911,0.1218,0.1306,-0.0609,-0.1028,0.0407
sibsp,0.0695,-0.2487,1.0,0.3849,0.1865,0.0259,-0.0259,-0.0467,-0.0158,0.0616,-0.019,-0.038,-0.0293,0.0082,-0.0164
parch,0.0156,-0.1694,0.3849,1.0,0.2559,-0.0784,0.0784,-0.0281,0.0809,0.0225,-0.0249,-0.02,-0.007,0.0749,-0.0153
fare,-0.5721,0.1838,0.1865,0.2559,1.0,-0.2405,0.2405,0.0327,0.3802,0.4246,0.085,0.0748,-0.0507,-0.0291,0.0026
survived_no,0.2944,0.0483,0.0259,-0.0784,-0.2405,1.0,-1.0,-0.0267,-0.176,-0.1169,-0.1098,-0.1483,-0.0267,-0.0363,0.0259
survived_yes,-0.2944,-0.0483,-0.0259,0.0784,0.2405,-1.0,1.0,0.0267,0.176,0.1169,0.1098,0.1483,0.0267,0.0363,-0.0259
deck_A,-0.201,0.1161,-0.0467,-0.0281,0.0327,-0.0267,0.0267,1.0,-0.0309,-0.0369,-0.0262,-0.0254,-0.0175,-0.01,-0.0045
deck_B,-0.3543,0.1288,-0.0158,0.0809,0.3802,-0.176,0.176,-0.0309,1.0,-0.0651,-0.0463,-0.0448,-0.0309,-0.0177,-0.0079
deck_C,-0.4239,0.1911,0.0616,0.0225,0.4246,-0.1169,0.1169,-0.0369,-0.0651,1.0,-0.0553,-0.0536,-0.0369,-0.0212,-0.0095
