In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

## Lendo o Banco de Dados

In [2]:
dados = pd.read_csv('titanic/train.csv')

In [3]:
dados.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
dados.drop(columns=['Name', 'Ticket', 'Cabin', 'Embarked'], inplace=True)

## Editando a variável de resposta (Target)

In [12]:
dados.set_index(['PassengerId'], inplace=True)

In [13]:
dados.rename(columns= {'Survived':'Target'}, inplace=True)

In [14]:
dados.head()

Unnamed: 0_level_0,Target,Pclass,Sex,Age,SibSp,Parch,Fare
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,0,3,male,22.0,1,0,7.25
2,1,1,female,38.0,1,0,71.2833
3,1,3,female,26.0,0,0,7.925
4,1,1,female,35.0,1,0,53.1
5,0,3,male,35.0,0,0,8.05


## Descrevendo os dados

In [15]:
dados.describe()

Unnamed: 0,Target,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [17]:
dados.describe(include=['O'])

Unnamed: 0,Sex
count,891
unique,2
top,male
freq,577


## Transformando os dados

In [18]:
dados['Sex_F'] = np.where(dados['Sex']=='Female', 1, 0)

dados['Pclass_1'] = np.where(dados['Pclass']==1, 1, 0)
dados['Pclass_2'] = np.where(dados['Pclass']==2, 1, 0)
dados['Pclass_3'] = np.where(dados['Pclass']==3, 1, 0)

In [19]:
dados.drop(columns=['Sex', 'Pclass'], inplace=True)

In [20]:
dados.isnull().sum()

Target        0
Age         177
SibSp         0
Parch         0
Fare          0
Sex_F         0
Pclass_1      0
Pclass_2      0
Pclass_3      0
dtype: int64

In [21]:
dados.fillna(0, inplace=True)

## Amostragem

In [31]:
x_train, x_test, y_train, y_test = train_test_split(dados.drop('Target', axis=1), dados['Target'], test_size=0.3, random_state=1234) 

[{'treino': x_train.shape}, {'test': x_test.shape}]

[{'treino': (623, 8)}, {'test': (268, 8)}]

## Modelo

In [33]:
rndForest = RandomForestClassifier(n_estimators=1000,
                                  criterion='gini',
                                  max_depth=5)

rndForest.fit(x_train, y_train)

RandomForestClassifier(max_depth=5, n_estimators=1000)

In [34]:
probabilidade = rndForest.predict_proba(dados.drop('Target', axis=1))[:, 1]
classificacao = rndForest.predict(dados.drop('Target', axis=1))

In [35]:
dados['Probabilidade'] = probabilidade
dados['Classificação'] = classificacao

In [36]:
dados.head()

Unnamed: 0_level_0,Target,Age,SibSp,Parch,Fare,Sex_F,Pclass_1,Pclass_2,Pclass_3,Probabilidade,Classificação
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,22.0,1,0,7.25,0,0,0,1,0.195939,0
2,1,38.0,1,0,71.2833,0,1,0,0,0.772656,1
3,1,26.0,0,0,7.925,0,0,0,1,0.217681,0
4,1,35.0,1,0,53.1,0,1,0,0,0.721645,1
5,0,35.0,0,0,8.05,0,0,0,1,0.177804,0
