Dataset: https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/problem12.html

# Preparando os dados
Imports

In [0]:
import pandas as pd # Importa o Pandas

arquivo = 'https://web.stanford.edu/class/archive/cs/cs109/cs109.1166/stuff/titanic.csv' # Nome e local do arquivo

df = pd.read_csv(arquivo) # Cria um dataframe do pandas

In [2]:
df.shape # Verifica o número de linhas e colunas

(887, 8)

In [3]:
df.head()

Unnamed: 0,Survived,Pclass,Name,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,Mr. Owen Harris Braund,male,22.0,1,0,7.25
1,1,1,Mrs. John Bradley (Florence Briggs Thayer) Cum...,female,38.0,1,0,71.2833
2,1,3,Miss. Laina Heikkinen,female,26.0,0,0,7.925
3,1,1,Mrs. Jacques Heath (Lily May Peel) Futrelle,female,35.0,1,0,53.1
4,0,3,Mr. William Henry Allen,male,35.0,0,0,8.05


In [4]:
df = df.drop('Name', axis=1)
df.describe() 

Unnamed: 0,Survived,Pclass,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
count,887.0,887.0,887.0,887.0,887.0,887.0
mean,0.385569,2.305524,29.471443,0.525366,0.383315,32.30542
std,0.487004,0.836662,14.121908,1.104669,0.807466,49.78204
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.25,0.0,0.0,7.925
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.1375
max,1.0,3.0,80.0,8.0,6.0,512.3292


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 887 entries, 0 to 886
Data columns (total 7 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Survived                 887 non-null    int64  
 1   Pclass                   887 non-null    int64  
 2   Sex                      887 non-null    object 
 3   Age                      887 non-null    float64
 4   Siblings/Spouses Aboard  887 non-null    int64  
 5   Parents/Children Aboard  887 non-null    int64  
 6   Fare                     887 non-null    float64
dtypes: float64(2), int64(4), object(1)
memory usage: 48.6+ KB


Mapendo valores de "sex"

In [0]:
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

In [7]:
df.head()

Unnamed: 0,Survived,Pclass,Sex,Age,Siblings/Spouses Aboard,Parents/Children Aboard,Fare
0,0,3,0,22.0,1,0,7.25
1,1,1,1,38.0,1,0,71.2833
2,1,3,1,26.0,0,0,7.925
3,1,1,1,35.0,1,0,53.1
4,0,3,0,35.0,0,0,8.05


# Modelo

In [0]:
y = df['Survived']
X = df.drop(['Survived'], axis=1)

In [0]:
from sklearn.model_selection import train_test_split # Importa parte do sklearn para dividir os dados de entrada entre treino e teste

In [0]:
X_treino, X_teste, y_treino, y_teste = train_test_split(X, y, random_state=0) # Divide os dados de entrada entre treino e teste

In [0]:
from xgboost import XGBClassifier

In [0]:
modelo = XGBClassifier()

In [13]:
modelo.fit(X_treino, y_treino)
modelo.score(X_teste, y_teste)

0.8288288288288288

# Otimizando hiperparâmetros

Com base no tutorial do [Mario Filho](http://mariofilho.com).

In [14]:
!pip install scikit-optimize
from skopt import gp_minimize 



In [15]:
from sklearn.model_selection import cross_val_score

def treinar_modelo(params):
  eta = params[0]
  max_depth = params[1]
  subsample = params[2]
  colsample_bytree = params[3]

  print(params, '\n')
    
  mdl = XGBClassifier(eta=eta, max_depth=max_depth, subsample=subsample, colsample_bytree=colsample_bytree, random_state=0)
    
  scores = cross_val_score(mdl, X, y, cv=5, scoring='accuracy')
    
  return -scores.mean()

space = [(1e-3, 1), #eta
         (2, 10), # max_depth 
         (0.05, 1.0), # subsample
         (0.1, 1.0)] # colsample bytree

resultados_gp = gp_minimize(treinar_modelo, space, random_state=0, verbose=1, n_calls=20, n_random_starts=10)

Iteration No: 1 started. Evaluating function at random point.
[0.5932517736067934, 9, 0.8650483367416192, 0.8625265649057131] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.4621
Function value obtained: -0.8276
Current minimum: -0.8276
Iteration No: 2 started. Evaluating function at random point.
[0.6239401330891865, 5, 0.3326578762172487, 0.1510416795856989] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1705
Function value obtained: -0.8039
Current minimum: -0.8276
Iteration No: 3 started. Evaluating function at random point.
[0.27338363828553314, 6, 0.8215602923367188, 0.5319794551375517] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.2856
Function value obtained: -0.8377
Current minimum: -0.8377
Iteration No: 4 started. Evaluating function at random point.
[0.393392011304729, 9, 0.37052635239640497, 0.6833546848460775] 

Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.3467
Function valu

In [16]:
resultados_gp.x

[0.27338363828553314, 6, 0.8215602923367188, 0.5319794551375517]

# Adaptando modelo

In [17]:
mdl = XGBClassifier(eta=0.001,
                    max_depth=5,
                    subsample=0.6667,
                    colsample_bytree=0.6914,
                    random_state=0)
    
scores = cross_val_score(mdl, X, y, cv=5, scoring='accuracy')

print(scores.mean())

0.8467276074398529
