In [1]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from category_encoders import OneHotEncoder
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer

In [2]:
# Load the training data; the relative path is resolved against the
# notebook's current working directory.
df = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Drop the Name, Ticket and Cabin columns.
# `df` is rebound to the result of a non-mutating drop and errors='ignore' is
# passed so that this cell is idempotent: the previous in-place drop raised
# KeyError when the cell was executed a second time, because the columns had
# already been removed from `df`.
# NOTE: errors='ignore' also means a column that is absent from the CSV is
# skipped silently instead of raising.
df = df.drop(columns=['Name', 'Ticket', 'Cabin'], errors='ignore')

In [11]:
# Numeric features: fill missing values with the column median
# (strategy='median', not the mean).
num_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

In [9]:
# Categorical features: one-hot encode them with the encoder's default settings.
cat_transformer = Pipeline([
    ('one-hot encoder', OneHotEncoder()),
])

In [12]:
# Route each group of columns through its own transformer.
# remainder='drop' is scikit-learn's default, spelled out here to make it
# visible that every column not listed below is discarded before the model
# sees the data.
preprocessor = ColumnTransformer(
    [
        ('num', num_transformer, ['Age', 'Fare']),
        ('cat', cat_transformer, ['Sex', 'Embarked']),
    ],
    remainder='drop',
)

In [13]:
# Full model: preprocessing followed by a decision tree classifier.
# The step name 'tree' is what hyperparameter grids address as 'tree__<param>'.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('tree', DecisionTreeClassifier(max_depth=3, random_state=0)),
])

In [18]:
# Notes on the preprocessing the features need:
# - numeric fields need a transformation (original note: "longe escala";
#   presumably because the fields are on very different scales — TODO confirm)
# - categorical fields (Sex and Embarked) must be encoded as numbers
df

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.2500,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.9250,S
3,4,1,1,female,35.0,1,0,53.1000,S
4,5,0,3,male,35.0,0,0,8.0500,S
...,...,...,...,...,...,...,...,...,...
886,887,0,2,male,27.0,0,0,13.0000,S
887,888,1,1,female,19.0,0,0,30.0000,S
888,889,0,3,female,,1,2,23.4500,S
889,890,1,1,male,26.0,0,0,30.0000,C


In [14]:
# Tune the tree depth with a grid search scored by shuffled 5-fold
# cross-validation. The estimator being searched is the full pipeline, so
# preprocessing is fitted together with the tree on each training split.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
parameters = {'tree__max_depth': [3, 4, 5]}
grid = GridSearchCV(estimator=model, param_grid=parameters, cv=kfold, n_jobs=-1)
grid.fit(X=df.drop(columns=['Survived']), y=df['Survived'])

GridSearchCV(cv=KFold(n_splits=5, random_state=42, shuffle=True),
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median'))]),
                                                                         ['Age',
                                                                          'Fare']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('one-hot '
                                                                                          'encoder',
                                                                                          OneHotEncoder())]),
             

In [15]:
# Show the hyperparameter combination that the grid search selected as best.
grid.best_params_

{'tree__max_depth': 4}