In [112]:
import pandas as pd
# Load both CSV files; the paths are relative to the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

# Features: the training columns that test.csv also has.
X = train.loc[:, list(test.columns)]
# Target: whatever training columns are absent from test.csv.
y = train.loc[:, ~train.columns.isin(test.columns)]

In [113]:
from sklearn.base import BaseEstimator, TransformerMixin

def extraiPronome(nome):
    """Extract the honorific from a name shaped like ``"Surname, Title. Rest"``.

    Takes the text between the first and second comma, keeps what comes
    before the first period in it, and strips surrounding whitespace.

    Raises:
        IndexError: if ``nome`` contains no comma.
    """
    # Only the piece right after the first comma matters; later commas are ignored.
    after_comma = nome.split(',', 2)[1]
    title, _, _ = after_comma.partition('.')
    return title.strip()

class AtributosDesejados(BaseEstimator, TransformerMixin):
    """Drop the PassengerId, Ticket and Cabin columns and handle ``Name``.

    Parameters:
        excluirName: when True, ``Name`` is dropped too; when False, ``Name``
            is kept and replaced by the value ``extraiPronome`` returns for it.
    """

    def __init__(self, excluirName=True):
        self.excluirName = excluirName

    def fit(self, X, y=None):
        """Build the list of columns to drop; ``X`` and ``y`` are not inspected."""
        to_drop = ['PassengerId', 'Ticket', 'Cabin']
        if self.excluirName:
            to_drop = to_drop + ['Name']
        self.colunasIndesejadas = to_drop
        return self

    def transform(self, X, y=None):
        """Return a new frame without the unwanted columns.

        If ``Name`` was kept at fit time, it is replaced by the result of
        applying ``extraiPronome`` to each value.
        """
        reduced = X.drop(columns=self.colunasIndesejadas)
        if 'Name' in self.colunasIndesejadas:
            return reduced
        # ``drop`` returned a new frame, so this assignment does not touch X.
        reduced['Name'] = reduced['Name'].apply(extraiPronome)
        return reduced


In [114]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the numeric-dtype columns of a DataFrame."""

    def fit(self, X, y=None):
        """Remember which columns of ``X`` have a numeric dtype."""
        numeric_only = X.select_dtypes(include='number')
        self.colunasNumericas = numeric_only.columns
        return self

    def transform(self, X, y=None):
        """Return the columns remembered by ``fit`` as a NumPy array."""
        selected = X[self.colunasNumericas]
        return selected.to_numpy()


In [115]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    """Transformer that keeps only the object-dtype columns of a DataFrame."""

    def fit(self, X, y=None):
        """Remember which columns of ``X`` have the ``object`` dtype."""
        object_only = X.select_dtypes(include='object')
        self.colunasCategoricas = object_only.columns
        return self

    def transform(self, X, y=None):
        """Return the columns remembered by ``fit`` as a NumPy array."""
        selected = X[self.colunasCategoricas]
        return selected.to_numpy()


In [116]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

# Numeric branch: pick the numeric columns, fill gaps with the median,
# then standardize.
pipeline_numerico = Pipeline(steps=[
    ('atributos_numericos', AtributosNumericos()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: pick the object columns, fill gaps with the most
# frequent value, then one-hot encode (unknown categories are ignored).
pipeline_categorico = Pipeline(steps=[
    ('atributos_categoricos', AtributosCategoricos()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore')),
])

# Both branches see the same input; their outputs are joined by FeatureUnion.
uniao_de_caracteristicas = FeatureUnion(transformer_list=[
    ('pipenum', pipeline_numerico),
    ('pipecat', pipeline_categorico),
])

trataAtributos = Pipeline(steps=[
    ('unecaracteristicas', uniao_de_caracteristicas),
])


In [117]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold
import numpy as np

# Fixed seed shared by the forest and the outer CV splitter. Without it every
# Restart & Run All produces different scores and possibly a different
# selected model.
RANDOM_STATE = 42

# Full pipeline: column selection -> preprocessing -> random forest.
pipetotal = Pipeline([
    ('atributosDesejados', AtributosDesejados()),
    ('trataAtributos', trataAtributos),
    ('classificador', RandomForestClassifier(random_state=RANDOM_STATE))
])

# Grid explored by the inner search: drop or keep the title taken from Name,
# and the maximum depth of the trees.
parametros = {
    'atributosDesejados__excluirName': [True, False],
    'classificador__max_depth': [1, 3, 5, 7, 9]
}
modelo = GridSearchCV(pipetotal, param_grid=parametros, n_jobs=-1, scoring='roc_auc_ovo_weighted', refit=True)

# Nested evaluation: the outer repeated K-fold scores the whole grid search,
# so each test fold is never seen while hyper-parameters are being chosen.
scores = cross_validate(modelo, X, y.to_numpy().ravel(), cv=RepeatedKFold(random_state=RANDOM_STATE))
scores['test_score'], np.mean(scores['test_score']), np.std(scores['test_score'])

(array([0.85837308, 0.79979427, 0.88057257, 0.90297432, 0.87669102,
        0.88407557, 0.86058201, 0.83030186, 0.89659809, 0.90275493,
        0.81589674, 0.89914905, 0.86863306, 0.86722132, 0.87116075,
        0.81677393, 0.8775716 , 0.89781746, 0.84830918, 0.88493292,
        0.86834862, 0.83299718, 0.89650974, 0.84085576, 0.84068627,
        0.86559835, 0.83509459, 0.87241379, 0.87225936, 0.8797619 ,
        0.85481604, 0.91410717, 0.8691829 , 0.7932224 , 0.86691627,
        0.86462451, 0.8518814 , 0.8645127 , 0.84800839, 0.91759834,
        0.87828283, 0.83304598, 0.88776528, 0.83626875, 0.88279612,
        0.90969429, 0.83998924, 0.81554383, 0.90271615, 0.87867647]),
 0.8650871668834157,
 0.029282485815232233)

In [118]:
# Run the grid search on the full training set, then predict for the rows
# loaded from test.csv. ``fit`` returns the estimator, so the calls chain.
y_pred = modelo.fit(X, y.to_numpy().ravel()).predict(test)

In [119]:
# Pair each PassengerId with its prediction and write the file without the
# index column.
result = pd.DataFrame({
    'PassengerId': test.loc[0:, 'PassengerId'],
    'Survived': y_pred,
})
result.to_csv('submission.csv', index=False)