In [72]:
import pandas as pd
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
X = train[list(test.columns)]
y = train[train.columns[~train.columns.isin(test.columns)]]

print(y.shape)


(891, 1)


In [79]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNovos(BaseEstimator, TransformerMixin):
    def __init__(self, incluirFamilySize=True, incluirIsAlone=True):
        self.incluirFamilySize = incluirFamilySize
        self.incluirIsAlone = incluirIsAlone
    def fit(self, X, y=None):        
        return self
    def transform(self, X, y=None):
        data = X.copy()

        if self.incluirFamilySize:
            data['FamilySize'] = data['SibSp'] + data['Parch']

            # if self.incluirIsAlone:
            #     data['IsAlone'] = data['FamilySize'] == 0
            #     data['IsAlone'] = data['IsAlone'].astype(int)

        return data

atributosNovos = AtributosNovos()
Xnovos = atributosNovos.fit_transform(X)

print(Xnovos)

     PassengerId  Pclass                                               Name  \
0              1       3                            Braund, Mr. Owen Harris   
1              2       1  Cumings, Mrs. John Bradley (Florence Briggs Th...   
2              3       3                             Heikkinen, Miss. Laina   
3              4       1       Futrelle, Mrs. Jacques Heath (Lily May Peel)   
4              5       3                           Allen, Mr. William Henry   
..           ...     ...                                                ...   
886          887       2                              Montvila, Rev. Juozas   
887          888       1                       Graham, Miss. Margaret Edith   
888          889       3           Johnston, Miss. Catherine Helen "Carrie"   
889          890       1                              Behr, Mr. Karl Howell   
890          891       3                                Dooley, Mr. Patrick   

        Sex   Age  SibSp  Parch            Ticket  

In [89]:
from sklearn.base import BaseEstimator, TransformerMixin

pronomes = {
            'Mr':"Mr",
            'Mrs': "Mrs",
            'Ms': "Mrs",
            'Mme':"Mrs",
            'Mlle':"Miss",
            'Miss':"Miss",
            'Master':"Master",
            'Dr':"Dr",
            'Rev':"Officer",
            'Col':"Officer",
            'Capt':"Officer",
            'Major':"Officer",
            'Lady':"Royalty",
            'Sir':"Royalty",
            'the Countess':"Royalty",
            'Dona':"Royalty",
            'Don':"Royalty",
            'Jonkheer':"Royalty" 
        }

def extraiPronome(nome):
    return nome.split(',')[1].split('.')[0].strip()

def tratarPronome(pronome):
    return pronomes[pronome]

class AtributosDesejados(BaseEstimator, TransformerMixin):
    def __init__(self, excluirName=True, tratarPronome=True):
        self.excluirName = excluirName
        self.tratarPronome = tratarPronome
    def fit(self, X, y=None):
        self.colunasIndesejadas = ["PassengerId", "Ticket", "Cabin", "Embarked"]
        if self.excluirName:
            self.colunasIndesejadas.append('Name')
        return self
    def transform(self, X, y=None):
        data = X.copy()

        Xdrop = data.drop(self.colunasIndesejadas,axis=1)
        if 'Name' not in self.colunasIndesejadas:
            Xdrop['Name'] = Xdrop['Name'].apply(extraiPronome)
            if self.tratarPronome:
                Xdrop['Name'] = Xdrop['Name'].apply(tratarPronome)

        return Xdrop

atributosDesejados = AtributosDesejados(excluirName=False)
Xdesejados = atributosDesejados.fit_transform(Xnovos)

Pclass          0
Name            0
Sex             0
Age           177
SibSp           0
Parch           0
Fare            0
FamilySize      0
dtype: int64
Pclass          int64
Name           object
Sex            object
Age           float64
SibSp           int64
Parch           int64
Fare          float64
FamilySize      int64
dtype: object


In [75]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasNumericas = X.select_dtypes(include='number').columns
        return self
    def transform(self, X, y=None):
        data = X.copy()

        return data[self.colunasNumericas].to_numpy()

atributosNumericos = AtributosNumericos()
Xnumericos = atributosNumericos.fit_transform(Xdesejados)
print(atributosNumericos.colunasNumericas)
print(Xnumericos.isnull().sum())


Index(['Pclass', 'Age', 'Fare', 'FamilySize', 'IsAlone'], dtype='object')


In [76]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasCategoricas = X.select_dtypes(include='object').columns
        return self
    def transform(self, X, y=None):
        data = X.copy()

        return data[self.colunasCategoricas].to_numpy()


In [77]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

trataAtributos = Pipeline([
    ('unecaracteristicas', FeatureUnion([
        ('pipenum', Pipeline([
            ('atributos_numericos', AtributosNumericos()),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ])),
        ('pipecat', Pipeline([
            ('atributos_categoricos', AtributosCategoricos()),
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]))
    ])),
])


In [94]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold
import numpy as np

pipetotal = Pipeline([
    ('atributosNovos', AtributosNovos()),
    ('atributosDesejados', AtributosDesejados()),
    ('trataAtributos', trataAtributos),
    ('classificador', RandomForestClassifier())
])

parametros = {    
    'atributosDesejados__excluirName': [True, False],
    'classificador__max_depth': [None] + list(range(5,9,2))
}
modelo = GridSearchCV(pipetotal, param_grid=parametros)

scores = cross_validate(modelo, X, y.values.ravel(), cv=RepeatedKFold())

print(f"scores={scores['test_score']}, média={np.mean(scores['test_score'])}, desv_pad={np.std(scores['test_score'])}")

scores=[0.88826816 0.85393258 0.8258427  0.83146067 0.76966292 0.79888268
 0.83707865 0.83146067 0.86516854 0.84269663 0.82681564 0.79213483
 0.83146067 0.85393258 0.85955056 0.80446927 0.84831461 0.82022472
 0.83707865 0.85393258 0.81564246 0.84831461 0.79775281 0.83146067
 0.85955056 0.81564246 0.82022472 0.84831461 0.78651685 0.87640449
 0.8547486  0.76966292 0.85955056 0.87078652 0.80337079 0.7877095
 0.85393258 0.85393258 0.84831461 0.79775281 0.82122905 0.79213483
 0.87078652 0.79213483 0.87640449 0.79329609 0.83146067 0.85955056
 0.83146067 0.80898876], média=0.8309880107965602, desv_pad=0.02931529087635529


In [None]:
modelo.fit(X,y)
y_pred = modelo.predict(test)
result = test[['PassengerId']]
result['Survived'] = y_pred
result.to_csv('submission.csv',index=False)