## Tarefa - Revisão de generalização
### Ajuste de características - Titanic Competition


In [1]:
# Desabilita warnings
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
# train = pd.read_csv('train.csv')
# test = pd.read_csv('test.csv')
# Load the Kaggle Titanic train/test splits.
train = pd.read_csv('datasets/titanic/train.csv')
test = pd.read_csv('datasets/titanic/test.csv')

# Features: exactly the columns that also exist in the test split.
X = train[list(test.columns)]

# Target: the column(s) present only in the train split. A single target
# column is squeezed to a 1-D Series, which is the shape scikit-learn
# classifiers expect for y; several target columns would stay a DataFrame.
y = train[train.columns[~train.columns.isin(test.columns)]].squeeze(axis='columns')


In [2]:
from sklearn.base import BaseEstimator, TransformerMixin

def extraiPronome(nome):
    """Return the title found in a name of the form 'Surname, Title. Given names'.

    The title is the text between the first comma and the next period,
    stripped of surrounding whitespace. Raises IndexError when the name
    contains no comma.
    """
    # Only the segment right after the first comma matters.
    depois_da_virgula = nome.split(',', 2)[1]
    titulo, _, _ = depois_da_virgula.partition('.')
    return titulo.strip()

# Builds the engineered features and removes the unwanted columns at the end
# of the data treatment.
class AtributosDesejados(BaseEstimator, TransformerMixin):
    """Derive engineered Titanic features and drop unwanted raw columns.

    Added columns: 'Deck', 'Sex_Class', 'Age_Class', 'Fare_Class',
    'FamilySize' and 'FamilySize_class'. Missing 'Age' and 'Fare' values are
    filled with the medians learned in ``fit``.

    Parameters
    ----------
    excluirName : bool, default=True
        When True the 'Name' column is dropped. When False it is replaced by
        the passenger's normalised title and kept as a feature.
    """

    # Rare or foreign titles mapped onto the three common ones.
    _TITULOS_EQUIVALENTES = {
        'Mlle': 'Miss',
        'Ms': 'Mrs',
        'Mme': 'Mrs',
        'Lady': 'Mrs',
        'the Countess': 'Mrs',
        'Dona': 'Mrs',
        'Capt': 'Mr',
        'Col': 'Mr',
        'Don': 'Mr',
        'Dr': 'Mr',
        'Major': 'Mr',
        'Rev': 'Mr',
        'Sir': 'Mr',
        'Jonkheer': 'Mr',
    }

    def __init__(self, excluirName=True):
        self.excluirName = excluirName

    def fit(self, X, y=None):
        """Record the columns to drop and the medians used for imputation.

        X must contain the 'Age' and 'Fare' columns. Returns self.
        """
        self.colunasIndesejadas = ['PassengerId', 'Ticket', 'Cabin']
        if self.excluirName:
            self.colunasIndesejadas.append('Name')

        # Learn the imputation values here so that transform() fills missing
        # values with the same statistics regardless of the data it receives.
        self.medianaAge_ = X['Age'].median()
        self.medianaFare_ = X['Fare'].median()
        return self

    def transform(self, X, y=None):
        """Return a new DataFrame with the engineered features.

        The input frame is left untouched.
        """
        # Work on a copy: writing into the caller's frame would make a second
        # transform() of the same data fail when titles are parsed again.
        X = X.copy()

        if 'Name' not in self.colunasIndesejadas:
            # Replace the full name by its title, merging rare titles.
            X['Name'] = (
                X['Name'].apply(extraiPronome).replace(self._TITULOS_EQUIVALENTES)
            )

        # Deck: first letter of the cabin code (stays missing without a cabin).
        X['Deck'] = X['Cabin'].str.slice(0, 1)

        # Numeric encoding of 'Sex': 1 for male, 0 otherwise.
        X['Sex_Class'] = (X['Sex'] == 'male').astype(int)

        # Age classes: [0, 16), [16, 35), [35, 65), [65, inf).
        X['Age'] = X['Age'].fillna(self.medianaAge_)
        X['Age_Class'] = ''
        X.loc[X['Age'] < 16, 'Age_Class'] = 'children'
        X.loc[(X['Age'] >= 16) & (X['Age'] < 35), 'Age_Class'] = 'young'
        # 35 itself belongs to 'adult' so that every age falls in one class.
        X.loc[(X['Age'] >= 35) & (X['Age'] < 65), 'Age_Class'] = 'adult'
        X.loc[X['Age'] >= 65, 'Age_Class'] = 'old'

        # Fare classes: [0, 20), [20, 50), [50, 100), [100, inf).
        X['Fare'] = X['Fare'].fillna(self.medianaFare_)
        X['Fare_Class'] = ''
        X.loc[X['Fare'] < 20, 'Fare_Class'] = 'low_fare'
        X.loc[(X['Fare'] >= 20) & (X['Fare'] < 50), 'Fare_Class'] = 'median_fare'
        X.loc[(X['Fare'] >= 50) & (X['Fare'] < 100), 'Fare_Class'] = 'median_high_fare'
        X.loc[X['Fare'] >= 100, 'Fare_Class'] = 'high_fare'

        # Family size: siblings/spouses + parents/children + the passenger.
        X['FamilySize'] = (X['SibSp'] + X['Parch'] + 1).astype(int)

        # Family size classes: 1, 2-4, 5-7, 8 or more.
        X['FamilySize_class'] = ''
        X.loc[X['FamilySize'] == 1, 'FamilySize_class'] = 'alone'
        X.loc[(X['FamilySize'] > 1) & (X['FamilySize'] <= 4), 'FamilySize_class'] = 'small'
        X.loc[(X['FamilySize'] > 4) & (X['FamilySize'] <= 7), 'FamilySize_class'] = 'medium'
        X.loc[X['FamilySize'] > 7, 'FamilySize_class'] = 'large'

        # Remove the unwanted columns.
        return X.drop(columns=self.colunasIndesejadas)


In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosNumericos(BaseEstimator, TransformerMixin):
    """Select the numeric columns of a DataFrame and return them as an array."""

    def fit(self, X, y=None):
        """Remember which columns of X have a numeric dtype. Returns self."""
        somente_numericas = X.select_dtypes(include='number')
        self.colunasNumericas = somente_numericas.columns
        return self

    def transform(self, X, y=None):
        """Return the columns remembered in fit() as a NumPy array."""
        selecionadas = X[self.colunasNumericas]
        return selecionadas.to_numpy()


In [4]:
from sklearn.base import BaseEstimator, TransformerMixin

class AtributosCategoricos(BaseEstimator, TransformerMixin):
    """Select the object-dtype columns of a DataFrame and return them as an array."""

    def fit(self, X, y=None):
        """Remember which columns of X have the object dtype. Returns self."""
        somente_objetos = X.select_dtypes(include='object')
        self.colunasCategoricas = somente_objetos.columns
        return self

    def transform(self, X, y=None):
        """Return the columns remembered in fit() as a NumPy array."""
        selecionadas = X[self.colunasCategoricas]
        return selecionadas.to_numpy()


In [5]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion

# Preprocessing stage: numeric and categorical columns are handled by two
# independent branches whose outputs are concatenated by the FeatureUnion.
trataAtributos = Pipeline(steps=[
    ('unecaracteristicas', FeatureUnion(transformer_list=[
        # Numeric branch: select numeric columns, fill missing values with
        # the median, then standardise.
        ('pipenum', Pipeline(steps=[
            ('atributos_numericos', AtributosNumericos()),
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler()),
        ])),
        # Categorical branch: select object columns, fill missing values with
        # the most frequent value, then one-hot encode.
        ('pipecat', Pipeline(steps=[
            ('atributos_categoricos', AtributosCategoricos()),
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore')),
        ])),
    ])),
])


In [6]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, cross_validate, RepeatedKFold
import numpy as np

# End-to-end model: feature engineering, preprocessing, random forest.
pipetotal = Pipeline(steps=[
    ('atributosDesejados', AtributosDesejados()),
    ('trataAtributos', trataAtributos),
    ('classificador', RandomForestClassifier()),
])

# Hyper-parameter grid searched by GridSearchCV. Keys follow the
# '<step name>__<parameter>' convention of the pipeline above.
parametros = dict(
    atributosDesejados__excluirName=[False],
    classificador__max_depth=[5],
    # Other candidates, currently disabled:
    # classificador__max_depth=[3, 5, 7, 9, 11, 13, 15, 25],
    # classificador__class_weight=['balanced'],
    # classificador__criterion=['gini', 'entropy'],
    # classificador__max_features=[None, 2, 4, 6, 'sqrt'],
    # classificador__n_estimators=[100, 125, 150],
    # classificador__min_samples_split=[3, 4, 5, 6, 8, 10],
    # classificador__min_samples_leaf=[2, 3, 4],
)

modelo = GridSearchCV(
    estimator=pipetotal,
    param_grid=parametros,
    scoring='average_precision',
    n_jobs=-1,
)

# Outer evaluation: the whole grid search is cross-validated with repeated
# k-fold, so each score comes from data the inner search did not see.
scores = cross_validate(estimator=modelo, X=X, y=y, cv=RepeatedKFold())

# Notebook display: per-fold scores, their mean and their standard deviation.
scores['test_score'], scores['test_score'].mean(), scores['test_score'].std()

(array([0.87393816, 0.80773031, 0.9129642 , 0.86328643, 0.84826868,
        0.84973131, 0.84927952, 0.80536374, 0.90388853, 0.82068243,
        0.83359383, 0.84331702, 0.88075979, 0.84655813, 0.87231961,
        0.91478173, 0.89587638, 0.8311178 , 0.79253598, 0.83489005,
        0.87422412, 0.90471963, 0.80376775, 0.84549763, 0.83486193,
        0.77840798, 0.87453626, 0.83663938, 0.89458536, 0.87413509,
        0.82832996, 0.82634675, 0.92605595, 0.91129311, 0.79423459,
        0.7694483 , 0.91586074, 0.85757611, 0.89049862, 0.85271318,
        0.8660106 , 0.86010023, 0.85749483, 0.83682184, 0.83612547,
        0.89558786, 0.7528664 , 0.93465832, 0.86861768, 0.7922182 ]),
 0.8535023498701303,
 0.04181076458589789)

In [7]:
# Refit the grid search on the full training data and predict the test split.
modelo.fit(X, y)
y_pred = modelo.predict(test)

# Build the submission as a new frame instead of assigning a column onto a
# selection of `test`, which is the pattern behind SettingWithCopyWarning.
result = test[['PassengerId']].assign(Survived=y_pred)
result.to_csv('submission.csv', index=False)