## Importação dos pacotes

In [1]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [2]:
# configure pandas display options
# Use the fully qualified key: the short pattern 'precision' matches more than
# one option in newer pandas releases and is rejected as ambiguous.
pd.set_option('display.precision', 4)
pd.set_option('display.max_columns', 100)

import warnings
# NOTE: this silences every warning for the rest of the notebook, including
# deprecation and convergence warnings emitted by the libraries used below.
warnings.filterwarnings("ignore")

## Carga dos dados

In [38]:
# directory prefix prepended to the CSV file names when loading the data
prefixo_arquivos = 'titanic/'
# alternative prefix (Kaggle input directory); uncomment to use it instead
#prefixo_arquivos = '/kaggle/input/titanic/'

In [39]:
# load the training data file, using PassengerId as the index
train_data = pd.read_csv(prefixo_arquivos + 'train.csv', index_col='PassengerId')

In [40]:
# load the test data file, using PassengerId as the index
test_data = pd.read_csv(prefixo_arquivos + 'test.csv', index_col='PassengerId')

In [41]:
# stack the training and test rows into one frame so that every
# transformation below is applied to both sets at once
data = pd.concat([train_data, test_data])

# show a few sample records
data.head()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,22.0,,S,7.25,"Braund, Mr. Owen Harris",0,3,male,1,0.0,A/5 21171
2,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,female,1,1.0,PC 17599
3,26.0,,S,7.925,"Heikkinen, Miss. Laina",0,3,female,0,1.0,STON/O2. 3101282
4,35.0,C123,S,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,female,1,1.0,113803
5,35.0,,S,8.05,"Allen, Mr. William Henry",0,3,male,0,0.0,373450


## Transformações nos dados

In [42]:
# extract each person's title from the name: the word that precedes a period.
# The pattern is a raw string so that `\.` reaches the regex engine as written
# instead of relying on an invalid Python string escape.
data['Title'] = data['Name'].str.extract(r' ([A-Za-z]+)\.', expand=False)

# show how titles relate to sex
pd.crosstab(data['Title'], data['Sex']).T

Title,Capt,Col,Countess,Don,Dona,Dr,Jonkheer,Lady,Major,Master,Miss,Mlle,Mme,Mr,Mrs,Ms,Rev,Sir
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
female,0,0,1,0,1,1,0,1,0,0,260,2,1,0,197,2,0,0
male,1,4,0,1,0,7,1,0,2,61,0,0,0,757,0,0,8,1


In [43]:
# fold uncommon titles into a handful of groups: {group: [titles merged into it]}
replacements = {
    'Miss': ['Mlle', 'Ms'],
    'Mrs': ['Mme'],
    'Rare': ['Lady', 'Countess', 'Capt', 'Col', 'Don',
             'Dr', 'Major', 'Rev', 'Sir', 'Jonkheer', 'Dona'],
}
# invert to {original title: group} and apply every substitution in one pass
title_aliases = {
    original: group
    for group, originals in replacements.items()
    for original in originals
}
data['Title'] = data['Title'].replace(title_aliases)

# show how the grouped titles relate to sex
pd.crosstab(data['Title'], data['Sex']).T

Title,Master,Miss,Mr,Mrs,Rare
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
female,0,264,0,198,4
male,61,0,757,0,25


In [44]:
# encode each title group as an integer code; anything unmapped becomes 0
title_mapping = {
    'Mr': 1,
    'Miss': 2,
    'Mrs': 3,
    'Master': 4,
    'Rare': 5,
}
data['Title'] = data['Title'].map(title_mapping).fillna(0)

In [45]:
# encode sex as an integer flag: female -> 1, male -> 0
data['Sex'] = data['Sex'].map({'female': 1, 'male': 0}).astype(int)

In [46]:
# fill missing ports of embarkation with the most frequent value, then encode
# them as integers. The filled column is assigned back explicitly: calling
# fillna(inplace=True) on `data['Embarked']` operates on an intermediate object
# and is not guaranteed to update `data` itself.
data['Embarked'] = data['Embarked'].fillna(data['Embarked'].mode()[0])
data['Embarked'] = data['Embarked'].map({'S': 0, 'C': 1, 'Q': 2}).astype(int)

In [47]:
# fill missing fares with the mean fare of the combined train + test frame.
# Assigned back explicitly instead of using fillna(inplace=True) on the
# selected column, which is not guaranteed to modify `data`.
data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

In [48]:
# create the family-size column: Parch + SibSp, plus 1 for the passenger
data['FSize'] = data['Parch'] + data['SibSp'] + 1

In [49]:
# flag passengers travelling alone (family size of exactly 1) as 1, others as 0
travelling_alone = data['FSize'] == 1
data['Alone'] = travelling_alone.astype('int64')

In [50]:
# derive the deck from the first character of the cabin ('N' when the cabin
# is missing) and store it as integer category codes
data['Deck'] = (
    data['Cabin']
    .str[:1]
    .fillna('N')
    .astype('category')
    .cat.codes
)

In [51]:
# derive the room number from the first run of digits in the cabin string;
# rows without a match get 0
room_digits = data['Cabin'].str.extract("([0-9]+)", expand=False)
data['Room'] = room_digits.fillna(0).astype(int)

In [52]:
# preview the frame with the engineered columns
data.head()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,FSize,Alone,Deck,Room
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,22.0,,0,7.25,"Braund, Mr. Owen Harris",0,3,0,1,0.0,A/5 21171,1,2,0,7,0
2,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,1,1,1.0,PC 17599,3,2,0,2,85
3,26.0,,0,7.925,"Heikkinen, Miss. Laina",0,3,1,0,1.0,STON/O2. 3101282,2,1,1,7,0
4,35.0,C123,0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,1,1,1.0,113803,3,2,0,2,123
5,35.0,,0,8.05,"Allen, Mr. William Henry",0,3,0,0,0.0,373450,1,1,1,7,0


### Inferir idades faltantes dos passageiros

In [53]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

def evaluate_regression_model(model, X, y):
    """Cross-validate a regressor and report its mean squared error.

    Runs 10-fold cross-validation with the ``neg_mean_squared_error`` scorer
    and prints the estimator together with its score.

    Parameters
    ----------
    model : estimator to evaluate.
    X : feature matrix.
    y : target values.

    Returns
    -------
    (score, stddev) : the mean MSE across folds (sign flipped, so lower is
        better) and the standard deviation of the per-fold scores.
    """
    # Folds are contiguous and unshuffled. ``random_state`` is intentionally
    # not passed: it has no effect unless ``shuffle=True``, and current
    # scikit-learn raises a ValueError for that combination.
    kfold = KFold(n_splits=10)
    results = cross_val_score(model, X, y, cv=kfold, scoring='neg_mean_squared_error', verbose=1)
    # the scorer returns negated MSE; flip the sign back for reporting
    score = (-1) * results.mean()
    stddev = results.std()
    print(model, '\nScore: %.2f (+/- %.2f)' % (score, stddev))
    return score, stddev

In [54]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

# candidate regressors for imputing Age, as (label, estimator) pairs;
# the commented-out entries are excluded from the comparison
age_models = [
#    ('LR', LinearRegression(n_jobs=-1, fit_intercept=True, normalize=True)),
    ('GBR', GradientBoostingRegressor(random_state=42)),
    ('RFR', RandomForestRegressor(random_state=42)),
    ('XGB', XGBRegressor(random_state=42, objective='reg:squarederror')),
#    ('MLP', MLPRegressor(random_state=42, max_iter=500, activation='tanh',
#                         hidden_layer_sizes=(10,5,5), solver='lbfgs')),
#    ('GPR', GaussianProcessRegressor(random_state=42, alpha=0.01, normalize_y=True))
]

In [55]:
# columns used to train the age regressor (Age itself is the target)
cols = ['Pclass', 'SibSp', 'Parch', 'Fare', 'Title', 'Age', 'Alone']

# keep only the rows where every selected column is populated
data_age = data[cols].dropna()

# split into target and features
y_age = data_age['Age']
X_age = data_age.drop(columns=['Age'])

data_age.head()

Unnamed: 0_level_0,Pclass,SibSp,Parch,Fare,Title,Age,Alone
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1,3,1,0,7.25,1,22.0,0
2,1,1,0,71.2833,3,38.0,0
3,3,0,0,7.925,2,26.0,1
4,1,1,0,53.1,3,35.0,0
5,3,0,0,8.05,1,35.0,1


In [56]:
# pairwise correlations between the age-model columns
data_age.corr()

Unnamed: 0,Pclass,SibSp,Parch,Fare,Title,Age,Alone
Pclass,1.0,0.0472,0.0172,-0.5651,-0.1541,-0.4081,0.1595
SibSp,0.0472,1.0,0.3745,0.1412,0.308,-0.2437,-0.6274
Parch,0.0172,0.3745,1.0,0.2167,0.3151,-0.1509,-0.5701
Fare,-0.5651,0.1412,0.2167,1.0,0.1484,0.1782,-0.2598
Title,-0.1541,0.308,0.3151,0.1484,1.0,-0.0926,-0.3973
Age,-0.4081,-0.2437,-0.1509,0.1782,-0.0926,1.0,0.1288
Alone,0.1595,-0.6274,-0.5701,-0.2598,-0.3973,0.1288,1.0


In [57]:
# cross-validate every candidate regressor and keep the one with the lowest MSE
names = []
scores = []
# start from +infinity so the first evaluated model always becomes the current
# best, whatever the magnitude of its error (a fixed sentinel such as 999
# would leave best_model as None if every score exceeded it)
lowest = float('inf')
best_model = None

for name, model in age_models:

    score, stddev = evaluate_regression_model(model, X_age, y_age)
    names.append(name)
    scores.append(score)

    # lower mean squared error is better
    if score < lowest:
        best_model = model
        lowest = score

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.1s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
                          learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='auto',
                          random_state=42, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False) 
Score: 112.79 (+/- 14.49)


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.4s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators='warn',
                      n_jobs=None, oob_score=False, random_state=42, verbose=0,
                      warm_start=False) 
Score: 135.68 (+/- 24.32)
XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1) 
Score: 112.37 (+/-

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.9s finished


In [58]:
# tabulate the cross-validated score of each age model, best (lowest) first
results = pd.DataFrame(
    list(zip(names, scores)),
    columns=['Age Model', 'Score'],
)
results.sort_values('Score')

Unnamed: 0,Age Model,Score
2,XGB,112.367
0,GBR,112.7919
1,RFR,135.679


In [59]:
# refit the selected regressor on all rows used for the age model
age_model = best_model
age_model.fit(X_age, y_age)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:squarederror',
             random_state=42, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             seed=None, silent=None, subsample=1, verbosity=1)

In [60]:
# fill the missing ages with the regressor's predictions; rows that already
# have an age keep their original value
age_features = data[cols].drop('Age', axis=1)
age_pred = pd.Series(age_model.predict(age_features), index=data.index)
data['Age'] = data['Age'].fillna(age_pred)
data.head()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,FSize,Alone,Deck,Room
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,22.0,,0,7.25,"Braund, Mr. Owen Harris",0,3,0,1,0.0,A/5 21171,1,2,0,7,0
2,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,1,1,1.0,PC 17599,3,2,0,2,85
3,26.0,,0,7.925,"Heikkinen, Miss. Laina",0,3,1,0,1.0,STON/O2. 3101282,2,1,1,7,0
4,35.0,C123,0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,1,1,1.0,113803,3,2,0,2,123
5,35.0,,0,8.05,"Allen, Mr. William Henry",0,3,0,0,0.0,373450,1,1,1,7,0


In [61]:
# are there still columns with null values? list them with their null counts
null_counts = data.isnull().sum()
null_counts[null_counts > 0]

Cabin       1014
Survived     418
dtype: int64

In [62]:
# preview the frame after age imputation
data.head()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,FSize,Alone,Deck,Room
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,22.0,,0,7.25,"Braund, Mr. Owen Harris",0,3,0,1,0.0,A/5 21171,1,2,0,7,0
2,38.0,C85,1,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,1,1,1,1.0,PC 17599,3,2,0,2,85
3,26.0,,0,7.925,"Heikkinen, Miss. Laina",0,3,1,0,1.0,STON/O2. 3101282,2,1,1,7,0
4,35.0,C123,0,53.1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,1,1,1,1.0,113803,3,2,0,2,123
5,35.0,,0,8.05,"Allen, Mr. William Henry",0,3,0,0,0.0,373450,1,1,1,7,0


In [63]:
# rescale the continuous numeric columns with min-max scaling
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

# NOTE: `cols` is rebound here; it previously held the age-model columns
cols = ['Age', 'Fare', 'Parch', 'Pclass', 'SibSp', 'FSize']

# The scaler is fitted on the combined train + test frame.
# Replace the columns wholesale instead of writing through `.loc[:, cols]`:
# the scaled values are floats, and an in-place `.loc` write into the integer
# columns (Parch, Pclass, SibSp, FSize) depends on a deprecated silent upcast.
data[cols] = scaler.fit_transform(data[cols])

In [64]:
# preview the frame after scaling
data.head()

Unnamed: 0_level_0,Age,Cabin,Embarked,Fare,Name,Parch,Pclass,Sex,SibSp,Survived,Ticket,Title,FSize,Alone,Deck,Room
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1,0.2735,,0,0.0142,"Braund, Mr. Owen Harris",0.0,1.0,0,0.125,0.0,A/5 21171,1,0.1,0,7,0
2,0.4739,C85,1,0.1391,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0.0,0.0,1,0.125,1.0,PC 17599,3,0.1,0,2,85
3,0.3236,,0,0.0155,"Heikkinen, Miss. Laina",0.0,1.0,1,0.0,1.0,STON/O2. 3101282,2,0.0,1,7,0
4,0.4363,C123,0,0.1036,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0.0,0.0,1,0.125,1.0,113803,3,0.1,0,2,123
5,0.4363,,0,0.0157,"Allen, Mr. William Henry",0.0,1.0,0,0.0,0.0,373450,1,0.0,1,7,0


## Modelagem preditiva

In [65]:
# importar os pacotes necessários para os algoritmos de classificação
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

In [66]:
from datetime import datetime
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

def evaluate_classification_model(model, X, y):
    """Cross-validate a classifier and report its accuracy and run time.

    Runs 10-fold cross-validation with the ``accuracy`` scorer, times the
    whole run, and prints the estimator together with the results.

    Parameters
    ----------
    model : estimator to evaluate.
    X : feature matrix.
    y : target labels.

    Returns
    -------
    (score, stddev, elapsed) : mean accuracy and its standard deviation across
        folds, both as percentages, and the elapsed time in whole milliseconds.
    """
    start = datetime.now()
    # Folds are contiguous and unshuffled. ``random_state`` is intentionally
    # not passed: it has no effect unless ``shuffle=True``, and current
    # scikit-learn raises a ValueError for that combination.
    kfold = KFold(n_splits=10)
    results = cross_val_score(model, X, y, cv=kfold, scoring='accuracy', verbose=1)
    end = datetime.now()
    elapsed = int((end - start).total_seconds() * 1000)
    # report accuracy as a percentage
    score = 100.0 * results.mean()
    stddev = 100.0 * results.std()
    print(model, '\nScore: %.2f (+/- %.2f) [%5s ms]' % (score, stddev, elapsed))
    return score, stddev, elapsed

In [67]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

def fine_tune_model(model, params, X, y):
    """Grid-search the hyperparameters of a classifier.

    Fits a ``GridSearchCV`` over ``params`` using 10-fold cross-validation and
    the ``accuracy`` scorer, printing the best score and parameters found.

    Parameters
    ----------
    model : estimator to tune.
    params : parameter grid passed to ``GridSearchCV`` as ``param_grid``.
    X : feature matrix.
    y : target labels.

    Returns
    -------
    The fitted ``GridSearchCV`` object.
    """
    print('\nFine Tuning Model:')
    print(model, "\nparams:", params)
    # Folds are contiguous and unshuffled. ``random_state`` is intentionally
    # not passed: it has no effect unless ``shuffle=True``, and current
    # scikit-learn raises a ValueError for that combination.
    kfold = KFold(n_splits=10)
    grid = GridSearchCV(estimator=model, param_grid=params, scoring='accuracy', cv=kfold, verbose=1)
    grid.fit(X, y)
    print('\nGrid Best Score: %.2f' % (grid.best_score_ * 100.0))
    print('Best Params:', grid.best_params_)
    return grid

In [68]:
# training rows are the ones that carry a Survived label
train_data = data[data['Survived'].notnull()]

# features fed to the classifiers (`cols` is rebound once more here)
cols = [
    'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
    'Embarked', 'Title', 'FSize', 'Alone', 'Deck',
]

X_train, y_train = train_data[cols], train_data['Survived']

print('Forma dos dados de treino:', X_train.shape, y_train.shape)

Forma dos dados de treino: (891, 11) (891,)


In [69]:
# pairwise correlations between the numeric training columns. The text columns
# (Name, Cabin, Ticket) are excluded explicitly because corr() raises on
# non-numeric data in recent pandas instead of silently dropping it.
train_data.select_dtypes(include='number').corr()

Unnamed: 0,Age,Embarked,Fare,Parch,Pclass,Sex,SibSp,Survived,Title,FSize,Alone,Deck,Room
Age,1.0,-0.0186,0.1048,-0.211,-0.4031,-0.1125,-0.3128,-0.0619,-0.1123,-0.3192,0.2113,-0.2868,0.2312
Embarked,-0.0186,1.0,0.0621,-0.0787,0.0457,0.1166,-0.06,0.1068,0.0454,-0.0803,0.0178,-0.0391,0.0238
Fare,0.1048,0.0621,1.0,0.2162,-0.5495,0.1823,0.1597,0.2573,0.1363,0.2171,-0.2718,-0.5257,0.4056
Parch,-0.211,-0.0787,0.2162,1.0,0.0184,0.2455,0.4148,0.0816,0.3158,0.7831,-0.5834,-0.0316,0.0016
Pclass,-0.4031,0.0457,-0.5495,0.0184,1.0,-0.1319,0.0831,-0.3385,-0.1739,0.066,0.1352,0.7421,-0.5733
Sex,-0.1125,0.1166,0.1823,0.2455,-0.1319,1.0,0.1146,0.5434,0.5027,0.201,-0.3036,-0.1186,0.0917
SibSp,-0.3128,-0.06,0.1597,0.4148,0.0831,0.1146,1.0,-0.0353,0.2696,0.8907,-0.5845,0.0411,-0.0387
Survived,-0.0619,0.1068,0.2573,0.0816,-0.3385,0.5434,-0.0353,1.0,0.4078,0.0166,-0.2034,-0.2951,0.2298
Title,-0.1123,0.0454,0.1363,0.3158,-0.1739,0.5027,0.2696,0.4078,1.0,0.342,-0.4058,-0.1323,0.0798
FSize,-0.3192,-0.0803,0.2171,0.7831,0.066,0.201,0.8907,0.0166,0.342,1.0,-0.6909,0.0123,-0.0257


In [70]:
# test rows are the ones without a Survived label
is_unlabelled = data['Survived'].isnull()
test_data = data[is_unlabelled]

# same feature columns as the training matrix
X_test = test_data.loc[:, cols]

print('Forma dos dados de teste:', X_test.shape)

Forma dos dados de teste: (418, 11)


In [71]:
# parallel lists collecting one entry per evaluated classifier
names, models, scores, stddevs, times = [], [], [], [], []


def add_model_info(name, model, score, stddev, elapsed):
    """Record one evaluated model in the module-level result lists.

    Appends the label to ``names``, the ``(name, model)`` pair to ``models``,
    and the score, standard deviation and elapsed time to ``scores``,
    ``stddevs`` and ``times`` respectively.
    """
    entries = (
        (names, name),
        (models, (name, model)),
        (scores, score),
        (stddevs, stddev),
        (times, elapsed),
    )
    for bucket, value in entries:
        bucket.append(value)

## Avaliação e ajuste fino de cada modelo preditivo

- Referência da API do scikit-learn: https://scikit-learn.org/stable/modules/classes.html

### Generalized Linear Models

In [72]:
# logistic regression classifier
model = LogisticRegression(
    random_state=42,
    solver='newton-cg',
    C=0.1,
    multi_class='auto',
    max_iter=500,
)

# candidate grid for the optional hyperparameter search
params = {
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'C': np.logspace(-3, 3, 7),
}
# fine_tune_model(model, params, X_train, y_train)

# cross-validate and record the result under the label 'LR'
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info(name='LR', model=model, score=score, stddev=stddev, elapsed=elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False) 
Score: 79.12 (+/- 3.31) [ 1217 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    1.2s finished


### Decision Trees

In [73]:
# single decision tree classifier
model = DecisionTreeClassifier(
    random_state=42,
    criterion='entropy',
    max_depth=6,
    min_samples_split=0.25,
)

# candidate grid for the optional hyperparameter search
params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 6, 8, 10, 12, 14],
    'min_samples_split': [0.25, 0.5, 0.75, 1.0],
}
# fine_tune_model(model, params, X_train, y_train)

# cross-validate and record the result under the label 'DT'
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info(name='DT', model=model, score=score, stddev=stddev, elapsed=elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=6,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=0.25,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best') 
Score: 79.02 (+/- 3.35) [  339 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished


### Discriminant Analysis

In [74]:
# linear discriminant analysis with the least-squares solver
model = LinearDiscriminantAnalysis(solver='lsqr')

# candidate grid for the optional hyperparameter search
# (the 'eigen' solver is left out of the grid)
params = {
    'solver': ['svd', 'lsqr'],
}
# fine_tune_model(model, params, X_train, y_train)

# cross-validate and record the result under the label 'LDA'
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info(name='LDA', model=model, score=score, stddev=stddev, elapsed=elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='lsqr', store_covariance=False, tol=0.0001) 
Score: 81.03 (+/- 3.30) [  898 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.9s finished


### Naïve Bayes

In [75]:
# Gaussian naive Bayes classifier
model = GaussianNB(priors=None, var_smoothing=1e-8)

# candidate grid for the optional hyperparameter search
params = {
    'priors': [None],
    'var_smoothing': [1e-8, 1e-7, 1e-6, 1e-5, 1e-4],
}
# fine_tune_model(model, params, X_train, y_train)

# cross-validate and record the result under the label 'NB'
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info(name='NB', model=model, score=score, stddev=stddev, elapsed=elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


GaussianNB(priors=None, var_smoothing=1e-08) 
Score: 78.79 (+/- 3.11) [  319 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.3s finished


### Nearest Neighbors

In [76]:
# k-nearest-neighbours classifier with uniformly weighted votes
model = KNeighborsClassifier(n_neighbors=11, weights='uniform')

# candidate grid for the optional hyperparameter search
params = {
    'n_neighbors': [1, 3, 5, 7, 9, 11, 13],
    'weights': ['uniform', 'distance'],
}
# fine_tune_model(model, params, X_train, y_train)

# cross-validate and record the result under the label 'KNN'
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info(name='KNN', model=model, score=score, stddev=stddev, elapsed=elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=11, p=2,
                     weights='uniform') 
Score: 82.27 (+/- 3.07) [  766 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    0.8s finished


### Support Vector Machines

In [77]:
# support vector classifier with an RBF kernel
model = SVC(
    random_state=42,
    C=10,
    gamma=0.1,
    kernel='rbf',
)

# candidate grid for the optional hyperparameter search
params = {
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
    'kernel': ['linear', 'rbf'],
}
# fine_tune_model(model, params, X_train, y_train)

# cross-validate and record the result under the label 'SVM'
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info(name='SVM', model=model, score=score, stddev=stddev, elapsed=elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False) 
Score: 82.05 (+/- 3.46) [ 1994 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    2.0s finished


### Neural network models

In [78]:
# multi-layer perceptron with one hidden layer of 100 logistic units
model = MLPClassifier(
    random_state=42,
    solver='lbfgs',
    alpha=1,
    hidden_layer_sizes=(100,),
    activation='logistic',
)

# candidate grid for the optional hyperparameter search
params = {
    'alpha': [1, 0.1, 0.01, 0.001, 0.0001, 0],
    'hidden_layer_sizes': [(100,), (50,), (50, 2), (5, 5, 2), (10, 5, 2)],
    'activation': ['identity', 'logistic', 'tanh', 'relu'],
    'solver': ['lbfgs', 'sgd', 'adam'],
}
# fine_tune_model(model, params, X_train, y_train)

# cross-validate and record the result under the label 'MLP'
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info(name='MLP', model=model, score=score, stddev=stddev, elapsed=elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


MLPClassifier(activation='logistic', alpha=1, batch_size='auto', beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(100,), learning_rate='constant',
              learning_rate_init=0.001, max_iter=200, momentum=0.9,
              n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
              random_state=42, shuffle=True, solver='lbfgs', tol=0.0001,
              validation_fraction=0.1, verbose=False, warm_start=False) 
Score: 81.71 (+/- 2.83) [69285 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:  1.2min finished


### Ensemble Methods

In [79]:
# AdaBoost ensemble built on top of a decision tree base estimator
base_tree = DecisionTreeClassifier(random_state=42)
model = AdaBoostClassifier(base_tree, random_state=42, n_estimators=50)

# candidate grid for the optional hyperparameter search
params = {
    'n_estimators': [10, 25, 50, 100],
}
# fine_tune_model(model, params, X_train, y_train)

# cross-validate and record the result under the label 'ABDT'
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info(name='ABDT', model=model, score=score, stddev=stddev, elapsed=elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(class_weight=None,
                                                         criterion='gini',
                                                         max_depth=None,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort=False,
                                                         random_state=42,
                            

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   11.6s finished


In [80]:
from sklearn.ensemble import BaggingClassifier

# bagging ensemble; each estimator sees 25% of the rows and 80% of the features
model = BaggingClassifier(
    random_state=42,
    n_estimators=100,
    max_samples=0.25,
    max_features=0.8,
)

# candidate grid for the optional hyperparameter search
params = {
    'n_estimators': [10, 50, 100, 500],
    'max_samples': [0.25, 0.5, 0.75, 1.0],
    'max_features': [0.7, 0.8, 0.9, 1.0],
}
# fine_tune_model(model, params, X_train, y_train)

# cross-validate and record the result under the label 'BC'
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info(name='BC', model=model, score=score, stddev=stddev, elapsed=elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


BaggingClassifier(base_estimator=None, bootstrap=True, bootstrap_features=False,
                  max_features=0.8, max_samples=0.25, n_estimators=100,
                  n_jobs=None, oob_score=False, random_state=42, verbose=0,
                  warm_start=False) 
Score: 83.84 (+/- 4.29) [ 9514 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    9.5s finished


In [81]:
# extremely randomized trees ensemble
model = ExtraTreesClassifier(
    random_state=42,
    n_estimators=100,
    max_depth=7,
    min_samples_split=0.25,
    max_features='auto',
)

# candidate grid for the optional hyperparameter search
params = {
    'n_estimators': [10, 50, 100, 500],
    'max_depth': [None, 3, 7, 11],
    'min_samples_split': [0.25, 0.5],
    'max_features': ['auto', 0.7, 0.85, 1.0],
}
# fine_tune_model(model, params, X_train, y_train)

# cross-validate and record the result under the label 'ET'
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info(name='ET', model=model, score=score, stddev=stddev, elapsed=elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
                     max_depth=7, max_features='auto', max_leaf_nodes=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=0.25,
                     min_weight_fraction_leaf=0.0, n_estimators=100,
                     n_jobs=None, oob_score=False, random_state=42, verbose=0,
                     warm_start=False) 
Score: 79.91 (+/- 3.24) [ 7414 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    7.4s finished


In [82]:
# gradient boosting ensemble with row subsampling
model = GradientBoostingClassifier(
    random_state=42,
    n_estimators=100,
    max_features=0.75,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.6,
)

# candidate grid for the optional hyperparameter search
params = {
    'n_estimators': [100, 250, 500],
    'max_features': [0.75, 0.85, 1.0],
    'max_depth': [4, 6, 8, 10],
    'learning_rate': [0.05, 0.1, 0.15],
    'subsample': [0.4, 0.6, 0.8],
}
# fine_tune_model(model, params, X_train, y_train)

# cross-validate and record the result under the label 'GB'
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info(name='GB', model=model, score=score, stddev=stddev, elapsed=elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


GradientBoostingClassifier(criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=4,
                           max_features=0.75, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='auto',
                           random_state=42, subsample=0.6, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False) 
Score: 83.06 (+/- 3.44) [10184 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   10.2s finished


In [83]:
# random forest ensemble with depth-limited trees
model = RandomForestClassifier(
    random_state=42,
    n_estimators=100,
    max_features='auto',
    max_depth=5,
)

# candidate grid for the optional hyperparameter search
params = {
    'n_estimators': [10, 50, 100, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth': [None, 3, 5, 7, 9, 11, 13],
}
# fine_tune_model(model, params, X_train, y_train)

# cross-validate and record the result under the label 'RF'
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info(name='RF', model=model, score=score, stddev=stddev, elapsed=elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False) 
Score: 82.94 (+/- 3.58) [ 8967 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    9.0s finished


### Outros algoritmos

#### XGBoost

- https://xgboost.readthedocs.io/en/latest/python/python_api.html#module-xgboost.sklearn

In [84]:
# XGBoost classifier with explicit tree depth and number of boosting rounds.
model = XGBClassifier(
    n_estimators=100,
    max_depth=3,
)

# Reference note: constructor parameters of XGBClassifier. The objective listed
# here is the one the instantiated classifier reports ('binary:logistic');
# 'reg:squarederror' belongs to the regressor.
# max_depth=3, learning_rate=0.1, n_estimators=100, verbosity=1, silent=None, objective='binary:logistic',
# booster='gbtree', n_jobs=1, nthread=None, gamma=0, min_child_weight=1, max_delta_step=0, subsample=1,
# colsample_bytree=1, colsample_bylevel=1, colsample_bynode=1, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
# base_score=0.5, random_state=0, seed=None, missing=None, importance_type='gain'

# Search space for the optional tuning step below.
params = {
    'max_depth': [3, 5, 7, 9],
    'n_estimators': [50, 75, 100, 200],
}
# Uncomment to tune the hyper-parameters over `params`:
#fine_tune_model(model, params, X_train, y_train)

# Evaluate the model (score, standard deviation, elapsed time) and register
# the outcome under the short name 'XGB'.
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('XGB', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1) 
Score: 83.39 (+/- 3.77) [ 3613 ms]


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:    3.6s finished


#### LightGBM

- https://github.com/microsoft/LightGBM
- https://lightgbm.readthedocs.io/en/latest/
- https://medium.com/@pushkarmandot/https-medium-com-pushkarmandot-what-is-lightgbm-how-to-implement-it-how-to-fine-tune-the-parameters-60347819b7fc

### Ensemble Learning Model

- https://towardsdatascience.com/automate-stacking-in-python-fc3e7834772e
- https://github.com/vecxoz/vecstack

In [85]:
# Members of the voting ensemble, each configured like its stand-alone cell.
# The random forest uses 'sqrt' instead of the deprecated alias 'auto'
# (identical meaning for classifiers; 'auto' is rejected by recent scikit-learn).
estimators = [
    ('RF', RandomForestClassifier(random_state=42, n_estimators=100, max_features='sqrt', max_depth=5)),
    ('BC', BaggingClassifier(random_state=42, n_estimators=100, max_samples=0.25, max_features=0.8)),
    ('GB', GradientBoostingClassifier(random_state=42, max_depth=4, max_features=0.75,
                                   n_estimators=100, learning_rate=0.1, subsample=0.6)),
    # XGB is currently left out; enabling it also requires a fourth weight below.
    # ('XGB', XGBClassifier(max_depth=3, n_estimators=100)),
]
# Weights follow the order of `estimators`: RF counts twice, BC and GB once.
model = VotingClassifier(estimators, n_jobs=-1, weights=(2,1,1))

# Reference note: VotingClassifier parameters -> estimators, weights=None, n_jobs=None

# Candidate weightings for the optional tuning step below (one weight per estimator).
params = dict(
    weights=[(1,1,1), (2,1,1), (3,1,1), (3,2,1), (2,2,1), (2,1,2), (5,4,3), (1,2,1), (1,1,2), ]
)
# Uncomment to tune the weights over `params`:
#fine_tune_model(model, params, X_train, y_train)

# Evaluate the ensemble (score, standard deviation, elapsed time) and register
# the outcome under the short name 'VC'.
score, stddev, elapsed = evaluate_classification_model(model, X_train, y_train)
add_model_info('VC', model, score, stddev, elapsed)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


VotingClassifier(estimators=[('RF',
                              RandomForestClassifier(bootstrap=True,
                                                     class_weight=None,
                                                     criterion='gini',
                                                     max_depth=5,
                                                     max_features='auto',
                                                     max_leaf_nodes=None,
                                                     min_impurity_decrease=0.0,
                                                     min_impurity_split=None,
                                                     min_samples_leaf=1,
                                                     min_samples_split=2,
                                                     min_weight_fraction_leaf=0.0,
                                                     n_estimators=100,
                                                     n_jobs=None,
             

[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed:   39.7s finished


## Avaliar importância dos atributos no modelo

In [86]:
# Fit a random forest on the full training set to read its feature importances.
# 'sqrt' replaces the deprecated alias 'auto' (same meaning for classifiers;
# 'auto' is rejected by recent scikit-learn releases).
model = RandomForestClassifier(random_state=42, max_features='sqrt', n_estimators=100)
model.fit(X_train, y_train)

# One row per feature, importance rounded to 3 decimals, most important first.
importances = (
    pd.DataFrame({'feature': X_train.columns,
                  'importance': np.round(model.feature_importances_, 3)})
    .sort_values('importance', ascending=False)
    .set_index('feature')
)
importances.head(20)

Unnamed: 0_level_0,importance
feature,Unnamed: 1_level_1
Age,0.224
Fare,0.204
Title,0.193
Sex,0.127
Pclass,0.064
Deck,0.059
FSize,0.045
SibSp,0.029
Embarked,0.028
Parch,0.017


In [87]:
# Bar chart of the importances. The single-series legend is dropped in favour
# of a y-axis label, and the trailing ';' keeps the cell from echoing a repr.
ax = importances.plot.bar(legend=False, title='Feature importances (Random Forest)')
ax.set_ylabel('importance');

<matplotlib.axes._subplots.AxesSubplot at 0xa153e42c>

## Comparação final entre os algoritmos

In [88]:
# Gather the per-algorithm metrics into one table and show it ranked by score.
results = pd.DataFrame({
    'Algorithm': names,
    'Score': scores,
    'Std Dev': stddevs,
    'Time (ms)': times,
})
results.sort_values('Score', ascending=False)

Unnamed: 0,Algorithm,Score,Std Dev,Time (ms)
8,BC,83.8414,4.2852,9514
12,XGB,83.392,3.7737,3613
10,GB,83.0562,3.4408,10184
13,VC,83.0537,4.1832,39746
11,RF,82.9401,3.583,8967
4,KNN,82.2697,3.0721,766
5,SVM,82.0462,3.459,1994
6,MLP,81.7079,2.8304,69285
2,LDA,81.0337,3.3045,898
7,ABDT,80.7004,3.656,11591


## Gerar arquivos com resultados

In [89]:
# criar diretório para os arquivos de envio
!test -d submissions || mkdir submissions

In [97]:
prefixo_arquivo = 'submissions/titanic-submission'
sufixo_arquivo = '06set'

for name, model in models:
    print(model, '\n')

    # fit on the training data, then predict the test set
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    # the submission stores the predicted labels as integers
    y_pred_int = y_pred.astype(int)

    # submission table: the X_test index becomes the 'PassengerId' index
    submission = pd.DataFrame({
        'PassengerId': X_test.index,
        'Survived': y_pred_int,
    }).set_index('PassengerId')

    # one CSV per model: <prefix>-p-<suffix>-<lower-cased model name>.csv
    filename = '%s-p-%s-%s.csv' % (prefixo_arquivo, sufixo_arquivo, name.lower())
    submission.to_csv(filename)

LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=500,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='newton-cg', tol=0.0001, verbose=0,
                   warm_start=False) 

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=6,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=0.25,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=42, splitter='best') 

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
                           solver='lsqr', store_covariance=False, tol=0.0001) 

GaussianNB(priors=None, var_smoothing=1e-08) 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric=

In [98]:
# preview the first lines of every generated submission file
!head submissions/titanic-*.csv

==> submissions/titanic-submission-p-06set-abdt.csv <==
PassengerId,Survived
892,0
893,0
894,0
895,1
896,0
897,0
898,0
899,0
900,1

==> submissions/titanic-submission-p-06set-bc.csv <==
PassengerId,Survived
892,0
893,0
894,0
895,0
896,1
897,0
898,1
899,0
900,1

==> submissions/titanic-submission-p-06set-dt.csv <==
PassengerId,Survived
892,0
893,0
894,0
895,0
896,0
897,0
898,0
899,0
900,0

==> submissions/titanic-submission-p-06set-et.csv <==
PassengerId,Survived
892,0
893,1
894,0
895,0
896,1
897,0
898,1
899,0
900,1

==> submissions/titanic-submission-p-06set-gb.csv <==
PassengerId,Survived
892,0
893,0
894,0
895,0
896,1
897,0
898,0
899,0
900,1

==> submissions/titanic-submission-p-06set-knn.csv <==
PassengerId,Survived
892,0
893,0
894,0
895,0
896,1
897,0
898,1
899,0
900,1

==> submissions/titanic-submission-p-06set-lda.csv <==
PassengerId,Survived
892,0
893,1
894,0
895,0
896,1
897,0
898,1
899,0
900,1

==