# Preparando os dados

In [3]:
import pandas as pd
import numpy as np
import sklearn
import matplotlib as mpl
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import cross_val_score
import random

In [4]:
from data_engineering import LimpaTudo

In [11]:
df = pd.read_csv("/content/drive/MyDrive/Arquivos Colab/Titanic/train.csv")
treinamento = LimpaTudo(df)
treinamento = treinamento.run()

In [12]:
treinamento = treinamento.drop('PassengerId', axis=1)
treinamento.head(1)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,family,Master,Miss,Mr,Mrs,OTHER,C,Q,S,A5,CA,NA,OTHER_T,PC,SOTONOQ
0,0,3,1,22.0,1,0,7.25,73,0,0,1,0,0,0,0,1,1,0,0,0,0,0


In [15]:
x = treinamento.iloc[:, 1:22]
y = treinamento['Survived']

In [16]:
x.head(1)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,family,Master,Miss,Mr,Mrs,OTHER,C,Q,S,A5,CA,NA,OTHER_T,PC,SOTONOQ
0,3,1,22.0,1,0,7.25,73,0,0,1,0,0,0,0,1,1,0,0,0,0,0


# Gradient Boosting Machine

In [17]:
from sklearn.ensemble import GradientBoostingClassifier
gradclf = GradientBoostingClassifier(random_state=10)

In [21]:
def display_metrics(modelo):
  print('Accuracy: ', cross_val_score(modelo, x, y, cv=5, scoring="accuracy").mean())
  print('Precision: ', cross_val_score(modelo, x, y, cv=5, scoring="precision").mean())
  print("Recall:", cross_val_score(modelo, x, y, cv=5, scoring="recall").mean())
  print('ROC AUC: ',cross_val_score(modelo, x, y, cv=5, scoring="roc_auc").mean())
  print('F1: ',cross_val_score(modelo, x, y, cv=5, scoring="f1").mean())

In [22]:
display_metrics(gradclf)

Accuracy:  0.8260247316552635
Precision:  0.8069491749662324
Recall: 0.7191389599317988
ROC AUC:  0.8707733355517208
F1:  0.7583523022401062


## Otimizando hiperparametros

In [26]:
from sklearn.model_selection import GridSearchCV
param_grid = {'n_estimators': range(80,150,10), 'learning_rate': [0.03, 0.05, 0.08, 0.1]}
grid_search = GridSearchCV(GradientBoostingClassifier(), param_grid, cv=5, scoring='accuracy')
grid_search.fit(x, y)

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(),
             param_grid={'learning_rate': [0.03, 0.05, 0.08, 0.1],
                         'n_estimators': range(80, 150, 10)},
             scoring='accuracy')

In [27]:
print(grid_search.best_score_)
print(grid_search.best_params_)

0.8350072186303434
{'learning_rate': 0.08, 'n_estimators': 130}


In [28]:
param_grid2 = {'min_samples_split':range(0,20,1), 'max_depth':  range(5,11)}
grid_search2 = GridSearchCV(GradientBoostingClassifier(n_estimators= 130, learning_rate=0.8),\
                            param_grid2, cv=5, scoring='accuracy')
grid_search2.fit(x, y)

60 fits failed out of a total of 600.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
30 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 596, in fit
    monitor,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 672, in _fit_stages
    X_csr,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 246, in _fit_stage
    tree.fit(X, residual, sample_weight=sample_weight, check_input=False)
  File "/usr/local/lib/python3.7/dis

GridSearchCV(cv=5,
             estimator=GradientBoostingClassifier(learning_rate=0.8,
                                                  n_estimators=130),
             param_grid={'max_depth': range(5, 11),
                         'min_samples_split': range(0, 20)},
             scoring='accuracy')

In [29]:
print(grid_search2.best_score_)
print(grid_search2.best_params_)

0.8316489862532169
{'max_depth': 5, 'min_samples_split': 7}


In [32]:
param_grid3 = {'min_samples_leaf':range(0,11,1)}
grid_search3 = GridSearchCV(GradientBoostingClassifier(n_estimators= 130, learning_rate=0.08, max_depth=5, min_samples_split= 7),\
                            param_grid3, cv=5, scoring='accuracy')
grid_search3.fit(x, y)

5 fits failed out of a total of 55.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 596, in fit
    monitor,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 672, in _fit_stages
    X_csr,
  File "/usr/local/lib/python3.7/dist-packages/sklearn/ensemble/_gb.py", line 246, in _fit_stage
    tree.fit(X, residual, sample_weight=sample_weight, check_input=False)
  File "/usr/local/lib/python3.7/dist-p

GridSearchCV(cv=5,
             estimator=GradientBoostingClassifier(learning_rate=0.08,
                                                  max_depth=5,
                                                  min_samples_split=7,
                                                  n_estimators=130),
             param_grid={'min_samples_leaf': range(0, 11)}, scoring='accuracy')

In [33]:
print(grid_search3.best_score_)
print(grid_search3.best_params_)

0.830525390747599
{'min_samples_leaf': 8}


## GBM otimizado:

In [34]:
gbm_o = GradientBoostingClassifier(n_estimators= 130, learning_rate=0.08, max_depth=5, min_samples_split= 7)

In [35]:
display_metrics(gbm_o)

Accuracy:  0.8271671583704727
Precision:  0.8093635210044747
Recall: 0.722080136402387
ROC AUC:  0.863316732756126
F1:  0.7605225220625155


# Random Forest Classifier

In [23]:
from sklearn.ensemble import RandomForestClassifier
rdf = RandomForestClassifier(random_state=10)

In [25]:
display_metrics(rdf)

Accuracy:  0.8204130311970372
Precision:  0.79866697357369
Recall: 0.7161125319693095
ROC AUC:  0.8656816066528871
F1:  0.7525321025336164


In [36]:
param_grid4 = {'n_estimators': [200, 500, 1000, 2000], 'max_depth' : [2, 4, 5, 10],\
              'min_samples_split' : [3, 4, 5] }
grid_search4 = GridSearchCV(RandomForestClassifier(), param_grid4, cv=5, scoring='accuracy')
grid_search4.fit(x, y)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [2, 4, 5, 10],
                         'min_samples_split': [3, 4, 5],
                         'n_estimators': [200, 500, 1000, 2000]},
             scoring='accuracy')

In [37]:
print(grid_search4.best_score_)
print(grid_search4.best_params_)

0.8327600276191074
{'max_depth': 10, 'min_samples_split': 5, 'n_estimators': 200}


## Random Forest otimizado

In [39]:
rdf_o = RandomForestClassifier(max_depth= 10, min_samples_split= 5, n_estimators= 200, random_state=10)

In [40]:
display_metrics(rdf_o)

Accuracy:  0.8249011361496453
Precision:  0.8030063985253595
Recall: 0.7219522591645353
ROC AUC:  0.8746931906170744
F1:  0.7573402351748407


# Submission

In [56]:
df_teste = pd.read_csv('/content/drive/MyDrive/Arquivos Colab/Titanic/test.csv', index_col= 'PassengerId')

In [57]:
teste = LimpaTudo(df_teste)
teste = teste.run()

In [58]:
x2 = x.drop(['SOTONOQ', 'A5'], axis=1)

Gerando as previsões:

In [59]:
gbm_o.fit(x2,y)
previsao_gbm = gbm_o.predict(teste)
rdf_o.fit(x2,y)
previsao_rdf = rdf_o.predict(teste)

In [60]:
submission_gbm = teste.filter('PassengerId')
submission_rdf = teste.filter('PassengerId')
submission_gbm['Survived'] = pd.Series(previsao_gbm).values
submission_rdf['Survived'] = pd.Series(previsao_rdf).values

In [62]:
submission_gbm.to_csv('submission_gbm.csv', index=True)
submission_rdf.to_csv('submission_rdf.csv', index=True)

Competition score:
- GBM: 73,8%
- RD Forest: 75,8%
