## Importação dos pacotes

In [33]:
# importar pacotes necessários
import numpy as np
import pandas as pd

In [34]:
# importar os pacotes necessários para os algoritmos de classificação
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import Perceptron
from sklearn.linear_model import Ridge
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

## Carga dos dados de entrada

In [168]:
# carregar arquivo de dados de treino
train_data = pd.read_csv('titanic-train.csv', index_col='person')

In [169]:
# carregar arquivo de dados de teste
test_data = pd.read_csv('titanic-test.csv', index_col='person')

## Transformações nos dados

In [170]:
train_data.head()

Unnamed: 0_level_0,pclass,survived,name,sex,age,sibsp,parch,ticket,fare,cabin,embarked,home_destination
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
416,2,no,"Gaskell, Mr. Alfred",male,16.0,0,0,239865,26.0,,S,"Liverpool / Montreal, PQ"
194,1,no,"Maguire, Mr. John Edward",male,30.0,0,0,110469,26.0,C106,S,"Brockton, MA"
600,3,no,"Abbing, Mr. Anthony",male,42.0,0,0,C.A. 5547,7.55,,S,
1112,3,no,"Peacock, Miss. Treasteall",female,3.0,1,1,SOTON/O.Q. 3101315,13.775,,S,
878,3,no,"Ilmakangas, Miss. Pieta Sofia",female,25.0,1,0,STON/O2. 3101271,7.925,,S,


In [171]:
for data in [train_data, test_data]:
    print(data.shape)
    data.drop(['name', 'ticket', 'cabin', 'home_destination'], axis=1, inplace=True)
    
    data['age'].fillna(round(data.age.mean()), inplace=True)
    data['embarked'].fillna(data.embarked.mode()[0], inplace=True)
    data.fillna('0', inplace=True)
    
    #data.dropna(how='any', inplace=True)
    print(data.shape)

(872, 12)
(872, 8)
(437, 11)
(437, 7)


In [173]:
# gerar "one hot encoding" em atributos categóricos
cols = ['pclass', 'sex', 'embarked']
train_data = pd.get_dummies(train_data, columns=cols)
test_data = pd.get_dummies(test_data, columns=cols)

In [174]:
# realizar normalização nos dados numéricos contínuos
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
for data in [train_data, test_data]:
    data.loc[:,'age':'fare'] = scaler.fit_transform(data.loc[:,'age':'fare'])

  return self.partial_fit(X, y)
  return self.partial_fit(X, y)


In [175]:
train_data.head()

Unnamed: 0_level_0,survived,age,sibsp,parch,fare,pclass_1,pclass_2,pclass_3,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
416,no,0.198296,0.0,0.0,0.050749,0,1,0,0,1,0,0,1
194,no,0.373669,0.0,0.0,0.050749,1,0,0,0,1,0,0,1
600,no,0.523988,0.0,0.0,0.014737,0,0,1,0,1,0,0,1
1112,no,0.03545,0.125,0.111111,0.026887,0,0,1,1,0,0,0,1
878,no,0.311036,0.125,0.0,0.015469,0,0,1,1,0,0,0,1


## Seleção dos dados de treino e teste

In [176]:
# definir dados de treino

X_train = train_data.drop(['survived'], axis=1) # tudo, exceto a coluna alvo
y_train = train_data['survived'] # apenas a coluna alvo

print('Forma dos dados de treino:', X_train.shape, y_train.shape)

Forma dos dados de treino: (872, 12) (872,)


In [177]:
# definir dados de teste

X_test = test_data # tudo, já que não possui a coluna alvo

print('Forma dos dados de teste:', X_test.shape)

Forma dos dados de teste: (437, 12)


In [178]:
X_train.head()

Unnamed: 0_level_0,age,sibsp,parch,fare,pclass_1,pclass_2,pclass_3,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
416,0.198296,0.0,0.0,0.050749,0,1,0,0,1,0,0,1
194,0.373669,0.0,0.0,0.050749,1,0,0,0,1,0,0,1
600,0.523988,0.0,0.0,0.014737,0,0,1,0,1,0,0,1
1112,0.03545,0.125,0.111111,0.026887,0,0,1,1,0,0,0,1
878,0.311036,0.125,0.0,0.015469,0,0,1,1,0,0,0,1


In [179]:
X_test.head()

Unnamed: 0_level_0,age,sibsp,parch,fare,pclass_1,pclass_2,pclass_3,sex_female,sex_male,embarked_C,embarked_Q,embarked_S
person,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
76,0.510453,0.125,0.111111,0.162314,1,0,0,1,0,1,0,0
87,0.35168,0.0,0.0,0.059532,1,0,0,0,1,0,0,1
376,0.311987,0.0,0.0,0.020495,0,1,0,0,1,0,0,1
645,0.298756,0.0,0.0,0.015216,0,0,1,0,1,0,0,1
976,0.404604,0.0,0.0,0.015379,0,0,1,0,1,0,0,1


## Treinamento dos modelos e geração dos resultados 

In [180]:
models = []
models.append(('LR', LogisticRegression(random_state=42, solver='lbfgs', multi_class='auto', max_iter=500, C=100)))
models.append(('DT', DecisionTreeClassifier(random_state=42, criterion='gini', max_depth=11)))
models.append(('KNN', KNeighborsClassifier(n_neighbors=1)))
models.append(('SVM', SVC(random_state=42, C=10, gamma=0.1, kernel='rbf')))
models.append(('RF', RandomForestClassifier(random_state=42, max_features='auto', n_estimators=10)))
models.append(('SGD', SGDClassifier(random_state=42, max_iter=100, tol=0.1)))
models.append(('NN', Perceptron(random_state=42, max_iter=100, tol=0.01)))
models.append(('NB', GaussianNB(priors=None, var_smoothing=1e-08)))
models.append(('LSVM', LinearSVC(random_state=42, max_iter=1000, C=10)))
models.append(('ABDT', AdaBoostClassifier(DecisionTreeClassifier(random_state=42), n_estimators=5)))
models.append(('GB', GradientBoostingClassifier(random_state=42, max_depth=3)))
models.append(('MLP', MLPClassifier(random_state=42, solver='lbfgs', alpha=0.1, hidden_layer_sizes=(15,))))
models.append(('LDA', LinearDiscriminantAnalysis(solver='svd')))

In [181]:
!mkdir submissions

mkdir: cannot create directory ‘submissions’: File exists


In [182]:
sufixo_arquivo = '03jul'

for name, model in models:
    print(model, '\n')
    
    # treinar o modelo
    model.fit(X_train, y_train)
    
    # executar previsão usando o modelo
    y_pred = model.predict(X_test)
    
    # gerar dados de envio (submissão)
    submission = pd.DataFrame({
      'person': X_test.index,
      'survived': y_pred
    })
    submission.set_index('person', inplace=True)

    # gerar arquivo CSV para o envio
    filename = 'submissions/titanic-submission-p-%s-%s.csv' % (sufixo_arquivo, name.lower())
    submission.to_csv(filename)

LogisticRegression(C=100, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=500, multi_class='auto',
          n_jobs=None, penalty='l2', random_state=42, solver='lbfgs',
          tol=0.0001, verbose=0, warm_start=False) 

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=11,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best') 

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=1, p=2,
           weights='uniform') 

SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=42, shrinking=True,
  tol=0.001, verbose=False)



AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=42,
            splitter='best'),
          learning_rate=1.0, n_estimators=5, random_state=None) 

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=42,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbos



In [183]:
# verificar conteúdo dos arquivos gerados
!head submissions/titanic-submission-p-*.csv

==> submissions/titanic-submission-p-03jul-abdt.csv <==
person,survived
76,yes
87,yes
376,no
645,no
976,no
584,yes
769,no
628,no
1036,no

==> submissions/titanic-submission-p-03jul-dt.csv <==
person,survived
76,yes
87,no
376,no
645,no
976,no
584,yes
769,no
628,no
1036,no

==> submissions/titanic-submission-p-03jul-gb.csv <==
person,survived
76,yes
87,no
376,no
645,no
976,no
584,yes
769,no
628,no
1036,yes

==> submissions/titanic-submission-p-03jul-knn.csv <==
person,survived
76,yes
87,yes
376,no
645,no
976,no
584,yes
769,no
628,no
1036,no

==> submissions/titanic-submission-p-03jul-lda.csv <==
person,survived
76,yes
87,no
376,no
645,no
976,no
584,yes
769,no
628,yes
1036,yes

==> submissions/titanic-submission-p-03jul-lr.csv <==
person,survived
76,yes
87,no
376,no
645,no
976,no
584,yes
769,no
628,no
1036,yes

==> submissions/titanic-submission-p-03jul-lsvm.csv <==
person,survived
76,yes
87,no
376,no
645,no
976