@author Joubert Alexandrino de Souza
@version 2021-02-26

-----------------------------------------------
# Submeter uma predição no Kaggle
-----------------------------------------------

### Para fazer esta atividade você precisará assistir o vídeo https://youtu.be/j8-dT-OoYFs.

### Depois você precisará submeter uma predição no Kaggle e salvar a página de submissão em PDF.

### Salve o PDF no seu GitHub e coloque o link para entregar a atividade.

### A sua nota será o score multiplicado por 100.

In [83]:
# Importa as bibliotecas necessárias
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import FeatureUnion
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

%matplotlib inline

## Realiza a leitura e visualização dos dados

In [84]:
dados = pd.read_csv("https://raw.githubusercontent.com/joubert-alexandrino/reconhecimento-padroes/main/titanic-train.csv")
dados.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [85]:
# Tamanho do dataset
dados.shape

(891, 12)

In [86]:
# Verifica os tipo de dados do dataset
dados.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [87]:
# Verifica se existem dados nulos
dados.isnull().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [88]:
# Verifica se existem dados NA
dados.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [89]:
# Estatísticas descritivas do dataset
dados.describe().round(2)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.38,2.31,29.7,0.52,0.38,32.2
std,257.35,0.49,0.84,14.53,1.1,0.81,49.69
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.12,0.0,0.0,7.91
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.45
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.33


In [90]:
# Matriz de correlação
dados.corr().round(2)

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
PassengerId,1.0,-0.01,-0.04,0.04,-0.06,-0.0,0.01
Survived,-0.01,1.0,-0.34,-0.08,-0.04,0.08,0.26
Pclass,-0.04,-0.34,1.0,-0.37,0.08,0.02,-0.55
Age,0.04,-0.08,-0.37,1.0,-0.31,-0.19,0.1
SibSp,-0.06,-0.04,0.08,-0.31,1.0,0.41,0.16
Parch,-0.0,0.08,0.02,-0.19,0.41,1.0,0.22
Fare,0.01,0.26,-0.55,0.1,0.16,0.22,1.0


## Inicia o processamento dos dados

In [91]:
# Separa o conjunto de características do target
X, y = dados.drop(['Survived'], axis=1, inplace=False), dados.Survived
X.shape, y.shape

((891, 11), (891,))

In [92]:
# Configura as classes para serem usadas nos Pipelines

class AtributosDesejados(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        self.colunasIndesejadas = ['PassengerId', 'Name', 'Ticket', 'Cabin']
        return self

    def transform(self, X, y=None):
        return X.drop(self.colunasIndesejadas, axis=1)

class AtributosNumericos(BaseEstimator, TransformerMixin):
    
    def fit(self, X, y=None):
        self.colunasNumericas = X.select_dtypes(include='number').columns
        return self
    
    def transform(self, X, y=None):
        return X[self.colunasNumericas]

class AtributosCategoricos(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        self.colunasCategoricas = X.select_dtypes(include='object').columns
        return self

    def transform(self, X, y=None):
        return X[self.colunasCategoricas]

In [93]:
# Conigura os Pipelines

# Dados Numéricos
pipenum = Pipeline([
    ('atributos_numericos', AtributosNumericos()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Dados Categoricos
pipecat = Pipeline([
    ('atributos_categoricos', AtributosCategoricos()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

# Unir as características
unecaracteristicas = FeatureUnion([
    ('pipenum', pipenum),
    ('pipecat',pipecat)
])

# Pré-processamento
preproc = Pipeline([
    ('atributos_desejados', AtributosDesejados()),
    ('unecaracteristicas', unecaracteristicas)
])

# Total
pipetotal = Pipeline([
    ('preproc', preproc),    
    ('knn', KNeighborsClassifier())
])

In [94]:
# Realiza a validação cruzada
parametros = {
    'knn__n_neighbors':[3,5,7],
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1,2]
}

modelo = GridSearchCV(pipetotal, param_grid=parametros, return_train_score = True)

scores = cross_validate(modelo, X, y, return_estimator=True)

scores, np.mean(scores['test_score'])

({'fit_time': array([3.06972098, 3.00551462, 3.00085258, 3.67272282, 4.23406959]),
  'score_time': array([0.01322508, 0.01328659, 0.0122447 , 0.01979923, 0.01560473]),
  'estimator': (GridSearchCV(estimator=Pipeline(steps=[('preproc',
                                           Pipeline(steps=[('atributos_desejados',
                                                            AtributosDesejados()),
                                                           ('unecaracteristicas',
                                                            FeatureUnion(transformer_list=[('pipenum',
                                                                                            Pipeline(steps=[('atributos_numericos',
                                                                                                             AtributosNumericos()),
                                                                                                            ('imputer',
                              

In [95]:
# Importa os dados de teste
dados_teste = pd.read_csv("https://raw.githubusercontent.com/joubert-alexandrino/reconhecimento-padroes/main/titanic-test.csv")
dados_teste.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [96]:
dados_teste.shape

(418, 11)

In [97]:
# Realiza a predição
melhor_classificador = scores['estimator'][np.argmax(scores['test_score'])].best_estimator_
ypred = melhor_classificador.predict(dados_teste)
ypred.shape

(418,)

In [98]:
# Exibe os resultados
ypred

array([0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [99]:
dados_teste['PassengerId']

0       892
1       893
2       894
3       895
4       896
       ... 
413    1305
414    1306
415    1307
416    1308
417    1309
Name: PassengerId, Length: 418, dtype: int64

In [100]:
# Salva  os resultados
resultados = pd.DataFrame(data=dados_teste['PassengerId'], columns=['PassengerId','Survived'])
resultados['Survived'] = ypred
resultados.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,1


In [101]:
resultados.to_csv("titanic_gender_submission_joubert_3.csv", index=False)