## Load Dataset

In [393]:
import pandas as pd
train = pd.read_csv('datasets/titanic/train.csv')
test = pd.read_csv('datasets/titanic/test.csv')

train.dtypes



PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

In [394]:
y = train['Survived']
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: Survived, Length: 891, dtype: int64

### Remove as respostas (y) do dataset

In [395]:
X = train.drop('Survived',axis=1)
X.columns

Index(['PassengerId', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp', 'Parch',
       'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

## Pré-processamento
### Atributos desejados

In [396]:
from sklearn.base import BaseEstimator, TransformerMixin
class AtributosDesejados(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasIndesejadas = ['PassengerId', 'Name', 'Ticket', 'Cabin']
        return self
    def transform(self, X, y=None):
        return X.drop(self.colunasIndesejadas,axis=1)

atributosDesejados = AtributosDesejados()
Xdrop = atributosDesejados.fit_transform(X)
Xdrop.columns

Index(['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'], dtype='object')

### Atributos Numéricos a partir dos atributos desejados (Xdrop)

In [397]:
from sklearn.base import BaseEstimator, TransformerMixin
class AtributosNumericos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasNumericas = X.select_dtypes(include='number').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasNumericas]

atributosNumericos = AtributosNumericos()
Xnum = atributosNumericos.fit_transform(Xdrop)
Xnum.columns

Index(['Pclass', 'Age', 'SibSp', 'Parch', 'Fare'], dtype='object')

In [398]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

pipenum = Pipeline([
    ('atributos_numericos', AtributosNumericos()),
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

XnumLimpo = pipenum.fit_transform(Xnum)
XnumLimpo

array([[ 0.82737724, -0.56573646,  0.43279337, -0.47367361, -0.50244517],
       [-1.56610693,  0.66386103,  0.43279337, -0.47367361,  0.78684529],
       [ 0.82737724, -0.25833709, -0.4745452 , -0.47367361, -0.48885426],
       ...,
       [ 0.82737724, -0.1046374 ,  0.43279337,  2.00893337, -0.17626324],
       [-1.56610693, -0.25833709, -0.4745452 , -0.47367361, -0.04438104],
       [ 0.82737724,  0.20276197, -0.4745452 , -0.47367361, -0.49237783]])

### Atributos Categóricos a partir dos atributos desejados

In [399]:
from sklearn.base import BaseEstimator, TransformerMixin
class AtributosCategoricos(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        self.colunasCategoricas = X.select_dtypes(include='object').columns
        return self
    def transform(self, X, y=None):
        return X[self.colunasCategoricas]

atributosCategoricos = AtributosCategoricos()
Xcat = atributosCategoricos.fit_transform(Xdrop)
Xcat.columns

Index(['Sex', 'Embarked'], dtype='object')

In [400]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

pipecat = Pipeline([
    ('atributos_categoricos', AtributosCategoricos()),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder())
])

XcatLimpo = pipecat.fit_transform(Xdrop)
XcatLimpo

<891x5 sparse matrix of type '<class 'numpy.float64'>'
	with 1782 stored elements in Compressed Sparse Row format>

### X Tratado
#### Pipeline para tratar os atributos numéricos e categóricos e unir as características

In [401]:
from sklearn.pipeline import FeatureUnion
unecaracteristicas = FeatureUnion([
    ('pipenum', pipenum),
    ('pipecat', pipecat)
])
Xtratado = unecaracteristicas.fit_transform(Xdrop)
Xtratado

<891x10 sparse matrix of type '<class 'numpy.float64'>'
	with 6237 stored elements in Compressed Sparse Row format>

#### Pipeline para tratar todo o X

In [402]:
from sklearn.pipeline import Pipeline

preproc = Pipeline([
    ('atributos_desejados', AtributosDesejados()),
    ('unecaracteristicas', unecaracteristicas)
])
Xtratado = preproc.fit_transform(X)
Xtratado

<891x10 sparse matrix of type '<class 'numpy.float64'>'
	with 6237 stored elements in Compressed Sparse Row format>

## Classificador

In [403]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_validate
import numpy as np
from collections import Counter
from sklearn.base import BaseEstimator, ClassifierMixin


# Valor mais próximo da média da característica
def melhorValorTarefa(x, y):   
    valor = None
    impValor = None
    xmean = np.mean(x) # média da característica
    valor = x[(x - xmean).argmin()] # valor mais próximo da média
    return valor


# Característica com melhor valor mais próximo da média do y
def melhorCaracteristicaTarefa(X, y):   
    impurezas = []
    valores = []
    for caracteristica in range(X.shape[1]):
        valor = melhorValorTarefa(X[:,caracteristica], y)
        valores.append(valor)

    ymean = np.mean(y) #média do y
    caracteristica = (valores - ymean).argmin() # Característica com melhor valor mais próximo da média do y

    return caracteristica, valores[caracteristica]

def maisFrequenteTarefa(y):
    return Counter(y.flat).most_common(1)[0][0]


class ArvoreTarefa(BaseEstimator, ClassifierMixin):
    def fit(self, X, y):
        self.caracteristica, self.valor = melhorCaracteristicaTarefa(X, y)
        maiores = X[:,self.caracteristica] > self.valor
        if sum(maiores)>0 and sum(~maiores)>0:
            self.maiores = ArvoreTarefa()
            self.maiores.fit(X[maiores,:],y[maiores])
            self.menores = ArvoreTarefa()
            self.menores.fit(X[~maiores,:],y[~maiores])
        else:
            self.resposta = maisFrequenteTarefa(y)
    def predict(self, X):
        y = np.empty((X.shape[0]))
        if hasattr(self, 'resposta'):
            y[:] = self.resposta
        else:
            maiores = X[:,self.caracteristica] > self.valor
            y[maiores] = self.maiores.predict(X[maiores,:])
            y[~maiores] = self.menores.predict(X[~maiores,:])
        return y



In [404]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

pipetotal = Pipeline([
    ('preproc', preproc),
    ('arvore', DecisionTreeClassifier())
])

pipetotal.fit(X,y)
ypred = pipetotal.predict(X)
accuracy_score(y, ypred)

0.9797979797979798

In [412]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

classificador = Pipeline([
    ('preproc', preproc),
    ('clf', RandomForestClassifier())
])

classificador.fit(X,y)
ypred = classificador.predict(test)

np.sum(ypred==1)


155

### Com GridSearch

In [413]:
parametros = {
    'clf__max_depth': [None] + list(range(1,20,2)),
    'clf__criterion': ['gini', 'entropy']
}

classificador_Grid = GridSearchCV(classificador, param_grid=parametros)

classificador_Grid.fit(X,y)
ypred = classificador_Grid.predict(test)

np.sum(ypred==1)


144

In [411]:
result = test[['PassengerId']]
result['Survived'] = ypred
result.to_csv('videoaula.csv',index=False)

ValueError: Length of values does not match length of index