In [168]:

import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

train = pd.read_csv(r"F:\DataScience\ml-scripting\study cases\titanic\datasets\train.csv")


In [169]:

train.drop(['PassengerId','Name','Ticket','Cabin','Fare'], axis=1, inplace=True)
train.head(4)


Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Embarked
0,0,3,male,22.0,1,0,S
1,1,1,female,38.0,1,0,C
2,1,3,female,26.0,0,0,S
3,1,1,female,35.0,1,0,S


In [160]:

def StringCategoricalImputer(dataframe, fields):
    imputer = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
    dataframe[fields] = imputer.fit_transform(dataframe[fields])
    return dataframe, imputer


def NumericalCategoricalImputer(dataframe, fields):
    imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
    dataframe[fields] = imputer.fit_transform(dataframe[fields])
    return dataframe, imputer


def OneHotTransformerSKLearn(dataframe, fields):
    transformer = ColumnTransformer([( 'encoder', OneHotEncoder(), fields)],
                                                 remainder='passthrough')
    dataframe = pd.DataFrame(transformer.fit_transform(dataframe))
    return dataframe


def OneHotTransformerDummies(dataframe, fields):
    return pd.get_dummies(dataframe, columns=fields)


def ScalerTransformer(dataframe):
    scaler = StandardScaler()
    cols = X.columns.to_list()
    return pd.DataFrame(scaler.fit_transform(X), columns=cols)


categorical = ['Sex','SibSp','Embarked','Parch','Pclass']
train, StringImputer = StringCategoricalImputer(train, categorical)

numerical = ['Age']
train, NumericalImputer = NumericalCategoricalImputer(train, numerical)

dummies = ['Sex','Embarked']
train = OneHotTransformerDummies(train, transformer)

train.head(4)


Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S
0,0,3,22.0,1,0,0,1,0,0,1
1,1,1,38.0,1,0,1,0,1,0,0
2,1,3,26.0,0,0,1,0,0,0,1
3,1,1,35.0,1,0,1,0,0,0,1


In [170]:

X = train.drop('Survived', axis=1)
y = train['Survived']

#X = ScalerTransformer(X)

display(X.head(4))
display(y.head(4))


Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Embarked
0,3,male,22.0,1,0,S
1,1,female,38.0,1,0,C
2,3,female,26.0,0,0,S
3,1,female,35.0,1,0,S


0    0
1    1
2    1
3    1
Name: Survived, dtype: int64

In [162]:

def classificadorNaiveBayes(previsores_train, classe_train, previsores_test, classe_test):
    classificadorNaive = GaussianNB()
    classificadorNaive.fit(previsores_train, classe_train)
    previsoes = classificadorNaive.predict(previsores_test)
    precisao = accuracy_score(classe_test, previsoes)
    return precisao


def get_crossvalidation_fold_index(X, y, splits=5):
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state = 101)
    kfold_list = list()
    for train_index, test_index in kfold.split(X, y):
        kfold_list.append( {"nr_fold": len(kfold_list),
                            "train": train_index,
                            "test": test_index} )
    return kfold_list


folds = get_crossvalidation_fold_index(X, y, 5)

for i, v in enumerate(folds):
    X_train = X.loc[folds[i].get('train')]
    y_train = y.loc[folds[i].get('train')]
    X_test  = X.loc[folds[i].get('test')]
    y_test  = y.loc[folds[i].get('test')]
    folds[i]['NaiveBayes_accuracy'] = classificadorNaiveBayes(X_train, y_train, X_test, y_test)


In [157]:

media = 0
for item in folds:
    score = item.get('NaiveBayes_accuracy') * 100
    media = media + score
    print('Accuracy: {:.2f}%'.format(score))

print('Media Accuracy: {:.2f}%'.format(media / len(folds)))


Accuracy: 78.21%
Accuracy: 87.64%
Accuracy: 78.65%
Accuracy: 76.97%
Accuracy: 73.60%
Media Accuracy: 79.01%
