##### Common Voice Separation Challenge

This is a classification challenge solved with a custom Bayes classifier implementation.

- We have two `csv` files, one with training data and the other with validation data;
- Both belong to the same dataset, which holds 39 features and 5 classes;
- We use our own implementation of a Bayesian classifier along with a cross-validation (CV) strategy;
- The best classifier found during CV is then used to classify the data from the validation file;
- The result is stored in a response file whose content is to be verified;
- Wish me luck!

#### Imports

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold
from BayesClassifier import BayesClassifier

#### Import Data

In [3]:
# Column names of the row identifier and of the target label.
idKey = 'id'
classKey = 'y'

# Training set. Every column except the first and the last is treated as a
# feature (presumably those two are the id and the label -- confirm against
# the CSV header).
dsTrain = pd.read_csv('./file/treino.csv')
features = dsTrain.columns[1:-1]

# Distinct label values found in the training data.
classes = dsTrain[classKey].unique()

# Feature matrix and label vector as NumPy arrays.
X, Y = dsTrain[features].values, dsTrain[classKey].values

#### Train

In [4]:

# 50-fold cross-validation with shuffling. The seed is written as the integer
# 1, which is the value the former `random_state=True` was being read as
# (bool is an int subclass), so the fold assignment is meant to be unchanged.
cv = KFold(n_splits=50, shuffle=True, random_state=1)

# One {'accuracy', 'classifier'} record per fold.
history = []
# Best fold accuracy seen so far and the classifier that achieved it.
maxAccuracy = -np.inf
bestClassifier = None

def getAccuracy(yHat: np.ndarray, y: np.ndarray) -> float:
    """Return the fraction of predictions in ``yHat`` that equal ``y``.

    Args:
        yHat: Predicted labels.
        y: True labels, same shape as ``yHat``.

    Returns:
        Number of element-wise matches divided by the number of labels,
        as a plain Python ``float``.

    Raises:
        ValueError: If ``yHat`` and ``y`` have different shapes.
        ZeroDivisionError: If ``y`` is empty.
    """
    yHat = np.asarray(yHat)
    y = np.asarray(y)
    # Reject mismatched shapes up front: `==` would otherwise broadcast
    # (e.g. (n, 1) against (n,)) and silently produce a meaningless score.
    if yHat.shape != y.shape:
        raise ValueError(
            f'shape mismatch: yHat {yHat.shape} vs y {y.shape}'
        )
    # Count matches in one vectorized pass instead of walking the boolean
    # array element by element with the builtin sum().
    return int(np.count_nonzero(yHat == y)) / y.size

# Cross-validation: fit one classifier per fold, score it on that fold's
# held-out rows, and remember the highest-scoring classifier.
for trainIdx, testIdx in cv.split(X):
    # Slice out this fold's training and held-out partitions.
    xTrain, xTest = X[trainIdx], X[testIdx]
    yTrain, yTest = Y[trainIdx], Y[testIdx]

    classifier = BayesClassifier().fit(xTrain, yTrain)
    yHat = classifier.predict(xTest)
    accuracy = getAccuracy(yHat, yTest)

    # Keep every fold's result for later inspection.
    history.append({ 'accuracy': accuracy, 'classifier': classifier })

    # Strict '>' means the earliest fold wins when accuracies tie.
    if accuracy > maxAccuracy:
        maxAccuracy, bestClassifier = accuracy, classifier

#### Check overall accuracy against source dataset

In [5]:
# Re-score the selected classifier on the whole training set (X, Y).
# NOTE: the classifier was fitted on a subset of these same rows, so this
# figure is an optimistic estimate rather than a held-out score.
yHat = bestClassifier.predict(X)
overallAccuracy = getAccuracy(yHat=yHat, y=Y)
print(f'overallAccuracy: {overallAccuracy}')

overallAccuracy: 0.8911428571428571


#### Write validation data

In [6]:
# Load the validation set and select the same feature columns used for training.
dsValidation = pd.read_csv('./file/validacao.csv')
xValidation = dsValidation[features].values

# Assemble the response table: the validation ids next to the class predicted
# for each row by the selected classifier.
responseData = pd.DataFrame({
    idKey: dsValidation[idKey],
    classKey: bestClassifier.predict(xValidation),
})

# Write the response file without the DataFrame index column.
responseData.to_csv('./sample.csv', index=False)