# Common Voice Separation Challenge

This is a classification challenge solved with a custom Bayes classifier implementation.

- We have two `csv` files: one with data for training and another with data for validation;
- Both belong to the same dataset, which holds 39 features and 5 classes;
- We use our own implementation of a Bayesian classifier along with a cross-validation strategy;
- The best classifier found during cross-validation (CV) is then used to classify the data from the validation file;
- The result is stored in a response file whose content is to be verified;
- Wish me luck!

### Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold

from BayesClassifier import BayesClassifier

### Import Data

In [2]:
# Names of the two non-feature columns in the CSV files.
idKey = 'id'
classKey = 'y'

dsTrain = pd.read_csv('./file/treino.csv')

# The slice drops the first and the last column; this assumes the file is laid
# out as <id>, <feature columns...>, <class label> -- TODO confirm if the CSV
# layout ever changes.
features = dsTrain.columns[1:-1]
classes = dsTrain[classKey].unique()

Y = dsTrain[classKey].values

# Keep the fitted scaler instead of discarding it, so the exact same
# standardization (training mean / standard deviation) can be re-applied to
# any other data that is fed to a classifier trained on X.
scaler = StandardScaler().fit(dsTrain[features].values)
X = scaler.transform(dsTrain[features].values)


#### Visualize data without normalization

In [3]:
# Summary statistics of the training frame exactly as read from the CSV
# (no scaling applied yet).
dsTrain.describe()

Unnamed: 0,id,X0,X1,X2,X3,X4,X5,X6,X7,X8,...,X29,X30,X31,X32,X33,X34,X35,X36,X37,X38
count,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,...,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0
mean,1749.5,5.801373,0.129776,-0.080697,0.12429,-0.069568,0.02091,-0.0489,0.008685,-0.027437,...,-0.005714,0.001124,-0.008123,0.006108,-0.00837,0.00337,-0.004579,0.001335,-0.004414,-0.006183
std,1010.507298,0.553708,0.12398,0.09351,0.092898,0.089135,0.052297,0.053688,0.035701,0.030207,...,0.009593,0.009048,0.012625,0.016424,0.013657,0.012869,0.010866,0.01342,0.009063,0.005669
min,0.0,1.711399,-0.327801,-0.476553,-0.098704,-0.287921,-0.247465,-0.207162,-0.182348,-0.190235,...,-0.048417,-0.04246,-0.056412,-0.031993,-0.051446,-0.027959,-0.047215,-0.026848,-0.03793,-0.031556
25%,874.75,5.647714,0.046752,-0.141731,0.047857,-0.147098,-0.018919,-0.094705,-0.018231,-0.045746,...,-0.011652,-0.004524,-0.014107,-0.003971,-0.012464,-0.004658,-0.008465,-0.006812,-0.008247,-0.010106
50%,1749.5,5.966449,0.135367,-0.07979,0.110533,-0.05991,0.019772,-0.044049,0.005706,-0.026727,...,-0.005303,0.001016,-0.0055,0.000957,-0.004434,-0.000157,-0.00181,-0.002524,-0.002603,-0.006293
75%,2624.25,6.147465,0.219378,-0.015827,0.209089,0.00439,0.057833,-0.005593,0.037291,-0.009213,...,0.000485,0.006401,0.000358,0.008538,0.0006,0.006309,0.002531,0.003849,0.001484,-0.00248
max,3499.0,6.727103,0.628729,0.211273,0.376119,0.128229,0.224193,0.106031,0.122694,0.091944,...,0.02777,0.036939,0.033304,0.059907,0.022399,0.047134,0.020612,0.05348,0.018757,0.017067


#### Visualize normalized data

In [4]:
# Standardized copy of the training features, used only for inspection.
# It is built directly from X -- the already-standardized matrix the
# classifier is trained on -- so the statistics shown describe exactly the
# model's input, without fitting a second scaler or filling an empty frame.
dsTrainNorm = pd.DataFrame(X, columns=features)
dsTrainNorm[classKey] = dsTrain[classKey].values

dsTrainNorm.describe()

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,...,X29,X30,X31,X32,X33,X34,X35,X36,X37,X38
count,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,...,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0,3500.0
mean,7.085126e-16,-1.319579e-16,-1.207923e-16,-1.928616e-16,1.674851e-17,-3.5527140000000005e-17,1.15717e-16,-1.015061e-17,3.248195e-17,3.451208e-17,...,-7.511452e-17,-2.4361470000000003e-17,-3.4258310000000003e-17,3.3497010000000004e-17,-4.2632560000000003e-17,4.060244e-18,3.65422e-17,4.060244e-18,-4.7707870000000004e-17,-1.700227e-17
std,1.000143,1.000143,1.000143,1.000143,1.000143,1.000143,1.000143,1.000143,1.000143,1.000143,...,1.000143,1.000143,1.000143,1.000143,1.000143,1.000143,1.000143,1.000143,1.000143,1.000143
min,-7.387573,-3.691261,-4.233918,-2.400759,-2.450022,-5.132497,-2.948215,-5.351605,-5.390223,-4.171927,...,-4.452163,-4.817661,-3.825391,-2.320175,-3.154741,-2.434869,-3.924347,-2.100408,-3.698548,-4.476027
25%,-0.2775491,-0.6697463,-0.6527989,-0.8228853,-0.8699246,-0.7617061,-0.8532867,-0.75402,-0.6062083,-0.6382913,...,-0.6190686,-0.6242627,-0.4740195,-0.6137851,-0.2998673,-0.6239333,-0.3576205,-0.6071367,-0.4230086,-0.6919678
50%,0.2981701,0.04510473,0.009702076,-0.1481053,0.1083639,-0.02175288,0.090368,-0.0834359,0.02350988,0.05091387,...,0.04284784,-0.0119647,0.207799,-0.3137106,0.2881972,-0.2741291,0.2549013,-0.2875632,0.1998308,-0.01937498
75%,0.6251327,0.7228193,0.6938183,0.9129433,0.829846,0.7061315,0.8067411,0.8013862,0.6034046,0.6682513,...,0.6462783,0.5833586,0.6718409,0.1479509,0.6569134,0.2283614,0.6544806,0.1873476,0.6508445,0.6533318
max,1.672112,4.025048,3.122791,2.711203,2.219376,3.887661,2.886141,3.193859,3.952711,5.372706,...,3.491079,3.958888,3.281777,3.276051,2.253303,3.401239,2.3187,3.886214,2.55689,4.101638


### Train

In [7]:

# Two-fold, shuffled cross-validation. `random_state` expects an integer seed;
# 1 is the value a boolean True is coerced to, so the folds are the same as
# with `random_state=True`, just spelled explicitly.
cv = KFold(n_splits=2, shuffle=True, random_state=1)

# Per-fold results and the running best model, filled in by the fold loop.
history = list()
# bestPca = None
maxAccuracy = -np.inf
bestClassifier = None

def getAccuracy(yHat: np.ndarray, y: np.ndarray) -> float:
    """Return the fraction of predictions that match the true labels.

    Args:
        yHat: Predicted labels, one entry per sample.
        y: True labels; must have the same shape as ``yHat``.

    Returns:
        The number of positions where ``yHat == y`` divided by ``y.shape[0]``,
        as a plain Python float.

    Raises:
        ValueError: If ``yHat`` and ``y`` have different shapes. Without this
            check numpy would broadcast e.g. (n, 1) against (n,) and the
            result would no longer be a single accuracy value.
        ZeroDivisionError: If ``y`` has zero samples.
    """
    yHat = np.asarray(yHat)
    y = np.asarray(y)
    if yHat.shape != y.shape:
        raise ValueError(
            f'shape mismatch: yHat has shape {yHat.shape}, y has shape {y.shape}'
        )
    # count_nonzero on the boolean comparison counts the matching positions
    # in a single vectorized pass.
    return int(np.count_nonzero(yHat == y)) / y.shape[0]

# Fit one classifier per fold, score it on that fold's held-out rows, record
# the result and remember whichever classifier scored highest.
i = 0
for i, (trainIdx, testIdx) in enumerate(cv.split(X), start=1):
    xTrain, xTest = X[trainIdx], X[testIdx]
    yTrain, yTest = Y[trainIdx], Y[testIdx]

    # pca = PCA(.98).fit(xTrain)
    # xTrain = pca.transform(xTrain)
    # xTest = pca.transform(xTest)

    classifier = BayesClassifier().fit(xTrain, yTrain)
    yHat = classifier.predict(xTest)
    accuracy = getAccuracy(yHat, yTest)

    history.append({ 'accuracy': accuracy, 'classifier': classifier })

    # Strictly-greater comparison: on a tie the earlier fold's classifier wins.
    if accuracy > maxAccuracy:
        # bestPca = pca
        maxAccuracy, bestClassifier = accuracy, classifier

print(f'{i} folds tested')

2 folds tested


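#### Check overall accuracy against the source dataset

Note: the source dataset includes the fold the best classifier was trained on, so this figure is an optimistic estimate rather than a held-out score.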

In [8]:
# xValidation = bestPca.transform(X)
# NOTE: despite its name, xValidation holds the complete (standardized)
# training matrix X at this point, not the data from the validation file.
xValidation = X
yHat = bestClassifier.predict(xValidation)
# bestClassifier was fitted on one cross-validation training split of X, so
# this score is partly measured on rows the classifier has already seen.
overallAccuracy = getAccuracy(yHat, Y)
print(f'overallAccuracy: {overallAccuracy}')

overallAccuracy: 0.8805714285714286


#### Write predictions for the validation data

In [9]:
# Load the validation set and apply the SAME standardization as the training
# data. The classifier was fitted on standardized features, so it must not be
# fed raw values. The scaler is fitted on the training features only and then
# applied to the validation features.
dsValidation = pd.read_csv('./file/validacao.csv')
validationScaler = StandardScaler().fit(dsTrain[features].values)
xValidation = validationScaler.transform(dsValidation[features].values)

# Build the response file: one row per validation sample holding its id and
# the predicted class.
responseData = pd.DataFrame({
    idKey: dsValidation[idKey],
    classKey: bestClassifier.predict(xValidation),
})

responseData.to_csv('./sample.csv', index=False)