In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn

import os
print(os.listdir('./dataset/kaggle/kaggle-voice'))

['voice.csv']


In [2]:
# load the csv file
data = pd.read_csv('./dataset/kaggle/kaggle-voice/voice.csv')
# data.head()

In [3]:
# data.info()

In [4]:
# visualize data
# seaborn.pairplot(data[['meanfreq', 'Q25', 'Q75', 'skew', 'centroid', 'label']],hue='label', size=3)

In [5]:
# shuffle data (original data had gender arranged in sequence)
data = data.sample(frac=1, random_state=42)
# data.head()

In [6]:
# encode gender label into 0 & 1
data['label'] = data['label'].map({'male':1, 'female': 0})
# data.head()

In [7]:
# input the axes
X = data.loc[:, data.columns != 'label']
Y = data.loc[:, 'label']
# print('X:\n{}'.format(X))
# print('Y:\n{}'.format(Y))

In [8]:
# standard scaling
from sklearn.preprocessing import StandardScaler

X = StandardScaler().fit_transform(X)
# X.head()

In [9]:
# train-test split
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [10]:
# evaluation function
from sklearn import metrics

def evaluate(model, X_test, Y_test, Y_pred, Y_predproba):
    # metrics.plot_confusion_matrix(model, X_test, Y_test, cmap='plasma')
    print('Confusion Matrix:\n{}'.format(metrics.confusion_matrix(Y_test, Y_pred)))
    print('Precision: {}'.format(metrics.precision_score(Y_test, Y_pred)))
    print('Recall: {}'.format(metrics.recall_score(Y_test, Y_pred)))
    print('ROC AUC Score: {}'.format(metrics.roc_auc_score(Y_test, Y_pred)))
    print('f1 score: {}'.format(metrics.f1_score(Y_test, Y_pred)))

In [11]:
# predict with logistic regression
from sklearn.linear_model import LogisticRegression

logreg = LogisticRegression(random_state=42, max_iter=150)
logreg = logreg.fit(X_train, Y_train)

evaluate(logreg, X_test, Y_test, logreg.predict(X_test), logreg.predict_proba(X_test)[:,1])

Confusion Matrix:
[[503  15]
 [ 11 517]]
Precision: 0.9718045112781954
Recall: 0.9791666666666666
ROC AUC Score: 0.9751045688545688
f1 score: 0.9754716981132076


In [12]:
# predict with SVM
from sklearn.svm import SVC

svm = SVC(random_state=42)
svm = svm.fit(X_train, Y_train)

evaluate(svm, X_test, Y_test, svm.predict(X_test), svm.decision_function(X_test))

Confusion Matrix:
[[512   6]
 [ 11 517]]
Precision: 0.988527724665392
Recall: 0.9791666666666666
ROC AUC Score: 0.9837918275418275
f1 score: 0.9838249286393911


In [13]:
# predict with random forest classifier
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(n_estimators=250, max_depth=15, random_state=42)
random_forest = random_forest.fit(X_train, Y_train)

evaluate(random_forest, X_test, Y_test, random_forest.predict(X_test), random_forest.predict_proba(X_test)[:,1])

Confusion Matrix:
[[508  10]
 [ 14 514]]
Precision: 0.9809160305343512
Recall: 0.9734848484848485
ROC AUC Score: 0.9770899145899148
f1 score: 0.9771863117870723


In [14]:
#  predict with decision tree classifier
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree = decision_tree.fit(X_train, Y_train)

evaluate(decision_tree, X_test, Y_test, decision_tree.predict(X_test), decision_tree.predict_proba(X_test)[:,1])

Confusion Matrix:
[[498  20]
 [ 17 511]]
Precision: 0.9623352165725048
Recall: 0.9678030303030303
ROC AUC Score: 0.9645964958464959
f1 score: 0.9650613786591123


In [15]:
# predict with KNN
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=3)
knn = knn.fit(X_train, Y_train)

evaluate(knn, X_test, Y_test, knn.predict(X_test), knn.predict_proba(X_test)[:,1])

Confusion Matrix:
[[504  14]
 [  7 521]]
Precision: 0.9738317757009346
Recall: 0.9867424242424242
ROC AUC Score: 0.9798576986076987
f1 score: 0.980244590780809
