IMPORTING LIBRARIES

In [1]:
import pandas as pd
import cudf as cd
import cupy as cp
# import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import sklearn
import seaborn as sns
from sklearn.svm import SVC

import warnings
warnings.filterwarnings("ignore")

 **Data Loading & Preprocessing**
 
 Initially the dataset is loaded into the notebook using the cudf library. The data set is analyzed thoroughly using different functions to check for null values and to check description of all the columns.
 
 The data is preprocessed to deal with the inconsistencies within the data. The data available on Kaggle **didn’t have any inconsistencies**.
 The categorical label column (male/female) is converted to a numeric 0/1 encoding for computation.

In [2]:
# Read the voice-gender CSV into a cuDF DataFrame.
voice_csv_path = '../input/voicegender/voice.csv'
data = cd.read_csv(voice_csv_path)

In [3]:
# Preview the first rows of the loaded dataset.
data.head()

In [4]:
# (rows, columns) of the loaded dataset.
data.shape

In [5]:
# Number of missing values in each column.
data.isnull().sum()

In [6]:
# Summary statistics for the columns of the dataset.
data.describe()

In [7]:
# Number of samples per class label (class balance check).
data.label.value_counts()


In [8]:
# Show the distinct class names before they are encoded.
print(data.label.unique())

# Encode the target as 1 = female, 0 = anything else, with a vectorised
# comparison instead of a host-side Python loop. The dtype guard makes the
# cell safe to re-run: once the column is numeric, comparing it against
# 'female' again would silently turn every label into 0.
if data['label'].dtype == 'object':
    data['label'] = (data['label'] == 'female').astype('int64')

# Target as a host-side (n_samples, 1) column vector; the feature frame is
# every column except the label.
y = data['label'].to_pandas().values.reshape(-1, 1)
x_data = data.drop(["label"], axis=1)


In [9]:
# Preview the feature columns only (label removed) rather than dumping the whole frame.
x_data.head()

**NORMALIZATION**

Each feature is min-max scaled to the range [0, 1]. This gives equal weight/importance to every variable, so that no single variable steers model performance in one direction just because its values are larger.

In [10]:
# Min-max scale every feature column to [0, 1] on the host (pandas) side.
# The frame is copied off the device once, and the column-wise pandas
# min()/max() are called directly instead of routing a pandas object
# through cupy reductions, which are not meant for pandas input.
features_pd = x_data.to_pandas()
col_min = features_pd.min()
col_max = features_pd.max()
x = (features_pd - col_min) / (col_max - col_min).values

# Preview the scaled features produced by this cell.
x.head()

**DATA EXPLORATION**

Done in order to understand the data visually. A scatter plot and pair plots of the columns are built.


In [11]:
# Scatter of mean frequency against dominant-frequency range.
# The columns are moved to host memory with to_pandas(), the same conversion
# the rest of the notebook uses, instead of Series.to_array(), which is
# deprecated in cuDF (NOTE(review): confirm against the installed version).
plt.scatter(data['meanfreq'].to_pandas(), data['dfrange'].to_pandas())
plt.xlabel('meanfreq')
plt.ylabel('dfrange')
plt.title('Scatter Plot : Dominant frequency VS Mean frequency')
plt.show()


In [12]:
# Pairwise plot grid over the dataset's columns, coloured by class label.
frame_for_plot = data.to_pandas()
sns.pairplot(frame_for_plot, hue='label')
plt.show()

In [13]:
# Same pair plot restricted to a handful of features, coloured by class label.
selected_features = ['meanfreq', 'sd', 'skew', 'kurt', 'dfrange']
sns.pairplot(data=data.to_pandas(), vars=selected_features, hue='label')
plt.show()

**SPLITTING THE DATASET**
A test size of 25% is selected after comparing accuracy across several test sizes.


In [14]:
# Report the container types going into the split.
for split_input in (x, y):
    print(type(split_input))

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(x, y, random_state=42, test_size=0.25)

# Transposed versions of each split; later cells refer to these lowercase names.
x_train, x_test = X_train.T, X_test.T
y_train, y_test = Y_train.T, Y_test.T

**APPLYING DIFFERENT MODELS**

I inferred that the best output was given by the logistic regression and KNN classifiers.

In [15]:
# Support-vector classifier with an RBF kernel.
classifier = SVC(kernel='rbf')
# scikit-learn expects a 1-D target; ravel() flattens the (n, 1) column vector.
classifier.fit(X_train, Y_train.ravel())
y_pred = classifier.predict(X_test)

# Test labels as a column vector. reshape(-1, 1) infers the row count, so the
# cell keeps working when the test size or the dataset changes.
y_test = Y_test.reshape(-1, 1)

# Kept under its own name so the target array `y` is not overwritten.
r2_svm = r2_score(y_test, y_pred)

print('Using SVM classifier:')
print('Accuracy of training set: {:.3f}'.format(classifier.score(X_train, Y_train)))
print('Accuracy of test set: {:.3f}'.format(classifier.score(X_test, Y_test)))
print('r2 score: {:.3f}'.format(r2_svm))


In [16]:
# Elbow method: measure the test error rate for k = 1..39 to pick a good
# value of `k` and thus improve the accuracy of the model.
k_values = range(1, 40)

# Flatten the (n, 1) label arrays. Comparing the 1-D predictions against a
# column vector would broadcast to an (n, n) matrix, and its mean would no
# longer be the misclassification rate.
y_fit_flat = Y_train.ravel()
y_true_flat = Y_test.ravel()

error_rate = []
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y_fit_flat)
    y_pred_k = knn.predict(X_test)
    # Fraction of test samples whose predicted label differs from the true one.
    error_rate.append((y_pred_k != y_true_flat).mean())

# Plot error rate against k.
plt.figure(figsize=(10, 6))
plt.plot(k_values, error_rate, color='blue', linestyle='dashed', marker='o', markerfacecolor='red', markersize=10)
plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')
plt.show()



In [17]:
# K-nearest-neighbours classifier (k = 5, Euclidean distance: Minkowski with p = 2).
classifierKNC = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
# scikit-learn expects a 1-D target; ravel() flattens the (n, 1) column vector.
classifierKNC.fit(X_train, Y_train.ravel())
y_pred = classifierKNC.predict(X_test)

# Test labels as a column vector. reshape(-1, 1) infers the row count, so the
# cell keeps working when the test size or the dataset changes.
y_test = Y_test.reshape(-1, 1)

# Kept under its own name so the target array `y` is not overwritten.
r2_knn = r2_score(y_test, y_pred)

print('Using K nearest Classifier:')
print('Accuracy of training set: {:.2f}'.format(classifierKNC.score(X_train, Y_train)))
print('Accuracy of test set: {:.2f}'.format(classifierKNC.score(X_test, Y_test)))
print('r2 score: {:.3f}'.format(r2_knn))

In [18]:
# Only row 0 of `y_Pred` is read afterwards, and that row is the predictions
# themselves, so a single-row 2-D view of `y_pred` is sufficient. Interpolating
# from the predictions toward an arbitrary constant would only add rows that
# nothing uses.
y_Pred = y_pred.reshape(1, -1)

# Show the first few predictions instead of the full array.
y_Pred[:, :10]

In [19]:
# Plain-int copy of the current predictions.
predict = [int(label) for label in y_pred]

# confusion_matrix(y_true, y_pred): rows are the true classes, columns the
# predicted classes.
conf_matrix = sklearn.metrics.confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(conf_matrix, annot=True, linewidths=0.5, linecolor="white", fmt=".0f", ax=ax)
# The heatmap draws matrix columns along x and rows along y, so x is the
# predicted label and y is the true label.
plt.xlabel("predict")
plt.ylabel("y_test")
plt.show()
# In the author's recorded run the model misclassified 9 and 6 samples:
# 6 were actually 1 but predicted as 0, and 9 were actually 0 but predicted as 1.
# (Re-check these counts against the matrix after re-running.)

In [20]:
# Decision tree using entropy (information gain) as the split criterion.
# A fixed random_state keeps the fitted tree, and therefore the reported
# scores, repeatable between runs.
classifierDT = DecisionTreeClassifier(criterion='entropy', random_state=42)
# scikit-learn expects a 1-D target; ravel() flattens the (n, 1) column vector.
classifierDT.fit(X_train, Y_train.ravel())
y_pred = classifierDT.predict(X_test)

# Test labels as a column vector. reshape(-1, 1) infers the row count, so the
# cell keeps working when the test size or the dataset changes.
y_test = Y_test.reshape(-1, 1)

# Kept under its own name so the target array `y` is not overwritten.
r2_dt = r2_score(y_test, y_pred)

print('Using Decision tree classifier:')
print('Accuracy of training set: {:.2f}'.format(classifierDT.score(X_train, Y_train)))
print('Accuracy of test set: {:.2f}'.format(classifierDT.score(X_test, Y_test)))
print('r2 score: {:.3f}'.format(r2_dt))

In [21]:
# Logistic regression with scikit-learn's default settings.
lr = LogisticRegression()
# scikit-learn expects a 1-D target; ravel() flattens the (n, 1) column vector.
lr.fit(X_train, Y_train.ravel())
y_pred = lr.predict(X_test)

# Test labels as a column vector. reshape(-1, 1) infers the row count, so the
# cell keeps working when the test size or the dataset changes.
y_test = Y_test.reshape(-1, 1)

r2_lr = r2_score(y_test, y_pred)
print("test accuracy {}".format(lr.score(X_test, Y_test)))
print('r2 score: {:.3f}'.format(r2_lr))

In [22]:
# Only row 0 of `y_Pred` is read afterwards, and that row is the predictions
# themselves, so a single-row 2-D view of `y_pred` is sufficient. Interpolating
# from the predictions toward an arbitrary constant would only add rows that
# nothing uses.
y_Pred = y_pred.reshape(1, -1)

# Show the first few predictions instead of the full array.
y_Pred[:, :10]

In [23]:
# Plain-int copy of the current predictions.
predict = [int(label) for label in y_pred]

# confusion_matrix(y_true, y_pred): rows are the true classes, columns the
# predicted classes.
conf_matrix = sklearn.metrics.confusion_matrix(y_test, y_pred)
f, ax = plt.subplots(figsize=(7, 7))
sns.heatmap(conf_matrix, annot=True, linewidths=0.5, linecolor="white", fmt=".0f", ax=ax)
# The heatmap draws matrix columns along x and rows along y, so x is the
# predicted label and y is the true label.
plt.xlabel("predict")
plt.ylabel("y_test")
plt.show()

# In the author's recorded run the model misclassified 6 and 11 samples:
# 6 were actually 1 but predicted as 0, and 11 were actually 0 but predicted as 1.
# (Re-check these counts against the matrix after re-running.)

CREATING A DATAFRAME TO COMPARE THE ACCURACY AND R2 SCORE OF THE KNN AND LOGISTIC REGRESSION ALGORITHMS, AND PLOTTING GRAPHS FOR THE SAME

In [24]:
# Scores recorded by hand from separate runs at each hold-out percentage.
# For the 25% split the logistic-regression values are placed so that the
# accuracy column holds 0.97853... (= 775/792 correct test samples) and the
# r2 column holds 0.914; the two had been entered in each other's column.
table = pd.DataFrame({
    'test_portion': [35, 30, 25, 20, 15],
    'r2_score_lr': [0.884, 0.89, 0.914, 0.930, 0.924],
    'accuracy_lr': [0.9711451758340848, 0.97, 0.9785353535353535, 0.9826498422712934, 0.9810924369747899],
    'r2_score_KNN': [0.92, 0.916, 0.924, 0.918, 0.907],
    'accuracy_KNN': [0.98, 0.98, 0.98, 0.98, 0.98],
})

table



In [25]:
# Side-by-side comparison of both models across the tested hold-out sizes:
# R2 score on the left panel, accuracy on the right.
fig, (ax_r2, ax_acc) = plt.subplots(1, 2, figsize=(20, 10))

# (axes, column prefix in `table`, y-axis label, panel title)
panels = [
    (ax_r2, 'r2_score', 'R2 SCORE',
     'R2 SCORE OF LOGISTIC REGRESSION and KNN CLASSIFIER vs DIFFERENT TEST SIZE'),
    (ax_acc, 'accuracy', 'ACCURACY',
     'ACCURACY OF LOGISTIC REGRESSION and KNN vs DIFFERENT TEST SIZE'),
]
# (column suffix in `table`, legend label)
models = [('lr', 'Logistic Regression'), ('KNN', 'KNN Classifier')]

for ax, metric, y_label, title in panels:
    for suffix, model_label in models:
        # One line-with-markers artist per model, labelled explicitly. Passing
        # a bare list of labels to legend() pairs them with artists by creation
        # order, which is easy to get wrong when scatter and line artists are
        # interleaved.
        ax.plot(table['test_portion'], table['{}_{}'.format(metric, suffix)],
                marker='o', label=model_label)
    ax.set_xlabel('TEST SIZE')
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.grid(True)
    ax.legend()

plt.show()