In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the voice dataset from a CSV file in the working directory.
data_df = pd.read_csv("voice-classification.csv")
# Preview five randomly chosen rows (no seed is passed, so the rows differ per run).
data_df.sample(n=5)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
2067,0.175185,0.057511,0.183543,0.164017,0.203264,0.039247,2.17132,7.285044,0.919647,0.530706,...,0.175185,0.170602,0.015826,0.266667,0.180027,0.007812,0.273438,0.265625,0.116246,female
777,0.176309,0.059555,0.19072,0.118462,0.228238,0.109777,1.49536,5.147349,0.924788,0.466217,...,0.176309,0.106554,0.015984,0.228571,0.348828,0.09375,3.53125,3.4375,0.128947,male
1130,0.202333,0.063001,0.221946,0.137544,0.264817,0.127273,2.000371,6.681799,0.873847,0.261759,...,0.202333,0.12361,0.047291,0.269663,1.190168,0.023438,7.429688,7.40625,0.093438,male
2101,0.184568,0.034825,0.183302,0.172884,0.201209,0.028326,3.140537,13.595432,0.824722,0.24716,...,0.184568,0.161185,0.034557,0.207792,0.961682,0.078125,6.648438,6.570312,0.250178,female
3032,0.192281,0.04197,0.197156,0.189689,0.2048,0.015111,3.919928,18.890319,0.824379,0.274805,...,0.192281,0.193415,0.070423,0.27027,0.483023,0.200195,0.795898,0.595703,0.695697,female


In [3]:
# Summarize column names, non-null counts and dtypes to check for missing values.
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   IQR       3168 non-null   float64
 6   skew      3168 non-null   float64
 7   kurt      3168 non-null   float64
 8   sp.ent    3168 non-null   float64
 9   sfm       3168 non-null   float64
 10  mode      3168 non-null   float64
 11  centroid  3168 non-null   float64
 12  meanfun   3168 non-null   float64
 13  minfun    3168 non-null   float64
 14  maxfun    3168 non-null   float64
 15  meandom   3168 non-null   float64
 16  mindom    3168 non-null   float64
 17  maxdom    3168 non-null   float64
 18  dfrange   3168 non-null   float64
 19  modindx   3168 non-null   float64
 20  label     3168 non-null   object

## The data has no null or missing values, and the only categorical column is `label`, which identifies the gender/type of voice.

In [4]:
# Class balance: overall row count, then the number of rows carrying each label.
label_column = data_df['label']
print("Total number of records: ", len(data_df))
print("Total number of males: ", int((label_column == 'male').sum()))
print("Total number of females: ", int((label_column == 'female').sum()))

Total number of records:  3168
Total number of males:  1584
Total number of females:  1584


#### Label is categorical data, so we need to encode it as numeric values.

In [5]:
# Separate the target column from the predictor columns.
y_target = data_df['label']
x_features = data_df.drop("label", axis=1)

In [6]:
# Spot-check three random feature rows to confirm the label column was dropped.
x_features.sample(n=3)

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,mode,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx
1142,0.185609,0.061889,0.177158,0.137373,0.243217,0.105845,1.620441,5.678051,0.919616,0.471991,0.244718,0.185609,0.132252,0.046921,0.277457,1.752656,0.023438,12.539062,12.515625,0.117324
2688,0.185682,0.04894,0.174672,0.152182,0.227898,0.075716,2.055586,7.737662,0.901932,0.388738,0.149558,0.185682,0.15658,0.048632,0.274286,1.449424,0.023438,8.976562,8.953125,0.113351
809,0.176482,0.058527,0.184521,0.128199,0.210805,0.082605,1.750256,6.873995,0.922014,0.453263,0.195249,0.176482,0.136086,0.047572,0.277457,0.701891,0.023438,5.90625,5.882812,0.099374


In [7]:
# Display the target series (string labels, still unencoded).
y_target

0         male
1         male
2         male
3         male
4         male
         ...  
3163    female
3164    female
3165    female
3166    female
3167    female
Name: label, Length: 3168, dtype: object

In [8]:
from sklearn.preprocessing import LabelEncoder

# Encoder used to turn the string labels into integer codes.
encoder = LabelEncoder()

In [9]:
# Fit the encoder on the labels and convert them to integer codes in one step.
y_target_enc = encoder.fit_transform(y_target)
# Display the encoded array.
y_target_enc

array([1, 1, 1, ..., 0, 0, 0])

#### We encoded male as 1 and female as 0.

### As we have 20 numeric features on different scales, we should standardize them (zero mean, unit variance) to help the model.

In [10]:
from sklearn.preprocessing import StandardScaler
# Scaler that standardizes each feature column.
scaler = StandardScaler()

In [11]:
# Standardize every feature column. The input is rebuilt from `data_df` rather than
# read back from `x_features`, so re-running this cell always starts from the raw
# (unscaled) columns instead of re-scaling its own previous output.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split, so test-set statistics leak into training; consider fitting on the
# training portion only.
x_features = scaler.fit_transform(data_df.drop(columns=["label"]))
# Show the first two standardized rows.
x_features[0:2]

array([[-4.04924806,  0.4273553 , -4.22490077, -2.57610164, -5.69360723,
        -0.21477826,  2.29330585,  1.76294635, -0.03908279,  0.4715753 ,
        -2.14121031, -4.04924806, -1.81203825, -1.0979981 ,  0.56595854,
        -1.5642046 , -0.70840431, -1.43142165, -1.41913712, -1.45477229],
       [-3.84105325,  0.6116695 , -3.99929342, -2.48688452, -5.58898726,
        -0.25848536,  4.54805598,  4.43300778, -0.06523603,  0.59443122,
        -2.14121031, -3.84105325, -1.07959443, -1.09153262, -0.29403034,
        -1.56191576, -0.70840431, -1.41810716, -1.4058184 , -1.01410294]])

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable.
split_parts = train_test_split(x_features, y_target_enc, test_size=0.3, random_state=5)
x_train, x_test, y_train, y_test = split_parts
# Display the shapes of the four resulting arrays.
tuple(part.shape for part in split_parts)

((2217, 20), (951, 20), (2217,), (951,))

In [13]:
from sklearn.svm import SVC

# Train a support vector classifier with default hyperparameters
# (`fit` returns the estimator itself), then predict the held-out rows.
svc = SVC().fit(x_train, y_train)
y_predict = svc.predict(x_test)

In [14]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [15]:
# Test-set accuracy of the baseline SVC, expressed as a percentage.
report_template = "Accuracy Score: {:0.2f}%"
acc = 100 * accuracy_score(y_test, y_predict)
print(report_template.format(acc))

Accuracy Score: 98.21%


In [16]:
# Confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_test, y_predict)

array([[463,  10],
       [  7, 471]], dtype=int64)

In [17]:
# Per-class precision, recall and F1 for the baseline SVC on the test set.
print(classification_report(y_test, y_predict))

              precision    recall  f1-score   support

           0       0.99      0.98      0.98       473
           1       0.98      0.99      0.98       478

    accuracy                           0.98       951
   macro avg       0.98      0.98      0.98       951
weighted avg       0.98      0.98      0.98       951



##  Hyperparameter tuning of the estimator. GridSearchCV lets us try out different parameter combinations for the estimator and select the one with the best accuracy.

In [18]:
# List the names of the hyperparameters that SVC exposes for tuning.
svc.get_params().keys()

dict_keys(['C', 'break_ties', 'cache_size', 'class_weight', 'coef0', 'decision_function_shape', 'degree', 'gamma', 'kernel', 'max_iter', 'probability', 'random_state', 'shrinking', 'tol', 'verbose'])

In [19]:
from sklearn.model_selection import GridSearchCV

# Every combination of these values is evaluated: 5 * 4 * 6 = 120 candidates.
svc_param_grid = {
    'C': [1,10,20,30,50],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto', 0.1, 1, 10,100]
}

# cv=2 -> two-fold cross-validation per candidate;
# refit=True retrains the best candidate once the search finishes.
gsc = GridSearchCV(SVC(), svc_param_grid, cv=2, refit=True, verbose=True)
gsc.fit(x_train, y_train)

# Predictions of the refitted best estimator on the held-out rows.
gsc_y_predict = gsc.predict(x_test)

Fitting 2 folds for each of 120 candidates, totalling 240 fits


In [20]:
# Mean cross-validated score of the best parameter combination.
gsc.best_score_

0.9797024997314392

In [21]:
# Parameter combination that achieved the best mean cross-validated score.
gsc.best_params_

{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

In [22]:
# Collect the per-candidate cross-validation results into a DataFrame.
gsc_df = pd.DataFrame(gsc.cv_results_)
mean_scores = gsc_df['mean_test_score']
print("Min Score: ", mean_scores.min())
# Label fixed: this line reports the maximum but was previously also labelled "Min Score".
print("Max Score: ", mean_scores.max())
# Show the parameter combinations that produced the lowest mean CV score.
gsc_df[mean_scores == mean_scores.min()]

Min Score:  0.5011275484792947
Min Score:  0.9797024997314392


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_gamma,param_kernel,params,split0_test_score,split1_test_score,mean_test_score,std_test_score,rank_test_score
22,0.1609,0.005996,0.266836,0.013991,1,100,rbf,"{'C': 1, 'gamma': 100, 'kernel': 'rbf'}",0.501353,0.500903,0.501128,0.000225,116
46,0.153904,0.009994,0.254343,0.013492,10,100,rbf,"{'C': 10, 'gamma': 100, 'kernel': 'rbf'}",0.501353,0.500903,0.501128,0.000225,116
70,0.137414,0.0025,0.242351,0.002498,20,100,rbf,"{'C': 20, 'gamma': 100, 'kernel': 'rbf'}",0.501353,0.500903,0.501128,0.000225,116
94,0.137416,0.006496,0.239852,0.001,30,100,rbf,"{'C': 30, 'gamma': 100, 'kernel': 'rbf'}",0.501353,0.500903,0.501128,0.000225,116
118,0.136916,0.003998,0.240352,0.000498,50,100,rbf,"{'C': 50, 'gamma': 100, 'kernel': 'rbf'}",0.501353,0.500903,0.501128,0.000225,116


##  Hyperparameter tuning of the estimator. GridSearchCV runs every combination, which is time- and resource-consuming. RandomizedSearchCV instead tries only a fixed number of randomly chosen combinations from the full set.

In [23]:
from sklearn.model_selection import RandomizedSearchCV

# Sample n_iter=5 parameter combinations from the same value lists used for the
# grid search and score each with 5-fold cross-validation (25 fits in total).
# random_state pins which combinations are drawn, so a re-run evaluates the same
# candidates; 5 matches the seed used for train_test_split.
rscv = RandomizedSearchCV(SVC(), {
    'C': [1,10,20,30,50],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto', 0.1, 1, 10,100]
}, n_iter=5, cv=5, verbose=True, random_state=5)

rscv.fit(x_train, y_train)
# Predictions of the refitted best estimator on the held-out rows.
rscv_y_pred = rscv.predict(x_test)

Fitting 5 folds for each of 5 candidates, totalling 25 fits


In [24]:
# Report the best mean CV score found by the randomized search and its parameters.
for caption, value in (
    ("Best Score: ", rscv.best_score_),
    ("Best Params: ", rscv.best_params_),
):
    print(caption, value)

Best Score:  0.9724818497956196
Best Params:  {'kernel': 'linear', 'gamma': 1, 'C': 1}


In [25]:
# Collect the per-candidate results of the randomized search into a DataFrame.
rscv_df = pd.DataFrame(rscv.cv_results_)
# Display all sampled candidates with their per-fold and mean scores.
rscv_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_kernel,param_gamma,param_C,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.361175,0.007493,0.161902,0.009397,rbf,100,30,"{'kernel': 'rbf', 'gamma': 100, 'C': 30}",0.502252,0.5,0.501129,0.501129,0.501129,0.501128,0.000712,5
1,0.232055,0.049915,0.003997,0.000633,linear,auto,20,"{'kernel': 'linear', 'gamma': 'auto', 'C': 20}",0.984234,0.966216,0.96614,0.970655,0.970655,0.97158,0.006636,2
2,0.040573,0.003609,0.003998,0.000633,linear,1,1,"{'kernel': 'linear', 'gamma': 1, 'C': 1}",0.986486,0.966216,0.96614,0.975169,0.968397,0.972482,0.00774,1
3,0.14631,0.005001,0.030581,0.00049,sigmoid,0.1,10,"{'kernel': 'sigmoid', 'gamma': 0.1, 'C': 10}",0.704955,0.720721,0.715576,0.699774,0.717833,0.711772,0.008021,4
4,0.052168,0.004704,0.004397,0.000489,poly,10,10,"{'kernel': 'poly', 'gamma': 10, 'C': 10}",0.972973,0.961712,0.959368,0.972912,0.959368,0.965267,0.006326,3


## Observation: 
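### We saw that RandomizedSearchCV found a parameter set with almost equal accuracy (0.9725 vs. 0.9797) using just 25 fits, whereas GridSearchCV needed 240 fits. Note that the two searches used different `cv` values (5 vs. 2), so the scores are only roughly comparable.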