## Gender Recognition by Voice
The following notebook is based on the dataset provided by Kaggle: https://www.kaggle.com/primaryobjects/voicegender          
It contains 3168 voice samples, evenly split between male and female speakers. We'll be using SVMs to classify whether a particular voice sample is from a male or female speaker.

In [61]:
import numpy as np
import pandas as pd

In [62]:
# Read the voice-gender CSV from the current working directory into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./voice.csv')

In [63]:
# Class balance check: overall row count plus the number of rows per label.
n_total = df.shape[0]
n_male = (df.label == 'male').sum()
n_female = (df.label == 'female').sum()
print('total samples: {}'.format(n_total))
print('male samples: {}, female samples: {}'.format(n_male, n_female))

total samples: 3168
male samples: 1584, female samples: 1584


In [64]:
# The classes are perfectly balanced (1584 samples each), so always predicting a
# single class (e.g. 'male') gives a 50% baseline accuracy.

# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column
df.describe()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,mode,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx
count,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0,3168.0
mean,0.180907,0.057126,0.185621,0.140456,0.224765,0.084309,3.140168,36.568461,0.895127,0.408216,0.165282,0.180907,0.142807,0.036802,0.258842,0.829211,0.052647,5.047277,4.99463,0.173752
std,0.029918,0.016652,0.03636,0.04868,0.023639,0.042783,4.240529,134.928661,0.04498,0.177521,0.077203,0.029918,0.032304,0.01922,0.030077,0.525205,0.063299,3.521157,3.520039,0.119454
min,0.039363,0.018363,0.010975,0.000229,0.042946,0.014558,0.141735,2.068455,0.738651,0.036876,0.0,0.039363,0.055565,0.009775,0.103093,0.007812,0.004883,0.007812,0.0,0.0
25%,0.163662,0.041954,0.169593,0.111087,0.208747,0.04256,1.649569,5.669547,0.861811,0.258041,0.118016,0.163662,0.116998,0.018223,0.253968,0.419828,0.007812,2.070312,2.044922,0.099766
50%,0.184838,0.059155,0.190032,0.140286,0.225684,0.09428,2.197101,8.318463,0.901767,0.396335,0.186599,0.184838,0.140519,0.04611,0.271186,0.765795,0.023438,4.992188,4.945312,0.139357
75%,0.199146,0.06702,0.210618,0.175939,0.24366,0.114175,2.931694,13.648905,0.928713,0.533676,0.221104,0.199146,0.169581,0.047904,0.277457,1.177166,0.070312,7.007812,6.992188,0.209183
max,0.251124,0.115273,0.261224,0.247347,0.273469,0.252225,34.725453,1309.612887,0.981997,0.842936,0.28,0.251124,0.237636,0.204082,0.279114,2.957682,0.458984,21.867188,21.84375,0.932374


In [65]:
# Missing-value count for every column, followed by the (rows, columns) shape.
missing_per_column = df.isnull().sum()
print(missing_per_column)
print('Shape: {}'.format(df.shape))

meanfreq    0
sd          0
median      0
Q25         0
Q75         0
IQR         0
skew        0
kurt        0
sp.ent      0
sfm         0
mode        0
centroid    0
meanfun     0
minfun      0
maxfun      0
meandom     0
mindom      0
maxdom      0
dfrange     0
modindx     0
label       0
dtype: int64
Shape: (3168, 21)


In [66]:
# Features are every column except the last; the target is the last column.
*feature_names, target_name = df.columns
X = df[feature_names]
Y = df[target_name]

In [67]:
# Encode the string labels as integers and keep the result (previously the
# encoded array was only displayed and Y stayed a column of strings).
# LabelEncoder sorts the classes, so 'female' -> 0 and 'male' -> 1.
# The encoder is fitted on df.label rather than on Y so that re-running this
# cell always sees the original strings.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y = le.fit_transform(df.label)
# Last expression: show the encoded labels as the cell output.
Y

array([1, 1, 1, ..., 0, 0, 0], dtype=int64)

In [68]:
# Standardise every feature column with StandardScaler.
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split performed later, so test-set statistics leak into the scaling; consider
# fitting it on the training portion only.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
X = scaler.transform(X)

In [69]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=32,
)

In [70]:
# Baseline model: SVC with all hyperparameters left at their defaults.
from sklearn.svm import SVC
from sklearn import metrics
clf = SVC()
# Last expression: fit() returns the estimator, which is shown as the cell output.
clf.fit(X=X_train, y=Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [71]:
# Score the fitted classifier on the held-out test set.
Y_pred = clf.predict(X_test)
test_accuracy = metrics.accuracy_score(y_true=Y_test, y_pred=Y_pred)
print('Accuracy Score: {}'.format(test_accuracy))

Accuracy Score: 0.985804416404


In [72]:
# Same experiment with kernel='linear': fit on the training split, then report
# accuracy on the test split.
clf = SVC(kernel='linear').fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
linear_kernel_accuracy = metrics.accuracy_score(Y_test, Y_pred)
print('Accuracy Score: {}'.format(linear_kernel_accuracy))

Accuracy Score: 0.982649842271


In [73]:
# LinearSVC trained with the hinge loss on the training split.
from sklearn.svm import LinearSVC
lin_clf = LinearSVC(loss="hinge")
# Last expression: fit() returns the estimator, which is shown as the cell output.
lin_clf.fit(X=X_train, y=Y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=1000, multi_class='ovr',
     penalty='l2', random_state=None, tol=0.0001, verbose=0)

In [74]:
# Test-set accuracy of the LinearSVC model.
Y_pred = lin_clf.predict(X_test)
linear_svc_accuracy = metrics.accuracy_score(Y_test, Y_pred)
print('Accuracy Score: {}'.format(linear_svc_accuracy))

Accuracy Score: 0.982649842271


In [75]:
# Tune C and gamma of an SVC with an exhaustive, cross-validated grid search.
# Requires numpy as `np` (imported at the top of the notebook).
from sklearn.model_selection import GridSearchCV
C = np.linspace(0.01,3,20)
# np.linspace(0, 3, 20) starts at gamma = 0, which makes the RBF kernel constant
# and is not a usable setting; skip that first point and keep the other 19.
gamma = np.linspace(0,3,20)[1:]
parameters = {'C':C,'gamma':gamma}
svc = SVC()
grid_search_clf = GridSearchCV(svc,parameters)
# Fits one model per (C, gamma) pair and cross-validation fold on the training split.
grid_search_clf.fit(X_train,Y_train)
print("The best parameters are %s with a score of %0.2f"
      % (grid_search_clf.best_params_, grid_search_clf.best_score_))

The best parameters are {'C': 1.2689473684210528, 'gamma': 0.15789473684210525} with a score of 0.98


In [80]:
# Evaluate the grid-search result on the held-out test set.
Y_pred = grid_search_clf.predict(X_test)
tuned_accuracy = metrics.accuracy_score(Y_test, Y_pred)
print('Accuracy Score: {}'.format(tuned_accuracy))

Accuracy Score: 0.984227129338


## References
https://datascience.stackexchange.com/questions/21877/how-to-use-the-output-of-gridsearch/21888