In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import cross_val_score, train_test_split

In [111]:
df = pd.read_csv('voice.csv')

In [112]:
df.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,male
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male


In [113]:
df['gender_code'] = [1 if x == 'female' else 0 for x in df['label']]

Here, we label the genders with 0 or 1. 

In [114]:
male = df[df['gender_code'] == 0]
female = df[df['gender_code'] == 1]

In [115]:
male[['meanfreq','IQR','sfm']].describe()

Unnamed: 0,meanfreq,IQR,sfm
count,1584.0,1584.0,1584.0
mean,0.170813,0.110784,0.47167
std,0.026254,0.020415,0.150473
min,0.039363,0.021841,0.080963
25%,0.155625,0.10096,0.363316
50%,0.176343,0.10994,0.461636
75%,0.190593,0.119331,0.576902
max,0.225582,0.196168,0.831347


In [116]:
female[['meanfreq','IQR','sfm']].describe()

Unnamed: 0,meanfreq,IQR,sfm
count,1584.0,1584.0,1584.0
mean,0.191,0.057834,0.344763
std,0.02996,0.042924,0.179854
min,0.078847,0.014558,0.036876
25%,0.177031,0.031106,0.208125
50%,0.192732,0.042689,0.277228
75%,0.211981,0.061268,0.478122
max,0.251124,0.252225,0.842936


In [117]:
cols = [x for x in df.columns if x not in ['label','gender_code']]

In [118]:
x = df[cols]
y = df['gender_code']

In [119]:
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=.3,stratify=y)

In [120]:
print x_train.shape
print y_train.shape
print x_test.shape
print y_test.shape

(2216, 20)
(2216,)
(952, 20)
(952,)


In [129]:
rf = RandomForestClassifier()

In [122]:
print x.shape
print y.shape

(3168, 20)
(3168,)


In [46]:
pd.DataFrame({'columns':x.columns,'importances':rf.feature_importances_}).sort_values('importances',ascending=False)

Unnamed: 0,columns,importances
12,meanfun,0.345532
5,IQR,0.214203
9,sfm,0.104121
3,Q25,0.099171
1,sd,0.075121
8,sp.ent,0.047974
11,centroid,0.012346
10,mode,0.011702
4,Q75,0.009757
2,median,0.009301


In [55]:
df.corr()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,gender_code
meanfreq,1.0,-0.739039,0.925445,0.911416,0.740997,-0.627605,-0.322327,-0.316036,-0.601203,-0.784332,...,1.0,0.460844,0.383937,0.274004,0.536666,0.229261,0.519528,0.51557,-0.216979,0.337415
sd,-0.739039,1.0,-0.562603,-0.846931,-0.161076,0.87466,0.314597,0.346241,0.71662,0.838086,...,-0.739039,-0.466281,-0.345609,-0.129662,-0.482726,-0.357667,-0.482278,-0.475999,0.12266,-0.479539
median,0.925445,-0.562603,1.0,0.774922,0.731849,-0.477352,-0.257407,-0.243382,-0.502005,-0.66169,...,0.925445,0.414909,0.337602,0.251328,0.455943,0.191169,0.438919,0.435621,-0.213298,0.283919
Q25,0.911416,-0.846931,0.774922,1.0,0.47714,-0.874189,-0.319475,-0.350182,-0.648126,-0.766875,...,0.911416,0.545035,0.320994,0.199841,0.467403,0.302255,0.459683,0.454394,-0.141377,0.511455
Q75,0.740997,-0.161076,0.731849,0.47714,1.0,0.009636,-0.206339,-0.148881,-0.174905,-0.378198,...,0.740997,0.155091,0.258002,0.285584,0.359181,-0.02375,0.335114,0.335648,-0.216475,-0.066906
IQR,-0.627605,0.87466,-0.477352,-0.874189,0.009636,1.0,0.249497,0.316185,0.640813,0.663601,...,-0.627605,-0.534462,-0.22268,-0.069588,-0.333362,-0.357037,-0.337877,-0.331563,0.041252,-0.618916
skew,-0.322327,0.314597,-0.257407,-0.319475,-0.206339,0.249497,1.0,0.97702,-0.195459,0.079694,...,-0.322327,-0.167668,-0.216954,-0.080861,-0.336848,-0.061608,-0.305651,-0.30464,-0.169325,-0.036627
kurt,-0.316036,0.346241,-0.243382,-0.350182,-0.148881,0.316185,0.97702,1.0,-0.127644,0.109884,...,-0.316036,-0.19456,-0.203201,-0.045667,-0.303234,-0.103313,-0.2745,-0.272729,-0.205539,-0.087195
sp.ent,-0.601203,0.71662,-0.502005,-0.648126,-0.174905,0.640813,-0.195459,-0.127644,1.0,0.866411,...,-0.601203,-0.513194,-0.305826,-0.120738,-0.293562,-0.294869,-0.324253,-0.319054,0.198074,-0.490552
sfm,-0.784332,0.838086,-0.66169,-0.766875,-0.378198,0.663601,0.079694,0.109884,0.866411,1.0,...,-0.784332,-0.421066,-0.3621,-0.192369,-0.428442,-0.289593,-0.436649,-0.43158,0.211477,-0.357499


One really cool thing about ensemble classifiers is that we can figure out what variables we should pay most attention to.
Very useful for feature selection.

In [126]:
pipe_rf = Pipeline([('scaler', StandardScaler()), 
                    ('pca', PCA(n_components=10)),
                    ('rf', estimator)])

pipe_rf.fit(x_train, y_train)
predictions = pipe_rf.predict(x_test)
print('Test Accuracy: %.3f' % pipe_rf.score(x_test, y_test))

Test Accuracy: 0.938


In [56]:
print '\nClassification Report:\n', classification_report(y_test, predictions)


Classification Report:
             precision    recall  f1-score   support

          0       0.95      0.96      0.96       476
          1       0.96      0.95      0.96       476

avg / total       0.96      0.96      0.96       952



In [58]:
print '\nConfusion Matrix:\n', confusion_matrix(y_test, predictions)


Confusion Matrix:
[[459  17]
 [ 25 451]]


Our model looks pretty good, but I am curious to see how the misclassfied 42 people were classified that way.

In [69]:
comparisons = pd.concat([y_test.reset_index(),pd.DataFrame(predictions)],axis=1)

In [71]:
comparisons.head()

Unnamed: 0,index,gender_code,0
0,302,0,0
1,59,0,0
2,535,0,1
3,2885,1,1
4,2086,1,1


In [82]:
misgendered = []
for i in range(len((comparisons))):
    if comparisons['gender_code'][i] != comparisons[0][i]:
        misgendered.append(comparisons['index'][i])
    else:
        pass

In [110]:
for a in misgendered:
     print df.ix[a,['meanfun','IQR','sfm','gender_code']]

meanfun        0.120921
IQR            0.056032
sfm            0.564341
gender_code           0
Name: 535, dtype: object
meanfun         0.141281
IQR            0.0690856
sfm             0.460696
gender_code            1
Name: 2227, dtype: object
meanfun         0.127532
IQR            0.0963057
sfm             0.586907
gender_code            1
Name: 3001, dtype: object
meanfun         0.12442
IQR            0.036618
sfm            0.137412
gender_code           0
Name: 265, dtype: object
meanfun        0.120959
IQR            0.152575
sfm            0.772289
gender_code           0
Name: 633, dtype: object
meanfun         0.172847
IQR            0.0481617
sfm            0.0809634
gender_code            0
Name: 304, dtype: object
meanfun        0.124927
IQR            0.113724
sfm            0.636657
gender_code           1
Name: 1911, dtype: object
meanfun        0.183974
IQR            0.159576
sfm            0.652936
gender_code           1
Name: 3152, dtype: object
meanfun        0