# Gender classification

## 1. Main goal

Given a person's height, weight and shoe size, predict their gender.

## 2. Data

In [49]:
# Toy dataset. Each entry pairs one person's measurements,
# [height, weight, shoe size], with the gender label for that person,
# so a measurement row and its label cannot drift out of alignment.
samples = [
    ([181, 80, 44], 'male'),
    ([177, 70, 43], 'female'),
    ([160, 60, 38], 'female'),
    ([154, 54, 37], 'female'),
    ([166, 65, 40], 'male'),
    ([190, 90, 47], 'male'),
    ([175, 64, 39], 'male'),
    ([177, 70, 40], 'female'),
    ([159, 55, 37], 'male'),
    ([171, 75, 42], 'female'),
    ([181, 85, 43], 'male'),
]

# Parallel lists in the shape the classifiers below are fitted on.
features = [measurements for measurements, _ in samples]
labels = [gender for _, gender in samples]

# Show every sample next to its label.
for measurements, gender in samples:
    print(measurements, gender)

[181, 80, 44] male
[177, 70, 43] female
[160, 60, 38] female
[154, 54, 37] female
[166, 65, 40] male
[190, 90, 47] male
[175, 64, 39] male
[177, 70, 40] female
[159, 55, 37] male
[171, 75, 42] female
[181, 85, 43] male


## 3. Model training

In [103]:
from time import time
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

### 3.1 Out of the box DecisionTreeClassifier

In [51]:
# Decision tree with default hyper-parameters. scikit-learn randomly permutes
# the candidate features at every split, so random_state is pinned to make the
# fitted tree identical on every Restart & Run All.
dt_clf = DecisionTreeClassifier(random_state=0)
t0 = time()
dt_clf.fit(features, labels)
# Wall-clock duration of fit(), in seconds.
print("training time:", round(time()-t0, 3), "s")

training time: 0.315 s


### 3.2 Out of the box KNeighborsClassifier

In [52]:
# k-nearest-neighbours classifier with default hyper-parameters, fitted on the
# full toy dataset; the wall-clock duration of fit() is reported in seconds.
knn_clf = KNeighborsClassifier()
fit_started = time()
knn_clf.fit(features, labels)
fit_seconds = round(time() - fit_started, 3)
print("training time:", fit_seconds, "s")

training time: 0.004 s


### 3.3 SVC with gamma = 'scale'

In [53]:
# Support-vector classifier with gamma='scale'; every other hyper-parameter is
# left at its default. The wall-clock duration of fit() is reported in seconds.
svc_clf = SVC(gamma='scale')
fit_started = time()
svc_clf.fit(features, labels)
fit_seconds = round(time() - fit_started, 3)
print("training time:", fit_seconds, "s")

training time: 0.053 s


### 3.4 SVC with gamma = 'auto'

In [62]:
# Support-vector classifier with gamma='auto'; every other hyper-parameter is
# left at its default. The wall-clock duration of fit() is reported in seconds.
svc_auto_clf = SVC(gamma='auto')
fit_started = time()
svc_auto_clf.fit(features, labels)
fit_seconds = round(time() - fit_started, 3)
print("training time:", fit_seconds, "s")

training time: 0.002 s


## 4. Validation

The accuracy score is used to validate the trained models. Because the available dataset is so small, the training set is reused as the validation set (validation_set == training_set), so the scores below are training accuracies. In a real project, a much better pipeline would implement a cross-validation strategy.

In [102]:
# Get the accuracy_score of each model
# and select the best one. Note that for
# this toy example validation_set == training_set,
# i.e. each model is scored on the same data it was fitted on.

models = [svc_auto_clf, svc_clf, knn_clf, dt_clf]
m_names = ['SVC (gamma=auto)', 'SVC (gamma=scale)', 'KNN', 'DecisionTree']
scores = []
# calc accuracy scores
for model, m_name in zip(models, m_names):
    scores.append(model.score(features, labels))
    print("{:0.4}\t{}".format(scores[-1], m_name))
# select the best model; np.argmax returns the FIRST maximum, so when several
# models share the top score the winner is decided only by the order of `models`
best_index = np.argmax(scores)
print("\nThe model with the highest accuracy ({:0.4}) is {}".format(scores[best_index], m_names[best_index]))
# make ties visible instead of silently hiding the other top-scoring models
tied_names = [
    m_names[i] for i, score in enumerate(scores)
    if i != best_index and score == scores[best_index]
]
if tied_names:
    print("Note: tied with {}; the first model in the list was selected".format(", ".join(tied_names)))
    

1.0	SVC (gamma=auto)
0.7273	SVC (gamma=scale)
0.5455	KNN
1.0	DecisionTree

The model with the highest accuracy (1.0) is SVC (gamma=auto)


## 5. Predict

In [104]:
# One unseen person to classify, in the same [height, weight, shoe size]
# order as the training samples. predict() expects a list of samples, hence
# the nested list.
height, weight, shoe_size = 190, 70, 43
test_case = [[height, weight, shoe_size]]

In [105]:
# Predict with the model chosen in the validation step instead of a hard-coded
# classifier, so this cell stays consistent if the data or the ranking changes.
models[best_index].predict(test_case)

array(['male'],
      dtype='<U6')