In [1]:
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plot

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
features = np.loadtxt('features.txt')

In [3]:
labels = Path('labels.txt').read_text().split('\n')

## The data needs some cleaning

In [4]:
np.unique(labels)

array(['Female', 'Male', 'Masculino', 'Please Select', 'Weiblich',
       '[female];', '[male];', 'adult', 'female', 'make', 'male', 'male;',
       'unknown'], dtype='<U13')

### Set 1=male 2=female

In [5]:
male_words = ['Male', 'Masculino', '[male];', 'male', 'male;']
female_words = ['Female', 'Weiblich', 'female', '[female];']

clean_labels = []
clean_features=  []
for feature, label in zip(features, labels):
    if label in male_words:
        clean_features.append(feature)
        clean_labels.append(1)
    elif label in female_words:
        clean_features.append(feature)
        clean_labels.append(2)
        
features = np.array(clean_features)
labels = np.array(clean_labels)        

## Train/Test Split

In [6]:
train_x, test_x, train_y, test_y = train_test_split(features, labels)

In [7]:
male_features = train_x[train_y==1]
male_labels = train_y[train_y == 1]

female_features = train_x[train_y==2]
female_labels = train_y[train_y == 2]

female_features = np.row_stack([female_features] * int(len(male_labels)/len(female_labels)))
female_labels = np.concatenate([female_labels] * int(len(male_labels)/len(female_labels)))

train_x = np.row_stack((male_features, female_features))
train_y = np.concatenate((male_labels, female_labels))

## Train a Model

In [9]:
# model = RandomForestClassifier()
model = GradientBoostingClassifier(n_estimators=500, max_depth=5, subsample=0.8, verbose=1)
model.fit(train_x, train_y)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.2472           0.1367           10.56m
         2           1.1329           0.1139           10.42m
         3           1.0373           0.0955           10.33m
         4           0.9555           0.0812           10.29m
         5           0.8862           0.0693           10.24m
         6           0.8247           0.0604           10.19m
         7           0.7716           0.0535           10.18m
         8           0.7238           0.0478           10.14m
         9           0.6803           0.0419           10.12m
        10           0.6433           0.0370           10.09m
        20           0.4259           0.0138            9.88m
        30           0.3324           0.0063            9.69m
        40           0.2811           0.0041            9.52m
        50           0.2462           0.0025            9.34m
        60           0.2221           0.0024            9.17m
       

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=5,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=0.8, tol=0.0001,
                           validation_fraction=0.1, verbose=1,
                           warm_start=False)

## Evaluate it

In [10]:
print(classification_report(test_y, model.predict(test_x), target_names=['Male', 'Female']))

              precision    recall  f1-score   support

        Male       0.99      0.99      0.99     15999
      Female       0.92      0.95      0.93      2944

    accuracy                           0.98     18943
   macro avg       0.96      0.97      0.96     18943
weighted avg       0.98      0.98      0.98     18943

