In [50]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rc
rc('font', family='AppleGothic')
plt.rcParams['axes.unicode_minus'] = False

import warnings
warnings.filterwarnings('ignore')

In [2]:
pos_pos = pd.read_csv("pp.csv")
pos_neg = pd.read_csv("pn.csv")
neg_pos = pd.read_csv("np.csv")
neg_neg = pd.read_csv("nn.csv")

gender = pd.read_csv('all_gender.csv')

In [3]:
gender.head()

Unnamed: 0,vocal,label,gender
0,Final_dataset/이정_겨울봄.wav,4022,0
1,Final_dataset/버즈_거짓말.wav,4234,0
2,Final_dataset/김광석_너무 깊이 생각하지마.wav,4411,0
3,Final_dataset/소찬휘_크게 라디오를 켜고.wav,4229,1
4,Final_dataset/이적_당연한 것들.wav,4493,0


In [4]:
pos_pos.shape, pos_neg.shape, neg_pos.shape, neg_neg.shape

((256, 4500), (256, 4500), (256, 4500), (256, 4500))

In [5]:
np_df = neg_pos.T.reset_index()
df = np_df.merge(gender[['vocal', 'gender']], left_on='index', right_on='vocal', how='left').drop('vocal', axis=1)

In [6]:
ftr = df.drop(['index', 'gender'], axis=1)
target = df[['gender']]

In [90]:
from sklearn.model_selection import train_test_split

x_train, x_valid, y_train, y_valid = train_test_split(ftr, target, test_size=.1, stratify=target, random_state=42)

In [91]:
x_train.shape, x_valid.shape, y_train.shape, y_valid.shape

((4050, 256), (450, 256), (4050, 1), (450, 1))

### UMAP

In [45]:
# !pip install -q umap-learn

You should consider upgrading via the '/usr/bin/python3 -m pip install --upgrade pip' command.[0m


In [53]:
# import umap
# reducer = umap.UMAP(n_components=32)

# x_train = reducer.fit_transform(x_train)
# x_valid = reducer.transform(x_valid)
# x_train.shape, x_valid.shape

((3600, 32), (900, 32))

### Model

In [10]:
from sklearn.metrics import f1_score, classification_report
from sklearn.model_selection import KFold

cv = KFold(n_splits=5, shuffle=True, random_state=42)

### RandomForest

In [92]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=42, n_estimators=4000, class_weight='balanced_subsample')
rf.fit(x_train, y_train)
rf_pred = rf.predict(x_valid)

In [93]:
f1_score(rf_pred, y_valid)

0.7861635220125786

In [94]:
target['gender'].unique().tolist()

[0, 1]

In [95]:
print(classification_report(y_valid, rf_pred, target_names=['male', 'female']))

              precision    recall  f1-score   support

        male       0.84      0.93      0.88       276
      female       0.87      0.72      0.79       174

    accuracy                           0.85       450
   macro avg       0.85      0.82      0.83       450
weighted avg       0.85      0.85      0.85       450



### XGBoost

In [96]:
from xgboost import XGBClassifier
xgb = XGBClassifier(random_state=42, n_estimators=4000, class_weight='balanced_subsample')
xgb.fit(x_train, y_train)
xgb_pred = xgb.predict(x_valid)

Parameters: { class_weight } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




In [97]:
f1_score(xgb_pred, y_valid)

0.7734138972809668

In [98]:
print(classification_report(y_valid, xgb_pred, target_names=['male', 'female']))

              precision    recall  f1-score   support

        male       0.84      0.89      0.87       276
      female       0.82      0.74      0.77       174

    accuracy                           0.83       450
   macro avg       0.83      0.82      0.82       450
weighted avg       0.83      0.83      0.83       450



### NGBoost

In [99]:
#!pip install -q ngboost

In [100]:
from ngboost import NGBClassifier
from ngboost.distns import categorical
from ngboost.scores import Score, MLE, CRPS

ngb = NGBClassifier(random_state=42, n_estimators=2000,
                    Dist=categorical.k_categorical(2),
                    Base=default_tree_learner,
                    minibatch_frac=1.0,
                    Score=MLE)
ngb.fit(X=x_train, Y=y_train, X_val=x_valid, Y_val=y_valid, early_stopping_rounds=500)
ngb_pred = ngb.predict(x_valid)

[iter 0] loss=0.6672 val_loss=0.6489 scale=4.0000 norm=8.0000
[iter 100] loss=0.3324 val_loss=0.3835 scale=2.0000 norm=3.1882
[iter 200] loss=0.3073 val_loss=0.3784 scale=1.0000 norm=1.6088
[iter 300] loss=0.2973 val_loss=0.3814 scale=1.0000 norm=1.6083
[iter 400] loss=0.2922 val_loss=0.3858 scale=0.5000 norm=0.8038
[iter 500] loss=0.2901 val_loss=0.3909 scale=0.2500 norm=0.4030
[iter 600] loss=0.2886 val_loss=0.3940 scale=0.2500 norm=0.4026
== Early stopping achieved.
== Best iteration / VAL167 (val_loss=0.3775)


In [101]:
f1_score(ngb_pred, y_valid)

0.7774294670846394

In [102]:
print(classification_report(y_valid, ngb_pred, target_names=['male', 'female']))

              precision    recall  f1-score   support

        male       0.84      0.92      0.88       276
      female       0.86      0.71      0.78       174

    accuracy                           0.84       450
   macro avg       0.85      0.82      0.83       450
weighted avg       0.84      0.84      0.84       450



### ExtraTrees

In [103]:
from sklearn.ensemble import ExtraTreesClassifier
ext = ExtraTreesClassifier(random_state=42, n_estimators=4000, class_weight='balanced')
ext.fit(x_train, y_train)
ext_pred = ext.predict(x_valid)

In [104]:
f1_score(ext_pred, y_valid)

0.7749999999999999

In [105]:
print(classification_report(y_valid, ext_pred, target_names=['male', 'female']))

              precision    recall  f1-score   support

        male       0.84      0.92      0.88       276
      female       0.85      0.71      0.77       174

    accuracy                           0.84       450
   macro avg       0.84      0.82      0.83       450
weighted avg       0.84      0.84      0.84       450



### TabNet

In [106]:
#!pip install -q pytorch-tabnet

In [107]:
x_train = x_train.reset_index(drop=True)
x_valid = x_valid.reset_index(drop=True)

y_train = y_train.reset_index(drop=True)
y_valid = y_valid.reset_index(drop=True)

In [108]:
x_train.values.shape, y_train.values.shape

((4050, 256), (4050, 1))

In [109]:
import torch

if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(DEVICE)

cuda


In [110]:
from pytorch_tabnet.pretraining import TabNetPretrainer

unsupervised_model = TabNetPretrainer(optimizer_fn=torch.optim.Adam,
                                      optimizer_params=dict(lr=2e-2),
                                      mask_type="entmax",
                                      n_shared_decoder=1,
                                      n_indep_decoder=1)
unsupervised_model.fit(X_train=x_train.values,
                       eval_set=[x_valid.values],
                       pretraining_ratio=0.8,
                       batch_size=128,
                       virtual_batch_size=64,
                       drop_last=False,)

epoch 0  | loss: 162.01428| val_0_unsup_loss_numpy: 2.7197799682617188|  0:00:01s
epoch 1  | loss: 1.64335 | val_0_unsup_loss_numpy: 1.1120799779891968|  0:00:02s
epoch 2  | loss: 0.95234 | val_0_unsup_loss_numpy: 1.2578699588775635|  0:00:03s
epoch 3  | loss: 0.8974  | val_0_unsup_loss_numpy: 1.2454899549484253|  0:00:04s
epoch 4  | loss: 0.8855  | val_0_unsup_loss_numpy: 1.0815800428390503|  0:00:05s
epoch 5  | loss: 0.83056 | val_0_unsup_loss_numpy: 1.0029900074005127|  0:00:06s
epoch 6  | loss: 0.78682 | val_0_unsup_loss_numpy: 1.0521299839019775|  0:00:08s
epoch 7  | loss: 0.72826 | val_0_unsup_loss_numpy: 1.0008000135421753|  0:00:09s
epoch 8  | loss: 0.68412 | val_0_unsup_loss_numpy: 0.977840006351471|  0:00:10s
epoch 9  | loss: 0.65076 | val_0_unsup_loss_numpy: 0.951170027256012|  0:00:11s
epoch 10 | loss: 0.62476 | val_0_unsup_loss_numpy: 0.8807399868965149|  0:00:12s
epoch 11 | loss: 0.58777 | val_0_unsup_loss_numpy: 0.8507400155067444|  0:00:14s
epoch 12 | loss: 0.56576 | va

In [111]:
from pytorch_tabnet.tab_model import TabNetClassifier

clf_partial = TabNetClassifier(
    optimizer_fn=torch.optim.Adam,
    optimizer_params=dict(lr=2e-2),
    scheduler_params={"step_size":10, "gamma":0.9},
    mask_type='entmax'
)

clf_partial.fit(
    X_train=x_train.values, y_train=y_train['gender'].values,
    patience=20,
    eval_set=[(x_train.values, y_train['gender'].values), (x_valid.values, y_valid['gender'].values)],
    eval_name=['train', 'valid'],
    eval_metric=['accuracy'],
    from_unsupervised=unsupervised_model,
    batch_size=128, virtual_batch_size=64, weights=1, drop_last=False
)

epoch 0  | loss: 0.42485 | train_accuracy: 0.83778 | valid_accuracy: 0.83778 |  0:00:01s
epoch 1  | loss: 0.39756 | train_accuracy: 0.81778 | valid_accuracy: 0.80889 |  0:00:02s
epoch 2  | loss: 0.37092 | train_accuracy: 0.84469 | valid_accuracy: 0.84444 |  0:00:03s
epoch 3  | loss: 0.36949 | train_accuracy: 0.82148 | valid_accuracy: 0.81111 |  0:00:04s
epoch 4  | loss: 0.3808  | train_accuracy: 0.83531 | valid_accuracy: 0.81778 |  0:00:05s
epoch 5  | loss: 0.37104 | train_accuracy: 0.84222 | valid_accuracy: 0.84222 |  0:00:06s
epoch 6  | loss: 0.36964 | train_accuracy: 0.85259 | valid_accuracy: 0.85111 |  0:00:08s
epoch 7  | loss: 0.36824 | train_accuracy: 0.84765 | valid_accuracy: 0.84222 |  0:00:09s
epoch 8  | loss: 0.37929 | train_accuracy: 0.84815 | valid_accuracy: 0.83111 |  0:00:10s
epoch 9  | loss: 0.37217 | train_accuracy: 0.84889 | valid_accuracy: 0.85111 |  0:00:11s
epoch 10 | loss: 0.35265 | train_accuracy: 0.82222 | valid_accuracy: 0.81778 |  0:00:12s
epoch 11 | loss: 0.36

In [112]:
tn_pred = clf_partial.predict(x_valid.values)

In [113]:
f1_score(tn_pred, y_valid)

0.8130563798219584

In [114]:
print(classification_report(y_valid, tn_pred, target_names=['male', 'female']))

              precision    recall  f1-score   support

        male       0.87      0.91      0.89       276
      female       0.84      0.79      0.81       174

    accuracy                           0.86       450
   macro avg       0.86      0.85      0.85       450
weighted avg       0.86      0.86      0.86       450

