In [1]:
from importlib import reload

import json
import numpy as np
import pandas as pd

import torch
from torch.utils.data import DataLoader
import torch.optim as optim

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
import wgan.data as data
from wgan.models_cat import Generator, Critic
from wgan.training import WGAN
from wgan.evaluation import discriminator_evaluation
from wgan.models_cat import make_GANbalancer

In [3]:
from wgan.data import load_DMC10

In [4]:
X_train, X_test, y_train, y_test, idx_cont, idx_cat, cat_dict = load_DMC10("/Users/hauptjoh/Data/DMC10")

In [6]:
X_train, X_test, y_train, y_test, Xy_gan, idx_cont2, idx_cat2, scaler = data.prepare_data(X_train, y_train, X_test, y_test, 
                                                                                        idx_cont=idx_cont, idx_cat=idx_cat, 
                                                                                       cat_levels = [np.max(X_train[:,i])+1 for i in idx_cat])



In [10]:
emb_sizes = [int(min(10., np.ceil(x+1/2))) for x in Xy_gan.cat_levels]

In [20]:
generator, critic = make_GANbalancer(Xy_gan, 128, [64,64], [64,64], emb_sizes, auxiliary=False)

In [22]:
batch_size = 64
train_loader = DataLoader(Xy_gan, batch_size = batch_size, shuffle=True)
#test_loader = DataLoader(data_test, batch_size = batch_size, shuffle=False)

# Initialize optimizers
lr_G = 5e-5
lr_D = 5e-5
betas = (.9, .99)
G_optimizer = optim.Adam(generator.parameters(), lr=lr_G, betas=betas)
C_optimizer = optim.Adam(critic.parameters(), lr=lr_D, betas=betas)

In [16]:
trainer = WGAN(generator, critic, G_optimizer, C_optimizer, print_every=1000,
                  use_cuda=torch.cuda.is_available())

In [17]:
trainer.train(train_loader, 10)


Epoch 1
Iteration 1
D: 0.02111656218767166
GP: 8.844720840454102
Gradient norm: 0.059550996869802475

Epoch 2
Iteration 1
D: -0.47809159755706787
GP: 0.3886418044567108
Gradient norm: 0.9690327048301697
G: 0.3378964066505432
Distance: 0.47809159755706787

Epoch 3
Iteration 1
D: -2.4365949630737305
GP: 0.10617315024137497
Gradient norm: 1.093416452407837
G: 1.3715300559997559
Distance: 2.4365949630737305

Epoch 4
Iteration 1
D: -3.180450201034546
GP: 0.1919037401676178
Gradient norm: 1.138419270515442
G: 2.005605459213257
Distance: 3.180450201034546

Epoch 5
Iteration 1
D: -3.3453946113586426
GP: 0.20758435130119324
Gradient norm: 1.1440528631210327
G: 2.1219892501831055
Distance: 3.3453946113586426

Epoch 6
Iteration 1
D: -3.381187677383423
GP: 0.21828508377075195
Gradient norm: 1.1476640701293945
G: 2.2195801734924316
Distance: 3.381187677383423

Epoch 7
Iteration 1
D: -3.3247666358947754
GP: 0.20660720765590668
Gradient norm: 1.1434406042099
G: 2.126131057739258
Distance: 3.32476663

In [18]:
generator.training_iterations 

314

In [19]:
desc = f"DMC10_conditional_input128_hidden64-64"
torch.save(generator.state_dict(), f"../models/wgan_generator_{desc}_{generator.training_iterations}")
torch.save(critic.state_dict(), f"../models/wgan_critic_{desc}_{generator.training_iterations}")

In [23]:
file_name = "DMC10_minority_input128_hidden64-64_12492"
generator.load_state_dict(torch.load(f"../models/wgan_generator_{file_name}"))
critic.load_state_dict(torch.load(f"../models/wgan_critic_{file_name}"))

## Eye ball test


In [307]:

from scipy.stats import moment

In [308]:
X_fake = generator(generator.sample_latent(1500)).data.numpy()

In [33]:
def calc_moments(X):
    return np.array([np.mean(X), moment(X,2), moment(X,3), moment(X,4)])

In [34]:
np.round(pd.DataFrame(X_fake).apply(calc_moments).T,2)

Unnamed: 0,0,1,2,3
0,0.19,0.00,0.00,0.00
1,0.00,0.00,0.00,0.00
2,0.00,0.00,0.00,0.00
3,0.00,0.00,0.00,0.00
4,0.00,0.00,0.00,0.00
5,0.00,0.00,0.00,0.00
6,0.00,0.00,0.00,0.00
7,0.00,0.00,0.00,0.00
8,0.00,0.00,0.00,0.00
9,0.00,0.00,0.00,0.00


In [50]:
np.round(pd.DataFrame(X_train[y_train==1]).apply(calc_moments).T,2)

Unnamed: 0,0,1,2,3
0,0.15,0.13,0.09,0.08
1,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0


In [None]:
pd.DataFrame(X_fake).iloc[:,24].describe()

In [None]:
pd.DataFrame(X_train[y_train==1]).iloc[:,24].describe()

In [None]:
from wgan.evaluation import plot_distributions

In [None]:
X_fake = generator.sample_data(sum(y_train==1)).numpy()

In [None]:
X_fake2 = np.round(X_fake*10)

In [None]:
pd.DataFrame(X_fake2).iloc[:,4].value_counts()

In [None]:
pd.DataFrame(X_train2[y_train==1,:]).iloc[:,4].value_counts()

In [None]:
distributions = plot_distributions(X_train2[y_train==1,:][:,[1,2,4,12,14]],X_fake2[:,[1,2,4,12,14]])

## Predictive test

In [22]:
#clf = DecisionTreeClassifier(max_depth=10) #LogisticRegression(solver='saga') 
clf = RandomForestClassifier(n_estimators=50, min_samples_leaf=50, ) #LogisticRegression(solver='saga') 

In [25]:
X_fake = generator.sample_data(num_samples=int(sum(y_train==1)),class_index=1).numpy()

In [28]:
y_train

array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])

In [279]:
#X_fake[:,2:29] = np.round(X_fake[:,2:29])

In [26]:
X_fakereal = np.vstack([X_train[y_train==1], 
                        X_fake])
y_fakereal = np.concatenate([np.zeros(X_train[y_train==1].shape[0]), 
                        np.ones(X_fake.shape[0])]).flatten()

IndexError: boolean index did not match indexed array along dimension 1; dimension is 38 but corresponding boolean dimension is 1

In [283]:
clf = RandomForestClassifier(n_estimators=50, min_samples_leaf=10, n_jobs=10)
model_fakereal = clf.fit(X_fakereal, y_fakereal)

In [284]:

discriminator_evaluation(X_train[y_train==1], X_fake,clf)

(1.0, 1.0)

In [285]:
idx_cont

[11,
 12,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27,
 28,
 29,
 30,
 35,
 36,
 37]

In [289]:
pd.DataFrame(X_train[y_train==1,:][:,:]).mean()

0      0.154850
1      0.000000
2      0.000000
3      0.000165
4      0.000000
5      0.000165
6      0.000000
7      0.000165
8      0.000000
9      0.000000
10     0.000000
11     0.000000
12     0.000000
13     0.000000
14     0.000165
15     0.000000
16     0.000000
17     0.000000
18     0.000000
19     0.000000
20     0.503057
21     0.006776
22     7.583375
23     0.234507
24     1.081474
25     0.987275
26     0.253512
27    42.105272
28     0.038837
29     0.139481
30     0.986614
31     0.003966
32     0.457115
33     0.097670
34     3.927119
35     4.890597
36    15.052058
37     1.742852
dtype: float64

In [290]:
pd.DataFrame(X_fake).mean()

0      0.185596
1      0.000007
2      0.000009
3      0.000045
4      0.000007
5      0.000217
6      0.000010
7      0.000168
8      0.000012
9      0.000007
10     0.000004
11     0.000013
12     0.000012
13     0.000007
14     0.000233
15     0.000005
16     0.000017
17     0.000015
18     0.000007
19     0.000020
20     0.520079
21     0.013551
22     8.055859
23     0.257974
24     1.151380
25     0.977855
26     0.231036
27    42.698563
28     0.035366
29     0.165923
30     1.127086
31     0.005784
32     0.485540
33     0.090729
34     3.698893
35     4.613783
36    15.475128
37     1.638572
dtype: float32

In [287]:
np.round(pd.DataFrame(model_fakereal.feature_importances_),2)

Unnamed: 0,0
0,0.0
1,0.0
2,0.04
3,0.04
4,0.0
5,0.06
6,0.0
7,0.02
8,0.08
9,0.06


## Upsampling Test

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [25]:
from imblearn import FunctionSampler
from imblearn.over_sampling import SMOTE
from wgan.evaluation import make_GAN_sampler, upsampling_evaluation

In [26]:
from copy import deepcopy
from sklearn.metrics import roc_auc_score, accuracy_score, f1_score
def upsampling_evaluation(X_train, X_test, y_train, y_test, classifier, sampler):
    """
    Train classifier on resampled data and evaluate performance in terms of AUC

    Input
    -----
    sampler: imblearn object or None
    """
    model = deepcopy(classifier)

    # Sample synthetic SMOTE data
    if sampler:
        X_train, y_train =  sampler.fit_resample(X_train,y_train)

    model.fit(X=X_train, y=y_train)
    prob = model.predict_proba(X_test)[:,1]
    imb_ratio = np.mean(y_train)

    return roc_auc_score(y_test, prob), f1_score(y_test, 1.0*(prob>imb_ratio)), imb_ratio 

In [41]:
np.sum(y_train)/len(y_train)

0.1865980017269027

In [28]:
#sampling_target = {1:int(np.sum(y_train)*2)}
sampling_target = "auto"

In [31]:
smote = SMOTE(sampling_strategy = sampling_target, k_neighbors=100, n_jobs=20) #random_state=123, 

wgan_sampler = make_GAN_sampler(generator, sampling_target)

result = {"F1":[],"AUC":[]}
for sampler in [None, smote, wgan_sampler]:
    temp_result = {'F1':[],"AUC":[]}
    for _ in range(3):
        auc,f1,imb_ratio = upsampling_evaluation(X_train, X_test, y_train, y_test, 
                         #LogisticRegression(solver="lbfgs", max_iter=1e4),
                         #DecisionTreeClassifier(min_samples_leaf=50),
                         RandomForestClassifier(n_estimators=100, min_samples_leaf=100),
                         sampler)
        temp_result["F1"].append(f1)
        temp_result["AUC"].append(auc)
    
    result["F1"].append(np.mean(temp_result["F1"]))
    result["AUC"].append(np.mean(temp_result["AUC"]))

In [32]:
result

{'F1': [0.34482368359500964, 0.20994430417832977, 0.0],
 'AUC': [0.6110524597359962, 0.5433505076479904, 0.6110360025287358]}