In [8]:
import numpy as np
import scipy.stats as sp
import pandas as pd
from sklearn.preprocessing import minmax_scale, scale, MinMaxScaler

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [9]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss

In [12]:
from wgan.simulation import create_continuous_data
from imblearn.over_sampling import SMOTE, ADASYN, SMOTENC
from wgan.imblearn import GANbalancer
import wgan.data_loader

## Artifical Data Generation

In [13]:
data = {
    "Independent" : create_continuous_data(n_samples=1000, n_var=5, n_dependent=0, pos_ratio=0),
    "Dependent" : create_continuous_data(n_samples=1000, n_var=5, n_dependent=5, pos_ratio=0),
    "Mixed" : create_continuous_data(n_samples=1000, n_var=10, n_dependent=5, pos_ratio=0)
}
    

## First and Second Moment Approximation

In [34]:
gan_balancer = GANbalancer(idx_cont=range(5), categorical=None, auxiliary=False,
                           generator_layers=[10], critic_layers=[10],
                          batch_size = 128, n_iter=10000)

In [36]:
gan_balancer._fit(data["Independent"][0], y=np.random.binomial(1,0.5,size=1000))

RuntimeError: size mismatch, m1: [128 x 17], m2: [15 x 1] at /Users/administrator/nightlies/pytorch-1.0.0/wheel_build_dirs/conda_3.6/conda/conda-bld/pytorch_1544137972173/work/aten/src/TH/generic/THTensorMath.cpp:940

In [None]:
%debug

> [0;32m/Users/hauptjoh/anaconda/envs/deeplearning/lib/python3.6/site-packages/torch/nn/functional.py[0m(1352)[0;36mlinear[0;34m()[0m
[0;32m   1350 [0;31m    [0;32mif[0m [0minput[0m[0;34m.[0m[0mdim[0m[0;34m([0m[0;34m)[0m [0;34m==[0m [0;36m2[0m [0;32mand[0m [0mbias[0m [0;32mis[0m [0;32mnot[0m [0;32mNone[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1351 [0;31m        [0;31m# fused op is marginally faster[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m-> 1352 [0;31m        [0mret[0m [0;34m=[0m [0mtorch[0m[0;34m.[0m[0maddmm[0m[0;34m([0m[0mtorch[0m[0;34m.[0m[0mjit[0m[0;34m.[0m[0m_unwrap_optional[0m[0;34m([0m[0mbias[0m[0;34m)[0m[0;34m,[0m [0minput[0m[0;34m,[0m [0mweight[0m[0;34m.[0m[0mt[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1353 [0;31m    [0;32melse[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m   1354 [0;31m        [0moutput[0m [0;34m=[0m [0minpu

## Visual test

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
combinations = [(x,y) for x in range(no_vars) for y in range(no_vars) if y>x]

In [None]:
fig, axes = plt.subplots(nrows=no_vars, ncols=no_vars, sharex=True, sharey=True, squeeze=True,figsize=(10,10))
for y in axes:
    for x in y:
        x.set_xticklabels([])
        x.set_yticklabels([])

for i,j in combinations:
    sns.kdeplot(X_majority[:,i], X_majority[:,j], alpha=0.5, cmap="Blues", ax=axes[(j,i)])
    sns.kdeplot(X_minority[:,i], X_minority[:,j], alpha=0.5, cmap="Greens", ax=axes[(j,i)])
fig.savefig(f'../img/cont_sample_tr_iter_{trainer.G.training_iterations}.png',format='png', dpi=100)
    #fig.show()

In [None]:
epochs = 90

for _ in range(30):
    trainer.train(data_loader, epochs)
    
    
    if modus == 'full':
        fake_minority = generator(*generator.sample_latent(num_samples= 1000, class_index=1)).data.numpy()
        fake_majority = generator(*generator.sample_latent(num_samples= 1000, class_index=0)).data.numpy()
    elif modus == 'minority':
        fake_minority = generator(generator.sample_latent(num_samples= 1000)).data.numpy()
        
    fig, axes = plt.subplots(nrows=no_vars, ncols=no_vars, sharex=True, squeeze=True,figsize=(10,10))
    for y in axes:
        for x in y:
            x.set_xticklabels([])
            x.set_yticklabels([])
    
    for i in range(no_vars):
        sns.kdeplot(X_minority[:,i], alpha=0.5, shade=True, color="blue", ax=axes[(i,i)])
        sns.kdeplot(fake_minority[:,i], alpha=0.5, shade=True, color="green", ax=axes[(i,i)])
    
    for i,j in combinations:
        axes[(i,j)].set_ylim(0,1)
        # majority (upper right)
        if modus == 'full':
            sns.kdeplot(X_majority[0:1000,i], X_majority[0:1000,j], alpha=0.5, cmap="Blues", ax=axes[(i,j)])
            sns.kdeplot(fake_majority[:,i], fake_majority[:,j], alpha=0.5, cmap="Greens", ax=axes[(i,j)], )
        
        # minority (lower left)
        sns.kdeplot(X_minority[:,i], X_minority[:,j], alpha=0.5, cmap="Blues", ax=axes[(j,i)])
        sns.kdeplot(fake_minority[:,i], fake_minority[:,j], alpha=0.5, cmap="Greens", ax=axes[(j,i)])
        
    fig.savefig(f'../img/cont_sample_tr_iter_{trainer.G.training_iterations}.png',format='png', dpi=200)
        #fig.show()

In [None]:
desc = f"multinormal_n{N//1000}_k{no_vars}_{modus}"
torch.save(generator.state_dict(), f"../models/wgan_generator_{desc}_{generator.training_iterations}")
torch.save(discriminator.state_dict(), f"../models/wgan_discriminator_{desc}_{generator.training_iterations}")

In [None]:
file_name = "multinormal_n10_k4_c2_6999"
generator.load_state_dict(torch.load(f"../models/wgan_generator_{file_name}"))
discriminator.load_state_dict(torch.load(f"../models/wgan_discriminator_{file_name}"))

## Distribution summary statistics

In [None]:
from torch import Tensor as T

In [None]:
fake_minority = generator(*generator.sample_latent(num_samples= minority_samples, class_index=1)).data.numpy()

In [None]:
print(np.mean(X_minority, axis=0))
print(np.mean(fake_minority, axis=0))

In [None]:
print(np.quantile(X_minority, q=np.arange(0,1,0.1), axis=0))
print(np.quantile(fake_minority, q=np.arange(0,1,0.1), axis=0))

In [None]:
print(np.cov(X_minority, rowvar=False) - np.cov(fake_minority,rowvar=False))


## Discriminator test

In [None]:
sample_size = X_minority.shape[0]

In [None]:
fake = generator(*generator.sample_latent(num_samples= sample_size, class_index=1)).data.numpy()
#fake = generator(generator.sample_latent(num_samples= sample_size)).data.numpy()

In [None]:
X_fakereal = np.vstack([X_minority, 
                        fake])
y_fakereal = np.concatenate([np.zeros(X_minority.shape[0]), 
                        np.ones(fake.shape[0])]).flatten()

In [None]:
clf = RandomForestClassifier(n_estimators=50, min_samples_leaf=20, n_jobs=10)
model_fakereal = clf.fit(X_fakereal, y_fakereal)

In [None]:
pred_fakereal = model_fakereal.predict_proba(X_fakereal)[:,1]
roc_auc_score(y_fakereal, pred_fakereal)

# Predictive performance testing

In [None]:
y_train_bin = np.argmax(y_train, axis=1)
y_test_bin = np.argmax(y_test, axis=1)

In [None]:
def test_auc(model_library, X, y_true):
    auc = {}
    for model in model_library.keys():
        pred = model_library[model].predict_proba(X_test)[:,1]
        auc[model] = roc_auc_score(y_true, pred)
    return auc

## Predictive test

In [None]:
minority_samples = X_minority.shape[0]
majority_samples = X_majority.shape[0]

fake_minority = generator(*generator.sample_latent(num_samples= minority_samples, class_index=1)).data.numpy()
fake_majority = generator(*generator.sample_latent(num_samples= majority_samples, class_index=0)).data.numpy()

X_synthetic = np.vstack([fake_majority, 
                         fake_minority])
y_synthetic = np.concatenate([np.zeros(majority_samples), 
                              np.ones(minority_samples)]).flatten()

In [None]:
clf_org = DecisionTreeClassifier(max_depth=10) #LogisticRegression(solver='saga') 
clf_fake = DecisionTreeClassifier(max_depth=10) #LogisticRegression(solver='saga')

predictive = {}
predictive["real"] = clf_org.fit(X=X_train, y=y_train_bin)
predictive["synthetic"] = clf_fake.fit(X=X_synthetic, y=y_synthetic)

test_auc(predictive, X_test, y_test)

## Upsampling performance

In [None]:
performance = {"original":[],"GANbalanced":[]}
for i in range(200):
    sample_size = X_minority.shape[0]*4
    X_fake = generator(*generator.sample_latent(num_samples= sample_size, class_index=1)).data.numpy()
    #X_fake = generator(generator.sample_latent(num_samples= sample_size, class_index=None)).data.numpy()
    y_fake = np.ones(shape=[sample_size])

    X_up = np.vstack([X_train,X_fake])
    y_up = np.hstack([y_train_bin,y_fake])

    clf_org = DecisionTreeClassifier(max_depth=5)
    clf_fake = DecisionTreeClassifier(max_depth=5)

    upsampling = {}
    upsampling["original"] =  clf_org.fit(X=X_train, y=y_train_bin)
    upsampling["GANbalanced"] = clf_fake.fit(X=X_up, y=y_up)
    
    performance_temp = test_auc(upsampling, X_test, y_test_bin)
    for model in performance_temp:
        performance[model].append(performance_temp[model])
    

In [None]:
print(pd.DataFrame(performance).mean())
print(pd.DataFrame(performance).std())

In [None]:
def plot_decision_function(X, y, clf, ax):
    plot_step = 0.02
    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    ax.contourf(xx, yy, Z, alpha=0.4)
    ax.scatter(X[:, 0], X[:, 1], alpha=0.8, c=y, edgecolor='k')

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))
plot_decision_function(X_train, y_train, upsampling["original"], ax1)
plot_decision_function(X_up, y_up, upsampling["GANbalanced"], ax2)

fig.tight_layout()