In [1]:
import numpy as np
import scipy.stats as sp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import minmax_scale, scale, MinMaxScaler

from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split

In [2]:
from wgan.models_cont import Generator, Discriminator
from wgan.training import WGAN

import torch
import torch.optim as optim
from torch.autograd import Variable

In [3]:
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, brier_score_loss

In [4]:
from torch.utils.data import Dataset, DataLoader
from imbalanced_sampler.sampler import ImbalancedDatasetSampler

In [5]:
def make_correlation_matrix(no_var):
    """
    Draw a random symmetric matrix with ones on the diagonal.

    The no_var*(no_var-1)/2 entries above the diagonal are sampled
    independently from Uniform(-1, 1) and mirrored below the diagonal.

    Parameters
    ----------

    no_var: int
      Number of variables, i.e. the side length of the matrix.

    Returns
    -------

    numpy array of shape (no_var, no_var)
      Symmetric, unit diagonal. Because the entries are drawn independently
      the result is not guaranteed to be positive semi-definite.
    """
    upper = np.triu_indices(no_var, 1)
    corr = np.zeros([no_var, no_var])
    # One draw per strictly-upper-triangular cell, whatever no_var is
    corr[upper] = np.random.uniform(-1, 1, size=len(upper[0]))
    # Mirror the upper triangle and add the unit diagonal
    return corr + corr.T + np.eye(no_var)


def _sample_mvn(mean, cov, size):
    """Draw `size` multivariate-normal rows and always return a (size, len(mean)) array."""
    dim = len(mean)
    if size == 0 or dim == 0:
        return np.empty((size, dim))
    # scipy squeezes singleton dimensions (size == 1 or dim == 1); restore them
    return np.reshape(sp.multivariate_normal.rvs(mean=mean, cov=cov, size=size), (size, dim))


def create_continuous_data(N, pos_ratio=0, noise_ratio=0, no_var=10, cov=None, random_state=None):
    """
    Simulate a two-class data set of multivariate-normal features.

    Parameters
    ----------

    N: int
      Total number of observations.

    pos_ratio: float
      Share of observations in the positive class (label 1).

    noise_ratio: float
      Share of the variables that are noise, i.e. drawn from the same
      distribution for both classes. They form the last columns of X.

    no_var: int
      Total number of variables (informative plus noise).

    cov: array of shape (no_var, no_var), optional
      Covariance matrix used for both classes. If None, a separate
      inverse-Wishart draw is made for each class.

    random_state: int, optional
      Seed passed to np.random.seed before any sampling.

    Returns
    -------

    dict with keys
      "X" (N x no_var features, negative class rows first), "y" (labels),
      "mean0"/"mean1" (class means), "cov0"/"cov1" (class covariances).
    """
    if random_state is not None: np.random.seed(random_state)

    N_neg = int(N*(1-pos_ratio))
    N_pos = N-N_neg
    y = np.concatenate([np.zeros(N_neg), np.ones(N_pos)])

    mean = np.random.uniform(size=no_var)
    mean0 = np.random.normal(loc=mean,scale=0.5)
    mean1 = np.random.normal(loc=mean,scale=0.5)

    if cov is None:
        # reshape guards against scipy returning a scalar for no_var == 1
        cov0 = np.reshape(sp.invwishart.rvs(df=no_var*2, scale=np.eye(no_var)), (no_var, no_var))
        cov1 = np.reshape(sp.invwishart.rvs(df=no_var*2, scale=np.eye(no_var)), (no_var, no_var))
    else:
        # A user-supplied covariance is shared by both classes
        cov0 = cov1 = np.asarray(cov, dtype=float)

    # Noise are variables with same distribution in majority and minority class
    no_noise = int(noise_ratio*no_var)
    no_signal = no_var - no_noise

    # Sampling order (noise, positives, negatives) is kept so seeded runs are unchanged
    X_noise = _sample_mvn(mean0[no_signal:], cov0[no_signal:,no_signal:], N) if no_noise > 0 else None
    X1 = _sample_mvn(mean1[0:no_signal], cov1[:no_signal,:no_signal], N_pos)
    X0 = _sample_mvn(mean0[0:no_signal], cov0[:no_signal,:no_signal], N_neg)

    X = np.vstack([X0,X1])
    if X_noise is not None:
        X = np.hstack([X, X_noise])

    return {"X":X, "y":y,"mean0":mean0,"mean1":mean1, "cov0":cov0, "cov1":cov1}

def create_dataset(n_samples=1000, n_features=2, n_classes=3, weights=(0.01, 0.01, 0.98),
                   class_sep=0.8, n_clusters=1, random_state=0):
    """
    Thin wrapper around make_classification for imbalanced toy problems.

    Always requests two informative features and no redundant or repeated
    ones; every other setting is forwarded from the arguments. Returns
    whatever make_classification returns.
    """
    settings = dict(
        n_samples=n_samples,
        n_features=n_features,
        n_classes=n_classes,
        n_clusters_per_class=n_clusters,
        weights=list(weights),
        class_sep=class_sep,
        random_state=random_state,
        # Fixed part of the design
        n_informative=2,
        n_redundant=0,
        n_repeated=0,
    )
    return make_classification(**settings)

## Artificial Data Generation

In [6]:
class TabularDataset(Dataset):
    def __init__(self, X, y=None):
        """
        Characterizes a Dataset for PyTorch

        Parameters
        ----------

        X: numpy array
          Feature matrix with one row per observation. Stored as float32.

        y: numpy array, optional
          Labels (or auxiliary inputs) with one entry/row per observation.
          Stored as float32. If None, the dataset yields features only.

        Raises
        ------

        ValueError
          If y is given and its length differs from the number of rows in X.
        """

        self.n = X.shape[0]
        self.X = X.astype(np.float32)
        if y is None:
            # The signature advertises y as optional, so do not call .astype on None
            self.y = None
        else:
            if len(y) != self.n:
                raise ValueError(
                    f"X and y must have the same number of observations, got {self.n} and {len(y)}"
                )
            self.y = y.astype(np.float32)

    def __len__(self):
        """
        Denotes the total number of samples.
        """
        return self.n

    def __getitem__(self, idx):
        """
        Generates one sample of data: [features, label], or [features]
        when the dataset was built without labels.
        """
        if self.y is None:
            return [self.X[idx]]
        return [self.X[idx], self.y[idx],]

In [7]:
# Training mode, checked when the dataset is built: 'full' uses all training
# rows together with their labels, 'minority' uses the minority-class rows only.
modus = 'full' # one of 'minority', 'full'

In [8]:
no_vars = 2
N= 10000

#data = create_continuous_data(N, pos_ratio=0.1, noise_ratio=0.5, no_var=no_vars, random_state=123) #, cov=np.eye(no_vars)

# Imbalanced (90/10) two-class problem in which every feature is informative
X_full,y = make_classification(n_samples=N, weights=[0.9,0.1], n_clusters_per_class=1,
                              n_features=no_vars, 
                              n_informative=no_vars, 
                              n_redundant=0, n_repeated=0,
                             random_state=123)

X_train, X_test, y_train, y_test = train_test_split(X_full, y, 
                                                    stratify=y, test_size=0.5, random_state=123)


# Scale to [0, 1] using the training data only, then apply to the test data
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

X_train_majority = X_train[y_train==0,:]
X_train_minority = X_train[y_train==1,:]

# Keep an independent copy of the 0/1 labels before y_train becomes one-hot
y_train_bin = y_train.copy()
y_temp = np.zeros([len(y_train),2])
y_temp[y_train==0,0] = 1
y_temp[y_train==1,1] = 1
y_train = y_temp

#mean_minority = np.mean(X_minority, axis=0)
#sd_minority = np.std(X_minority, axis=0)
#X_minority = (X_minority-mean_minority)/sd_minority

if modus == 'minority':
    # Select the one-hot label rows of the minority observations with the same
    # boolean mask that selected X_train_minority, so features and labels align
    dataset = TabularDataset(X_train_minority, y_train[y_train_bin==1,:])
elif modus == 'full':
    dataset = TabularDataset(X_train, y_train)
else:
    raise ValueError("Check modus. Must be one of ['minority', 'full']")

In [9]:
# Per-feature mean and standard deviation, minority class first, then majority
for class_rows in (X_train_minority, X_train_majority):
    print(np.mean(class_rows, axis=0))
    print(np.std(class_rows, axis=0))

[0.35726075 0.3421497 ]
[0.14262311 0.11910413]
[0.63989079 0.66437871]
[0.10413336 0.10005858]


In [10]:
# no_vars = 12
# X, y = create_dataset(n_samples=200000, n_classes=2, weights=(0.05,0.95), n_features=no_vars,
#                      n_clusters=1, class_sep=0.8, random_state=123)

# X = minmax_scale(X)
# X_majority = X[y==0,:]
# X_minority = X[y==1,:]

# dataset = TabularDataset(X_minority, y[y==1])

In [11]:
#sampler = ImbalancedDatasetSampler(labels = y_train_bin, num_samples=1)

In [12]:
batch_size = 64
# Plain shuffled mini-batches over the whole dataset
data_loader = DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)

# Alternative: balanced sampling through inverse propensity
#sampler = ImbalancedDatasetSampler(labels = list(y_train), num_samples=batch_size)
#data_loader = DataLoader(dataset, batch_size = batch_size,
#                     sampler = sampler)

In [13]:
# Generator maps a 10-dimensional latent vector to no_vars outputs; no auxiliary inputs
generator = Generator(
    latent_dim=10,
    lin_layer_sizes=[128,256],
    output_dim=no_vars,
    aux_dim=0,
)

# Discriminator scores no_vars-dimensional observations; no auxiliary inputs
discriminator = Discriminator(
    input_size=no_vars,
    lin_layer_sizes=[128,128],
    aux_input_size=0,
)

# Show both architectures
print(generator)
print(discriminator)

Generator(
  (lin_layers): ModuleList(
    (0): Linear(in_features=10, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=256, bias=True)
  )
  (output_layer): Linear(in_features=256, out_features=2, bias=True)
)
Discriminator(
  (lin_layers): ModuleList(
    (0): Linear(in_features=2, out_features=128, bias=True)
    (1): Linear(in_features=128, out_features=128, bias=True)
  )
  (output_layer): Linear(in_features=128, out_features=1, bias=True)
)


In [14]:
# Initialize optimizers: one Adam instance per network, sharing the same betas
lr_G = 5e-5
lr_D = 5e-5
betas = (.9, .99)
G_optimizer, D_optimizer = (
    optim.Adam(net.parameters(), lr=rate, betas=betas)
    for net, rate in ((generator, lr_G), (discriminator, lr_D))
)

In [15]:
# Bundle networks and optimizers in the training helper; use the GPU when one is available
trainer = WGAN(
    generator,
    discriminator,
    G_optimizer,
    D_optimizer,
    print_every=1000,
    use_cuda=torch.cuda.is_available(),
)

In [16]:
# NOTE(review): presumably the weight of the gradient-penalty term (the training
# log prints a "GP" value) — confirm against wgan.training.WGAN.
trainer.gp_weight = 10

In [17]:
# Train model
# First training run over the data loader; no training GIF is saved.
epochs = 100
trainer.train(data_loader, epochs,  save_training_gif=False)


Epoch 1
Iteration 1
D: -0.014640197157859802
GP: 8.166168212890625
Gradient norm: 0.09640739113092422

Epoch 2
Iteration 1
D: -0.03755280375480652
GP: 3.8522980213165283
Gradient norm: 0.3799523711204529

Epoch 3
Iteration 1
D: -0.08430594205856323
GP: 1.4303369522094727
Gradient norm: 0.6253386735916138
G: -0.5480396747589111
Distance: 0.08430594205856323

Epoch 4
Iteration 1
D: -0.09473645687103271
GP: 0.21663829684257507
Gradient norm: 0.8623771667480469
G: -0.8660480976104736
Distance: 0.09473645687103271

Epoch 5
Iteration 1
D: -0.15546107292175293
GP: 0.006872779689729214
Gradient norm: 0.9974378347396851
G: -1.0165677070617676
Distance: 0.15546107292175293

Epoch 6
Iteration 1
D: -0.1536095142364502
GP: 0.00587007962167263
Gradient norm: 1.0061150789260864
G: -1.0165677070617676
Distance: 0.1536095142364502

Epoch 7
Iteration 1
D: -0.1534423828125
GP: 0.0050646415911614895
Gradient norm: 1.0079965591430664
G: -1.0130975246429443
Distance: 0.1534423828125

Epoch 8
Iteration 1
D:

In [None]:
#generator(generator.sample_latent(num_samples= 1000)).data.numpy()

## Visual test

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# All unordered variable pairs (first, second) with first < second, used for the pairwise plots
combinations = [(first, second)
                for first in range(no_vars)
                for second in range(first + 1, no_vars)]

In [None]:
# Pairwise density plots of the real training data (majority in blue, minority in green).
# squeeze=False keeps `axes` two-dimensional even for a single variable.
fig, axes = plt.subplots(nrows=no_vars, ncols=no_vars, sharex=True, sharey=True, squeeze=False,figsize=(10,10))
# Loop variable is deliberately not called x/y: `y` holds the class labels
for panel in axes.flat:
    panel.set_xticklabels([])
    panel.set_yticklabels([])

# NOTE(review): the positional two-array kdeplot call is kept as written; newer
# seaborn releases expect x=/y= keywords — confirm against the installed version.
for i,j in combinations:
    sns.kdeplot(X_train_majority[:,i], X_train_majority[:,j], alpha=0.5, cmap="Blues", ax=axes[(j,i)])
    sns.kdeplot(X_train_minority[:,i], X_train_minority[:,j], alpha=0.5, cmap="Greens", ax=axes[(j,i)])
fig.savefig(f'../img/cont_sample_tr_iter_{trainer.G.training_iterations}.png',format='png', dpi=100)
    #fig.show()

In [None]:
epochs = 90

# Alternate between further training and a snapshot plot comparing real and
# generated data. One PNG is written per round.
for _ in range(30):
    trainer.train(data_loader, epochs)

    # .detach().cpu() makes the NumPy conversion work even if the generator
    # output lives on the GPU (the trainer is created with use_cuda when available)
    if modus == 'full':
        fake_minority = generator(*generator.sample_latent(num_samples= 1000, class_index=1)).detach().cpu().numpy()
        fake_majority = generator(*generator.sample_latent(num_samples= 1000, class_index=0)).detach().cpu().numpy()
    elif modus == 'minority':
        fake_minority = generator(generator.sample_latent(num_samples= 1000)).detach().cpu().numpy()

    # squeeze=False keeps `axes` two-dimensional even for a single variable
    fig, axes = plt.subplots(nrows=no_vars, ncols=no_vars, sharex=True, squeeze=False,figsize=(10,10))
    # Loop variable is deliberately not called x/y: `y` holds the class labels
    for panel in axes.flat:
        panel.set_xticklabels([])
        panel.set_yticklabels([])

    # Diagonal: marginal densities of real (blue) vs. generated (green) minority data
    for i in range(no_vars):
        sns.kdeplot(X_train_minority[:,i], alpha=0.5, shade=True, color="blue", ax=axes[(i,i)])
        sns.kdeplot(fake_minority[:,i], alpha=0.5, shade=True, color="green", ax=axes[(i,i)])

    for i,j in combinations:
        axes[(i,j)].set_ylim(0,1)
        # majority (upper right)
        if modus == 'full':
            sns.kdeplot(X_train_majority[0:1000,i], X_train_majority[0:1000,j], alpha=0.5, cmap="Blues", ax=axes[(i,j)])
            sns.kdeplot(fake_majority[:,i], fake_majority[:,j], alpha=0.5, cmap="Greens", ax=axes[(i,j)], )

        # minority (lower left)
        sns.kdeplot(X_train_minority[:,i], X_train_minority[:,j], alpha=0.5, cmap="Blues", ax=axes[(j,i)])
        sns.kdeplot(fake_minority[:,i], fake_minority[:,j], alpha=0.5, cmap="Greens", ax=axes[(j,i)])

    fig.savefig(f'../img/cont_sample_tr_iter_{trainer.G.training_iterations}.png',format='png', dpi=200)
    # The figure is on disk; close it so 30 rounds do not keep 30 figures in memory
    plt.close(fig)
        #fig.show()

In [None]:
# Persist both networks; the file name encodes data size, dimensionality,
# training mode and the generator's training-iteration counter
desc = f"multinormal_n{N//1000}_k{no_vars}_{modus}"
checkpoint_suffix = f"{desc}_{generator.training_iterations}"
torch.save(generator.state_dict(), "../models/wgan_generator_" + checkpoint_suffix)
torch.save(discriminator.state_dict(), "../models/wgan_discriminator_" + checkpoint_suffix)

In [None]:
# Restore previously trained weights into the current networks.
# NOTE(review): this name ("k4", "c2") does not follow the pattern used when
# saving in this notebook (k{no_vars}_{modus}) — confirm the checkpoint matches
# the architecture built above before loading.
file_name = "multinormal_n10_k4_c2_6999"
# map_location="cpu" lets checkpoints written on a GPU machine load on a
# CPU-only one; load_state_dict then copies the values into the existing parameters.
generator.load_state_dict(torch.load(f"../models/wgan_generator_{file_name}", map_location="cpu"))
discriminator.load_state_dict(torch.load(f"../models/wgan_discriminator_{file_name}", map_location="cpu"))

## Distribution summary statistics

In [None]:
from torch import Tensor as T

In [None]:
# Generate as many minority-class observations as there are real ones.
# minority_samples is defined here so the cell does not depend on a later one.
minority_samples = X_train_minority.shape[0]
# .detach().cpu() makes the NumPy conversion work even if the output lives on the GPU
fake_minority = generator(*generator.sample_latent(num_samples= minority_samples, class_index=1)).detach().cpu().numpy()

In [None]:
# Per-feature means: real minority training data vs. generated minority data
print(np.mean(X_train_minority, axis=0))
print(np.mean(fake_minority, axis=0))

In [None]:
# Per-feature quantiles at 0%, 10%, ..., 90%: real vs. generated minority data
print(np.quantile(X_train_minority, q=np.arange(0,1,0.1), axis=0))
print(np.quantile(fake_minority, q=np.arange(0,1,0.1), axis=0))

In [None]:
# Element-wise difference between the real and generated minority covariance matrices
print(np.cov(X_train_minority, rowvar=False) - np.cov(fake_minority,rowvar=False))


## Discriminator test

In [None]:
# Generate as many fake observations as there are real minority training observations
sample_size = X_train_minority.shape[0]

In [None]:
# .detach().cpu() makes the NumPy conversion work even if the generator output lives on the GPU
fake = generator(*generator.sample_latent(num_samples= sample_size, class_index=1)).detach().cpu().numpy()
#fake = generator(generator.sample_latent(num_samples= sample_size)).data.numpy()

In [None]:
# Real-vs-fake discrimination problem: label 0 = real minority rows, label 1 = generated rows
X_fakereal = np.vstack([X_train_minority,
                        fake])
y_fakereal = np.concatenate([np.zeros(X_train_minority.shape[0]),
                        np.ones(fake.shape[0])])

In [None]:
# Random forest that tries to tell real from generated observations.
# random_state fixes the forest so the discriminator test is reproducible
# (123 is the seed used for the data elsewhere in this notebook).
clf = RandomForestClassifier(n_estimators=50, min_samples_leaf=20, n_jobs=10, random_state=123)
model_fakereal = clf.fit(X_fakereal, y_fakereal)

In [None]:
# Scoring the forest on the rows it was fitted on overstates how well real and
# generated data can be separated. Refit on one half and report the AUC on the
# held-out half, as the SMOTE experiment further down does.
X_fakereal_train, X_fakereal_test, y_fakereal_train, y_fakereal_test = train_test_split(
    X_fakereal, y_fakereal, test_size=0.5, stratify=y_fakereal, random_state=123)
model_fakereal = clf.fit(X_fakereal_train, y_fakereal_train)

pred_fakereal = model_fakereal.predict_proba(X_fakereal_test)[:,1]
roc_auc_score(y_fakereal_test, pred_fakereal)

# Predictive performance testing

In [None]:
# Class indices for the classifiers. One-hot label matrices are collapsed with
# argmax; labels that are already 1-D (such as y_test, which is never one-hot
# encoded in this notebook) are used as they are.
y_train_bin = np.argmax(y_train, axis=1) if np.ndim(y_train) > 1 else np.asarray(y_train)
y_test_bin = np.argmax(y_test, axis=1) if np.ndim(y_test) > 1 else np.asarray(y_test)

In [None]:
def test_auc(model_library, X, y_true):
    auc = {}
    for model in model_library.keys():
        pred = model_library[model].predict_proba(X_test)[:,1]
        auc[model] = roc_auc_score(y_true, pred)
    return auc

## Predictive test

In [None]:
# Build a fully synthetic training set with the same class sizes as the real one
minority_samples = X_train_minority.shape[0]
majority_samples = X_train_majority.shape[0]

# .detach().cpu() makes the NumPy conversion work even if the generator output lives on the GPU
fake_minority = generator(*generator.sample_latent(num_samples= minority_samples, class_index=1)).detach().cpu().numpy()
fake_majority = generator(*generator.sample_latent(num_samples= majority_samples, class_index=0)).detach().cpu().numpy()

# Majority rows (label 0) first, then minority rows (label 1)
X_synthetic = np.vstack([fake_majority,
                         fake_minority])
y_synthetic = np.concatenate([np.zeros(majority_samples),
                              np.ones(minority_samples)])

In [None]:
# Same tree configuration for both classifiers; they differ only in the training data
clf_org = DecisionTreeClassifier(max_depth=10) #LogisticRegression(solver='saga') 
clf_fake = DecisionTreeClassifier(max_depth=10) #LogisticRegression(solver='saga')

# "real" is fitted on the real training data, "synthetic" on generated data only
predictive = {
    "real": clf_org.fit(X=X_train, y=y_train_bin),
    "synthetic": clf_fake.fit(X=X_synthetic, y=y_synthetic),
}

# Both models are evaluated on the real test set
test_auc(predictive, X_test, y_test)

## Upsampling performance

In [None]:
# Repeat the upsampling experiment 200 times and collect the test AUC of a tree
# fitted on the original data and of one fitted on data enriched with generated
# minority observations.
performance = {"original":[],"GANbalanced":[]}
# Number of generated minority rows per repetition (four times the real minority count)
sample_size = X_train_minority.shape[0]*4
for i in range(200):
    # .detach().cpu() makes the NumPy conversion work even if the generator output lives on the GPU
    X_fake = generator(*generator.sample_latent(num_samples= sample_size, class_index=1)).detach().cpu().numpy()
    #X_fake = generator(generator.sample_latent(num_samples= sample_size, class_index=None)).data.numpy()
    y_fake = np.ones(shape=[sample_size])

    # Real training data followed by the generated minority rows
    X_up = np.vstack([X_train,X_fake])
    y_up = np.hstack([y_train_bin,y_fake])

    clf_org = DecisionTreeClassifier(max_depth=5)
    clf_fake = DecisionTreeClassifier(max_depth=5)

    upsampling = {}
    upsampling["original"] =  clf_org.fit(X=X_train, y=y_train_bin)
    upsampling["GANbalanced"] = clf_fake.fit(X=X_up, y=y_up)

    performance_temp = test_auc(upsampling, X_test, y_test_bin)
    for model in performance_temp:
        performance[model].append(performance_temp[model])
    

In [None]:
# Mean and standard deviation of the test AUC over all repetitions, per model
performance_table = pd.DataFrame(performance)
print(performance_table.mean())
print(performance_table.std())

In [None]:
def plot_decision_function(X, y, clf, ax):
    """
    Draw the predicted class regions of a classifier and overlay the data.

    Parameters
    ----------

    X: array
      Observations; columns 0 and 1 are used as plot coordinates.

    y: array
      Values used to color the scattered observations.

    clf: fitted classifier
      Its predict method is evaluated on a regular grid of points.

    ax: matplotlib axes
      Axes receiving the filled contour and the scatter plot.
    """
    step = 0.02
    margin = 1
    first, second = X[:, 0], X[:, 1]

    # Regular grid covering the data range plus a margin on every side
    grid_x = np.arange(first.min() - margin, first.max() + margin, step)
    grid_y = np.arange(second.min() - margin, second.max() + margin, step)
    xx, yy = np.meshgrid(grid_x, grid_y)

    # Predict every grid point, then fold the predictions back into grid shape
    grid_points = np.column_stack((xx.ravel(), yy.ravel()))
    Z = clf.predict(grid_points).reshape(xx.shape)

    ax.contourf(xx, yy, Z, alpha=0.4)
    ax.scatter(first, second, alpha=0.8, c=y, edgecolor='k')

In [None]:
# Decision regions of the tree fitted on the original data (left) and of the
# tree fitted on the GAN-upsampled data (right), from the last repetition above
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 7))
# Points are colored by class index, so pass the 0/1 labels: y_train itself is
# a one-hot matrix with two columns and cannot be used as one color per point
plot_decision_function(X_train, y_train_bin, upsampling["original"], ax1)
plot_decision_function(X_up, y_up, upsampling["GANbalanced"], ax2)

fig.tight_layout()

## Effect of dimensionality on SMOTE

With increasing dimensionality, we expect SMOTE's underlying nearest neighbor approach to fail to capture relevant neighborhoods. We measure SMOTE performance in terms of RF being able to differentiate between real and synthetic data.

In [None]:
from imblearn.over_sampling import SMOTENC, SMOTE
from imblearn.under_sampling import TomekLinks

In [None]:
auc = {}

n_features = 320
# Create single dataset to avoid random effects
# Only works for all informative features
X_full,y = make_classification(n_samples=10000, weights=[0.9,0.1], n_clusters_per_class=1,
                              n_features=n_features, 
                              n_informative=n_features, 
                              n_redundant=0, n_repeated=0,
                             random_state=123)

# Drop variables until desired dimensionality
for k in [5,10,20,40,80,160,320]: #
    X = X_full[:,0:k]

    # Sample synthetic SMOTE data: double the minority class.
    # fit_resample replaces fit_sample, which newer imbalanced-learn releases no
    # longer provide; n_jobs is dropped because it is deprecated there and
    # does not affect the result.
    smote = SMOTE(sampling_strategy = {1:int(np.sum(y))*2}, k_neighbors=50,
                  random_state=123)
    X_smote, y_smote = smote.fit_resample(X,y)

    # The resampled data contains every original row plus the new ones.
    # NOTE(review): relies on SMOTE appending the synthetic rows after the
    # original rows — confirm for the installed imbalanced-learn version.
    X_smote_new = X_smote[X.shape[0]:]

    # Create fake/real discrimination problem: real minority (0) vs. synthetic minority (1)
    X_fakereal = np.vstack([X[y==1], X_smote_new])
    y_fakereal = np.concatenate([np.zeros(X[y==1].shape[0]),
                                 np.ones(  X_smote_new.shape[0])])

    # Seeded so that differences between dimensionalities are not random effects
    X_fakereal_train, X_fakereal_test, y_fakereal_train, y_fakereal_test =\
        train_test_split(X_fakereal, y_fakereal, test_size=0.5,
                         stratify=y_fakereal, random_state=123)
    clf = RandomForestClassifier(n_estimators=100, min_samples_leaf=50, n_jobs=20,
                                 random_state=123)
    model_fakereal = clf.fit(X_fakereal_train, y_fakereal_train)

    pred_fakereal = model_fakereal.predict_proba(X_fakereal_test)[:,1]
    auc[k] = roc_auc_score(y_fakereal_test, pred_fakereal)

print(auc)


In [None]:
# Discriminator AUC as a function of the number of variables.
# Dict views are turned into lists so matplotlib receives plain sequences.
plt.plot(list(auc.keys()), list(auc.values()))
# The data set has 10,000 observations in total, about 10% of them minority
plt.xlabel("No. of variables (10,000 observations, ~1,000 minority)")
plt.ylabel("Discriminator AUC (SMOTE)")
plt.savefig("../img/SMOTE_performance_over_variables_10k_minority.png", format='png',dpi=200)
#plt.show()