In [32]:
import torch
import torch.nn as nn
from torch.autograd import Variable
import torchvision.utils as vutils
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
import torch.nn.functional as F

import pandas as pd
import numpy as np
np.random.seed(0)
from sklearn.preprocessing import LabelEncoder

import os
import wget
from pathlib import Path

In [4]:
# Fetch the census-income (UCI "adult") data file into ./data unless it is already there.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
dataset_name = 'census-income'
out = Path(os.getcwd()) / 'data' / (dataset_name + '.csv')

# Make sure the target directory exists before writing into it.
out.parent.mkdir(parents=True, exist_ok=True)
if not out.exists():
    print("Downloading file...")
    wget.download(url, out.as_posix())
else:
    print("File already exists.")


Downloading file...


In [6]:
# Read the downloaded file and assign every row to a train/valid/test split.
# NOTE(review): read_csv is called without header arguments, so the first row
# of the file becomes the column names (the target is therefore ' <=50K').
train = pd.read_csv(out)
target = ' <=50K'
if "Set" not in train.columns:
    # Random 80/10/10 assignment, one label per row.
    train["Set"] = np.random.choice(
        ["train", "valid", "test"],
        p=[.8, .1, .1],
        size=(len(train),),
    )

# Row labels belonging to each split.
train_indices = train.loc[train["Set"] == "train"].index
valid_indices = train.loc[train["Set"] == "valid"].index
test_indices = train.loc[train["Set"] == "test"].index

In [8]:
# Label encode categorical features and fill empty cells.
# A column is treated as categorical when it holds strings (object dtype) or
# has fewer than 200 distinct values; every other column is mean-imputed.
nunique = train.nunique()
types = train.dtypes

categorical_columns = []   # names of the label-encoded columns, in frame order
categorical_dims = {}      # column name -> number of encoded classes
for col in train.columns:
    if types[col] == 'object' or nunique[col] < 200:
        print(col, train[col].nunique())
        l_enc = LabelEncoder()
        # Missing values become their own category before encoding.
        train[col] = train[col].fillna("VV_likely")
        train[col] = l_enc.fit_transform(train[col].values)
        categorical_columns.append(col)
        categorical_dims[col] = len(l_enc.classes_)
    else:
        # Impute ONLY this column, using the mean of its training rows.
        # (Calling train.fillna(...) on the whole frame would fill the NaNs of
        # every other column with this column's mean.)
        train[col] = train[col].fillna(train.loc[train_indices, col].mean())

# check that pipeline accepts strings
# The encoded target is replaced by string class names. A mapping is used
# instead of writing strings into the integer column through .loc, which
# pandas deprecates as a silent upcast. Values other than 0/1 are left as-is.
# NOTE(review): LabelEncoder orders classes by sorted value, so code 0 should
# be ' <=50K'; naming it "wealthy" looks inverted - confirm the intent.
label_names = {0: "wealthy", 1: "not_wealthy"}
train[target] = train[target].map(lambda code: label_names.get(code, code))

39 73
 State-gov 9
 Bachelors 16
 13 16
 Never-married 7
 Adm-clerical 15
 Not-in-family 6
 White 5
 Male 2
 2174 119
 0 92
 40 94
 United-States 42
 <=50K 2
Set 3


In [9]:
# Feature selection and hyper-parameters for the categorical embeddings.

unused_feat = ['Set']
# Model inputs: every column except the bookkeeping columns and the target.
features = [name for name in train.columns if name != target and name not in unused_feat]
# Positions (within `features`) of the label-encoded columns ...
cat_idxs = [pos for pos, name in enumerate(features) if name in categorical_columns]
# ... and the number of classes of each of them, in the same order.
cat_dims = [categorical_dims[features[pos]] for pos in cat_idxs]
cat_emb_dim = 3

optimizer_fn = torch.optim.Adam
optimizer_params = {"lr": 2e-2}

# how to use learning rate scheduler
scheduler_fn = torch.optim.lr_scheduler.StepLR
scheduler_params = dict(step_size=50, gamma=0.9)

n_shared_decoder = 1  # nb shared glu for decoding
n_indep_decoder = 1   # nb independent glu for decoding

In [10]:
# Materialise the feature matrix and target vector of each split as arrays.
X_train, X_valid, X_test = (
    train[features].values[rows]
    for rows in (train_indices, valid_indices, test_indices)
)

y_train, y_valid, y_test = (
    train[target].values[rows]
    for rows in (train_indices, valid_indices, test_indices)
)

In [11]:
from pytorch_tabnet.pretraining import TabNetPretrainer

# TabNetPretrainer
# Built from the hyper-parameters defined in the configuration cell instead of
# repeating the same literals here, so the two cannot drift apart.
# NOTE(review): scheduler_fn / scheduler_params are defined in the
# configuration cell but are not passed here - confirm whether that is intended.
unsupervised_model = TabNetPretrainer(
    cat_idxs=cat_idxs,
    cat_dims=cat_dims,
    cat_emb_dim=cat_emb_dim,
    optimizer_fn=optimizer_fn,
    optimizer_params=optimizer_params,
    mask_type='entmax', # "sparsemax",
    n_shared_decoder=n_shared_decoder, # nb shared glu for decoding
    n_indep_decoder=n_indep_decoder, # nb independent glu for decoding
)

Device used : cpu


In [17]:
# Two epochs when the CI environment variable is set to a non-empty value, one otherwise.
max_epochs = 2 if os.getenv("CI", False) else 1
max_epochs

1

In [19]:
""" ==================== GENERATOR ======================== """

# Fit the pretrainer on the training features, evaluating on the validation features.
unsupervised_model.fit(
    X_train=X_train,
    eval_set=[X_valid],
    max_epochs=max_epochs,
    patience=5,
    batch_size=2048,
    virtual_batch_size=128,
    num_workers=0,
    drop_last=False,
    pretraining_ratio=0.8,
)

epoch 0  | loss: 6.57988 | val_0_unsup_loss: 1.9971  |  0:00:09s
Stop training because you reached max_epochs = 1 with best_epoch = 0 and best_val_0_unsup_loss = 1.9971
Best weights from best epoch are automatically used!
epoch 0  | loss: 1.60911 | val_0_unsup_loss: 1.22277 |  0:00:06s
Stop training because you reached max_epochs = 1 with best_epoch = 0 and best_val_0_unsup_loss = 1.22277
Best weights from best epoch are automatically used!


In [24]:
# generator = encoder + decoder: reuse the pretrainer's underlying network and display it
generator = unsupervised_model.network
generator

TabNetPretraining(
  (embedder): EmbeddingGenerator(
    (embeddings): ModuleList(
      (0): Embedding(73, 3)
      (1): Embedding(9, 3)
      (2): Embedding(16, 3)
      (3): Embedding(16, 3)
      (4): Embedding(7, 3)
      (5): Embedding(15, 3)
      (6): Embedding(6, 3)
      (7): Embedding(5, 3)
      (8): Embedding(2, 3)
      (9): Embedding(119, 3)
      (10): Embedding(92, 3)
      (11): Embedding(94, 3)
      (12): Embedding(42, 3)
    )
  )
  (masker): RandomObfuscator()
  (encoder): TabNetEncoder(
    (initial_bn): BatchNorm1d(40, eps=1e-05, momentum=0.01, affine=True, track_running_stats=True)
    (initial_splitter): FeatTransformer(
      (shared): GLU_Block(
        (shared_layers): ModuleList(
          (0): Linear(in_features=40, out_features=32, bias=False)
          (1): Linear(in_features=16, out_features=32, bias=False)
        )
        (glu_layers): ModuleList(
          (0): GLU_Layer(
            (fc): Linear(in_features=40, out_features=32, bias=False)
       

In [20]:
# Run the pretrained model on the validation features; predict() returns the
# reconstruction together with the embedded input.
reconstructed_X, embedded_X = unsupervised_model.predict(X_valid)
# Sanity check: both arrays must have the same shape.
assert reconstructed_X.shape == embedded_X.shape

In [21]:
# Display the reconstruction array returned by predict() for X_valid.
reconstructed_X

array([[-0.03961663, -0.24110238,  0.47189063, ..., -0.02406411,
        -0.95372117,  1.5843254 ],
       [-0.0527579 , -0.21691859,  0.39570072, ..., -0.02289679,
        -0.9455302 ,  1.4653947 ],
       [-0.05943928, -0.18790035,  0.37794593, ...,  0.00494059,
        -0.8585742 ,  1.4165832 ],
       ...,
       [-0.05337216, -0.24408716,  0.5111049 , ..., -0.02050782,
        -0.9150843 ,  1.575053  ],
       [-0.01427674, -0.26014954,  0.5003153 , ...,  0.03300002,
        -0.8445887 ,  1.6179703 ],
       [-0.07667636, -0.22761166,  0.343219  , ...,  0.00909834,
        -0.9047952 ,  1.3742868 ]], dtype=float32)

In [29]:
# nn.Embedding(10, 10).weight.shape

# Number of label classes; used by the Discriminator to size its label embedding.
n_classes = 2

ModuleList(
  (0): Embedding(73, 3)
  (1): Embedding(9, 3)
  (2): Embedding(16, 3)
  (3): Embedding(16, 3)
  (4): Embedding(7, 3)
  (5): Embedding(15, 3)
  (6): Embedding(6, 3)
  (7): Embedding(5, 3)
  (8): Embedding(2, 3)
  (9): Embedding(119, 3)
  (10): Embedding(92, 3)
  (11): Embedding(94, 3)
  (12): Embedding(42, 3)
)

In [None]:
# Discriminator

""" ==================== DISCRIMINATOR ======================== """

class Discriminator(nn.Module):
    """Conditional MLP discriminator that scores (sample, label) pairs.

    The sample is flattened, concatenated with a learned embedding of its
    class label, and passed through fully connected layers ending in a
    sigmoid, so the output is a value in [0, 1] per sample.

    Args:
        input_shape: Shape (tuple) or size (int) of a single sample, without
            the batch dimension. When None, the module-level ``img_shape``
            is used, as in the original implementation.
        num_classes: Number of distinct labels. When None, the module-level
            ``n_classes`` is used, as in the original implementation.
    """

    def __init__(self, input_shape=None, num_classes=None):
        super(Discriminator, self).__init__()
        # Fall back to the notebook globals so a bare `Discriminator()` keeps
        # its previous meaning; explicit arguments remove the hidden
        # dependency on names that may not be defined.
        if num_classes is None:
            num_classes = n_classes
        if input_shape is None:
            input_shape = img_shape

        # Each label is embedded into a vector of length `num_classes`.
        self.label_embed1 = nn.Embedding(num_classes, num_classes)
        self.dropout = 0.4
        self.depth = 512

        def dense_block(in_features, out_features, normalize=True):
            # Linear -> (optional Dropout) -> LeakyReLU. Despite its name,
            # `normalize` toggles the dropout layer, not a normalisation layer.
            layers = [nn.Linear(in_features, out_features)]
            if normalize:
                layers.append(nn.Dropout(self.dropout))
            layers.append(nn.LeakyReLU(0.2, inplace=True))
            return layers

        # Input width = flattened sample + label embedding.
        flat_dim = int(np.prod(input_shape))
        self.discriminator = nn.Sequential(
            *dense_block(num_classes + flat_dim, self.depth, normalize=False),
            *dense_block(self.depth, self.depth),
            *dense_block(self.depth, self.depth),
            nn.Linear(self.depth, 1),
            nn.Sigmoid()
        )

    def forward(self, img, labels):
        """Return one validity score per sample, shape (batch, 1).

        Args:
            img: Batch of samples; everything after the first dimension is
                flattened.
            labels: Integer class indices, one per sample, fed to the label
                embedding.
        """
        # reshape (rather than view) also accepts non-contiguous input.
        flat = img.reshape(img.size(0), -1)
        joint = torch.cat((flat, self.label_embed1(labels)), -1)
        validity = self.discriminator(joint)
        return validity


In [30]:
# weight initialization
def init_weights(m):
    """Initialise an ``nn.Linear`` layer in place; leave other modules untouched.

    Intended for ``module.apply(init_weights)``. Weights get Xavier-uniform
    values and the bias, when the layer has one, is set to 0.01.

    Args:
        m: Any module; only instances whose type is exactly ``nn.Linear``
            are modified.
    """
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)
        # Layers built with bias=False have m.bias set to None.
        if m.bias is not None:
            torch.nn.init.constant_(m.bias, 0.01)

In [None]:
# Adam settings shared by both optimizers; passed as betas=(beta, beta1).
beta = 0.5
beta1 = 0.999
lrate = 0.0002

# Building generator
generator = unsupervised_model.network
# The training loop calls gen_optimizer.zero_grad() / .step(), so the
# optimizer has to exist (it was previously only present as a comment).
gen_optimizer = torch.optim.Adam(generator.parameters(), lr=lrate, betas=(beta, beta1))

# Building discriminator
discriminator = Discriminator()
discriminator.apply(init_weights)
d_optimizer = torch.optim.Adam(discriminator.parameters(), lr=lrate, betas=(beta, beta1))

# Loss functions
a_loss = torch.nn.BCELoss()

# Labels
# Targets for the BCE loss: 0.9 for real samples, 0.0 for generated ones.
real_label = 0.9
fake_label = 0.0

In [None]:
# Pick tensor constructors (FT: integer, FT_a: float) and move the models and
# the loss to the GPU when one is available.
cuda = torch.cuda.is_available()

FT, FT_a = torch.LongTensor, torch.FloatTensor

if cuda:
    generator.cuda()
    discriminator.cuda()
    a_loss.cuda()
    FT, FT_a = torch.cuda.LongTensor, torch.cuda.FloatTensor

In [None]:
epochs = 10

# Adversarial training loop.
# NOTE(review): `dataloader`, `latentdim` and `output` are not defined in the
# visible cells and must exist before this cell runs.
# NOTE(review): `generator` is called as generator(noise, labels); confirm that
# the network assigned to `generator` accepts these two arguments.
for epoch in range(epochs):
    for i, (imgs, labels) in enumerate(dataloader):
        batch_size = imgs.shape[0]

        # convert img, labels into proper form
        # (plain tensors: the deprecated Variable wrapper is a no-op)
        imgs = imgs.type(FT_a)
        labels = labels.type(FT)

        # creating real and fake tensors of labels
        reall = FT_a(batch_size, 1).fill_(real_label)
        f_label = FT_a(batch_size, 1).fill_(fake_label)

        # initializing gradient
        gen_optimizer.zero_grad()
        d_optimizer.zero_grad()

        #### TRAINING GENERATOR ####
        # Feeding generator noise and labels
        noise = FT_a(np.random.normal(0, 1, (batch_size, latentdim)))
        gen_labels = FT(np.random.randint(0, n_classes, batch_size))

        gen_imgs = generator(noise, gen_labels)

        # Ability for discriminator to discern the real v generated images
        validity = discriminator(gen_imgs, gen_labels)

        # Generative loss function: generated samples should be scored as real
        g_loss = a_loss(validity, reall)

        # Gradients
        g_loss.backward()
        gen_optimizer.step()

        #### TRAINING DISCRIMINATOR ####

        # Clears the discriminator gradients accumulated by g_loss.backward()
        d_optimizer.zero_grad()

        # Loss for real images and labels
        validity_real = discriminator(imgs, labels)
        d_real_loss = a_loss(validity_real, reall)

        # Loss for fake images and labels
        # (detach: the generator must not receive gradients from this step)
        validity_fake = discriminator(gen_imgs.detach(), gen_labels)
        d_fake_loss = a_loss(validity_fake, f_label)

        # Total discriminator loss
        d_loss = 0.5 * (d_fake_loss + d_real_loss)

        # calculates discriminator gradients
        d_loss.backward()
        d_optimizer.step()

        if i % 100 == 0:
            # real_samples.png holds the real batch (it was previously written
            # from the generated samples).
            vutils.save_image(imgs, '%s/real_samples.png' % output, normalize=True)
            fake = generator(noise, gen_labels)
            vutils.save_image(fake.detach(), '%s/fake_samples_epoch_%03d.png' % (output, epoch), normalize=True)

    # Reports the losses of the last batch of the epoch.
    print("[Epoch: %d/%d]" "[D loss: %f]" "[G loss: %f]" % (epoch+1, epochs, d_loss.item(), g_loss.item()))

    # checkpoints
    torch.save(generator.state_dict(), '%s/generator_epoch_%d.pth' % (output, epoch))
    # Separate file name, so the discriminator checkpoint no longer overwrites
    # the generator checkpoint saved on the line above.
    torch.save(discriminator.state_dict(), '%s/discriminator_epoch_%d.pth' % (output, epoch))

