In [1]:
import os
import time
import math
import numpy as np
import random
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.autograd import Variable

import datetime
now_time = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu") 

In [2]:
import pandas as pd
from transformers import AutoModel, AutoTokenizer, AutoConfig
from matplotlib import pyplot as plt
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [3]:
label_list = ['UNK',1,2,3,4,5,6,7,8,9,10]

In [4]:
df_train_l=pd.read_csv("../../yahoo/assigned/train_l.csv", index_col="Unnamed: 0")
df_test_l=pd.read_csv("../../yahoo/assigned/test_l.csv", index_col="Unnamed: 0")
df_u=pd.read_csv("../../yahoo/assigned/u.csv", index_col="Unnamed: 0")
df_train_u=pd.read_csv("../../yahoo/assigned/train_u.csv", index_col="Unnamed: 0")#.head(10000)
df_test_u=pd.read_csv("../../yahoo/assigned/test_u.csv", index_col="Unnamed: 0")#.head(5000)
df_all = pd.concat([df_train_l, df_test_l, df_u, df_train_u, df_test_u])

In [5]:
train_l =  list(df_train_l.to_records(index=False))
test_l = list(df_test_l.to_records(index=False))
u_list = list(df_u.to_records(index=False))
test_u = list(df_test_u.to_records(index=False))
train_u = list(df_train_u.to_records(index=False))
data_all = list(df_all["0"])

In [6]:
#--------------------------------
#  Transformer parameters
#--------------------------------
max_seq_length = 24
batch_size = 92

#--------------------------------
#  GAN-BERT specific parameters
#--------------------------------
# number of hidden layers in the generator, 
# each of the size of the output space
#num_hidden_layers_g = 1; 
# number of hidden layers in the discriminator, 
# each of the size of the input space
num_hidden_layers_d = 1; 
# size of the generator's input noisy vectors
noise_size = 100
# dropout to be applied to discriminator's input vectors
out_dropout_rate = 0.2

# Replicate labeled data to balance poorly represented datasets, 
# e.g., less than 1% of labeled material
apply_balance = True

#--------------------------------
#  Optimization parameters
#--------------------------------
learning_rate_discriminator = 5e-6 #5e-6?
#learning_rate_generator = 5e-5
epsilon = 1e-8
num_train_epochs = 50
multi_gpu = True
# Scheduler
apply_scheduler = False
warmup_proportion = 0.1
# Print
print_each_n_step = 10

#--------------------------------
#  Adopted Tranformer model
#--------------------------------
# Since this version is compatible with Huggingface transformers, you can uncomment
# (or add) transformer models compatible with GAN

#model_name = "bert-base-cased"
#model_name = "bert-base-uncased"
#model_name = "roberta-base"
#model_name = "albert-base-v2"
#model_name = "xlm-roberta-base"
#model_name = "amazon/bort"
#model_name="google/electra-large-discriminator"
#model_name="google/electra-small-discriminator"
#model_name="microsoft/deberta-v2-xxlarge"
#model_name="microsoft/deberta-v3-base"
model_name = "google/electra-base-discriminator"

In [7]:
transformer = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
def generate_data_loader(input_examples, label_masks, label_map, do_shuffle = False, balance_label_examples = False):
  '''
  Generate a Dataloader given the input examples, eventually masked if they are 
  to be considered NOT labeled.
  '''
  examples = []

  # Count the percentage of labeled examples  
  num_labeled_examples = 0
  for label_mask in label_masks:
    if label_mask: 
      num_labeled_examples += 1
  label_mask_rate = num_labeled_examples/len(input_examples)

  # if required it applies the balance
  for index, ex in enumerate(input_examples): 
    if label_mask_rate == 1 or not balance_label_examples:
      examples.append((ex, label_masks[index]))
    else:
      # IT SIMULATE A LABELED EXAMPLE
      if label_masks[index]:
        balance = int(1/label_mask_rate)
        balance = int(math.log(balance,2))
        if balance < 1:
          balance = 1
        for b in range(0, int(balance)):
          examples.append((ex, label_masks[index]))
      else:
        examples.append((ex, label_masks[index]))
  
  #-----------------------------------------------
  # Generate input examples to the Transformer
  #-----------------------------------------------
  input_ids = []
  input_mask_array = []
  label_mask_array = []
  label_id_array = []

  # Tokenization 
  for (text, label_mask) in examples:
    encoded_sent = tokenizer.encode(text[0], add_special_tokens=True, max_length=max_seq_length, padding="max_length", truncation=True)
    input_ids.append(encoded_sent)
    label_id_array.append(label_map[text[1]])
    label_mask_array.append(label_mask)
  
  # Attention to token (to ignore padded input wordpieces)
  for sent in input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]                          
    input_mask_array.append(att_mask)
  # Convertion to Tensor
  input_ids = torch.tensor(input_ids) 
  input_mask_array = torch.tensor(input_mask_array)
  label_id_array = torch.tensor(label_id_array, dtype=torch.long)
  label_mask_array = torch.tensor(label_mask_array)

  # Building the TensorDataset
  dataset = TensorDataset(input_ids, input_mask_array, label_id_array, label_mask_array)

  if do_shuffle:
    sampler = RandomSampler
  else:
    sampler = SequentialSampler

  # Building the DataLoader
  return DataLoader(
              dataset,  # The training samples.
              sampler = sampler(dataset), 
              batch_size = batch_size) # Trains with this batch size.

def format_time(elapsed):
    '''
    Takes a time in seconds and returns a string hh:mm:ss
    '''
    # Round to the nearest second.
    elapsed_rounded = int(round((elapsed)))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))

In [9]:
def generate_data_fake(input_examples):
  '''
  Generate a Dataloader given the input examples, eventually masked if they are 
  to be considered NOT labeled.
  '''
  
  #-----------------------------------------------
  # Generate input examples to the Transformer
  #-----------------------------------------------
  input_ids = []
  input_mask_array = []

  # Tokenization 
  for text in input_examples:
    encoded_sent = tokenizer.encode(text, add_special_tokens=True, max_length=max_seq_length, padding="max_length", truncation=True)
    input_ids.append(encoded_sent)
  
  # Attention to token (to ignore padded input wordpieces)
  for sent in input_ids:
    att_mask = [int(token_id > 0) for token_id in sent]                          
    input_mask_array.append(att_mask)
  # Convertion to Tensor
  input_ids = torch.tensor(input_ids) 
  input_mask_array = torch.tensor(input_mask_array)

  # Building the DataLoader
  return input_ids, input_mask_array # Trains with this batch size.

In [10]:
#Load the examples
labeled_examples = train_l
unlabeled_examples = u_list
test_examples = test_l

In [11]:
label_map = {}
for (i, label) in enumerate(label_list):
  label_map[label] = i
#------------------------------
#   Load the train dataset
#------------------------------
train_examples = labeled_examples
#The labeled (train) dataset is assigned with a mask set to True
train_label_masks = np.ones(len(labeled_examples), dtype=bool)
#If unlabel examples are available
if unlabeled_examples:
  train_examples = train_examples + unlabeled_examples
  #The unlabeled (train) dataset is assigned with a mask set to False
  tmp_masks = np.zeros(len(unlabeled_examples), dtype=bool)
  train_label_masks = np.concatenate([train_label_masks,tmp_masks])

train_dataloader = generate_data_loader(train_examples, train_label_masks, label_map, do_shuffle = True, balance_label_examples = apply_balance)

#------------------------------
#   Load the test dataset
#------------------------------
#The labeled (test) dataset is assigned with a mask set to True
test_label_masks = np.ones(len(test_examples), dtype=bool)

test_dataloader = generate_data_loader(test_examples, test_label_masks, label_map, do_shuffle = False, balance_label_examples = False)

In [12]:
#------------------------------
#   The Discriminator
#   https://www.aclweb.org/anthology/2020.acl-main.191/
#   https://github.com/crux82/ganbert
#------------------------------
class Discriminator(nn.Module):
    def __init__(self, input_size=512, hidden_sizes=[512], num_labels=2, dropout_rate=0.1):
        super(Discriminator, self).__init__()
        self.input_dropout = nn.Dropout(p=dropout_rate)
        layers = []
        hidden_sizes = [input_size] + hidden_sizes
        for i in range(len(hidden_sizes)-1):
            layers.extend([nn.Linear(hidden_sizes[i], hidden_sizes[i+1]), nn.LeakyReLU(0.2, inplace=True), nn.Dropout(dropout_rate)])

        self.layers = nn.Sequential(*layers) #per il flatten
        self.logit = nn.Linear(hidden_sizes[-1],num_labels+1) # +1 for the probability of this sample being fake/real.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input_rep):
        input_rep = self.input_dropout(input_rep)
        last_rep = self.layers(input_rep)
        logits = self.logit(last_rep)
        probs = self.softmax(logits)
        return last_rep, logits, probs

In [13]:
# The config file is required to get the dimension of the vector produced by 
# the underlying transformer
config = AutoConfig.from_pretrained(model_name)
hidden_size = int(config.hidden_size)
# Define the number and width of hidden layers
#hidden_levels_g = [hidden_size for i in range(0, num_hidden_layers_g)]
hidden_levels_d = [hidden_size for i in range(0, num_hidden_layers_d)]

#-------------------------------------------------
#   Instantiate the Generator and Discriminator
#-------------------------------------------------
#generator = Generator(noise_size=noise_size, output_size=hidden_size, hidden_sizes=hidden_levels_g, dropout_rate=out_dropout_rate)
discriminator = Discriminator(input_size=hidden_size, hidden_sizes=hidden_levels_d, num_labels=len(label_list), dropout_rate=out_dropout_rate)

# Put everything in the GPU if available
if torch.cuda.is_available():    
  #generator.cuda()
  discriminator.cuda()
  transformer.cuda()
  if multi_gpu:
    transformer = torch.nn.DataParallel(transformer)

# print(config)

In [14]:
training_stats = []

accuracy_array=[]

# Measure the total training time for the whole run.
total_t0 = time.time()

#models parameters
transformer_vars = [i for i in transformer.parameters()]
d_vars = transformer_vars + [v for v in discriminator.parameters()]
#g_vars = [v for v in generator.parameters()]

#optimizer
dis_optimizer = torch.optim.AdamW(d_vars, lr=learning_rate_discriminator)
#gen_optimizer = torch.optim.AdamW(g_vars, lr=learning_rate_generator) 

#scheduler
if apply_scheduler:
  num_train_examples = len(train_examples)
  num_train_steps = int(num_train_examples / batch_size * num_train_epochs)
  num_warmup_steps = int(num_train_steps * warmup_proportion)

  scheduler_d = get_constant_schedule_with_warmup(dis_optimizer, 
                                           num_warmup_steps = num_warmup_steps)
  scheduler_g = get_constant_schedule_with_warmup(gen_optimizer, 
                                           num_warmup_steps = num_warmup_steps)

In [15]:
#OPTAGAN
from __future__ import absolute_import, division, print_function, unicode_literals
import argparse

import logging
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import numpy as np

from torch.autograd import Variable
from modules.gan import Generator, Critic

import copy
import math
import glob
import os
import pickle
import random

import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, SequentialSampler, RandomSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from func import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, BertConfig
from func import GPT2LMHeadModel, GPT2Tokenizer, GPT2ForLatentConnector, GPT2ForLatentConnectorValueHead
from func import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
from func import XLNetLMHeadModel, XLNetTokenizer
from func import TransfoXLLMHeadModel, TransfoXLTokenizer
from func import BertForLatentConnector, BertTokenizer

from collections import defaultdict
from utils import (TextDataset_Split, TextDataset_2Tokenizers, BucketingDataLoader)
import pdb
from modules.utils import (calc_blue_parallel_func, pad_seq, rollout, rollout_test)
#from transformers.modeling_utils import top_k_top_p_filtering


MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop
ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.INFO)
logger = logging.getLogger(__name__)

MODEL_CLASSES = {
    'gpt2': (GPT2Config, GPT2ForLatentConnectorValueHead, GPT2Tokenizer),
    'bert': (BertConfig, BertForLatentConnector, BertTokenizer)
}

num_txt = 1

def load_and_cache_examples(args, tokenizer):
    if isinstance(tokenizer, list):
        dataset = TextDataset_2Tokenizers(tokenizer, args, args.train_data_file, block_size=args.block_size)
    else:
        dataset = TextDataset_Split(tokenizer, args, args.train_data_file, block_size=args.block_size)
    return dataset

def build_dataload_and_cache_examples(args, tokenizer, num_txt):
    if isinstance(tokenizer, list):
        args.batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
        if num_txt<=20:
            concatenation="{}_{}{}".format(args.train_data_file, num_txt, ".txt")
            file_path=concatenation
            print("Train file used is number {}".format(num_txt))
            print(concatenation)
            num_txt=num_txt+1
        else:
            num_txt=1
            concatenation="{}_{}{}".format(args.train_data_file, num_txt, ".txt")
            file_path=concatenation
            print("Train file used is number {}".format(num_txt))
        dataloader = BucketingDataLoader(file_path, args.batch_size, args.max_seq_length, tokenizer, args, bucket=100, shuffle=True)
    else:
        pass 
    return dataloader, num_txt

def compute_grad_penalty(critic, real_data, fake_data):
    B = real_data.size(0)
    alpha = torch.FloatTensor(np.random.random((B, 1)))
    if args.cuda:
        alpha = alpha.cuda()
    sample = alpha*real_data + (1-alpha)*fake_data
    sample.requires_grad_(True)
    score = critic(sample)

    outputs = torch.FloatTensor(B, 1).fill_(1.0) #args.latent_size
    outputs.requires_grad_(False)
    if args.cuda:
        outputs = outputs.cuda()
    grads = autograd.grad(
        outputs=score,
        inputs=sample,
        grad_outputs=outputs,
        create_graph=True,
        retain_graph=True,
        only_inputs=True)[0]
    grad_penalty = ((grads.norm(2, dim=1) - 1.) ** 2).mean()
    return grad_penalty

def train(epoch):
    model_encoder.eval()
    model_decoder.eval()
    generator.train()
    critic.train()
    c_train_loss = 0.
    g_train_loss = 0.
    g_batches = 0
    c_batches = 0
    c_loss_0 = 1
    g_loss_0 = 1
    for i, x in enumerate(train_loader):
        x = x[0]
        if args.cuda:
            x = x.cuda()
        # Generate noise
        B = args.per_gpu_train_batch_size
        noise = torch.from_numpy(np.random.normal(0, 1, (B,
                                 args.latent_size))).float()
        if args.cuda:
            noise = noise.cuda()
        # Get original text latent embeddings
        with torch.no_grad(): 
            pooled_hidden_fea = model_encoder(x, attention_mask=(x > 0).float())[1]
            mean, logvar = model_encoder.linear(pooled_hidden_fea).chunk(2, -1)
            z_real = mean.squeeze(1) 

        # Evaluate and get losses
        z_fake = generator(noise)
        real_score = critic(z_real)
        fake_score = critic(z_fake)
        grad_penalty = compute_grad_penalty(critic, z_real.data, z_fake.data)
        c_loss = -torch.mean(real_score) + torch.mean(fake_score) + \
                 args.gp_lambda*grad_penalty

        fake_score = critic(generator(noise))
        g_loss = -torch.mean(fake_score)
        
        r_g = abs(((g_loss.item() - g_loss_0) / (g_loss_0 + 0.001))) 
        r_c = abs(((c_loss.item() - c_loss_0) / (c_loss_0 + 0.001))) 
        
        # Update critic or generator
        if ((2 + epoch) / epoch) * r_c > r_g:
            c_optimizer.zero_grad()
            c_batches += 1
            c_train_loss += c_loss.item()
            c_loss.backward()
            c_optimizer.step()
        else:
            g_optimizer.zero_grad()
            g_batches += 1
            g_train_loss += g_loss.item()
            g_loss.backward()
            g_optimizer.step()

        c_loss_0 = c_loss.item()
        g_loss_0 = g_loss.item()

        if args.interval > 0 and i % args.interval == 0:
            logger.info('Epoch: {} | Batch: {}/{} ({:.0f}%) | G Loss: {:.6f} | C Loss: {:.6f}'.format(
                epoch, args.batch_size*i, len(train_loader.dataset),
                100.*(args.batch_size*i)/len(train_loader.dataset),
                g_loss.item(), c_loss.item()
            ))
            test_noise = torch.Tensor(np.random.normal(0, 1, (1, args.latent_size))).to(args.device)
            test_new_z = generator(test_noise).data
            # create new sent
            test_z = rollout_test(model_decoder, test_new_z, tokenizer_decoder, args.max_seq_length, 1, 0, 1)
            logger.info("Text: {}".format(test_z))

    c_train_loss /= c_batches + 1
    g_train_loss /= g_batches + 1
    logger.info('* (Train) Epoch: {} | G Loss: {:.4f} | C Loss: {:.4f} | Updates G: {} | Updates C: {}'.format(
        epoch, g_train_loss, c_train_loss, g_batches, c_batches
    ))
    return (g_train_loss, c_train_loss)

[nltk_data] Downloading package punkt to /home/harry/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--seed', type=int, default=0)
    parser.add_argument('--epochs', type=int, default=15)
    parser.add_argument('--lr', type=float, default=1e-4)
    parser.add_argument('--gp_lambda', type=int, default=10)
    parser.add_argument('--n_layers', type=int, default=20, help="Number of layers of generator and critic")
    parser.add_argument('--block_dim', type=int, default=100)
    parser.add_argument('--interval', type=int, default=10, help="Steps before logging output")
    parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available())
    
    # Optimus parameters
    parser.add_argument("--train_data_file", default=None, type=str, required=True,
                        help="The input training data file (a text file).")
    parser.add_argument("--valid_data_file", default=None, type=str, required=True,
                        help="The input validation data file (a text file).")
    parser.add_argument("--checkpoint_dir", default=None, type=str, required=True,
                        help="The directory where checkpoints are saved.")
    parser.add_argument('--generator_dir', default=None, type=str, help="Directory where GAN models are saved")
    parser.add_argument("--output_dir", default=None, type=str, required=True,
                        help="The output directory where the model predictions and checkpoints will be written.")
    parser.add_argument("--dataset", default='Snli', type=str, help="The dataset.")    
    parser.add_argument("--latent_size", default=32, type=int, help="Latent space dimension.")
    ## Encoder options
    parser.add_argument("--encoder_model_type", default="bert", type=str,
                        help="The encoder model architecture to be fine-tuned.")
    parser.add_argument("--encoder_model_name_or_path", default="bert-base-cased", type=str,
                        help="The encoder model checkpoint for weights initialization.")
    parser.add_argument("--encoder_config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--encoder_tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
    ## Decoder options
    parser.add_argument("--decoder_model_type", default="gpt2", type=str,
                        help="The decoder model architecture to be fine-tuned.")
    parser.add_argument("--decoder_model_name_or_path", default="bert-base-cased", type=str,
                        help="The decoder model checkpoint for weights initialization.")
    parser.add_argument("--decoder_config_name", default="", type=str,
                        help="Optional pretrained config name or path if not the same as model_name_or_path")
    parser.add_argument("--decoder_tokenizer_name", default="", type=str,
                        help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
    parser.add_argument("--per_gpu_train_batch_size", default=1, type=int,
                        help="Batch size per GPU/CPU for training.")
    parser.add_argument("--max_seq_length", default=512, type=int,
                        help="Optional input sequence length before tokenization. The sequence will be dropped if it is longer the max_seq_length")

    ## Variational auto-encoder(check this)
    parser.add_argument("--prompt", type=str, default="")
    parser.add_argument("--padding_text", type=str, default="")
    parser.add_argument("--length", type=int, default=20)
    parser.add_argument("--block_size", default=-1, type=int,
                        help="Optional input sequence length after tokenization."
                             "The training dataset will be truncated in block of this size for training."
                             "Default to the model max input length for single sentence inputs (take into account special tokens).")
    parser.add_argument("--do_lower_case", action='store_true',
                        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--use_philly", action='store_true',
                        help="Use Philly for computing.")
    parser.add_argument('--gloabl_step_eval', type=int, default=661,
                        help="Evaluate the results at the given global step")
    # Reinforcement learning parameters
    parser.add_argument('--finetune_decoder', type=bool, default=True)
    parser.add_argument('--epochs_rl', type=int, default=1000)
    parser.add_argument('--batch_size_rl', type=int, default=32)
    parser.add_argument('--lr_rl', type=float, default=1e-6)


    # Load a trained Encoder model and vocabulary that you have fine-tuned
    args = parser.parse_args("--dataset EMNLP \
    --checkpoint_dir=output_dir_768_0_unsure \
    --output_dir=output_dir_768_0_unsure \
    --encoder_model_type=bert \
    --encoder_model_name_or_path=bert-base-cased \
    --decoder_model_type=gpt2 \
    --decoder_model_name_or_path=gpt2 \
    --train_data_file=../../yahoo/subdivided_large/train \
    --valid_data_file=../../yahoo/unlabelled_short/test.txt \
    --per_gpu_train_batch_size 12 \
    --block_size 100 \
    --max_seq_length 24 \
    --gloabl_step_eval 508523 \
    --latent_size 768 \
    --block_dim 100 \
    --n_layers 10 \
    --interval 50 \
    --epochs 200 \
    --finetune_decoder False \
    --lr_rl 1e-6 \
    --epochs_rl 100 \
    --batch_size_rl 32".split())
    
    print(args)

    global_step = args.gloabl_step_eval

    torch.backends.cudnn.deterministic = True
    #args.device = torch.device("cuda" if args.cuda else "cpu")
    #args.n_gpu = torch.cuda.device_count()
    args.device = torch.device("cuda:0")
    args.n_gpu=1
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)       
    
    args.encoder_model_type = args.encoder_model_type.lower()
    args.decoder_model_type = args.decoder_model_type.lower()

    output_encoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-encoder-{}'.format(global_step))
    output_decoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-decoder-{}'.format(global_step)) 
    checkpoints = [ [output_encoder_dir, output_decoder_dir] ]

    # Load a trained Encoder model and vocabulary that you have fine-tuned
    encoder_config_class, encoder_model_class, encoder_tokenizer_class = MODEL_CLASSES[args.encoder_model_type]
    model_encoder = encoder_model_class.from_pretrained(output_encoder_dir, latent_size=args.latent_size)
    tokenizer_encoder = encoder_tokenizer_class.from_pretrained(args.encoder_tokenizer_name if args.encoder_tokenizer_name else args.encoder_model_name_or_path, do_lower_case=args.do_lower_case)

    model_encoder.to(args.device)
    if args.block_size <= 0:
        args.block_size = tokenizer_encoder.max_len_single_sentence  # Our input block size will be the max possible for the model
    args.block_size = min(args.block_size, tokenizer_encoder.max_len_single_sentence)

    # Load a trained Decoder model and vocabulary that you have fine-tuned
    decoder_config_class, decoder_model_class, decoder_tokenizer_class = MODEL_CLASSES[args.decoder_model_type]
    model_decoder = decoder_model_class.from_pretrained(output_decoder_dir, latent_size=args.latent_size)
    tokenizer_decoder = decoder_tokenizer_class.from_pretrained(args.decoder_tokenizer_name if args.decoder_tokenizer_name else args.decoder_model_name_or_path, do_lower_case=args.do_lower_case)
    model_decoder.to(args.device)
    if args.block_size <= 0:
        args.block_size = tokenizer_decoder.max_len_single_sentence  # Our input block size will be the max possible for the model
    args.block_size = min(args.block_size, tokenizer_decoder.max_len_single_sentence)

    # Chunyuan: Add Padding token to GPT2
    special_tokens_dict = {'pad_token': '<PAD>', 'bos_token': '<BOS>', 'eos_token': '<EOS>'}
    num_added_toks = tokenizer_decoder.add_special_tokens(special_tokens_dict)
    logger.info('We have added {} tokens to GPT2'.format(num_added_toks))
    model_decoder.resize_token_embeddings(len(tokenizer_decoder))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
    assert tokenizer_decoder.pad_token == '<PAD>'

    #train_loader, num_txt = build_dataload_and_cache_examples(args, [tokenizer_encoder, tokenizer_decoder], num_txt) 
    generator = Generator(args.n_layers, args.block_dim,args.latent_size)
    critic = Critic(args.n_layers, args.block_dim,args.latent_size)

    if args.generator_dir!=None:
        logger.info("Loading generator and critic")
        generator.load_state_dict(torch.load(args.generator_dir+'/generator_'+str(args.gloabl_step_eval)+'.th'))
        critic.load_state_dict(torch.load(args.generator_dir+'/critic_'+str(args.gloabl_step_eval)+'.th'))

    g_optimizer = optim.Adam(generator.parameters(), lr=args.lr, betas=(0.5, 0.999))
    c_optimizer = optim.Adam(critic.parameters(), lr=args.lr, betas=(0.5, 0.999))
    
    if args.cuda:
        generator = generator.cuda()
        critic = critic.cuda()
    
    logger.info('G Parameters:{}'.format(sum([p.numel() for p in generator.parameters() if \
                                p.requires_grad])))
    logger.info('C Parameters:{}'.format(sum([p.numel() for p in critic.parameters() if \
                                p.requires_grad])))
    
    device = args.device
    
    best_bleu = 0
    reference = list()
    with(open(args.valid_data_file,"r")) as valid:
        for sents in valid:
            reference.append(sents.replace("\n", ""))
            
    for epoch in range(1, args.epochs + 1):
        
        #Insert GAN-BERT Code Here
        
        train_loader, num_txt = build_dataload_and_cache_examples(args, [tokenizer_encoder, tokenizer_decoder], num_txt) 
        
        print("Train classification discriminator")
        # ========================================
        #               Training
        # ========================================
        # Perform one full pass over the training set.
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch, args.epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        tr_g_loss = 0
        tr_d_loss = 0

        # Put the model into training mode.
        transformer.train() 
        #generator.train()
        discriminator.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):

            # Progress update every print_each_n_step batches.
            if step % print_each_n_step == 0 and not step == 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)

                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

            # Unpack this training batch from our dataloader. 
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)
            b_label_mask = batch[3].to(device)

            real_batch_size = b_input_ids.shape[0]

            # Encode real data in the Transformer
            model_outputs = transformer(b_input_ids, attention_mask=b_input_mask)
            hidden_states = model_outputs.last_hidden_state[:,0,:] 
            #hidden_states = model_outputs[-1]
            #print("  Number of real sentences (labelled and unlabelled): {}".format(len(hidden_states)))
            
            # Generate fake data that should have the same distribution of the ones
            # encoded by the transformer. 
            # First noisy input are used in input to the Generator
            fixed_noise = torch.Tensor(np.random.normal(0, 1, (real_batch_size, args.latent_size))).to(args.device)
            test_z_gb = generator(fixed_noise).data
            fake_sentences = rollout_test(model_decoder, test_z_gb, tokenizer_decoder, args.max_seq_length, real_batch_size, 0, 1)
            #print("  Number of generated sentences: {}".format(len(fake_sentences)))

            b_input_ids_fake, b_input_mask_fake = generate_data_fake(fake_sentences)
            model_outputs_fake = transformer(b_input_ids_fake, attention_mask=b_input_mask_fake)
            hidden_states_fake = model_outputs_fake.last_hidden_state[:,0,:] 
            #hidden_states_fake = model_outputs_fake[-1]

            #noise = torch.zeros(real_batch_size, noise_size, device=device).uniform_(0, 1)
            # Gnerate Fake data
            #gen_rep = generator(noise)
            #print("Length of generator output {}".format(len(gen_rep)))
            #print("Length of single generator output {}".format(len(gen_rep[0])))

            # Generate the output of the Discriminator for real and fake data.
            # First, we put together the output of the tranformer and the generator
            disciminator_input = torch.cat([hidden_states, hidden_states_fake], dim=0)
            # Then, we select the output of the disciminator
            features, logits, probs = discriminator(disciminator_input)

            # Finally, we separate the discriminator's output for the real and fake
            # data
            features_list = torch.split(features, real_batch_size)
            D_real_features = features_list[0]
            D_fake_features = features_list[1]

            logits_list = torch.split(logits, real_batch_size)
            D_real_logits = logits_list[0]
            D_fake_logits = logits_list[1]

            probs_list = torch.split(probs, real_batch_size)
            D_real_probs = probs_list[0]
            D_fake_probs = probs_list[1]

            #---------------------------------
            #  LOSS evaluation
            #---------------------------------
            # Generator's LOSS estimation
            g_loss_d = -1 * torch.mean(torch.log(1 - D_fake_probs[:,-1] + epsilon))
            g_feat_reg = torch.mean(torch.pow(torch.mean(D_real_features, dim=0) - torch.mean(D_fake_features, dim=0), 2))
            g_loss = g_loss_d + g_feat_reg

            # Disciminator's LOSS estimation
            logits = D_real_logits[:,0:-1]
            log_probs = F.log_softmax(logits, dim=-1)
            # The discriminator provides an output for labeled and unlabeled real data
            # so the loss evaluated for unlabeled data is ignored (masked)
            label2one_hot = torch.nn.functional.one_hot(b_labels, len(label_list))
            per_example_loss = -torch.sum(label2one_hot * log_probs, dim=-1)
            per_example_loss = torch.masked_select(per_example_loss, b_label_mask.to(device))
            labeled_example_count = per_example_loss.type(torch.float32).numel()

            # It may be the case that a batch does not contain labeled examples, 
            # so the "supervised loss" in this case is not evaluated
            if labeled_example_count == 0:
              D_L_Supervised = 0
            else:
              D_L_Supervised = torch.div(torch.sum(per_example_loss.to(device)), labeled_example_count)

            D_L_unsupervised1U = -1 * torch.mean(torch.log(1 - D_real_probs[:, -1] + epsilon))
            D_L_unsupervised2U = -1 * torch.mean(torch.log(D_fake_probs[:, -1] + epsilon))
            d_loss = D_L_Supervised + D_L_unsupervised1U + D_L_unsupervised2U

            #---------------------------------
            #  OPTIMIZATION
            #---------------------------------
            # Avoid gradient accumulation
            #gen_optimizer.zero_grad()
            dis_optimizer.zero_grad()

            # Calculate weigth updates
            # retain_graph=True is required since the underlying graph will be deleted after backward
            g_loss.backward(retain_graph=True)
            d_loss.backward() 

            # Apply modifications
            #gen_optimizer.step()
            dis_optimizer.step()

            # A detail log of the individual losses
            #print("{0:.4f}\t{1:.4f}\t{2:.4f}\t{3:.4f}\t{4:.4f}".
            #      format(D_L_Supervised, D_L_unsupervised1U, D_L_unsupervised2U,
            #             g_loss_d, g_feat_reg))

            # Save the losses to print them later
            tr_g_loss += g_loss.item()
            tr_d_loss += d_loss.item()

            # Update the learning rate with the scheduler
            if apply_scheduler:
              scheduler_d.step()
              #scheduler_g.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss_g = tr_g_loss / len(train_dataloader)
        avg_train_loss_d = tr_d_loss / len(train_dataloader)             

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss generetor: {0:.3f}".format(avg_train_loss_g))
        print("  Average training loss discriminator: {0:.3f}".format(avg_train_loss_d))
        print("  Training epcoh took: {:}".format(training_time))

        # ========================================
        #     TEST ON THE EVALUATION DATASET
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our test set.
        print("")
        print("Running Test...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        transformer.eval() #maybe redundant
        discriminator.eval()
        #generator.eval()

        # Tracking variables 
        total_test_accuracy = 0

        total_test_loss = 0
        nb_test_steps = 0

        all_preds = []
        all_labels_ids = []

        #loss
        nll_loss = torch.nn.CrossEntropyLoss(ignore_index=-1)

        # Evaluate data for one epoch
        for batch in test_dataloader:

            # Unpack this training batch from our dataloader. 
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[2].to(device)

            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
            with torch.no_grad():        
                model_outputs = transformer(b_input_ids, attention_mask=b_input_mask)
                hidden_states = model_outputs.last_hidden_state[:,0,:] 
                #hidden_states = model_outputs[-1]
                _, logits, probs = discriminator(hidden_states)
                ###log_probs = F.log_softmax(probs[:,1:], dim=-1)
                filtered_logits = logits[:,0:-1]
                # Accumulate the test loss.
                total_test_loss += nll_loss(filtered_logits, b_labels)

            # Accumulate the predictions and the input labels
            _, preds = torch.max(filtered_logits, 1)
            all_preds += preds.detach().cpu()
            all_labels_ids += b_labels.detach().cpu()

        # Report the final accuracy for this validation run.
        all_preds = torch.stack(all_preds).numpy()
        all_labels_ids = torch.stack(all_labels_ids).numpy()
        test_accuracy = np.sum(all_preds == all_labels_ids) / len(all_preds)
        print("  Accuracy: {0:.3f}".format(test_accuracy))

        # Calculate the average loss over all of the batches.
        avg_test_loss = total_test_loss / len(test_dataloader)
        avg_test_loss = avg_test_loss.item()

        # Measure how long the validation run took.
        test_time = format_time(time.time() - t0)

        print("  Test Loss: {0:.3f}".format(avg_test_loss))
        print("  Test took: {:}".format(test_time))

        # Record all statistics from this epoch.
        training_stats.append(
            {
                'epoch': epoch + 1,
                'Training Loss generator': avg_train_loss_g,
                'Training Loss discriminator': avg_train_loss_d,
                'Valid. Loss': avg_test_loss,
                'Valid. Accur.': test_accuracy,
                'Training Time': training_time,
                'Test Time': test_time
            }
        )

        accuracy_array.append(test_accuracy)
        
        #OPTAGAN Code
        
        g_loss, c_loss = train(epoch)

        data_test = list()
        for i in range(2):
            test_noise = torch.Tensor(np.random.normal(0, 1, (250, args.latent_size))).to(args.device)
            test_z = generator(test_noise).data
            new_sent = rollout_test(model_decoder, test_z, tokenizer_decoder, args.max_seq_length, 250, 0, 1)
            data_test.extend(new_sent)

        p_reference = random.sample(reference, 500)
        bleu = calc_blue_parallel_func(p_reference, data_test, 2, 500)
        b_bleu = calc_blue_parallel_func(data_test, p_reference, 2, 500)
        logger.info("Bleu-2:{:0.3f} | B-Bleu-2:{:0.3f}".format(bleu, b_bleu))
        
        print(bleu+b_bleu)
        if (bleu+b_bleu) > best_bleu:
            best_bleu = bleu + b_bleu
            logger.info('* Saving. Best Score:{:0.3f} | Bleu-2:{:0.3f} | B-Bleu-2:{:0.3f}'.format(best_bleu, bleu, b_bleu))
            torch.save(generator.state_dict(), args.output_dir+'/generator_'+str(args.gloabl_step_eval)+'.th')
            torch.save(critic.state_dict(), args.output_dir+'/critic_'+str(args.gloabl_step_eval)+'.th')
            
        

    if args.finetune_decoder: 
        logger.info("Loading generator")
        generator.load_state_dict(torch.load(args.output_dir+'/generator_'+str(args.gloabl_step_eval)+'.th'))
        
        model_decoder.train()
        generator.eval()
        dec_optimizer = optim.Adam(model_decoder.parameters(), lr=1e-4, betas=(0.5, 0.999))
        value_loss = nn.L1Loss()
        B = args.batch_size_rl
        total_scores = 0
        total_entropy = 0
        total_values = 0
        total_v_loss = 0
        for epoch_ in range(args.epochs_rl):
            if epoch_ == 200:
                # Finetune decoder after training of value head
                dec_optimizer = optim.Adam(model_decoder.parameters(), lr=args.lr_rl, betas=(0.5, 0.999))
            noise = torch.from_numpy(np.random.normal(0, 1, (B, args.latent_size))).float()
            noise = noise.to(args.device)
            z_fake = generator(noise)            
            sents, logprobs, values, entropy = rollout(model_decoder, z_fake, tokenizer_decoder, args.max_seq_length, B, 1)
            p_reference = random.sample(reference, 500)

            blue = []
            for i in sents:
                blue.append(calc_blue_parallel_func(p_reference, [i], 1, 0))

            values = torch.stack(values, dim=1)
            logprobs = torch.stack(logprobs, dim=1)
            entropy = torch.stack(entropy, dim=1)

            # Get tokens and mask of batch
            toks_gpt = [([50258] + tokenizer_decoder.encode(j) + [50259]) for j in sents]
            toks_gpt, mask = pad_seq(toks_gpt, tokenizer_decoder.encode("<PAD>")[0], values.size(1)+1)
            toks_gpt = torch.tensor(toks_gpt).to(args.device)
            mask = torch.tensor(mask).to(args.device)
              
            values = values * mask[:,1:]
            logprobs = logprobs * mask[:,1:]
            entropy = entropy * mask[:,1:]
            scores = torch.tensor(blue).to(args.device)
            # Get value loss
            v_loss = value_loss(torch.sum(values, dim=1), scores) 
              
            if epoch_ >= 200:
                R = 0
                rewards = []

                # Discount future rewards back to the present using gamma
                for j in range(len(values.tolist())):
                    R = 0
                    batch_rewards = []
                    for r in reversed(values.tolist()[j]):
                        R = r + 0.99 * R
                        batch_rewards.insert(0,R)
                    rewards.append(batch_rewards)

                # Penalizing low entropy states
                rewards = torch.FloatTensor(rewards).to(args.device)
                rewards = rewards + torch.log(torch.clamp(entropy,0.2,1))
                # Calculate loss
                d_loss = torch.sum(torch.mul(logprobs, rewards.detach()).mul(-1))
            else:
                d_loss = torch.tensor(0)

            # Backpropagate losses
            loss = v_loss + d_loss              
            dec_optimizer.zero_grad()              
            loss.backward()
            dec_optimizer.step()

            total_scores += torch.mean(scores).item()
            total_values += torch.mean(torch.sum(values,-1)).item()
            total_v_loss += v_loss.item()
            total_entropy += torch.mean(torch.mean(entropy,dim=1)).item()
            if (epoch_ % args.interval) == 0:
                logger.info("Epoch {}/{} | Value Loss:{} | Mean values:{} | Mean BLEU scores:{} | Mean Entropy: {}".format(epoch_, 
                args.epochs_rl, total_v_loss/args.interval, total_values/args.interval, total_scores/args.interval, total_entropy/args.interval))
                total_scores = 0
                total_values = 0
                total_v_loss = 0
                total_entropy = 0
        logger.info("Saving decoder")
        output_decoder_dir = os.path.join(args.output_dir, 'checkpoint-decoder-{}'.format(global_step))
        if not os.path.exists(output_decoder_dir):
            os.makedirs(output_decoder_dir)
        model_decoder.save_pretrained(output_decoder_dir)
        torch.save(args, os.path.join(output_decoder_dir, 'training_encoder_args.bin'))   

06/28/2022 00:50:29 - INFO - func.configuration_utils -   loading configuration file output_dir_768_0_unsure/checkpoint-encoder-508523/config.json
06/28/2022 00:50:29 - INFO - func.configuration_utils -   Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "torchscript": false,
  "type_vocab_size": 2,
  "vocab_size": 28996
}

06/28/2022 00:50:29 - INFO - func.modeling_utils -   loading weights file output_dir_768_0_unsure/checkpoint-encoder-508523/pytorch_model.bin


Namespace(batch_size_rl=32, block_dim=100, block_size=100, checkpoint_dir='output_dir_768_0_unsure', cuda=True, dataset='EMNLP', decoder_config_name='', decoder_model_name_or_path='gpt2', decoder_model_type='gpt2', decoder_tokenizer_name='', do_lower_case=False, encoder_config_name='', encoder_model_name_or_path='bert-base-cased', encoder_model_type='bert', encoder_tokenizer_name='', epochs=200, epochs_rl=100, finetune_decoder=True, generator_dir=None, gloabl_step_eval=508523, gp_lambda=10, interval=50, latent_size=768, length=20, lr=0.0001, lr_rl=1e-06, max_seq_length=24, n_layers=10, output_dir='output_dir_768_0_unsure', padding_text='', per_gpu_train_batch_size=12, prompt='', seed=0, train_data_file='../../yahoo/subdivided_large/train', use_philly=False, valid_data_file='../../yahoo/unlabelled_short/test.txt')


06/28/2022 00:50:33 - INFO - func.tokenization_utils -   loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /home/harry/.cache/torch/pytorch_transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1
06/28/2022 00:50:33 - INFO - func.configuration_utils -   loading configuration file output_dir_768_0_unsure/checkpoint-decoder-508523/config.json
06/28/2022 00:50:33 - INFO - func.configuration_utils -   Model config {
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "embd_pdrop": 0.1,
  "finetuning_task": null,
  "initializer_range": 0.02,
  "latent_size": 768,
  "layer_norm_epsilon": 1e-05,
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_layer": 12,
  "n_positions": 1024,
  "num_labels": 1,
  "output_attentions": false,
  "output_hidden_states": false,
  "pruned_heads": {},
  "resid_pdrop": 0.1,
  "summary_activation": nul

Train file used is number 1
../../yahoo/subdivided_large/train_1.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:13.
  Batch    20  of    120.    Elapsed: 0:00:26.
  Batch    30  of    120.    Elapsed: 0:00:42.
  Batch    40  of    120.    Elapsed: 0:00:57.
  Batch    50  of    120.    Elapsed: 0:01:08.
  Batch    60  of    120.    Elapsed: 0:01:22.
  Batch    70  of    120.    Elapsed: 0:01:33.
  Batch    80  of    120.    Elapsed: 0:01:47.
  Batch    90  of    120.    Elapsed: 0:02:00.
  Batch   100  of    120.    Elapsed: 0:02:12.
  Batch   110  of    120.    Elapsed: 0:02:24.

  Average training loss generetor: 0.347
  Average training loss discriminator: 3.965
  Training epcoh took: 0:02:34

Running Test...


06/28/2022 00:53:14 - INFO - __main__ -   Epoch: 1 | Batch: 0/10000 (0%) | G Loss: 0.273712 | C Loss: 1.674639
06/28/2022 00:53:14 - INFO - __main__ -   Text: ['']


  Accuracy: 0.133
  Test Loss: 2.357
  Test took: 0:00:00


06/28/2022 00:53:15 - INFO - __main__ -   Epoch: 1 | Batch: 600/10000 (6%) | G Loss: 154.376831 | C Loss: -98.768921
06/28/2022 00:53:15 - INFO - __main__ -   Text: ['']
06/28/2022 00:53:16 - INFO - __main__ -   Epoch: 1 | Batch: 1200/10000 (12%) | G Loss: 74.731468 | C Loss: -51.481277
06/28/2022 00:53:16 - INFO - __main__ -   Text: ['Falk a. Appl. Shea.making. Two.. M. known to in on 30.']
06/28/2022 00:53:17 - INFO - __main__ -   Epoch: 1 | Batch: 1800/10000 (18%) | G Loss: 73.271088 | C Loss: -55.373634
06/28/2022 00:53:18 - INFO - __main__ -   Text: [",-. to B... to. in.'.. and. Jewish.'s."]
06/28/2022 00:53:19 - INFO - __main__ -   Epoch: 1 | Batch: 2400/10000 (24%) | G Loss: 52.415874 | C Loss: -37.907539
06/28/2022 00:53:19 - INFO - __main__ -   Text: ['the. the. Kumarified the Balls 201448_ decline Eastern..']
06/28/2022 00:53:20 - INFO - __main__ -   Epoch: 1 | Batch: 3000/10000 (30%) | G Loss: 47.951366 | C Loss: -37.604858
06/28/2022 00:53:20 - INFO - __main__ -   Text: [''

0.3226229746114385
Train file used is number 2
../../yahoo/subdivided_large/train_2.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:18.
  Batch    20  of    120.    Elapsed: 0:00:36.
  Batch    30  of    120.    Elapsed: 0:00:53.
  Batch    40  of    120.    Elapsed: 0:01:11.
  Batch    50  of    120.    Elapsed: 0:01:29.
  Batch    60  of    120.    Elapsed: 0:01:47.
  Batch    70  of    120.    Elapsed: 0:02:05.
  Batch    80  of    120.    Elapsed: 0:02:23.
  Batch    90  of    120.    Elapsed: 0:02:41.
  Batch   100  of    120.    Elapsed: 0:02:58.
  Batch   110  of    120.    Elapsed: 0:03:16.

  Average training loss generetor: 0.543
  Average training loss discriminator: 3.463
  Training epcoh took: 0:03:34

Running Test...


06/28/2022 00:57:16 - INFO - __main__ -   Epoch: 2 | Batch: 0/10001 (0%) | G Loss: 9.731853 | C Loss: -9.938903


  Accuracy: 0.133
  Test Loss: 2.338
  Test took: 0:00:00


06/28/2022 00:57:16 - INFO - __main__ -   Text: ["Circle Snö Laws is to guarantee a strike from any one. or both. the. in.'road."]
06/28/2022 00:57:17 - INFO - __main__ -   Epoch: 2 | Batch: 600/10001 (6%) | G Loss: 8.173342 | C Loss: -8.331749
06/28/2022 00:57:17 - INFO - __main__ -   Text: ['People displayed capital).']
06/28/2022 00:57:18 - INFO - __main__ -   Epoch: 2 | Batch: 1200/10001 (12%) | G Loss: 7.877058 | C Loss: -8.514096
06/28/2022 00:57:18 - INFO - __main__ -   Text: ['arm to h into his We Sell Merden.']
06/28/2022 00:57:19 - INFO - __main__ -   Epoch: 2 | Batch: 1800/10001 (18%) | G Loss: 6.322692 | C Loss: -7.098281
06/28/2022 00:57:19 - INFO - __main__ -   Text: ['rum that we defeat Correct as a Criminal.']
06/28/2022 00:57:20 - INFO - __main__ -   Epoch: 2 | Batch: 2400/10001 (24%) | G Loss: 5.164805 | C Loss: -5.984857
06/28/2022 00:57:20 - INFO - __main__ -   Text: ['He he training with the amateur work he he they live.']
06/28/2022 00:57:21 - INFO - __main__ -   

0.41040155224083985
Train file used is number 3
../../yahoo/subdivided_large/train_3.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:12.
  Batch    20  of    120.    Elapsed: 0:00:26.
  Batch    30  of    120.    Elapsed: 0:00:41.
  Batch    40  of    120.    Elapsed: 0:00:56.
  Batch    50  of    120.    Elapsed: 0:01:11.
  Batch    60  of    120.    Elapsed: 0:01:25.
  Batch    70  of    120.    Elapsed: 0:01:39.
  Batch    80  of    120.    Elapsed: 0:01:54.
  Batch    90  of    120.    Elapsed: 0:02:08.
  Batch   100  of    120.    Elapsed: 0:02:23.
  Batch   110  of    120.    Elapsed: 0:02:35.

  Average training loss generetor: 0.674
  Average training loss discriminator: 3.083
  Training epcoh took: 0:02:49

Running Test...


06/28/2022 01:00:30 - INFO - __main__ -   Epoch: 3 | Batch: 0/10001 (0%) | G Loss: 4.620041 | C Loss: -4.130612
06/28/2022 01:00:30 - INFO - __main__ -   Text: ['makes "Jesus-" .']


  Accuracy: 0.185
  Test Loss: 2.297
  Test took: 0:00:00


06/28/2022 01:00:31 - INFO - __main__ -   Epoch: 3 | Batch: 600/10001 (6%) | G Loss: 4.340141 | C Loss: -3.689429
06/28/2022 01:00:31 - INFO - __main__ -   Text: ['From ethernet and MTT.']
06/28/2022 01:00:32 - INFO - __main__ -   Epoch: 3 | Batch: 1200/10001 (12%) | G Loss: 4.158864 | C Loss: -3.812348
06/28/2022 01:00:32 - INFO - __main__ -   Text: ['This kind of pronunciation may say: The magnificent :']
06/28/2022 01:00:33 - INFO - __main__ -   Epoch: 3 | Batch: 1800/10001 (18%) | G Loss: 4.775768 | C Loss: -3.935961
06/28/2022 01:00:33 - INFO - __main__ -   Text: ['is one of the most very profitable computer games out there.']
06/28/2022 01:00:34 - INFO - __main__ -   Epoch: 3 | Batch: 2400/10001 (24%) | G Loss: 4.668319 | C Loss: -3.826586
06/28/2022 01:00:34 - INFO - __main__ -   Text: ['Student Text is the current most famous book.']
06/28/2022 01:00:35 - INFO - __main__ -   Epoch: 3 | Batch: 3000/10001 (30%) | G Loss: 4.508978 | C Loss: -4.176436
06/28/2022 01:00:35 - INFO - _

0.4385859032969835
Train file used is number 4
../../yahoo/subdivided_large/train_4.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:14.
  Batch    20  of    120.    Elapsed: 0:00:28.
  Batch    30  of    120.    Elapsed: 0:00:42.
  Batch    40  of    120.    Elapsed: 0:00:56.
  Batch    50  of    120.    Elapsed: 0:01:09.
  Batch    60  of    120.    Elapsed: 0:01:23.
  Batch    70  of    120.    Elapsed: 0:01:38.
  Batch    80  of    120.    Elapsed: 0:01:53.
  Batch    90  of    120.    Elapsed: 0:02:07.
  Batch   100  of    120.    Elapsed: 0:02:20.
  Batch   110  of    120.    Elapsed: 0:02:34.

  Average training loss generetor: 0.689
  Average training loss discriminator: 2.868
  Training epcoh took: 0:02:47

Running Test...


06/28/2022 01:03:42 - INFO - __main__ -   Epoch: 4 | Batch: 0/10001 (0%) | G Loss: 4.529090 | C Loss: -3.749632
06/28/2022 01:03:42 - INFO - __main__ -   Text: ['examples".']


  Accuracy: 0.210
  Test Loss: 2.257
  Test took: 0:00:00


06/28/2022 01:03:43 - INFO - __main__ -   Epoch: 4 | Batch: 600/10001 (6%) | G Loss: 4.449077 | C Loss: -4.016423
06/28/2022 01:03:43 - INFO - __main__ -   Text: ['button pressing ectopus lottery exchopus right buy<|endoftext|> She is not fish .']
06/28/2022 01:03:44 - INFO - __main__ -   Epoch: 4 | Batch: 1200/10001 (12%) | G Loss: 4.584173 | C Loss: -3.994284
06/28/2022 01:03:44 - INFO - __main__ -   Text: ['barelyYou (doesnnit) we words."']
06/28/2022 01:03:45 - INFO - __main__ -   Epoch: 4 | Batch: 1800/10001 (18%) | G Loss: 4.874452 | C Loss: -4.203463
06/28/2022 01:03:45 - INFO - __main__ -   Text: ['Reddit claims that fish is quiet and lean.']
06/28/2022 01:03:46 - INFO - __main__ -   Epoch: 4 | Batch: 2400/10001 (24%) | G Loss: 4.629632 | C Loss: -4.190031
06/28/2022 01:03:46 - INFO - __main__ -   Text: ["Univbot li It's true if it can walk."]
06/28/2022 01:03:47 - INFO - __main__ -   Epoch: 4 | Batch: 3000/10001 (30%) | G Loss: 4.829841 | C Loss: -4.524186
06/28/2022 01:03:47 

0.4511395839191315
Train file used is number 5
../../yahoo/subdivided_large/train_5.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:14.
  Batch    20  of    120.    Elapsed: 0:00:28.
  Batch    30  of    120.    Elapsed: 0:00:42.
  Batch    40  of    120.    Elapsed: 0:00:55.
  Batch    50  of    120.    Elapsed: 0:01:09.
  Batch    60  of    120.    Elapsed: 0:01:23.
  Batch    70  of    120.    Elapsed: 0:01:36.
  Batch    80  of    120.    Elapsed: 0:01:50.
  Batch    90  of    120.    Elapsed: 0:02:04.
  Batch   100  of    120.    Elapsed: 0:02:17.
  Batch   110  of    120.    Elapsed: 0:02:32.

  Average training loss generetor: 0.695
  Average training loss discriminator: 2.626
  Training epcoh took: 0:02:45

Running Test...


06/28/2022 01:06:53 - INFO - __main__ -   Epoch: 5 | Batch: 0/10001 (0%) | G Loss: 4.357077 | C Loss: -3.894767
06/28/2022 01:06:53 - INFO - __main__ -   Text: ['aisles."']


  Accuracy: 0.247
  Test Loss: 2.214
  Test took: 0:00:00


06/28/2022 01:06:54 - INFO - __main__ -   Epoch: 5 | Batch: 600/10001 (6%) | G Loss: 4.318542 | C Loss: -3.710934
06/28/2022 01:06:54 - INFO - __main__ -   Text: ['"Crawfish appears strange."']
06/28/2022 01:06:55 - INFO - __main__ -   Epoch: 5 | Batch: 1200/10001 (12%) | G Loss: 4.520477 | C Loss: -3.616540
06/28/2022 01:06:55 - INFO - __main__ -   Text: ['These are called an iso.']
06/28/2022 01:06:56 - INFO - __main__ -   Epoch: 5 | Batch: 1800/10001 (18%) | G Loss: 4.798465 | C Loss: -4.036019
06/28/2022 01:06:56 - INFO - __main__ -   Text: ['For a true Britishian, Mountaineres is strongestly.']
06/28/2022 01:06:57 - INFO - __main__ -   Epoch: 5 | Batch: 2400/10001 (24%) | G Loss: 4.324715 | C Loss: -3.735215
06/28/2022 01:06:57 - INFO - __main__ -   Text: ['"That\'s her speed" may not seem like much to someone who owns Infos.']
06/28/2022 01:06:58 - INFO - __main__ -   Epoch: 5 | Batch: 3000/10001 (30%) | G Loss: 4.629351 | C Loss: -3.914187
06/28/2022 01:06:58 - INFO - __main__ -

0.4629196201065815
Train file used is number 6
../../yahoo/subdivided_large/train_6.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:13.
  Batch    20  of    120.    Elapsed: 0:00:27.
  Batch    30  of    120.    Elapsed: 0:00:41.
  Batch    40  of    120.    Elapsed: 0:00:56.
  Batch    50  of    120.    Elapsed: 0:01:09.
  Batch    60  of    120.    Elapsed: 0:01:24.
  Batch    70  of    120.    Elapsed: 0:01:38.
  Batch    80  of    120.    Elapsed: 0:01:52.
  Batch    90  of    120.    Elapsed: 0:02:07.
  Batch   100  of    120.    Elapsed: 0:02:21.
  Batch   110  of    120.    Elapsed: 0:02:36.

  Average training loss generetor: 0.697
  Average training loss discriminator: 2.369
  Training epcoh took: 0:02:49

Running Test...


06/28/2022 01:10:07 - INFO - __main__ -   Epoch: 6 | Batch: 0/10001 (0%) | G Loss: 4.222330 | C Loss: -3.565387
06/28/2022 01:10:07 - INFO - __main__ -   Text: ['breeds .']


  Accuracy: 0.265
  Test Loss: 2.162
  Test took: 0:00:00


06/28/2022 01:10:08 - INFO - __main__ -   Epoch: 6 | Batch: 600/10001 (6%) | G Loss: 4.613758 | C Loss: -3.678281
06/28/2022 01:10:08 - INFO - __main__ -   Text: ['Exiting this stage on bird is called free flow.']
06/28/2022 01:10:09 - INFO - __main__ -   Epoch: 6 | Batch: 1200/10001 (12%) | G Loss: 3.818054 | C Loss: -3.221541
06/28/2022 01:10:09 - INFO - __main__ -   Text: ['"Here goes another one," from a parody title.']
06/28/2022 01:10:10 - INFO - __main__ -   Epoch: 6 | Batch: 1800/10001 (18%) | G Loss: 3.995018 | C Loss: -3.366353
06/28/2022 01:10:10 - INFO - __main__ -   Text: ['They are mostly dolls of evil!']
06/28/2022 01:10:11 - INFO - __main__ -   Epoch: 6 | Batch: 2400/10001 (24%) | G Loss: 4.531995 | C Loss: -3.518365
06/28/2022 01:10:11 - INFO - __main__ -   Text: ['Just what you think, if you send me legs?!']
06/28/2022 01:10:12 - INFO - __main__ -   Epoch: 6 | Batch: 3000/10001 (30%) | G Loss: 3.899805 | C Loss: -3.294541
06/28/2022 01:10:12 - INFO - __main__ -   Text

0.45643042981607895
Train file used is number 7
../../yahoo/subdivided_large/train_7.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:15.
  Batch    20  of    120.    Elapsed: 0:00:30.
  Batch    30  of    120.    Elapsed: 0:00:45.
  Batch    40  of    120.    Elapsed: 0:01:00.
  Batch    50  of    120.    Elapsed: 0:01:15.
  Batch    60  of    120.    Elapsed: 0:01:31.
  Batch    70  of    120.    Elapsed: 0:01:45.
  Batch    80  of    120.    Elapsed: 0:02:01.
  Batch    90  of    120.    Elapsed: 0:02:16.
  Batch   100  of    120.    Elapsed: 0:02:30.
  Batch   110  of    120.    Elapsed: 0:02:45.

  Average training loss generetor: 0.701
  Average training loss discriminator: 2.079
  Training epcoh took: 0:02:59

Running Test...


06/28/2022 01:13:31 - INFO - __main__ -   Epoch: 7 | Batch: 0/10001 (0%) | G Loss: 3.807411 | C Loss: -3.225398
06/28/2022 01:13:31 - INFO - __main__ -   Text: ['They use the concept of digital money.']


  Accuracy: 0.270
  Test Loss: 2.132
  Test took: 0:00:00


06/28/2022 01:13:32 - INFO - __main__ -   Epoch: 7 | Batch: 600/10001 (6%) | G Loss: 3.468308 | C Loss: -2.857921
06/28/2022 01:13:32 - INFO - __main__ -   Text: ['To set a new record, Picctick needs to be on k-1.']
06/28/2022 01:13:33 - INFO - __main__ -   Epoch: 7 | Batch: 1200/10001 (12%) | G Loss: 3.210629 | C Loss: -3.062437
06/28/2022 01:13:33 - INFO - __main__ -   Text: ['In the US, you cannot have a spindle for your network.']
06/28/2022 01:13:34 - INFO - __main__ -   Epoch: 7 | Batch: 1800/10001 (18%) | G Loss: 3.782075 | C Loss: -3.122443
06/28/2022 01:13:34 - INFO - __main__ -   Text: ['Which drill is hiring now or will stay the same!']
06/28/2022 01:13:35 - INFO - __main__ -   Epoch: 7 | Batch: 2400/10001 (24%) | G Loss: 3.209568 | C Loss: -2.808022
06/28/2022 01:13:35 - INFO - __main__ -   Text: ['"What they\' written isn\'t really start HemProMed!"']
06/28/2022 01:13:36 - INFO - __main__ -   Epoch: 7 | Batch: 3000/10001 (30%) | G Loss: 3.444105 | C Loss: -2.956007
06/28/2

0.43944995800196185
Train file used is number 8
../../yahoo/subdivided_large/train_8.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:14.
  Batch    20  of    120.    Elapsed: 0:00:27.
  Batch    30  of    120.    Elapsed: 0:00:42.
  Batch    40  of    120.    Elapsed: 0:00:56.
  Batch    50  of    120.    Elapsed: 0:01:10.
  Batch    60  of    120.    Elapsed: 0:01:25.
  Batch    70  of    120.    Elapsed: 0:01:40.
  Batch    80  of    120.    Elapsed: 0:01:54.
  Batch    90  of    120.    Elapsed: 0:02:08.
  Batch   100  of    120.    Elapsed: 0:02:22.
  Batch   110  of    120.    Elapsed: 0:02:35.

  Average training loss generetor: 0.699
  Average training loss discriminator: 1.818
  Training epcoh took: 0:02:50

Running Test...


06/28/2022 01:16:48 - INFO - __main__ -   Epoch: 8 | Batch: 0/10001 (0%) | G Loss: 3.442065 | C Loss: -2.651783
06/28/2022 01:16:48 - INFO - __main__ -   Text: ['WTF?']


  Accuracy: 0.287
  Test Loss: 2.076
  Test took: 0:00:00


06/28/2022 01:16:49 - INFO - __main__ -   Epoch: 8 | Batch: 600/10001 (6%) | G Loss: 3.608249 | C Loss: -2.899277
06/28/2022 01:16:49 - INFO - __main__ -   Text: ['Kestrel is stronger on the random listeners.']
06/28/2022 01:16:50 - INFO - __main__ -   Epoch: 8 | Batch: 1200/10001 (12%) | G Loss: 3.427771 | C Loss: -2.840657
06/28/2022 01:16:50 - INFO - __main__ -   Text: ['Mimir makes one horrible plan for a time twenty-five years.']
06/28/2022 01:16:51 - INFO - __main__ -   Epoch: 8 | Batch: 1800/10001 (18%) | G Loss: 3.187002 | C Loss: -2.744500
06/28/2022 01:16:51 - INFO - __main__ -   Text: ['Food says, "Type one!']
06/28/2022 01:16:52 - INFO - __main__ -   Epoch: 8 | Batch: 2400/10001 (24%) | G Loss: 3.243271 | C Loss: -2.676908
06/28/2022 01:16:52 - INFO - __main__ -   Text: ['These refer to comparisons between Washington University and Cambridge University.']
06/28/2022 01:16:53 - INFO - __main__ -   Epoch: 8 | Batch: 3000/10001 (30%) | G Loss: 3.417011 | C Loss: -2.894614
06/2

0.46054437534804
Train file used is number 9
../../yahoo/subdivided_large/train_9.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:15.
  Batch    20  of    120.    Elapsed: 0:00:30.
  Batch    30  of    120.    Elapsed: 0:00:44.
  Batch    40  of    120.    Elapsed: 0:01:00.
  Batch    50  of    120.    Elapsed: 0:01:16.
  Batch    60  of    120.    Elapsed: 0:01:32.
  Batch    70  of    120.    Elapsed: 0:01:47.
  Batch    80  of    120.    Elapsed: 0:02:02.
  Batch    90  of    120.    Elapsed: 0:02:16.
  Batch   100  of    120.    Elapsed: 0:02:32.
  Batch   110  of    120.    Elapsed: 0:02:47.

  Average training loss generetor: 0.702
  Average training loss discriminator: 1.572
  Training epcoh took: 0:03:02

Running Test...


06/28/2022 01:20:16 - INFO - __main__ -   Epoch: 9 | Batch: 0/10001 (0%) | G Loss: 3.132714 | C Loss: -2.520920
06/28/2022 01:20:16 - INFO - __main__ -   Text: ['Microphones can also make one aware of these.']


  Accuracy: 0.278
  Test Loss: 2.074
  Test took: 0:00:00


06/28/2022 01:20:17 - INFO - __main__ -   Epoch: 9 | Batch: 600/10001 (6%) | G Loss: 2.838004 | C Loss: -2.457885
06/28/2022 01:20:17 - INFO - __main__ -   Text: ['Stu slams the brakes while J.Bugs will fast.']
06/28/2022 01:20:18 - INFO - __main__ -   Epoch: 9 | Batch: 1200/10001 (12%) | G Loss: 2.882060 | C Loss: -2.448639
06/28/2022 01:20:18 - INFO - __main__ -   Text: ['Other suitable names include:']
06/28/2022 01:20:19 - INFO - __main__ -   Epoch: 9 | Batch: 1800/10001 (18%) | G Loss: 3.025948 | C Loss: -2.496298
06/28/2022 01:20:19 - INFO - __main__ -   Text: ['About this purbt include "Together you are going to walk", that shows humanity.']
06/28/2022 01:20:20 - INFO - __main__ -   Epoch: 9 | Batch: 2400/10001 (24%) | G Loss: 2.875284 | C Loss: -2.465364
06/28/2022 01:20:20 - INFO - __main__ -   Text: ["It has Mary Bennet's works."]
06/28/2022 01:20:21 - INFO - __main__ -   Epoch: 9 | Batch: 3000/10001 (30%) | G Loss: 2.809541 | C Loss: -2.417715
06/28/2022 01:20:21 - INFO - __

0.5307288150795214
Train file used is number 10
../../yahoo/subdivided_large/train_10.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:16.
  Batch    20  of    120.    Elapsed: 0:00:32.
  Batch    30  of    120.    Elapsed: 0:00:48.
  Batch    40  of    120.    Elapsed: 0:01:04.
  Batch    50  of    120.    Elapsed: 0:01:19.
  Batch    60  of    120.    Elapsed: 0:01:35.
  Batch    70  of    120.    Elapsed: 0:01:51.
  Batch    80  of    120.    Elapsed: 0:02:06.
  Batch    90  of    120.    Elapsed: 0:02:22.
  Batch   100  of    120.    Elapsed: 0:02:38.
  Batch   110  of    120.    Elapsed: 0:02:54.

  Average training loss generetor: 0.702
  Average training loss discriminator: 1.387
  Training epcoh took: 0:03:10

Running Test...


06/28/2022 01:23:52 - INFO - __main__ -   Epoch: 10 | Batch: 0/10001 (0%) | G Loss: 2.754909 | C Loss: -2.349859
06/28/2022 01:23:52 - INFO - __main__ -   Text: ["says the book's author."]


  Accuracy: 0.320
  Test Loss: 2.037
  Test took: 0:00:00


06/28/2022 01:23:53 - INFO - __main__ -   Epoch: 10 | Batch: 600/10001 (6%) | G Loss: 2.536390 | C Loss: -2.246959
06/28/2022 01:23:53 - INFO - __main__ -   Text: ['By virtue of being very likable.']
06/28/2022 01:23:54 - INFO - __main__ -   Epoch: 10 | Batch: 1200/10001 (12%) | G Loss: 2.638145 | C Loss: -2.232198
06/28/2022 01:23:54 - INFO - __main__ -   Text: ['There are several books on it [La SpeciMazet].']
06/28/2022 01:23:55 - INFO - __main__ -   Epoch: 10 | Batch: 1800/10001 (18%) | G Loss: 2.741398 | C Loss: -2.320937
06/28/2022 01:23:55 - INFO - __main__ -   Text: ["Workers' jobs are like workers': Beehive is afraid of their own form of distress."]
06/28/2022 01:23:56 - INFO - __main__ -   Epoch: 10 | Batch: 2400/10001 (24%) | G Loss: 2.537118 | C Loss: -2.193499
06/28/2022 01:23:56 - INFO - __main__ -   Text: ['According to Wikipedia, the most popular brahmin is "RA".']
06/28/2022 01:23:57 - INFO - __main__ -   Epoch: 10 | Batch: 3000/10001 (30%) | G Loss: 2.445799 | C Loss:

0.5246343765730772
Train file used is number 11
../../yahoo/subdivided_large/train_11.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:16.
  Batch    20  of    120.    Elapsed: 0:00:32.
  Batch    30  of    120.    Elapsed: 0:00:49.
  Batch    40  of    120.    Elapsed: 0:01:06.
  Batch    50  of    120.    Elapsed: 0:01:22.
  Batch    60  of    120.    Elapsed: 0:01:38.
  Batch    70  of    120.    Elapsed: 0:01:55.
  Batch    80  of    120.    Elapsed: 0:02:10.
  Batch    90  of    120.    Elapsed: 0:02:26.
  Batch   100  of    120.    Elapsed: 0:02:42.
  Batch   110  of    120.    Elapsed: 0:02:58.

  Average training loss generetor: 0.704
  Average training loss discriminator: 1.225
  Training epcoh took: 0:03:15

Running Test...


06/28/2022 01:27:32 - INFO - __main__ -   Epoch: 11 | Batch: 0/10001 (0%) | G Loss: 2.318799 | C Loss: -2.049134
06/28/2022 01:27:32 - INFO - __main__ -   Text: ['"ai".']


  Accuracy: 0.300
  Test Loss: 2.047
  Test took: 0:00:00


06/28/2022 01:27:33 - INFO - __main__ -   Epoch: 11 | Batch: 600/10001 (6%) | G Loss: 2.216573 | C Loss: -2.029444
06/28/2022 01:27:33 - INFO - __main__ -   Text: ['Performative is a book that tells interesting but unsettling stories.']
06/28/2022 01:27:34 - INFO - __main__ -   Epoch: 11 | Batch: 1200/10001 (12%) | G Loss: 2.288198 | C Loss: -1.801602
06/28/2022 01:27:34 - INFO - __main__ -   Text: ['The difference between Invincible and Insane is Personality.']
06/28/2022 01:27:35 - INFO - __main__ -   Epoch: 11 | Batch: 1800/10001 (18%) | G Loss: 2.310474 | C Loss: -1.888917
06/28/2022 01:27:35 - INFO - __main__ -   Text: ['There is a turning point in sports within statistic theory "I\'m an expert".']
06/28/2022 01:27:36 - INFO - __main__ -   Epoch: 11 | Batch: 2400/10001 (24%) | G Loss: 2.187151 | C Loss: -1.791669
06/28/2022 01:27:36 - INFO - __main__ -   Text: ['Criticism towards people with AGI and others with TRI.']
06/28/2022 01:27:37 - INFO - __main__ -   Epoch: 11 | Batch: 30

0.4911365195798898
Train file used is number 12
../../yahoo/subdivided_large/train_12.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:14.
  Batch    20  of    120.    Elapsed: 0:00:29.
  Batch    30  of    120.    Elapsed: 0:00:43.
  Batch    40  of    120.    Elapsed: 0:00:58.
  Batch    50  of    120.    Elapsed: 0:01:13.
  Batch    60  of    120.    Elapsed: 0:01:28.
  Batch    70  of    120.    Elapsed: 0:01:44.
  Batch    80  of    120.    Elapsed: 0:01:58.
  Batch    90  of    120.    Elapsed: 0:02:14.
  Batch   100  of    120.    Elapsed: 0:02:28.
  Batch   110  of    120.    Elapsed: 0:02:42.

  Average training loss generetor: 0.704
  Average training loss discriminator: 1.112
  Training epcoh took: 0:02:57

Running Test...


06/28/2022 01:30:55 - INFO - __main__ -   Epoch: 12 | Batch: 0/10001 (0%) | G Loss: 2.266788 | C Loss: -1.771801
06/28/2022 01:30:55 - INFO - __main__ -   Text: ["While I'm not familiar with the kaza yader, I am really interested."]


  Accuracy: 0.335
  Test Loss: 2.063
  Test took: 0:00:00


06/28/2022 01:30:56 - INFO - __main__ -   Epoch: 12 | Batch: 600/10001 (6%) | G Loss: 2.120735 | C Loss: -1.685886
06/28/2022 01:30:56 - INFO - __main__ -   Text: ["At such a time there is also the cause of 'Bird Fever'."]
06/28/2022 01:30:57 - INFO - __main__ -   Epoch: 12 | Batch: 1200/10001 (12%) | G Loss: 2.075426 | C Loss: -1.653611
06/28/2022 01:30:58 - INFO - __main__ -   Text: ['"Lawrence", referred to as the "law good-doing judges".']
06/28/2022 01:30:58 - INFO - __main__ -   Epoch: 12 | Batch: 1800/10001 (18%) | G Loss: 2.118189 | C Loss: -1.747878
06/28/2022 01:30:59 - INFO - __main__ -   Text: ['", which can be found in this Broadway show.']
06/28/2022 01:31:00 - INFO - __main__ -   Epoch: 12 | Batch: 2400/10001 (24%) | G Loss: 1.951202 | C Loss: -1.582525
06/28/2022 01:31:00 - INFO - __main__ -   Text: ["'Macphersonism' is an imaginary one."]
06/28/2022 01:31:01 - INFO - __main__ -   Epoch: 12 | Batch: 3000/10001 (30%) | G Loss: 2.223698 | C Loss: -1.758448
06/28/2022 01:3

0.5046024794987829
Train file used is number 13
../../yahoo/subdivided_large/train_13.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:16.
  Batch    20  of    120.    Elapsed: 0:00:32.
  Batch    30  of    120.    Elapsed: 0:00:48.
  Batch    40  of    120.    Elapsed: 0:01:03.
  Batch    50  of    120.    Elapsed: 0:01:18.
  Batch    60  of    120.    Elapsed: 0:01:33.
  Batch    70  of    120.    Elapsed: 0:01:49.
  Batch    80  of    120.    Elapsed: 0:02:03.
  Batch    90  of    120.    Elapsed: 0:02:18.
  Batch   100  of    120.    Elapsed: 0:02:34.
  Batch   110  of    120.    Elapsed: 0:02:49.

  Average training loss generetor: 0.703
  Average training loss discriminator: 1.035
  Training epcoh took: 0:03:04

Running Test...


06/28/2022 01:34:25 - INFO - __main__ -   Epoch: 13 | Batch: 0/10001 (0%) | G Loss: 1.791413 | C Loss: -1.384568
06/28/2022 01:34:26 - INFO - __main__ -   Text: ["Let's say in a Break the fuck to have a conversation."]


  Accuracy: 0.345
  Test Loss: 2.113
  Test took: 0:00:00


06/28/2022 01:34:27 - INFO - __main__ -   Epoch: 13 | Batch: 600/10001 (6%) | G Loss: 1.786990 | C Loss: -1.443233
06/28/2022 01:34:27 - INFO - __main__ -   Text: ['That\'s how Steve kills himself."']
06/28/2022 01:34:28 - INFO - __main__ -   Epoch: 13 | Batch: 1200/10001 (12%) | G Loss: 1.729210 | C Loss: -1.416513
06/28/2022 01:34:28 - INFO - __main__ -   Text: ['In my books, "The Cage Body" revolves on a morality.']
06/28/2022 01:34:29 - INFO - __main__ -   Epoch: 13 | Batch: 1800/10001 (18%) | G Loss: 1.953499 | C Loss: -1.557848
06/28/2022 01:34:29 - INFO - __main__ -   Text: ['This is not a quiz though.']
06/28/2022 01:34:30 - INFO - __main__ -   Epoch: 13 | Batch: 2400/10001 (24%) | G Loss: 1.864702 | C Loss: -1.510411
06/28/2022 01:34:30 - INFO - __main__ -   Text: ['Many think it is a human male love song.']
06/28/2022 01:34:31 - INFO - __main__ -   Epoch: 13 | Batch: 3000/10001 (30%) | G Loss: 2.002443 | C Loss: -1.592843
06/28/2022 01:34:31 - INFO - __main__ -   Text: ['The 

0.5003474560911353
Train file used is number 14
../../yahoo/subdivided_large/train_14.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:15.
  Batch    20  of    120.    Elapsed: 0:00:31.
  Batch    30  of    120.    Elapsed: 0:00:47.
  Batch    40  of    120.    Elapsed: 0:01:02.
  Batch    50  of    120.    Elapsed: 0:01:18.
  Batch    60  of    120.    Elapsed: 0:01:34.
  Batch    70  of    120.    Elapsed: 0:01:49.
  Batch    80  of    120.    Elapsed: 0:02:03.
  Batch    90  of    120.    Elapsed: 0:02:19.
  Batch   100  of    120.    Elapsed: 0:02:35.
  Batch   110  of    120.    Elapsed: 0:02:50.

  Average training loss generetor: 0.704
  Average training loss discriminator: 0.970
  Training epcoh took: 0:03:05

Running Test...


06/28/2022 01:37:57 - INFO - __main__ -   Epoch: 14 | Batch: 0/10001 (0%) | G Loss: 1.783701 | C Loss: -1.254313
06/28/2022 01:37:57 - INFO - __main__ -   Text: ['Some academics believe ! <PAD> ).']


  Accuracy: 0.347
  Test Loss: 2.135
  Test took: 0:00:00


06/28/2022 01:37:58 - INFO - __main__ -   Epoch: 14 | Batch: 600/10001 (6%) | G Loss: 1.818893 | C Loss: -1.254474
06/28/2022 01:37:58 - INFO - __main__ -   Text: ['It is also known as the hunch test.']
06/28/2022 01:37:59 - INFO - __main__ -   Epoch: 14 | Batch: 1200/10001 (12%) | G Loss: 1.520453 | C Loss: -1.176982
06/28/2022 01:37:59 - INFO - __main__ -   Text: ['This is something that works in both worlds."']
06/28/2022 01:38:00 - INFO - __main__ -   Epoch: 14 | Batch: 1800/10001 (18%) | G Loss: 1.647090 | C Loss: -1.359046
06/28/2022 01:38:00 - INFO - __main__ -   Text: ['']
06/28/2022 01:38:01 - INFO - __main__ -   Epoch: 14 | Batch: 2400/10001 (24%) | G Loss: 1.631419 | C Loss: -1.207303
06/28/2022 01:38:01 - INFO - __main__ -   Text: ['As an example, notice how "Horrible Shit moves on".']
06/28/2022 01:38:02 - INFO - __main__ -   Epoch: 14 | Batch: 3000/10001 (30%) | G Loss: 1.560876 | C Loss: -1.192172
06/28/2022 01:38:02 - INFO - __main__ -   Text: ['"Olympia is purely about

0.4324452656977462
Train file used is number 15
../../yahoo/subdivided_large/train_15.txt
Train classification discriminator

Training...
  Batch    10  of    120.    Elapsed: 0:00:15.
  Batch    20  of    120.    Elapsed: 0:00:30.
  Batch    30  of    120.    Elapsed: 0:00:45.
  Batch    40  of    120.    Elapsed: 0:01:00.


In [None]:
print(max(accuracy_array))

In [None]:
print(accuracy_array[-1])

In [None]:
plt.figure(figsize=(8,6))
plt.plot(accuracy_array)
plt.title('OPTAGAN-GAN-BERT Performance over Training Epochs', fontsize=20)
plt.xlabel('Epochs', fontsize=16)
plt.ylabel('Accuracy', fontsize=16)
plt.xlim(0,200)
plt.show

In [None]:
df_to_save = pd.DataFrame(accuracy_array)
df_to_save.to_csv('accuracy_array_optagan_yahoo_nt_rt_768_0.csv')

In [None]:
# #Generating Sentences
# from __future__ import absolute_import, division, print_function, unicode_literals
# import argparse

# import logging
# import torch
# import torch.nn as nn
# import numpy as np

# from modules.gan import Generator

# import glob
# import os
# import pickle
# import random

# import torch.nn.functional as F
# from tqdm import tqdm, trange

# from func import GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig, BertConfig
# from func import GPT2LMHeadModel, GPT2Tokenizer, GPT2ForLatentConnector, GPT2ForLatentConnectorValueHead
# from func import OpenAIGPTLMHeadModel, OpenAIGPTTokenizer
# from func import XLNetLMHeadModel, XLNetTokenizer
# from func import TransfoXLLMHeadModel, TransfoXLTokenizer
# from func import BertForLatentConnector, BertTokenizer

# from collections import defaultdict
# import pdb
# from modules.utils import rollout_test

# MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop

# ALL_MODELS = sum((tuple(conf.pretrained_config_archive_map.keys()) for conf in (GPT2Config, OpenAIGPTConfig, XLNetConfig, TransfoXLConfig)), ())

# logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
#                     datefmt = '%m/%d/%Y %H:%M:%S',
#                     level = logging.INFO)
# logger = logging.getLogger(__name__)

# MODEL_CLASSES = {
#     'gpt2': (GPT2Config, GPT2ForLatentConnector, GPT2Tokenizer),
#     'bert': (BertConfig, BertForLatentConnector, BertTokenizer),
#     'gpt2v': (GPT2Config, GPT2ForLatentConnectorValueHead, GPT2Tokenizer)
# }

# if __name__ == '__main__':
#     parser = argparse.ArgumentParser()
#     parser.add_argument('--seed', type=int, default=0)
#     parser.add_argument('--new_sent', type=int, default=1, help="Number of sentences to generate")
#     parser.add_argument('--n_layers', type=int, default=20, help="Number of layers of generator")
#     parser.add_argument('--block_dim', type=int, default=100)
#     parser.add_argument('--interval', type=int, default=10)
#     parser.add_argument('--cuda', type=bool, default=torch.cuda.is_available())
#     parser.add_argument('--generator_dir', default=None, type=str, required=True, help="Directory of GAN model checkpoint")
#     parser.add_argument("--checkpoint_dir", default=None, type=str, required=True,
#                         help="The directory where checkpoints are saved.")
#     parser.add_argument("--output_dir", default=None, type=str, required=True,
#                         help="The output directory where the model predictions and checkpoints will be written.")
#     parser.add_argument("--save", default=False, type=bool, help="Save results to file.")
#     parser.add_argument("--latent_size", default=32, type=int, help="Latent space dimension.")
#     parser.add_argument("--output_name", default="results", type=str, help="File name of output")
#     parser.add_argument("--batch_size", default=100, type=int, help="Batch size to generate outputs")
#     ## Encoder options
#     parser.add_argument("--encoder_model_type", default="bert", type=str,
#                         help="The encoder model architecture to be fine-tuned.")
#     parser.add_argument("--encoder_model_name_or_path", default="bert-base-cased", type=str,
#                         help="The encoder model checkpoint for weights initialization.")
#     parser.add_argument("--encoder_config_name", default="", type=str,
#                         help="Optional pretrained config name or path if not the same as model_name_or_path")
#     parser.add_argument("--encoder_tokenizer_name", default="", type=str,
#                         help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
#     ## Decoder options
#     parser.add_argument("--decoder_model_type", default="gpt2", type=str,
#                         help="The decoder model architecture to be fine-tuned.")
#     parser.add_argument("--decoder_model_name_or_path", default="gpt2", type=str,
#                         help="The decoder model checkpoint for weights initialization.")
#     parser.add_argument("--decoder_config_name", default="", type=str,
#                         help="Optional pretrained config name or path if not the same as model_name_or_path")
#     parser.add_argument("--decoder_tokenizer_name", default="", type=str,
#                         help="Optional pretrained tokenizer name or path if not the same as model_name_or_path")
#     parser.add_argument("--max_seq_length", default=512, type=int,
#                         help="Optional input sequence length before tokenization. The sequence will be dropped if it is longer the max_seq_length")
#     parser.add_argument("--finetune_decoder", default=False, type=bool,
#                         help="Uses finetuned decoder in output dir if true.")

#     ## Variational auto-encoder(check this)
#     parser.add_argument("--top_k", type=int, default=0)
#     parser.add_argument("--top_p", type=float, default=1.0)
#     parser.add_argument("--prompt", type=str, default="")
#     parser.add_argument("--padding_text", type=str, default="")
#     parser.add_argument("--length", type=int, default=20)
#     parser.add_argument("--block_size", default=-1, type=int,
#                         help="Optional input sequence length after tokenization."
#                              "The training dataset will be truncated in block of this size for training."
#                              "Default to the model max input length for single sentence inputs (take into account special tokens).")
#     parser.add_argument("--do_lower_case", action='store_true',
#                         help="Set this flag if you are using an uncased model.")
#     parser.add_argument("--use_philly", action='store_true',
#                         help="Use Philly for computing.")
#     parser.add_argument('--gloabl_step_eval', type=int, default=508523,
#                         help="Evaluate the results at the given global step")

#     # Load a trained Encoder model and vocabulary that you have fine-tuned
#     args = parser.parse_args("--checkpoint_dir=output_dir_yahoo_768_0 \
#     --output_dir=output_dir_yahoo_768_0 \
#     --generator_dir=output_dir_yahoo_768_0 \
#     --block_size 100 \
#     --max_seq_length 60 \
#     --gloabl_step_eval 24000 \
#     --latent_size 32 \
#     --block_dim 100 \
#     --new_sent 100 \
#     --n_layers 10 \
#     --top_p 0.9 \
#     --output_name=results \
#     --save True".split())
#     global_step = args.gloabl_step_eval

#     np.random.seed(args.seed)
#     torch.manual_seed(args.seed)
#     torch.backends.cudnn.deterministic = True
#     args.device = torch.device("cuda" if args.cuda else "cpu")
#     args.n_gpu = torch.cuda.device_count()
#     if args.n_gpu > 0:
#         torch.cuda.manual_seed_all(args.seed)       
    
#     args.encoder_model_type = args.encoder_model_type.lower()
#     args.decoder_model_type = args.decoder_model_type.lower()

#     output_encoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-encoder-{}'.format(global_step))
#     output_decoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-decoder-{}'.format(global_step))
#     if not args.finetune_decoder:
#         output_decoder_dir = os.path.join(args.checkpoint_dir, 'checkpoint-decoder-{}'.format(global_step))
#     else:
#          output_decoder_dir = os.path.join(args.output_dir, 'checkpoint-decoder-{}'.format(global_step))
#     checkpoints = [ [output_encoder_dir, output_decoder_dir] ]

#     # Load a trained Encoder model and vocabulary that you have fine-tuned
#     encoder_config_class, encoder_model_class, encoder_tokenizer_class = MODEL_CLASSES[args.encoder_model_type]
#     model_encoder = encoder_model_class.from_pretrained(output_encoder_dir, latent_size=args.latent_size)
#     tokenizer_encoder = encoder_tokenizer_class.from_pretrained(args.encoder_tokenizer_name if args.encoder_tokenizer_name else args.encoder_model_name_or_path, do_lower_case=args.do_lower_case)

#     model_encoder.to(args.device)
#     if args.block_size <= 0:
#         args.block_size = tokenizer_encoder.max_len_single_sentence  # Our input block size will be the max possible for the model
#     args.block_size = min(args.block_size, tokenizer_encoder.max_len_single_sentence)

#     # Load a trained Decoder model and vocabulary that you have fine-tuned
#     if not args.finetune_decoder:
#         decoder_config_class, decoder_model_class, decoder_tokenizer_class = MODEL_CLASSES[args.decoder_model_type]
#     else:
#         decoder_config_class, decoder_model_class, decoder_tokenizer_class = MODEL_CLASSES["gpt2v"]
#     model_decoder = decoder_model_class.from_pretrained(output_decoder_dir, latent_size=args.latent_size)
#     tokenizer_decoder = decoder_tokenizer_class.from_pretrained(args.decoder_tokenizer_name if args.decoder_tokenizer_name else args.decoder_model_name_or_path, do_lower_case=args.do_lower_case)
#     model_decoder.to(args.device)
#     if args.block_size <= 0:
#         args.block_size = tokenizer_decoder.max_len_single_sentence  # Our input block size will be the max possible for the model
#     args.block_size = min(args.block_size, tokenizer_decoder.max_len_single_sentence)

#     # Chunyuan: Add Padding token to GPT2
#     special_tokens_dict = {'pad_token': '<PAD>', 'bos_token': '<BOS>', 'eos_token': '<EOS>'}
#     num_added_toks = tokenizer_decoder.add_special_tokens(special_tokens_dict)
#     logger.info('We have added {} tokens to GPT2'.format(num_added_toks))
#     model_decoder.resize_token_embeddings(len(tokenizer_decoder))  # Notice: resize_token_embeddings expect to receive the full size of the new vocabulary, i.e. the length of the tokenizer.
#     assert tokenizer_decoder.pad_token == '<PAD>'
    
#     generator = Generator(args.n_layers, args.block_dim, args.latent_size)

#     if args.cuda:
#         generator = generator.cuda()

#     generator.load_state_dict(torch.load(args.generator_dir+'/generator_'+str(args.gloabl_step_eval)+'.th'))
#     generator.eval()
#     model_decoder.eval()
#     model_encoder.eval()
#     if args.save:
#         if not os.path.exists(args.output_dir+"/{}.txt".format(args.output_name)):
#             with open(args.output_dir+"/{}.txt".format(args.output_name), 'w'): 
#                 pass

#     for i in range(int(args.new_sent/args.batch_size)):
#         # sample noise
#         noise = torch.Tensor(np.random.normal(0, 1, (args.batch_size, args.latent_size))).to(args.device)
#         new_z = generator(noise).data

#         # create new sent
#         sents = rollout_test(model_decoder, new_z, tokenizer_decoder, args.max_seq_length, args.batch_size, args.top_k, args.top_p)

#         if args.save:
#             with open(args.output_dir+"/{}.txt".format(args.output_name), 'a') as file:
#                 for i in sents:
#                     file.write(i+"\n")
#         else:
#             for i in sents:
#                 logger.info(i)
