In [1]:
%load_ext autoreload
%autoreload 2

from __future__ import print_function
from math import ceil
import numpy as np
import sys
import pdb

import torch
import torch.optim as optim
import torch.nn as nn

import generator
import discriminator
import oracle
import helpers
from jak_helpers import *

In [2]:
CUDA = False

REAL_DATA_PATH = './JAK2New.txt'
ORACLE_DATA_PATH = './jak.pt'

#These must be initialized
VOCAB_SIZE = None
MAX_SEQ_LEN = None
START_LETTER = None

BATCH_SIZE = 32

#Commented epochs for debugging
MLE_TRAIN_EPOCHS = 1 #100
ADV_TRAIN_EPOCHS = 50
POS_NEG_SAMPLES = 1000

GEN_EMBEDDING_DIM = 32
GEN_HIDDEN_DIM = 32
DIS_EMBEDDING_DIM = 64
DIS_HIDDEN_DIM = 64


In [3]:
#Encode the real data to int tokens, and set the key global variables from that encoding.
VOCAB_SIZE, MAX_SEQ_LEN, START_LETTER = encode_data(REAL_DATA_PATH, ORACLE_DATA_PATH)

'''
TODO: Oracle is currently a seeded random generator. We need to turn it into something that 
generates real data in a token-wise manner.
'''
#oracle = generator.Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA)
oracle_obj = oracle.Oracle(VOCAB_SIZE, MAX_SEQ_LEN, './jak.pt', )

'''
This is the 'seed' part of the oracle. We likely won't need it.
'''
#oracle.load_state_dict(torch.load(oracle_state_dict_path))

'''
Oracle samples should be the real data - i.e., the JAK data.
'''
#oracle_samples = torch.load(oracle_samples_path).type(torch.LongTensor)
'''
Look into using the following:
'''
# samples for the new oracle can be generated using helpers.batchwise_sample()


'\nLook into using the following:\n'

In [4]:
from generator_attention import Generator_attention as ga

In [5]:
#(VOCAB_SIZE, g_emb_dim, g_hidden_dim, g_sequence_len, BATCH_SIZE, opt.cuda, POSITIVE_FILE)
#num_emb, emb_dim, hidden_dim, seq_len, batch_size, use_cuda, real_data_path, test_mode = False):
g = ga(VOCAB_SIZE, GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, MAX_SEQ_LEN, BATCH_SIZE, True, ORACLE_DATA_PATH)

100%|██████████| 12428/12428 [00:03<00:00, 3264.07it/s]


In [6]:
print((VOCAB_SIZE, GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, MAX_SEQ_LEN, BATCH_SIZE, True, ORACLE_DATA_PATH))

(37, 32, 32, 99, 32, True, './jak.pt')


In [7]:
'''
Sampling looks like it's very inefficient/slow. 
We'd want to speed that up, but first things first - plug the outputs into 
the discriminator.
'''

"\nSampling looks like it's very inefficient/slow. \nWe'd want to speed that up, but first things first - plug the outputs into \nthe discriminator.\n"

In [10]:
res = g.sample(3)

  1%|          | 1/99 [00:00<00:15,  6.27it/s]

Sampling output - Seq Len: 99


 30%|███       | 30/99 [00:05<00:13,  5.16it/s]


In [None]:
'''
Premature termination due to encountering eos - end of string - token in generator.
'''

In [11]:
res

tensor([[11,  8, 12, 27, 18, 33, 17, 12, 11, 35,  1,  2,  2,  2,  2,  2,  2,  2,
          2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2],
        [25,  3, 13, 34, 31, 26, 28, 32, 20,  7,  6,  7, 10, 20, 14, 13,  7, 26,
         28, 17,  7,  5, 33, 35, 20, 16, 31, 15, 13,  7,  1],
        [27, 10, 31, 27, 34, 29, 12, 36, 32, 25, 24, 23,  8, 36, 23,  6, 21,  1,
          2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2]])

In [12]:
'''
Transformer sample output isn't padded, so we have to retroactively pad it out to 
MAX_SEQ_LEN.

Pad with 25.
'''

"\nTransformer sample output isn't padded, so we have to retroactively pad it out to \nMAX_SEQ_LEN.\n\nPad with 25.\n"

In [13]:
padding = np.zeros((res.shape[0],MAX_SEQ_LEN-res.shape[1]))
padding += 25
np.concatenate([res, padding], axis=1)

array([[11.,  8., 12., 27., 18., 33., 17., 12., 11., 35.,  1.,  2.,  2.,
         2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,  2.,
         2.,  2.,  2.,  2.,  2., 25., 25., 25., 25., 25., 25., 25., 25.,
        25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25.,
        25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25.,
        25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25.,
        25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25.,
        25., 25., 25., 25., 25., 25., 25., 25.],
       [25.,  3., 13., 34., 31., 26., 28., 32., 20.,  7.,  6.,  7., 10.,
        20., 14., 13.,  7., 26., 28., 17.,  7.,  5., 33., 35., 20., 16.,
        31., 15., 13.,  7.,  1., 25., 25., 25., 25., 25., 25., 25., 25.,
        25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25.,
        25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25.,
        25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25., 25.,
  

In [11]:
from core.data_iter import GenDataIter

data_loader = GenDataIter(ORACLE_DATA_PATH, 1)

100%|██████████| 12428/12428 [00:04<00:00, 3095.51it/s]


In [17]:
data_loader.next()[0].shape

torch.Size([1, 99])

In [12]:

'''
Gen and Dis are both randomly initialized.
'''
gen = generator.Generator(GEN_EMBEDDING_DIM, GEN_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA)
dis = discriminator.Discriminator(DIS_EMBEDDING_DIM, DIS_HIDDEN_DIM, VOCAB_SIZE, MAX_SEQ_LEN, gpu=CUDA)

In [13]:
gen.sample(50)

tensor([[10, 23,  5,  ..., 21, 28, 32],
        [21,  4,  4,  ...,  4, 34,  8],
        [28, 28, 11,  ..., 26, 12, 17],
        ...,
        [ 8,  2, 17,  ..., 31, 14, 17],
        [34, 32,  7,  ..., 16, 26,  1],
        [ 3, 34, 19,  ..., 12,  0, 36]])

In [5]:
oracle_obj.sample(50)

tensor([[22, 16, 26,  ..., 25, 25,  5],
        [17, 30,  7,  ..., 25, 25,  5],
        [16, 22, 30,  ..., 25, 25,  5],
        ...,
        [16, 16,  1,  ..., 25, 25,  5],
        [16, 21, 30,  ..., 25, 25,  5],
        [16, 33,  7,  ..., 25, 25,  5]])

In [6]:
s = helpers.batchwise_sample(gen, POS_NEG_SAMPLES, BATCH_SIZE)
r = helpers.batchwise_sample(oracle_obj, POS_NEG_SAMPLES, BATCH_SIZE)

In [7]:
gen.sample(1)

tensor([[16,  7,  3,  6, 33, 11, 17, 32, 12, 10, 12, 17, 27, 10, 27, 21,  4, 24,
         22, 21, 27, 23, 28, 14, 23, 22, 14, 28, 18, 17, 21,  1, 20, 23,  9, 21,
          2, 14,  8, 15, 18,  8, 30, 19, 33, 31, 22, 13, 15,  2, 27, 31, 15, 35,
         13, 29,  8, 11, 10, 27,  2,  3,  8, 10, 28, 36, 36, 35, 24, 12, 19, 35,
         16, 29, 33,  8, 19,  1, 12, 21,  2, 19, 27,  0, 11,  2, 17,  4, 26,  7,
         15, 28, 32,  3,  0, 10,  3,  5, 17]])

In [14]:
!pip install git+https://github.com/phohenecker/pytorch-transformer

Collecting git+https://github.com/phohenecker/pytorch-transformer
  Cloning https://github.com/phohenecker/pytorch-transformer to /tmp/pip-req-build-4kmc2gp_
Collecting insanity>=2017.1 (from transformer==2018.1)
  Downloading https://files.pythonhosted.org/packages/9a/64/ab5956d8360ed58e0c2f3bcf3a4c53b511dcc22ff84cec26cff559fe811c/insanity-2017.1.tar.gz
Building wheels for collected packages: transformer, insanity
  Running setup.py bdist_wheel for transformer ... [?25ldone
[?25h  Stored in directory: /tmp/pip-ephem-wheel-cache-9jmf2zat/wheels/88/ba/b9/970f87f50d549a7fd30b03b105af0d79bf210f2dc3ead0789b
  Running setup.py bdist_wheel for insanity ... [?25ldone
[?25h  Stored in directory: /home/ec2-user/.cache/pip/wheels/83/cc/18/590868a1a960f5421ec4846bfe93b228ea5e3f54c0c129b303
Successfully built transformer insanity
[31mfastai 1.0.55 requires nvidia-ml-py3, which is not installed.[0m
[31mthinc 6.12.1 has requirement msgpack<0.6.0,>=0.5.6, but you'll have msgpack 0.6.0 which is