In [None]:
!pip install imageio git+https://github.com/tensorflow/docs XlsxWriter tensorflow_addons &> /dev/null

In [None]:
# Mount Google Drive at /content/drive (Colab-only module). Later cells read
# and write paths under drive/MyDrive/ganstick_project/.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Authenticate the Colab user; presumably required so the gsutil commands
# further down can read the ganstick_project bucket -- TODO confirm.
from google.colab import auth
auth.authenticate_user()

In [None]:
# !echo "deb http://packages.cloud.google.com/apt gcsfuse-bionic main" > /etc/apt/sources.list.d/gcsfuse.list
# !curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | apt-key add -
# !apt -qq update
# !apt -qq install gcsfuse

In [None]:
# !gcloud auth application-default login

In [None]:
# !gcloud auth login

In [None]:
!mkdir historical
!mkdir future
!gsutil -m cp gs://ganstick_project/historical/*.png historical/
!gsutil -m cp gs://ganstick_project/future/*.png future/

In [None]:
# !gcsfuse --implicit-dirs ganstick_project gcloudbucket

In [None]:
import tensorflow as tf
import glob
import imageio
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
import PIL

import tensorflow.keras as keras
import tensorflow_addons as tfa
from tensorflow_addons.layers import SpectralNormalization as SpectralNorm
from tensorflow.keras import layers

from keras.preprocessing.image import load_img
from keras_preprocessing.image import ImageDataGenerator

from keras import backend as K
from keras.layers import InputSpec, Layer
from keras import initializers, regularizers, constraints

import time

from IPython import display
import tensorflow_docs.vis.embed as embed
import xlsxwriter

from sklearn.preprocessing import StandardScaler

In [None]:
# Show every DataFrame column when displaying frames.
pd.set_option('display.max_columns', None)
# Print NumPy floats in fixed-point rather than scientific notation.
np.set_printoptions(suppress=True)

In [None]:
tickers = [
  'aapl',
  'mcd',
  'pld',
  'bac',
  'cvx',
  'ibm',
  'v',
  'pg',
  'nke',
  'abbv',
  'mmm',
  'rio',
  'cci',
  'ip',
  'gs',
  'hon',
  'msft',
  'amt',
  'spg',
  'jpm',
  'amzn',
  'unh',
  'wmt',
  'jnj',
  'vz',
  'bhp',
  'nee',
  'etr',
  'xel',
  'pfe',
  'xom',
  'lmt',
  'duk',
  'googl',
  'viac',
  'intc',
  'ko',
  ]

# Number of leading rows dropped per ticker, to get rid of most of the
# all-green candle images. 2500 rows is roughly 10 years if each row is one
# trading day -- NOTE(review): an older comment said "1000 days (4 years)",
# confirm which cut-off is intended.
HIST_ROWS_TO_SKIP = 2500

future_vols = []
hist_vols = []
for ticker in tickers:
  # Explicit path (no glob): a missing file now raises FileNotFoundError naming
  # the path instead of an IndexError from indexing an empty glob result.
  fname = "drive/MyDrive/ganstick_project/historical_volatility/%s/%s_hist_vol.csv" % (ticker, ticker)
  df = pd.read_csv(fname)

  # 'id' holds the relative path of the image belonging to each row; the image
  # data generator is later pointed at this column to locate the files.
  df['id'] = ['historical/%s_%d.png' % (ticker, row) for row in range(len(df))]

  df = df.iloc[HIST_ROWS_TO_SKIP:]
  hist_vols.append(df)

# One frame with the rows of every ticker stacked on top of each other.
all_hist_vols = pd.concat(hist_vols)

In [None]:
# Images per batch yielded by the data generator. Original note: a batch size
# of 64 trains each batch fast but may be too slow overall, so 256 is used.
BATCH_SIZE = 256

def process(image):
  '''
  Rescale pixel values linearly so that 0 maps to -1 and 255 maps to 1.

  Used as the ImageDataGenerator `preprocessing_function`, which is expected
  to take one image as a NumPy array and return a NumPy array of the same
  shape. The computation is therefore done in NumPy rather than returning a
  TensorFlow tensor for every image.

  @param image: array-like of pixel values
  @return np.ndarray of dtype float32 with the same shape as `image`
  '''
  return (np.asarray(image, dtype=np.float32) - 127.5) / 127.5

# Generator that applies `process` to every loaded image.
image_gen = ImageDataGenerator(
    preprocessing_function=process
)

# class_mode == raw --> pass in multiple columns to y_col to add
# directory=None: the 'id' column is built above as a relative file path
# ('historical/<ticker>_<n>.png'), so no base directory is supplied.
# Each batch is consumed in train() as an (images, y_col values) pair.
hist_gen = image_gen.flow_from_dataframe(
    dataframe=all_hist_vols,
    directory=None,
    x_col='id',
    y_col=['avg_vol', 'first_date', 'last_date', 'avg_volume', 'ticker'],
    target_size=(56,56),
    batch_size=BATCH_SIZE,
    shuffle=True,
    class_mode='raw',
)

Found 283820 validated image filenames.


In [None]:
def Attention(input_shape, k):
  '''
  Build a self-attention block as a standalone Keras model.

  @param input_shape: shape of the incoming feature map with the batch
    dimension first, i.e. (batch, height, width, channels)
  @param int k: proportion by which we subsample the input channels for the
    two projections (f and g) that form the attention map
  @return keras.Model whose output is Scalar()(attention output) + input
  '''
  # first dim is batch size
  height, width, n_channels = input_shape[1:]
  feature_map = layers.Input(shape=input_shape[1:])

  def project(filters):
    # 1x1 convolution, then flatten the spatial grid to (positions, filters)
    projected = layers.Conv2D(filters, kernel_size=(1, 1), strides=(1, 1), padding='same', use_bias=True)(feature_map)
    return layers.Reshape((-1, projected.shape[-1]))(projected)

  # layers are created in the order f, g, h
  f = project(n_channels // k)
  g = project(n_channels // k)
  h = project(n_channels // 2)

  # pairwise scores between spatial positions, normalised with a softmax
  scores = tf.matmul(g, f, transpose_b=True)
  weights = keras.layers.Softmax()(scores)

  # weighted sum of the h projection, folded back into an image-shaped tensor
  attended = tf.matmul(weights, h)
  attended = layers.Reshape((height, width, n_channels // 2))(attended)
  attended = layers.Conv2D(n_channels, kernel_size=(1, 1), strides=(1, 1), padding='same', use_bias=True)(attended)
  attended = Scalar()(attended)

  # residual connection back to the block's input
  return keras.Model(inputs=feature_map, outputs=attended + feature_map)


class Scalar(layers.Layer):
  '''
  Multiplies its input by a single trainable scalar `gamma`, initialised to 0.

  The weight is created with `add_weight`, so Keras tracks it without touching
  the private `_trainable_weights` list, and the scaling is a plain multiply
  instead of building a new `Rescaling` layer on every call. The weight is
  still stored on the `gamma` attribute.
  '''
  def __init__(self, **kwargs):
    # forward standard Layer arguments (name, dtype, trainable, ...)
    super(Scalar, self).__init__(**kwargs)

  def build(self, input_shape):
    # shape (1,) and zero initial value, as before
    self.gamma = self.add_weight(name='gamma', shape=(1,), initializer='zeros', trainable=True)
    super(Scalar, self).build(input_shape)

  def call(self, inputs):
    # broadcast the scalar over the whole input tensor
    return inputs * self.gamma


In [None]:
def make_generator_model():
  '''
  Build the generator network.

  Maps a 100-dimensional noise vector to a 56x56x3 image: a dense projection
  to 7x7x256 followed by three stride-2 transposed convolutions
  (7 -> 14 -> 28 -> 56), with a self-attention block after the first two.
  The last layer uses tanh.

  @return keras.Model taking noise of shape (batch, 100)
  '''
  latent = layers.Input(shape=(100,))

  # project the noise and fold it into a 7x7 grid with 256 channels
  # (original note: not sure if we need activation or not)
  x = layers.Dense(256 * 7 * 7)(latent)
  x = layers.Reshape((7, 7, 256))(x)
  x = layers.BatchNormalization()(x)
  x = layers.LeakyReLU()(x)

  # 7x7 -> 14x14, then self attention
  x = SpectralNorm(layers.Conv2DTranspose(64, (3, 3), strides=(2, 2), padding='same', use_bias=False))(x)
  x = layers.LeakyReLU()(x)
  x = Attention(x.shape, k=8)(x)

  # 14x14 -> 28x28, then self attention
  x = layers.BatchNormalization()(x)
  x = SpectralNorm(layers.Conv2DTranspose(32, (3, 3), strides=(2, 2), padding='same', use_bias=False))(x)
  x = layers.LeakyReLU()(x)
  x = Attention(x.shape, k=4)(x)

  # 28x28 -> 56x56 with 3 output channels
  x = layers.BatchNormalization()(x)
  x = SpectralNorm(layers.Conv2DTranspose(3, (3, 3), strides=(2, 2), padding='same', use_bias=False, activation=keras.activations.tanh))(x)

  return keras.Model(inputs=latent, outputs=x)

def make_discriminator_model():
  '''
  Build the discriminator network.

  Takes a 56x56x3 image, applies two spectrally-normalised stride-2
  convolutions (each followed by LeakyReLU and a self-attention block),
  flattens the result and outputs a single sigmoid value.

  @return keras.Model taking images of shape (batch, 56, 56, 3)
  '''
  picture = layers.Input(shape=(56, 56, 3))

  # 56x56 -> 28x28, 32 channels, then self attention
  x = SpectralNorm(layers.Conv2D(32, (3, 3), strides=(2, 2), padding='same'))(picture)
  x = layers.LeakyReLU()(x)
  x = Attention(x.shape, k=4)(x)

  # 28x28 -> 14x14, 64 channels, then self attention
  x = SpectralNorm(layers.Conv2D(64, (3, 3), strides=(2, 2), padding='same'))(x)
  x = layers.LeakyReLU()(x)
  x = Attention(x.shape, k=8)(x)

  # single sigmoid output per image
  flat = layers.Flatten()(x)
  verdict = layers.Dense(1, activation=keras.activations.sigmoid)(flat)
  return keras.Model(inputs=picture, outputs=verdict)

# Module-level model instances; train_step and the checkpoint cell refer to
# these two names directly.
generator = make_generator_model()
discriminator = make_discriminator_model()

In [None]:
# Print the layer-by-layer summary of both models.
generator.summary()
discriminator.summary()

In [None]:

# Shared binary cross-entropy object used by both loss functions below. It is
# created with default arguments; the discriminator ends in a sigmoid.
cross_entropy = tf.keras.losses.BinaryCrossentropy()

# TTUR (discriminator 4x learning rate of generator)
generator_optimizer = tf.keras.optimizers.Adam(0.0001, beta_1=0.0, beta_2=0.9)
discriminator_optimizer = tf.keras.optimizers.Adam(0.0004, beta_1=0.0, beta_2=0.9)

def generator_loss(fake_output):
  '''
  Cross-entropy between the discriminator's predictions on generated images
  and an all-ones target of the same shape.
  '''
  target = tf.ones_like(fake_output)
  return cross_entropy(target, fake_output)

def discriminator_loss(real_output, fake_output):
  '''
  Sum of two cross-entropy terms: predictions on real images against an
  all-ones target, and predictions on generated images against all zeros.
  '''
  loss_on_real = cross_entropy(tf.ones_like(real_output), real_output)
  loss_on_fake = cross_entropy(tf.zeros_like(fake_output), fake_output)
  return loss_on_real + loss_on_fake

# decode for ticker string
def recover_string_from_int(x):
  '''
  Turn an integer back into the string it encodes: the integer is written out
  as the minimum number of little-endian bytes, which are decoded as UTF-8.
  '''
  # ceil(bit_length / 8) bytes are needed to hold x
  n_bytes = -(-x.bit_length() // 8)
  return x.to_bytes(n_bytes, byteorder='little').decode('utf-8')


In [None]:
# this cell for saving directly to gdrive but setting experimental options

# for reading from either gdrive or local? or only local not sure
local_device_option = tf.train.CheckpointOptions(experimental_io_device="/job:localhost")

# Directory on the mounted Drive. train() passes this same string (trailing
# slash included) to checkpoint.save() as the file prefix.
checkpoint_dir = 'drive/MyDrive/ganstick_project/checkpoints/finalgan/'
# Track both models and both optimizers in one checkpoint object.
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                  discriminator_optimizer=discriminator_optimizer,
                  generator=generator,
                  discriminator=discriminator)

# Restore from the most recent checkpoint found in checkpoint_dir.
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir), options=local_device_option)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7ff6f6aca550>

In [None]:
# # this cell for loading/saving checkpoints locally in VM then copying to gdrive

# !gsutil cp -r drive/MyDrive/ganstick_project/checkpoints/finalgan/checkpoints .

# checkpoint_dir = 'checkpoints'
# checkpoint_dir = os.path.join(checkpoint_dir, 'ckpt')

# checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
#                   discriminator_optimizer=discriminator_optimizer,
#                   generator=generator,
#                   discriminator=discriminator)

# checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))

# # saving: 
# # !gsutil cp -r checkpoints/ drive/MyDrive/ganstick_project/checkpoints/finalgan/


In [None]:
# Training process
@tf.function
def train_step(img_batch):
  '''
  Run one generator update and one discriminator update on a batch of real
  images, using the module-level models, optimizers and `noise_dim`.

  img_batch.shape = [batch_size, height, width, channels]

  Returns (gen_loss, disc_loss, generated_image, pred_on_fake).
  '''
  # one noise vector per real image in the batch
  latent = tf.random.normal([img_batch.shape[0], noise_dim])

  with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
    # create our fake
    fakes = generator(latent, training=True)

    # training using historical images instead
    score_real = discriminator(img_batch, training=True)
    score_fake = discriminator(fakes, training=True)

    g_loss = generator_loss(score_fake)
    d_loss = discriminator_loss(score_real, score_fake)

  # generator update first, then discriminator
  g_grads = gen_tape.gradient(g_loss, generator.trainable_variables)
  generator_optimizer.apply_gradients(zip(g_grads, generator.trainable_variables))

  d_grads = disc_tape.gradient(d_loss, discriminator.trainable_variables)
  discriminator_optimizer.apply_gradients(zip(d_grads, discriminator.trainable_variables))

  return g_loss, d_loss, fakes, score_fake


def train(images_dataset, epochs, last_epoch):
  '''
  Train for `epochs` more epochs, numbering them from last_epoch + 1.

  Every 10th batch one random sample is recorded (losses, discriminator
  prediction, real image, label row, generated image). After each epoch a
  checkpoint is saved and the recorded samples are passed to save_result.

  @param images_dataset: iterable of (image batch, label batch) pairs that
    also supports len()
  @param int epochs: number of epochs to run in this call
  @param int last_epoch: index of the last epoch already completed
  '''
  for epoch in range(epochs):
    start = time.time()
    recent_epoch = last_epoch + epoch + 1

    # samples collected during this epoch
    data, real_imgs, time_series_data, generated_images = [], [], [], []

    print('Start training for epoch {}'.format(recent_epoch))
    for batches_finished, (img_batch, time_series) in enumerate(images_dataset, start=1):
      gen_loss, disc_loss, generated_image, pred_on_fake = train_step(img_batch)

      # pick random index to sample generated image from batch (as well as associated real img)
      r = np.random.randint(img_batch.shape[0])

      if batches_finished % 10 == 0:
        data.append((gen_loss.numpy(), disc_loss.numpy(), pred_on_fake.numpy()[r]))
        real_imgs.append(img_batch[r])
        time_series_data.append(time_series[r])
        generated_images.append(generated_image[r])
        print('Batch {} training finished'.format(batches_finished))

      # need to set manual loop break in current keras version (???)
      if batches_finished >= len(images_dataset):
        break

    checkpoint.save(checkpoint_dir, options=local_device_option)

    save_result(data, real_imgs, generated_images, time_series_data, recent_epoch)
    display.clear_output(wait=True)
    print ('Time for epoch {} is {} sec'.format(recent_epoch, time.time()-start))
    print ('generator loss:', gen_loss.numpy())
    print ('disciminator loss:', disc_loss.numpy())

# NOTE (original author): this is the correct save location for results; the
# checkpoints are reportedly kept under "finalgan_one_2500", so take care.
def save_result(data, real_imgs, generated_images, time_series_data, epoch_num):
  '''
  Write one epoch's recorded samples to Google Drive.

  Creates finalgan_results/epoch<NNN>.xlsx (one row per recorded sample) and,
  under finalgan_results/epoch<NNN>/, the sub-folders real/, generated/ and
  timeseriesdata/ holding one PNG / PNG / CSV per sample, each named
  <ticker>_<first_date>_<last_date>.

  @param data: list of (generator loss, discriminator loss, prediction) tuples
  @param real_imgs: real images, scaled as produced by the data generator
  @param generated_images: generated images on the same scale
  @param time_series_data: per-sample label rows with 5 values ordered as
    avg_vol, first_date, last_date, avg_volume, ticker (integer-encoded)
  @param int epoch_num: epoch index used in the output names
  '''
  out_dir = f'drive/MyDrive/ganstick_project/finalgan_results/epoch{epoch_num:03}'
  for sub_dir in ('real', 'generated', 'timeseriesdata'):
    os.makedirs(f'{out_dir}/{sub_dir}', exist_ok=True)

  def save_png(img, path):
    # undo the (x - 127.5) / 127.5 scaling and write an 8-bit image
    PIL.Image.fromarray(np.uint8(img * 127.5 + 127.5)).save(path)

  # The context manager closes (and thereby writes) the workbook even if one
  # of the samples below raises, so rows written so far are not lost.
  with xlsxwriter.Workbook(f'{out_dir}.xlsx') as wb:
    ws = wb.add_worksheet()
    ws.write_row(0, 0, ('Batch Index', 'Generator Loss', 'Discriminator Loss', 'Generated Image Prediction'))

    samples = zip(data, real_imgs, generated_images, time_series_data)
    # batch_num starts at 1 because row 0 holds the header
    for batch_num, (result, real_img, gen_img, timedata) in enumerate(samples, start=1):
      ws.write_row(batch_num, 0, (batch_num, result[0], result[1], result[2]))

      timedata = timedata.reshape((1,5))
      # kinda janky, to get ticker integer encoding
      ticker = recover_string_from_int(int(timedata[0][-1]))
      first_date = str(timedata[0][1])
      last_date = str(timedata[0][2])
      stem = f'{ticker}_{first_date}_{last_date}'

      # save ticker name as csv name, then timeseries in csv
      df = pd.DataFrame(timedata, columns=['avg_volatility', 'first_date', 'last_date', 'avg_volume', 'ticker'])
      df.to_csv(f'{out_dir}/timeseriesdata/{stem}.csv', index=False)

      save_png(real_img, f'{out_dir}/real/{stem}.png')
      save_png(gen_img, f'{out_dir}/generated/{stem}.png')

In [None]:
# Length of the noise vector sampled in train_step. train_step reads this
# global, so this cell must run before training starts; the generator's input
# layer is separately hard-coded to 100, so the two values must stay equal.
noise_dim = 100

# more is how many more epochs to run
# check sasgan_results dir for prev epoch index
# saving every epoch
# NOTE(review): save_result in this notebook writes to finalgan_results, so
# "sasgan_results" above may be a stale directory name -- confirm.

MORE_EPOCH = 100
FINISHED_EPOCH = 74

# Epochs are numbered from FINISHED_EPOCH + 1 onwards.
train(hist_gen, MORE_EPOCH, FINISHED_EPOCH)