# Homework 10 - IANNwTF - January 26, 2023

### Preparation

In [65]:
# %cd drive/MyDrive/Github/IANNWTF_hw/HW_11
%ls

[0m[01;34mdata[0m/  GPT.ipynb  [01;34mlogs[0m/  tokenizer_model.model  tokenizer_model.vocab


In [66]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [67]:
!pip install tensorflow_text
!pip install sentencePiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [68]:
import re
import tensorflow as tf
import numpy as np
import math
import tensorflow_text as tf_text
import tqdm.notebook as note
import scipy

import io
import tqdm 
import datetime 
import pathlib
import sentencepiece as sp

In [69]:
gpus = tf.config.list_physical_devices('GPU')
if gpus :
  #Restrict TensorFlow to only allocate 1GB of memory on the first GPU
  try :
    tf.config.set_logical_device_configuration(
        gpus[0],
        [tf.config.LogicalDeviceConfiguration(memory_limit=1024)])
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e :
    #Virtual devices must be set before GPUs have been initialized
    print(e)

### The Dataset

In [70]:
path = tf.keras.utils.get_file('trumps.txt', origin='https://github.com/ryanmcdermott/trump-speeches/blob/master/speeches.txt')

#read trumps text
trumps = open("data/trumps.txt", "r")
data = trumps.read()
print(data[0:500])

SPEECH 1


...Thank you so much.  That's so nice.  Isn't he a great guy.  He doesn't get a fair press; he doesn't get it.  It's just not fair.  And I have to tell you I'm here, and very strongly here, because I have great respect for Steve King and have great respect likewise for Citizens United, David and everybody, and tremendous resect for the Tea Party.  Also, also the people of Iowa.  They have something in common.  Hard-working people.  They want to work, they want to make the country grea


### 2.1 The dataset, preprocessing & tokenization

In [71]:
#lower characters and no special chars or digits, line breake to space
data = data.replace("\n", " ")
data = data.lower()
data = re.sub(r"[^a-z ]", "", data)
#create a smaller sample to see if everything work
smaller_data = data[:100000]
#create date set
ds = ' '.join(smaller_data.split())
print(ds)

speech thank you so much thats so nice isnt he a great guy he doesnt get a fair press he doesnt get it its just not fair and i have to tell you im here and very strongly here because i have great respect for steve king and have great respect likewise for citizens united david and everybody and tremendous resect for the tea party also also the people of iowa they have something in common hardworking people they want to work they want to make the country great i love the people of iowa so thats the way it is very simple with that said our country is really headed in the wrong direction with a president who is doing an absolutely terrible job the world is collapsing around us and many of the problems weve caused our president is either grossly incompetent a word that more and more people are using and i think i was the first to use it or he has a completely different agenda than you want to know about which could be possible in any event washington is broken and our country is in serious 

In [73]:
tokens = set(list(smaller_data.split()))
vocab_size = len(tokens)
vocab_size

2309

In [74]:
sp.SentencePieceTrainer.train(
  input=path, model_prefix='tokenizer_model', model_type="unigram", vocab_size=vocab_size)

# deserialize the trained model file to load it in the correct format
trained_tokenizer_model = tf.io.gfile.GFile('tokenizer_model.model', "rb").read()

# load the model as a tokenizer that can be used inside a tensorflow model
tokenizer = tf_text.SentencepieceTokenizer(
  model=trained_tokenizer_model, out_type=tf.int32, nbest_size=-1, alpha=1, reverse=False,
  add_bos=False, add_eos=False, return_nbest=False, name=None)

In [75]:
m = 128

tokens = tokenizer.tokenize(ds)
width_tokens = tf_text.sliding_window(tokens, m+1)

text_dataset = tf.data.Dataset.from_tensor_slices(width_tokens)
text_dataset;

In [76]:
def prepare(ds) :   
  ds = ds.map(lambda data: (data[:m], data[-1]))
  # cache 
  ds = ds.cache()
  # shuffle, batch, prefetch our dataset
  ds = ds.shuffle(5000)
  ds = ds.batch(256)
  ds = ds.prefetch(1024)
  # split dataset into data and target   
  return ds

text_dataset = prepare(text_dataset)
text_dataset;

### 2.2 The model components

#### 2.2.1 The Embedding

In [83]:
class Encoding_layer(tf.keras.models.Model) :
  def __init__(self) :
    super(Encoding_layer, self).__init__()
    self.embedding = tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=m)

  def call(self, x) :
    indices = tf.range(0, m)
    a = self.embedding(x)
    b = self.embedding(indices)
    c = a + b
    return c

#### 2.2.2 The TransformerBlock layer

In [84]:
class TransformerBlock(tf.keras.models.Model) :
  def __init__(self) :
    super(TransformerBlock, self).__init__()
                       
    self.Attention = tf.keras.layers.MultiHeadAttention(2, m)
    self.Drop1 = tf.keras.layers.Dropout(0.1)
    self.Drop2 = tf.keras.layers.Dropout(0.1)

    self.Dense1 = tf.keras.layers.Dense(128, activation='relu')
    self.Dense2 = tf.keras.layers.Dense(m)
    
    self.LayNorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.LayNorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

  def call(self, x) :
    in_out = self.Attention(x,x)
    in_out = self.Drop1(in_out)
    in_out = self.LayNorm1(in_out)
    in_out = tf.keras.layers.Add()([in_out, x])
    in_out = self.LayNorm2(self.Drop2(self.Dense2(self.Dense1(in_out))))
    in_out = tf.keras.layers.Add()([in_out, x])

    return in_out

#### 2.2.3 The subclassed model

In [85]:
class MyModel(tf.keras.models.Model) :
  def __init__(self) :
    super(MyModel, self).__init__()
    self.optimizer = tf.keras.optimizers.Adam()
    self.loss_function = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)   
    
    self.metrics_list = [
                    tf.keras.metrics.Mean(name="loss"),
                    tf.keras.metrics.CategoricalAccuracy(name="acc")]
                    #tf.keras.metrics.TopKCategoricalAccuracy(3,name="top-3-acc")
    
    self.layer_list = [
      Encoding_layer(),
      TransformerBlock(),
      tf.keras.layers.GlobalAveragePooling1D(),
      tf.keras.layers.Dense(vocab_size)] 
    
  def call(self, x) :
    for layer in self.layer_list :
      x = layer(x)
    return x

  def reset_metrics(self) :
    for metric in self.metrics :
      metric.reset_states()
          
  @tf.function
  def train_step(self, data) :
    x, target = data
    
    with tf.GradientTape() as tape:
        predictions = self(x)
        loss = self.loss_function(target, predictions) + tf.reduce_sum(self.losses)
    gradients = tape.gradient(loss, self.trainable_variables)
    self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
    
    # update loss metric
    self.metrics[0].update_state(loss)
    # for all metrics except loss, update states (accuracy etc.)
    for metric in self.metrics[1:]:
        metric.update_state(target, predictions)
        
    # Return a dictionary mapping metric names to current value
    return {m.name: m.result() for m in self.metrics}

# initialize models 
model = MyModel()

### Training

In [86]:
# Hyperparameters
BATCH_SIZE = 256
# Training for more than 300 epochs would be required for meaningful results 
EPOCHS = 100

In [87]:
# Define saving location for log
hyperparameter_string= f"Trial:01_BATCH:{BATCH_SIZE}_EPOCH:{EPOCHS}_Adam"
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

train_log_path = f"logs/{hyperparameter_string}/{current_time}/train"

# log writer for training metrics
train_summary_writer = tf.summary.create_file_writer(train_log_path)

In [None]:
# Training Loop
def training(model, train_ds, epochs):
  model = model
  for epoch in range(epochs):
    print(f"Epoch {epoch}:")

    # Training:
    for data in train_ds:
      metrics = model.train_step(data)

    # print the metrics
    print([f"{key}: {value}" for (key, value) in zip(list(metrics.keys()), list(metrics.values()))])
    # logging the validation metrics to the log file which is used by tensorboard
    with train_summary_writer.as_default():
        for metric in model.metrics:
            tf.summary.scalar(f"{metric.name}", metric.result(), step=epoch)

    # reset all metrics (requires a reset_metrics method in the model)
    model.reset_metrics()
    print("\n")
    
# train the model 
training(model, text_dataset, EPOCHS)

Epoch 0:
['loss: 6.400969505310059', 'acc: 0.0']


Epoch 1:
['loss: 5.63647985458374', 'acc: 0.0']


Epoch 2:
['loss: 5.6031036376953125', 'acc: 0.0']


Epoch 3:
['loss: 5.59666633605957', 'acc: 0.0']


Epoch 4:
['loss: 5.588434219360352', 'acc: 0.009295232594013214']


Epoch 5:
['loss: 5.580106258392334', 'acc: 0.0']


Epoch 6:
['loss: 5.564887523651123', 'acc: 0.0']


Epoch 7:
['loss: 5.548681259155273', 'acc: 0.0']


Epoch 8:
['loss: 5.534760475158691', 'acc: 0.0']


Epoch 9:
['loss: 5.510494232177734', 'acc: 0.0']


Epoch 10:
['loss: 5.398716449737549', 'acc: 0.008968447335064411']


Epoch 11:
['loss: 5.2201104164123535', 'acc: 0.0006535710417665541']


Epoch 12:
['loss: 5.098369598388672', 'acc: 0.0018517846474424005']


Epoch 13:
['loss: 4.9616522789001465', 'acc: 0.005047020968049765']


Epoch 14:
['loss: 4.84832239151001', 'acc: 0.002759522059932351']


Epoch 15:
['loss: 4.7611002922058105', 'acc: 0.006317853461951017']


Epoch 16:
['loss: 4.693447113037109', 'acc: 0.0019970226