## Setup

In [None]:
!pip install -q transformers==4.37.2
!pip install numba



In [None]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import mixed_precision

from tensorflow.keras.layers import Embedding, Input, Dense, Lambda
from tensorflow.keras.models import Model
import tensorflow.keras.backend as K
from numba import cuda

import sklearn as sk
import os
import nltk
from nltk.data import find
import pandas as pd
import gc

import matplotlib.pyplot as plt

import re

## Load datasets

In [None]:
## Load datasets
from google.colab import drive

# Mount Google Drive so the project folder is reachable under /content/drive
drive.mount('/content/drive')
# Folder that holds saved model checkpoints.
# NOTE(review): this value has no trailing slash, unlike the default
# model_checkpoints_path of create_bert_classification_model.
model_checkpoints_path = '/content/drive/My Drive/ds266proj/model_checkpoints'

# Read the two card datasets from Google Drive.
# hs_all_data is later split on its 'classes' column, nr_all_data on 'faction_code'.
hs_all_data = pd.read_csv('/content/drive/My Drive/ds266proj/hs_cards_data_text.csv')
nr_all_data = pd.read_csv('/content/drive/My Drive/ds266proj/nr_cards_data_text.csv')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
## CODE FOR TRAIN AND TEST SPLITS
##===========================================================

def splitTrainTest(dataset,
                   label,
                   test_ratio = 0.3,
                   seed = 2319):
  """Randomly split a dataset into train and test frames with integer labels.

  Args:
    dataset: pandas DataFrame holding the examples; it is not modified.
    label: name of the column that contains the class of each row.
    test_ratio: probability that any given row lands in the test frame, so
      the test share is approximately (not exactly) this fraction.
    seed: seed of the private random generator used for the split. The
      default reproduces the split obtained with np.random.seed(2319).

  Returns:
    (train_df, test_df): two frames with the same columns as `dataset`, where
    `label` has been replaced by an integer id. The id of a class is its
    position in the printed "Label classes" list (order of first appearance).
  """
  # Work on a copy so the caller's frame keeps its original labels
  modeling_df = dataset.copy()
  classes = modeling_df[label].unique().tolist()
  print(f"Label classes: {classes}")

  # Convert the label to a numeric value. Assigning the whole column (rather
  # than `.loc[:, label] = ...`) replaces the column, so it ends up with an
  # integer dtype and does not trigger the pandas in-place-setting warning.
  class_to_id = {class_name: class_id for class_id, class_name in enumerate(classes)}
  modeling_df[label] = modeling_df[label].map(class_to_id)

  # Split into training and test data: draw one uniform number per row from a
  # private generator, which leaves the global NumPy random state untouched.
  rng = np.random.RandomState(seed)
  test_indices = rng.rand(len(modeling_df)) < test_ratio
  train_df = modeling_df[~test_indices]
  test_df = modeling_df[test_indices]
  print("{} examples in training, {} examples for testing.".format(
    len(train_df), len(test_df)))

  return train_df, test_df

# Build the train/test frames for both datasets; each uses its own label
# column and the default test_ratio of splitTrainTest.
hs_train, hs_test = splitTrainTest(hs_all_data, label = 'classes')
nr_train, nr_test = splitTrainTest(nr_all_data, label = 'faction_code')

Label classes: ['PRIEST', 'WARRIOR', 'WARLOCK', 'SHAMAN', 'DRUID', 'PALADIN', 'MAGE', 'HUNTER', 'ROGUE', 'DEMONHUNTER', 'DEATHKNIGHT', 'NEUTRAL']
5234 examples in training, 2291 examples for testing.
Label classes: ['neutral-corp', 'neutral-runner', 'weyland-consortium', 'nbn', 'jinteki', 'haas-bioroid', 'shaper', 'criminal', 'anarch']
1495 examples in training, 628 examples for testing.


  modeling_df.loc[:, label] = modeling_df[label].map(classes.index)


In [None]:
from transformers import BertTokenizer, TFBertModel, BertModel, AutoTokenizer
from transformers import logging
logging.set_verbosity_error()

In [None]:
# Number of tokens every example is padded/truncated to; also the width of the
# Keras Input layers built in create_bert_classification_model.
MAX_SEQ_LENGTH = 512
# Hugging Face checkpoint name used for both the tokenizer and the TF BERT model
checkpoint = 'bert-base-cased'

In [None]:
# Tokenizer matching `checkpoint`; genBERTInputs uses it to encode the card text
bert_tokenizer = BertTokenizer.from_pretrained(checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


## Define functions for modeling

In [None]:
## Simplistic BERT model creation
## Tensorflow version
##-----------------------------------------------------
def create_bert_classification_model(bert_model,
                                     checkpoint_name = 'DEFAULT',
                                     model_checkpoints_path = '/content/drive/My Drive/ds266proj/model_checkpoints/',
                                     num_train_layers = 0,
                                     retrain_layers = None,
                                     hidden_size = 200,
                                     dropout = 0.3,
                                     chkpt_period = 1,
                                     learning_rate = 0.00005,
                                     num_classes = 12):
    """
    Build and compile a simple classification model on top of a BERT encoder.

    The hidden state of the first token ([CLS]) of the encoder's first output is
    fed through one ReLU hidden layer with dropout and a softmax output layer.

    Args:
        bert_model: the BERT Keras model to wrap (e.g. a TFBertModel). Its
            trainable flags are modified in place.
        checkpoint_name: prefix of the checkpoint file names.
        model_checkpoints_path: directory the checkpoint files are written to.
        num_train_layers: 0 freezes the whole encoder (even if retrain_layers is
            given); 12 with retrain_layers=None trains all of it; any other
            value leaves only that many of the last encoder layers
            (numbered 11, 10, ...) trainable.
        retrain_layers: optional list of substrings; when num_train_layers is
            not 0, every encoder weight whose name contains none of them is
            frozen. Overrides the layers derived from num_train_layers.
        hidden_size: number of units in the hidden layer.
        dropout: dropout rate applied after the hidden layer.
        chkpt_period: passed to ModelCheckpoint as `period`.
        learning_rate: learning rate of the Adam optimizer.
        num_classes: number of units in the softmax output layer.

    Returns:
        (classification_model, model_checkpoint_callback): the compiled Keras
        model, whose inputs are [input_ids, token_type_ids, attention_mask], and
        the ModelCheckpoint callback to pass to fit().
    """
    if num_train_layers == 0:
        # Freeze all layers of pre-trained BERT model
        bert_model.trainable = False
    elif num_train_layers == 12 and retrain_layers is None:
        # Train all layers of the BERT model
        bert_model.trainable = True
    else:
        # Restrict training to the num_train_layers outer transformer layers
        if retrain_layers is not None:
            print('retrain layers: ', retrain_layers)
            # Caller-supplied entries keep their plain substring semantics
            name_patterns = list(retrain_layers)
        else:
            retrain_layers = ['_' + str(11 - layer_offset)
                              for layer_offset in range(num_train_layers)]
            print('retrain layers: ', retrain_layers)
            # Anchor the generated codes to the encoder layer scope. A bare
            # code such as '_8' would also match an auto-numbered model name
            # like 'tf_bert_model_8/...' (or '_1' would match 'layer_._10'),
            # which would leave weights trainable that should be frozen.
            name_patterns = ['layer_.' + layer_code + '/' for layer_code in retrain_layers]

        for w in bert_model.weights:
            if not any(pattern in w.name for pattern in name_patterns):
                # print('freezing: ', w.name)
                # NOTE(review): this sets the variable's private `_trainable`
                # attribute; confirm the installed TF/Keras version honors it.
                w._trainable = False

    input_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype=tf.int64, name='input_ids_layer')
    token_type_ids = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype=tf.int64, name='token_type_ids_layer')
    attention_mask = tf.keras.layers.Input(shape=(MAX_SEQ_LENGTH,), dtype=tf.int64, name='attention_mask_layer')

    ## Set up the BERT model; this will use the CLS token input
    bert_inputs = {'input_ids': input_ids,
                   'token_type_ids': token_type_ids,
                   'attention_mask': attention_mask}
    bert_out = bert_model(bert_inputs)
    # First output, first token position of every sequence
    cls_token = bert_out[0][:, 0, :]

    hidden = tf.keras.layers.Dense(hidden_size, activation='relu', name='hidden_layer')(cls_token)
    hidden = tf.keras.layers.Dropout(dropout)(hidden)

    # Keep the softmax output in float32 so the probabilities stay numerically
    # stable when a mixed-precision policy is active; this is a no-op under the
    # default float32 policy.
    classification = tf.keras.layers.Dense(num_classes, activation='softmax',
                                           dtype='float32', name='classification')(hidden)
    classification_model = tf.keras.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=[classification])
    classification_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                                 loss='sparse_categorical_crossentropy',
                                 metrics=['accuracy'])

    ## Set up the checkpointing for the model
    # os.path.join gives the same path as before for a directory ending in '/',
    # and also works when the trailing slash is missing.
    checkpoint_filepath = os.path.join(
        model_checkpoints_path,
        checkpoint_name + '_tfweights.{epoch:02d}-{val_accuracy:.2f}.hdf5')
    model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_filepath, save_weights_only=True, period=chkpt_period)

    return classification_model, model_checkpoint_callback

## Parameterize Data For Models

In [None]:
## Set up the model inputs
# Class names for the HS data, in the same order as the "Label classes" list
# printed by splitTrainTest, so a name's position is its numeric label.
HS_LABEL_NAMES = ['PRIEST', 'WARRIOR', 'WARLOCK', 'SHAMAN', 'DRUID', 'PALADIN', 'MAGE', 'HUNTER', 'ROGUE', 'DEMONHUNTER', 'DEATHKNIGHT', 'NEUTRAL']

def genBERTInputs(train_data, test_data, desc_field, label_field, return_tensors):
  """Tokenize the text column and collect the labels of a train/test pair.

  Args:
    train_data, test_data: DataFrames holding the two splits.
    desc_field: name of the column with the text to tokenize.
    label_field: name of the column with the labels.
    return_tensors: forwarded to the tokenizer (the notebook passes 'tf').

  Returns:
    (x_train, x_test, y_train, y_test): the tokenizer outputs for both splits
    followed by the labels of both splits as NumPy arrays.
  """
  def encode_split(frame):
    # Every example is padded or truncated to exactly MAX_SEQ_LENGTH tokens
    texts = frame[desc_field].tolist()
    return bert_tokenizer(texts,
                          padding = "max_length",
                          truncation = True,
                          max_length = MAX_SEQ_LENGTH,
                          return_tensors = return_tensors)

  def labels_of(frame):
    # Array version of the label column
    return np.array(frame[label_field].tolist())

  encoded_train, encoded_test = encode_split(train_data), encode_split(test_data)
  labels_train, labels_test = labels_of(train_data), labels_of(test_data)
  return encoded_train, encoded_test, labels_train, labels_test

# Create the datasets: tokenized inputs (TensorFlow tensors, 'tf') and label
# arrays for the HS and NR train/test splits. Both use the 'description' column
# as text; the label column differs per dataset.
x_hs_train_tf, x_hs_test_tf, y_hs_train_tf, y_hs_test_tf = genBERTInputs(hs_train, hs_test, 'description', 'classes', 'tf')
x_nr_train_tf, x_nr_test_tf, y_nr_train_tf, y_nr_test_tf = genBERTInputs(nr_train, nr_test, 'description', 'faction_code', 'tf')

## Run Models

### Base Models

#### BERT, TF

In [None]:
## Parametrize
# Baseline on the HS data: num_train_layers=0 freezes the whole BERT encoder,
# so only the layers added on top of it are trained.
EPOCHS = 2
tf_bert_model = TFBertModel.from_pretrained(checkpoint)

# chkpt_period = EPOCHS is forwarded to the ModelCheckpoint callback's `period`
hs_bert_1, hs_bert_1_chkpt = create_bert_classification_model(tf_bert_model, checkpoint_name = 'BERT_base_HS', num_train_layers=0, chkpt_period = EPOCHS)

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]



In [None]:
## Train the model
# Inputs are passed in the same order as the model's Input layers
# (input ids, token type ids, attention mask). The test split is used as
# validation data, so val_accuracy is measured on the test examples.
hs_bert_1_history = hs_bert_1.fit(
    [x_hs_train_tf.input_ids, x_hs_train_tf.token_type_ids, x_hs_train_tf.attention_mask],
    y_hs_train_tf,
    validation_data=([x_hs_test_tf.input_ids, x_hs_test_tf.token_type_ids, x_hs_test_tf.attention_mask], y_hs_test_tf),
    batch_size=32,
    epochs=EPOCHS,
    callbacks=[hs_bert_1_chkpt]
)

Epoch 1/2
Epoch 2/2


In [None]:
# Clear out the previous model to make RAM space for a bigger pass.
# The History object returned by fit() is a Keras callback that still points at
# the model it was attached to, so detach it first; otherwise deleting the names
# below cannot free the model. The metrics in hs_bert_1_history.history are kept.
hs_bert_1_history.set_model(None)
# Delete the variables holding the NLP model and its checkpoint callback
del tf_bert_model
del hs_bert_1
del hs_bert_1_chkpt
gc.collect()

In [None]:
## Parametrize
# Baseline on the NR data: same frozen-encoder setup as the HS baseline.
# NOTE(review): the NR labels have 9 classes (see the split output above), while
# the classifier is built with 12 output units.
EPOCHS = 2
tf_bert_model = TFBertModel.from_pretrained(checkpoint)

# chkpt_period = EPOCHS is forwarded to the ModelCheckpoint callback's `period`
nr_bert_1, nr_bert_1_chkpt = create_bert_classification_model(tf_bert_model, checkpoint_name = 'BERT_base_NR', num_train_layers=0, chkpt_period = EPOCHS)



In [None]:
## Train the model
# Inputs follow the order of the model's Input layers (input ids, token type
# ids, attention mask); the NR test split is used as validation data.
nr_bert_1_history = nr_bert_1.fit(
    [x_nr_train_tf.input_ids, x_nr_train_tf.token_type_ids, x_nr_train_tf.attention_mask],
    y_nr_train_tf,
    validation_data=([x_nr_test_tf.input_ids, x_nr_test_tf.token_type_ids, x_nr_test_tf.attention_mask], y_nr_test_tf),
    batch_size=32,
    epochs=EPOCHS,
    callbacks=[nr_bert_1_chkpt]
)

Epoch 1/2
Epoch 2/2


In [None]:
# Clear out the previous model to make RAM space for a bigger pass.
# The History object returned by fit() is a Keras callback that still points at
# the model it was attached to, so detach it first; otherwise deleting the names
# below cannot free the model. The metrics in nr_bert_1_history.history are kept.
nr_bert_1_history.set_model(None)
# Delete the variables holding the NLP model and its checkpoint callback
del tf_bert_model
del nr_bert_1
del nr_bert_1_chkpt
gc.collect()

19041

#### Bigger BERT, TF

In [None]:
# Fine-tune only the last encoder layer (weights matching '_11') together with
# the classification head, and allow more epochs -- better?
tf_bert_model = TFBertModel.from_pretrained(checkpoint)
hs_bert_2, hs_bert_2_chkpt = create_bert_classification_model(tf_bert_model, checkpoint_name = 'BERT_shallow_train_HS', num_train_layers=1)

retrain layers:  ['_11']




In [None]:
# Train the shallow fine-tuning model on the HS data for 4 epochs; the test
# split is used as validation data. The checkpoint callback keeps its default
# period here, unlike the baseline run.
hs_bert_2_history = hs_bert_2.fit(
    [x_hs_train_tf.input_ids, x_hs_train_tf.token_type_ids, x_hs_train_tf.attention_mask],
    y_hs_train_tf,
    validation_data=([x_hs_test_tf.input_ids, x_hs_test_tf.token_type_ids, x_hs_test_tf.attention_mask], y_hs_test_tf),
    batch_size=32,
    epochs=4,
    callbacks=[hs_bert_2_chkpt]
)

Epoch 1/4




Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
# Free the model before the next experiment. The History object returned by
# fit() is a Keras callback that still references the model, so detach it first;
# the metrics in hs_bert_2_history.history are kept.
hs_bert_2_history.set_model(None)
del tf_bert_model
del hs_bert_2
del hs_bert_2_chkpt
gc.collect()

18993

In [None]:
# Fine-tune only the last encoder layer (weights matching '_11') together with
# the classification head, and allow more epochs -- better?
tf_bert_model = TFBertModel.from_pretrained(checkpoint)
nr_bert_2, nr_bert_2_chkpt = create_bert_classification_model(tf_bert_model, checkpoint_name = 'BERT_shallow_train_NR', num_train_layers=1)

retrain layers:  ['_11']




In [None]:
# Train the shallow fine-tuning model on the NR data for 4 epochs; the test
# split is used as validation data.
nr_bert_2_history = nr_bert_2.fit(
    [x_nr_train_tf.input_ids, x_nr_train_tf.token_type_ids, x_nr_train_tf.attention_mask],
    y_nr_train_tf,
    validation_data=([x_nr_test_tf.input_ids, x_nr_test_tf.token_type_ids, x_nr_test_tf.attention_mask], y_nr_test_tf),
    batch_size=32,
    epochs=4,
    callbacks=[nr_bert_2_chkpt]
)

Epoch 1/4




Epoch 2/4
Epoch 3/4
Epoch 4/4


In [None]:
# Free the model before the next experiment. The History object returned by
# fit() is a Keras callback that still references the model, so detach it first;
# the metrics in nr_bert_2_history.history are kept.
nr_bert_2_history.set_model(None)
del tf_bert_model
del nr_bert_2
del nr_bert_2_chkpt
gc.collect()

18985

In [None]:
# Fine-tune the last four encoder layers (weights matching '_11', '_10', '_9',
# '_8') together with the classification head, and allow more epochs -- better?
tf_bert_model = TFBertModel.from_pretrained(checkpoint)
hs_bert_3, hs_bert_3_chkpt = create_bert_classification_model(tf_bert_model, checkpoint_name = 'BERT_medium_train_HS', num_train_layers=4)

retrain layers:  ['_11', '_10', '_9', '_8']




In [None]:
# Train the four-layer fine-tuning model on the HS data for 8 epochs; the test
# split is used as validation data.
hs_bert_3_history = hs_bert_3.fit(
    [x_hs_train_tf.input_ids, x_hs_train_tf.token_type_ids, x_hs_train_tf.attention_mask],
    y_hs_train_tf,
    validation_data=([x_hs_test_tf.input_ids, x_hs_test_tf.token_type_ids, x_hs_test_tf.attention_mask], y_hs_test_tf),
    batch_size=32,
    epochs=8,
    callbacks=[hs_bert_3_chkpt]
)

Epoch 1/8




Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# Free the model before the next experiment. The History object returned by
# fit() is a Keras callback that still references the model, so detach it first;
# the metrics in hs_bert_3_history.history are kept.
hs_bert_3_history.set_model(None)
del tf_bert_model
del hs_bert_3
del hs_bert_3_chkpt
gc.collect()

In [None]:
# Fine-tune the last four encoder layers (weights matching '_11', '_10', '_9',
# '_8') together with the classification head, and allow more epochs -- better?
tf_bert_model = TFBertModel.from_pretrained(checkpoint)
nr_bert_3, nr_bert_3_chkpt = create_bert_classification_model(tf_bert_model, checkpoint_name = 'BERT_medium_train_NR', num_train_layers=4)

retrain layers:  ['_11', '_10', '_9', '_8']




In [None]:
# Train the four-layer fine-tuning model on the NR data for 8 epochs; the test
# split is used as validation data.
nr_bert_3_history = nr_bert_3.fit(
    [x_nr_train_tf.input_ids, x_nr_train_tf.token_type_ids, x_nr_train_tf.attention_mask],
    y_nr_train_tf,
    validation_data=([x_nr_test_tf.input_ids, x_nr_test_tf.token_type_ids, x_nr_test_tf.attention_mask], y_nr_test_tf),
    batch_size=32,
    epochs=8,
    callbacks=[nr_bert_3_chkpt]
)

Epoch 1/8




Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# Free the model before the full fine-tuning runs. This cell also deletes the
# training history, so the metrics of nr_bert_3 are no longer available
# afterwards.
del tf_bert_model
del nr_bert_3
del nr_bert_3_chkpt
del nr_bert_3_history
gc.collect()

1689

In [None]:
# Clear the GPU cache by resetting the current CUDA device through numba.
# NOTE(review): this presumably also tears down the CUDA context TensorFlow is
# using in this process -- confirm the following TF cells still run on the GPU
# without a runtime restart.
device = cuda.get_current_device()
device.reset()

In [None]:
## Attempt to train a deeper model
# Switch Keras to the 'mixed_float16' policy globally; it applies to models
# built after this cell (presumably to fit full fine-tuning into GPU memory).
mixed_precision.set_global_policy('mixed_float16')

In [None]:
# Fine-tune the whole BERT encoder (num_train_layers=12 makes every layer
# trainable) together with the classification head.
# NOTE(review): the checkpoint name says 'twothirds' although all 12 layers are
# trained (the NR counterpart uses 'BERT_full_train_NR'); the name is left
# unchanged so existing checkpoint files still match.
tf_bert_model = TFBertModel.from_pretrained(checkpoint)
hs_bert_4, hs_bert_4_chkpt = create_bert_classification_model(tf_bert_model, checkpoint_name = 'BERT_twothirds_train_HS', num_train_layers=12)



In [None]:
# Train the fully trainable model on the HS data for 8 epochs. The batch size
# is 16 here, half of what the earlier runs use; the test split is used as
# validation data.
hs_bert_4_history = hs_bert_4.fit(
    [x_hs_train_tf.input_ids, x_hs_train_tf.token_type_ids, x_hs_train_tf.attention_mask],
    y_hs_train_tf,
    validation_data=([x_hs_test_tf.input_ids, x_hs_test_tf.token_type_ids, x_hs_test_tf.attention_mask], y_hs_test_tf),
    batch_size=16,
    epochs=8,
    callbacks=[hs_bert_4_chkpt]
)

Epoch 1/8




Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [None]:
# Fine-tune the whole BERT encoder (num_train_layers=12 makes every layer
# trainable) together with the classification head, now on the NR data.
# NOTE(review): unlike the earlier experiments, hs_bert_4 and its encoder are
# not deleted before this load -- confirm there is enough memory for both.
tf_bert_model = TFBertModel.from_pretrained(checkpoint)
nr_bert_4, nr_bert_4_chkpt = create_bert_classification_model(tf_bert_model, checkpoint_name = 'BERT_full_train_NR', num_train_layers=12)



In [None]:
# Train the fully trainable model on the NR data for 8 epochs with batch size
# 16; the test split is used as validation data.
nr_bert_4_history = nr_bert_4.fit(
    [x_nr_train_tf.input_ids, x_nr_train_tf.token_type_ids, x_nr_train_tf.attention_mask],
    y_nr_train_tf,
    validation_data=([x_nr_test_tf.input_ids, x_nr_test_tf.token_type_ids, x_nr_test_tf.attention_mask], y_nr_test_tf),
    batch_size=16,
    epochs=8,
    callbacks=[nr_bert_4_chkpt]
)

Epoch 1/8




Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
