In [None]:
import tensorflow as tf
import transformers
import pandas as pd
from sklearn.model_selection import train_test_split
import utils as utils
import numpy as np
from keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [None]:
# Version Info: print the installed library versions so a run can be reproduced.
print("TensorFlow Version : " ,tf.__version__)
print("Transformers Version : " ,transformers.__version__)

In [None]:
# make a data folder
!mkdir -p data

# download the data
# get_file fetches the archive into the "data" cache directory and, because
# extract=True, unpacks it; the returned path is printed below.
# NOTE(review): this is the UCI "sentiment labelled sentences" archive, but no
# later cell in the visible notebook reads from out_path -- the following cells
# load CSVs from text_entailment_dataset/. Confirm this download is still needed.
out_path = tf.keras.utils.get_file(origin="https://archive.ics.uci.edu/static/public/331/sentiment+labelled+sentences.zip",extract=True,cache_dir="data")
print("\n",out_path)

In [None]:
# Read the CSV file into a DataFrame
df = pd.read_csv('text_entailment_dataset/train.csv')

# Shuffle the DataFrame
df = df.sample(frac=1, random_state=42)  # Shuffle with fixed seed for reproducibility

# Split the data into training and validation sets (90% / 10%)
train_df, validation_df = train_test_split(df, test_size=0.1, random_state=42)

# Write the training and validation DataFrames to separate CSV files
train_df.to_csv('text_entailment_dataset/train_data.csv', index=False)
validation_df.to_csv('text_entailment_dataset/validation_data.csv', index=False)

# Load the three splits back from disk. Reading the files (written without an
# index column) gives every frame a fresh 0..n-1 index, which the later
# row-by-position code relies on.
# Each frame gets its own name only: the previous `x = df = read_csv(...)` form
# silently rebound `df` to the test split.
train_dataset = pd.read_csv('text_entailment_dataset/train_data.csv')
validation_dataset = pd.read_csv('text_entailment_dataset/validation_data.csv')
test_dataset = pd.read_csv('text_entailment_dataset/test_data.csv')

train_dataset.head()


In [None]:
# Sanity check: (rows, columns) of the training split.
train_dataset.shape

In [None]:
# Preview the first rows of the validation split.
validation_dataset.head()

In [None]:
# Sanity check: (rows, columns) of the validation split.
validation_dataset.shape


In [None]:
# Preview the first rows of the test split.
test_dataset.head()


In [None]:
# Sanity check: (rows, columns) of the test split.
test_dataset.shape

In [None]:
# define a max length constant
# data_generator passes this to the tokenizer as max_length, so every example
# is padded or truncated to exactly this many tokens.
MAX_LENGTH = 64

In [None]:
from transformers import DistilBertTokenizer, DistilBertModel

# Load the pretrained DistilBERT tokenizer and base model.
# NOTE(review): in the visible notebook `bert` is only printed in the next cell;
# training uses a separately loaded TFDistilBertModel, and `tokenizer` is
# rebound by a later AutoTokenizer cell. Confirm both loads are still needed.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert = DistilBertModel.from_pretrained('distilbert-base-uncased')

In [None]:
# Show the text representation of the tokenizer and the model loaded above.
print(tokenizer)
print(bert)

In [None]:
def _normalize_text_column(frame, column):
    """Clean one text column of `frame` in place.

    The column is cast to str and then passed, element by element, through
    utils.change_lower, utils.clean_data and utils.remover, in that order
    (what each step does is defined in the project's utils module).
    """
    frame[column] = (
        frame[column]
        .astype(str)
        .apply(utils.change_lower)
        .apply(utils.clean_data)
        .apply(utils.remover)
    )


# Apply the same cleaning recipe to both text columns of every split, instead
# of repeating the four statements six times.
for frame in (train_dataset, validation_dataset, test_dataset):
    for column in ("premise", "hypothesis"):
        _normalize_text_column(frame, column)

In [None]:
# Element-wise concatenation of the premise and hypothesis columns of each split.
# NOTE(review): the two texts are joined with no separator, and X_train / X_Val /
# X_test are not referenced again in the visible notebook -- later cells rebuild
# the sentence lists from the frames directly. Confirm whether these are needed.
X_train = train_dataset['premise'] + train_dataset['hypothesis']
X_Val = validation_dataset['premise'] + validation_dataset['hypothesis']
X_test = test_dataset['premise'] + test_dataset['hypothesis']

In [None]:
from transformers import AutoTokenizer

# Rebinds `tokenizer` (first created in an earlier cell) to the AutoTokenizer for
# the same checkpoint; this is the object data_generator reads from global scope.
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [None]:
# define a batch size for our experiments
BATCH_SIZE = 64

# Reload the three splits from disk.
# NOTE(review): this replaces the frames that the earlier cleaning cell modified
# in place, so the sentences built in the following cells come from the
# uncleaned CSV text. Confirm that discarding the cleaning is intended.
# Each frame gets its own name only: the previous `x = df = read_csv(...)` form
# silently rebound `df` to the test split.
train_dataset = pd.read_csv('text_entailment_dataset/train_data.csv')
validation_dataset = pd.read_csv('text_entailment_dataset/validation_data.csv')
test_dataset = pd.read_csv('text_entailment_dataset/test_data.csv')



In [None]:
def _joined_text(frame):
    """Return one string per row: column 0 and column 1 of `frame` joined by a space.

    Columns are addressed by position (assumes column 0 is the premise and
    column 1 the hypothesis -- TODO confirm against the CSV header). Both are
    cast to str first so non-string or missing cells cannot break the
    concatenation. The space keeps the last word of the first text from being
    fused with the first word of the second.
    """
    first = frame.iloc[:, 0].astype(str)
    second = frame.iloc[:, 1].astype(str)
    return (first + " " + second).tolist()


# Vectorized column access replaces the per-row `.loc[s][0]` lookups, which
# depended on deprecated positional indexing of a Series by integer key.
train_sentences = _joined_text(train_dataset)
train_labels = train_dataset.iloc[:, 2].tolist()

validation_sentences = _joined_text(validation_dataset)
validation_labels = validation_dataset.iloc[:, 2].tolist()

# The test split is only used for its sentences here; no labels are read.
test_sentences = _joined_text(test_dataset)

print("LENGTHS // Train sentences: " + str(len(train_sentences)) + ". Train labels: " + str(len(train_labels)))
print("LENGTHS // Validation sentences: " + str(len(validation_sentences)) + ". Validation labels: " + str(len(validation_labels)))
print("LENGTHS // Test sentences: " + str(len(test_sentences)))

In [None]:
# Number of distinct values in the training "label" column.
num_classes = len(set(train_dataset["label"]))

# One-hot encode the labels of both splits with that class count.
train_labels = to_categorical(train_dataset["label"], num_classes=num_classes)
validation_labels = to_categorical(validation_dataset["label"], num_classes=num_classes)

print("train label shape:", train_labels.shape)
print("val label shape:", validation_labels.shape)

# Whole batches available per pass over each split (any remainder is dropped).
steps_per_epoch = len(train_labels) // BATCH_SIZE
validation_steps = len(validation_labels) // BATCH_SIZE


In [None]:
#Implementing KMeans clustering for better labels
from sklearn.feature_extraction.text import TfidfVectorizer

# Use a tfidf vectorizer to turn the training sentences into a sparse matrix
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(train_sentences)

# Choosing 3 clusters
k = 3

# Apply K-means clustering; `k` is now actually used instead of a second
# hard-coded 3, so the cluster count is defined in one place.
kmeans = KMeans(n_clusters=k, random_state=0, n_init="auto").fit(X)

# Get cluster assignments for the training data
train_cluster_labels = kmeans.labels_
# NOTE(review): this overwrites the one-hot training labels derived from the
# "label" column with one-hot cluster ids. K-means cluster indices are arbitrary
# and are not aligned with the class ids, while the validation labels still come
# from the "label" column -- confirm this replacement is intended.
# NOTE(review): num_classes comes from the label column; it must be >= k for
# to_categorical to accept every cluster id.
train_labels = to_categorical(train_cluster_labels, num_classes=num_classes)

In [None]:
# data generator for the model
def data_generator(sentences: np.array,labels: np.array,batch_size: int) -> (dict,tf.Tensor):
    """Yield (tokenized inputs, label tensor) batches forever.

    Consecutive windows of `batch_size` items are taken from `sentences` and
    `labels`. Whenever the next window would run past the end of `sentences`,
    the generator rewinds to the start, so a trailing partial batch is skipped
    (if there are fewer than `batch_size` sentences in total, the whole input
    is yielded as one short batch every time).

    The tokenizer and MAX_LENGTH are read from the notebook's global scope;
    each batch is padded and truncated to MAX_LENGTH tokens.

    Args:
        sentences: sliceable sequence of input texts.
        labels: sliceable sequence of labels, aligned with `sentences`.
        batch_size: number of items per batch.

    Yields:
        A dict built from the tokenizer output and the labels of the same
        window converted with tf.convert_to_tensor.
    """
    cursor = 0

    while True:
        # Not enough items left for a full window: start over from the top.
        if cursor + batch_size > len(sentences):
            cursor = 0
        window = slice(cursor, cursor + batch_size)
        cursor += batch_size

        encoded = tokenizer(sentences[window], return_tensors="tf", max_length=MAX_LENGTH, truncation="longest_first", padding="max_length")
        targets = tf.convert_to_tensor(labels[window])

        yield dict(encoded), targets

train_data = data_generator(train_sentences,train_labels,BATCH_SIZE)
val_data = data_generator(validation_sentences,validation_labels,BATCH_SIZE)

In [None]:
# Pull one batch from each generator and print the shapes of `input_ids` and of
# the label tensor, to confirm they match what data_generator is meant to emit.
# Note that each next() call advances its generator by one batch.
tmp_batch_x, tmp_batch_y = next(train_data)
val_batch_x, val_batch_y = next(val_data)

for features, targets in ((tmp_batch_x, tmp_batch_y), (val_batch_x, val_batch_y)):
    print(features["input_ids"].shape)
    print(targets.shape)

print(train_cluster_labels)


In [None]:
from keras.src.callbacks import History
from transformers import TFDistilBertModel
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.optimizers.schedules import ExponentialDecay

# Define custom metrics functions
def precision(y_true, y_pred):
    """Precision metric computed over every element of the two tensors.

    Counts the rounded, [0, 1]-clipped entries of `y_true * y_pred` and divides
    by the count of rounded, clipped entries of `y_pred`; the backend epsilon
    in the denominator avoids division by zero.
    """
    K = tf.keras.backend
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    flagged = K.sum(K.round(K.clip(y_pred, 0, 1)))
    return hits / (flagged + K.epsilon())

def recall(y_true, y_pred):
    """Recall metric computed over every element of the two tensors.

    Counts the rounded, [0, 1]-clipped entries of `y_true * y_pred` and divides
    by the count of rounded, clipped entries of `y_true`; the backend epsilon
    in the denominator avoids division by zero.
    """
    K = tf.keras.backend
    hits = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    relevant = K.sum(K.round(K.clip(y_true, 0, 1)))
    return hits / (relevant + K.epsilon())

def f1(y_true, y_pred):
    """F1 metric: 2 * p * r / (p + r + epsilon), built from precision() and recall()."""
    prec = precision(y_true, y_pred)
    rec = recall(y_true, y_pred)
    harmonic_half = (prec * rec) / (prec + rec + tf.keras.backend.epsilon())
    return 2 * harmonic_half


In [None]:
# Distribution strategy whose scope the model is built and trained under below.
strategy = tf.distribute.MirroredStrategy()

In [None]:
# Report how many replicas the strategy keeps in sync (labelled "Total GPUs").
print("Total GPUs: ", strategy.num_replicas_in_sync)

In [None]:
# Shell out to nvidia-smi to list the GPUs on this machine (NVIDIA hardware only).
!nvidia-smi

In [None]:
# Number of whole batches in the training sentences; this is the same
# computation as steps_per_epoch whenever labels and sentences have equal length.
print(len(train_sentences)//BATCH_SIZE)

In [None]:
with strategy.scope():

    # Pretrained DistilBERT encoder.
    # We do not need attention outputs, and we ask for tuple outputs
    # (return_dict=False) since they are easier to index in the Lambda below.
    bert_model = TFDistilBertModel.from_pretrained('distilbert-base-uncased',output_attentions = False,return_dict=False)

    # Freeze the encoder so training only updates the dense head.
    bert_model.trainable = False

    # Define the learning rate schedule parameters
    initial_learning_rate = 0.001
    decay_rate = 0.95
    decay_steps = 1000

    # Exponential decay: the rate is multiplied by decay_rate every decay_steps
    # optimizer steps (staircase=True applies it in discrete jumps).
    lr_schedule = ExponentialDecay(
        initial_learning_rate,
        decay_steps=decay_steps,
        decay_rate=decay_rate,
        staircase=True
    )

    optimizer = Adam(learning_rate=lr_schedule)

    model_ = tf.keras.Sequential([
        bert_model,
        # Take element 0 of the encoder output tuple and keep position 0 along
        # the second axis (presumably the [CLS] token embedding -- confirm).
        tf.keras.layers.Lambda(lambda x: x[0][:,0,:]), # https://keras.io/api/layers/core_layers/lambda/
        tf.keras.layers.Dense(64,activation="relu"),
        tf.keras.layers.Dense(32,activation="relu"),
        tf.keras.layers.Dense(10,activation="relu"),
        tf.keras.layers.Dense(3,activation="softmax") # we have 3 classes
    ])

    # Checkpoint callback that keeps the file from the epoch with the lowest val_loss.
    # NOTE(review): this saves the full model to .h5, while the later
    # save_weights cell says the model cannot be saved because of the Lambda
    # layer -- consider save_weights_only=True here.
    checkpoint = ModelCheckpoint('distilbert_trained_model.h5', monitor='val_loss', mode='min', save_best_only=True, verbose=1)

    model_.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy', precision, recall, f1])

    # Use the first batch of training data we got to instantiate -- needed for loading weights
    model_(tmp_batch_x)

    #Load model if needed
    #model_.load_weights('transformer_weights.h5')

    # Callback that collects per-epoch metrics; later cells read history.history.
    history = History()

    # The generators already emit batches of BATCH_SIZE, so batch_size and
    # validation_batch_size are not passed: Keras documents that they must not
    # be specified for generator input. The generators never terminate, so the
    # step counts define where each epoch and each validation pass end.
    model_.fit(
        train_data,
        epochs=2,
        steps_per_epoch=steps_per_epoch,
        validation_data=val_data,
        validation_steps=validation_steps,
        callbacks=[history, checkpoint]
    )


In [None]:
# Save only the weights of the trained model to an HDF5 file.
# (Original author's note: the full model cannot be saved because of the Lambda layer.)
model_.save_weights('transformer_weights.h5')

In [None]:
# Access the metrics history
# The keys are the metric names that the following cells index into.
print(history.history.keys())  # To see what metrics are available

In [None]:
import numpy as np

# Access the metrics history
epochs = list(range(1, len(history.history['accuracy']) + 1))  # Assuming all metrics have the same length
precision = np.array(history.history['precision'])
recall = np.array(history.history['recall'])
f1 = np.array(history.history['f1'])
accuracy = np.array(history.history['accuracy'])
loss = np.array(history.history['loss'])
val_accuracy = np.array(history.history['val_accuracy'])
val_loss = np.array(history.history['val_loss'])


In [None]:
# Plot all four metrics on one graph
plt.figure(figsize=(10, 5))
plt.plot(epochs, val_loss, label='Validation Loss')
plt.plot(epochs, val_accuracy, label='Validation Accuracy')
plt.plot(epochs, loss, label='Training Loss')
plt.plot(epochs, accuracy, label='Training Accuracy')

plt.title('Metrics across Epochs')
plt.xlabel('Epoch')
plt.xticks(epochs)
plt.legend()
plt.savefig("Transformer_BERT_training_val_loss_acc(1).pdf")  # Save the plot before showing
plt.show()


In [None]:
# Plot all four metrics on one graph
plt.figure(figsize=(10, 5))
plt.plot(epochs, precision, label='Precision')
plt.plot(epochs, recall, label='Recall')
plt.plot(epochs, f1, label='F1 Score')
plt.plot(epochs, accuracy, label='Accuracy')

plt.title('Metrics across Epochs')
plt.xlabel('Epoch')
plt.xticks(epochs)
plt.legend()
plt.savefig("Transformer_BERT_training_PRFA(1).pdf")
plt.show()