# README

## Directory structure for running
```
root/
├── training_data/
│   └── ED/
│       ├── train.csv      
│       └── dev.csv       
├── test_data/
│   └── ED/
│       └── test.csv       # Test data for demo
├── bert_cnn_multi.ipynb  # Notebook
├── bert_cnn_model_256_multi/      # Directory for the trained model
└── bert_cnn_tokenizer_256_multi/  # Directory for the tokenizer

```

### Model can be downloaded from [here](https://drive.google.com/drive/folders/1-_eka-0MsXlYmHGJDICqZJBYtkpvPGYF?usp=sharing)
### Tokenizer can be downloaded from [here](https://drive.google.com/drive/folders/1-UzHyEx1RSmysrhRTIt5DmnDu7BzbMhZ?usp=sharing)

## Misc

- The notebook is structured in 3 sections, separated by markdown cells: Training, Evaluation, and Demo
- Model was inspired by the following [paper](https://www.sciencedirect.com/science/article/pii/S187705092300234X)


In [None]:
# Uncomment if using drive

# from google.colab import drive
# import os

# drive.mount('/content/drive')
# os.chdir('/content/drive/My Drive/NLU')
# !ls

In [None]:
!pip install tensorflow transformers optuna

# Training

In [None]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification, TFBertModel
import optuna

In [None]:
# Seed TensorFlow's global random generator before anything else runs.
tf.random.set_seed(42)

# CSV locations of the training split and the validation (dev) split.
TRAINING_FILE = "./training_data/ED/train.csv"
VALIDATION_FILE = "./training_data/ED/dev.csv"

In [None]:
# Read the training and validation CSVs into DataFrames.
train_df, val_df = (
    pd.read_csv(csv_path) for csv_path in (TRAINING_FILE, VALIDATION_FILE)
)

In [None]:
# Load the pretrained BERT tokenizer and encode each claim together with its
# evidence, truncating/padding every encoding to 256 tokens.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Shared tokenizer settings so both splits are encoded identically.
encode_kwargs = {
    "truncation": True,
    "padding": "max_length",
    "max_length": 256,
}

train_encodings = tokenizer(
    train_df["Claim"].tolist(), train_df["Evidence"].tolist(), **encode_kwargs
)
val_encodings = tokenizer(
    val_df["Claim"].tolist(), val_df["Evidence"].tolist(), **encode_kwargs
)


In [None]:
# Wrap each split as a tf.data.Dataset of (encoding dict, label) elements.
train_labels = train_df["label"]
train_features = dict(train_encodings)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

val_labels = val_df["label"]
val_features = dict(val_encodings)
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, val_labels))


In [None]:
# Inverse-frequency class weights:
#     weight(label) = total_samples / (number_of_classes * count(label))
train_class_counts = train_df["label"].value_counts().to_dict()
total_samples = len(train_df)
num_classes = len(train_class_counts)

class_weights = {}
for label, count in train_class_counts.items():
    class_weights[label] = total_samples / (num_classes * count)

print("\nClass weights:", class_weights)
print(train_class_counts)

# Same weights, with the labels cast to plain Python ints for Keras.
class_weight_dict = {}
for cls, weight in class_weights.items():
    class_weight_dict[int(cls)] = weight

In [None]:
# set batch size and shuffle data
batch_size = 16

# Use a shuffle buffer as large as the dataset itself. A fixed buffer of 1000
# only shuffles within a sliding window, so if the CSV rows are ordered (for
# example by label or topic) the batches stay correlated with that order.
train_dataset = train_dataset.shuffle(train_dataset.cardinality()).batch(batch_size)

# Validation data is only batched; its order is left untouched.
val_dataset = val_dataset.batch(batch_size)


In [None]:

class BertCNNModel(tf.keras.Model):
    """BERT encoder followed by a multi-kernel 1D-CNN classification head.

    The forward pass takes BERT's ``last_hidden_state``, applies one
    ``Conv1D`` per kernel size to it, global-max-pools each convolution
    output, concatenates the pooled features, and feeds them through dropout,
    a ReLU dense layer and a final 2-unit dense layer. The output is raw
    logits (no softmax is applied).

    Args:
        num_filters: Number of filters in every ``Conv1D`` layer.
        kernel_sizes: Iterable of kernel widths; one ``Conv1D`` layer is
            created per entry.
        dropout_rate: Dropout rate applied to the concatenated CNN features.
        dense_units: Number of units in the hidden dense layer.
    """

    def __init__(self,
                 num_filters=128,
                 kernel_sizes=(3, 4, 5),
                 dropout_rate=0.2,
                 dense_units=128):
        # The default is a tuple rather than a list so the default object
        # cannot be mutated and shared between instances.
        super().__init__()
        self.bert = TFBertModel.from_pretrained('bert-base-uncased')

        # One convolution branch per requested kernel width.
        self.convs = [
            tf.keras.layers.Conv1D(filters=num_filters,
                                   kernel_size=k,
                                   activation='relu',
                                   padding='valid')
            for k in kernel_sizes
        ]
        self.pool = tf.keras.layers.GlobalMaxPooling1D()
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.dense = tf.keras.layers.Dense(dense_units, activation='relu')
        self.classifier = tf.keras.layers.Dense(2)

    def call(self, inputs, training=False):
        """Return the 2-class logits for ``inputs``.

        Args:
            inputs: Whatever ``TFBertModel`` accepts; in this notebook it is
                the dict of tokenizer encodings.
            training: Forwarded to BERT and to the dropout layer.
        """
        bert_output = self.bert(inputs, training=training).last_hidden_state

        # apply convolution and global max pooling for each kernel size
        conv_outputs = [self.pool(conv(bert_output)) for conv in self.convs]

        concated_conv_outputs = tf.concat(conv_outputs, axis=-1)
        dropout_output = self.dropout(concated_conv_outputs, training=training)
        dense_output = self.dense(dropout_output)
        logits = self.classifier(dense_output)
        return logits

In [None]:
# hyperparameter fine-tuning
def objective(trial):
    """Optuna objective: train a ``BertCNNModel`` and score it on validation.

    Samples the number of CNN filters, the dropout rate, the dense layer
    width and the learning rate from ``trial``, trains for 2 epochs on the
    module-level ``train_dataset`` using ``class_weight_dict``, and evaluates
    on ``val_dataset``.

    Args:
        trial: The Optuna trial used to sample hyperparameters.

    Returns:
        The highest validation accuracy reached over the training epochs.
    """
    num_filters = trial.suggest_categorical('num_filters', [64, 128, 256])
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.3, step=0.1)
    dense_units = trial.suggest_categorical('dense_units', [64, 128, 256])
    # suggest_loguniform is deprecated; suggest_float(..., log=True) samples
    # from the same log-uniform range.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 2e-5, log=True)

    # BertCNNModel has no `kernel_size` parameter, so passing one raised a
    # TypeError on every trial. The default kernel sizes are used instead,
    # which matches how the final model is constructed.
    model = BertCNNModel(num_filters=num_filters,
                         dropout_rate=dropout_rate,
                         dense_units=dense_units)

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer,
                  loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                  metrics=['accuracy'])

    history = model.fit(train_dataset,
                        validation_data=val_dataset,
                        epochs=2,
                        verbose=0,
                        class_weight=class_weight_dict
                        )

    # return the highest accuracy over all epochs
    return max(history.history['val_accuracy'])


In [None]:
# Run the trial then print the results

# study = optuna.create_study(direction="maximize")
# study.optimize(objective, timeout=7200, show_progress_bar=True)

# print("Best trial:")
# trial = study.best_trial
# print(trial.params)

# print("============================TRAIL DATA=========================================")
# trials = study.get_trials()
# for t in trials:
#   print(t.params, t.value)

In [None]:
# Instantiate the model that will be trained and saved; kernel sizes are left
# at the class default.
model = BertCNNModel(
    num_filters=256,
    dense_units=256,
    dropout_rate=0.2,
)

In [None]:
# Configure training: sparse categorical cross-entropy computed directly on
# the model's logits, Adam at a 2e-5 learning rate, and accuracy as metric.
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
optimizer = tf.keras.optimizers.Adam(learning_rate=2e-5)
model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=["accuracy"],
)

In [None]:
# callback to save our model at best epoch
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath="./bert_cnn_model_256_multi",
    save_best_only=True,
    monitor="val_accuracy",
    mode="max"
)

# train our model
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=2,
    verbose=1,
    class_weight=class_weight_dict


)

In [None]:
model.save("./bert_cnn_model_256_multi")

tokenizer.save_pretrained("./bert_cnn_tokenizer_256_multi")

# Evaluation

In [None]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, TFBertForSequenceClassification
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [None]:
# Evaluation reads the labelled dev split.
TEST_FILE = "./training_data/ED/dev.csv"
test_df = pd.read_csv(filepath_or_buffer=TEST_FILE)


In [None]:
# Load the tokenizer from its saved directory.
tokenizer = BertTokenizer.from_pretrained("./bert_cnn_tokenizer_256_multi")

# Encode each claim together with its evidence, truncated/padded to 256 tokens.
claims = test_df["Claim"].tolist()
evidence = test_df["Evidence"].tolist()
test_encodings = tokenizer(
    claims,
    evidence,
    max_length=256,
    padding="max_length",
    truncation=True,
)

In [None]:
# Pair the encodings with their gold labels in a tf.data.Dataset.
test_labels = test_df["label"]
test_features = dict(test_encodings)
test_dataset = tf.data.Dataset.from_tensor_slices((test_features, test_labels))


In [None]:
# Copy of the model class used in training, for the model to use during inference
class BertCNNModel(tf.keras.Model):
    """BERT encoder followed by a multi-kernel 1D-CNN classification head.

    The forward pass takes BERT's ``last_hidden_state``, applies one
    ``Conv1D`` per kernel size to it, global-max-pools each convolution
    output, concatenates the pooled features, and feeds them through dropout,
    a ReLU dense layer and a final 2-unit dense layer. The output is raw
    logits (no softmax is applied).

    Args:
        bert_model_name: Name or path passed to ``TFBertModel.from_pretrained``.
        num_filters: Number of filters in every ``Conv1D`` layer.
        kernel_sizes: Iterable of kernel widths; one ``Conv1D`` layer is
            created per entry.
        dropout_rate: Dropout rate applied to the concatenated CNN features.
        dense_units: Number of units in the hidden dense layer.
    """

    def __init__(self,
                 bert_model_name='bert-base-uncased',
                 num_filters=128,
                 kernel_sizes=(3, 4, 5),
                 dropout_rate=0.2,
                 dense_units=128):
        # The default is a tuple rather than a list so the default object
        # cannot be mutated and shared between instances.
        super().__init__()
        self.bert = TFBertModel.from_pretrained(bert_model_name)

        # One convolution branch per requested kernel width.
        self.convs = [
            tf.keras.layers.Conv1D(filters=num_filters,
                                   kernel_size=k,
                                   activation='relu',
                                   padding='valid')
            for k in kernel_sizes
        ]
        self.pool = tf.keras.layers.GlobalMaxPooling1D()
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.dense = tf.keras.layers.Dense(dense_units, activation='relu')
        self.classifier = tf.keras.layers.Dense(2)

    def call(self, inputs, training=False):
        """Return the 2-class logits for ``inputs``.

        Args:
            inputs: Whatever ``TFBertModel`` accepts; in this notebook it is
                the dict of tokenizer encodings.
            training: Forwarded to BERT and to the dropout layer.
        """
        bert_output = self.bert(inputs, training=training).last_hidden_state

        # Convolve and global-max-pool once per kernel size.
        conv_outputs = [self.pool(conv(bert_output)) for conv in self.convs]

        concat = tf.concat(conv_outputs, axis=-1)
        dropout_output = self.dropout(concat, training=training)
        dense_output = self.dense(dropout_output)
        logits = self.classifier(dense_output)
        return logits

In [None]:
# Restore the trained model from its saved directory.
model = tf.keras.models.load_model(
    filepath="./bert_cnn_model_256_multi",
)

In [None]:
# Group the examples into batches of 16 before feeding them to the model.
test_dataset_batched = test_dataset.batch(batch_size=16)

In [None]:
# Run inference, then turn the logits into class predictions by taking the
# index of the largest logit in each row.
predictions_logits = model.predict(test_dataset_batched)
y_pred = np.argmax(a=predictions_logits, axis=1)

In [None]:
# Report accuracy, then precision / recall / F1 under macro and weighted
# averaging.

# Collect the gold labels back out of the batched dataset.
y_true = np.concatenate([labels for _, labels in test_dataset_batched], axis=0)

acc = accuracy_score(y_true, y_pred)
print("Accuracy:", acc, "\n")

# (heading to print, sklearn `average` mode, extra args for the final print)
for heading, average, tail in (
    ("Macro", "macro", ()),
    ("Weighted: ", "weighted", ("\n",)),
):
    print(heading)
    prec = precision_score(y_true, y_pred, average=average)
    rec = recall_score(y_true, y_pred, average=average)
    f1 = f1_score(y_true, y_pred, average=average)

    print("Precision:", prec)
    print("Recall:", rec)
    print("F1 Score:", f1, *tail)


# Demo

In [None]:
import pandas as pd
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, TFBertForSequenceClassification
import numpy as np

In [None]:
# The demo reads the test split.
TEST_FILE = "./test_data/ED/test.csv"
test_df = pd.read_csv(filepath_or_buffer=TEST_FILE)

In [None]:
# Load the tokenizer from its saved directory.
tokenizer = BertTokenizer.from_pretrained("./bert_cnn_tokenizer_256_multi")

# Encode each claim together with its evidence, truncated/padded to 256 tokens.
claims = test_df["Claim"].tolist()
evidence = test_df["Evidence"].tolist()
test_encodings = tokenizer(
    claims,
    evidence,
    max_length=256,
    padding="max_length",
    truncation=True,
)

In [None]:
# Wrap the encodings in a tf.data.Dataset; no labels are attached here.
test_features = dict(test_encodings)
test_dataset = tf.data.Dataset.from_tensor_slices(test_features)

In [None]:
# Copy of the model class used in training, for the model to use during inference
class BertCNNModel(tf.keras.Model):
    """BERT encoder followed by a multi-kernel 1D-CNN classification head.

    The forward pass takes BERT's ``last_hidden_state``, applies one
    ``Conv1D`` per kernel size to it, global-max-pools each convolution
    output, concatenates the pooled features, and feeds them through dropout,
    a ReLU dense layer and a final 2-unit dense layer. The output is raw
    logits (no softmax is applied).

    Args:
        bert_model_name: Name or path passed to ``TFBertModel.from_pretrained``.
        num_filters: Number of filters in every ``Conv1D`` layer.
        kernel_sizes: Iterable of kernel widths; one ``Conv1D`` layer is
            created per entry.
        dropout_rate: Dropout rate applied to the concatenated CNN features.
        dense_units: Number of units in the hidden dense layer.
    """

    def __init__(self,
                 bert_model_name='bert-base-uncased',
                 num_filters=128,
                 kernel_sizes=(3, 4, 5),
                 dropout_rate=0.2,
                 dense_units=128):
        # The default is a tuple rather than a list so the default object
        # cannot be mutated and shared between instances.
        super().__init__()
        self.bert = TFBertModel.from_pretrained(bert_model_name)

        # One convolution branch per requested kernel width.
        self.convs = [
            tf.keras.layers.Conv1D(filters=num_filters,
                                   kernel_size=k,
                                   activation='relu',
                                   padding='valid')
            for k in kernel_sizes
        ]
        self.pool = tf.keras.layers.GlobalMaxPooling1D()
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        self.dense = tf.keras.layers.Dense(dense_units, activation='relu')
        self.classifier = tf.keras.layers.Dense(2)

    def call(self, inputs, training=False):
        """Return the 2-class logits for ``inputs``.

        Args:
            inputs: Whatever ``TFBertModel`` accepts; in this notebook it is
                the dict of tokenizer encodings.
            training: Forwarded to BERT and to the dropout layer.
        """
        bert_output = self.bert(inputs, training=training).last_hidden_state

        # Convolve and global-max-pool once per kernel size.
        conv_outputs = [self.pool(conv(bert_output)) for conv in self.convs]

        concat = tf.concat(conv_outputs, axis=-1)
        dropout_output = self.dropout(concat, training=training)
        dense_output = self.dense(dropout_output)
        logits = self.classifier(dense_output)
        return logits

In [None]:
# Restore the trained model from its saved directory.
model = tf.keras.models.load_model(
    filepath="./bert_cnn_model_256_multi",
)

In [None]:
# Group the examples into batches of 16 before feeding them to the model.
test_dataset_batched = test_dataset.batch(batch_size=16)


In [None]:
# Run inference, then turn the logits into class predictions by taking the
# index of the largest logit in each row.
predictions_logits = model.predict(test_dataset_batched)
y_pred = np.argmax(a=predictions_logits, axis=1)

In [None]:
# Write the predicted classes to a CSV with a single "prediction" column and
# no index column.
pred_df = pd.DataFrame(data={"prediction": y_pred})
pred_df.to_csv(path_or_buf="predictions_test_data.csv", index=False)