### It turns out that Epochs = 3, Batch Size = 8, Learning Rate = 5e-5 perform well (without overfitting)

In [None]:
!pip install datasets transformers
!pip install emoji

from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:

import pandas as pd

from datasets import Dataset
from transformers import AutoTokenizer
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification
from transformers import DefaultDataCollator

In [None]:
# Pretrained checkpoint used for both the tokenizer and the classifier.
# The commented-out assignments are alternative checkpoints kept for reference.
##BERT_MODEL = "bert-base-uncased"
BERT_MODEL = "vinai/bertweet-base"
#BERT_MODEL = "vinai/bertweet-large"
# NOTE(review): the notebook title reports Epoch = 3 / Batch Size = 8 as the
# well-performing setting, but the values below are 25 and 32 — confirm which
# configuration is intended.
NUM_EPOCHS = 25  # passed as `epochs` to model.fit
BATCH_SIZE = 32  # batch size of the train / dev / combined / test tf datasets
DROPOUT = 0.1  # passed as hidden_dropout_prob when the model is loaded

## Load Data into Pandas DF

In [None]:

# Read the truncated train and dev splits; every missing cell becomes a single space.
data_train = pd.read_csv("/content/drive/My Drive/trunc_data_train.csv").fillna(" ")
data_dev = pd.read_csv("/content/drive/My Drive/trunc_data_dev.csv").fillna(" ")


In [None]:
# Wrap the training DataFrame in a `datasets.Dataset`; the bare name on the
# last line displays its summary (features and row count).
dataset_train = Dataset.from_pandas(data_train)
dataset_train

Dataset({
    features: ['Unnamed: 0', 'source', 'replies', 'label'],
    num_rows: 1895
})

In [None]:
# Wrap the dev DataFrame in a `datasets.Dataset`; the bare name on the
# last line displays its summary (features and row count).
dataset_dev = Dataset.from_pandas(data_dev)
dataset_dev

Dataset({
    features: ['Unnamed: 0', 'source', 'replies', 'label'],
    num_rows: 632
})

In [None]:
# Stack dev on top of train for the final "train on everything" run.
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the
# two source indexes overlap, and Dataset.from_pandas keeps such a non-default
# index as an extra `__index_level_0__` column in the resulting Dataset.
total_train = pd.concat([data_dev, data_train], axis=0, ignore_index=True)
dataset_total_train = Dataset.from_pandas(total_train)

## Transformer Dataset to TensorFlow TF Dataset

Making use of Transformer Dataset

In [None]:
## Set up Tokenizer
# Load the tokenizer that belongs to the checkpoint named by BERT_MODEL.
# NOTE(review): normalization=True presumably switches on BERTweet's built-in
# tweet normalization — confirm against the tokenizer's documentation.


tokenizer = AutoTokenizer.from_pretrained(BERT_MODEL, normalization=True)

## Tokenize source and reply as a sentence pair (first segment = source, second = replies)
def tokenize_function(dataset):
    """Tokenize a batch of rows as (source, replies) sentence pairs.

    Args:
        dataset: mapping with "source" and "replies" entries, as passed by
            `Dataset.map(..., batched=True)`.

    Returns:
        Whatever the module-level `tokenizer` returns for the pair, padded with
        padding="max_length" and truncated with truncation=True.
    """
    first_segment = dataset["source"]
    second_segment = dataset["replies"]
    return tokenizer(
        first_segment,
        second_segment,
        padding="max_length",
        truncation=True,
    )

# Tokenize the train, dev, and combined (dev + train) splits, in that order.
tokenized_train_datasets, tokenized_dev_datasets, tokenized_total_datasets = [
    split.map(tokenize_function, batched=True)
    for split in (dataset_train, dataset_dev, dataset_total_train)
]


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


  0%|          | 0/2 [00:00<?, ?ba/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

  0%|          | 0/1 [00:00<?, ?ba/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

  0%|          | 0/3 [00:00<?, ?ba/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [None]:
## Use data_collator to batch the dataset
# Configured with return_tensors="tf"; passed as `collate_fn` to every
# to_tf_dataset call below.
data_collator = DefaultDataCollator(return_tensors="tf")

In [None]:
def _as_tf_dataset(tokenized_split, shuffle):
    """Convert one tokenized split into a batched, labelled tf dataset.

    Uses the shared `data_collator` and `BATCH_SIZE`; only the split and the
    shuffle flag differ between the three datasets built below.
    """
    return tokenized_split.to_tf_dataset(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        label_cols=["label"],
        shuffle=shuffle,
        collate_fn=data_collator,
        batch_size=BATCH_SIZE,
    )


# Training data is shuffled; the dev data used for validation keeps its order.
tf_train_dataset = _as_tf_dataset(tokenized_train_datasets, shuffle=True)
tf_validation_dataset = _as_tf_dataset(tokenized_dev_datasets, shuffle=False)
tf_totaltrain_dataset = _as_tf_dataset(tokenized_total_datasets, shuffle=True)

## Tensor Flow Bert Model

Use Training Set to train and test against dev set

In [None]:
import tensorflow as tf

# Report how many GPU devices TensorFlow can see before training starts.
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

Num GPUs Available:  1


In [None]:
# Keras-backend precision / recall / F-beta metrics; source of the approach:
# https://stackoverflow.com/questions/52041931/is-there-an-optimizer-in-keras-based-on-precision-or-recall-instead-of-loss
from keras import backend as K
# The metric functions below use `0.5 - THRESHOLD` as their default
# `threshold_shift`, so 0.5 here means "no shift".
THRESHOLD = 0.5
def precision(y_true, y_pred, threshold_shift=0.5-THRESHOLD):
    """Precision of thresholded predictions: tp / (tp + fp).

    Args:
        y_true: ground-truth tensor, multiplied element-wise with the
            binarized predictions.
        y_pred: prediction tensor; clipped to [0, 1] before thresholding.
        threshold_shift: added to the clipped scores before rounding.

    Returns:
        Scalar tensor. The true-positive count has K.epsilon() added to it.

    NOTE(review): the clip to [0, 1] suggests y_pred is expected to hold
    probabilities rather than raw logits — confirm against the model output.
    """
    # Guard against scores outside [0, 1] before thresholding.
    clipped_scores = K.clip(y_pred, 0, 1)

    # Rounding the shifted scores gives the binarized predictions.
    predicted = K.round(clipped_scores + threshold_shift)

    true_pos = K.sum(K.round(y_true * predicted)) + K.epsilon()
    false_pos = K.sum(K.round(K.clip(predicted - y_true, 0, 1)))

    return true_pos / (true_pos + false_pos)


def recall(y_true, y_pred, threshold_shift=0.5-THRESHOLD):
    """Recall of thresholded predictions: tp / (tp + fn).

    Args:
        y_true: ground-truth tensor, multiplied element-wise with the
            binarized predictions.
        y_pred: prediction tensor; clipped to [0, 1] before thresholding.
        threshold_shift: added to the clipped scores before rounding.

    Returns:
        Scalar tensor. The true-positive count has K.epsilon() added to it.

    NOTE(review): the clip to [0, 1] suggests y_pred is expected to hold
    probabilities rather than raw logits — confirm against the model output.
    """
    # Guard against scores outside [0, 1] before thresholding.
    clipped_scores = K.clip(y_pred, 0, 1)

    # Rounding the shifted scores gives the binarized predictions.
    predicted = K.round(clipped_scores + threshold_shift)

    true_pos = K.sum(K.round(y_true * predicted)) + K.epsilon()
    false_neg = K.sum(K.round(K.clip(y_true - predicted, 0, 1)))

    return true_pos / (true_pos + false_neg)


def fbeta(y_true, y_pred, beta = 2, threshold_shift=0.5-THRESHOLD):
    """F-beta score of thresholded predictions.

    Args:
        y_true: ground-truth tensor, multiplied element-wise with the
            binarized predictions.
        y_pred: prediction tensor; clipped to [0, 1] before thresholding.
        beta: weight of recall relative to precision; the default of 2
            yields the F2 score.
        threshold_shift: added to the clipped scores before rounding.

    Returns:
        Scalar tensor
        (1 + beta^2) * precision * recall / (beta^2 * precision + recall).

    NOTE(review): the clip to [0, 1] suggests y_pred is expected to hold
    probabilities rather than raw logits — confirm against the model output.
    """
    # Guard against scores outside [0, 1] before thresholding.
    y_pred = K.clip(y_pred, 0, 1)

    # Shift the decision threshold away from 0.5 if requested, then binarize.
    y_pred_bin = K.round(y_pred + threshold_shift)

    tp = K.sum(K.round(y_true * y_pred_bin)) + K.epsilon()
    fp = K.sum(K.round(K.clip(y_pred_bin - y_true, 0, 1)))
    # Count misses from the thresholded predictions, exactly as `recall` does.
    # Using the raw clipped scores here would ignore `threshold_shift` and make
    # the recall term disagree with the standalone recall metric.
    fn = K.sum(K.round(K.clip(y_true - y_pred_bin, 0, 1)))

    precision = tp / (tp + fp)
    recall = tp / (tp + fn)

    beta_squared = beta ** 2
    return (beta_squared + 1) * (precision * recall) / (beta_squared * precision + recall)


In [None]:


## Define Model
# Load the checkpoint named by BERT_MODEL with a 2-label classification head;
# DROPOUT is forwarded as hidden_dropout_prob.
model = TFAutoModelForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=2,
        hidden_dropout_prob=DROPOUT)

# Alternative kept for reference: same model without overriding hidden_dropout_prob.
#model = TFAutoModelForSequenceClassification.from_pretrained(BERT_MODEL, num_labels=2)


## Set up optimisation method and the loss to minimise
# Earlier variant kept for reference: same optimizer and loss, accuracy metric only.
# model.compile(
#     optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
#     loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
#     metrics=tf.metrics.SparseCategoricalAccuracy(),
# )

# Track accuracy plus the custom fbeta, precision and recall metrics.
# fbeta is passed without arguments, so it uses its default beta of 2 (F2, not F1).
# NOTE(review): the loss is configured with from_logits=True and the head has two
# labels, so the custom metrics presumably receive raw two-column logits rather
# than the [0, 1] scores they clip to — confirm that their values are meaningful.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
    metrics=[tf.metrics.SparseCategoricalAccuracy(), fbeta,precision,recall]
)


All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fit model using Training ONLY
#model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=NUM_EPOCHS)


## Train using Train + Dev Set

In [None]:
# Fit model on the combined dev + train data for NUM_EPOCHS epochs.
# NOTE(review): tf_totaltrain_dataset is built from dev + train combined, so the
# dev rows passed as validation_data are also trained on; the validation metrics
# reported here presumably cannot reveal overfitting — confirm this is intended.
model.fit(tf_totaltrain_dataset, validation_data=tf_validation_dataset, epochs=NUM_EPOCHS)


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x7f04219df6d0>

## Test on Test Set

In [None]:
# Load the test split, replace missing cells with a single space, and wrap the
# frame as a Dataset.
test_df = pd.read_csv("/content/drive/My Drive/dataset_test.csv").fillna(" ")
test_dataset = Dataset.from_pandas(test_df)

In [None]:
# Convert to Keras input for Bert model
# Same tokenization and collator as the training data; no label_cols are
# requested and shuffle=False keeps the rows in file order.
tokenized_test_datasets = test_dataset.map(tokenize_function, batched=True)
tf_test_dataset = tokenized_test_datasets.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=BATCH_SIZE,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [None]:
# Run the fitted model over the test dataset; the result is converted to labels
# by model_output_to_label in a later cell.
y_pred = model.predict(tf_test_dataset)

In [None]:
# generate the csv for prediction
def generate_csv(pred, csv_name):
    """Write predictions to a CSV file with columns ``Id`` and ``Predicted``.

    Args:
        pred: one prediction per row — a list, array, or pandas Series
            (or single-column DataFrame).
        csv_name: destination path of the CSV file.

    Ids are assigned by position (0..n-1). Pandas inputs are reduced to their
    raw values first; otherwise a Series whose own index is not 0..n-1 would be
    re-aligned against the new Id index and silently written out as NaN.
    """
    values = pred.to_numpy() if isinstance(pred, (pd.Series, pd.DataFrame)) else pred
    ids = pd.Index(range(len(values)), name='Id')
    predictions = pd.DataFrame(values, index=ids)
    predictions.columns = ['Predicted']
    predictions.to_csv(csv_name)

def model_output_to_label(model_output):
    """Convert the output of a two-class TensorFlow model to 0/1 labels.

    Args:
        model_output: object whose ``to_tuple()[0]`` holds one row per example
            and two columns of scores (column "0" for class 0, "1" for class 1).

    Returns:
        pandas Series named "label": 0 where the class-0 score is strictly
        greater than the class-1 score, otherwise 1.
    """
    logit_df = pd.DataFrame(model_output.to_tuple()[0], columns=["0", "1"])
    # Vectorized replacement for a row-wise apply. Negating the strict ">" keeps
    # the original tie-breaking: equal (or non-comparable) scores map to 1.
    logit_df["label"] = (~(logit_df["0"] > logit_df["1"])).astype("int64")
    return logit_df["label"]

In [None]:
# Turn the test-set model output into 0/1 labels and write them to Drive.
labels = model_output_to_label(y_pred)
generate_csv(labels, "/content/drive/My Drive/vanillabertmodel.csv")

In [None]:
# Display how many test rows were assigned to each predicted label.
labels.value_counts()

0    437
1    121
Name: label, dtype: int64

In [None]:
# Save model
#tf.keras.models.save_model(model, "saved_model.hp5", save_format="h5")

# Load model

#custom_metric = pickle.load(open("/content/drive/My Drive/bert_metric.pickle", 'rb'))
#loaded_model = tf.keras.models.load_model("saved_model.hp5", custom_objects=custom_metric)

# Predict Covid Tweets Label



In [None]:
# Load the covid tweet data, replace missing cells with a single space, and wrap
# the frame as a Dataset.
covid_df = pd.read_csv("/content/drive/My Drive/covid_bert_data.csv").fillna(" ")
covid_dataset = Dataset.from_pandas(covid_df)

In [None]:
## Use data_collator to batch the dataset
# Same tokenization and collator as the other splits; no label_cols are
# requested and shuffle=False keeps the rows in file order.
# NOTE(review): batch_size is the literal 20 here rather than BATCH_SIZE —
# confirm the difference is intentional.

tokenized_covid_dataset = covid_dataset.map(tokenize_function, batched=True)
tf_covid_dataset = tokenized_covid_dataset.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=20,
)

  0%|          | 0/18 [00:00<?, ?ba/s]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens ar

In [None]:
# Run the fitted model over the covid dataset; the result is converted to labels
# by model_output_to_label in a later cell.
y_pred_covid = model.predict(tf_covid_dataset)


In [None]:

# Turn the covid model output into 0/1 labels and write them to Drive.
labels2 = model_output_to_label(y_pred_covid)
generate_csv(labels2, "/content/drive/My Drive/covid_labels.csv")

In [None]:
# Display how many covid rows were assigned to each predicted label.
labels2.value_counts()

0    13049
1     4409
Name: label, dtype: int64