You can view an already-executed version of this notebook on Kaggle [here](https://www.kaggle.com/code/hazrulakmal/transfer-learning-data-augmentation-distillation). A GPU is strongly recommended: the code falls back to the CPU when none is available, but training will be much slower.

In [1]:
!apt-get install git-lfs
!git lfs install

In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from datasets import Dataset as Dataset_dict 
from datasets import load_metric, DatasetDict
from transformers import pipeline, TrainingArguments, Trainer, AutoConfig, AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoModelForSequenceClassification

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, accuracy_score

import torch
import torch.nn as nn
import torch.nn.functional as F

import keras
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.data import Dataset
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# Disable Weights & Biases reporting and tokenizers parallelism via environment variables
os.environ.update({"WANDB_DISABLED": "true", "TOKENIZERS_PARALLELISM": "false"})
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [3]:
#from huggingface_hub import notebook_login #uncomment this to login to your huggingface account
#notebook_login() 

In [4]:
def split_data(dataframe, ylabel, test_size=0.2, random_state=42):
    """Split a DataFrame into shuffled, stratified train and test partitions.

    Args:
        dataframe: pandas DataFrame to split.
        ylabel: Name of the column passed to ``stratify`` so that both
            partitions keep its class proportions.
        test_size: Size of the test partition, forwarded to
            ``train_test_split`` (default 0.2).
        random_state: Seed forwarded to ``train_test_split``. The default
            of 42 reproduces the splits used by every caller in this notebook.

    Returns:
        Tuple ``(training_df, test_df)``.
    """
    training_df, test_df = train_test_split(
        dataframe,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
        stratify=dataframe[ylabel],
    )
    return training_df, test_df

## Data Load

In [5]:
# Load the original phrasebank and the augmented sentences
phrasebank = pd.read_csv("/kaggle/input/financial-phrasebank/phrasebank.csv")
augmented = pd.read_csv("/kaggle/input/augmented-phrasebank-dataset/augmentation.csv")
augmented = augmented.drop(columns=["Unnamed: 0"])  # remove unnecessary column
augmented = augmented.dropna()
# NOTE(review): only `phrasebank["labels"]` is encoded below, yet `augmented` is
# later concatenated with it — presumably its labels are already integer ids; confirm.

# Map sentiment labels to their numerical representation
le = LabelEncoder()
le.fit(phrasebank["labels"])
phrasebank["labels"] = le.transform(phrasebank["labels"])

# LabelEncoder encodes each class as its position in `le.classes_`, so
# enumerating that array yields the same ids as calling `le.transform` per
# class, without converting a 1-element array to int (deprecated in NumPy >= 1.25).
label2id = {each_class: class_id for class_id, each_class in enumerate(le.classes_)}
id2label = {class_id: each_class for each_class, class_id in label2id.items()}

label2id

In [6]:
# Shared run configuration: use the GPU when one is available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
num_labels = 3   # number of classes passed to every sequence-classification model
batch_size = 64  # per-device batch size used for both training and evaluation

# Transfer Learning Finetuning
## BERT

In [7]:

# Metrics function
def compute_metrics(pred):
    """Return accuracy and weighted F1 for a prediction object.

    Args:
        pred: Object exposing ``label_ids`` (reference labels) and
            ``predictions`` (scores whose argmax over the last axis is
            taken as the predicted class).

    Returns:
        Dict with keys ``"accuracy"`` and ``"f1"``.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(-1)
    return {
        "accuracy": accuracy_score(y_true, y_pred),
        "f1": f1_score(y_true, y_pred, average="weighted"),
    }

In [8]:
# Hold out 20% as the test set, then 20% of the remainder as the validation set
train, test = split_data(phrasebank, "labels")
train, val = split_data(train, "labels", test_size=0.2)

# Convert each partition to a Hugging Face Dataset without carrying over the pandas index
dataset = {
    split_name: Dataset_dict.from_pandas(frame, preserve_index=False)
    for split_name, frame in (("train", train), ("validation", val), ("test", test))
}

distil_data_small = DatasetDict(dataset)  # ground-truth dataset
distil_data_small

In [11]:
# Checkpoint for the BERT benchmark
model_ckpt = "bert-base-uncased"

# Tokenizer belonging to that checkpoint
benchmark_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize_text(batch):
    """Tokenize the ``titles`` field of a batch with truncation enabled."""
    texts = batch["titles"]
    return benchmark_tokenizer(texts, truncation=True)


# Encode every split in batches and drop the raw text column
distil_data_small_enc = distil_data_small.map(
    tokenize_text,
    batched=True,
    remove_columns=["titles"],
)

In [12]:
# Load the pre-trained checkpoint for sequence classification and move it to `device`
benchmark_model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
).to(device)

In [13]:
model_name = "benchmark-finetuned-bert"  # output directory passed to TrainingArguments

# Log every (train rows // batch size) steps. The floor division is 0 when the
# train split is smaller than one batch, and TrainingArguments rejects
# logging_steps=0 under step-based logging, so clamp to at least 1.
logging_steps = max(1, len(distil_data_small_enc["train"]) // batch_size)

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=4,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
)

# Fine-tune on the train split; the validation split is evaluated with compute_metrics
trainer = Trainer(
    model=benchmark_model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=distil_data_small_enc["train"],
    eval_dataset=distil_data_small_enc["validation"],
    tokenizer=benchmark_tokenizer,
)

trainer.train()

In [20]:
# Run the fine-tuned BERT benchmark on the held-out test split and display its metrics
preds_output = trainer.predict(distil_data_small_enc["test"])
preds_output.metrics

In [None]:
#trainer.push_to_hub(commit_message="Training completed!")

## DistilBERT

In [9]:
# Checkpoint and output directory for the DistilBERT benchmark
model_ckpt = "distilbert-base-uncased"
model_name = "benchmark-finetuned-distilbert"

# Tokenizer belonging to that checkpoint
benchmark_distil_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize_text(batch):
    """Tokenize the ``titles`` field of a batch with truncation enabled."""
    texts = batch["titles"]
    return benchmark_distil_tokenizer(texts, truncation=True)


# Re-encode the ground-truth splits with this tokenizer and drop the raw text column
distil_data_small_enc = distil_data_small.map(
    tokenize_text,
    batched=True,
    remove_columns=["titles"],
)

# Load the pre-trained checkpoint for sequence classification and move it to `device`
benchmark_distil_model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
).to(device)

In [10]:
# Log every (train rows // batch size) steps. The floor division is 0 when the
# train split is smaller than one batch, and TrainingArguments rejects
# logging_steps=0 under step-based logging, so clamp to at least 1.
logging_steps = max(1, len(distil_data_small_enc["train"]) // batch_size)

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
)

# Fine-tune on the train split; the validation split is evaluated with compute_metrics
trainer = Trainer(
    model=benchmark_distil_model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=distil_data_small_enc["train"],
    eval_dataset=distil_data_small_enc["validation"],
    tokenizer=benchmark_distil_tokenizer,
)

trainer.train()

In [11]:
# Run the fine-tuned DistilBERT benchmark on the held-out test split and display its metrics
preds_output = trainer.predict(distil_data_small_enc["test"])
preds_output.metrics

In [12]:
#trainer.push_to_hub(commit_message="Training completed!")

# Data Augmentation
## Finetuning BERT Model (teacher) with Augmented Dataset 
`The training set is split in half: one half is used for training and the other for validation.`

In [7]:
# Hold out 20% of the phrasebank, then keep only half of the remaining rows for training
train_df, test = split_data(phrasebank, "labels")
aug_train_df, aug_val = split_data(train_df, "labels", test_size=0.5)
# NOTE(review): `aug_val` is not used by any visible cell — the "validation"
# split below is built from the held-out `test` frame, which differs from the
# markdown description above this cell. Confirm which is intended.

# Training data = the retained half of the original rows followed by the augmented rows
aug_train = pd.concat([aug_train_df, augmented], axis=0)

# No separate "test" entry is created; only "train" and "validation" exist in this DatasetDict
dataset = {
    split_name: Dataset_dict.from_pandas(frame, preserve_index=False)
    for split_name, frame in (("train", aug_train), ("validation", test))
}

distil_data = DatasetDict(dataset)
distil_data

In [None]:
# Checkpoint and output directory for the teacher model
model_ckpt = "bert-base-uncased"
model_name = model_ckpt + "-finetuned"

# Tokenizer belonging to that checkpoint
teacher_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize_text(batch):
    """Tokenize the ``titles`` field of a batch with truncation enabled."""
    texts = batch["titles"]
    return teacher_tokenizer(texts, truncation=True)


# Encode the augmented dataset in batches and drop the raw text column
distil_data_enc = distil_data.map(
    tokenize_text,
    batched=True,
    remove_columns=["titles"],
)

# Load the pre-trained checkpoint for sequence classification and move it to `device`
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
).to(device)

In [None]:
# Log every (train rows // batch size) steps. The floor division is 0 when the
# train split is smaller than one batch, and TrainingArguments rejects
# logging_steps=0 under step-based logging, so clamp to at least 1.
logging_steps = max(1, len(distil_data_enc["train"]) // batch_size)

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
)

# Fine-tune on the augmented train split; "validation" is evaluated with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=distil_data_enc["train"],
    eval_dataset=distil_data_enc["validation"],
    tokenizer=teacher_tokenizer,
)

trainer.train()

In [None]:
# Score the fine-tuned model; in `distil_data` the "validation" split is built
# from the held-out `test` frame, so these are metrics on that held-out data
preds_output = trainer.predict(distil_data_enc["validation"])
preds_output.metrics

In [None]:
#trainer.push_to_hub(commit_message="Training completed!")

## Fine-tuning DistilBERT on Augmented Data

In [None]:
# Checkpoint to fine-tune on the augmented data (alternative: "ProsusAI/finbert")
model_ckpt = "distilbert-base-uncased"

# Tokenizer belonging to that checkpoint
student_tokenizer = AutoTokenizer.from_pretrained(model_ckpt)


def tokenize_text(batch):
    """Tokenize the ``titles`` field of a batch with truncation enabled."""
    texts = batch["titles"]
    return student_tokenizer(texts, truncation=True)


# Encode the augmented dataset in batches and drop the raw text column
distil_data_enc = distil_data.map(
    tokenize_text,
    batched=True,
    remove_columns=["titles"],
)

# Load the pre-trained checkpoint for sequence classification and move it to `device`
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
).to(device)

In [None]:
batch_size = 64

# Log every (train rows // batch size) steps. The floor division is 0 when the
# train split is smaller than one batch, and TrainingArguments rejects
# logging_steps=0 under step-based logging, so clamp to at least 1.
logging_steps = max(1, len(distil_data_enc["train"]) // batch_size)
model_name = "augmented-distilbert-finetuned"  # output directory passed to TrainingArguments

training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    push_to_hub=False,
    log_level="error",
)

# Fine-tune on the augmented train split; "validation" is evaluated with compute_metrics
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=distil_data_enc["train"],
    eval_dataset=distil_data_enc["validation"],
    tokenizer=student_tokenizer,
)

trainer.train()

In [None]:
# Score the fine-tuned model; in `distil_data` the "validation" split is built
# from the held-out `test` frame, so these are metrics on that held-out data
preds_output = trainer.predict(distil_data_enc["validation"])
preds_output.metrics

In [None]:
#trainer.push_to_hub(commit_message="Training completed!")

## Knowledge Distillation

In [6]:
# Accuracy metric object used by the `compute_metrics` function defined at the end of this cell.
# NOTE(review): `load_metric` is deprecated in recent `datasets` releases — confirm the
# installed version still provides it.
accuracy_score2 = load_metric("accuracy")

class DistillationTrainingArguments(TrainingArguments):
    """TrainingArguments extended with two knowledge-distillation hyperparameters.

    Args:
        *args: Positional arguments forwarded unchanged to ``TrainingArguments``.
        alpha: Weight of the student's own cross-entropy loss in
            ``DistillationTrainer.compute_loss``; the distillation (KL) term
            is weighted by ``1 - alpha``. Defaults to 0.5.
        temperature: Value the student and teacher logits are divided by
            before the (log-)softmax in the distillation loss. Defaults to 2.0.
        **kwargs: Keyword arguments forwarded unchanged to ``TrainingArguments``.
    """
    def __init__(self, *args, alpha=0.5, temperature=2.0, **kwargs):
        # Let the base class consume every standard argument first
        super().__init__(*args, **kwargs)
        # Stored on the args object so the trainer can read them via `self.args`
        self.alpha = alpha
        self.temperature = temperature
    

class DistillationTrainer(Trainer):
    """Trainer whose loss blends the student's cross-entropy with a distillation term.

    The distillation term is the KL divergence between temperature-softened
    student and teacher distributions. ``self.args`` must expose ``alpha`` and
    ``temperature`` (see ``DistillationTrainingArguments``).

    Args:
        *args: Positional arguments forwarded unchanged to ``Trainer``.
        teacher_model: Model called with the same inputs as the student to
            produce the target logits. It is never updated by this trainer.
        **kwargs: Keyword arguments forwarded unchanged to ``Trainer``.
    """

    def __init__(self, *args, teacher_model=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.teacher_model = teacher_model
        if self.teacher_model is not None:
            # The teacher only supplies targets, so make inference mode explicit
            # (e.g. dropout disabled) regardless of how the caller left it.
            self.teacher_model.eval()

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the combined cross-entropy / distillation loss for one batch.

        Args:
            model: The student model being trained.
            inputs: Dict of model inputs; passed as keyword arguments to both
                the student and the teacher.
            return_outputs: When true, return ``(loss, student_outputs)``
                instead of the loss alone.
            num_items_in_batch: Accepted only for compatibility with
                transformers versions whose training loop passes this
                argument to ``compute_loss``; it is not used here.

        Returns:
            The weighted loss, or ``(loss, outputs_stu)`` if ``return_outputs``.
        """
        outputs_stu = model(**inputs)
        # Cross-entropy loss and logits come straight from the student's output
        loss_ce = outputs_stu.loss
        logits_stu = outputs_stu.logits
        # The teacher is used for targets only, so no gradients are tracked
        with torch.no_grad():
            outputs_tea = self.teacher_model(**inputs)
            logits_tea = outputs_tea.logits
        # Soften both distributions with the temperature. KLDivLoss expects
        # log-probabilities as input and probabilities as target.
        temperature = self.args.temperature
        loss_fct = nn.KLDivLoss(reduction="batchmean")
        loss_kd = temperature ** 2 * loss_fct(
            F.log_softmax(logits_stu / temperature, dim=-1),
            F.softmax(logits_tea / temperature, dim=-1))
        # Weighted sum of the student's own loss and the distillation loss
        loss = self.args.alpha * loss_ce + (1. - self.args.alpha) * loss_kd
        return (loss, outputs_stu) if return_outputs else loss

def compute_metrics(pred):
    """Return the accuracy of the argmax predictions.

    Args:
        pred: Pair ``(predictions, labels)``; the predicted class is the
            argmax of ``predictions`` along axis 1.

    Returns:
        The result of ``accuracy_score2.compute`` for those predictions.
    """
    scores, references = pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy_score2.compute(predictions=predicted_ids, references=references)

In [8]:
# Student checkpoint and the output directory for the distilled model
student_ckpt = "distilbert-base-uncased"
finetuned_ckpt = "distilbert-optimised-finetuned-financial-sentiment"

# Tokenizer belonging to the student checkpoint
student_tokenizer = AutoTokenizer.from_pretrained(student_ckpt)


def tokenize_text(batch):
    """Tokenize the ``titles`` field of a batch with truncation enabled."""
    texts = batch["titles"]
    return student_tokenizer(texts, truncation=True)


# Encode the augmented dataset in batches and drop the raw text column
distil_data_enc = distil_data.map(
    tokenize_text,
    batched=True,
    remove_columns=["titles"],
)

In [18]:
# Distillation training arguments; `temperature` is left at its default
student_training_args = DistillationTrainingArguments(
    output_dir=finetuned_ckpt,
    evaluation_strategy="epoch",
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    push_to_hub=False,
    alpha=0.5,
)

# Student configuration carrying the label count and the label <-> id mappings
student_config = AutoConfig.from_pretrained(
    student_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)

In [19]:
def student_init():
    """Build a student model from `student_ckpt` with `student_config` and move it to `device`."""
    student = AutoModelForSequenceClassification.from_pretrained(student_ckpt, config=student_config)
    return student.to(device)


# Teacher: the fine-tuned BERT checkpoint, loaded from the Hub identifier below
teacher_ckpt = "hazrulakmal/bert-base-uncased-finetuned"
teacher_model = AutoModelForSequenceClassification.from_pretrained(
    teacher_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
).to(device)

# The trainer creates the student through `student_init` and uses the teacher for the distillation loss
distilbert_trainer = DistillationTrainer(
    model_init=student_init,
    teacher_model=teacher_model,
    args=student_training_args,
    train_dataset=distil_data_enc['train'],
    eval_dataset=distil_data_enc['validation'],
    compute_metrics=compute_metrics,
    tokenizer=student_tokenizer,
)

distilbert_trainer.train()

In [17]:
# Score the distilled student; in `distil_data` the "validation" split is built
# from the held-out `test` frame, so these are metrics on that held-out data
preds_output = distilbert_trainer.predict(distil_data_enc["validation"])
preds_output.metrics

In [20]:
#distilbert_trainer.push_to_hub("Training completed!")