You can view an already-executed copy of this notebook on Kaggle [here](https://www.kaggle.com/code/hazrulakmal/performance-evaluation). Otherwise, a GPU is required to run this notebook.

In [1]:
import re
import os
import random
import numpy as np
import pandas as pd
import tensorflow as tf
import torch

import matplotlib.pyplot as plt
from datasets import Dataset as Dataset_dict 
from datasets import DatasetDict, load_metric
from tqdm.auto import tqdm
from time import perf_counter
from pathlib import Path

from tensorflow.keras.utils import to_categorical
from tensorflow.data import Dataset
from tensorflow.keras.preprocessing.text import Tokenizer
from transformers import pipeline, AutoTokenizer,AutoModelForSequenceClassification

from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder

from wordcloud import WordCloud, STOPWORDS
import re,string, nltk
from nltk.stem.wordnet import WordNetLemmatizer
from nltk import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.stem.snowball import SnowballStemmer

# Fetch the 'omw-1.4' NLTK resource.
nltk.download("omw-1.4")

# Print the full path of every file found under the Kaggle input directory.
for root, _dirs, files in os.walk("/kaggle/input"):
    for fname in files:
        print(os.path.join(root, fname))

# Environment flags set before any model is loaded (by their names they
# presumably disable tokenizer parallelism and Weights & Biases logging).
os.environ.update({"TOKENIZERS_PARALLELISM": "false", "WANDB_DISABLED": "true"})

# Data Load

In [2]:
def split_data(dataframe, ylabel, test_size=0.2, random_state=42):
    """Split ``dataframe`` into shuffled, stratified training and test parts.

    Args:
        dataframe: Table to split; must contain the column ``ylabel``.
        ylabel: Name of the column used for stratification, so both parts
            keep the same class proportions.
        test_size: Passed to ``train_test_split``; share of rows that go to
            the test part.
        random_state: Seed for the shuffle. The default of 42 reproduces the
            split this notebook has always used.

    Returns:
        A ``(training_df, test_df)`` tuple.
    """
    training_df, test_df = train_test_split(
        dataframe,
        test_size=test_size,
        random_state=random_state,
        shuffle=True,
        stratify=dataframe[ylabel],
    )
    return training_df, test_df

In [3]:
financial_news = pd.read_csv("/kaggle/input/financial-phrasebank/phrasebank.csv")

# Replace the string labels with integer class ids.
le = LabelEncoder()
financial_news["labels"] = le.fit_transform(financial_news["labels"])

# LabelEncoder encodes each class as its position in ``le.classes_``, so both
# lookup tables can be read straight off that array. This avoids calling
# ``int()`` on a one-element array (deprecated since NumPy 1.25) and
# re-running the encoder once per class.
label2id = {each_class: idx for idx, each_class in enumerate(le.classes_)}
id2label = {idx: each_class for each_class, idx in label2id.items()}

id2label

In [4]:
# First split off the test set, then split the remaining rows again into
# training and validation sets (20% each time).
train, test = split_data(financial_news, "labels")
train, val = split_data(train, "labels", test_size=0.2)

# Wrap each pandas split as a `datasets` Dataset, dropping the pandas index.
splits = (("train", train), ("validation", val), ("test", test))
dataset = {
    split_name: Dataset_dict.from_pandas(frame, preserve_index=False)
    for split_name, frame in splits
}

ground_truth = DatasetDict(dataset)

In [5]:
# Empty results table: one row per model variant, filled in by the cells below.
evaluation_df = pd.DataFrame(
    index=[
        "A1: XGBoost+Tfidf",
        "A2: Fasttext",
        "B1: BERT",
        "B2: DistilBERT",
        "C1: BERT",
        "C2: DistilBERT",
        "D: DistilBERT",
    ],
    columns=["Accuracy", "F1"],
)
evaluation_df

## TFIDF Embedding Model (Baseline)

## Data Cleaning

In [None]:
def clean_text(df, field):
    """Normalise the text column ``field`` of ``df`` in place and return ``df``.

    Applied in order:
      1. every hashtag (``#`` followed by letters, digits or underscores) is
         replaced by a single space;
      2. every character other than ``A-Z a-z ( ) , ! ? @ ' " _`` and newline
         is replaced by a single space;
      3. the text is lower-cased.

    Args:
        df: DataFrame that holds the column; it is modified in place.
        field: Name of the string column to clean.

    Returns:
        The same ``df`` object that was passed in.
    """
    # regex=True is essential: since pandas 2.0 ``Series.str.replace`` treats
    # the pattern as a literal string by default, so without it neither
    # substitution ever matches.
    cleaned = df[field].str.replace(r"#[A-Za-z0-9_]+", " ", regex=True)
    cleaned = cleaned.str.replace(r"[^A-Za-z(),!?@\'\"_\n]", " ", regex=True)
    df[field] = cleaned.str.lower()
    return df
    
# Module-level helpers used by preprocess_text().
stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

# Extend the imported STOPWORDS collection with additional short tokens.
STOPWORDS.update(
    {
        'rt', 'mkr', 'didn', 'bc', 'n', 'm', 'im', 'll',
        'y', 've', 'u', 'ur', 'don', 'p', 't', 's',
        'aren', 'kp', 'o', 'kat', 'de', 're', 'amp', 'will',
    }
)

def preprocess_text(text):
    """Return ``text`` reduced to space-separated stemmed, lemmatized tokens.

    Every non-letter character becomes a space, each remaining token is
    stemmed with the module-level ``stemmer``, tokens found in ``STOPWORDS``
    are dropped, and the survivors are passed through ``lemmatizer``.

    Args:
        text: The string to normalise.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Replacing every non-letter with a space already removes all non-ASCII
    # characters, so no separate non-ASCII pass is needed afterwards.
    letters_only = re.sub(r"[^a-zA-Z]", " ", text)
    stems = [stemmer.stem(word) for word in letters_only.split()]
    # NOTE(review): stop words are compared against the *stemmed* token, as
    # before; a stop word whose stem differs from itself is therefore kept.
    # Membership is tested on STOPWORDS directly instead of rebuilding a set
    # for every token.
    return " ".join(
        lemmatizer.lemmatize(stem)
        for stem in stems
        if stem and stem not in STOPWORDS
    )

In [None]:
# Clean a copy: clean_text() modifies its argument in place, and the shared
# ``financial_news`` frame should keep its raw text for any cell that is
# re-run after this one.
ml_df = clean_text(financial_news.copy(), "titles")
ml_df["titles"] = ml_df["titles"].apply(preprocess_text)

# Stratify on the label with the same seed and test fraction as split_data(),
# so the baseline's held-out set keeps the class balance (and should contain
# the same rows) as the ``test`` split the other models are scored on.
X_train, X_test, y_train, y_test = train_test_split(
    np.array(ml_df["titles"]),
    np.array(ml_df["labels"]),
    test_size=0.2,
    random_state=42,
    stratify=ml_df["labels"],
)

# Fit the vocabulary and IDF weights on the training text only, then reuse
# them for the test text.
tfidf = TfidfVectorizer(use_idf=True, tokenizer=word_tokenize, min_df=0.00002, max_df=0.70)
X_train_tf = tfidf.fit_transform(X_train.astype('U'))
X_test_tf = tfidf.transform(X_test.astype('U'))

print(f"TF_IDF Model: Train features shape:{X_train_tf.shape} and Test features shape:{X_test_tf.shape}")

In [None]:
# One instance of every candidate classifier, keyed by display name. All
# estimators except MultinomialNB are given random_state=42.
clfs = {
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "AdaBoost": AdaBoostClassifier(random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "XGBoost": XGBClassifier(eval_metric="mlogloss", random_state=42),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Support Vector Machine": SVC(random_state=42),
    "Naive Bayes": MultinomialNB(),
    "Multilayer Perceptron": MLPClassifier(random_state=42),
}

# Short aliases for the individual estimators, in registry order.
rf, gb, ada, lgb, xgb, dt, svc, nb, mlp = clfs.values()

In [None]:
def fit_model(clf, x_train, y_train, x_test, y_test):
    """Fit ``clf`` on the training data and score it on the test data.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: True labels for ``x_test``.

    Returns:
        A ``(accuracy, f1_weighted)`` tuple computed on the test data.
    """
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    # Both metrics take (y_true, y_pred) in that order; accuracy is symmetric,
    # so this matches the F1 call without changing the reported number.
    accuracy = accuracy_score(y_test, y_pred)
    f1_weighted = f1_score(y_test, y_pred, average="weighted")
    return accuracy, f1_weighted

# Train and score every registered classifier, in registry order.
results = [
    fit_model(clf, X_train_tf, y_train, X_test_tf, y_test)
    for clf in tqdm(clfs.values())
]

# Unzip the (accuracy, weighted F1) pairs into two parallel lists.
accuracy_all = [acc for acc, _ in results]
f1_scores_weighted = [f1_w for _, f1_w in results]

In [None]:
# Tabulate the scores of the classical models, highest accuracy first.
models_df = pd.DataFrame(
    {
        "Models": clfs.keys(),
        "Accuracy Scores": accuracy_all,
        "F1 Scores": f1_scores_weighted,
    }
)
models_df = models_df.sort_values("Accuracy Scores", ascending=False)
models_df

In [None]:
# Select the XGBoost row by model name rather than by a numeric index label,
# which would silently point at a different model if ``clfs`` were reordered.
xgb_scores = models_df.loc[
    models_df["Models"] == "XGBoost", ["Accuracy Scores", "F1 Scores"]
].iloc[0]
evaluation_df.loc["A1: XGBoost+Tfidf"] = [round(metric, 4) for metric in xgb_scores]
evaluation_df

# A2: Fasttext

In [None]:
import fasttext

# Train a supervised fastText classifier with wordNgrams=3, autotuning
# against the separate validation file.
model = fasttext.train_supervised(
    input='/kaggle/input/fasttext-sentiment-classification/fasttext_input_sentiment_train.txt',
    wordNgrams=3,
    autotuneValidationFile='/kaggle/input/fasttext-sentiment-classification/fasttext_input_sentiment_val.txt',
)

In [None]:
fasttext_test = test.copy()


def predict(row):
    """Return the first label fastText predicts for ``row['titles']``."""
    top_labels = model.predict(row['titles'])[0]
    return top_labels[0]


# Predict: one fastText label string per row, then map it to its integer id.
fasttext_test["predictions"] = fasttext_test.apply(predict, axis=1)
pred_labels = {"__label__0": 0, "__label__1": 1, "__label__2": 2}
fasttext_test["predictions"] = fasttext_test["predictions"].map(pred_labels)

# Evaluate against the true labels of the test split.
y_true, y_pred = fasttext_test["labels"], fasttext_test["predictions"]
test_acc = accuracy_score(y_true, y_pred)
weighted_test_f1 = f1_score(y_true, y_pred, average="weighted")

evaluation_df.loc["A2: Fasttext"] = (round(test_acc, 4), round(weighted_test_f1, 4))
evaluation_df

# B: Transfer Learning Models
## BERT

In [None]:
# Load the text-classification pipeline for this checkpoint.
# Add device=0 to the pipeline() call to run on GPU.
bert_classifier = pipeline(
    task="text-classification",
    model="hazrulakmal/benchmark-finetuned-bert",
)

# Classify every test title and tabulate the returned label/score pairs.
preds = bert_classifier(test["titles"].to_list())
pred_df = pd.DataFrame(preds, columns=["label", "score"])
# Convert the predicted label strings to the integer ids produced by ``le``.
pred_df["label"] = le.transform(pred_df["label"])

y_true, y_pred = test["labels"], pred_df["label"]
test_acc = accuracy_score(y_true, y_pred)
test_f1_weighted = f1_score(y_true, y_pred, average="weighted")

evaluation_df.loc["B1: BERT"] = (round(test_acc, 4), round(test_f1_weighted, 4))
evaluation_df

## DistilBERT

In [None]:
# Load the text-classification pipeline for this checkpoint.
# Add device=0 to the pipeline() call to run on GPU.
ditlbert_classifier = pipeline(
    task="text-classification",
    model="hazrulakmal/benchmark-finetuned-distilbert",
)

# Classify every test title and tabulate the returned label/score pairs.
preds = ditlbert_classifier(test["titles"].to_list())
pred_df = pd.DataFrame(preds, columns=["label", "score"])
# Convert the predicted label strings to the integer ids produced by ``le``.
pred_df["label"] = le.transform(pred_df["label"])

y_true, y_pred = test["labels"], pred_df["label"]
test_acc = accuracy_score(y_true, y_pred)
test_f1_weighted = f1_score(y_true, y_pred, average="weighted")

evaluation_df.loc["B2: DistilBERT"] = (round(test_acc, 4), round(test_f1_weighted, 4))
evaluation_df

# C: Models trained on Augmented Data
## BERT

In [6]:
# Load the text-classification pipeline for this checkpoint.
# Add device=0 to the pipeline() call to run on GPU.
augbert_classifier = pipeline(
    task="text-classification",
    model="hazrulakmal/bert-base-uncased-finetuned",
)

# Classify every test title and tabulate the returned label/score pairs.
preds = augbert_classifier(test["titles"].to_list())
pred_df = pd.DataFrame(preds, columns=["label", "score"])
# Convert the predicted label strings to the integer ids produced by ``le``.
pred_df["label"] = le.transform(pred_df["label"])

y_true, y_pred = test["labels"], pred_df["label"]
test_acc = accuracy_score(y_true, y_pred)
test_f1_weighted = f1_score(y_true, y_pred, average="weighted")

evaluation_df.loc["C1: BERT"] = (round(test_acc, 4), round(test_f1_weighted, 4))
evaluation_df

## DistilBERT

In [7]:
# Load the text-classification pipeline for this checkpoint.
# Add device=0 to the pipeline() call to run on GPU.
augdistilbert_classifier = pipeline(
    task="text-classification",
    model="hazrulakmal/augmented-distilbert-finetuned",
)

# Classify every test title and tabulate the returned label/score pairs.
preds = augdistilbert_classifier(test["titles"].to_list())
pred_df = pd.DataFrame(preds, columns=["label", "score"])
# Convert the predicted label strings to the integer ids produced by ``le``.
pred_df["label"] = le.transform(pred_df["label"])

y_true, y_pred = test["labels"], pred_df["label"]
test_acc = accuracy_score(y_true, y_pred)
test_f1_weighted = f1_score(y_true, y_pred, average="weighted")

evaluation_df.loc["C2: DistilBERT"] = (round(test_acc, 4), round(test_f1_weighted, 4))
evaluation_df

# D: Distillation Model

In [8]:
# Load the text-classification pipeline for this checkpoint.
# Add device=0 to the pipeline() call to run on GPU.
distildistilbert_classifier = pipeline(
    task="text-classification",
    model="hazrulakmal/distilbert-optimised-finetuned-financial-sentiment",
)

# Classify every test title and tabulate the returned label/score pairs.
preds = distildistilbert_classifier(test["titles"].to_list())
pred_df = pd.DataFrame(preds, columns=["label", "score"])
# Convert the predicted label strings to the integer ids produced by ``le``.
pred_df["label"] = le.transform(pred_df["label"])

y_true, y_pred = test["labels"], pred_df["label"]
test_acc = accuracy_score(y_true, y_pred)
test_f1_weighted = f1_score(y_true, y_pred, average="weighted")

evaluation_df.loc["D: DistilBERT"] = (round(test_acc, 4), round(test_f1_weighted, 4))
evaluation_df

## Speed and Size Performance Evaluation

In [7]:
class PerformanceBenchmark:
    """Measure the size, latency and weighted F1 of a text-classification pipeline.

    The F1 metric is loaded lazily and cached on the class instead of being
    bound to a module-level ``f1_score`` name: that binding replaced
    ``sklearn.metrics.f1_score`` and broke every earlier evaluation cell on
    re-run.

    Args:
        pipeline: Callable taking a string and returning a sequence whose
            first item is a mapping with a ``"label"`` key. ``compute_size``
            additionally requires ``pipeline.model.state_dict()``.
        dataset: Iterable of examples, each indexable by ``"titles"`` (input
            text) and ``"labels"`` (true class id).
        label2id: Mapping from the label strings returned by ``pipeline`` to
            the class ids stored under ``"labels"``.
        optim_type: Key under which ``run_benchmark`` reports the metrics.
    """

    # Class-wide cache so the metric is loaded at most once per session.
    _f1_metric = None

    def __init__(self, pipeline, dataset, label2id, optim_type="BERT baseline"):
        self.pipeline = pipeline
        self.dataset = dataset
        self.label2id = label2id
        self.optim_type = optim_type

    @classmethod
    def _get_f1_metric(cls):
        """Return the cached F1 metric, loading it on first use."""
        if cls._f1_metric is None:
            cls._f1_metric = load_metric("f1")
        return cls._f1_metric

    def compute_accuracy(self):
        """Compute the weighted F1 of the pipeline over ``self.dataset``.

        The method keeps its historical name, but the value it reports is a
        weighted F1 score, not an accuracy.

        Returns:
            The dict produced by the metric's ``compute`` call; the score is
            stored under the ``"f1"`` key.
        """
        preds, labels = [], []
        for example in self.dataset:
            pred = self.pipeline(example["titles"])[0]["label"]
            preds.append(self.label2id[pred])
            labels.append(example["labels"])  # true class id
        f1 = self._get_f1_metric().compute(
            predictions=preds, references=labels, average="weighted"
        )
        print(f"Weighted F1 on test set - {f1['f1']:.4f}")
        return f1

    def compute_size(self):
        """Return the serialized size of the model's state dict.

        The state dict is saved into a private temporary directory, so a
        ``model.pt`` in the working directory is never overwritten or
        deleted, and the temporary file is removed even if saving fails.

        Returns:
            ``{"size_mb": <size in mebibytes>}``.
        """
        import tempfile  # local import: not among the file's top-level imports

        state_dict = self.pipeline.model.state_dict()
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_path = Path(tmp_dir) / "model.pt"
            torch.save(state_dict, tmp_path)
            # Calculate size in megabytes
            size_mb = tmp_path.stat().st_size / (1024 * 1024)
        print(f"Model size (MB) - {size_mb:.2f}")
        return {"size_mb": size_mb}

    def time_pipeline(self, query= "After the acquisition , Basware 's preliminary pro forma net sales for 2005 amount to EUR"):
        """Time 100 single-query pipeline calls after 10 untimed warm-up calls.

        Args:
            query: Text sent to the pipeline on every call.

        Returns:
            ``{"time_avg_ms": ..., "time_std_ms": ...}``: mean and standard
            deviation of the per-call latency in milliseconds.
        """
        # Warmup
        for _ in range(10):
            _ = self.pipeline(query)

        # Timed run
        latencies = []
        for _ in range(100):
            start_time = perf_counter()
            _ = self.pipeline(query)
            latencies.append(perf_counter() - start_time)

        # Compute run statistics
        time_avg_ms = 1000 * np.mean(latencies)
        time_std_ms = 1000 * np.std(latencies)

        print(f"Average latency (ms) - {time_avg_ms:.2f} +/- {time_std_ms:.2f}")
        return {"time_avg_ms": time_avg_ms, "time_std_ms": time_std_ms}

    def run_benchmark(self):
        """Run the size, latency and F1 measurements, in that order.

        Returns:
            ``{self.optim_type: {...}}`` holding the merged results of
            ``compute_size``, ``time_pipeline`` and ``compute_accuracy``.
        """
        metrics = {}
        metrics[self.optim_type] = self.compute_size()
        metrics[self.optim_type].update(self.time_pipeline())
        metrics[self.optim_type].update(self.compute_accuracy())
        return metrics

In [8]:
def plot_metrics(perf_metrics, current_optim_type):
    """Scatter-plot F1 (%) against average latency, one point per model.

    Args:
        perf_metrics: Mapping of model name to a dict with the keys
            ``"time_avg_ms"``, ``"f1"`` and ``"size_mb"``.
        current_optim_type: Name of the entry to highlight; it is drawn with
            a dotted-circle glyph instead of the default marker.

    The marker area of each point is its ``size_mb`` value. The y-axis is
    fixed to 80-90; the x-axis runs from 10 to the largest average latency
    plus 8 ms.
    """
    df = pd.DataFrame.from_dict(perf_metrics, orient='index')
    for idx in df.index:
        df_opt = df.loc[idx]
        # Add a dashed circle around the current optimization type
        extra = {"marker": '$\u25CC$'} if idx == current_optim_type else {}
        plt.scatter(
            df_opt["time_avg_ms"],
            df_opt["f1"] * 100,
            alpha=0.5,
            s=df_opt["size_mb"],
            label=idx,
            **extra,
        )

    legend = plt.legend(bbox_to_anchor=(1, 1))
    # ``legendHandles`` was renamed ``legend_handles`` in Matplotlib 3.7 and
    # the old name removed in 3.9; support both.
    handles = getattr(legend, "legend_handles", None)
    if handles is None:
        handles = legend.legendHandles
    # Draw every legend marker at one size regardless of the model's size.
    for handle in handles:
        handle.set_sizes([20])

    plt.ylim(80, 90)
    # Use the slowest model to define the x-axis range, looked up from the
    # data instead of assuming a "C1: BERT" entry exists and is the slowest.
    xlim = int(df["time_avg_ms"].max() + 8)
    plt.xlim(10, xlim)
    plt.ylabel("F1 Score (%)")
    plt.xlabel("Average latency (ms)")
    plt.show()

In [9]:
# Benchmark the first model; its result dict starts ``perf_metrics``.
pb = PerformanceBenchmark(
    pipeline=augbert_classifier,
    dataset=ground_truth["test"],
    label2id=label2id,
    optim_type="C1: BERT",
)
perf_metrics = pb.run_benchmark()

In [10]:
# Display the benchmark results collected so far.
perf_metrics

In [12]:
# Benchmark the next model and merge its entry into ``perf_metrics``.
pb = PerformanceBenchmark(
    pipeline=augdistilbert_classifier,
    dataset=ground_truth["test"],
    label2id=label2id,
    optim_type="C2: DistilBERT",
)
perf_metrics.update(pb.run_benchmark())

In [13]:
# Benchmark the next model and merge its entry into ``perf_metrics``.
pb = PerformanceBenchmark(
    pipeline=distildistilbert_classifier,
    dataset=ground_truth["test"],
    label2id=label2id,
    optim_type="D: DistilBERT",
)
perf_metrics.update(pb.run_benchmark())

In [25]:
# Plot all benchmarked models, highlighting the "D: DistilBERT" entry.
plot_metrics(perf_metrics, "D: DistilBERT")

In [17]:
# Tabulate the benchmark results: one row per model, one column per metric.
df = pd.DataFrame.from_dict(
    data=perf_metrics,
    orient="index",
)
df