In [None]:
# packages needed
import pandas as pd
import numpy as np

import ktrain
from ktrain import text
import nltk
from sklearn.model_selection import train_test_split,KFold
from sklearn.metrics import accuracy_score

from torch.utils.data import TensorDataset,RandomSampler,SequentialSampler
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from wordcloud import WordCloud,STOPWORDS
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from bs4 import BeautifulSoup
import spacy
import re,string,unicodedata

# data preparation

In [None]:
# Show how many rows carry each label_category value before any filtering.
# NOTE(review): no visible cell defines data_train_taskb — confirm a loading cell runs before this one.
data_train_taskb["label_category"].value_counts()

In [None]:
# Drop the rows whose label_category is 'none' so only the labelled categories remain.
# A boolean mask is used instead of drop(index, inplace=True): drop() removes rows by index
# *label*, so with a non-unique index it would also remove non-'none' rows sharing a label.
# Rebinding (rather than mutating in place) also keeps the cell safe to re-run.
index = data_train_taskb.index[data_train_taskb['label_category'] == 'none']  # kept for inspection
data_train_taskb = data_train_taskb.loc[data_train_taskb['label_category'] != 'none'].copy()

In [None]:
import matplotlib.pyplot as plt

# Category names and their sizes used by both charts.
# NOTE(review): these counts are hard-coded rather than derived from data_train_taskb —
# confirm they still match the value_counts() output.
categories = [
    "derogation",
    "animosity",
    "prejudiced discussions",
    "threats, plans to harm and incitement",
]
counts = [6904, 4578, 1463, 1182]

# Horizontal bar chart of the category sizes.
bar_fig, bar_ax = plt.subplots(figsize=(3, 1.8))
bar_ax.barh(categories, counts, color='skyblue')
bar_ax.set_xlabel('Count')
bar_ax.set_title('Category Counts')
bar_ax.invert_yaxis()  # first (largest) category is drawn at the top
plt.show()

# Pie chart of the same counts, labelled with one-decimal percentages.
# Five colours are listed for four wedges; the last one is simply not used.
pie_colors = ['skyblue', 'lightcoral', 'lightgreen', 'lightsalmon', 'lightseagreen']
pie_fig, pie_ax = plt.subplots(figsize=(3, 3))
pie_ax.pie(
    counts,
    labels=categories,
    autopct='%1.1f%%',
    startangle=140,
    colors=pie_colors,
)
pie_ax.axis('equal')  # equal aspect ratio so the pie is drawn as a circle
pie_ax.set_title('Category Distribution')
plt.show()

In [None]:
# Pull the inputs and targets out of the DataFrame as plain Python lists:
# x holds the 'text' column, y the matching 'label_category' strings.
x = data_train_taskb['text'].tolist()
y = data_train_taskb['label_category'].tolist()

In [None]:
# Hold out 20% of the samples as the evaluation split (fixed seed for reproducibility).
# stratify=y keeps the per-category proportions the same in both splits; without it the
# rare categories can end up under- or over-represented in the held-out 20%.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Encoding data for task B: category string -> integer class id (0..3).
encoding = {
    '2. derogation': 1,
    '1. threats, plans to harm and incitement': 0,
    '4. prejudiced discussions': 3,
    '3. animosity': 2,
}

# Integer values for each class.
# Labels that are already one of the integer ids are passed through unchanged, so
# re-running this cell does not fail with KeyError on the already-encoded lists.
# Any other unknown label still raises KeyError, exactly as before.
_class_ids = set(encoding.values())
y_train = [label if label in _class_ids else encoding[label] for label in y_train]
y_test = [label if label in _class_ids else encoding[label] for label in y_test]

# focal loss

In [None]:
import tensorflow as tf
from tensorflow.keras import activations
def focal_loss(gamma=2., alpha=4., from_logits=False):
    """Build a multi-class focal loss function suitable for ``model.compile(loss=...)``.

    FL(p_t) = -alpha * (1 - p_t)^gamma * ln(p_t)
    Focal Loss for Dense Object Detection
    https://arxiv.org/abs/1708.02002

    Keyword Arguments:
        gamma {float} -- focusing exponent applied to (1 - p) (default: {2.0})
        alpha {float} -- constant factor the loss is scaled by (default: {4.0})
        from_logits {bool} -- True when the model outputs raw logits, False when it
            outputs probabilities after softmax (default: {False})

    Returns:
        [callable] -- focal_loss_fixed(y_true, y_pred) returning a scalar loss tensor.
    """

    gamma = float(gamma)
    alpha = float(alpha)

    def focal_loss_fixed(y_true, y_pred):
        """Focal loss for multi-classification
        FL(p_t)=-alpha(1-p_t)^{gamma}ln(p_t)
        Notice: y_pred is probability after softmax if from_logits is False.
        gradient is d(Fl)/d(p_t) not d(Fl)/d(x) as described in paper
        d(Fl)/d(p_t) * [p_t(1-p_t)] = d(Fl)/d(x)

        Arguments:
            y_true {tensor} -- ground truth labels, shape of [batch_size, num_cls]
            y_pred {tensor} -- model's output, shape of [batch_size, num_cls]

        Returns:
            [tensor] -- loss, averaged over the batch.
        """
        epsilon = 1.e-9
        y_true = tf.cast(y_true, dtype=tf.float32)
        y_pred = tf.cast(y_pred, dtype=tf.float32)
        if from_logits:
            # log_softmax is computed directly from the logits, so very confident wrong
            # predictions neither underflow to log(0) nor get capped at -log(epsilon).
            log_prob = tf.nn.log_softmax(y_pred, axis=-1)
            prob = tf.exp(log_prob)
        else:
            # Keep probabilities inside [epsilon, 1] so log() stays finite and
            # (1 - p) can never become negative.
            prob = tf.clip_by_value(y_pred, epsilon, 1.)
            log_prob = tf.math.log(prob)

        # (1 - p)^gamma down-weights classes the model already predicts well.
        modulator = tf.pow(tf.subtract(1., prob), gamma)
        # y_true is applied once; with one-hot targets only the true class contributes.
        per_class = tf.multiply(y_true, tf.multiply(modulator, tf.negative(log_prob)))
        # Sum over the class axis (equal to the former max for one-hot targets, and
        # also correct for soft / smoothed targets).
        per_sample = tf.multiply(alpha, tf.reduce_sum(per_class, axis=-1))
        return tf.reduce_mean(per_sample)
    return focal_loss_fixed

# 4-categories classification

In [None]:
# Batch size passed as batch_size= to the ktrain.get_learner calls for the
# ELECTRA, RoBERTa and RoBERTa-large learners below.
BATCH_SIZE = 6

## Electra

In [None]:
# Pretrained checkpoint name handed to text.Transformer for the ELECTRA run.
model_1 = "google/electra-base-discriminator"

In [None]:
# ktrain Transformer wrapper for the ELECTRA checkpoint; the four integer class ids
# produced by the label-encoding cell are declared as the class names.
t_1 = text.Transformer(
    model_1,
    maxlen=500,
    class_names=[0, 1, 2, 3],
)

# Preprocess the training split and the held-out split with that wrapper.
trn_1 = t_1.preprocess_train(x_train, y_train)
val_1 = t_1.preprocess_test(x_test, y_test)

# Classifier model built by the wrapper.
model_electra = t_1.get_classifier()

In [None]:
# Compile the ELECTRA classifier with the focal loss defined above
# (alpha=1 instead of the default 4).
# from_logits=True: presumably the ktrain classifier emits raw logits — confirm.
model_electra.compile(
    loss=focal_loss(alpha=1, from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Bundle the compiled ELECTRA model with its training data and the held-out
# split (used as validation data) into a ktrain learner.
learner_electra = ktrain.get_learner(
    model_electra,
    train_data=trn_1,
    val_data=val_1,
    batch_size=BATCH_SIZE,
)

In [None]:
# Train ELECTRA with the one-cycle schedule: peak learning rate 2e-5 for 4 epochs.
learner_electra.fit_onecycle(lr=2e-5, epochs=4)

## RoBERTa

In [None]:
# Pretrained checkpoint name handed to text.Transformer for the RoBERTa-base run.
model_2 = "roberta-base"

In [None]:
# ktrain Transformer wrapper for the RoBERTa-base checkpoint; the four integer class ids
# produced by the label-encoding cell are declared as the class names.
t_2 = text.Transformer(
    model_2,
    maxlen=500,
    class_names=[0, 1, 2, 3],
)

# Preprocess the training split and the held-out split with that wrapper.
trn_2 = t_2.preprocess_train(x_train, y_train)
val_2 = t_2.preprocess_test(x_test, y_test)

# Classifier model built by the wrapper.
model_roberta = t_2.get_classifier()

In [None]:
# Compile the RoBERTa-base classifier with the focal loss defined above
# (alpha=1 instead of the default 4).
# from_logits=True: presumably the ktrain classifier emits raw logits — confirm.
model_roberta.compile(
    loss=focal_loss(alpha=1, from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Bundle the compiled RoBERTa-base model with its training data and the held-out
# split (used as validation data) into a ktrain learner.
learner_roberta = ktrain.get_learner(
    model_roberta,
    train_data=trn_2,
    val_data=val_2,
    batch_size=BATCH_SIZE,
)

In [None]:
# Train RoBERTa-base with the one-cycle schedule: peak learning rate 2e-5 for 4 epochs.
learner_roberta.fit_onecycle(lr=2e-5, epochs=4)

## RoBERTa-large

In [None]:
# Pretrained checkpoint name handed to text.Transformer for the RoBERTa-large run.
model_3 = "roberta-large"

In [None]:
# ktrain Transformer wrapper for the RoBERTa-large checkpoint; the four integer class ids
# produced by the label-encoding cell are declared as the class names.
t_roberta_large = text.Transformer(
    model_3,
    maxlen=500,
    class_names=[0, 1, 2, 3],
)

# Preprocess the training split and the held-out split with that wrapper.
trn_roberta_large = t_roberta_large.preprocess_train(x_train, y_train)
val_roberta_large = t_roberta_large.preprocess_test(x_test, y_test)

# Classifier model built by the wrapper.
model_roberta_large = t_roberta_large.get_classifier()

In [None]:
# Compile the RoBERTa-large classifier with the focal loss defined above
# (alpha=1 instead of the default 4).
# from_logits=True: presumably the ktrain classifier emits raw logits — confirm.
model_roberta_large.compile(
    loss=focal_loss(alpha=1, from_logits=True),
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Bundle the compiled RoBERTa-large model with its training data and the held-out
# split (used as validation data) into a ktrain learner.
learner_roberta_large = ktrain.get_learner(
    model_roberta_large,
    train_data=trn_roberta_large,
    val_data=val_roberta_large,
    batch_size=BATCH_SIZE,
)

In [None]:
# Train RoBERTa-large with the one-cycle schedule: peak learning rate 2e-5 for 4 epochs.
# Use a hardware accelerator while running this cell.
learner_roberta_large.fit_onecycle(lr=2e-5, epochs=4)