### Install ToxicSpans

In [20]:
# Fetch the ToxicSpans repository into ./toxic_spans; later cells install its
# requirements file, import modules from it and read its CSV data.
!git clone https://github.com/ipavlopoulos/toxic_spans

Cloning into 'toxic_spans'...
remote: Enumerating objects: 469, done.[K
remote: Counting objects: 100% (201/201), done.[K
remote: Compressing objects: 100% (149/149), done.[K
remote: Total 469 (delta 80), reused 142 (delta 45), pack-reused 268[K
Receiving objects: 100% (469/469), 5.39 MiB | 9.47 MiB/s, done.
Resolving deltas: 100% (210/210), done.


### Install Requirements

In [21]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the packages are importable from this notebook.
%pip install -r toxic_spans/ACL2022/requirements_for_toxic_spans_exps.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Imports

In [3]:
import numpy as np 
import pandas as pd
from ast import literal_eval
from tqdm import tqdm

### Define Metrics

In [4]:
from toxic_spans.SemEval2021.evaluation import semeval2021
from toxic_spans.SemEval2021.baselines import models

def precision(predictions, gold):
  """Share of the distinct predicted offsets that are also gold offsets.

  Duplicates in either argument are ignored. With an empty ``gold`` the score
  is 1.0 when ``predictions`` is empty as well and 0.0 otherwise; an empty
  ``predictions`` against a non-empty ``gold`` scores 0.0.
  """
  n_gold, n_pred = len(gold), len(predictions)
  if n_gold == 0:
    return 0. if n_pred else 1.
  if n_pred == 0:
    return 0.
  predicted = set(predictions)
  hits = predicted & set(gold)
  return float(len(hits)) / float(len(predicted))

def recall(predictions, gold):
  """Share of the distinct gold offsets that were predicted.

  Duplicates in either argument are ignored. With an empty ``gold`` the score
  is 1.0 when ``predictions`` is empty as well and 0.0 otherwise; an empty
  ``predictions`` against a non-empty ``gold`` scores 0.0.
  """
  if len(gold) == 0:
    # Nothing to find: only an empty prediction counts as perfect.
    if len(predictions) == 0:
      return 1.
    return 0.
  if len(predictions) == 0:
    return 0.
  wanted = set(gold)
  found = wanted.intersection(set(predictions))
  return float(len(found)) / float(len(wanted))

### Prepare the dataset (parse selected columns with `literal_eval`)

In [5]:
def prepare_dataset(dataset):
  """Parse the literal-encoded columns of ``dataset`` with ``literal_eval``.

  The columns ``probability``, ``position``, ``text``, ``type`` and
  ``position_probability`` are always parsed; ``position_lbl`` is parsed only
  when the frame has it.

  Args:
    dataset: DataFrame whose listed columns hold Python-literal strings.

  Returns:
    The same DataFrame object, modified in place.

  Raises:
    KeyError: if one of the always-parsed columns is missing.
  """
  columns_to_parse = ['probability', 'position', 'text', 'type', 'position_probability']
  if 'position_lbl' in dataset.columns:
    columns_to_parse.append('position_lbl')
  # Bracket indexing throughout: attribute-style column assignment is fragile
  # (it cannot address names such as 'type' safely and never creates a column).
  for column in columns_to_parse:
    dataset[column] = dataset[column].apply(literal_eval)
  return dataset

### Align tokens with token labels

In [6]:
# For each token, extract its probabilistic toxicity label.
def extract_xy(data, tokenizer = None):
  """Tokenize every post and align each token with a toxicity score.

  Args:
    data: DataFrame with a ``text_of_post`` column (the text to tokenize) and a
      ``position_probability`` column holding, per row, a dict keyed by
      character offset.
    tokenizer: object exposing ``tokenize_with_offsets(text)`` that returns
      ``(tokens, start_offsets, end_offsets)``. Required despite the ``None``
      default, which is kept only for signature compatibility.

  Returns:
    A tuple ``(X, y, t_of)`` of lists with one entry per row of ``data``:
    ``X`` holds the token strings sliced from the text, ``y`` the per-token
    label (mean over the token's characters, counting characters absent from
    ``position_probability`` as 0) and ``t_of`` the list of character offsets
    covered by each token.

  Raises:
    ValueError: if ``tokenizer`` is ``None``.
  """
  if tokenizer is None:
    raise ValueError("extract_xy requires a tokenizer with a tokenize_with_offsets() method")
  X = []
  y = []
  t_of = []
  for row_idx in tqdm(range(data.shape[0])):
    # Materialise the row once; rebuilding it for every character dominated the runtime.
    row = data.iloc[row_idx]
    text = row.text_of_post
    char_scores = row.position_probability
    (tokens, start_offsets, end_offsets) = tokenizer.tokenize_with_offsets(text)
    toks = []
    labels = []
    offsets = []
    for j in range(len(tokens)):
      # NOTE(review): the offsets index a Python str here. If the tokenizer reports
      # byte offsets (as tf_text tokenizers are documented to do), this is only
      # correct for ASCII text -- confirm for posts with non-ASCII characters.
      start, end = int(start_offsets[j]), int(end_offsets[j])
      token_offset = list(range(start, end))
      # Characters outside every annotated span contribute 0.
      span = [char_scores[c] if c in char_scores else 0 for c in token_offset]
      labels.append(np.mean(span)) # token toxicity = mean over its characters
      toks.append(text[start:end])
      offsets.append(token_offset)
    y.append(labels)
    X.append(toks)
    t_of.append(offsets)
  return X,y,t_of

# Star import of the ToxicSpans ARE models module.
# NOTE(review): `tf_text` (used below) is not imported anywhere else in the visible
# cells, so it presumably enters the namespace through this import -- confirm.
from toxic_spans.ACL2022.models.are import *

# Load the annotated posts and run the literal_eval parsing defined in prepare_dataset.
data = pd.read_csv("toxic_spans/ACL2022/data/toxic_spans.csv")
data = prepare_dataset(data)

# Tokenize every post and compute per-token labels and character offsets.
# In the recorded run this step took about 40 minutes for 11006 rows.
tokenizer = tf_text.UnicodeScriptTokenizer()
x, y,t = extract_xy(data, tokenizer)

# Store the three aligned outputs as new columns (one list per post).
data['tokens'], data['token_labels'], data['token_offsets'] = x, y, t

100%|██████████| 11006/11006 [40:40<00:00,  4.51it/s]


* Download data for augmentation 

In [7]:
# Download 5k_augmentation.csv. The URL form is used because the `--id` flag is
# deprecated in gdown and not accepted by newer releases.
!gdown "https://drive.google.com/uc?id=1ApFrfl3UDaAbYJ4GhuZIhLGUGHZUxiPH"

Downloading...
From: https://drive.google.com/uc?id=1ApFrfl3UDaAbYJ4GhuZIhLGUGHZUxiPH
To: /content/5k_augmentation.csv
100% 10.6M/10.6M [00:00<00:00, 28.8MB/s]


* Download data for ROC AUC evaluation

In [8]:
# Download for_auc_eval.csv. The URL form is used because the `--id` flag is
# deprecated in gdown and not accepted by newer releases.
!gdown "https://drive.google.com/uc?id=1qN2s3d2qTNp4JuO_7GTjLmltWCatuUrt"

Downloading...
From: https://drive.google.com/uc?id=1qN2s3d2qTNp4JuO_7GTjLmltWCatuUrt
To: /content/for_auc_eval.csv
100% 5.81M/5.81M [00:00<00:00, 24.4MB/s]


### Train BILSTM_ARE on a random train/dev/test split

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import *


# One seed for both the split and the shuffle, so the data pipeline is repeatable.
SPLIT_SEED = 0

# Every post in this frame gets the post-level label 1.
data['toxicity'] = 1

#compute dummy labels for the attention loss: one 0 per token of each post
dummy_labels = [[0] * len(tokens) for tokens in data['tokens']]
data['dummy_labels'] = dummy_labels

#prepare data for augmentation
augmentation = pd.read_csv("5k_augmentation.csv")
for column in ("tokens", "token_offsets", "dummy_labels"):
  augmentation[column] = augmentation[column].apply(literal_eval)
# Binarise the toxicity score at 0.5.
augmentation["toxicity"] = augmentation["toxicity"].apply(lambda x: 1 if x > 0.5 else 0)

#prepare dataset for roc auc eval
auc_eval = pd.read_csv("for_auc_eval.csv")
auc_eval["tokens"] = auc_eval["tokens"].apply(literal_eval)
auc_eval["toxicity"] = auc_eval["toxicity"].apply(lambda x: 1 if x > 0.5 else 0)


# 80/10/10 split: hold out 20%, then halve the held-out part into dev and test.
train, dev = train_test_split(data, test_size = 0.2, random_state = SPLIT_SEED)
dev, test = train_test_split(dev, test_size = 0.5, random_state = SPLIT_SEED)
# reset_index() keeps each row's original label in an 'index' column.
train, dev, test = train.reset_index(), dev.reset_index(), test.reset_index()


model = BILSTM_ARE(patience = 5)

#augment training set
# The shuffle is seeded so the training order is reproducible. drop=True discards
# the concatenated index, which mixes row positions from two unrelated frames.
train = (
  pd.concat([train, augmentation])
  .sample(frac = 1, random_state = SPLIT_SEED)
  .reset_index(drop = True)
)

#train the model
hs = model.fit(train.tokens, train.toxicity, train.dummy_labels, dev.tokens, dev.toxicity, dev.dummy_labels)

100%|██████████| 11006/11006 [00:01<00:00, 5761.35it/s]


FFFFFF
Vocab size:  47270
Model: "model_6"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 inputs (InputLayer)            [(None, 128)]        0           []                               
                                                                                                  
 embedding_6 (Embedding)        (None, 128, 200)     9454000     ['inputs[0][0]']                 
                                                                                                  
 bidirectional_6 (Bidirectional  (None, 128, 256)    336896      ['embedding_6[0][0]']            
 )                                                                                                
                                                                                                  
 time_distributed_12 (TimeDistr  (None, 128, 128)    32896       [

### Evaluate

In [31]:
# Tune the attention threshold on the dev split, then extract toxic character
# offsets for the test split with that threshold.
opt_th = model.finetune_att_threshold(dev.tokens, dev.token_offsets, dev.position)
pred_offsets = model.get_toxic_offsets(test.tokens, threshold=opt_th)
pred_char_offsets = model.get_toxic_char_offsets(test.token_offsets, pred_offsets)

# Pair each prediction with its gold offsets once and reuse the pairs for every span metric.
pred_gold_pairs = list(zip(pred_char_offsets, test.position))
f1 = np.mean([semeval2021.f1(pred, gold) for pred, gold in pred_gold_pairs])
pr = np.mean([precision(pred, gold) for pred, gold in pred_gold_pairs])
rec = np.mean([recall(pred, gold) for pred, gold in pred_gold_pairs])

# Model scores on the separate AUC evaluation set, compared with its binarised labels.
preds, _ = model.predict(auc_eval.tokens)
auc = roc_auc_score(auc_eval.toxicity, preds)

for label, value in (("F1: ", f1), ("Recall: ", rec), ("Precision: ", pr), ("ROC AUC: ", auc)):
  print(label, value)

Optimal threshold is:  0.95  with F1 score =  0.5691043343672841
F1:  0.558043992473185
Recall:  0.5524877592153589
Precision:  0.5731153496821072
ROC AUC:  0.9012764444444444
