### Install ToxicSpans

In [1]:
# Fetch the ToxicSpans repository; later cells install requirements, import code and read data from the `toxic_spans/` checkout.
!git clone https://github.com/ipavlopoulos/toxic_spans

Cloning into 'toxic_spans'...
remote: Enumerating objects: 483, done.[K
remote: Counting objects: 100% (215/215), done.[K
remote: Compressing objects: 100% (163/163), done.[K
remote: Total 483 (delta 89), reused 142 (delta 45), pack-reused 268[K
Receiving objects: 100% (483/483), 5.40 MiB | 17.84 MiB/s, done.
Resolving deltas: 100% (219/219), done.


### Install Requirements

In [2]:
!pip install -r toxic_spans/ACL2022/requirements_for_toxic_spans_exps.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow==2.7.0
  Downloading tensorflow-2.7.0-cp39-cp39-manylinux2010_x86_64.whl (489.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m489.7/489.7 MB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tensorflow-text==2.7.3
  Downloading tensorflow_text-2.7.3-cp39-cp39-manylinux2010_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m51.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.12.4
  Downloading transformers-4.12.4-py3-none-any.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m75.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy==1.21.4
  Downloading numpy-1.21.4-cp39-cp39-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.7/15.7 MB[0m [31m78.7 MB

### Imports

In [3]:
import numpy as np 
import pandas as pd
from ast import literal_eval
from tqdm import tqdm

### Define Metrics

In [4]:
from toxic_spans.SemEval2021.evaluation import semeval2021
from toxic_spans.SemEval2021.baselines import models

def precision(predictions, gold):
  """
  Share of the distinct predicted items that also occur in gold.

  :param predictions: predicted items (e.g. character offsets)
  :param gold: ground-truth items
  :return: 1.0 when both are empty, 0.0 when exactly one is empty,
           otherwise |set(predictions) & set(gold)| / |set(predictions)|
  """
  if len(gold) == 0:
    # Nothing to find: only an empty prediction counts as perfect.
    return 0. if len(predictions) else 1.
  if not len(predictions):
    return 0.
  predicted = set(predictions)
  hits = predicted & set(gold)
  return len(hits) / len(predicted)

def recall(predictions, gold):
  """
  Share of the distinct gold items that were predicted.

  :param predictions: predicted items (e.g. character offsets)
  :param gold: ground-truth items
  :return: 1.0 when both are empty, 0.0 when exactly one is empty,
           otherwise |set(predictions) & set(gold)| / |set(gold)|
  """
  if len(gold) == 0:
    # Nothing to find: only an empty prediction counts as perfect.
    return 0. if len(predictions) else 1.
  if not len(predictions):
    return 0.
  expected = set(gold)
  hits = set(predictions) & expected
  return len(hits) / len(expected)

### Prepare the dataset (apply `literal_eval` to the stringified columns)

In [5]:
def prepare_dataset(dataset):
  """
  Parse the columns that hold Python literals serialised as strings.

  The columns `probability`, `position`, `text`, `type` and
  `position_probability` (plus `position_lbl` when present) are converted
  with `ast.literal_eval`. Only cells that are still strings are parsed, so
  calling this again on an already-prepared frame (e.g. re-running the cell)
  leaves parsed values untouched instead of raising ValueError.

  :param dataset: dataframe as read from the CSV; modified in place
  :return: the same dataframe object, with the listed columns parsed
  """
  literal_columns = ['probability', 'position', 'text', 'type', 'position_probability']
  if 'position_lbl' in dataset.columns:
    literal_columns.append('position_lbl')

  def parse(value):
    # literal_eval rejects non-string objects (lists, dicts, NaN); pass those through unchanged.
    return literal_eval(value) if isinstance(value, str) else value

  for column in literal_columns:
    dataset[column] = dataset[column].apply(parse)
  return dataset

### Align tokens with token labels

In [7]:
#for each token extract the probabilistic label 
def extract_xy(data, tokenizer):
    """
    Align each post's sub-tokens with per-sub-token toxicity scores.

    For every sub-token, the score is the mean of the per-character scores
    over the characters the sub-token covers; characters missing from the
    post's `position_probability` mapping count as 0.

    :param data: the dataframe (read toxic_spans.csv); must provide the
        columns `text_of_post` and `position_probability` (a mapping from
        character offset to score)
    :param tokenizer: bert's tokenizer; it is called on the text (the first
        encoding of the result must offer `token_to_chars`) and must also
        offer `tokenize`
    :return: x and y aligned (one list of sub-tokens and one list of
        sub-token scores per post)
    """
    x = []  # sub-tokens, one list per post
    y = []  # sub-token labels, one list per post
    # Iterate the two columns directly instead of rebuilding the row with
    # data.iloc[i] for every character of every token.
    posts = zip(data.text_of_post, data.position_probability)
    for text, char_scores in tqdm(posts, total=data.shape[0]):
        encoding = tokenizer(text)[0]  # first (only) encoding of the batch
        subtokens = tokenizer.tokenize(text)
        token_labels = []
        # Index 0 of the encoding is [CLS] (and the last one [SEP]), so the
        # j-th real sub-token sits at position j, counting from 1.
        for j, _ in enumerate(subtokens, start=1):
            span = encoding.token_to_chars(j)  # char offsets in the original text
            span_score = []
            if span is not None:
                start, end = span
                span_score = [char_scores.get(ch_offset, 0) for ch_offset in range(start, end)]
            # A token without characters has no evidence of toxicity: label it 0
            # rather than letting np.mean([]) produce NaN.
            token_labels.append(np.mean(span_score) if span_score else 0.0)
        x.append(list(subtokens))
        y.append(token_labels)
    return x, y
    
# NOTE(review): wildcard import; BERT_ARE is the only name from it used in this cell.
from toxic_spans.ACL2022.models.are import *

# Build the model first: its tokenizer is what aligns tokens with labels below.
model = BERT_ARE(patience=5)
tokenizer = model.tokenizer

# Read the annotated posts and parse their stringified columns.
data = prepare_dataset(pd.read_csv("toxic_spans/ACL2022/data/toxic_spans.csv"))

# Store the aligned sub-tokens and their scores next to each post.
x, y = extract_xy(data, tokenizer)
data['tokens'] = x
data['token_labels'] = y

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
100%|██████████| 11006/11006 [03:49<00:00, 48.04it/s]


* Download data for augmentation 

In [8]:
!gdown --id 1ApFrfl3UDaAbYJ4GhuZIhLGUGHZUxiPH

Downloading...
From: https://drive.google.com/uc?id=1ApFrfl3UDaAbYJ4GhuZIhLGUGHZUxiPH
To: /content/5k_augmentation.csv
100% 10.6M/10.6M [00:00<00:00, 27.3MB/s]


* Download data for ROC AUC evaluation

In [9]:
!gdown --id 1qN2s3d2qTNp4JuO_7GTjLmltWCatuUrt

Downloading...
From: https://drive.google.com/uc?id=1qN2s3d2qTNp4JuO_7GTjLmltWCatuUrt
To: /content/for_auc_eval.csv
100% 5.81M/5.81M [00:00<00:00, 21.4MB/s]


### Train BERT_ARE on a random train/dev/test split

In [11]:
from sklearn.model_selection import train_test_split
# NOTE(review): wildcard import kept because other cells use names from it (e.g. roc_auc_score); prefer explicit imports.
from sklearn.metrics import *


# Label every row of `data` as toxic (1).
data['toxicity'] = 1

#prepare data for augmentation: binarise the toxicity score at 0.5
augmentation = pd.read_csv("5k_augmentation.csv")
augmentation.toxicity = (augmentation.toxicity > 0.5).astype(int)

#prepare dataset for roc auc eval: same binarisation
auc_eval = pd.read_csv("for_auc_eval.csv")
auc_eval.toxicity = (auc_eval.toxicity > 0.5).astype(int)


# 80/10/10 split with fixed seeds; reset_index() keeps each row's original position in an `index` column.
train, dev = train_test_split(data, test_size = 0.2, random_state = 0)
dev, test = train_test_split(dev, test_size = 0.5, random_state = 0)
train, dev, test = train.reset_index(), dev.reset_index(), test.reset_index()
assert len(train) + len(dev) + len(test) == len(data), "split lost or duplicated rows"


#augment training set
# Seed the shuffle so a re-run gives the same training order, and drop the
# pre-shuffle index instead of leaking it into a `level_0` column.
train = pd.concat([train, augmentation]).sample(frac = 1, random_state = 0).reset_index(drop = True)

#train the model on the train texts/labels, passing the dev texts/labels alongside
hs = model.fit(train.text_of_post, train.toxicity, dev.text_of_post, dev.toxicity)

  0%|          | 0/13804 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 13804/13804 [00:03<00:00, 4302.66it/s]
100%|██████████| 1101/1101 [00:00<00:00, 4664.30it/s]


Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_ids (InputLayer)         [(None, 128)]        0           []                               
                                                                                                  
 input_masks (InputLayer)       [(None, 128)]        0           []                               
                                                                                                  
 segment_ids (InputLayer)       [(None, 128)]        0           []                               
                                                                                                  
 tf_bert_model_1 (TFBertModel)  TFBaseModelOutputWi  108310272   ['input_ids[0][0]',              
                                thPoolingAndCrossAt               'input_masks[0][0]',        



Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 00007: early stopping


### Evaluate

In [13]:
# Tune the threshold on the dev split, then extract toxic character offsets for the test posts.
opt_th = model.finetune_att_threshold(dev.text_of_post, dev.position)
pred_offsets = model.get_toxic_offsets(test.text_of_post, threshold=opt_th)
pred_char_offsets = model.get_toxic_char_offsets(test.text_of_post, pred_offsets)

# Pair each predicted offset list with its gold list once, then average every metric over the posts.
# NOTE(review): the recorded run printed the same value for F1, recall and precision; that happens
# when every per-post score is 0 or 1 (e.g. all predictions empty) -- verify the predictions.
pred_gold_pairs = list(zip(pred_char_offsets, test.position))
f1 = np.mean([semeval2021.f1(pred, gold) for pred, gold in pred_gold_pairs])
pr = np.mean([precision(pred, gold) for pred, gold in pred_gold_pairs])
rec = np.mean([recall(pred, gold) for pred, gold in pred_gold_pairs])

# Model outputs on the AUC evaluation set, scored against its `toxicity` labels.
preds = model.predict(auc_eval.tokens)
auc = roc_auc_score(auc_eval.toxicity, preds)

for label, value in (("F1: ", f1), ("Recall: ", rec), ("Precision: ", pr), ("ROC AUC: ", auc)):
  print(label, value)

100%|██████████| 3000/3000 [00:01<00:00, 1826.20it/s]


Stopped epoch:  6
F1:  0.4895549500454133
Recall:  0.4895549500454133
Precision:  0.4895549500454133
ROC AUC:  0.8760380000000001
