### Install ToxicSpans

In [1]:
!git clone https://github.com/ipavlopoulos/toxic_spans

Cloning into 'toxic_spans'...
remote: Enumerating objects: 437, done.[K
remote: Counting objects: 100% (173/173), done.[K
remote: Compressing objects: 100% (123/123), done.[K
remote: Total 437 (delta 63), reused 138 (delta 44), pack-reused 264[K
Receiving objects: 100% (437/437), 5.37 MiB | 7.13 MiB/s, done.
Resolving deltas: 100% (190/190), done.


### Install Requirements

In [2]:
!pip install -r toxic_spans/ACL2022/requirements_for_toxic_spans_exps.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow==2.7.0
  Downloading https://us-python.pkg.dev/colab-wheels/public/tensorflow/tensorflow-2.7.0%2Bzzzcolab20220506150900-cp37-cp37m-linux_x86_64.whl (665.5 MB)
[K     |████████████████████████████████| 665.5 MB 21 kB/s 
[?25hCollecting tensorflow-text==2.7.3
  Downloading tensorflow_text-2.7.3-cp37-cp37m-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 4.2 MB/s 
[?25hCollecting transformers==4.12.4
  Downloading transformers-4.12.4-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 60.6 MB/s 
[?25hCollecting numpy==1.21.4
  Downloading numpy-1.21.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[K     |████████████████████████████████| 15.7 MB 27.3 MB/s 
[?25hCollecting pandas==1.3.4
  Downloading pandas-1.3.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[K  

### Imports

In [1]:
import numpy as np 
import pandas as pd
from ast import literal_eval
from tqdm import tqdm
from scipy.stats import sem

### Define Metrics

In [2]:
from toxic_spans.SemEval2021.evaluation import semeval2021
from toxic_spans.SemEval2021.baselines import models


def precision(predictions, gold):
  """Precision of the predicted offsets with respect to the gold offsets.

  Both arguments are collections of hashable items (duplicates are ignored).

  Returns:
    |predictions ∩ gold| / |predictions| as a float. When `gold` is empty the
    score is 1.0 if `predictions` is empty too and 0.0 otherwise; when only
    `predictions` is empty the score is 0.0.
  """
  if len(gold) == 0:
    # Nothing to find: predicting nothing is perfect, anything else is wrong.
    return 0. if len(predictions) else 1.
  if len(predictions) == 0:
    return 0.
  predicted = set(predictions)
  hits = predicted & set(gold)
  return len(hits) / len(predicted)

def recall(predictions, gold):
  """Recall of the predicted offsets with respect to the gold offsets.

  Both arguments are collections of hashable items (duplicates are ignored).

  Returns:
    |predictions ∩ gold| / |gold| as a float. When `gold` is empty the score
    is 1.0 if `predictions` is empty too and 0.0 otherwise; when only
    `predictions` is empty the score is 0.0.
  """
  if len(gold) == 0:
    # Nothing to find: predicting nothing is perfect, anything else is wrong.
    return 0. if len(predictions) else 1.
  if len(predictions) == 0:
    return 0.
  expected = set(gold)
  hits = set(predictions) & expected
  return len(hits) / len(expected)

### Method for Preparing the Dataset (apply `literal_eval` to the serialized columns)

In [3]:
def prepare_dataset(dataset):
  """Parse the string-serialized columns of `dataset` into Python objects.

  The columns `probability`, `position`, `text`, `type` and
  `position_probability` (plus `position_lbl` when present) are passed through
  `ast.literal_eval`. Cells that already hold a parsed container are left
  untouched, so calling this function again on a prepared frame (e.g. when a
  notebook cell is re-run) is a no-op instead of a `ValueError`.

  Args:
    dataset: pandas DataFrame containing the columns listed above.

  Returns:
    The same DataFrame object; the columns are replaced in place.

  Raises:
    KeyError: if one of the required columns is missing.
    ValueError / SyntaxError: if a cell is neither a parsed container nor a
      valid Python literal (raised by `literal_eval`).
  """
  def _parse_cell(cell):
    # Already-parsed containers mean the column was prepared before.
    if isinstance(cell, (list, tuple, dict, set)):
      return cell
    return literal_eval(cell)

  columns = ['probability', 'position', 'text', 'type', 'position_probability']
  if 'position_lbl' in dataset.columns:
    columns.append('position_lbl')
  for column in columns:
    # Bracket access avoids any clash between column names and DataFrame attributes.
    dataset[column] = dataset[column].apply(_parse_cell)
  return dataset

### Method for Binarizing the Ground Truth

In [4]:
def ground_truth_to_binary(ground_truth):
  """Turn per-token scores into hard 0/1 labels, instance by instance.

  Args:
    ground_truth: iterable of instances, each an iterable of numeric token
      labels.

  Returns:
    A list of lists with the same nesting. A label of -100 is kept as -100
    (marker for special tokens, for which the loss is not computed); a label
    strictly greater than 0.5 becomes 1; every other label becomes 0.
  """
  def _to_label(score):
    # The -100 check must come first so the marker is never thresholded.
    if score == -100:
      return -100
    return 1 if score > 0.5 else 0

  return [[_to_label(score) for score in instance] for instance in ground_truth]


### Align tokens with token labels

In [5]:
from toxic_spans.ACL2022.models.seq import *

# Load the ToxicSpans CSV from the cloned repository and parse its
# string-serialized columns into Python objects.
data = pd.read_csv("toxic_spans/ACL2022/data/toxic_spans.csv")
data = prepare_dataset(data)

# NOTE(review): `tensorflow` and `BERT_SEQ` are not imported explicitly in the
# visible cells; presumably both come from the star import at the top of this
# cell — confirm.
# log_device_placement=True makes TensorFlow report the device mapping
# (see the "Device mapping:" lines in this cell's output).
sess = tensorflow.compat.v1.Session(config=tensorflow.compat.v1.ConfigProto(log_device_placement=True))
# patience = 5 is presumably the early-stopping patience — TODO confirm against BERT_SEQ.
model = BERT_SEQ(patience = 5, session=sess)
model.build()

# Align inputs (x) and labels (y): extract_xy returns two sequences that are
# stored as the new columns `subtokens` and `subtoken_labels`.
data['subtokens'], data['subtoken_labels'] = model.extract_xy(data = data, tokenizer = model.tokenizer)
# Binarize the ground truth: -100 is kept, values > 0.5 become 1, the rest 0.
data.subtoken_labels = ground_truth_to_binary(data.subtoken_labels)


Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5



Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/502M [00:00<?, ?B/s]

Some layers from the model checkpoint at bert-base-cased were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at bert-base-cased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
100%|██████████| 11006/11006 [04:24<00:00, 41.56it/s]


### Train BERT_SEQ on a Random Train/Dev/Test Split

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

# Random 80/10/10 split: hold out 20% of the data, then cut that hold-out in
# half into dev and test. random_state = 0 makes both splits reproducible.
train, dev = train_test_split(data, test_size = 0.2, random_state = 0)
dev, test = train_test_split(dev, test_size = 0.5, random_state = 0)
# reset_index() is called without drop=True, so each frame keeps its original
# row labels in a new `index` column.
train, dev, test = train.reset_index(), dev.reset_index(), test.reset_index()

# Train on the raw post text with the binarized sub-token labels; the dev
# split is passed as the third and fourth arguments.
# NOTE(review): `hs` is presumably the training history returned by fit — confirm.
hs = model.fit(train.text_of_post, list(train.subtoken_labels), dev.text_of_post, dev.subtoken_labels)

  0%|          | 0/8804 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 8804/8804 [00:01<00:00, 5594.54it/s]
100%|██████████| 1101/1101 [00:00<00:00, 5702.98it/s]


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 00018: early stopping


### Evaluate on the Test Split

In [8]:
# Predict toxic offsets for the test posts and convert them to character offsets.
# NOTE(review): 0.5 is presumably the decision threshold — confirm against get_toxic_offsets.
pred_offsets = model.get_toxic_offsets(list(test.text_of_post), 0.5)
pred_char_offsets = model.get_toxic_char_offsets(list(test.text_of_post), pred_offsets)

# Pair each post's predicted offsets with its gold offsets once, then average
# the per-post scores for every metric.
scored_pairs = list(zip(pred_char_offsets, test.position))
f1 = np.mean([semeval2021.f1(pred, gold) for pred, gold in scored_pairs])
pr = np.mean([precision(pred, gold) for pred, gold in scored_pairs])
rec = np.mean([recall(pred, gold) for pred, gold in scored_pairs])

print("F1: ",f1)
print("Recall: ",rec)
print("Precision: ",pr)

100%|██████████| 1101/1101 [00:00<00:00, 5966.93it/s]


Stopped epoch:  17
F1:  0.5793681812237937
Recall:  0.5819030602295464
Precision:  0.5853350020075034
