### Install ToxicSpans

In [2]:
# Fetch the ToxicSpans repository into ./toxic_spans; later cells import code
# from it and read its CSV data. Re-running this cell while the directory
# already exists makes `git clone` fail instead of refreshing the checkout.
!git clone https://github.com/ipavlopoulos/toxic_spans

Cloning into 'toxic_spans'...
remote: Enumerating objects: 437, done.
remote: Counting objects: 100% (173/173), done.
remote: Compressing objects: 100% (123/123), done.
remote: Total 437 (delta 63), reused 138 (delta 44), pack-reused 264
Receiving objects: 100% (437/437), 5.37 MiB | 21.82 MiB/s, done.
Resolving deltas: 100% (190/190), done.


### Install Requirements

In [3]:
# Install the requirements file shipped inside the cloned repository (the
# recorded output below shows exact-version pins such as tensorflow==2.7.0).
!pip install -r toxic_spans/ACL2022/requirements_for_toxic_spans_exps.txt

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tensorflow==2.7.0
  Downloading https://us-python.pkg.dev/colab-wheels/public/tensorflow/tensorflow-2.7.0%2Bzzzcolab20220506150900-cp37-cp37m-linux_x86_64.whl (665.5 MB)
[K     |████████████████████████████████| 665.5 MB 20 kB/s 
[?25hCollecting tensorflow-text==2.7.3
  Downloading tensorflow_text-2.7.3-cp37-cp37m-manylinux2010_x86_64.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 5.1 MB/s 
[?25hCollecting transformers==4.12.4
  Downloading transformers-4.12.4-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 67.2 MB/s 
[?25hCollecting numpy==1.21.4
  Downloading numpy-1.21.4-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (15.7 MB)
[K     |████████████████████████████████| 15.7 MB 30.7 MB/s 
[?25hCollecting pandas==1.3.4
  Downloading pandas-1.3.4-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.3 MB)
[K  

### Imports

In [4]:
import numpy as np 
import pandas as pd
from ast import literal_eval
from tqdm import tqdm

### Define Metrics

In [5]:
from toxic_spans.SemEval2021.evaluation import semeval2021
from toxic_spans.SemEval2021.baselines import models

def precision(predictions, gold):
  """Precision of the predicted offsets with respect to the gold offsets.

  Args:
    predictions: collection of predicted offsets.
    gold: collection of ground-truth offsets.

  Returns:
    float: number of distinct predicted offsets that are also gold, divided by
    the number of distinct predicted offsets. If `gold` is empty the score is
    1.0 when `predictions` is empty too and 0.0 otherwise; if only
    `predictions` is empty the score is 0.0.
  """
  # Nothing to find: a perfect score only if nothing was predicted either.
  if not len(gold):
    return 0. if len(predictions) else 1.
  if not len(predictions):
    return 0.
  predicted = set(predictions)
  hits = predicted & set(gold)
  return float(len(hits)) / float(len(predicted))

def recall(predictions, gold):
  """Recall of the predicted offsets with respect to the gold offsets.

  Args:
    predictions: collection of predicted offsets.
    gold: collection of ground-truth offsets.

  Returns:
    float: number of distinct gold offsets that were predicted, divided by the
    number of distinct gold offsets. If `gold` is empty the score is 1.0 when
    `predictions` is empty too and 0.0 otherwise; if only `predictions` is
    empty the score is 0.0.
  """
  # Nothing to find: a perfect score only if nothing was predicted either.
  if not len(gold):
    return 0. if len(predictions) else 1.
  if not len(predictions):
    return 0.
  expected = set(gold)
  hits = set(predictions) & expected
  return float(len(hits)) / float(len(expected))

### Prepare the dataset (parse the string-encoded columns with `literal_eval`)

In [6]:
def prepare_dataset(dataset):
  """Parse the string-encoded annotation columns of `dataset` in place.

  Every value of the columns `probability`, `position`, `text`, `type`,
  `position_probability` and, when the frame has it, `position_lbl` is passed
  through `literal_eval`, so each value is expected to be a Python-literal
  string.

  Args:
    dataset: pandas DataFrame containing the columns listed above.

  Returns:
    The same DataFrame object that was passed in (it is modified in place).
  """
  def parsed(column):
    # Element-wise conversion of one column from literal strings to objects.
    return column.apply(literal_eval)

  dataset.probability = parsed(dataset.probability)
  dataset.position = parsed(dataset.position)
  dataset.text = parsed(dataset.text)
  dataset['type'] = parsed(dataset['type'])
  dataset.position_probability = parsed(dataset.position_probability)
  # `position_lbl` is optional: it is only parsed when the frame carries it.
  has_position_lbl = 'position_lbl' in dataset.columns
  if has_position_lbl:
    dataset.position_lbl = parsed(dataset.position_lbl)
  return dataset

### Align tokens with token labels

In [7]:
#for each token extract the probabilistic label 
def extract_xy(data, tokenizer = None):
  """Tokenize every post and derive a soft toxicity label for each token.

  A token's label is the mean, over the offsets the token covers, of the
  value stored for that offset in the row's `position_probability` mapping;
  offsets missing from the mapping contribute 0. A zero-width token (no
  offsets at all) is labelled 0.

  Args:
    data: DataFrame with a `text_of_post` column (the text to tokenize) and a
      `position_probability` column (mapping: offset -> probability).
    tokenizer: object exposing `tokenize_with_offsets(text)` that returns
      `(tokens, start_offsets, end_offsets)`. Despite the `None` default it
      is required as soon as `data` has at least one row.

  Returns:
    Tuple `(X, y, t_of)` with one entry per row of `data`:
      X    -- list of token strings (slices of the post text),
      y    -- list of token labels,
      t_of -- for every token, the list of offsets it covers.
  """
  X, y, t_of = [], [], []
  for i in tqdm(range(data.shape[0])):
    # Fetch the row once per post: `data.iloc[i]` builds a new Series on every
    # call, so repeating it per token and per character dominated the runtime.
    row = data.iloc[i]
    text = row.text_of_post
    char_probs = row.position_probability
    (tokens, start_offsets, end_offsets) = tokenizer.tokenize_with_offsets(text)
    toks, labels, offsets = [], [], []
    # NOTE(review): the offsets index the Python string directly and are looked
    # up in `position_probability`. If the tokenizer reports UTF-8 byte offsets
    # while the mapping is keyed by character offsets, non-ASCII posts would be
    # misaligned -- confirm against the tokenizer's documentation.
    for j in range(len(tokens)):
      start, end = start_offsets[j], end_offsets[j]
      token_offset = list(range(start, end))
      # Offsets outside every annotated span count as probability 0.
      span = [char_probs.get(char_off, 0) for char_off in token_offset]
      # Guard the empty case: np.mean([]) would produce NaN and a warning.
      labels.append(np.mean(span) if span else 0.)
      toks.append(text[start:end])
      offsets.append(token_offset)
    y.append(labels)
    X.append(toks)
    t_of.append(offsets)
  return X,y,t_of

from toxic_spans.ACL2022.models.seq import *

# Load the annotated posts and parse their string-encoded columns right away.
data = prepare_dataset(pd.read_csv("toxic_spans/ACL2022/data/toxic_spans.csv"))

# No explicit import of `tf_text` is visible in this notebook; presumably the
# star import above brings it into scope -- TODO confirm and import it by name.
tokenizer = tf_text.UnicodeScriptTokenizer()

# Token-level alignment of the whole corpus (the recorded run below took
# roughly 34 minutes for 11006 posts).
x, y, t = extract_xy(data, tokenizer)

# Store the aligned tokens, labels and offsets next to the original columns.
data['tokens'] = x
data['token_labels'] = y
data['token_offsets'] = t

100%|██████████| 11006/11006 [33:41<00:00,  5.44it/s]


### Train BILSTM_SEQ on a random train/dev/test split

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

# Random 80/10/10 split: hold out 20% of the posts, then cut the held-out part
# in half. Both splits use a fixed seed so the partition is repeatable.
train, dev = train_test_split(data, random_state = 0, test_size = 0.2)
dev, test = train_test_split(dev, random_state = 0, test_size = 0.5)

# reset_index() without drop=True renumbers the rows from 0 and keeps the
# previous row labels in a new `index` column.
train = train.reset_index()
dev = dev.reset_index()
test = test.reset_index()

# The dev tokens/labels are passed after the training ones; presumably they
# serve as validation data for the `patience` setting -- verify in BILSTM_SEQ.
model = BILSTM_SEQ(patience = 5)
hs = model.fit(train.tokens, train.token_labels, dev.tokens, dev.token_labels)

Vocab size:  31873
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 128)]             0         
                                                                 
 embedding (Embedding)       (None, 128, 200)          6374600   
                                                                 
 dropout (Dropout)           (None, 128, 200)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 128, 256)         336896    
 l)                                                              
                                                                 
 layer_normalization (LayerN  (None, 128, 256)         512       
 ormalization)                                                   
                                                                 
 time_distributed (TimeDistr  (None, 128, 

### Evaluate

In [9]:
# 0.5 is presumably the decision threshold on the predicted token scores --
# verify against BILSTM_SEQ.get_toxic_offsets.
pred_offsets = model.get_toxic_offsets(test.tokens, 0.5)
# Map the predicted token positions to the character offsets of those tokens.
pred_char_offsets = model.get_toxic_char_offsets(test.token_offsets, pred_offsets)

# Each metric is computed per post against the gold `position` offsets and
# then averaged over all test posts.
f1 = np.mean([
  semeval2021.f1(p, g) for p, g in zip(pred_char_offsets, test.position)
])
pr = np.mean([
  precision(p, g) for p, g in zip(pred_char_offsets, test.position)
])
rec = np.mean([
  recall(p, g) for p, g in zip(pred_char_offsets, test.position)
])

print("F1: ",f1)
print("Recall: ",rec)
print("Precision: ",pr)

F1:  0.5952969505512168
Recall:  0.5975387932212997
Precision:  0.6033353567413513
