### Install Requirements

In [17]:
!pip install -q -U "tensorflow-text==2.8.*"
#!pip install pandas
#!pip install scipy
#!pip install sklearn

### Imports

In [18]:
import numpy as np 
import pandas as pd
from ast import literal_eval
from tqdm import tqdm

### Install ToxicSpans

In [19]:
!git clone https://github.com/ipavlopoulos/toxic_spans

fatal: destination path 'toxic_spans' already exists and is not an empty directory.


### Define Metrics

In [20]:
#!git clone https://github.com/ipavlopoulos/toxic_spans.git
!pip install lime
from toxic_spans.SemEval2021.evaluation import semeval2021
from toxic_spans.SemEval2021.baselines import models


def precision(predictions, gold):
  if len(gold) == 0:
    return 1. if len(predictions) == 0 else 0.
  if len(predictions) == 0:
    return 0.
  predictions_set = set(predictions)
  gold_set = set(gold)
  nom = len(predictions_set.intersection(gold_set))
  denom = len(predictions_set)
  return float(nom)/float(denom)

def recall(predictions, gold):
  if len(gold) == 0:
    return 1. if len(predictions) == 0 else 0.
  if len(predictions) == 0:
    return 0.
  predictions_set = set(predictions)
  gold_set = set(gold)
  nom = len(predictions_set.intersection(gold_set))
  denom = len(gold_set)
  return float(nom)/float(denom)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Method for Preparing the dataset (literal_eval some columns)

In [21]:
def prepare_dataset(dataset):
  dataset.probability = dataset.probability.apply(literal_eval)
  dataset.position = dataset.position.apply(literal_eval)
  dataset.text = dataset.text.apply(literal_eval)
  dataset['type'] = dataset['type'].apply(literal_eval)
  dataset.position_probability = dataset.position_probability.apply(literal_eval)
  if 'position_lbl'in dataset.columns:
    dataset.position_lbl = dataset.position_lbl.apply(literal_eval)
  return dataset

### Align tokens with token labels

In [22]:
#for each token extract the probabilistic label 
def extract_xy(data, tokenizer = None):
  X = []
  y = []
  t_of = []
  for i in tqdm(range(data.shape[0])):
    toks = []
    labels = []
    offsets = []
    (tokens, start_offsets, end_offsets) = tokenizer.tokenize_with_offsets(data.iloc[i].text_of_post)
    for j in  range(len(tokens)):
      span = []
      token = data.iloc[i].text_of_post[start_offsets[j]: end_offsets[j]]
      token_offset = [i for i in range(start_offsets[j], end_offsets[j])]
      for char_off in token_offset:
        if char_off in data.iloc[i].position_probability.keys(): # if in a span
          span.append(data.position_probability.iloc[i][char_off])
        else: #char not in a span
          span.append(0)
      labels.append(np.mean(span)) #this token has toxicity = with the mean of its chars
      toks.append(token)
      offsets.append([i for i in range(start_offsets[j], end_offsets[j])])
    y.append(labels)
    X.append(toks)
    t_of.append(offsets)
  return X,y,t_of



data = pd.read_csv("toxic_spans/ACL2022/data/toxic_spans.csv")
data = prepare_dataset(data)

import tensorflow_text as tf_text

tokenizer = tf_text.UnicodeScriptTokenizer()
x, y,t = extract_xy(data, tokenizer)

data['tokens'], data['token_labels'], data['token_offsets'] = x, y, t

100%|██████████| 11006/11006 [35:22<00:00,  5.19it/s]


### Train BILSTM_SEQ on a Random Train/dev/Test split

In [23]:
from toxic_spans.ACL2022.models.seq import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import *

train, dev = train_test_split(data, test_size = 0.2, random_state = 0)
dev, test = train_test_split(dev, test_size = 0.5, random_state = 0)
train, dev, test = train.reset_index(), dev.reset_index(), test.reset_index()

model = BILSTM_SEQ(patience = 5)
hs = model.fit(train.tokens, train.token_labels, dev.tokens, dev.token_labels)

Vocab size:  31873
Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 128)]             0         
                                                                 
 embedding (Embedding)       (None, 128, 200)          6374600   
                                                                 
 dropout_37 (Dropout)        (None, 128, 200)          0         
                                                                 
 bidirectional (Bidirectiona  (None, 128, 256)         336896    
 l)                                                              
                                                                 
 layer_normalization (LayerN  (None, 128, 256)         512       
 ormalization)                                                   
                                                                 
 time_distributed_1 (TimeDis  (None, 128

### Evaluate

In [25]:
pred_offsets = model.get_toxic_offsets(test.tokens, 0.5)
pred_char_offsets = model.get_toxic_char_offsets(test.token_offsets, pred_offsets)

f1 = np.mean([semeval2021.f1(p,g) for p,g in list(zip(pred_char_offsets, test.position))])
pr = np.mean([precision(p,g) for p,g in list(zip(pred_char_offsets, test.position))])
rec = np.mean([recall(p,g) for p,g in list(zip(pred_char_offsets, test.position))])

print("F1: ",f1)
print("Recall: ",rec)
print("Precision: ",pr)

F1:  0.5785078652531558
Recall:  0.5809530643267494
Precision:  0.585381981022308
