# Requirements:


*   pandas
*   scipy
*   scikit-learn (imported as `sklearn`)


# Install ToxicSpans

In [7]:
# Clone the ToxicSpans repository: it provides the SemEval-2021 CSV data and the
# evaluation code (f1) that are used further down in this notebook.
!git clone https://github.com/ipavlopoulos/toxic_spans.git

Cloning into 'toxic_spans'...
remote: Enumerating objects: 408, done.[K
remote: Counting objects: 100% (140/140), done.[K
remote: Compressing objects: 100% (120/120), done.[K
remote: Total 408 (delta 49), reused 53 (delta 13), pack-reused 268[K
Receiving objects: 100% (408/408), 5.32 MiB | 10.45 MiB/s, done.
Resolving deltas: 100% (179/179), done.


# Install SpanBERT

In [1]:
# Clone the SpanBERT repository: it provides code/run_squad.py, the script used
# for training and evaluation below.
!git clone https://github.com/facebookresearch/SpanBERT.git

Cloning into 'SpanBERT'...
remote: Enumerating objects: 282, done.[K
remote: Counting objects: 100% (18/18), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 282 (delta 9), reused 9 (delta 5), pack-reused 264[K
Receiving objects: 100% (282/282), 372.39 KiB | 14.32 MiB/s, done.
Resolving deltas: 100% (161/161), done.


In [8]:
# Work from inside the SpanBERT checkout: the training cell calls the script
# through the relative path code/run_squad.py.
%cd SpanBERT

/content/SpanBERT


In [3]:
# Install the package versions pinned in SpanBERT's requirements file.
!pip install -r "/content/SpanBERT/requirements.txt"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting backcall==0.1.0
  Downloading backcall-0.1.0.zip (11 kB)
Collecting bitarray==1.2.0
  Downloading bitarray-1.2.0.tar.gz (48 kB)
[K     |████████████████████████████████| 48 kB 3.5 MB/s 
[?25hCollecting boto3==1.9.239
  Downloading boto3-1.9.239-py2.py3-none-any.whl (128 kB)
[K     |████████████████████████████████| 128 kB 13.8 MB/s 
[?25hCollecting botocore==1.12.239
  Downloading botocore-1.12.239-py2.py3-none-any.whl (5.7 MB)
[K     |████████████████████████████████| 5.7 MB 23.4 MB/s 
[?25hCollecting certifi==2019.9.11
  Downloading certifi-2019.9.11-py2.py3-none-any.whl (154 kB)
[K     |████████████████████████████████| 154 kB 67.1 MB/s 
Collecting decorator==4.4.1
  Downloading decorator-4.4.1-py2.py3-none-any.whl (9.2 kB)
Collecting docutils==0.15.2
  Downloading docutils-0.15.2-py3-none-any.whl (547 kB)
[K     |████████████████████████████████| 547 kB 63.1 MB/s 


# Convert Toxic Spans dataset to SQuAD format

In [21]:
def convert_csv_to_squad_format(df, train_split=True):
  """Convert a Toxic Spans dataframe to the SQuAD v2.0 dictionary layout.

  SpanBERT's SQuAD pipeline expects at most one answer span per example, while
  a Toxic Spans post may contain several disjoint toxic spans. Posts are
  therefore handled as follows:

  * no toxic offsets: the example is emitted with no answer and
    ``is_impossible=True``;
  * a single contiguous span: that span is the answer;
  * several disjoint spans: the post is dropped from the train split, and only
    its first contiguous span is kept as the answer for the dev/test splits.

  Parameters
  ----------
  df : pandas.DataFrame
      Dataset with a ``"text"`` column (the post) and a ``"spans"`` column
      (list of toxic character offsets into the post). The dataframe index is
      used, converted to ``str``, as the example id.
  train_split : bool, default=True
      Whether ``df`` is the train split. Controls how posts with several
      disjoint spans are handled (see above).

  Returns
  -------
  output_dict : dict
      Dictionary with keys ``"version"`` and ``"data"`` holding the dataset in
      the SQuAD format: one paragraph per kept post, each with a single
      question whose text is empty.
  """
  paragraphs = []

  for index, row in df.iterrows():
    text = row["text"]
    post_id = str(index)
    # Offsets describe a set of characters: sort and de-duplicate them so that
    # the first/last elements really are the smallest/largest offsets.
    spans = sorted(set(row["spans"]))

    if spans:
      is_impossible = False
      toxic_span_start = spans[0]

      # Walk to the end of the FIRST contiguous run of offsets, then stop.
      toxic_span_end = toxic_span_start
      for offset in spans[1:]:
        if offset - toxic_span_end > 1:
          break
        toxic_span_end = offset

      # The post has a single span iff the first run reaches the last offset.
      continuous = toxic_span_end == spans[-1]
      if train_split and not continuous:
        # Ignore train posts with multiple disjoint spans.
        continue

      toxic_text = text[toxic_span_start : toxic_span_end + 1]
      answers = [{"text": toxic_text, "answer_start": toxic_span_start}]

    else:
      answers = []
      is_impossible = True

    paragraphs.append({"qas": [{"question": "", "id": post_id,
                                "answers": answers,
                                "is_impossible": is_impossible}],
                       "context": text})

  data = [{"title": "", "paragraphs": paragraphs}]
  output_dict = {"version":"v2.0", "data": data}
  return output_dict

In [22]:
from ast import literal_eval
import json
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split

# Locate the Toxic Spans CSV files inside the cloned repository.
datafolder = "/content/toxic_spans/SemEval2021/data/" # @param { type: "string" }
datapath = Path(datafolder)
train_file_name = "tsd_train.csv" # @param { type: "string" }
train_datapath = datapath / train_file_name
test_file_name = "tsd_test.csv" # @param { type: "string" }
test_datapath = datapath / test_file_name

# The "spans" column is read as text; parse each cell back into a Python object.
df_train = pd.read_csv(train_datapath)
df_train["spans"] = df_train["spans"].apply(literal_eval)

df_test = pd.read_csv(test_datapath)
df_test["spans"] = df_test["spans"].apply(literal_eval)

# Hold out part of the training data as a validation set (seeded split).
seed = 42
val_size = 0.1 # @param { type: "number" }
df_train, df_val = train_test_split(df_train, random_state=seed,
                                    test_size=val_size)

# Write each split in the SQuAD format. Only the train split is converted with
# train_split=True.
for squad_path, split_df, is_train_split in (
    ("/content/valid.json", df_val, False),
    ("/content/test.json", df_test, False),
    ("/content/train.json", df_train, True),
):
  with open(squad_path, 'w') as json_file:
    json.dump(convert_csv_to_squad_format(split_df, train_split=is_train_split),
              json_file)

# Train SpanBERT on ToxicSpans

In [23]:
# Training configuration.
# Every value below is a Colab form field (`# @param`) and is exported to the
# environment with `%env`, so that the `!python` command at the end of the cell
# can read it back as "${NAME}".

model = "spanbert-base-cased" # @param { type: "string" }
%env MODEL = $model
train_file = "/content/train.json" # @param { type: "string" }
%env TRAIN_FILE = $train_file
dev_file = "/content/valid.json" # @param { type: "string" }
%env DEV_FILE = $dev_file
test_file = "/content/test.json" # @param { type: "string" }
%env TEST_FILE = $test_file
train_batch_size = 32 # @param { type: "integer" }
%env TRAIN_BATCH_SIZE = $train_batch_size
eval_batch_size = 32 # @param { type: "integer" }
%env EVAL_BATCH_SIZE = $eval_batch_size
learning_rate = 2e-5 # @param { type: "number" }
%env LEARNING_RATE = $learning_rate
num_train_epochs = 4 # @param { type: "integer" }
%env NUM_TRAIN_EPOCHS = $num_train_epochs
max_seq_length = 128 # @param { type: "integer" }
%env MAX_SEQ_LENGTH = $max_seq_length
doc_stride = 128 # @param { type: "integer" }
%env DOC_STRIDE = $doc_stride
eval_metric = "best_f1" # @param { type: "string" }
%env EVAL_METRIC = $eval_metric
output_dir = "/content/out_base" # @param { type: "string" }
%env OUTPUT_DIR = $output_dir


# Run SpanBERT's SQuAD script for training, evaluation on the dev file and
# evaluation on the test file. The script path is relative, so the current
# directory must be the SpanBERT checkout (see the `%cd SpanBERT` cell).
# The JSON files contain examples without an answer (is_impossible), hence the
# --version_2_with_negative flag.
# NOTE(review): the evaluation section reads predictions.json from `output_dir`;
# presumably written by --eval_test -- confirm against run_squad.py.
!python code/run_squad.py \
                          --do_train \
                          --do_eval \
                          --eval_test \
                          --model="${MODEL}" \
                          --train_file="${TRAIN_FILE}" \
                          --dev_file="${DEV_FILE}" \
                          --test_file="${TEST_FILE}" \
                          --train_batch_size="${TRAIN_BATCH_SIZE}" \
                          --eval_batch_size="${EVAL_BATCH_SIZE}"  \
                          --learning_rate="${LEARNING_RATE}" \
                          --num_train_epochs="${NUM_TRAIN_EPOCHS}" \
                          --max_seq_length="${MAX_SEQ_LENGTH}" \
                          --doc_stride="${DOC_STRIDE}" \
                          --eval_metric="${EVAL_METRIC}" \
                          --output_dir="${OUTPUT_DIR}" \
                          --version_2_with_negative

env: MODEL=spanbert-base-cased
env: TRAIN_FILE=/content/train.json
env: DEV_FILE=/content/valid.json
env: TEST_FILE=/content/test.json
env: TRAIN_BATCH_SIZE=32
env: EVAL_BATCH_SIZE=32
env: LEARNING_RATE=2e-05
env: NUM_TRAIN_EPOCHS=4
env: MAX_SEQ_LENGTH=128
env: DOC_STRIDE=128
env: EVAL_METRIC=best_f1
env: OUTPUT_DIR=out_base
07/07/2022 09:06:18 - INFO - __main__ - device: cuda, n_gpu: 1, 16-bits training: False
07/07/2022 09:06:18 - INFO - __main__ - Namespace(dev_file='/content/valid.json', do_eval=True, do_lower_case=False, do_train=True, doc_stride=128, eval_batch_size=32, eval_metric='best_f1', eval_per_epoch=10, eval_test=True, fp16=False, gradient_accumulation_steps=1, learning_rate=2e-05, loss_scale=0, max_answer_length=30, max_query_length=64, max_seq_length=128, model='spanbert-base-cased', n_best_size=20, no_cuda=False, num_train_epochs=4.0, output_dir='out_base', seed=42, test_file='/content/test.json', train_batch_size=32, train_file='/content/train.json', train_mode='rando

# Evaluate SpanBERT on Toxic Spans

## Convert SpanBERT predictions to ToxicSpans predictions

In [61]:
# Reload the test set (with ground truth) and read the SpanBERT predictions.

# Start again from the CSV so that this cell always yields a fresh df_test.
df_test = pd.read_csv(test_datapath)
df_test["spans"] = df_test["spans"].apply(literal_eval)

output_filename = "predictions.json" # @param { type: "string" }
output_path = Path(output_dir) / output_filename

with open(output_path) as pred_json:
    pred_data = json.load(pred_json)

# One row per key of the JSON object, with the associated value in column "pred".
df_pred = pd.DataFrame.from_dict(pred_data, orient='index', columns=['pred'])

In [37]:
def convert_textpred_to_spanpred(comment_text, pred_text):
  """Map a predicted toxic string to its character offsets in the comment.

  The prediction is located through its first occurrence in the comment.

  Parameters
  ----------
  comment_text : str
      The text of the comment
  pred_text : str
      The text of the SpanBERT prediction

  Returns
  -------
  list
      The character offsets covered by the first occurrence of ``pred_text``
      in ``comment_text``; empty if the prediction is empty or does not occur
      verbatim in the comment.
  """
  # An empty prediction means that nothing was detected as toxic.
  if len(pred_text) == 0:
    return []

  # str.find returns -1 when the prediction is not a substring of the comment.
  first_offset = comment_text.find(pred_text)
  if first_offset < 0:
    return []

  return list(range(first_offset, first_offset + len(pred_text)))

In [56]:
# Add predictions to the test dataframe.
# Each prediction is matched to its post through the example id (the dataframe
# index converted to str, as written to test.json) rather than through the
# order of the keys in predictions.json. `.loc` raises a KeyError if a post has
# no prediction.
predictions = df_pred.loc[df_test.index.astype(str), "pred"].tolist()
if "predictions" in df_test.columns:
  # The cell has already been run: overwrite the column, since `insert` would
  # raise a ValueError on an existing column name.
  df_test["predictions"] = predictions
else:
  df_test.insert(2, "predictions", predictions)

# Convert each predicted text into character offsets in its post.
df_test["pred_spans"] = df_test.apply(
    lambda row: convert_textpred_to_spanpred(row.text, row.predictions),
    axis=1)

In [60]:
# Compute F1 scores per post and average
from scipy.stats import sem

from toxic_spans.SemEval2021.evaluation.semeval2021 import f1

def _post_f1(row):
  """Return the F1 score between the predicted and gold offsets of one post."""
  return f1(row.pred_spans, row.spans)


# One score per post, then the mean and its standard error over the test set.
f1_scores = df_test.apply(_post_f1, axis=1)
mean_f1 = f1_scores.mean()
stderr_f1 = sem(f1_scores)
print(f"F1 score on test ToxicSpans: {mean_f1:.4f} ± {stderr_f1:.4f}")

F1 score on test ToxicSpans: 0.6562 ± 0.0097
