In [None]:
!pip install utils/FARM --quiet

[K     |████████████████████████████████| 776.8MB 22kB/s 
[K     |████████████████████████████████| 133kB 51.9MB/s 
[K     |████████████████████████████████| 51kB 7.6MB/s 
[K     |████████████████████████████████| 14.2MB 19.8MB/s 
[K     |████████████████████████████████| 1.5MB 43.4MB/s 
[K     |████████████████████████████████| 327kB 46.8MB/s 
[K     |████████████████████████████████| 2.5MB 37.5MB/s 
[K     |████████████████████████████████| 1.2MB 46.0MB/s 
[K     |████████████████████████████████| 81kB 10.7MB/s 
[K     |████████████████████████████████| 7.6MB 32.7MB/s 
[K     |████████████████████████████████| 61kB 9.2MB/s 
[K     |████████████████████████████████| 81kB 283kB/s 
[K     |████████████████████████████████| 1.1MB 37.8MB/s 
[K     |████████████████████████████████| 174kB 51.8MB/s 
[K     |████████████████████████████████| 348kB 45.9MB/s 
[K     |████████████████████████████████| 153kB 58.1MB/s 
[K     |████████████████████████████████| 901kB 36.3MB/s 
[K

In [None]:
# Importing common utilities
import ast
import os
import time
import gc
import torch
import numpy as np
import pandas as pd

In [None]:
# FARM library imports
from farm.modeling.tokenization import Tokenizer
from farm.data_handler.data_silo import DataSilo
from farm.modeling.language_model import LanguageModel
from farm.modeling.prediction_head import TextClassificationHead, TokenClassificationHead
from farm.modeling.adaptive_model import AdaptiveModel
from farm.modeling.optimization import initialize_optimizer
from farm.train import Trainer
from farm.utils import set_all_seeds, initialize_device_settings

05/30/2021 23:58:21 - INFO - farm.modeling.prediction_head -   Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex .


In [None]:
from farm.data_handler.processor import Processor
from tokenizers.pre_tokenizers import WhitespaceSplit
from farm.data_handler.samples import (
    Sample,
    SampleBasket,
)
from farm.data_handler.utils import expand_labels

class MTLProcessor(Processor):
    """FARM processor that builds features for several tasks over the same text.

    Every input dict carries a whitespace-joined text and, optionally, one label
    entry per registered task. The constructor hands an empty task dict to the
    base class, so tasks have to be registered after construction (this notebook
    does so with ``add_task``).
    """

    def __init__(
        self,
        tokenizer,
        max_seq_len,
        data_dir,
        train_filename,
        test_filename,
        delimiter,
        dev_split=0.0,
        dev_filename=None,
        label_list=None,
        metric=None,
        proxies=None,
        **kwargs
    ):
        """Store the CSV delimiter and forward the remaining settings to ``Processor``.

        :param tokenizer: tokenizer used in ``dataset_from_dicts`` (``batch_encode_plus``).
        :param max_seq_len: length every sequence is truncated / padded to.
        :param data_dir: forwarded to the base class.
        :param train_filename: forwarded to the base class.
        :param test_filename: forwarded to the base class.
        :param delimiter: column separator used by ``file_to_dicts`` when reading CSV files.
        :param dev_split: forwarded to the base class.
        :param dev_filename: forwarded to the base class.
        :param label_list: accepted for signature compatibility; not used here.
        :param metric: accepted for signature compatibility; not used here.
        :param proxies: forwarded to the base class.
        :param kwargs: accepted and ignored.
        """
        self.delimiter = delimiter

        super(MTLProcessor, self).__init__(
            tokenizer=tokenizer,
            max_seq_len=max_seq_len,
            train_filename=train_filename,
            dev_filename=dev_filename,
            test_filename=test_filename,
            dev_split=dev_split,
            data_dir=data_dir,
            tasks={},
            proxies=proxies
        )

    def file_to_dicts(self, file: str) -> [dict]:
        """Read a CSV file and turn every row into an input dict.

        The file must provide the columns ``post_tokens``, ``post_label`` and
        ``toxic_tokens``. ``post_tokens`` and ``toxic_tokens`` are Python-literal
        lists stored as strings and are parsed with ``ast.literal_eval``.

        :param file: path of the CSV file.
        :return: list of dicts with the keys ``text``,
            ``document_level_task_label`` and ``token_level_task_label``.
        """
        # Honour the delimiter given to the constructor instead of silently
        # falling back to the pandas default.
        df = pd.read_csv(file, sep=self.delimiter)

        dicts = []
        for text, label, tokens in zip(df.post_tokens.values, df.post_label.values, df.toxic_tokens.values):
            words = ast.literal_eval(text)
            token_labels = ast.literal_eval(tokens)
            dicts.append({
                "text": " ".join(words),
                # Keys are hard-coded; presumably they must match the label names
                # of the registered tasks - verify against the add_task calls.
                "document_level_task_label": label,
                "token_level_task_label": list(map(str, token_labels)),
            })
        return dicts

    @staticmethod
    def _get_start_of_word(word_ids):
        """Return a 0/1 mask that marks the tokens at which a new word starts.

        :param word_ids: per-token word index, with ``None`` for tokens that
            belong to no word.
        :return: list of ints, same length as ``word_ids``; 1 where the word
            index increases compared to the previous token, else 0. The first
            position is always 0.
        """
        words = np.array(word_ids)
        # Tokens without a word get -1 so that the difference below is defined.
        words[words == None] = -1  # noqa: E711 - element-wise comparison is intended
        start_of_word_single = [0] + list(np.ediff1d(words) > 0)
        start_of_word_single = [int(x) for x in start_of_word_single]
        return start_of_word_single

    # Most of the code is copied from NERProcessor - dataset_from_dicts()
    def dataset_from_dicts(self, dicts, indices=None, return_baskets=False, non_initial_token="X"):
        """Tokenize the input dicts and build the dataset for all registered tasks.

        :param dicts: input dicts; each needs a ``text`` key and may carry one
            label entry per task.
        :param indices: optional internal ids, one per dict; the position in
            ``dicts`` is used when omitted.
        :param return_baskets: when true, the tokenization details are kept on
            the samples and the baskets are appended to the returned tuple.
        :param non_initial_token: label passed to ``expand_labels`` for tokens
            that do not start a word.
        :return: ``(dataset, tensor_names, problematic_sample_ids)`` plus the
            baskets as a fourth element when ``return_baskets`` is true.
        """
        self.baskets = []
        self.pre_tokenizer = WhitespaceSplit()

        texts = [x["text"] for x in dicts]
        words_and_spans = [self.pre_tokenizer.pre_tokenize_str(x) for x in texts]
        words = [[x[0] for x in y] for y in words_and_spans]
        word_spans_batch = [[x[1] for x in y] for y in words_and_spans]

        tokenized_batch = self.tokenizer.batch_encode_plus(
            words,
            return_offsets_mapping=True,
            return_special_tokens_mask=True,
            return_token_type_ids=True,
            return_attention_mask=True,
            truncation=True,
            max_length=self.max_seq_len,
            padding="max_length",
            is_split_into_words=True,
        )

        for i in range(len(dicts)):
            tokenized = tokenized_batch[i]
            d = dicts[i]
            id_external = self._id_from_dict(d)
            if indices:
                id_internal = indices[i]
            else:
                id_internal = i

            input_ids = tokenized.ids
            segment_ids = tokenized.type_ids
            initial_mask = self._get_start_of_word(tokenized.words)
            assert len(initial_mask) == len(input_ids)

            padding_mask = tokenized.attention_mask

            if return_baskets:
                token_to_word_map = tokenized.words
                word_spans = word_spans_batch[i]
                tokenized_dict = {
                    "tokens": tokenized.tokens,
                    "word_spans": word_spans,
                    "token_to_word_map": token_to_word_map,
                    "start_of_word": initial_mask
                }
            else:
                tokenized_dict = {}

            feature_dict = {
                "input_ids": input_ids,
                "padding_mask": padding_mask,
                "segment_ids": segment_ids,
                "initial_mask": initial_mask,
            }

            for task_name, task in self.tasks.items():
                # Reset per task so that nothing leaks in from a previous task
                # or sample (e.g. for an unknown task_type).
                label_ids = None
                labels_token = None
                try:
                    label_name = task["label_name"]
                    labels_word = d[label_name]
                    label_list = task["label_list"]
                    label_tensor_name = task["label_tensor_name"]

                    if task["task_type"] == "classification":
                        label_ids = [label_list.index(labels_word)]
                    elif task["task_type"] == "ner":
                        labels_token = expand_labels(labels_word, initial_mask, non_initial_token)
                        label_ids = [label_list.index(lt) for lt in labels_token]
                except ValueError:
                    label_ids = None
                    # Report the labels that were actually being converted:
                    # the expanded token labels for "ner", otherwise the raw
                    # label(s) of the sample.
                    if labels_token is not None:
                        attempted = labels_token
                    elif isinstance(labels_word, (list, tuple)):
                        attempted = labels_word
                    else:
                        attempted = [labels_word]
                    problematic_labels = set(attempted).difference(set(label_list))
                    print(f"[Task: {task_name}] Could not convert labels to ids via label_list!"
                                    f"\nWe found a problem with labels {str(problematic_labels)}")
                except KeyError:
                    # The dict has no label for this task or the task definition
                    # lacks a key: build the sample without this label tensor.
                    label_ids = None
                if label_ids:
                    feature_dict[label_tensor_name] = label_ids

            curr_sample = Sample(id=None,
                                 clear_text=d,
                                 tokenized=tokenized_dict,
                                 features=[feature_dict])
            curr_basket = SampleBasket(id_internal=id_internal,
                                       raw=d,
                                       id_external=id_external,
                                       samples=[curr_sample])
            self.baskets.append(curr_basket)

        # Log an example only when no indices were given or the chunk contains index 0.
        if not indices or 0 in indices:
            self._log_samples(1)

        dataset, tensor_names = self._create_dataset()
        ret = [dataset, tensor_names, self.problematic_sample_ids]
        if return_baskets:
            ret.append(self.baskets)
        return tuple(ret)

In [None]:
from sklearn.metrics import f1_score

def custom_f1_score(y_true, y_pred):
  """Average the macro F1 of every (truth, prediction) sequence pair.

  Returns a dict with the mean score under "f1 macro score" and the number of
  scored pairs under "total".
  """
  per_sample = [f1_score(truth, pred, average='macro') for truth, pred in zip(y_true, y_pred)]
  n_samples = len(per_sample)
  return {"f1 macro score" : sum(per_sample) / n_samples, "total" : n_samples}

In [None]:
from typing import List

def mtl_loss_agg(individual_losses: List[torch.Tensor], global_step=None, batch=None):
    """Aggregate the per-head losses into a single scalar by summing them.

    Each head's loss tensor is reduced with ``torch.sum`` and the results are
    added up, so the function works for any number of prediction heads.

    :param individual_losses: one loss tensor per prediction head (at least one).
    :param global_step: unused; part of the aggregation-function signature.
    :param batch: unused; part of the aggregation-function signature.
    :return: scalar tensor holding the total loss.
    :raises IndexError: if ``individual_losses`` is empty.
    """
    loss = torch.sum(individual_losses[0])
    for head_loss in individual_losses[1:]:
        loss = loss + torch.sum(head_loss)
    return loss

In [None]:
# --- Experiment configuration ---
# NOTE(review): LANG_MODEL is an uncased checkpoint while DO_LOWER_CASE is False - confirm this is intended.
DO_LOWER_CASE = False
LANG_MODEL = "bert-base-uncased"

# Dataset locations. All three are absolute so they resolve the same way
# regardless of the working directory.
TRAIN_FILE = "/datasets/hatexplain_train.csv"
DEV_FILE = "/datasets/hatexplain_dev.csv"
TEST_FILE = "/datasets/hatexplain_test.csv"

# Training hyper-parameters.
MAX_SEQ_LEN = 128
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
N_EPOCHS = 1
EMBEDS_DROPOUT_PROB = 0.1
EVALUATE_EVERY = 500

# One full training / inference run is performed per seed.
random_seed_list = [3, 42]
DEVICE, N_GPU = initialize_device_settings(use_cuda=True)

05/30/2021 23:58:36 - INFO - farm.utils -   Using device: CUDA 
05/30/2021 23:58:36 - INFO - farm.utils -   Number of GPUs: 1
05/30/2021 23:58:36 - INFO - farm.utils -   Distributed Training: False
05/30/2021 23:58:36 - INFO - farm.utils -   Automatic Mixed Precision: None


In [None]:
# Load the test split; per-seed predictions are attached to this frame later on.
test_result_data = pd.read_csv("/datasets/hatexplain_test.csv", delimiter=",")

# Inference input: one {"text": ...} dict per post, with the stored token list
# parsed and joined by single spaces.
test_texts = [
  {"text": " ".join(ast.literal_eval(raw_tokens))}
  for raw_tokens in test_result_data.post_tokens.values
]

In [None]:
# Echo the key run settings so they are recorded in the cell output.
LANG_MODEL, random_seed_list, N_EPOCHS

('bert-base-uncased', [3, 42], 1)

In [None]:
import shutil
from pathlib import Path

from farm.evaluation.metrics import register_metrics
from farm.infer import Inferencer
from farm.train import EarlyStopping

# ---- Loop-invariant setup ----
NER_LABELS = ["X", "0", "1"]
LABEL_LIST = ["normal", "offensive", "hatespeech"]

# The trained model is written here and loaded back from here for inference.
save_dir = Path("/content/early-stopping-model")

# NOTE: the metric is registered under the name 'f1_weighted', but
# custom_f1_score averages per-sequence *macro* F1 scores.
metric = 'f1_weighted'
register_metrics(metric, custom_f1_score)

# ---- One full train / save / predict cycle per seed ----
for random_seed in random_seed_list:
    # Clean up
    gc.collect()
    torch.cuda.empty_cache()

    # Set the random seed
    set_all_seeds(seed=random_seed)

    # Start from an empty model directory (pure-Python replacement for `rm -rf`).
    shutil.rmtree(save_dir, ignore_errors=True)

    tokenizer = Tokenizer.load(
        pretrained_model_name_or_path=LANG_MODEL,
        do_lower_case=DO_LOWER_CASE,
        # add_prefix_space=True, # For roberta only
    )

    processor = MTLProcessor(
        data_dir=".",
        tokenizer=tokenizer,
        max_seq_len=MAX_SEQ_LEN,
        train_filename=TRAIN_FILE,
        test_filename=TEST_FILE,
        dev_filename=DEV_FILE,
        delimiter=",",
    )

    processor.add_task(name="document_level_task", label_list=LABEL_LIST, metric="acc", text_column_name="text", label_column_name="label", task_type="classification")
    processor.add_task(name="token_level_task", label_list=NER_LABELS, metric=metric, text_column_name="text", label_column_name="tokens", task_type="ner")

    data_silo = DataSilo(processor=processor, batch_size=BATCH_SIZE)

    # Built for convenience, but currently NOT passed to the Trainer
    # (see the commented-out keyword below).
    earlystopping = EarlyStopping(
        metric="loss", mode="min",
        save_dir=Path("./early-stopping-model"),
        patience=10
    )

    language_model = LanguageModel.load(LANG_MODEL)

    document_level_task_head = TextClassificationHead(num_labels=len(LABEL_LIST), task_name="document_level_task")
    token_level_task_head = TokenClassificationHead(num_labels=len(NER_LABELS), task_name="token_level_task")

    model = AdaptiveModel(
        language_model=language_model,
        prediction_heads=[document_level_task_head, token_level_task_head],
        embeds_dropout_prob=EMBEDS_DROPOUT_PROB,
        lm_output_types=["per_sequence", "per_token"],
        device=DEVICE,
        loss_aggregation_fn=mtl_loss_agg)

    model, optimizer, lr_schedule = initialize_optimizer(
        model=model,
        device=DEVICE,
        learning_rate=LEARNING_RATE,
        n_batches=len(data_silo.loaders["train"]),
        n_epochs=N_EPOCHS)

    trainer = Trainer(model=model,
                      optimizer=optimizer,
                      data_silo=data_silo,
                      epochs=N_EPOCHS,
                      n_gpu=N_GPU,
                      lr_schedule=lr_schedule,
                      device=DEVICE,
                      evaluate_every=EVALUATE_EVERY,
                      # early_stopping=earlystopping,
                      )

    model = trainer.train()

    model.save(save_dir)
    processor.save(save_dir)

    # From here on `model` refers to the Inferencer loaded from disk.
    model = Inferencer.load(save_dir, gpu=True)
    result = model.inference_from_dicts(dicts=test_texts)

    # Assumes the result chunks alternate document-level / token-level,
    # i.e. follow the order of prediction_heads - TODO confirm.
    label_predictions_list, tokens_predictions_list = [], []
    for idx, chunk_res in enumerate(result):
        if idx % 2 == 0:
            label_predictions_list += chunk_res['predictions']
        else:
            tokens_predictions_list += chunk_res['predictions']

    # Tokens: the non-initial marker 'X' is counted as 0, everything else is
    # the integer value of the predicted label.
    tokens_list = [
        [0 if val_dict['label'] == 'X' else int(val_dict['label']) for val_dict in pred_ind_list]
        for pred_ind_list in tokens_predictions_list
    ]
    test_result_data["seed_token" + str(random_seed)] = tokens_list

    # Labels
    label_list = [pred_dict['label'] for pred_dict in label_predictions_list]
    test_result_data["seed_post" + str(random_seed)] = label_list

    # Clean up
    gc.collect()
    torch.cuda.empty_cache()

    print("Completed:", "seed_post" + str(random_seed))


Train epoch 0/0 (Cur. train loss: 144.3164): 100%|██████████| 361/361 [02:24<00:00,  2.51it/s]
Evaluating: 100%|██████████| 121/121 [00:20<00:00,  5.78it/s]
05/31/2021 00:02:57 - INFO - farm.eval -   

\\|//       \\|//      \\|//       \\|//     \\|//
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
***************************************************
***** EVALUATION | TEST SET | AFTER 361 BATCHES *****
***************************************************
\\|//       \\|//      \\|//       \\|//     \\|//
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

05/31/2021 00:02:57 - INFO - farm.eval -   
 _________ document_level_task _________
05/31/2021 00:02:57 - INFO - farm.eval -   loss: 0.7300639125236488
05/31/2021 00:02:57 - INFO - farm.eval -   task_name: document_level_task
05/31/2021 00:02:57 - INFO - farm.eval -   acc: 0.6857440166493236
05/31/2021 00:02:57 - INFO - farm.eval -   report: 
               precision    recall  f1-score   support

      normal     0.7603  

Completed: seed_post3


05/31/2021 00:04:53 - INFO - farm.modeling.tokenization -   Loading tokenizer of type 'BertTokenizer'
05/31/2021 00:04:55 - INFO - farm.data_handler.data_silo -   
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
05/31/2021 00:04:55 - INFO - farm.data_handler.data_silo -   LOADING TRAIN DATA
05/31/2021 00:04:55 - INFO - farm.data_handler.data_silo -   Loading train set from: /content/hatexplain_train.csv 
05/31/2021 00:04:56 - INFO - farm.data_handler.data_silo -   Got ya 2 parallel workers to convert 11535 dictionaries to pytorch datasets (chunksize = 1154)...
05/31/2021 00:04:56 - INFO - farm.data_handler.data_silo -    0    0 
05/31/2021 00:04:56 - INFO - farm.data_handler.data_silo -   /w\  /w\
05/31/2021 00:04:56 - INFO - farm.data_handler.data_silo -   /'\  / \
05/31/2021 00:04:56 - INFO - farm.data_handler.data_silo -     
05/31/2021 00:04:56 - INFO 

Completed: seed_post42


In [None]:
# Ground-truth columns of the test split, pulled out for the evaluation cells below.
post_true_values = test_result_data["post_label"].values
token_true_values = test_result_data["toxic_tokens"].values

In [None]:
# Majority vote over the post-level predictions of every seed in random_seed_list.
seed_post_columns = [test_result_data["seed_post" + str(seed)].values for seed in random_seed_list]

post_pred_values = []
for idx in range(len(post_true_values)):
  # Insertion order doubles as tie-break priority: max() returns the first
  # label that reaches the highest count.
  res_dict = {'offensive': 0, 'normal': 0, 'hatespeech': 0}

  for seed_column in seed_post_columns:
    res_dict[seed_column[idx]] += 1

  post_pred_values.append(max(res_dict, key=res_dict.get))

In [None]:
# Macro F1 of every single seed, followed by the majority-vote ensemble.
print("---- Post-level Results ----")
for seed in random_seed_list:
  print(f"Seed {seed}:", f1_score(post_true_values, test_result_data["seed_post" + str(seed)].values, average="macro"))
print("Overall (macro):", f1_score(post_true_values, post_pred_values, average="macro"))

---- Post-level Results ----
Seed 3: 0.6729483068631462
Seed 42: 0.6661457549112987
Overall (macro): 0.6769352480006362


In [None]:
def res_customr_f1(y_true, y_pred):
  """Mean of the per-sequence macro F1 scores, formatted as a message.

  :param y_true: iterable of true label sequences; each item is either a
      sequence or the string form of a Python list (parsed with
      ``ast.literal_eval``).
  :param y_pred: iterable of predicted label sequences, aligned with ``y_true``.
      A prediction shorter than its true sequence is padded with 0.
  :return: the string "Mean F1 (macro) score: <mean>".
  :raises ZeroDivisionError: if there is no sequence to score.
  """
  f1_scores = []
  for t, p in zip(y_true, y_pred):
    if isinstance(t, str):
      t = ast.literal_eval(t)
    # Pad explicitly instead of relying on a caught exception from f1_score.
    missing = len(t) - len(p)
    if missing > 0:
      p = list(p) + [0] * missing
    f1_scores.append(f1_score(t, p, average='macro'))
  return "Mean F1 (macro) score: " + str(sum(f1_scores) / len(f1_scores))

In [None]:
def majority_vote(results_df, random_seed_list):
  """Combine the per-seed token predictions of every row by majority vote.

  For each seed the column "seed_token<seed>" is read; it holds one list of
  0/1 labels per row. Each token position gets the label predicted by most
  seeds, with ties going to 0.

  Returns one list of voted labels per row of ``results_df``.
  """
  column_names = ["seed_token" + str(seed) for seed in random_seed_list]

  voted_rows = []
  for row in range(len(results_df)):
    tallies = []
    for column in column_names:
      seed_labels = results_df[column].values[row]
      if not tallies:
        # One counter per token position, sized by the first non-empty seed list.
        tallies = [{0: 0, 1: 0} for _ in seed_labels]
      for position, token_label in enumerate(seed_labels):
        tallies[position][token_label] += 1
    # max() returns the first key with the top count, so a tie yields 0.
    voted_rows.append([max(tally, key=tally.get) for tally in tallies])
  return voted_rows

In [None]:
# Mean per-sequence macro F1 of every single seed, followed by the majority-vote ensemble.
print("---- Token-level Results ----")
for seed in random_seed_list:
  print(f"Seed {seed}:", res_customr_f1(token_true_values, test_result_data["seed_token" + str(seed)].values))
print("Overall Mean:", res_customr_f1(token_true_values, majority_vote(test_result_data, random_seed_list)))

---- Token-level Results ----
Seed 3: Mean F1 (macro) score: 0.8078636209838234
Seed 42: Mean F1 (macro) score: 0.8110865138957913
Overall Mean: Mean F1 (macro) score: 0.8122438285971888
