**File: train_LEOKA_hierarchical_multilabel.ipynb**\
Author: Amber Converse\
Purpose: This file trains a multi-label classification model on labeled stories from LEOKA using ConfliBERT English to generate features.

The first model uses the Simple Transformers library to replicate the classification method used in the ConfliBERT English paper (*ConfliBERT: A Pre-trained Language Model for Political Conflict and Violence*, NAACL 2022).

The second model, built directly in PyTorch as a series of sequential neural networks, serves as a point of comparison (it is currently a stub and still in development).

In [None]:
!pip install simpletransformers==0.63.11 "transformers==4.17.0" "scipy==1.10.1"
!pip install torch==1.7.1
!pip install scikit-learn==0.24.2

In [1]:
import os
import numpy as np
import pandas as pd
import ast
import json
import csv
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from simpletransformers.classification import MultiLabelClassificationModel, MultiLabelClassificationArgs
from seqeval.metrics import accuracy_score, classification_report, f1_score, precision_score, recall_score
from sklearn import metrics

In [2]:
# Data Processing

# Input mode switch:
#   True  -> the data is already in tsv format with a binarized label vector.
#   False -> the data is read from Label Studio JSON exports, the labels are encoded,
#            and the result is exported as tsv.
# The JSON path assumes that no train/dev/test split has been done yet.
from_tsv = False

# Label Studio exports for teams 1 through 11, leaving out teams 5 and 9.
json_files = [f"LEOKA_team_{team}.json" for team in range(1, 12) if team not in (5, 9)]

# tsv file lists for the from_tsv = True path.
train_files, dev_files, test_files = [], [], []

def read_tsvs(tsvs):
    """Load (text, labels) pairs from one or more tab-separated files.

    Every non-empty row holds the text in its first column, followed by one integer
    label value per remaining column.

    Args:
        tsvs: iterable of paths to the tsv files to read.

    Returns:
        A list of [text, labels] pairs in file order, where labels is a list of ints.

    Raises:
        ValueError: if a label column cannot be parsed as an int.
    """
    data = []
    for tsv in tsvs:
        # newline='' leaves line-ending handling to the csv module, so newlines embedded
        # in quoted text fields survive unchanged.
        with open(tsv, 'r', newline='') as file:
            tsv_reader = csv.reader(file, delimiter="\t", quotechar='"')
            for row in tsv_reader:
                if not row:
                    # A blank line is yielded as an empty row; there is nothing to load from it.
                    continue
                text, *label_fields = row
                data.append([text, [int(field) for field in label_fields]])
    return data

def write_tsv(texts, labels, file_name):
    """Write texts and their label vectors to a tab-separated file.

    Each output row is the text followed by one column per entry of its label vector.

    Args:
        texts: sequence of text strings.
        labels: sequence of label vectors (any iterable per row), aligned with texts.
        file_name: path of the file to create or overwrite.

    Raises:
        ValueError: if texts and labels do not have the same length.
    """
    if len(texts) != len(labels):
        raise ValueError(
            f"texts and labels must have the same length, got {len(texts)} and {len(labels)}"
        )

    # newline='' stops the text layer from translating the csv writer's own line endings
    # (which would produce \r\r\n on Windows).
    with open(file_name, 'w', newline='') as file:
        tsv_writer = csv.writer(file, delimiter="\t", quotechar='"')
        for text, label_vector in zip(texts, labels):
            tsv_writer.writerow([text, *label_vector])

def read_jsons(jsons):
    """Collect (text, labels) pairs from Label Studio JSON exports.

    For every task, the taxonomy results of all annotations flagged as ground truth are
    gathered. Each taxonomy entry is turned into a "<level 0>/<level 1>" label string;
    entries that do not provide both levels are counted as invalid and skipped. Tasks
    that end up without any label are counted and left out of the result. Both counts
    are printed once all files have been read.

    Args:
        jsons: iterable of paths to JSON files, each containing a list of tasks.

    Returns:
        A list of [text, labels] pairs, where labels is a list of label strings.
    """
    text_labels = []
    invalid_annotations = 0
    no_ground_truth = 0

    for json_path in jsons:
        # Read as UTF-8 explicitly instead of relying on the platform default encoding
        # (assumes the exports are UTF-8, the standard encoding for JSON).
        with open(json_path, 'r', encoding='utf-8') as json_file:
            tasks = json.load(json_file)

        for task in tasks:
            task_annotations = []

            for annotator in task["annotations"]:
                if not annotator["ground_truth"]:
                    continue
                for annotation in annotator["result"]:
                    if annotation["type"] != "taxonomy":
                        continue
                    for label in annotation["value"]["taxonomy"]:
                        try:
                            task_annotations.append(f"{label[0]}/{label[1]}")
                        except (IndexError, KeyError, TypeError):
                            # The entry does not provide both hierarchy levels.
                            invalid_annotations += 1

            if task_annotations:
                text_labels.append([task["data"]["text"], task_annotations])
            else:
                no_ground_truth += 1

    print(f"Discarded {invalid_annotations} invalid labels.")
    print(f"{no_ground_truth} had no ground truth.")
    return text_labels

if from_tsv:
    # Each split is read from its own list of files.
    train_texts, train_labels = zip(*read_tsvs(train_files))
    dev_texts, dev_labels = zip(*read_tsvs(dev_files))
    test_texts, test_labels = zip(*read_tsvs(test_files))
else:
    texts, labels = zip(*read_jsons(json_files))

    valid_labels = [
        "Killed/Feloniously Killed", "Killed/Accidentally Killed",
        "Assaulted/Injured", "Assaulted/Not injured",
        "Assignment/Conducting arrest", "Assignment/Citizen call", "Assignment/Drug warrant",
        "Assignment/Investigation", "Assignment/Police call", "Assignment/Traffic stop",
        "Situation/Ambush", "Situation/Encounter", "Situation/Pursuit",
        "Suspect/Arrested", "Suspect/Escaped", "Suspect/Injured", "Suspect/Killed", "Suspect/Suicide",
    ]

    # Fit the binarizer on the known label set only, then transform the data with it.
    # Calling fit_transform on the data here would re-fit and throw the valid_labels fit away,
    # letting whatever labels occur in the data define the columns of the label vector.
    mlb = MultiLabelBinarizer()
    mlb.fit([valid_labels])

    labels = mlb.transform(labels)

    # 50% train; the remaining half is split 60/40 into dev and test (30% / 20% overall).
    train_texts, test_texts, train_labels, test_labels = train_test_split(
        texts, labels, random_state=4096, test_size=0.5, shuffle=True)
    dev_texts, test_texts, dev_labels, test_labels = train_test_split(
        test_texts, test_labels, random_state=4096, test_size=0.4, shuffle=True)

    write_tsv(train_texts, train_labels, "train.tsv")
    write_tsv(dev_texts, dev_labels, "dev.tsv")
    write_tsv(test_texts, test_labels, "test.tsv")

print(f"{len(train_texts)} tasks for training.\n{len(dev_texts)} for development.\n{len(test_texts)} for testing")

Discarded 13 invalid labels.
4 had no ground truth.
583 tasks for training.
349 for development.
234 for testing


In [3]:
# Define Models

# Simple Transformers
class LEOKA_ML_Classifier_ST:
    """Wrapper that configures and holds a MultiLabelClassificationModel.

    The constructor translates its arguments into a MultiLabelClassificationArgs object,
    builds the model from it and stores it as `self.model`. `train` and `predict`
    delegate to that model.
    """

    def __init__(self,
                 seed,
                 num_labels,
                 output_dir,
                 train_epochs,
                 max_seq_length,
                 train_batch_size,
                 do_lower_case,
                 report_per_epoch,
                 use_cuda,
                 architecture,
                 pretrained_model):
        """Build the model arguments and instantiate the underlying model.

        Args:
            seed: value stored as the arguments' manual_seed.
            num_labels: number of labels passed to the model.
            output_dir: output directory; the best model directory is its
                "best_model" sub-directory.
            train_epochs: value stored as num_train_epochs.
            max_seq_length: value stored as max_seq_length.
            train_batch_size: value stored as train_batch_size.
            do_lower_case: value stored as do_lower_case.
            report_per_epoch: when falsy, save_model_every_epoch is set to False and
                no_save is set to True.
            use_cuda: forwarded to the model constructor.
            architecture: model type, first positional argument of the model constructor.
            pretrained_model: model name, second positional argument of the model constructor.
        """
        model_args = MultiLabelClassificationArgs()

        # Option name -> value, applied to the arguments object below.
        settings = {
            "manual_seed": seed,
            "best_model_dir": os.path.join(output_dir, "best_model", ""),
            "output_dir": output_dir,
            "num_train_epochs": train_epochs,
            "fp16": False,
            "max_seq_length": max_seq_length,
            "train_batch_size": train_batch_size,
            "save_steps": -1,
            "use_multiprocessing": False,
            "do_lower_case": do_lower_case,
            "evaluate_during_training": True,
            "save_best_model": True,
            "save_eval_checkpoints": False,
            "overwrite_output_dir": True,
        }
        if not report_per_epoch:
            settings["save_model_every_epoch"] = False
            settings["no_save"] = True

        for option, value in settings.items():
            setattr(model_args, option, value)

        self.model = MultiLabelClassificationModel(
            architecture,
            pretrained_model,
            num_labels=num_labels,
            args=model_args,
            use_cuda=use_cuda,
        )

    def train(self, train_df, dev_df):
        """Train the wrapped model on train_df, passing dev_df as the evaluation frame."""
        self.model.train_model(train_df, eval_df=dev_df)

    def predict(self, test_df):
        """Return the wrapped model's prediction output for the `text` column of test_df."""
        texts = test_df.text.to_list()
        return self.model.predict(texts)

# PyTorch Version, in development
class LEOKA_ML_Classifier_PT:
    """Placeholder for the PyTorch classifier; nothing is implemented yet."""

    def __init__(self):
        '''
        Stub
        '''
        pass


In [None]:
# Training

train_st = True
train_pt = False

train_df = pd.DataFrame(zip(train_texts,train_labels), columns = ["text","labels"])
dev_df = pd.DataFrame(zip(dev_texts,dev_labels), columns = ["text","labels"])
test_df = pd.DataFrame(zip(test_texts,test_labels), columns = ["text","labels"])

if train_st:
    # Settings shared by both Simple Transformers runs, so the two models are trained
    # under identical conditions and differ only in the pretrained checkpoint.
    st_settings = dict(
        seed=4096,
        num_labels=len(train_df.labels[0]),
        train_epochs=20,
        max_seq_length=128,
        train_batch_size=100,
        do_lower_case=True,
        report_per_epoch=False,
        use_cuda=False,
        architecture="bert",
    )

    # Each model gets its own output directory: the wrapper enables overwrite_output_dir,
    # so a shared directory would let the second run overwrite whatever the first one wrote.
    conflibert_st_model = LEOKA_ML_Classifier_ST(
        output_dir="st_model_conflibert",
        pretrained_model="snowood1/ConfliBERT-scr-uncased",
        **st_settings,
    )
    conflibert_st_model.train(train_df, dev_df)

    bert_st_model = LEOKA_ML_Classifier_ST(
        output_dir="st_model_bert",
        pretrained_model="google-bert/bert-base-uncased",
        **st_settings,
    )
    bert_st_model.train(train_df, dev_df)

if train_pt:
    pass

In [None]:
# Evaluation

def evaluate_model(model, test_df, name=None):
    """Print and return multi-label classification scores of `model` on `test_df`.

    Args:
        model: classifier wrapper whose predict(test_df) is expected to return a
            (predictions, raw_outputs) pair with one binary label vector per row of
            test_df, as Simple Transformers' predict does.
        test_df: DataFrame with a `labels` column holding the true label vector of each row.
        name: optional display name printed before the scores.

    Returns:
        A dict mapping each printed metric name to its value.
    """
    # The wrapper classes in this notebook expose predict(); only the predicted label
    # vectors are needed for the scikit-learn metrics below.
    predictions, _raw_outputs = model.predict(test_df)

    y_true = [list(true_vector) for true_vector in test_df.labels]
    y_pred = [list(pred) for pred in predictions]

    # Computing performance through scikit metrics
    scores = {"Accuracy": metrics.accuracy_score(y_true, y_pred)}
    scorers = (
        ("prec", metrics.precision_score),
        ("rec", metrics.recall_score),
        ("f1", metrics.f1_score),
    )
    for prefix, scorer in scorers:
        for average in ("micro", "macro"):
            scores[f"{prefix}_{average}_labelBased"] = scorer(y_true, y_pred, average=average)

    if name:
        print(f"\nEvaluating {name}:")
    for metric_name, value in scores.items():
        print(f"{metric_name}: {value}")

    return scores

# Each model is evaluated under its own name on the held-out test split.
evaluate_model(bert_st_model, test_df, name="BERT Base Uncased")
evaluate_model(conflibert_st_model, test_df, name="ConfliBERT English SCR Uncased")