**File: train_LEOKA_hierarchical_multiclass.ipynb**\
Author: Amber Converse\
Purpose: This file trains a multi-label classification sequential neural network on labeled stories from LEOKA using ConfliBERT English to generate features.

In [None]:
import numpy as np
import pandas as pd
import ast
import json
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import torch
import torch.nn as nn

In [None]:
# Data Processing

# Selects the input format for the loading code below.
# If True, data is already processed into text\tlabel format.
# If False, data is processed from a JSON from Label Studio and exported as a tsv.
# Note: For JSON format, it is assumed a train/dev/test split has not been done.
from_tsv = False

# Paths of the Label Studio JSON exports; read only when from_tsv is False.
json_files = []
# Paths of the TSV files; read only when from_tsv is True.
# dev_files / test_files are meant for the dev and test splits respectively.
train_files = []
dev_files = []
test_files = []

def read_tsvs(tsvs):
  '''
  Read every TSV file in tsvs and collect its (text, labels) rows.

  Each non-blank line is split on tabs: the first column is the text and
  the second column is a Python literal (parsed with ast.literal_eval)
  holding the labels for that text.

  tsvs: iterable of file paths to read.

  returns a list of [text, labels] pairs covering all files, in file and
  line order; an empty list when tsvs is empty.

  raises IndexError if a non-blank line has no tab-separated second column,
  and ValueError/SyntaxError if that column is not a valid Python literal.
  '''
  text_label = []
  for tsv in tsvs:
    with open(tsv, 'r') as tsv_file:
      for line in tsv_file:
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
          continue
        fields = line.rstrip('\n').split('\t')
        text_label.append([fields[0], ast.literal_eval(fields[1])])
  # Return only after every file has been read.
  return text_label

def read_jsons(jsons):
  '''
  Read Label Studio JSON exports and keep the tasks whose annotators agree.

  Each file must contain a JSON list of tasks. Every task is read through
  task["text"] and task["annotations"]; every annotation through
  annotation["type"] and annotation["value"]["taxonomy"] (a list of label
  paths). Only annotations of type "taxonomy" are considered.

  A task is accepted when it has at least one taxonomy annotation and all
  of its taxonomy annotations contain the same set of labels (order is
  ignored). The labels of the first annotation are kept for the task.

  Prints, per file, the percentage of tasks that were accepted.

  jsons: iterable of file paths to read.

  returns a list of [text, labels] pairs covering all files, where labels
  is a list of tuples holding the first-order label and, when present, the
  second-order label.
  '''
  text_labels = []
  # The loop variable must not be called "json": that would shadow the json
  # module and break json.load below.
  for json_path in jsons:
    with open(json_path, 'r') as json_file:
      tasks = json.load(json_file)

    total_tasks = len(tasks)
    tasks_taken = 0

    for task in tasks:
      task_annotations = []
      for annotation in task["annotations"]:
        if annotation["type"] == "taxonomy":
          # Keep at most the first two levels of each label path; slicing
          # also accepts paths that only have a first-order label.
          cur_labels = [tuple(label[:2]) for label in annotation["value"]["taxonomy"]]
          task_annotations.append(cur_labels)
      # frozenset is hashable (set is not), so the distinct label sets can be
      # counted; exactly one distinct set means every annotator agreed.
      if len({frozenset(task_annotation) for task_annotation in task_annotations}) == 1:
        text_labels.append([task["text"], task_annotations[0]])
        tasks_taken += 1

    # Guard against a file with no tasks to avoid dividing by zero.
    accepted_pct = tasks_taken / total_tasks * 100 if total_tasks else 0.0
    print(f"{accepted_pct}% of tasks accepted from json")

  # Return only after every file has been read.
  return text_labels

def separate_labels(labels):
  '''
  Group sample indices by first-order and second-order label.

  labels: a sequence where item i is the list of annotations for sample i;
  each annotation is a sequence holding a first-order label and, optionally,
  a second-order label.

  returns a dictionary with the format:

  {first-order label:
      {
        "indices": indices of the samples with this first-order label,
        second-order label: indices of the samples with this second-order label
      }
  }

  An index is recorded once per annotation carrying the label, so it can
  appear more than once in the same list.
  '''
  grouped = {}
  for sample_idx, sample_annotations in enumerate(labels):
    for path in sample_annotations:
      # Create the first-order entry on first sight, then record the sample.
      first_level = grouped.setdefault(path[0], {"indices": []})
      first_level["indices"].append(sample_idx)
      if len(path) <= 1:
        continue
      # Second-order labels live next to "indices" under their first-order label.
      first_level.setdefault(path[1], []).append(sample_idx)
  return grouped

# Load the texts and labels, either from pre-split TSV files or from Label
# Studio JSON exports that are split into train/dev/test here.
if from_tsv:
  # Each split is read from its own list of files.
  train_texts, train_labels = zip(*read_tsvs(train_files))
  dev_texts, dev_labels = zip(*read_tsvs(dev_files))
  test_texts, test_labels = zip(*read_tsvs(test_files))
else:
  texts, labels = zip(*read_jsons(json_files))

  # train_test_split returns the splits array by array, i.e.
  # (texts_a, texts_b, labels_a, labels_b), so unpack in that order.
  # First split: 50% train, 50% held out.
  train_texts, test_texts, train_labels, test_labels = train_test_split(texts,labels, random_state=4096,test_size=0.5, shuffle=True)
  # Second split of the held-out half: 60% dev and 40% test, i.e. 30% and 20%
  # of the full data set.
  dev_texts, test_texts, dev_labels, test_labels = train_test_split(test_texts,test_labels, random_state=4096,test_size=0.4, shuffle=True)