In [None]:
!pip install transformers==3

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
from os import path
import pandas
import pandas as pd
pd.set_option('display.max_columns', None)

import transformers
import itertools
import torch
from collections import defaultdict
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from torch.utils import data
import json
import os
import copy

In [None]:
# configurations
# Checkpoint name used for both the tokenizer and the BERT encoder.
BERT_MODEL_NAME = 'bert-base-uncased'
# True: modifier and head are fed as two sequences; False: the single 'compound' text is fed.
DUAL_SEQUENCE = False
BATCH_SIZE = 32
EPOCHS = 75
# Number of independent train/evaluate repetitions that main() averages over.
ITERATIONS = 10
# SPLIT and GRAIN select the dataset directory (tratz2011_cb_{GRAIN}_grained_{SPLIT}).
SPLIT = 'random'
GRAIN = 'coarse'
# In the code visible here these two flags are only recorded in the results row.
REMOVE_NON_BINARY = False
REMOVE_NON_COMP = False
# Name interpolated into the word-to-image-vector JSON filename.
IMAGE_MODEL = 'ResNet152V2_10_pca_384_norm'
# True: image vectors are concatenated to the pooled BERT output in the classifier.
MULTIMODAL = True
# True: BERT weights get requires_grad=False and only the output layer is optimized.
FREEZE_BERT = True
# True: validation rows are appended to the training rows and validation evaluation is skipped.
JOIN_VAL_AND_TRAIN = False

# How modifier and head image vectors are combined: 'add', 'average' or 'concatenate'.
VEC_COMBO_MODE = 'concatenate'
# How text and image features are combined; the classifier only handles 'concatenate'.
MULTI_COMBO_MODE = 'concatenate'

#This parameter toggles filtering of the data by the image model, 
# even in unimodal mode, to ensure comparable results to the multimodal experiments.
UNIMODAL_COMPARABLE = True

# Sub-folder (under the Drive root) that prediction JSON files are written to.
if UNIMODAL_COMPARABLE: 
  SAVE_PREDS_TO = f'results_filtered_on_{IMAGE_MODEL}'
else: 
  SAVE_PREDS_TO = 'results'

## False, 'generic', or 'natural'
# False: plain "modifier head" text; 'generic': compound placed in a fixed template
# sentence; 'natural': compound replaced by a sentence loaded from a JSON file.
EMB_COMP = False

if EMB_COMP:
  assert(not DUAL_SEQUENCE), 'For now, the use of dual_sequence with embedded compounds is not supported'

# Shared tokenizer and target device, read as globals by later cells.
tokenizer = transformers.BertTokenizer.from_pretrained(BERT_MODEL_NAME)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
def prep_data():
  """Load the train/test/val TSV splits and add the text and label-id columns.

  Each split is read as a header-less, tab-separated file with the columns
  ``nc_mod``, ``nc_head`` and ``nc_type``. A ``compound`` column is added
  according to ``EMB_COMP``:

  * falsy      -> ``"<mod> <head>"``
  * 'generic'  -> the compound placed inside a fixed template sentence
  * 'natural'  -> the first sentence stored for the compound in the sentence
                  JSON file, falling back to the template sentence when the
                  compound has no entry

  A ``label_id`` column is added by replacing ``nc_type`` values with ids
  derived from the training split only; a label that occurs only in test/val
  is left unchanged by ``replace``.

  Returns:
    (train_data, test_data, val_data, labels_to_ids, ordered_labels), where
    ``ordered_labels[i]`` is the label whose id is ``i``.

  Raises:
    ValueError: if ``EMB_COMP`` is truthy but neither 'generic' nor 'natural'.
  """
  directory = '/content/drive/My Drive/Tratz_2011_data_comp_binary'
  data_file = f'tratz2011_cb_{GRAIN}_grained_{SPLIT}'
  data_path = path.join(directory, data_file)
  generic_sentence = 'we know for a fact that the phrase {} exists because we have seen it in use several times before'

  # Pick the compound -> text function once so all three splits are treated identically.
  if not EMB_COMP:
    embed = None
  elif EMB_COMP == 'generic':
    embed = generic_sentence.format
  elif EMB_COMP == 'natural':
    with open('/content/drive/My Drive/compound_sents_filtered_May-25-2021.json', 'r') as sentence_file:
      sentence_dict = json.load(sentence_file)

    def embed(compound):
      # Assumes each JSON value is a list of sentences (the training branch
      # already indexed it with [0]) -- TODO confirm against the file.
      # Test and val previously stored the whole value instead of one sentence.
      if compound in sentence_dict:
        return sentence_dict[compound][0]
      return generic_sentence.format(compound)
  else:
    raise ValueError(f"EMB_COMP must be False, 'generic' or 'natural', got {EMB_COMP!r}")

  splits = []
  for split_name in ('train', 'test', 'val'):
    frame = pd.read_csv(path.join(data_path, f'{split_name}.tsv'), sep='\t', header=None, index_col=None)
    frame.columns = ['nc_mod', 'nc_head', 'nc_type']
    if embed is None:
      frame['compound'] = frame['nc_mod'] + " " + frame['nc_head']
    else:
      frame['compound'] = [embed(m + ' ' + h) for m, h in zip(frame['nc_mod'], frame['nc_head'])]
    splits.append(frame)
  train_data, test_data, val_data = splits

  # Ids follow first appearance in the training split, so the dict's insertion
  # order is already the id order.
  labels_to_ids = {label: index for index, label in enumerate(train_data.nc_type.unique())}
  ordered_labels = list(labels_to_ids)

  for frame in splits:
    frame['label_id'] = frame['nc_type'].replace(labels_to_ids)

  return train_data, test_data, val_data, labels_to_ids, ordered_labels

In [None]:
def get_image_model(model_name):
  """Read and return the JSON content of the word-to-image-vector file for `model_name`."""
  vectors_path = f'/content/drive/My Drive/word_to_img_vec_{model_name}.json'
  with open(vectors_path, 'r') as vector_file:
    return json.load(vector_file)
  

In [None]:
def prep_image_data(data: pd.DataFrame, model):
  """Build one combined image vector per row from the modifier and head vectors.

  Args:
    data: frame with ``nc_mod`` and ``nc_head`` columns; if those are absent,
      the first two space-separated words of ``compound`` are used instead.
    model: mapping from a word to its image vector.

  Returns:
    A list with one combined vector per row, combined according to
    ``VEC_COMBO_MODE`` ('add', 'average' or 'concatenate').

  Raises:
    ValueError: if ``VEC_COMBO_MODE`` is not one of the supported modes.
    KeyError: if a modifier or head has no entry in ``model``.
  """
  combiners = {
      'add': lambda mod_vec, head_vec: np.add(mod_vec, head_vec),
      'average': lambda mod_vec, head_vec: np.average([mod_vec, head_vec], axis=0),
      'concatenate': lambda mod_vec, head_vec: np.concatenate((mod_vec, head_vec)),
  }
  if VEC_COMBO_MODE not in combiners:
    raise ValueError('VEC_COMBO_MODE must be one of `add`, `average` or `concatenate`.')
  combine = combiners[VEC_COMBO_MODE]

  if {'nc_mod', 'nc_head'}.issubset(data.columns):
    # 'compound' may hold a whole sentence (embedded-compound modes), so take
    # the words from their own columns rather than splitting that text.
    word_pairs = zip(data['nc_mod'], data['nc_head'])
  else:
    split_compounds = (compound.split(' ') for compound in data['compound'])
    word_pairs = ((words[0], words[1]) for words in split_compounds)

  return [combine(model[mod], model[head]) for mod, head in word_pairs]

In [None]:
def remove_non_existing(data: pd.DataFrame, model, filter_on: list):
    """Return a filtered copy of `data` keeping only rows that `model` covers.

    `filter_on` names what must exist in `model`: 'mod' (the `nc_mod` value),
    'head' (the `nc_head` value) and/or 'compound' (the `compound` value, also
    accepted when its words joined by '_', '-' or nothing are in `model`).
    The filters are always applied in the order compound, mod, head.
    The input frame is deep-copied first and is not modified.
    """
    allowed_targets = ['mod', 'head', 'compound']
    assert len(filter_on) > 0, 'Parameter filter_on must be a non-empty list'
    for target in filter_on:
        assert target in allowed_targets, \
            f'Parameter filter_on must be a list containing at least one of the following: {allowed_targets}'

    def compound_exists_in_model(compound):
        # Try the raw text first, then the underscore, fused and hyphenated spellings.
        parts = compound.split()
        spellings = (compound, '_'.join(parts), ''.join(parts), '-'.join(parts))
        return any(spelling in model for spelling in spellings)

    def word_exists_in_model(word):
        return word in model

    # (filter_on key, dataframe column, row predicate), in application order.
    filter_plan = (
        ('compound', 'compound', compound_exists_in_model),
        ('mod', 'nc_mod', word_exists_in_model),
        ('head', 'nc_head', word_exists_in_model),
    )

    filtered = copy.deepcopy(data)
    for target, column, predicate in filter_plan:
        if target in filter_on:
            filtered = filtered[filtered[column].map(predicate)]
    return filtered

In [None]:
class TratzDataset(data.Dataset):
  """Torch dataset that tokenizes noun compounds on access.

  Args:
    nc_data: either one pandas Series of texts (single-sequence mode) or a
      pair ``(modifiers, heads)`` of pandas Series (dual-sequence mode).
    labels: indexable collection of integer label ids, one per example.
    tokenizer: object exposing ``encode_plus``; it is the tokenizer actually
      used for every item.
    max_len: value passed as ``max_length`` (with ``padding='max_length'``).

  Raises:
    ValueError: if ``nc_data`` is neither a Series nor a 2-element container.
  """
  def __init__(self, nc_data, labels, tokenizer, max_len):
    if isinstance(nc_data, pandas.Series):
      self.compounds = nc_data.to_numpy()
      assert len(labels) == len(self.compounds), "Length of data list and labels list must be equal"
      self.dualsequence = False
    elif len(nc_data) == 2:
      self.mods = nc_data[0].to_numpy()
      self.heads = nc_data[1].to_numpy()
      assert len(self.mods) == len(self.heads), "Length of modifier list and head list must be equal"
      assert len(labels) == len(self.heads), "Length of data list and labels list must be equal"
      self.dualsequence = True
    else: raise ValueError("nc_data must either be a Pandas series (df column) or a tuple of two such Pandas series")

    self.labels = labels
    self.tokenizer = tokenizer
    self.max_len = max_len

  def __len__(self):
    if self.dualsequence:
      return len(self.mods)
    return len(self.compounds)

  def _encode(self, *texts):
    """Tokenize one text (or a text pair) with the tokenizer given to __init__."""
    return self.tokenizer.encode_plus(*texts,
                                      max_length = self.max_len,
                                      padding = 'max_length',
                                      return_tensors = 'pt',
                                      return_attention_mask = True,
                                      add_special_tokens = True,
                                      return_token_type_ids = True,
                                      truncation = False
                                      )

  def __getitem__(self, i):
    """Return the text, flattened encoding tensors and label tensor of example `i`."""
    if self.dualsequence:
      mod, head = str(self.mods[i]), str(self.heads[i])
      texts = (mod, head)
      compound = mod + ' ' + head
    else:
      compound = str(self.compounds[i])
      texts = (compound,)

    # Uses self.tokenizer; the previous code silently fell back to a global
    # `tokenizer` and ignored the constructor argument.
    encoding = self._encode(*texts)
    return {
        'compound' : compound,
        'input_ids' : encoding['input_ids'].flatten(),
        'attention_mask' : encoding['attention_mask'].flatten(),
        'token_type_ids' : encoding['token_type_ids'].flatten(),
        'labels' : torch.tensor(self.labels[i], dtype=torch.long)
    }

In [None]:
class ImageVectorsDataset(data.Dataset):
  """Dataset serving one stored image vector per index, wrapped in a dict."""

  def __init__(self, img_vectors):
    # Kept as given; indexing and len() are delegated to this object.
    self.img_vectors = img_vectors

  def __len__(self):
    n_vectors = len(self.img_vectors)
    return n_vectors

  def __getitem__(self, i):
    vector = self.img_vectors[i]
    return dict(img_vector=vector)

In [None]:
def make_text_data_loader(df: pd.DataFrame, col_names: list, tokenizer: transformers.BertTokenizer, max_len: int, batch_size: int):
  """Wrap the text column(s) of `df` in a TratzDataset and return a DataLoader.

  One column name gives a single-sequence dataset; two names give a
  (modifier, head) dual-sequence dataset. Labels come from `df.label_id`.
  The loader does not shuffle and uses two worker processes.

  Raises:
    ValueError: if `col_names` does not hold exactly one or two names.
  """
  n_cols = len(col_names)
  if n_cols == 1:
    nc_data = df[col_names[0]]
  elif n_cols == 2:
    nc_data = (df[col_names[0]], df[col_names[1]])
  else:
    raise ValueError("You must pass either one or two column names!")

  text_dataset = TratzDataset(
    nc_data=nc_data,
    labels=df.label_id.to_numpy(),
    tokenizer=tokenizer,
    max_len=max_len
  )
  return data.DataLoader(text_dataset, batch_size=batch_size, num_workers=2)

In [None]:
def make_image_data_loader(vectors, batch_size):
  """Return an unshuffled two-worker DataLoader over `vectors` via ImageVectorsDataset."""
  loader_options = {'batch_size': batch_size, 'num_workers': 2}
  return data.DataLoader(ImageVectorsDataset(vectors), **loader_options)

In [None]:
def prep_data_loaders(train_data, test_data, val_data):
  """Build the text DataLoaders for the train, test and val frames, in that order.

  The columns fed to the tokenizer depend on DUAL_SEQUENCE; the tokenizer,
  MAX_LEN and BATCH_SIZE are read from module-level globals.
  """
  assert not (DUAL_SEQUENCE and EMB_COMP), 'both DUAL_SEQUENCE and EMB_COMP cannot be True'
  col_names = ['nc_mod', 'nc_head'] if DUAL_SEQUENCE else ['compound']

  def build_loader(frame):
    return make_text_data_loader(frame, col_names=col_names, tokenizer=tokenizer, max_len=MAX_LEN, batch_size=BATCH_SIZE)

  return build_loader(train_data), build_loader(test_data), build_loader(val_data)

In [None]:
def determine_max_len(data):
  """Return the largest token count over the 'compound' column of `data`.

  Each text is encoded once with the module-level `tokenizer`; the previous
  version looked the longest text up again and re-encoded it to obtain the
  same number.

  NOTE(review): lengths are measured on the single 'compound' text only; if
  modifier and head are later encoded as a pair, the required length may
  differ -- confirm before enabling DUAL_SEQUENCE.

  Raises:
    ValueError: if `data` has no rows (max of an empty sequence).
  """
  return max(len(tokenizer.encode(compound)) for compound in data['compound'])

In [None]:
class NounCompoundClassifier(torch.nn.Module):
  """BERT encoder followed by dropout and one linear classification layer.

  In multimodal mode the pooled BERT output is concatenated with an image
  vector before the linear layer.

  Args:
    num_classes: number of output classes.
    image_model: unused; kept for backward compatibility.
    math_mode: unused; kept for backward compatibility.
  """
  def __init__(self, num_classes, image_model=None, math_mode='add'):
    super(NounCompoundClassifier, self).__init__()
    self.bert = transformers.BertModel.from_pretrained(BERT_MODEL_NAME)
    if FREEZE_BERT:
      for param in self.bert.parameters():
        param.requires_grad = False
    self.drop = torch.nn.Dropout(p=0.3)
    hidden_size = self.bert.config.hidden_size
    # NOTE(review): the multimodal input width assumes the combined image
    # vector has exactly hidden_size dimensions -- confirm for the chosen
    # IMAGE_MODEL / VEC_COMBO_MODE.
    linear_input_dims = hidden_size * 2 if MULTIMODAL else hidden_size
    self.out = torch.nn.Linear(linear_input_dims, num_classes)
    # Not applied in forward(); kept so callers can turn logits into probabilities.
    self.softmax = torch.nn.Softmax(dim=1)
    self.multimodal = MULTIMODAL

  def forward(self, input_ids, attention_mask, token_type_ids, img_vectors=None):
    """Return unnormalized class scores (logits), one row per example.

    Logits are returned rather than softmax probabilities because the
    notebook trains with CrossEntropyLoss, which normalizes internally;
    feeding it probabilities applies softmax twice. argmax predictions are
    unaffected.

    Raises:
      ValueError: in multimodal mode, if MULTI_COMBO_MODE is not
        'concatenate' or `img_vectors` is missing.
    """
    bert_outputs = self.bert(
        input_ids = input_ids,
        attention_mask = attention_mask,
        token_type_ids=token_type_ids
    )
    # Positional access works for a plain tuple as well as for output objects
    # that support integer indexing.
    pooled_output = bert_outputs[1]

    if self.multimodal:
      if MULTI_COMBO_MODE != 'concatenate':
        raise ValueError("MULTI_COMBO_MODE must be 'concatenate'; no other mode is implemented.")
      if img_vectors is None:
        raise ValueError('img_vectors is required when the model is multimodal.')
      # Match dtypes before concatenating; the result is cast to float below anyway.
      img_vectors = img_vectors.to(dtype=pooled_output.dtype)
      pooled_output = torch.cat((pooled_output, img_vectors), dim=1)

    output = self.drop(pooled_output.float())
    return self.out(output)

In [None]:
def train_epoch(
    model, 
    data_loader,
    image_data_loader, 
    model_loss, 
    optimizer,
    scheduler,
    num_examples
):
  """Run one optimization pass over `data_loader` and report accuracy and mean loss.

  When `image_data_loader` is None the model is called with text inputs
  only; otherwise text and image batches are consumed in lockstep and the
  image vectors are passed as `img_vectors`. After each batch the gradients
  are clipped to norm 1.0, then the optimizer and scheduler step.

  Returns:
    (correct predictions / num_examples, mean of the per-batch losses)
  """
  model = model.train()
  batch_losses = []
  n_correct = 0

  # Pair every text batch with its image batch, or with None in text-only mode.
  if image_data_loader is None:
    paired_batches = ((text_batch, None) for text_batch in data_loader)
  else:
    paired_batches = zip(data_loader, image_data_loader)

  for text_batch, image_batch in paired_batches:
    model_inputs = {
        key: text_batch[key].to(device)
        for key in ('input_ids', 'attention_mask', 'token_type_ids')
    }
    labels = text_batch['labels'].to(device)
    if image_batch is not None:
      model_inputs['img_vectors'] = image_batch['img_vector'].to(device)

    outputs = model(**model_inputs)
    predictions = torch.max(outputs, dim=1)[1]
    loss = model_loss(outputs, labels)

    n_correct += torch.sum(predictions == labels)
    batch_losses.append(loss.item())

    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()
    scheduler.step()
    optimizer.zero_grad()

  return n_correct.double() / num_examples, np.mean(batch_losses)

In [None]:
def train(model, scheduler, optimizer, model_loss, train_data, train_dl, train_img_dl, save_model=False):
  """Train `model` for EPOCHS epochs and return the per-epoch history.

  Args:
    model, scheduler, optimizer, model_loss: forwarded to `train_epoch`.
    train_data: only its length is used (as the accuracy denominator).
    train_dl: text DataLoader for the training split.
    train_img_dl: image DataLoader aligned with `train_dl`, or None.
    save_model: if True, the state dict is saved to Drive after the last epoch.

  Returns:
    defaultdict(list) with 'train_acc' and 'train_loss', holding one entry
    per epoch. (The logging and history updates used to sit outside the
    loop, so only the final epoch was ever recorded.)
  """
  history = defaultdict(list)
  for epoch in range(EPOCHS):
    print(f'Epoch {epoch+1} of {EPOCHS}')
    train_acc, train_loss = train_epoch(
      model=model,
      data_loader=train_dl,
      image_data_loader=train_img_dl,
      model_loss=model_loss,
      optimizer=optimizer,
      scheduler=scheduler,
      num_examples=len(train_data)
    )
    print(f'Train loss: {train_loss} -- train accuracy: {train_acc}')
    print('-'*20)
    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)

  if save_model:
    model_name = f'{BERT_MODEL_NAME}_{SPLIT}_{GRAIN}_{EPOCHS}_{BATCH_SIZE}_{MAX_LEN}_{DUAL_SEQUENCE}_{EMB_COMP}'
    model_path = f'/content/drive/My Drive/{model_name}.bin'
    torch.save(model.state_dict(), model_path)
  return history
  

In [None]:
def evaluate(model, data_loader, image_data_loader, model_loss, num_examples):
  """Score `model` on a data loader without updating it.

  Image features are used exactly when `image_data_loader` is given -- the
  same rule `train_epoch` follows -- instead of depending on the global
  MULTIMODAL flag. Text and image batches are consumed in lockstep.

  Args:
    model: the classifier; it is switched to eval mode.
    data_loader: text batches with 'input_ids', 'attention_mask',
      'token_type_ids' and 'labels'.
    image_data_loader: batches with 'img_vector', or None for text-only.
    model_loss: loss callable taking (outputs, labels).
    num_examples: denominator for the accuracy.

  Returns:
    (accuracy, mean batch loss, list of per-batch prediction tensors,
     list of per-batch label tensors)
  """
  model.eval()
  losses = []
  correct_predictions = 0
  all_predictions = []
  all_labels = []

  # Pair every text batch with its image batch, or with None in text-only mode.
  if image_data_loader is None:
    paired_batches = ((text_batch, None) for text_batch in data_loader)
  else:
    paired_batches = zip(data_loader, image_data_loader)

  with torch.no_grad():
    for text_batch, image_batch in paired_batches:
      model_inputs = {
          key: text_batch[key].to(device)
          for key in ('input_ids', 'attention_mask', 'token_type_ids')
      }
      labels = text_batch['labels'].to(device)
      if image_batch is not None:
        model_inputs['img_vectors'] = image_batch['img_vector'].to(device)

      outputs = model(**model_inputs)
      _, predictions = torch.max(outputs, dim=1)
      all_predictions.append(predictions)
      all_labels.append(labels)
      loss = model_loss(outputs, labels)
      correct_predictions += torch.sum(predictions == labels)
      losses.append(loss.item())
  return correct_predictions.double() / num_examples, np.mean(losses), all_predictions, all_labels

In [None]:
def save_predictions(filename, data_list, predictions_list, labels_to_ids, save_preds_to):
  """Write one JSON file of true and predicted labels per iteration.

  Files go to ``/content/drive/My Drive/<save_preds_to>/<filename>/<i>.json``,
  each holding ``{'true_labels': [...], 'predicted_labels': [...]}``.

  Args:
    filename: name of the sub-folder that collects the per-iteration files.
    data_list: per-iteration frames; the 'nc_type' column gives the true labels.
    predictions_list: per-iteration iterables of predicted labels.
    labels_to_ids: unused; kept for backward compatibility.
    save_preds_to: results folder under the Drive root.
  """
  results_final_path = f'/content/drive/My Drive/{save_preds_to}/{filename}'
  # Creates both directory levels in one call and tolerates existing ones;
  # note the folder is now created even when predictions_list is empty.
  os.makedirs(results_final_path, exist_ok=True)

  for i, predictions in enumerate(predictions_list):
    split_frame = data_list[i]
    print(f'length of prediction series in iteration {i}: {len(predictions)}')
    preds = list(predictions)
    true_labels = list(split_frame['nc_type'])
    predictions_dict = {'true_labels': true_labels, 'predicted_labels' : preds}
    print(preds[-5:])
    new_filename = f'{results_final_path}/{i}.json'
    with open(new_filename, 'w') as outfile:
      json.dump(predictions_dict, outfile)

In [None]:
def update_csv_with_results(history, val_acc, val_f1, val_loss, test_acc, test_f1, test_loss):
  """Append one row with the run configuration and metrics to the results CSV.

  The CSV is created if it does not exist yet. Accuracies are converted to
  plain floats on every device, so tensors never end up in the file as
  their repr.

  Args:
    history: mapping with 'train_acc' and 'train_loss' lists; the last
      entries are recorded.
    val_acc, val_f1, val_loss: validation metrics.
    test_acc, test_f1, test_loss: test metrics.
  """
  train_acc = float(history['train_acc'][-1])
  train_loss = history['train_loss'][-1]
  val_acc = float(val_acc)
  test_acc = float(test_acc)

  new_results = {
      'model_name' : BERT_MODEL_NAME,
      'comp_embedded' : EMB_COMP,
      'dual_sequence' : DUAL_SEQUENCE,
      'split' : SPLIT,
      'grain' : GRAIN,
      'non_binary_removed' : REMOVE_NON_BINARY,
      'non_comp_removed' : REMOVE_NON_COMP,
      'train_acc_final' : train_acc,
      'train_loss_final' : train_loss,
      'val_acc' : val_acc,
      'val_loss' : val_loss,
      'val_f1_weighted' : val_f1,
      'test_acc' : test_acc,
      'test_loss' : test_loss,
      'test_f1_weighted' : test_f1,
      'epochs' : EPOCHS,
      'batch_size' : BATCH_SIZE,
      'max_len' : MAX_LEN,
      'padding' : 'to max len',
      'optimizer' : 'AdamW',
      # Mirrors the optimizer setup in main(): 2e-5 when BERT is fine-tuned,
      # 0.1 when it is frozen. Keep in sync if main() changes.
      'learning rate' : 2e-5 if not FREEZE_BERT else 0.1,
      'correct_bias' : False,
      'loss_function' : 'CrossEntropy'
  }

  csv_filename = '/content/drive/My Drive/Inga\'s Thesis Folder/results_filtered/filtered_BERT_results_unimodal_auto.csv'
  new_row = pd.DataFrame([new_results])
  if path.exists(csv_filename):
    # DataFrame.append no longer exists in pandas 2.x; concat is the replacement.
    results = pd.concat([pd.read_csv(csv_filename), new_row], ignore_index=True)
  else:
    results = new_row

  print(results)
  results.to_csv(csv_filename, index=False)
  print("SAVED RESULTS")

In [None]:
def main(mode: str, iterations=1):
  """Run the full experiment: load data, train, evaluate, save predictions.

  Only ``mode == 'train'`` does any work beyond loading the image model; any
  other value returns after that step. For each of `iterations` repetitions a
  fresh classifier is trained and evaluated on the test split (and on the
  validation split unless JOIN_VAL_AND_TRAIN is set). Per-iteration
  predictions are written via `save_predictions`, and the accuracy, loss and
  weighted F1 averaged over the iterations are printed.

  Reads the module-level configuration constants and MAX_LEN.
  """
  # The image model is needed both for multimodal runs and for filtering the
  # data so unimodal runs see the same examples.
  if MULTIMODAL or UNIMODAL_COMPARABLE:
    image_model = get_image_model(IMAGE_MODEL)
    # NOTE(review): this membership check only prints a message; it looks
    # like leftover debugging -- confirm before removing.
    if 'apps' in image_model:
      print('ITS IN THERE!!')
    else:
      print('NOO SORRY')
 
  if mode == 'train':
    train_data, test_data, val_data, labels_to_ids, ordered_labels = prep_data()
    print('Length of train data:', len(train_data))
    print('Length of test data:', len(test_data))
    print('Length of val data:', len(val_data))

    if JOIN_VAL_AND_TRAIN:
      train_data = pd.concat([train_data, val_data])

    # Drop compounds whose modifier or head has no vector in the image model.
    if MULTIMODAL or UNIMODAL_COMPARABLE:
      train_data = remove_non_existing(train_data, image_model, filter_on=['mod', 'head'])
      test_data = remove_non_existing(test_data, image_model, filter_on=['mod', 'head'])
      if not JOIN_VAL_AND_TRAIN:
        val_data = remove_non_existing(val_data, image_model, filter_on=['mod', 'head'])
      print('FILTERED DATA BY IMAGE MODEL')
      print(f'train length is now {len(train_data)}')
      print(f'test length is now {len(test_data)}')
      if not JOIN_VAL_AND_TRAIN:
        print(f'val length is now {len(val_data)}')

    train_dl, test_dl, val_dl = prep_data_loaders(train_data, test_data, val_data)

    # Image loaders are built from the same (filtered) frames, in the same row
    # order, because they are later zipped with the text loaders.
    if MULTIMODAL:
      train_img_data = prep_image_data(train_data, image_model)
      test_img_data = prep_image_data(test_data, image_model)
      
      if not JOIN_VAL_AND_TRAIN:
        val_img_data = prep_image_data(val_data, image_model)

      train_img_dl = make_image_data_loader(train_img_data, BATCH_SIZE)
      test_img_dl = make_image_data_loader(test_img_data, BATCH_SIZE)
      
      if not JOIN_VAL_AND_TRAIN:
        val_img_dl = make_image_data_loader(val_img_data, BATCH_SIZE)

    num_classes = len(labels_to_ids)

    # Running sums over iterations; divided by `iterations` at the end.
    total_accuracy_test = 0
    total_f1_test = 0
    total_loss_test = 0

    total_accuracy_val = 0
    total_f1_val = 0
    total_loss_val = 0

    # Per-iteration predictions and the frames they belong to, for saving.
    all_test_preds = []
    all_test_data = []

    all_val_preds = []
    all_val_data = []

    for i in range(iterations):
      
      # A fresh model, optimizer and scheduler for every repetition.
      model = NounCompoundClassifier(num_classes=num_classes)
      model.to(device)

      # Frozen BERT: only the output layer is optimized, with a much larger learning rate.
      if not FREEZE_BERT:
        optimizer = transformers.AdamW(model.parameters(), lr=2e-5, correct_bias=False)
      elif FREEZE_BERT:
        optimizer = transformers.AdamW(filter(lambda p: p.requires_grad, model.out.parameters()), lr=0.1, correct_bias=False)
      # The scheduler is stepped once per batch, so the total is batches * epochs.
      num_steps = len(train_dl) * EPOCHS
      scheduler = transformers.get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps = 0,
        num_training_steps = num_steps
      )
      model_loss = torch.nn.CrossEntropyLoss().to(device)
      history = train(model, 
                      scheduler=scheduler, 
                      optimizer=optimizer, 
                      model_loss=model_loss, 
                      train_data=train_data, 
                      train_dl=train_dl,
                      train_img_dl=train_img_dl if MULTIMODAL else None
                      )

      ### TEST DATA ###

      test_acc, test_loss, test_preds, test_labels = evaluate(
        model=model,
        data_loader=test_dl,
        image_data_loader=test_img_dl if MULTIMODAL else None,
        model_loss=model_loss,
        num_examples=len(test_data)
      )

      print(GRAIN, SPLIT)
      print(f'test loss: {test_loss} -- test accuracy: {test_acc}\n')

      

      # Flatten the per-batch tensors into plain lists of ints.
      test_preds_lst = [int(x) for ts in test_preds for x in ts]
      print('length of test preds:', len(test_preds_lst))
      test_labels_lst = [int(x) for ts in test_labels for x in ts]
      # The F1 label set is taken from the training split's label ids.
      unique_label_ids = np.unique(train_data.label_id)
      print(unique_label_ids, '\n', ordered_labels)
      #test_cls_report = classification_report(test_labels_lst, test_preds_lst, labels = unique_label_ids, target_names=ordered_labels, zero_division=0)
      test_f1 = f1_score(test_labels_lst, test_preds_lst, labels=unique_label_ids, average='weighted')
      #print('{0:*^80}'.format(' TEST CLASSFICATION REPORT '))
      #print(test_cls_report)

      total_loss_test += test_loss
      total_accuracy_test += test_acc
      total_f1_test += test_f1

      # Map predicted ids back to label names before storing them.
      ids_to_labels = {v: k for k, v in labels_to_ids.items()}
      test_preds_series = pd.Series(test_preds_lst).replace(ids_to_labels)
      print('length of test_preds_series: ', len(test_preds_series))
      all_test_preds.append(test_preds_series)
      all_test_data.append(test_data)


      ### VAL DATA ###
      if not JOIN_VAL_AND_TRAIN:
        val_acc, val_loss, val_preds, val_labels = evaluate(
          model=model,
          data_loader=val_dl,
          image_data_loader=val_img_dl if MULTIMODAL else None,
          model_loss=model_loss,
          num_examples=len(val_data)
        )

        print(f'Val loss: {val_loss} -- val accuracy: {val_acc}\n')

        val_preds_lst = [int(x) for ts in val_preds for x in ts]
        val_labels_lst = [int(x) for ts in val_labels for x in ts]
        # These two values are computed but not used further in this function.
        unique_preds_val = np.unique(np.array(val_preds_lst))
        unique_labels_val = np.unique(np.array(val_labels_lst))
        #val_cls_report = classification_report(val_labels_lst, val_preds_lst, labels=unique_label_ids, target_names=ordered_labels, zero_division=0)
        val_f1 = f1_score(val_labels_lst, val_preds_lst, labels=unique_label_ids, average='weighted')
        #print('{0:*^80}'.format(' VALIDATION CLASSFICATION REPORT '))
        #print(val_cls_report)

        total_loss_val += val_loss
        total_accuracy_val += val_acc
        total_f1_val += val_f1

        val_preds_series = pd.Series(val_preds_lst).replace(ids_to_labels)
        all_val_preds.append(val_preds_series)
        all_val_data.append(val_data)
      
      # Drop the reference to this iteration's model before the next one is built.
      del model

      ### SAVE GENERAL RESULTS TO A CSV FILE ###
      #update_csv_with_results(history, val_acc, val_f1, val_loss, test_acc, test_f1, test_loss)

    # Name fragments that encode the run configuration in the output folder name.
    if MULTIMODAL:
      mode_name = IMAGE_MODEL
    else:
      mode_name = 'unimodal'
    if FREEZE_BERT:
      freeze = 'frozen'
    else:
      freeze = 'not_frozen'
    test_preds_name = f'BERT_cb_predictions_test_{mode_name}_{freeze}_{BERT_MODEL_NAME}_{SPLIT}_{GRAIN}_{EPOCHS}_{BATCH_SIZE}_{MAX_LEN}_{DUAL_SEQUENCE}_{EMB_COMP}_{VEC_COMBO_MODE}_{MULTI_COMBO_MODE}'
    save_predictions(test_preds_name, all_test_data, all_test_preds, labels_to_ids, SAVE_PREDS_TO)

    avg_accuracy_test = total_accuracy_test / iterations
    avg_loss_test = total_loss_test / iterations
    avg_f1_test = total_f1_test / iterations

    if not JOIN_VAL_AND_TRAIN:
      val_preds_name = f'BERT_cb_predictions_val_{mode_name}_{freeze}_{BERT_MODEL_NAME}_{SPLIT}_{GRAIN}_{EPOCHS}_{BATCH_SIZE}_{MAX_LEN}_{DUAL_SEQUENCE}_{EMB_COMP}_{VEC_COMBO_MODE}_{MULTI_COMBO_MODE}'
      save_predictions(val_preds_name, all_val_data, all_val_preds, labels_to_ids, SAVE_PREDS_TO)
      
      avg_accuracy_val = total_accuracy_val / iterations
      avg_loss_val = total_loss_val / iterations
      avg_f1_val = total_f1_val / iterations
    
    print(f'AVERAGED RESULTS FOR {SPLIT} - {GRAIN} - {EPOCHS} epochs - freeze {FREEZE_BERT} - multimodal {MULTIMODAL} - {IMAGE_MODEL}')
    print(f'TEST RESULTS:')
    print(f'Average accuracy: {avg_accuracy_test} \nAverage loss: {avg_loss_test} \nAverage F1: {avg_f1_test} \n')
    if not JOIN_VAL_AND_TRAIN:
      print(f'VAL RESULTS:')
      print(f'Average accuracy: {avg_accuracy_val} \nAverage loss: {avg_loss_val} \nAverage F1: {avg_f1_val} \n')



In [None]:
# Load all splits once up front, only to size MAX_LEN; main() calls prep_data() again itself.
train_data, test_data, val_data, labels_to_ids, ordered_labels = prep_data()
all_data = pd.concat([train_data, test_data, val_data])
# MAX_LEN is read as a module-level global by prep_data_loaders, train,
# update_csv_with_results and main, so it must be set before main() runs.
MAX_LEN = determine_max_len(all_data)
#MAX_LEN = 189
print(f'max len: {MAX_LEN}')

# Train and evaluate ITERATIONS times and print the averaged metrics.
main(mode='train', iterations=ITERATIONS)
