<a href="https://colab.research.google.com/github/ipietri/w266_Final_Project/blob/master/notebooks/TRAC-2-notebooks/grayscaling/TRAC_2_Track_A_Grayscaling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TRAC-2 - Track A w/ Grayscaling
---
[Code Source](https://github.com/ainagari/scalar_adjs)

* [BERT Knows Punta Cana is not just beautiful, it’s gorgeous:
Ranking Scalar Adjectives with Contextualised Representations](https://aclanthology.org/2020.emnlp-main.598.pdf)\
*[Scalar Adjective Identification and Multilingual Ranking
](https://arxiv.org/abs/2105.01180)\
*[Identifying and Ordering Scalar Adjectives Using Lexical Substitution](https://www.proquest.com/openview/aade435a5bbdcf41e2b8c24e648826cc/1.pdf?pq-origsite=gscholar&cbl=18750)\
*[A Gold Standard for Scalar Adjectives](https://aclanthology.org/L16-1424/)


In [1]:
try:
  from google.colab import drive
  drive.mount('/content/drive', force_remount=True)
  path = r'/content/drive/MyDrive/w266'
except ModuleNotFoundError:
  path = r'data'

Mounted at /content/drive


<a id='section01'></a>
# Load Modules

In [2]:
#!pip install -U nltk
import nltk; nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
%%capture
!pip install -q transformers

In [4]:
%%capture
!pip install datasets
import datasets 
from datasets import load_dataset, Dataset, DatasetDict

In [5]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn import metrics
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
#from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import logging
logging.basicConfig(level=logging.ERROR)

In [6]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [7]:
from nltk.corpus import wordnet as wn
import gzip
import pickle
import numpy as np
import sys
from scipy.spatial.distance import cosine
from operator import itemgetter
from collections import defaultdict
#from pymagnitude import *
import argparse

import itertools

In [8]:
# import eda script from github
!git clone https://github.com/ainagari/scalar_adjs

fatal: destination path 'scalar_adjs' already exists and is not an empty directory.


In [9]:
# import fucntions from scalar_adjs
import sys

# sys.path is a list of absolute path strings
sys.path.append('/content/scalar_adjs/')

from read_scalar_datasets import read_scales


In [10]:
!ls

drive  sample_data  scalar_adjs


In [11]:
# get the greyscaling augmentation script from our repo
!git clone https://github.com/ipietri/w266_Final_Project
#sys.path.append('/content/')

Cloning into 'w266_Final_Project'...
fatal: could not read Username for 'https://github.com': No such device or address


<a id='section02'></a>
# Import and Preprocess Data

In [12]:
trac2_task_a = pd.read_csv('/content/drive/MyDrive/w266/task_A_data_oversampled.csv')

print('TASK A: ',trac2_task_a.shape)


trac2_dev = pd.read_csv('/content/drive/MyDrive/w266/trac2_eng_dev.csv')
print('dev: ',trac2_dev.shape)
print("TASK A unique sentiments: ", trac2_task_a['Sub-task A'].unique())

TASK A:  (10125, 2)
dev:  (1066, 4)
TASK A unique sentiments:  ['NAG' 'CAG' 'OAG']


In [13]:
# create a dev dataset for track a and rename columns
trac2_task_a_dev = trac2_dev[['Text','Sub-task A']]
trac2_task_a.rename(columns = {'Text':'text', 'Sub-task A': 'label'}, inplace = True)
trac2_task_a_dev.rename(columns = {'Text':'text', 'Sub-task A': 'label'}, inplace = True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [14]:
# remove NaNs and return without missing values in response_text

# TRACK A
# train 
trac2_task_a.dropna(subset = ['text'], inplace=True)

# dev
trac2_task_a_dev.dropna(subset = ['text'], inplace=True)

print('train: ',trac2_task_a.shape)
print('dev: ',trac2_task_a_dev.shape)

train:  (10125, 2)
dev:  (1066, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [15]:
# Dummy variables

# task A
task_a_labels = {'NAG':0, 'OAG': 1, 'CAG':2}
trac2_task_a['label'] = trac2_task_a['label'].map(task_a_labels).astype(int)
trac2_task_a_dev['label'] = trac2_task_a_dev['label'].map(task_a_labels).astype(int)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


# Greyscale
Adapted from Scalar Adj Code # [Extract Relevant Text](https://github.com/ainagari/scalar_adjs/blob/master/extract_flickr_scalar.py)



In [16]:
import pickle
import pdb
import spacy
import os
import collections

nlp = spacy.load("en_core_web_sm")

language_str = "en" #set to english -- one dataset in English only

### Identify the location of every word present the three types of scales
Extract and save out as a dictionary for every example that contains at least one scaled word. Adapted from extract_flickr_scalar.py

In [17]:
rankings = dict()
datanames = ["demelo", "crowd", "wilkinson"]

import dill
filename = "/content/drive/MyDrive/w266/scales_with_milder_option.pickle"
scales_with_milder_option = dill.load(open(filename, "rb"))

In [18]:
rankings = dict()
datanames = ["demelo", "crowd", "wilkinson"]
for dataname in datanames:
    r = read_scales("/content/scalar_adjs/data/" + dataname + "/gold_rankings/")
    rankings[dataname] = r

my_words = set()
for dataname in rankings:
    for scale in rankings[dataname]:
        for word in rankings[dataname][scale]:
            words = word.split(" || ")
            for w in words:
                my_words.add(w)

word_sentence_dict = dict()
for word in my_words:
    word_sentence_dict[word] = set()

def accepted_pos(pos):
    if pos in ["ADJ","ADV", "ADP","VERB","DET"]:
        return True
    return False


## Identify the position of the word which exists in any of the scales

  Since there are ties in our scales
  the milder word may be one or more words.
  So if the original word is foo || bar and the 
  milder word is foolish || barish this 
  {foo: foolish, foo: barish, bar: foolish,  bar: barish}

In [19]:
# create a nested dictionary with every scale, scale list with equalities, and all words in the scale

import collections
scales_dict = collections.defaultdict(dict)

for dataname in datanames:
  for scale_file_name, scale in rankings[dataname].items():
    words_in_scale = []
    for ws in scale:
      # split if there are ties
      words_in_scale.extend(ws.split(" || "))
    scales_dict[dataname][str(scale)] = tuple(words_in_scale)

In [20]:
def locate_scale_word(test_col, label_col):
  '''
  return a column with any scale_words 
  in the text column and their position
  '''
  more_test = collections.defaultdict(lambda: collections.defaultdict(dict))
  for data_name in datanames:
      for scale_name, words in scales_dict[data_name].items():
        for word in words:
          # convert text into a list of words to avoid partial matches found using .find()
          
          sentence_words = test_col.replace("'", "") 
          sentence_words = sentence_words.lower().split()
          
          if word in sentence_words:
            pos = sentence_words.index(word)
            # assume only one word to be replaced
            more_test[data_name][word]['position'] = int(pos)
            more_test[data_name][word]['milder_words'] = scales_with_milder_option[data_name][word]
            

  return more_test

In [21]:
# convert relevant df columns to dictionary

# task A
trac2_task_a['new_col'] = trac2_task_a.apply(lambda x: locate_scale_word(x['text'], x['label']), axis = 1)
dict_from_df_a = trac2_task_a.to_dict('index')


# Augment

In [72]:
print(dict_from_df.get(512))

{'text': '@Achyuth Thouta I DO AGREE..  \nSouth Indians are GREAT', 'label': 0, 'new_col': defaultdict(<function locate_scale_word.<locals>.<lambda> at 0x7fe634a419e0>, {'wilkinson': defaultdict(<class 'dict'>, {'great': {'position': 9, 'milder_words': ['good']}}), 'demelo': defaultdict(<class 'dict'>, {}), 'crowd': defaultdict(<class 'dict'>, {})})}


In [None]:
%cd "/content/drive/MyDrive/w266"
%run grey_scale_augmentation.ipynb
labels_list, augmented_text_list, id_list = augment_greyscaling(dict_from_df_a, datanames, 'text', 'label')




In [23]:
# append the augmented data plus labels

new_df = pd.DataFrame({'id':id_list, 'text':augmented_text_list, 
              'label':labels_list})
print("Number of augmented examples: ", len(new_df))

# add an example id column
trac2_task_a['id'] = trac2_task_a.index

# add labels indicating original vs augmented examples
trac2_task_a['is_og'] = 1
new_df['is_og'] = 0

# append to the original examples and create new augmented dataframe
train_df_aug = new_df.append(trac2_task_a)


Number of augmented examples:  11966


In [26]:
train_df_aug

Unnamed: 0,id,text,label,is_og,new_col
0,3,What the fuck was this? I respect shwetabh and...,0,0,
1,3,What the fuck was this? I respect shwetabh and...,0,0,
2,4,interested authorities should bring arundathi ...,0,0,
3,5,It seems like these people want to be known no...,0,0,
4,16,Yeah man **Fuck Bollywood** and become **matur...,0,0,
...,...,...,...,...,...
10120,10120,Ki ato ranu mondol...ranu mondol.....disgustin...,1,1,"{'demelo': {}, 'crowd': {}, 'wilkinson': {}}"
10121,10121,I hate ranu mandal song,1,1,"{'demelo': {}, 'crowd': {}, 'wilkinson': {}}"
10122,10122,You are out of your mind idiot \nYou showing ...,1,1,"{'demelo': {'negative': {'position': 10, 'mild..."
10123,10123,Who is Arundhati roy?? Is she above constituti...,1,1,"{'demelo': {'shut': {'position': 32, 'milder_w..."


In [27]:
# save out greyscale adjusted dataset
train_df_aug.to_csv('/content/drive/MyDrive/w266/grey_scaled_augmented_oversampled_subtask_a_train_data.csv', index=False)
print("Total number of examples: ", len(train_df_aug))

Total number of examples:  22091


# Pre-process - Transform into Hugging Friendly Format

In [None]:
trac2_task_a_aug = pd.read_csv('/content/drive/MyDrive/w266/grey_scaled_augmented_oversampled_subtask_a_train_data.csv')
trac2_task_a_aug = trac2_task_a_aug.drop(['new_col'], axis = 1)


In [None]:
# change to dataset to work with Huggingface transformer & remove unused columns

train_dataset = Dataset.from_pandas(trac2_task_a_aug)
dev_dataset = Dataset.from_pandas(trac2_task_a_dev)
dev_dataset = dev_dataset.remove_columns(column_names= '__index_level_0__')

# combine into a DataDictionary for huggingface use
dataset = DatasetDict({
    'train': train_dataset,
    'dev': dev_dataset 
})


dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'text'],
        num_rows: 20664
    })
    dev: Dataset({
        features: ['text', 'label'],
        num_rows: 1066
    })
})

# Tokenize 

In [None]:
from transformers import AutoTokenizer

model_name = "bert-base-uncased"
max_length = 150
tokenizer = AutoTokenizer.from_pretrained(model_name, max_length = int(max_length))

def tokenize(batch):
    return tokenizer(batch["text"], padding=True, truncation=True)

dataset_encoded = dataset.map(tokenize, batched=True, batch_size=None)    

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_length": 150,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from c

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

# Model

In [None]:
from transformers import AutoModelForSequenceClassification
num_labels = 3
epochs = 2
iterations = 5
model = (AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device))
dataset_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size":

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1_weighted = f1_score(labels, preds, average="weighted")
    f1_macro = f1_score(labels, preds, average = 'macro')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1_weighted, "f1_macro": f1_macro} 

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 8
logging_steps = len(dataset_encoded["train"]) // batch_size
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=epochs,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=True,
                                 # metric_for_best_model="f1_macro",
                                 # weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False
                                  )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
from sklearn.metrics import classification_report

accuracy_list = []
weighted_f1_score_list = []
macro_f1_score_list = []
NAG_f1_score = []
OAG_f1_score = []
CAG_f1_score = []

for i in range(iterations):
  try:
    del trainer
    del results
    del cr
  except: pass


  trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=dataset_encoded["train"],
                  eval_dataset=dataset_encoded["dev"])
  trainer.train()
  results = trainer.evaluate()

  # append macro metrics to lists
  accuracy_list.append(results.get('eval_accuracy'))
  weighted_f1_score_list.append(results.get("eval_f1"))
  macro_f1_score_list.append(results.get("eval_f1_macro"))

  trainer.predict(dataset_encoded["dev"])
  # append the class-level F1 scores
  outputs = trainer.predict(dataset_encoded["dev"])
  predictions = outputs.predictions.argmax(1)
  labels = dataset_encoded["dev"]['label']
  cr = classification_report(labels, predictions, digits=3, output_dict=True)
  NAG_f1_score.append(cr.get('0').get("f1-score"))
  OAG_f1_score.append(cr.get('1').get("f1-score"))
  CAG_f1_score.append(cr.get('2').get("f1-score"))
  
  print(f'---------------------------Iteration {i+1} Complete---------------------------\n')

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 20664
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5166


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.109,1.303187,0.788931,0.785937,0.584464
2,0.0302,1.522091,0.80394,0.786761,0.568868


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1066
  Batch size = 8
Saving model checkpoint to results/checkpoint-2583
Configuration saved in results/checkpoint-2583/config.json
Model weights saved in results/checkpoint-2583/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1066
  Batch size = 8
Saving model checkpoint to results/checkpoint-5166
Configuration saved in results/checkpoint-5166/config.json
Model weights saved in results/checkpoint-5166/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results/checkpoint-2583 (score: 1.303187370300293).
The following columns in the evaluation se

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1066
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1066
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 20664
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5166


---------------------------Iteration 1 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.0384,1.860684,0.75985,0.773819,0.572661
2,0.0177,1.775616,0.788931,0.779904,0.552732


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1066
  Batch size = 8
Saving model checkpoint to results/checkpoint-2583
Configuration saved in results/checkpoint-2583/config.json
Model weights saved in results/checkpoint-2583/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1066
  Batch size = 8
Saving model checkpoint to results/checkpoint-5166
Configuration saved in results/checkpoint-5166/config.json
Model weights saved in results/checkpoint-5166/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results/checkpoint-5166 (score: 1.7756156921386719).
The following columns in the evaluation s

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1066
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1066
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 20664
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5166


---------------------------Iteration 2 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.0099,2.158555,0.774859,0.780187,0.570642
2,0.0127,2.007052,0.797373,0.792885,0.57996


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1066
  Batch size = 8
Saving model checkpoint to results/checkpoint-2583
Configuration saved in results/checkpoint-2583/config.json
Model weights saved in results/checkpoint-2583/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1066
  Batch size = 8
Saving model checkpoint to results/checkpoint-5166
Configuration saved in results/checkpoint-5166/config.json
Model weights saved in results/checkpoint-5166/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results/checkpoint-5166 (score: 2.007052183151245).
The following columns in the evaluation se

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1066
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1066
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 20664
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5166


---------------------------Iteration 3 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.0132,2.179323,0.785178,0.785715,0.581594
2,0.0041,2.134393,0.786116,0.788062,0.578146


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1066
  Batch size = 8
Saving model checkpoint to results/checkpoint-2583
Configuration saved in results/checkpoint-2583/config.json
Model weights saved in results/checkpoint-2583/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1066
  Batch size = 8
Saving model checkpoint to results/checkpoint-5166
Configuration saved in results/checkpoint-5166/config.json
Model weights saved in results/checkpoint-5166/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results/checkpoint-5166 (score: 2.134392738342285).
The following columns in the evaluation se

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1066
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1066
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running training *****
  Num examples = 20664
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5166


---------------------------Iteration 4 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.0243,2.194261,0.788931,0.789152,0.584637
2,0.0046,2.174869,0.79925,0.788663,0.565519


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1066
  Batch size = 8
Saving model checkpoint to results/checkpoint-2583
Configuration saved in results/checkpoint-2583/config.json
Model weights saved in results/checkpoint-2583/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Evaluation *****
  Num examples = 1066
  Batch size = 8
Saving model checkpoint to results/checkpoint-5166
Configuration saved in results/checkpoint-5166/config.json
Model weights saved in results/checkpoint-5166/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


Loading best model from results/checkpoint-5166 (score: 2.1748692989349365).
The following columns in the evaluation s

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1066
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: text.
***** Running Prediction *****
  Num examples = 1066
  Batch size = 8


---------------------------Iteration 5 Complete---------------------------



# Evaluate

In [None]:
import statistics

print("%15s %s (%s)" % ("","Mean", "StDev"))

print("-"*29)
print("Macro Scores")
print("-"*29)

print(f"%15s %s (%s)" %("Accuracy",
    round(statistics.mean(accuracy_list),3),
    round(statistics.stdev(accuracy_list),3)))
print(f"%15s %5s (%s)" %("Macro F1",
    round(statistics.mean(macro_f1_score_list),3),
    round(statistics.stdev(macro_f1_score_list),3)))
print(f"%15s %5s (%s)" %("Weighted F1",
    round(statistics.mean(weighted_f1_score_list),3),
    round(statistics.stdev(weighted_f1_score_list),3)))

print("-"*29)
print("Class Scores")
print("-"*29)

print(f"%15s %s (%s)" %("NAG",
    round(statistics.mean(NAG_f1_score),3),
    round(statistics.stdev(NAG_f1_score),3)))
print(f"%15s %5s (%s)" %("OAG",
    round(statistics.mean(OAG_f1_score),3),
    round(statistics.stdev(OAG_f1_score),3)))
print(f"%15s %5s (%s)" %("CAG",
    round(statistics.mean(CAG_f1_score),3),
    round(statistics.stdev(CAG_f1_score),3)))

                Mean (StDev)
-----------------------------
Macro Scores
-----------------------------
       Accuracy 0.792 (0.006)
       Macro F1 0.572 (0.013)
    Weighted F1 0.787 (0.005)
-----------------------------
Class Scores
-----------------------------
            NAG 0.89 (0.005)
            OAG 0.471 (0.02)
            CAG 0.355 (0.042)
