<a href="https://colab.research.google.com/github/ipietri/w266_Final_Project/blob/master/notebooks/RtGender-Notebooks/Experiments_Excluding_Irrelevant_and_Mixed_Sentiment_Examples/RtGender_BERT_Easy_Data_Augmentation_Techniques_Removed_%26_Mixed_Sentiment_Irrelevant.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

RtGender - Annotations - Sentiment w/ Easy Data Augmentation Techniques
Removed Irrelevant & Mixed Sentiment Examples\
With oversampled data\
[Source](https://github.com/jasonwei20/eda_nlp)

[Paper that explores these techniques](https://arxiv.org/abs/1901.11196)


In [1]:
try:
  from google.colab import drive
  drive.mount('/content/drive', force_remount=True)
  path = r'/content/drive/MyDrive/w266'
except ModuleNotFoundError:
  path = r'data'

Mounted at /content/drive


<a id='section01'></a>
## Load Modules

In [2]:
!pip install -U nltk
import nltk; nltk.download('wordnet')

Collecting nltk
  Downloading nltk-3.6.5-py3-none-any.whl (1.5 MB)
[?25l[K     |▎                               | 10 kB 40.1 MB/s eta 0:00:01[K     |▌                               | 20 kB 47.8 MB/s eta 0:00:01[K     |▊                               | 30 kB 42.0 MB/s eta 0:00:01[K     |█                               | 40 kB 25.6 MB/s eta 0:00:01[K     |█▏                              | 51 kB 16.2 MB/s eta 0:00:01[K     |█▍                              | 61 kB 14.2 MB/s eta 0:00:01[K     |█▋                              | 71 kB 13.2 MB/s eta 0:00:01[K     |█▉                              | 81 kB 14.7 MB/s eta 0:00:01[K     |██                              | 92 kB 13.6 MB/s eta 0:00:01[K     |██▎                             | 102 kB 14.7 MB/s eta 0:00:01[K     |██▌                             | 112 kB 14.7 MB/s eta 0:00:01[K     |██▊                             | 122 kB 14.7 MB/s eta 0:00:01[K     |███                             | 133 kB 14.7 MB/s eta 0:00:01

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
%%capture
#!pip install transformers==3.0.2
!pip install -q transformers

In [4]:
%%capture
!pip install datasets
import datasets 
from datasets import load_dataset, Dataset, DatasetDict

In [5]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn import metrics
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
#from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import logging
logging.basicConfig(level=logging.ERROR)

In [6]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

<a id='section02'></a>
## Import and Reshape Data

In [7]:
train_df = pd.read_csv('/content/drive/MyDrive/w266/train_oversampled.csv')
dev_df = pd.read_csv('/content/drive/MyDrive/w266/annotations_dev.csv')

print('train_shape: ',train_df.shape)
print('dev_shape: ',dev_df.shape)

train_shape:  (21184, 9)
dev_shape:  (2303, 9)


In [8]:
# there are NaNs in the dev dataset remove 
nan_values = dev_df[dev_df.isna().any(axis=1)] 
print(nan_values)

# return without missing values in response_text
dev_df.dropna(subset = ["response_text"], inplace=True)

print("Train shape", train_df.shape)
print("Dev shape", dev_df.shape)

      Unnamed: 0         source op_gender  ...   relevance label labels_4
830         2576  facebook_wiki         M  ...  Irrelevant     1        1
1664        2722  facebook_wiki         W  ...  Irrelevant     1        1

[2 rows x 9 columns]
Train shape (21184, 9)
Dev shape (2301, 9)


In [9]:
print("Unique sentiments: ", train_df['sentiment'].unique())

Unique sentiments:  ['Positive' 'Mixed' 'Neutral' 'Negative']


In [10]:
# remove mixed sentiment and irrelevant examples
train_df  = train_df[train_df['relevance'] != 'Irrelevant']
dev_df = dev_df[dev_df['relevance'] != 'Irrelevant']
train_df  = train_df[train_df['sentiment'] != 'Mixed']
dev_df = dev_df[dev_df['sentiment'] != 'Mixed']

print('updated train_shape: ',train_df.shape)
print('updated dev_shape: ',dev_df.shape)
print("Unique sentiments: ", train_df['sentiment'].unique())

updated train_shape:  (13576, 9)
updated dev_shape:  (1821, 9)
Unique sentiments:  ['Positive' 'Neutral' 'Negative']


# Run through Easy Data Augmentation Techniques package
[augment](https://github.com/jasonwei20/eda_nlp/blob/master/code/augment.py)


#### Import Modules

In [11]:
# import eda script from github
!git clone https://github.com/jasonwei20/eda_nlp

Cloning into 'eda_nlp'...
remote: Enumerating objects: 396, done.[K
remote: Counting objects: 100% (17/17), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 396 (delta 6), reused 8 (delta 4), pack-reused 379[K
Receiving objects: 100% (396/396), 20.42 MiB | 25.16 MiB/s, done.
Resolving deltas: 100% (189/189), done.


In [12]:
import sys
# sys.path is a list of absolute path strings
sys.path.append('/content/eda_nlp/code/')

from eda import *


#### Random deletion & Synonym Replacement
Randomly delete words from the sentence with probability p\
Run through the EDA three times.

In [50]:
def run_del_and_syn_replacement(train_df, iterations):
  augmented_sentences = []
  labels_list = []
  alpha_sr=0.1

  for i in range(iterations):
    for idx in range(len(train_df)):
      example = train_df.iloc[idx]
      labels_list.append(example.labels_4)
      sentence = example.response_text
      
      words = sentence.split(' ')
      # apply random deletion 
      a_words = random_deletion(words, 0.1)
      
      # replace a word with synonyms
      num_words = len(a_words)
      n_sr = max(1, int(alpha_sr*num_words))
      b_words = synonym_replacement(a_words, n_sr)
      augmented_sentences.append(' '.join(b_words))
      
  return augmented_sentences, labels_list


In [56]:
 augmented_sentences, labels_list = run_del_and_syn_replacement(train_df, 3)
 print("number of synthetic examples: ", len(labels_list)/len(train_df))

 

number of synthetic examples:  3.0


In [58]:
# combine with original dataframe
augmented = pd.DataFrame({'response_text': augmented_sentences,
                           'labels_4': labels_list})
original = train_df[['labels_4', 'response_text']]

train_df_aug = original.append(augmented)
print("number of total examples: ", len(train_df_aug))


number of total examples:  54304


# Transform to HuggingFace friendly format

In [59]:
# change to dataset to work with Huggingface transformer & remove unused columns

from datasets import load_dataset
train_dataset = Dataset.from_pandas(train_df_aug)
dev_dataset = Dataset.from_pandas(dev_df)

train_dataset = train_dataset.remove_columns(column_names= '__index_level_0__')
dev_dataset = dev_dataset.remove_columns(column_names= columns_to_remove)
dev_dataset = dev_dataset.remove_columns(column_names= '__index_level_0__')

# rename sentiment to labels
train_dataset = train_dataset.rename_column("labels_4", "label")
dev_dataset = dev_dataset.rename_column("labels_4", "label")

In [60]:
# combine into a DataDictionary for huggingface use
rtg_dataset = DatasetDict({
    'train': train_dataset,
    'dev': dev_dataset 
})

rtg_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'response_text'],
        num_rows: 54304
    })
    dev: Dataset({
        features: ['response_text', 'label'],
        num_rows: 1821
    })
})

## Tokenize

In [61]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [62]:
# find the P99 of length for response_text and set that as the max length 
max_length = train_df['response_text'].astype(str).map(len).quantile(0.99)
print(f"99th %tile of response_text length: {max_length}")

99th %tile of response_text length: 286.0


In [63]:
from transformers import AutoTokenizer

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, max_length = max_length)

def tokenize(batch):
    return tokenizer(batch["response_text"], padding=True, truncation=True)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_length": 286.0,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from

In [64]:
rtg_encoded = rtg_dataset.map(tokenize, batched=True, batch_size=None)
rtg_encoded['train'].features

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'label': Value(dtype='int64', id=None),
 'response_text': Value(dtype='string', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

# Model

In [65]:
from transformers import AutoModelForSequenceClassification
num_labels = 3
model = (AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device))

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.5",
  "type_vocab_size":

In [68]:
rtg_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])


In [69]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1_weighted = f1_score(labels, preds, average="weighted")
    f1_macro = f1_score(labels, preds, average = 'macro')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1_weighted, "f1_macro": f1_macro} 

In [70]:
from transformers import Trainer, TrainingArguments

batch_size = 8
logging_steps = len(rtg_encoded["train"]) // (batch_size *3)
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=True,
                                 # metric_for_best_model="f1_macro",
                                 # weight_decay=0.01,
                                  evaluation_strategy="steps",
                                  save_strategy="steps",
                                  disable_tqdm=False
                                  )

using `logging_steps` to initialize `eval_steps` to 500
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [71]:
from sklearn.metrics import classification_report

accuracy_list = []
weighted_f1_score_list = []
macro_f1_score_list = []
negative_f1_score = []
neutral_f1_score = []
positive_f1_score = []


for i in range(5):
  try:
    del trainer
    del results
    del cr
  except: pass


  trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=rtg_encoded["train"],
                  eval_dataset=rtg_encoded["dev"])
  trainer.train()
  results = trainer.evaluate()

  # append macro metrics to lists
  accuracy_list.append(results.get('eval_accuracy'))
  weighted_f1_score_list.append(results.get("eval_f1"))
  macro_f1_score_list.append(results.get("eval_f1_macro"))

  trainer.predict(rtg_encoded["dev"])
  # append the class-level F1 scores
  outputs = trainer.predict(rtg_encoded["dev"])
  predictions = outputs.predictions.argmax(1)
  labels = rtg_encoded["dev"]['label']
  cr = classification_report(labels, predictions, digits=3, output_dict=True)
  negative_f1_score.append(cr.get('0').get("f1-score"))
  neutral_f1_score.append(cr.get('1').get("f1-score"))
  positive_f1_score.append(cr.get('2').get("f1-score"))

  print(f'---------------------------Iteration {i+1} Complete---------------------------\n')

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running training *****
  Num examples = 54304
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13576


Step,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
500,0.7555,0.617552,0.74849,0.754031,0.691613
1000,0.6075,0.617604,0.75508,0.757117,0.695015
1500,0.5079,0.661896,0.760571,0.760942,0.698577
2000,0.4734,0.701221,0.760022,0.763452,0.704021
2500,0.4363,0.892119,0.749588,0.744314,0.672996
3000,0.3999,0.805987,0.762219,0.759872,0.69343
3500,0.3759,0.863002,0.762219,0.766185,0.708597
4000,0.3714,0.91427,0.759473,0.758337,0.692721
4500,0.3657,0.960575,0.750137,0.756638,0.700215
5000,0.306,1.018252,0.764964,0.760953,0.698559


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 1821
  Batch size = 8
Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 1821
  Batch size = 8
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation **

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 1821
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 1821
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running training *****
  Num examples = 54304
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13576


---------------------------Iteration 1 Complete---------------------------



Step,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
500,0.3095,1.225323,0.710599,0.724204,0.665406
1000,0.6048,0.616496,0.762219,0.763128,0.703144
1500,0.4875,0.701295,0.750686,0.750171,0.684892
2000,0.4572,0.751717,0.766612,0.765937,0.705459
2500,0.4137,0.959337,0.744646,0.742063,0.672775
3000,0.3925,0.865063,0.75453,0.75132,0.685529
3500,0.3564,0.96771,0.757825,0.758432,0.696252
4000,0.3529,0.992793,0.760022,0.75513,0.691194
4500,0.3424,1.028133,0.75453,0.756382,0.695206
5000,0.3192,1.09777,0.758375,0.747485,0.681382


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 1821
  Batch size = 8
Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 1821
  Batch size = 8
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation **

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 1821
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 1821
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running training *****
  Num examples = 54304
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13576


---------------------------Iteration 2 Complete---------------------------



Step,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
500,0.1192,1.755886,0.70291,0.717951,0.66008
1000,0.1889,1.40112,0.747392,0.750329,0.688682
1500,0.5154,0.671573,0.753981,0.755287,0.691384
2000,0.4358,0.803046,0.753981,0.757434,0.695656
2500,0.4088,0.974176,0.764415,0.756091,0.687088
3000,0.3836,0.853063,0.760571,0.754081,0.687259
3500,0.3472,1.077794,0.758924,0.761387,0.700392
4000,0.3404,1.060785,0.746842,0.743735,0.674564
4500,0.3397,1.089638,0.755629,0.753291,0.690008
5000,0.2949,1.127014,0.760571,0.756279,0.693682


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 1821
  Batch size = 8
Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 1821
  Batch size = 8
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation **

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 1821
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 1821
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running training *****
  Num examples = 54304
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13576


---------------------------Iteration 3 Complete---------------------------



Step,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
500,0.0529,1.777084,0.742998,0.746182,0.681517
1000,0.0933,1.732805,0.757825,0.751023,0.68632
1500,0.2014,1.599025,0.750686,0.750194,0.684224
2000,0.4776,0.860748,0.76112,0.761415,0.697476
2500,0.3914,1.054086,0.752334,0.742416,0.669168
3000,0.3902,0.940853,0.752334,0.746013,0.676071
3500,0.3344,1.118318,0.751785,0.756944,0.697065
4000,0.3289,1.08215,0.753432,0.751015,0.685712
4500,0.3298,1.148284,0.749588,0.748596,0.68365
5000,0.2793,1.259982,0.753981,0.740736,0.670309


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 1821
  Batch size = 8
Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 1821
  Batch size = 8
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation **

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 1821
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 1821
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running training *****
  Num examples = 54304
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 13576


---------------------------Iteration 4 Complete---------------------------



Step,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
500,0.0396,1.886578,0.747941,0.748635,0.681478
1000,0.0591,2.106729,0.749039,0.733189,0.656463
1500,0.1039,1.833354,0.758924,0.753284,0.689305
2000,0.1963,1.48251,0.743548,0.749776,0.690779
2500,0.4348,1.028764,0.751785,0.736282,0.66308
3000,0.3709,1.043404,0.756727,0.746115,0.679115
3500,0.3219,1.24182,0.739154,0.746149,0.686199
4000,0.291,1.154687,0.758924,0.75515,0.691757
4500,0.31,1.22213,0.750686,0.748012,0.684362
5000,0.2616,1.300305,0.751785,0.742565,0.676386


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 1821
  Batch size = 8
Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 1821
  Batch size = 8
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation **

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 1821
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 1821
  Batch size = 8


---------------------------Iteration 5 Complete---------------------------



In [72]:
import statistics

print("%15s %s (%s)" % ("","Mean", "StDev"))

print("-"*29)
print("Macro Scores")
print("-"*29)

print(f"%15s %s (%s)" %("Accuracy",
    round(statistics.mean(accuracy_list),3),
    round(statistics.stdev(accuracy_list),3)))
print(f"%15s %5s (%s)" %("Macro F1",
    round(statistics.mean(macro_f1_score_list),3),
    round(statistics.stdev(macro_f1_score_list),3)))
print(f"%15s %5s (%s)" %("Weighted F1",
    round(statistics.mean(weighted_f1_score_list),3),
    round(statistics.stdev(weighted_f1_score_list),3)))

print("-"*29)
print("Class Scores")
print("-"*29)

print(f"%15s %s (%s)" %("Positive",
    round(statistics.mean(positive_f1_score),3),
    round(statistics.stdev(positive_f1_score),3)))
print(f"%15s %5s (%s)" %("Neutral",
    round(statistics.mean(neutral_f1_score),3),
    round(statistics.stdev(neutral_f1_score),3)))
print(f"%15s %5s (%s)" %("Negative",
    round(statistics.mean(negative_f1_score),3),
    round(statistics.stdev(negative_f1_score),3)))


                Mean (StDev)
-----------------------------
Macro Scores
-----------------------------
       Accuracy 0.756 (0.006)
       Macro F1 0.689 (0.015)
    Weighted F1 0.754 (0.011)
-----------------------------
Class Scores
-----------------------------
       Positive 0.862 (0.002)
        Neutral 0.555 (0.056)
       Negative 0.651 (0.016)
