<a href="https://colab.research.google.com/github/ipietri/w266_Final_Project/blob/master/notebooks/RtGender-Notebooks/RtGender_Annotations_Sentiment_Grayscaling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RtGender - Annotations - Sentiment w/ Grayscaling
[Code Source](https://github.com/ainagari/scalar_adjs)

* [BERT Knows Punta Cana is not just beautiful, it’s gorgeous:
Ranking Scalar Adjectives with Contextualised Representations](https://aclanthology.org/2020.emnlp-main.598.pdf)\
*[Scalar Adjective Identification and Multilingual Ranking
](https://arxiv.org/abs/2105.01180)\
*[Identifying and Ordering Scalar Adjectives Using Lexical Substitution](https://www.proquest.com/openview/aade435a5bbdcf41e2b8c24e648826cc/1.pdf?pq-origsite=gscholar&cbl=18750)\
*[A Gold Standard for Scalar Adjectives](https://aclanthology.org/L16-1424/)


In [1]:
try:
  from google.colab import drive
  drive.mount('/content/drive', force_remount=True)
  path = r'/content/drive/MyDrive/w266'
except ModuleNotFoundError:
  path = r'data'

Mounted at /content/drive


<a id='section01'></a>
## Load Modules

In [2]:
#!pip install -U nltk
import nltk; nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
%%capture
#!pip install transformers==3.0.2
!pip install -q transformers
#!pip install pymagnitude

In [4]:
%%capture
!pip install datasets
import datasets 
from datasets import load_dataset, Dataset, DatasetDict

In [5]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn import metrics
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
#from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import logging
logging.basicConfig(level=logging.ERROR)

In [6]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [7]:
from nltk.corpus import wordnet as wn
import gzip
import pickle
import numpy as np
import sys
from scipy.spatial.distance import cosine
from operator import itemgetter
from collections import defaultdict
#from pymagnitude import *
import argparse

import itertools

In [8]:
# import eda script from github
!git clone https://github.com/ainagari/scalar_adjs

Cloning into 'scalar_adjs'...
remote: Enumerating objects: 854, done.[K
remote: Counting objects: 100% (438/438), done.[K
remote: Compressing objects: 100% (189/189), done.[K
remote: Total 854 (delta 91), reused 358 (delta 45), pack-reused 416[K
Receiving objects: 100% (854/854), 13.47 MiB | 4.40 MiB/s, done.
Resolving deltas: 100% (129/129), done.


In [9]:
# import fucntions from scalar_adjs
import sys

# sys.path is a list of absolute path strings
sys.path.append('/content/scalar_adjs/')

from read_scalar_datasets import read_scales


<a id='section02'></a>
# Import and Reshape Data

In [10]:
train_df = pd.read_csv('/content/drive/MyDrive/w266/train_oversampled.csv')
dev_df = pd.read_csv('/content/drive/MyDrive/w266/annotations_dev.csv')

print('train_shape: ',train_df.shape)
print('dev_shape: ',dev_df.shape)

train_shape:  (21184, 9)
dev_shape:  (2303, 9)


In [11]:
train_df

Unnamed: 0.1,Unnamed: 0,source,op_gender,post_text,response_text,sentiment,relevance,label,labels_4
0,3845,facebook_congress,W,Im reading the 3/1 GAO report that finds billi...,Thank you Congresswoman Bass. Keep up the grea...,Positive,ContentPoster,2,2
1,9743,fitocracy,M,Being followed by the famous DBJ? Quite an honor.,"Well, I am very honored you feel so honored",Positive,Content,2,2
2,13041,ted,W,"Penelope Boston gave a talk about Planets, exp...",Her opinions seem driven by wishful thinking. ...,Mixed,Content,1,3
3,4265,facebook_congress,W,Congress must act to help the 41 million Ameri...,There's no other way out of the enormity excep...,Positive,Content,2,2
4,13145,ted,W,"Pardis Sabeti gave a talk about Africa, big pr...",What were the benefits of the larger community...,Mixed,Content,1,3
...,...,...,...,...,...,...,...,...,...
21179,6561,facebook_congress,W,It was terrific to have the chance to hear fro...,"""Committed to making sure we don't lose our he...",Negative,Poster,0,0
21180,3829,facebook_congress,M,Johnny will join Tim Bryant on WGAU 1340 AM (A...,Both Isakson and Chambliss voted to TABLE Rand...,Negative,Poster,0,0
21181,11700,reddit,W,It wouldnt disintegrate you.,I think a magic beam of pure light would disin...,Negative,Content,0,0
21182,5668,facebook_congress,M,Our contest for a chance to attend a special c...,I'd rather have a root canal . . .,Negative,ContentPoster,0,0


In [12]:
# there are NaNs in the dev dataset remove 
nan_values = dev_df[dev_df.isna().any(axis=1)] 
print(nan_values)

# return without missing values in response_text
dev_df.dropna(subset = ["response_text"], inplace=True)

print("Train shape", train_df.shape)
print("Dev shape", dev_df.shape)

      Unnamed: 0         source op_gender  ...   relevance label labels_4
830         2576  facebook_wiki         M  ...  Irrelevant     1        1
1664        2722  facebook_wiki         W  ...  Irrelevant     1        1

[2 rows x 9 columns]
Train shape (21184, 9)
Dev shape (2301, 9)


In [13]:
print("Unique sentiments: ", train_df['sentiment'].unique())

Unique sentiments:  ['Positive' 'Mixed' 'Neutral' 'Negative']


# Greyscale
Adapted from Scalar Adj Code



# [Extract Relevant Text](https://github.com/ainagari/scalar_adjs/blob/master/extract_flickr_scalar.py)

In [14]:
import pickle
import pdb
import spacy
import os

nlp = spacy.load("en_core_web_sm")

language_str = "en" #set to english -- one dataset in English only

### Identify the location of every word present the three types of scales
Extract and save out as a dictionary for every example that contains at least one scaled word. Adapted from extract_flickr_scalar.py

In [15]:
rankings = dict()
datanames = ["demelo", "crowd", "wilkinson"]
for dataname in datanames:
    r = read_scales("/content/scalar_adjs/data/" + dataname + "/gold_rankings/")
    rankings[dataname] = r

my_words = set()
for dataname in rankings:
    for scale in rankings[dataname]:
        for word in rankings[dataname][scale]:
            words = word.split(" || ")
            for w in words:
                my_words.add(w)

word_sentence_dict = dict()
for word in my_words:
    word_sentence_dict[word] = set()

def accepted_pos(pos):
    if pos in ["ADJ","ADV", "ADP","VERB","DET"]:
        return True
    return False



## Identify the position of the word which exists in any of the scales

## Toy example

In [16]:
mini_train_df = train_df[1:100]
mini_train_df.head()

Unnamed: 0.1,Unnamed: 0,source,op_gender,post_text,response_text,sentiment,relevance,label,labels_4
1,9743,fitocracy,M,Being followed by the famous DBJ? Quite an honor.,"Well, I am very honored you feel so honored",Positive,Content,2,2
2,13041,ted,W,"Penelope Boston gave a talk about Planets, exp...",Her opinions seem driven by wishful thinking. ...,Mixed,Content,1,3
3,4265,facebook_congress,W,Congress must act to help the 41 million Ameri...,There's no other way out of the enormity excep...,Positive,Content,2,2
4,13145,ted,W,"Pardis Sabeti gave a talk about Africa, big pr...",What were the benefits of the larger community...,Mixed,Content,1,3
5,743,facebook_wiki,M,Happy New Day from CNN's Studio 71 where the L...,I'm looking forward to seeing some *Good News*...,Mixed,Content,1,3


## Create Scaled Dictionaries with Grey Scaling

In [17]:
# create a nested dictionary with every scale, scale list with equalities, and all words in the scale

import collections

scales_dict = collections.defaultdict(dict)

for dataname in datanames:
  for scale_file_name, scale in rankings[dataname].items():
    words_in_scale = []
    for ws in scale:
      # split if there are ties
      words_in_scale.extend(ws.split(" || "))
    scales_dict[dataname][str(scale)] = tuple(words_in_scale)


  Since there are ties in our scales
  the milder word may be one or more words.
  So if the original word is foo || bar and the 
  milder word is foolish || barish this 
  {foo: foolish, foo: barish, bar: foolish,  bar: barish}

In [18]:
scales_with_milder_option = collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(dict)))

### iterate through temp dictionaries and create a master dictionary
for data_name in datanames:
  for scale_name, words in scales_dict[data_name].items():
    '''
    Sample input: scale_name = ['harmful', 'toxic', 'deadly']
    Sample output: {'deadly': ['harmful', 'toxic'], 'toxic': ['harmful']}
    '''
    ### convert key from string to list
    # drop first and last characters which are brackets[]
    scale_name = scale_name[2:-1].replace("'", "")
    scale_name = scale_name.split(", ") 

    while len(scale_name) > 1:
      most_extreme_words = scale_name[-1].split(" || ")
      
      milder_words = scale_name[:-1]
      milder_list = []
      for words in milder_words:
        milder_list.extend(words.split(" || "))
      instance = dict(itertools.product(most_extreme_words, milder_list))

      for k in most_extreme_words:
          scales_with_milder_option[data_name][k] = milder_list
      
      # drop most extreme terms
      scale_name.pop()

In [146]:
def locate_scale_word(test_col, label_col):
  '''
  return a column with any scale_words 
  in the text column and their position
  '''
  more_test = collections.defaultdict(lambda: collections.defaultdict(dict))
  for data_name in datanames:
      for scale_name, words in scales_dict[data_name].items():
        for word in words:
          # convert text into a list of words to avoid partial matches found using .find()
          
          sentence_words = test_col.replace("'", "") 
          sentence_words = sentence_words.lower().split(" ")
          
          if word in sentence_words:
            pos = sentence_words.index(word)
         #   more_test[data_name]['label'] = label_col
            # assume only one word to be replaced
            more_test[data_name][word]['position'] = int(pos)
          #  more_test[data_name]['sentence'][word] = test_col
            more_test[data_name][word]['milder_words'] = scales_with_milder_option[data_name][word]
            

  return more_test

train_df['new_col'] = train_df.apply(lambda x: locate_scale_word(x['response_text'], x['labels_4']), axis = 1)
train_df.iloc[1]['new_col']

defaultdict(<function __main__.locate_scale_word.<locals>.<lambda>>, {})

In [147]:
# convert relevant df columns to dictionary
dict_from_df = train_df[['response_text', 'labels_4','new_col']].T.to_dict()

In [148]:
dict_from_df[3]

{'labels_4': 2,
 'new_col': defaultdict(<function __main__.locate_scale_word.<locals>.<lambda>>,
             {'demelo': defaultdict(dict,
                          {'clean': {'milder_words': defaultdict(dict, {}),
                            'position': 17},
                           'far': {'milder_words': defaultdict(dict, {}),
                            'position': 38}})}),
 'response_text': "There's no other way out of the enormity except to forgive the debt and start with a clean slate. Otherwise, this is going to lead to the next collapse in the U.S. economy, which will result in far, far more cost to every American."}

In [149]:
dict_from_df[3]['response_text']

"There's no other way out of the enormity except to forgive the debt and start with a clean slate. Otherwise, this is going to lead to the next collapse in the U.S. economy, which will result in far, far more cost to every American."

In [150]:
sentence_words = dict_from_df[3]['response_text'].replace("'", "")
sentence_words = sentence_words.split(" ") 
sentence_words.index('clean')

17

In [151]:
for i in dict_from_df[8]['new_col']['crowd']['pretty']['milder_words']:
  print(i)

cute


# Augment

In [154]:
from transformers import BertTokenizer, BertConfig, BertModel, AutoTokenizer, AutoModel, FlaubertTokenizer, FlaubertModel, AutoConfig, FlaubertConfig

language_str = "en"
#whether we exclude the last bpe of words when words are split into multiple wordpieces
exclude_last_bpe ="True"
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
#sentences = train_df['response_text']

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/bert-base-uncased/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer_config.json from cache at /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79
loading file https://huggingface.co/bert-base-uncased/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc7043

In [155]:
# from extract_representations.py
def special_tokenization(sentence, tokenizer, model_name):
    map_ori_to_bert = []
    if "flaubert" in model_name:
        tok_sent = ['<s>']
    else:
        tok_sent = ['[CLS]']

    for orig_token in sentence.split():
        current_tokens_bert_idx = [len(tok_sent)]
        bert_token = tokenizer.tokenize(orig_token) # tokenize
        tok_sent.extend(bert_token) # add to my new tokens
        if len(bert_token) > 1: # if the new token has been 'wordpieced'
            extra = len(bert_token) - 1
            for i in range(extra):
                current_tokens_bert_idx.append(current_tokens_bert_idx[-1]+1) # list of new positions of the target word in the new tokenization
        map_ori_to_bert.append(tuple(current_tokens_bert_idx))

    if "flaubert" in model_name:
        tok_sent.append('</s>')
    else:
        tok_sent.append('[SEP]')

    return tok_sent, map_ori_to_bert


In [156]:
import copy
infos = []
final_dict = []
labels_list = []
augmented_text_list = []

#collections.defaultdict(lambda: collections.defaultdict(dict))
# collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(dict))))

for id, values in dict_from_df.items():
  num_positions = 0
  for data_name in datanames:
    # number of word positions for replacement
    num_positions += len(dict_from_df[id]['new_col'][data_name])

    if num_positions == 0:
      continue

    else:
      for word, values in dict_from_df[id]['new_col'][data_name].items():
        num_mild_words = len(dict_from_df[id]['new_col'][data_name][word]['milder_words'])
        
        if num_mild_words == 0:
          continue

        else:
          # assume the word is only in one location in the example text
          position_scaleword = dict_from_df[id]['new_col'][data_name][word]['position']
          cinstance = dict()
          cinstance['response_text'] = copy.deepcopy(dict_from_df[id]['response_text'])

          # convert text to a list
          sentence_words = dict_from_df[id]['response_text'].replace("'", "") 
          sentence_words = sentence_words.split(" ")
          for scale_word in dict_from_df[id]['new_col'][data_name][word]['milder_words']:

            # change a to an and vice versa depending on first letter of the scaleword 
            if sentence_words[position_scaleword-1] == "a" and scale_word[0] in "aeiou":
              sentence_words[position_scaleword-1] = "an"
            elif sentence_words[position_scaleword-1] == "an" and scale_word[0] not in "aeiou":
              sentence_words[position_scaleword-1] = "a"
            
            # and replace the scaleword
            sentence_words[position_scaleword] = scale_word
              
            # scaleword_position = cinstance["position"]
            cinstance["position"] = [position_scaleword]# = [cinstance[0].get("position")]
            
            ## extract and tokenize the original sentence
            example = ' '.join(sentence_words)

            # add augmented text to final dictionary
            test_df.loc[len(test_df)] = [dict_from_df[id]['labels_4'], 
                                         ' '.join(sentence_words)]
            labels_list.append(dict_from_df[id]['labels_4'])
            augmented_text_list.append(' '.join(sentence_words))

            bert_tokenized_sentence, mapp = special_tokenization(example, tokenizer, model_name)
            
            current_positions = cinstance['position']
            if len(current_positions) == 1:
                bert_position = mapp[cinstance['position'][0]] # this is a list of positions (it might have been split into wordpieces)
            elif len(current_positions) > 1:
              bert_position = []
              for p in current_positions:
                  bert_position.extend(mapp[p])
            
            cinstance[id] = id
            cinstance["bert_tokenized_sentence"] = bert_tokenized_sentence
            cinstance["bert_position"] = bert_position
            cinstance["scale"] = scale
            cinstance["lemma"] = scale_word
            infos.append(cinstance)


# Pre-process 

In [157]:
# append the augmented data plus labels

test_list = list(zip(labels_list, augmented_text_list))
train_df_aug = train_df.append(pd.DataFrame(test_list,
                                      columns = ['labels_4', 'response_text']),
                         ignore_index = True)
# save out greyscale adjusted dataset
train_df_aug.to_csv('/content/drive/MyDrive/w266/grey_scaled_augmented_oversampled_train_data.csv', index=False)

train_df_aug = train_df_aug.drop(['Unnamed: 0', 'source', 'op_gender', 'post_text', 'sentiment', 'relevance', 'label',  'new_col'], axis = 1)

## Transform into Hugging Friendly Format

In [158]:
# change to dataset to work with Huggingface transformer & remove unused columns
columns_to_remove = ['op_gender', 'source', 'Unnamed: 0', 'relevance', 'sentiment', 'post_text','label']

train_dataset = Dataset.from_pandas(train_df_aug)
dev_dataset = Dataset.from_pandas(dev_df)

dev_dataset = dev_dataset.remove_columns(column_names= columns_to_remove)
dev_dataset = dev_dataset.remove_columns(column_names= '__index_level_0__')


# rename labels_4 to labels
train_dataset = train_dataset.rename_column("labels_4", "label")
dev_dataset = dev_dataset.rename_column("labels_4", "label")

In [159]:
train_dataset = Dataset.from_pandas(train_df_aug)
dev_dataset = Dataset.from_pandas(dev_df)

dev_dataset = dev_dataset.remove_columns(column_names= columns_to_remove)
dev_dataset = dev_dataset.remove_columns(column_names= '__index_level_0__')

# rename sentiment to labels
train_dataset = train_dataset.rename_column("labels_4", "label")
dev_dataset = dev_dataset.rename_column("labels_4", "label")

In [160]:
# combine into a DataDictionary for huggingface use
rtg_dataset = DatasetDict({
    'train': train_dataset,
    'dev': dev_dataset 
})

rtg_dataset

DatasetDict({
    train: Dataset({
        features: ['response_text', 'label'],
        num_rows: 40817
    })
    dev: Dataset({
        features: ['response_text', 'label'],
        num_rows: 2301
    })
})

# Tokenize 

In [161]:
from transformers import AutoTokenizer

model_name = "bert-base-uncased"
max_length = train_df['response_text'].astype(str).map(len).quantile(0.99).astype(int)
tokenizer = AutoTokenizer.from_pretrained(model_name, max_length = int(max_length))

def tokenize(batch):
    return tokenizer(batch["response_text"], padding=True, truncation=True)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_length": 289,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.12.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

loading file https://huggingface.co/bert-base-uncased/resolve/main/vocab.txt from c

In [162]:
rtg_encoded = rtg_dataset.map(tokenize, batched=True, batch_size=None)
rtg_encoded['train'].features

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'label': Value(dtype='int64', id=None),
 'response_text': Value(dtype='string', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

# Model

In [163]:
from transformers import AutoModelForSequenceClassification
num_labels = 4
epochs = 2
iterations = 5
model = (AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device))

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_ve

In [164]:
rtg_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# check the type for each feature
rtg_encoded["dev"].features

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'label': Value(dtype='int64', id=None),
 'response_text': Value(dtype='string', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [165]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1_weighted = f1_score(labels, preds, average="weighted")
    f1_macro = f1_score(labels, preds, average = 'macro')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1_weighted, "f1_macro": f1_macro} 

In [166]:
from transformers import Trainer, TrainingArguments

batch_size = 8
logging_steps = len(rtg_encoded["train"]) // batch_size
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=epochs,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=False,
                                 # metric_for_best_model="f1_macro",
                                 # weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                #  save_strategy="epoch",
                                  disable_tqdm=False
                                  )

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [167]:
from sklearn.metrics import classification_report

accuracy_list = []
weighted_f1_score_list = []
macro_f1_score_list = []
negative_f1_score = []
neutral_f1_score = []
mixed_f1_score = []
positive_f1_score = []


for i in range(iterations):
  try:
    del trainer
    del results
    del cr
  except: pass


  trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=rtg_encoded["train"],
                  eval_dataset=rtg_encoded["dev"])
  trainer.train()
  results = trainer.evaluate()

  # append macro metrics to lists
  accuracy_list.append(results.get('eval_accuracy'))
  weighted_f1_score_list.append(results.get("eval_f1"))
  macro_f1_score_list.append(results.get("eval_f1_macro"))

  trainer.predict(rtg_encoded["dev"])
  # append the class-level F1 scores
  outputs = trainer.predict(rtg_encoded["dev"])
  predictions = outputs.predictions.argmax(1)
  labels = rtg_encoded["dev"]['label']
  cr = classification_report(labels, predictions, digits=3, output_dict=True)
  negative_f1_score.append(cr.get('0').get("f1-score"))
  neutral_f1_score.append(cr.get('1').get("f1-score"))
  positive_f1_score.append(cr.get('2').get("f1-score"))
  mixed_f1_score.append(cr.get('3').get("f1-score"))


  print(f'---------------------------Iteration {i+1} Complete---------------------------\n')

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running training *****
  Num examples = 40817
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10206


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.281,1.548057,0.664059,0.655926,0.55099
2,0.1225,1.973267,0.67362,0.665973,0.563731


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2500
Configuration saved in results/checkpoint-2500/config.json
Model weights saved in results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-3000
Configuration saved in results/checkpoint-3000/config.json
Model weights save

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running training *****
  Num examples = 40817
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10206


---------------------------Iteration 1 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.0661,2.541944,0.639722,0.639758,0.534619
2,0.062,2.501655,0.662321,0.651957,0.547058


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2500
Configuration saved in results/checkpoint-2500/config.json
Model weights saved in results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-3000
Configuration saved in results/checkpoint-3000/config.json
Model weights save

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running training *****
  Num examples = 40817
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10206


---------------------------Iteration 2 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.0295,2.82322,0.654498,0.637388,0.521529
2,0.0281,2.888439,0.651456,0.64171,0.534598


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2500
Configuration saved in results/checkpoint-2500/config.json
Model weights saved in results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-3000
Configuration saved in results/checkpoint-3000/config.json
Model weights save

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running training *****
  Num examples = 40817
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10206


---------------------------Iteration 3 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.0246,3.305914,0.626249,0.630576,0.527584
2,0.0292,3.207432,0.652325,0.644541,0.542503


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2500
Configuration saved in results/checkpoint-2500/config.json
Model weights saved in results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-3000
Configuration saved in results/checkpoint-3000/config.json
Model weights save

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running training *****
  Num examples = 40817
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 10206


---------------------------Iteration 4 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.0242,3.403502,0.642764,0.637392,0.535186
2,0.0261,3.387181,0.654933,0.643076,0.536609


Saving model checkpoint to results/checkpoint-500
Configuration saved in results/checkpoint-500/config.json
Model weights saved in results/checkpoint-500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1000
Configuration saved in results/checkpoint-1000/config.json
Model weights saved in results/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-1500
Configuration saved in results/checkpoint-1500/config.json
Model weights saved in results/checkpoint-1500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2000
Configuration saved in results/checkpoint-2000/config.json
Model weights saved in results/checkpoint-2000/pytorch_model.bin
Saving model checkpoint to results/checkpoint-2500
Configuration saved in results/checkpoint-2500/config.json
Model weights saved in results/checkpoint-2500/pytorch_model.bin
Saving model checkpoint to results/checkpoint-3000
Configuration saved in results/checkpoint-3000/config.json
Model weights save

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8


---------------------------Iteration 5 Complete---------------------------



# Evaluate

In [168]:
import statistics

print("%15s %s (%s)" % ("","Mean", "StDev"))

print("-"*29)
print("Macro Scores")
print("-"*29)

print(f"%15s %s (%s)" %("Accuracy",
    round(statistics.mean(accuracy_list),3),
    round(statistics.stdev(accuracy_list),3)))
print(f"%15s %5s (%s)" %("Macro F1",
    round(statistics.mean(macro_f1_score_list),3),
    round(statistics.stdev(macro_f1_score_list),3)))
print(f"%15s %5s (%s)" %("Weighted F1",
    round(statistics.mean(weighted_f1_score_list),3),
    round(statistics.stdev(weighted_f1_score_list),3)))

print("-"*29)
print("Class Scores")
print("-"*29)

print(f"%15s %s (%s)" %("Positive",
    round(statistics.mean(positive_f1_score),3),
    round(statistics.stdev(positive_f1_score),3)))
print(f"%15s %5s (%s)" %("Neutral",
    round(statistics.mean(neutral_f1_score),3),
    round(statistics.stdev(neutral_f1_score),3)))
print(f"%15s %5s (%s)" %("Negative",
    round(statistics.mean(negative_f1_score),3),
    round(statistics.stdev(negative_f1_score),3)))
print(f"%15s %5s (%s)" %("Mixed",
    round(statistics.mean(mixed_f1_score),3),
    round(statistics.stdev(mixed_f1_score),3)))

                Mean (StDev)
-----------------------------
Macro Scores
-----------------------------
       Accuracy 0.659 (0.009)
       Macro F1 0.545 (0.012)
    Weighted F1 0.649 (0.01)
-----------------------------
Class Scores
-----------------------------
       Positive 0.804 (0.005)
        Neutral  0.57 (0.018)
       Negative 0.574 (0.02)
          Mixed 0.231 (0.021)


# PARKING LOT

In [250]:
num_of_sentences = 0

for l in mini_train_df['response_text']:       
      l = l.strip()#.split("\t")[1]
      sentence_tokens = tuple(l.split())
      # if len(sentence_tokens) > 100:
      #      continue
#       # first check if any of my words is present. otherwise is not worth tagging it.
      found = False
      for token in sentence_tokens:
        if token in my_words:
          found = True
          break
      if found:
        doc = nlp(l)
        new_tokenization = []
        for token in doc:
            new_tokenization.append(token.text)
        if "double-decker" in sentence_tokens:
            pdb.set_trace()
        for i, token in enumerate(doc):
            if token.text in my_words and accepted_pos(token.pos_):
                word_sentence_dict[token.text].add((tuple(new_tokenization), i)) # sentence and position 
                num_of_sentences +=1
print(num_of_sentences)

69


In [284]:
# check if a scale word exists in a response_text, if so capture position
# text_column = mini_train_df['response_text']
# label_column = mini_train_df['labels_4']
position_list = []
def na(text_column, label_column):
  scale_word_position = collections.defaultdict(dict) #collections.defaultdict(lambda: collections.defaultdict(lambda: collections.defaultdict(dict)))
  for data_name in datanames:
    for scale_name, words in scales_dict[data_name].items():
      #for word in words:          
          
          # -1 indicates the value is not present
         # position = text_column.find(str(word))
          # if position == -1:
          #   #if the word doesn't exist add an empty list
          #   scale_word_position[data_name]= [] #[str(scale_name)][word] 
          # else:
          #   # add the text and the position
          #instance = dict()
          #   instance['sentence_words'] = text_column 
         # instance['position'] = int(position)
            # instance['label'] = label_column
            # add replacement greyscaled words
            #instance['milder_words'] = scales_with_milder_option[data_name][word]

        #  scale_word_position[data_name][word]= instance #instance[str(scale_name)][word] 
        return words
#scale_word_position
# add scale information           

In [251]:
dict_for_lm = dict()

for dataname in rankings:
  for scale in rankings[dataname]:	
      words_in_scale = []

      for ws in rankings[dataname].get(scale):
        # split and add words that are equally weighted in each scale
        words_in_scale.extend(ws.split(" || "))
      words_in_scale = tuple(words_in_scale)
      dict_for_lm[words_in_scale] = dict()

      for word in word_sentence_dict:
          if word in words_in_scale:
              dict_for_lm[words_in_scale][word] = []
              for sentence, position in word_sentence_dict[word]:
                  instance = dict()
                  instance['sentence_words'] = sentence 
                  instance['position'] = int(position)
                  instance['label'] = 
                  dict_for_lm[words_in_scale][word].append(instance)

#save out
pickle.dump(dict_for_lm, open("/content/drive/MyDrive/w266/unfiltered_rtgender_scalar_sentences_for_lm.pkl","wb"))


### scalrel_extract_representations.py

In [None]:
filename = "/content/drive/MyDrive/w266/unfiltered_rtgender_scalar_sentences_for_lm.pkl"#"/content/scalar_adjs/scal-rel/relational_sentences.pkl"
data = pickle.load(open(filename, "rb"))

data

### adapted from extract_representations.py

In [108]:
def check_correct_token_mapping(bert_tokenized_sentence, positions, word):
    berttoken = ''
    for p in positions:
        berttoken += bert_tokenized_sentence[p].strip("##")
    if berttoken.lower() == word.lower():
        return True
    else:
        return False

In [109]:
for scale, values in data.items():
    for i, item in values.items():
      if len(item) != 0: 
        sentence_words = item[0].get("sentence_words")

        # extract and tokenize the original sentence
        example = ' '.join(sentence_words)
        bert_tokenized_sentence, mapp = special_tokenization(example, tokenizer, model_name)
        bert_position = mapp[item[0].get("position")]
        if not check_correct_token_mapping(bert_tokenized_sentence, bert_position, i):
                sys.out("Tokenization mismatch!")
        cinstance = dict()
        cinstance['adj'] = i
     #   cinstance['class'] = data[adj]['class']
        cinstance['sentence_words'] = sentence_words
        cinstance["bert_tokenized_sentence"] = bert_tokenized_sentence
        cinstance["bert_position"] = bert_position
        infos.append(cinstance)

NameError: ignored

## Replace with alternative scale word
adapted from extract_representations.py

In [None]:
def extract_representations(infos, tokenizer, model_name):
    reps = []
    if model_name in ["bert-base-uncased", "bert-base-cased", "bert-base-multilingual-uncased", "bert-base-multilingual-cased"]:
        config_class, model_class = BertConfig, BertModel        
    elif "flaubert" in model_name:
        config_class, model_class = FlaubertConfig, FlaubertModel
    elif "greek" in model_name or "spanish" in model_name:
        config_class, model_class = AutoConfig, AutoModel

    config = config_class.from_pretrained(model_name, output_hidden_states=True)
    model = model_class.from_pretrained(model_name, config=config)

    model.eval()
    with torch.no_grad():
        for info in infos:
            tok_sent = info['bert_tokenized_sentence']            
            input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tok_sent)]).to(device)            
            outputs = model(input_ids)            
            if "flaubert" in model_name:
                hidden_states = outputs[1]
            else:
                hidden_states = outputs[2]
            if not exclude_last_bpe: #args.exclude_last_bpe:
                bpositions = info["bert_position"]
            else:
                if len(info["bert_position"]) == 1:
                    bpositions = info["bert_position"]
                if len(info["bert_position"]) > 1:
                    bpositions = info["bert_position"][:-1]                    
            
            reps_for_this_instance = dict()                
            for i, w in enumerate(info["bert_tokenized_sentence"]):
                if i in bpositions: 
                    for l in range(len(hidden_states)): # all layers
                        if l not in reps_for_this_instance:
                            reps_for_this_instance[l] = []
                        reps_for_this_instance[l].append((w, hidden_states[l][0][i].cpu()))                        
            reps.append(reps_for_this_instance)            

    return reps, model

In [110]:
torch.set_default_tensor_type(torch.cuda.FloatTensor)
torch.manual_seed(0)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
batch_size = 1 

In [112]:
try:
  sys.argv=['']
  del sys
except: pass


parser = argparse.ArgumentParser()
args = parser.parse_args()

##Extract Representations

In [113]:
#"whether we exclude the last bpe of words when words are split into multiple wordpieces"
exclude_last_bpe = True
reps, model = extract_representations(infos, tokenizer, model_name)

NameError: ignored

In [None]:
def aggregate_reps(reps_list, hidden_size):
    '''This function averages representations of a word that has been split into wordpieces.'''
    reps = torch.zeros([len(reps_list), hidden_size])
    for i, wrep in enumerate(reps_list):
        w, rep = wrep
        reps[i] = rep

    if len(reps) > 1:
        reps = torch.mean(reps, axis=0)
    reps = reps.view(hidden_size)

    return reps.cpu()


In [None]:
# added Bert embeddings for each example (scaled and not) to data 
for rep, instance in zip(reps, infos):
    scale = instance["scale"]
    lemma = instance["lemma"]
    for scale, values in data.items():
      for i, ins2 in values.items():
        if len(ins2) != 0: 
          if  ins2[0].get("sentence_words") == instance["sentence_words"]:
              ins2[0]["representations"] = dict()
              ins2[0]["representations"][lemma] = dict()
              for l in rep:
                ins2[0]['representations'][lemma][l] = aggregate_reps(rep[l], model.config.hidden_size)

In [277]:
# save out
args.data_dir = "/content/drive/MyDrive/w266/"

if not exclude_last_bpe:
    bpe_str = "all-bpes"
else:
    bpe_str = "exclude-last-bpes"

language_str = "en"
out_fn = args.data_dir + "scalar_embeddings_" + language_str + "_" + bpe_str + ".pkl" 
pickle.dump(data, open(out_fn, "wb"))

## Predictions

In [101]:
import pymagnitude
from predict import load_rankings, assign_ranking_numbers, extract_gold_mildest_extreme

Collecting pymagnitude
  Downloading pymagnitude-0.1.143.tar.gz (5.4 MB)
[K     |████████████████████████████████| 5.4 MB 15.3 MB/s 
[?25hBuilding wheels for collected packages: pymagnitude
  Building wheel for pymagnitude (setup.py) ... [?25l[?25hdone
  Created wheel for pymagnitude: filename=pymagnitude-0.1.143-cp37-cp37m-linux_x86_64.whl size=360988340 sha256=f5e72968f54bd59c786582d261cc90d6ea0741c852306b61285279324759021c
  Stored in directory: /root/.cache/pip/wheels/0e/96/d6/b765a1ce34517c193d764b634b1ff7db5e1dcfea2520f17273
Successfully built pymagnitude
Installing collected packages: pymagnitude
Successfully installed pymagnitude-0.1.143


In [102]:
#import scale rankings
datanames = ['demelo', 'crowd', 'wilkinson']
rankings, adjs_by_dataset = load_rankings("/content/scalar_adjs/data/",\
                                          datanames=datanames)
#import scalar embeddings 
data_dir = "/content/drive/MyDrive/w266/"
scalar_dataset = pickle.load(open(data_dir + "scalar_embeddings_" + language_str + "_exclude-last-bpes" + ".pkl", "rb"))

In [103]:
scalar_dataset[('smart', 'intelligent')]

{'intelligent': [], 'smart': []}

In [None]:
#def calculate_diff_vector(data, scalar_dataset, dataname, X=10,  avoid_overlap=True):
    #### dataset used to build diffvec - dataset for which we will make predictions
dataname = 'crowd'
avoid_overlap = 'True'


relevant_dds = [dataname + "-" + d for d in datanames if d != dataname]

diff_vectors_by_layer = dict()
pairs_by_layer = dict()
for dd in relevant_dds:
    diff_vectors_by_layer[dd] = dict()
    pairs_by_layer[dd] = dict()
    for layer in range(1, 13):
        diff_vectors_by_layer[dd][layer] = []
        pairs_by_layer[dd][layer] = []

missing = 0

for scale in scalar_dataset:
    ordered_words, mildest_words, extreme_words = extract_gold_mildest_extreme(data[scale])
    if tuple(ordered_words) not in data:
        missing += 1
        continue
    mildest_word = mildest_words[0]
    print(data[scale])
    print(mildest_word)
    print("-"*80)
    # extreme_word = extreme_words[0]
    # for dd in relevant_dds:
    #         if avoid_overlap:
    #             if mildest_word in adjs_by_dataset[dd.split("-")[1]] or extreme_word in adjs_by_dataset[dd.split("-")[1]]:
    #                 continue
    #         diff_vectors_one_scale = dict()
    #         pairs_one_scale = dict()
    #         for layer in range(1, 13):
    #             diff_vectors_one_scale[layer] = []
    #             pairs_one_scale[layer] = []
    #         for key, instance in scalar_dataset[tuple(ordered_words)].items():
    #           if len(instance) != 0:
    #             #for layer in range(1,13):
    #             print(ordered_words)
    #             print(instance[0]) #[0]['representations'].keys()
    #             print("-"*80)
                    # mild_rep = instance[0]['representations'][mildest_word][layer].numpy()
                    # extreme_rep = instance[0]['representations'][extreme_word][layer].numpy()
                    # diffvec_ex = extreme_rep - mild_rep
                    # diff_vectors_one_scale[layer].append(diffvec_ex)
                    # print(diff_vectors_one_scale)

    #         for layer in range(1, 13):
    #             av_ex = np.average(diff_vectors_one_scale[layer], axis=0)
    #             diff_vectors_by_layer[dd][layer].append(av_ex)

    # final_diff_vector_by_layer = dict()
    # for dd in diff_vectors_by_layer:
    #     final_diff_vector_by_layer[dd] = dict()
    #     for layer in range(1,13):
    #         av_ex = np.average(diff_vectors_by_layer[dd][layer], axis=0)
    #         final_diff_vector_by_layer[dd][layer] = av_ex

    # print('missing scales:', missing)

In [None]:
scalar_dataset

In [None]:
for key, instance in scalar_dataset[tuple(ordered_words)].items():
              if len(instance) != 0:
                for layer in range(1,13):
                    print(instance[0]['representations'].keys())

In [None]:
# add to train df
train_df_aug = train_df
train_df_aug['augmented_response_text'] = augmented_sentences_2
train_df_aug

# Transform to HuggingFace friendly format

In [None]:
# change to dataset to work with Huggingface transformer & remove unused columns
columns_to_remove = ['op_gender', 'source', 'Unnamed: 0', 'relevance', 'sentiment','post_text', 'label']

from datasets import load_dataset
train_dataset = Dataset.from_pandas(train_df_aug)
dev_dataset = Dataset.from_pandas(dev_df)

train_dataset = train_dataset.remove_columns(column_names= columns_to_remove)
dev_dataset = dev_dataset.remove_columns(column_names= columns_to_remove)
dev_dataset = dev_dataset.remove_columns(column_names= '__index_level_0__')

# rename sentiment to labels
train_dataset = train_dataset.rename_column("labels_4", "label")
dev_dataset = dev_dataset.rename_column("labels_4", "label")

In [None]:
# combine into a DataDictionary for huggingface use
rtg_dataset = DatasetDict({
    'train': train_dataset,
    'dev': dev_dataset 
})

rtg_dataset

DatasetDict({
    train: Dataset({
        features: ['response_text', 'label', 'augmented_response_text'],
        num_rows: 10746
    })
    dev: Dataset({
        features: ['response_text', 'label'],
        num_rows: 2301
    })
})

In [None]:
output_model_file = '/content/drive/MyDrive/w266/pytorch_bert_rtgender_easy_data_aug.bin'
output_vocab_file = './'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

All files saved


In [None]:
 # for dataname in datanames:
  # for scale_name, scale_values in scales_dict[dataname].items():
#scale_name = str("'big', 'substantial || major', 'tremendous || staggering'")

# create scales with ranks of words from extremes to middle ground

def update_dict(dataname, word, mod_term_pos):
  '''return dictionary with milder words'''
  key_word = word.split(" || ")  
  value_word = scale_name[mod_term_pos].split(" || ")
  instance = dict(itertools.product(key_word,value_word))
  for k,v in instance.items():
    scales_with_milder_option[dataname][k] = v
  return scales_with_milder_option
### scale attributes
# scale_size = len(scale_name) #number of ranks in scale
# # does the scale have an even number of elements
# even_scale = (scale_size %2 ==0)
# # find the position of the middle term  
# # adjust down 1 for indexing  
# midpoint = scale_size // 2 -1
# # exclude scales with fewer than 3 ranks  
    # if scale_size <3:
    #   continue

    # else:
    #   if even_scale:
    #     if current_pos <= midpoint:
    #       mod_term_pos = current_pos + 1 # find the position of the next term
    #       # scales_with_milder_option = update_dict(dataname, word, mod_term_pos)
    #       key_word = word.split(" || ")  
    #       value_word = scale_name[mod_term_pos].split(" || ")
    #       instance = dict(itertools.product(key_word,value_word))
    #       print(instance)# for k,v in instance.items():
    #       #   scales_with_milder_option[dataname][k] = v

    #     elif current_pos >= midpoint:
    #       mod_term_pos = current_pos - 1 # find the position of the next term
    #       # scales_with_milder_option = update_dict(dataname, word, mod_term_pos)
    #       key_word = word.split(" || ")  
    #       value_word = scale_name[mod_term_pos].split(" || ")

    #       instance = dict(itertools.product(key_word,value_word))
    #       print(instance)# for k,v in instance.items():
    #       #   scales_with_milder_option[dataname][k] = v

    #   else:  
    #     if current_pos < midpoint:
    #       mod_term_pos = current_pos + 1 # find the position of the next term
    #       # scales_with_milder_option = update_dict(dataname, word, mod_term_pos)
    #       key_word = word.split(" || ")  
    #       value_word = scale_name[mod_term_pos].split(" || ")
    #       instance = dict(itertools.product(key_word,value_word))
    #       print(instance)# for k,v in instance.items():
    #       #   scales_with_milder_option[dataname][k] = v

    #     elif current_pos > midpoint:
    #       mod_term_pos = current_pos - 1 # find the position of the next term
    #       # scales_with_milder_option = update_dict(dataname, word, mod_term_pos)
    #       key_word = word.split(" || ")  
    #       value_word = scale_name[mod_term_pos].split(" || ")
    #       instance = dict(itertools.product(key_word,value_word))
    #       print(instance)# for k,v in instance.items():
    #       #   scales_with_milder_option[dataname][k] = v