<a href="https://colab.research.google.com/github/ipietri/w266_Final_Project/blob/master/notebooks/RtGender-Notebooks/RtGender_Annotations_Sentiment_Grayscaling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RtGender - Annotations - Sentiment w/ Grayscaling
[Code Source](https://github.com/ainagari/scalar_adjs)

* [BERT Knows Punta Cana is not just beautiful, it’s gorgeous:
Ranking Scalar Adjectives with Contextualised Representations](https://aclanthology.org/2020.emnlp-main.598.pdf)\
*[Scalar Adjective Identification and Multilingual Ranking
](https://arxiv.org/abs/2105.01180)\
*[Identifying and Ordering Scalar Adjectives Using Lexical Substitution](https://www.proquest.com/openview/aade435a5bbdcf41e2b8c24e648826cc/1.pdf?pq-origsite=gscholar&cbl=18750)\
*[A Gold Standard for Scalar Adjectives](https://aclanthology.org/L16-1424/)


In [None]:
try:
  from google.colab import drive
  drive.mount('/content/drive', force_remount=True)
  path = r'/content/drive/MyDrive/w266'
except ModuleNotFoundError:
  path = r'data'

Mounted at /content/drive


<a id='section01'></a>
## Load Modules

In [None]:
!pip install -U nltk
import nltk; nltk.download('wordnet')



[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
%%capture
#!pip install transformers==3.0.2
!pip install -q transformers

In [None]:
%%capture
!pip install datasets
import datasets 
from datasets import load_dataset, Dataset, DatasetDict

In [None]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn import metrics
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
#from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import logging
logging.basicConfig(level=logging.ERROR)

In [None]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:

#from read_scalar_datasets import read_scales
from nltk.corpus import wordnet as wn
import gzip
import pickle
import numpy as np
import sys
from scipy.spatial.distance import cosine
from operator import itemgetter
from collections import defaultdict
#from pymagnitude import *
import argparse

<a id='section02'></a>
## Import and Reshape Data

In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/w266/annotations_train.csv')
dev_df = pd.read_csv('/content/drive/MyDrive/w266/annotations_dev.csv')

print('train_shape: ',train_df.shape)
print('dev_shape: ',dev_df.shape)

train_shape:  (10746, 9)
dev_shape:  (2303, 9)


In [None]:
train_df

Unnamed: 0.1,Unnamed: 0,source,op_gender,post_text,response_text,sentiment,relevance,label,labels_4
0,3845,facebook_congress,W,Im reading the 3/1 GAO report that finds billi...,Thank you Congresswoman Bass. Keep up the grea...,Positive,ContentPoster,2,2
1,9743,fitocracy,M,Being followed by the famous DBJ? Quite an honor.,"Well, I am very honored you feel so honored",Positive,Content,2,2
2,13041,ted,W,"Penelope Boston gave a talk about Planets, exp...",Her opinions seem driven by wishful thinking. ...,Mixed,Content,1,3
3,4265,facebook_congress,W,Congress must act to help the 41 million Ameri...,There's no other way out of the enormity excep...,Positive,Content,2,2
4,13145,ted,W,"Pardis Sabeti gave a talk about Africa, big pr...",What were the benefits of the larger community...,Mixed,Content,1,3
...,...,...,...,...,...,...,...,...,...
10741,14128,ted,M,"Bjarke Ingels gave a talk about architecture, ...",Brillant!! Ingels has a terrific future ahead ...,Positive,Content,2,2
10742,5589,facebook_congress,W,I was honored to meet with Eliseo Medina and F...,The Democrats view this as another way to use ...,Negative,Content,0,0
10743,10672,reddit,W,SO YOU LIKE STACKING CUPS?! DO WE HAVE A GREAT...,Is this real?? Well at least this kid will be ...,Mixed,Content,1,3
10744,4839,facebook_congress,M,Try this Brian Schatz FB bumper sticker - an e...,EH BRIAN WEA MY STICKA N WAT OBAMA STAY ON UM ...,Neutral,Irrelevant,1,1


In [None]:
# there are NaNs in the dev dataset remove 
nan_values = dev_df[dev_df.isna().any(axis=1)] 
print(nan_values)

# return without missing values in response_text
dev_df.dropna(subset = ["response_text"], inplace=True)

print("Train shape", train_df.shape)
print("Dev shape", dev_df.shape)

      Unnamed: 0         source op_gender  ...   relevance label labels_4
830         2576  facebook_wiki         M  ...  Irrelevant     1        1
1664        2722  facebook_wiki         W  ...  Irrelevant     1        1

[2 rows x 9 columns]
Train shape (10746, 9)
Dev shape (2301, 9)


In [None]:
print("Unique sentiments: ", train_df['sentiment'].unique())

Unique sentiments:  ['Positive' 'Mixed' 'Neutral' 'Negative']


# Run through Scalar Adj



In [None]:

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument("--exclude_last_bpe", action="store_true", type=str,
                        help="whether we exclude the last piece when a word is split into multiple wordpieces."
                             "Otherwise, we use the representations of all pieces.")
    args = parser.parse_args()

    if not args.exclude_last_bpe:
        bpe_str = "all-bpes"
    else:
        bpe_str = "exclude-last-bpes"

    model_name = "bert-base-uncased"
    tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
    #filename = "scal-rel/relational_sentences.pkl"
   # out_fn = "relational_ctxtembeds_" + bpe_str + ".pkl"

    

    torch.set_default_tensor_type(torch.cuda.FloatTensor)
    torch.manual_seed(0)
    device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    batch_size = 1  # do not change this


    infos = []
    for adj in data:
        for instance in data[adj]['sentences'][:10]:
            sentence_words = instance['sentence_words']
            if '' in sentence_words:
                print(sentence_words)
            bert_tokenized_sentence, mapp = special_tokenization(" ".join(sentence_words), tokenizer, model_name)
            bert_position = mapp[instance['position']]  # this is a list of positions
            if not check_correct_token_mapping(bert_tokenized_sentence, bert_position, adj):
                sys.out("Tokenization mismatch!")
            cinstance = dict()
            cinstance['adj'] = adj
            cinstance['class'] = data[adj]['class']
            cinstance['sentence_words'] = sentence_words
            cinstance["bert_tokenized_sentence"] = bert_tokenized_sentence
            cinstance["bert_position"] = bert_position
            infos.append(cinstance)

    #### EXTRACTING REPRESENTATIONS
    reps, model = extract_representations(infos, tokenizer, model_name)

    for rep, instance in zip(reps, infos):
        adj = instance["adj"]
        clas = instance["class"]
        for ins2 in data[adj]['sentences']:
            if ins2["sentence_words"] == instance["sentence_words"]:
                if "representations" not in ins2:
                    ins2["representations"] = dict()
                for l in rep:
                    ins2['representations'][l] = aggregate_reps(rep[l], hidden_size=model.config.hidden_size)

    pickle.dump(data, open(out_fn, "wb"))

#### Import Modules

In [None]:
# import eda script from github
!git clone https://github.com/ainagari/scalar_adjs

Cloning into 'scalar_adjs'...
remote: Enumerating objects: 851, done.[K
remote: Counting objects: 100% (435/435), done.[K
remote: Compressing objects: 100% (186/186), done.[K
remote: Total 851 (delta 89), reused 358 (delta 45), pack-reused 416[K
Receiving objects: 100% (851/851), 13.47 MiB | 2.18 MiB/s, done.
Resolving deltas: 100% (127/127), done.


# [Extract Relevant Text](https://github.com/ainagari/scalar_adjs/blob/master/extract_flickr_scalar.py)

In [None]:
import sys
sys.path.append('/content/scalar_adjs/')

#from read_scalar_datasets import read_scales
import pickle
import pdb
import spacy


nlp = spacy.load("en_core_web_sm")

In [None]:
#from read_scalar_datasets.py

import os

def read_scales(dirname):
    '''
    Read all comparable pairs from the same scale from scal term files in dirname;
    keep track of which file each pair comes from
    Returns: a dict of {filename: set([(w1,w2),...])}
    '''
    termsfiles = [os.path.join(dirname, f) for f in os.listdir(dirname)]
    rankings = dict()
    for tf in termsfiles:
        tf_clean = os.path.basename(tf).replace('.terms','')
        ranking = []
        with open(tf, 'r') as fin:
            for line in fin:
                __, w = line.strip().split('\t')
                ranking.append(w)

        rankings[tf_clean] = ranking
    return rankings


### Identify the location of every word present the three types of scales
Extract and save out as a dictionary for every example that contains at least one scaled word. Adapted from extract_flickr_scalar.py

In [None]:
rankings = dict()
datanames = ["demelo", "crowd", "wilkinson"]
for dataname in datanames:
    r = read_scales("/content/scalar_adjs/data/" + dataname + "/gold_rankings/")
    rankings[dataname] = r

my_words = set()
for dataname in rankings:
    for scale in rankings[dataname]:
        for word in rankings[dataname][scale]:
            words = word.split(" || ")
            for w in words:
                my_words.add(w)

word_sentence_dict = dict()
for word in my_words:
    word_sentence_dict[word] = set()

def accepted_pos(pos):
    if pos in ["ADJ","ADV", "ADP","VERB","DET"]: # or "DET" in pos or "VERB" in pos:
        return True
    return False



In [None]:
num_of_sentences = 0

for l in train_df['response_text'][1:20]:       
      l = l.strip()#.split("\t")[1]
      sentence_tokens = tuple(l.split())
      if len(sentence_tokens) > 100:
           continue
#       # first check if any of my words is present. otherwise is not worth tagging it.
      found = False
      for token in sentence_tokens:
        if token in my_words:
          found = True
          break
      if found:
        doc = nlp(l)
        new_tokenization = []
        for token in doc:
            new_tokenization.append(token.text)
        if "double-decker" in sentence_tokens:
            pdb.set_trace()
        for i, token in enumerate(doc):
            if token.text in my_words and accepted_pos(token.pos_):
                word_sentence_dict[token.text].add((tuple(new_tokenization), i)) # sentence and position 
                num_of_sentences +=1
print(num_of_sentences)

148


In [None]:
dict_for_lm = dict()

for dataname in rankings:
  for scale in rankings[dataname]:	
      words_in_scale = []

      for ws in rankings[dataname].get(scale):
        # split and add words that are equally weighted in each scale
        words_in_scale.extend(ws.split(" || "))
      words_in_scale = tuple(words_in_scale)
      dict_for_lm[words_in_scale] = dict()

      for word in word_sentence_dict:
          if word in words_in_scale:
              dict_for_lm[words_in_scale][word] = []
              for sentence, position in word_sentence_dict[word]:
                  instance = dict()
                  instance['sentence_words'] = sentence 
                  instance['position'] = int(position)
                  dict_for_lm[words_in_scale][word].append(instance)

#save out
pickle.dump(dict_for_lm, open("/content/drive/MyDrive/w266/unfiltered_rtgender_scalar_sentences_for_lm.pkl","wb"))


In [None]:
dict_for_lm

### scalrel_extract_representations.py

In [None]:
filename = "/content/drive/MyDrive/w266/unfiltered_rtgender_scalar_sentences_for_lm.pkl"#"/content/scalar_adjs/scal-rel/relational_sentences.pkl"
data = pickle.load(open(filename, "rb"))



In [275]:
from transformers import BertTokenizer, BertConfig, BertModel, AutoTokenizer, AutoModel, FlaubertTokenizer, FlaubertModel, AutoConfig, FlaubertConfig

language_str = "en"
#whether we exclude the last bpe of words when words are split into multiple wordpieces
exclude_last_bpe ="True"
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True)
sentences = train_df['response_text']

### adapted from extract_representations.py

In [None]:
def special_tokenization(sentence, tokenizer, model_name):
    map_ori_to_bert = []
    if "flaubert" in model_name:
        tok_sent = ['<s>']
    else:
        tok_sent = ['[CLS]']

    for orig_token in sentence.split():
        current_tokens_bert_idx = [len(tok_sent)]
        bert_token = tokenizer.tokenize(orig_token) # tokenize
        tok_sent.extend(bert_token) # add to my new tokens
        if len(bert_token) > 1: # if the new token has been 'wordpieced'
            extra = len(bert_token) - 1
            for i in range(extra):
                current_tokens_bert_idx.append(current_tokens_bert_idx[-1]+1) # list of new positions of the target word in the new tokenization
        map_ori_to_bert.append(tuple(current_tokens_bert_idx))

    if "flaubert" in model_name:
        tok_sent.append('</s>')
    else:
        tok_sent.append('[SEP]')

    return tok_sent, map_ori_to_bert


In [None]:
def check_correct_token_mapping(bert_tokenized_sentence, positions, word):
    berttoken = ''
    for p in positions:
        berttoken += bert_tokenized_sentence[p].strip("##")
    if berttoken.lower() == word.lower():
        return True
    else:
        return False

In [None]:
for scale, values in data.items():
    for i, item in values.items():
      if len(item) != 0: 
        sentence_words = item[0].get("sentence_words")

        # extract and tokenize the original sentence
        example = ' '.join(sentence_words)
        bert_tokenized_sentence, mapp = special_tokenization(example, tokenizer, model_name)
        bert_position = mapp[item[0].get("position")]
        if not check_correct_token_mapping(bert_tokenized_sentence, bert_position, i):
                sys.out("Tokenization mismatch!")
        cinstance = dict()
        cinstance['adj'] = i
     #   cinstance['class'] = data[adj]['class']
        cinstance['sentence_words'] = sentence_words
        cinstance["bert_tokenized_sentence"] = bert_tokenized_sentence
        cinstance["bert_position"] = bert_position
        infos.append(cinstance)

## Replace with alternative scale word
adapted from extract_representations.py

In [None]:
def extract_representations(infos, tokenizer, model_name):
    reps = []
    if model_name in ["bert-base-uncased", "bert-base-cased", "bert-base-multilingual-uncased", "bert-base-multilingual-cased"]:
        config_class, model_class = BertConfig, BertModel        
    elif "flaubert" in model_name:
        config_class, model_class = FlaubertConfig, FlaubertModel
    elif "greek" in model_name or "spanish" in model_name:
        config_class, model_class = AutoConfig, AutoModel

    config = config_class.from_pretrained(model_name, output_hidden_states=True)
    model = model_class.from_pretrained(model_name, config=config)

    model.eval()
    with torch.no_grad():
        for info in infos:
            tok_sent = info['bert_tokenized_sentence']            
            input_ids = torch.tensor([tokenizer.convert_tokens_to_ids(tok_sent)]).to(device)            
            outputs = model(input_ids)            
            if "flaubert" in model_name:
                hidden_states = outputs[1]
            else:
                hidden_states = outputs[2]
            if not exclude_last_bpe: #args.exclude_last_bpe:
                bpositions = info["bert_position"]
            else:
                if len(info["bert_position"]) == 1:
                    bpositions = info["bert_position"]
                if len(info["bert_position"]) > 1:
                    bpositions = info["bert_position"][:-1]                    
            
            reps_for_this_instance = dict()                
            for i, w in enumerate(info["bert_tokenized_sentence"]):
                if i in bpositions: 
                    for l in range(len(hidden_states)): # all layers
                        if l not in reps_for_this_instance:
                            reps_for_this_instance[l] = []
                        reps_for_this_instance[l].append((w, hidden_states[l][0][i].cpu()))                        
            reps.append(reps_for_this_instance)            

    return reps, model

In [None]:
torch.set_default_tensor_type(torch.cuda.FloatTensor)
torch.manual_seed(0)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
batch_size = 1 

In [None]:
import copy
infos = []
for scale, values in data.items():
    for i, item in values.items():
      if len(item) != 0: 
        sentence_words = list(item[0].get("sentence_words"))
        position_scaleword = item[0].get("position")
        for scaleword in scale:
          # copy over as a dictionary
          cinstance = copy.deepcopy(item[0])
          # change a to an and vice versa depending on first letter of the scaleword
          if sentence_words[position_scaleword-1] == "a" and scaleword[0] in "aeiou":
            sentence_words[position_scaleword-1] = "an"
          elif sentence_words[position_scaleword-1] == "an" and scaleword[0] not in "aeiou":
            sentence_words[position_scaleword-1] = "a"
          
          # and replace the scaleword
          sentence_words[position_scaleword] = scaleword
          scaleword_position = cinstance["position"]
          cinstance["position"] = [scaleword_position]# = [cinstance[0].get("position")]
          # extract and tokenize the original sentence
          example = ' '.join(sentence_words)
          bert_tokenized_sentence, mapp = special_tokenization(example, tokenizer, model_name)
          current_positions = cinstance['position']
          if len(current_positions) == 1:
              bert_position = mapp[cinstance['position'][0]] # this is a list of positions (it might have been split into wordpieces)
          elif len(current_positions) > 1:
            bert_position = []
            for p in current_positions:
                bert_position.extend(mapp[p])

          cinstance["bert_tokenized_sentence"] = bert_tokenized_sentence
          cinstance["bert_position"] = bert_position
          cinstance["scale"] = scale
          cinstance["lemma"] = scaleword
          infos.append(cinstance)

In [None]:
!pip install argparse

Collecting argparse
  Using cached argparse-1.4.0-py2.py3-none-any.whl (23 kB)
Installing collected packages: argparse
Successfully installed argparse-1.4.0


In [None]:
import argparse

try:
  sys.argv=['']
  del sys
except: pass


parser = argparse.ArgumentParser()
args = parser.parse_args()

###Extract Representations

In [None]:
#"whether we exclude the last bpe of words when words are split into multiple wordpieces"
exclude_last_bpe = True
reps, model = extract_representations(infos, tokenizer, model_name)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def aggregate_reps(reps_list, hidden_size):
    '''This function averages representations of a word that has been split into wordpieces.'''
    reps = torch.zeros([len(reps_list), hidden_size])
    for i, wrep in enumerate(reps_list):
        w, rep = wrep
        reps[i] = rep

    if len(reps) > 1:
        reps = torch.mean(reps, axis=0)
    reps = reps.view(hidden_size)

    return reps.cpu()


In [None]:
# added Bert embeddings for each example (scaled and not) to data 
for rep, instance in zip(reps, infos):
    scale = instance["scale"]
    lemma = instance["lemma"]
    for scale, values in data.items():
      for i, ins2 in values.items():
        if len(ins2) != 0: 
          if  ins2[0].get("sentence_words") == instance["sentence_words"]:
              ins2[0]["representations"] = dict()
              ins2[0]["representations"][lemma] = dict()
              for l in rep:
                ins2[0]['representations'][lemma][l] = aggregate_reps(rep[l], model.config.hidden_size)

In [277]:
# save out
args.data_dir = "/content/drive/MyDrive/w266/"

if not exclude_last_bpe:
    bpe_str = "all-bpes"
else:
    bpe_str = "exclude-last-bpes"

language_str = "en"
out_fn = args.data_dir + "scalar_embeddings_" + language_str + "_" + bpe_str + ".pkl" 
pickle.dump(data, open(out_fn, "wb"))

# PARKING LOT

# Wilkinson adjectives

In [None]:
def get_scale_words(scale):
    words = []
    for w in scale:
        words.extend(w.split(" || "))
    return words

In [None]:
def load_rankings(data_dir = "/content/scalar_adjs/data/", datanames="wilkinson"):
    rankings = dict()
    adjs_by_dataset = dict()
    for dataname in datanames:
        rankings[dataname] = read_scales(data_dir + datanames + "/gold_rankings/")
    for dataname in rankings:
        adjs_by_dataset[dataname] = set()
        for scale in rankings[dataname]:
            adjs_by_dataset[dataname].update(get_scale_words(rankings[dataname][scale]))

    return rankings, adjs_by_dataset

In [None]:
def assign_ranking_numbers(ordered_pred):
    ranknum = 0
    # take care of possible ties (especially for the sense baseline)
    ordered_rank_word_score = []
    for w, score in ordered_pred:
        if ordered_rank_word_score:
            if score != ordered_rank_word_score[-1][2]:
                ranknum +=1
        ordered_rank_word_score.append((ranknum, w, score))

    wordscores_by_rank = dict()
    for rank, w, score in ordered_rank_word_score:
        if rank not in wordscores_by_rank:
            wordscores_by_rank[rank] = []
        wordscores_by_rank[rank].append((w, score))

    return wordscores_by_rank


#### Get Ranking
Find the rank of the word in the Wilkinson dataset


In [None]:
# apply the main data augmentation function in the eda module

augmented_sentences = []

for sentence in train_df['response_text']:
  words = sentence.split(' ')
  a_words = random_deletion(words, 0.1)
  augmented_sentences.append(' '.join(a_words))


#### Replace with Synonyms

In [None]:
augmented_sentences_2 = []
alpha_sr=0.1

if (alpha_sr > 0):
  for sentence in augmented_sentences:
    words = sentence.split(' ')
    num_words = len(sentence)
    n_sr = max(1, int(alpha_sr*num_words))
    a_words = synonym_replacement(words, n_sr)
    augmented_sentences_2.append(' '.join(a_words))

In [None]:
# add to train df
train_df_aug = train_df
train_df_aug['augmented_response_text'] = augmented_sentences_2
train_df_aug

Unnamed: 0.1,Unnamed: 0,source,op_gender,post_text,response_text,sentiment,relevance,label,labels_4,augmented_response_text
0,3845,facebook_congress,W,Im reading the 3/1 GAO report that finds billi...,Thank you Congresswoman Bass. Keep up the grea...,Positive,ContentPoster,2,2,thank you congressman Bass. living up the groovy
1,9743,fitocracy,M,Being followed by the famous DBJ? Quite an honor.,"Well, I am very honored you feel so honored",Positive,Content,2,2,unity am very you flavor so honored
2,13041,ted,W,"Penelope Boston gave a talk about Planets, exp...",Her opinions seem driven by wishful thinking. ...,Mixed,Content,1,3,Her impelled by aspirant thinking. She there t...
3,4265,facebook_congress,W,Congress must act to help the 41 million Ameri...,There's no other way out of the enormity excep...,Positive,Content,2,2,There's no other right smart out of the outrag...
4,13145,ted,W,"Pardis Sabeti gave a talk about Africa, big pr...",What were the benefits of the larger community...,Mixed,Content,1,3,What were the of the orotund community? She di...
...,...,...,...,...,...,...,...,...,...,...
10741,14128,ted,M,"Bjarke Ingels gave a talk about architecture, ...",Brillant!! Ingels has a terrific future ahead ...,Positive,Content,2,2,Brillant!! Ingels has a wonderful future tense...
10742,5589,facebook_congress,W,I was honored to meet with Eliseo Medina and F...,The Democrats view this as another way to use ...,Negative,Content,0,0,The aspect this as some other manner to purpos...
10743,10672,reddit,W,SO YOU LIKE STACKING CUPS?! DO WE HAVE A GREAT...,Is this real?? Well at least this kid will be ...,Mixed,Content,1,3,be this real?? comfortably to the lowest degre...
10744,4839,facebook_congress,M,Try this Brian Schatz FB bumper sticker - an e...,EH BRIAN WEA MY STICKA N WAT OBAMA STAY ON UM ...,Neutral,Irrelevant,1,1,BRIAN WEA MY STICKA due north WAT OBAMA stick ...


# Transform to HuggingFace friendly format

In [None]:
# change to dataset to work with Huggingface transformer & remove unused columns
columns_to_remove = ['op_gender', 'source', 'Unnamed: 0', 'relevance', 'sentiment','post_text', 'label']

from datasets import load_dataset
train_dataset = Dataset.from_pandas(train_df_aug)
dev_dataset = Dataset.from_pandas(dev_df)

train_dataset = train_dataset.remove_columns(column_names= columns_to_remove)
dev_dataset = dev_dataset.remove_columns(column_names= columns_to_remove)
dev_dataset = dev_dataset.remove_columns(column_names= '__index_level_0__')

# rename sentiment to labels
train_dataset = train_dataset.rename_column("labels_4", "label")
dev_dataset = dev_dataset.rename_column("labels_4", "label")

In [None]:
# combine into a DataDictionary for huggingface use
rtg_dataset = DatasetDict({
    'train': train_dataset,
    'dev': dev_dataset 
})

rtg_dataset

DatasetDict({
    train: Dataset({
        features: ['response_text', 'label', 'augmented_response_text'],
        num_rows: 10746
    })
    dev: Dataset({
        features: ['response_text', 'label'],
        num_rows: 2301
    })
})

## Tokenize

In [None]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# find the P99 of length for response_text and set that as the max length 
max_length = train_df['response_text'].astype(str).map(len).quantile(0.99)
print(f"99th %tile of response_text length: {max_length}")

99th %tile of response_text length: 287.0


In [None]:
from transformers import AutoTokenizer

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, max_length = max_length)

def tokenize(batch):
    return tokenizer(batch["response_text"], padding=True, truncation=True)

DEBUG:filelock:Attempting to acquire lock 140135155123536 on /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock
DEBUG:filelock:Lock 140135155123536 acquired on /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock


Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140135155123536 on /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock
DEBUG:filelock:Lock 140135155123536 released on /root/.cache/huggingface/transformers/c1d7f0a763fb63861cc08553866f1fc3e5a6f4f07621be277452d26d71303b7e.20430bd8e10ef77a7d2977accefe796051e01bc2fc4aa146bc862997a1a15e79.lock
DEBUG:filelock:Attempting to acquire lock 140135913014736 on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e.lock
DEBUG:filelock:Lock 140135913014736 acquired on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e.lock


Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140135913014736 on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e.lock
DEBUG:filelock:Lock 140135913014736 released on /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e.lock
DEBUG:filelock:Attempting to acquire lock 140135154818640 on /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
DEBUG:filelock:Lock 140135154818640 acquired on /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140135154818640 on /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
DEBUG:filelock:Lock 140135154818640 released on /root/.cache/huggingface/transformers/45c3f7a79a80e1cf0a489e5c62b43f173c15db47864303a55d623bb3c96f72a5.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99.lock
DEBUG:filelock:Attempting to acquire lock 140135155186640 on /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock
DEBUG:filelock:Lock 140135155186640 acquired on /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock


Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140135155186640 on /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock
DEBUG:filelock:Lock 140135155186640 released on /root/.cache/huggingface/transformers/534479488c54aeaf9c3406f647aa2ec13648c06771ffe269edabebd4c412da1d.7f2721073f19841be16f41b0a70b600ca6b880c8f3df6f3535cbc704371bdfa4.lock


In [None]:
rtg_encoded = rtg_dataset.map(tokenize, batched=True, batch_size=None)
rtg_encoded['train'].features

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'augmented_response_text': Value(dtype='string', id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'label': Value(dtype='int64', id=None),
 'response_text': Value(dtype='string', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

# Model

In [None]:
from transformers import AutoModelForSequenceClassification
num_labels = 4
model = (AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device))

DEBUG:filelock:Attempting to acquire lock 140135127666960 on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock
DEBUG:filelock:Lock 140135127666960 acquired on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock


Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140135127666960 on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock
DEBUG:filelock:Lock 140135127666960 released on /root/.cache/huggingface/transformers/a8041bf617d7f94ea26d15e218abd04afc2004805632abc0ed2066aa16d50d04.faf6ea826ae9c5867d12b22257f9877e6b8367890837bd60f7c54a29633f7f2f.lock
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or wi

In [None]:
rtg_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
rtg_encoded["dev"].features

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'label': Value(dtype='int64', id=None),
 'response_text': Value(dtype='string', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [None]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1_weighted = f1_score(labels, preds, average="weighted")
    f1_macro = f1_score(labels, preds, average = 'macro')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1_weighted, "f1_macro": f1_macro} 

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 8
logging_steps = len(rtg_encoded["train"]) // batch_size
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=2,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=False,
                                 # metric_for_best_model="f1_macro",
                                 # weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  disable_tqdm=False
                                  )

In [None]:
from sklearn.metrics import classification_report

accuracy_list = []
weighted_f1_score_list = []
macro_f1_score_list = []
negative_f1_score = []
neutral_f1_score = []
mixed_f1_score = []
positive_f1_score = []


for i in range(5):
  try:
    del trainer
    del results
    del cr
  except: pass


  trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=rtg_encoded["train"],
                  eval_dataset=rtg_encoded["dev"])
  trainer.train()
  results = trainer.evaluate()

  # append macro metrics to lists
  accuracy_list.append(results.get('eval_accuracy'))
  weighted_f1_score_list.append(results.get("eval_f1"))
  macro_f1_score_list.append(results.get("eval_f1_macro"))

  trainer.predict(rtg_encoded["dev"])
  # append the class-level F1 scores
  outputs = trainer.predict(rtg_encoded["dev"])
  predictions = outputs.predictions.argmax(1)
  labels = rtg_encoded["dev"]['label']
  cr = classification_report(labels, predictions, digits=3, output_dict=True)
  negative_f1_score.append(cr.get('0').get("f1-score"))
  neutral_f1_score.append(cr.get('1').get("f1-score"))
  positive_f1_score.append(cr.get('2').get("f1-score"))
  mixed_f1_score.append(cr.get('3').get("f1-score"))

  print(f'---------------------------Iteration {i+1} Complete---------------------------\n')

The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: augmented_response_text, response_text.
***** Running training *****
  Num examples = 10746
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2688


Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.8423,0.81321,0.690569,0.672266,0.552235
2,0.6428,0.864906,0.6897,0.673108,0.55909


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 2301
  Batch size = 8
Saving model checkpoint to results/checkpoint-1344
Configuration saved in results/checkpoint-1344/config.json
Model weights saved in results/checkpoint-1344/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 2301
  Batch size = 8
Saving model checkpoint to results/checkpoint-2688
Configuration saved in results/checkpoint-2688/config.json
Model weights saved in results/checkpoint-2688/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequence

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: augmented_response_text, response_text.
***** Running training *****
  Num examples = 10746
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2688


---------------------------Iteration 1 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.4343,1.11746,0.67362,0.674032,0.569764
2,0.3812,1.39942,0.669709,0.665128,0.558108


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 2301
  Batch size = 8
Saving model checkpoint to results/checkpoint-1344
Configuration saved in results/checkpoint-1344/config.json
Model weights saved in results/checkpoint-1344/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 2301
  Batch size = 8
Saving model checkpoint to results/checkpoint-2688
Configuration saved in results/checkpoint-2688/config.json
Model weights saved in results/checkpoint-2688/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequence

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: augmented_response_text, response_text.
***** Running training *****
  Num examples = 10746
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2688


---------------------------Iteration 2 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.2182,1.93892,0.666667,0.664644,0.55935
2,0.2021,2.004549,0.665797,0.664539,0.559865


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 2301
  Batch size = 8
Saving model checkpoint to results/checkpoint-1344
Configuration saved in results/checkpoint-1344/config.json
Model weights saved in results/checkpoint-1344/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 2301
  Batch size = 8
Saving model checkpoint to results/checkpoint-2688
Configuration saved in results/checkpoint-2688/config.json
Model weights saved in results/checkpoint-2688/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequence

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: augmented_response_text, response_text.
***** Running training *****
  Num examples = 10746
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2688


---------------------------Iteration 3 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.0983,2.508683,0.653194,0.65631,0.546596
2,0.0868,2.470627,0.664059,0.663032,0.560102


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 2301
  Batch size = 8
Saving model checkpoint to results/checkpoint-1344
Configuration saved in results/checkpoint-1344/config.json
Model weights saved in results/checkpoint-1344/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 2301
  Batch size = 8
Saving model checkpoint to results/checkpoint-2688
Configuration saved in results/checkpoint-2688/config.json
Model weights saved in results/checkpoint-2688/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequence

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the training set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: augmented_response_text, response_text.
***** Running training *****
  Num examples = 10746
  Num Epochs = 2
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 2688


---------------------------Iteration 4 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.0603,2.739846,0.666667,0.66625,0.564209
2,0.0598,2.778995,0.668405,0.665162,0.562406


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 2301
  Batch size = 8
Saving model checkpoint to results/checkpoint-1344
Configuration saved in results/checkpoint-1344/config.json
Model weights saved in results/checkpoint-1344/pytorch_model.bin
The following columns in the evaluation set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Evaluation *****
  Num examples = 2301
  Batch size = 8
Saving model checkpoint to results/checkpoint-2688
Configuration saved in results/checkpoint-2688/config.json
Model weights saved in results/checkpoint-2688/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


The following columns in the evaluation set  don't have a corresponding argument in `BertForSequence

The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8
The following columns in the test set  don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: response_text.
***** Running Prediction *****
  Num examples = 2301
  Batch size = 8


---------------------------Iteration 5 Complete---------------------------



In [None]:
import statistics

print("%15s %s (%s)" % ("","Mean", "StDev"))

print("-"*29)
print("Macro Scores")
print("-"*29)

print(f"%15s %s (%s)" %("Accuracy",
    round(statistics.mean(accuracy_list),3),
    round(statistics.stdev(accuracy_list),3)))
print(f"%15s %5s (%s)" %("Macro F1",
    round(statistics.mean(macro_f1_score_list),3),
    round(statistics.stdev(macro_f1_score_list),3)))
print(f"%15s %5s (%s)" %("Weighted F1",
    round(statistics.mean(weighted_f1_score_list),3),
    round(statistics.stdev(weighted_f1_score_list),3)))

print("-"*29)
print("Class Scores")
print("-"*29)

print(f"%15s %s (%s)" %("Positive",
    round(statistics.mean(positive_f1_score),3),
    round(statistics.stdev(positive_f1_score),3)))
print(f"%15s %5s (%s)" %("Neutral",
    round(statistics.mean(neutral_f1_score),3),
    round(statistics.stdev(neutral_f1_score),3)))
print(f"%15s %5s (%s)" %("Negative",
    round(statistics.mean(negative_f1_score),3),
    round(statistics.stdev(negative_f1_score),3)))
print(f"%15s %5s (%s)" %("Mixed",
    round(statistics.mean(mixed_f1_score),3),
    round(statistics.stdev(mixed_f1_score),3)))

                Mean (StDev)
-----------------------------
Macro Scores
-----------------------------
       Accuracy 0.672 (0.01)
       Macro F1  0.56 (0.002)
    Weighted F1 0.666 (0.004)
-----------------------------
Class Scores
-----------------------------
       Positive 0.824 (0.005)
        Neutral 0.583 (0.013)
       Negative 0.593 (0.014)
          Mixed 0.239 (0.033)


In [None]:
output_model_file = '/content/drive/MyDrive/w266/pytorch_bert_rtgender_easy_data_aug.bin'
output_vocab_file = './'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

All files saved
