<a href="https://colab.research.google.com/github/ipietri/w266_Final_Project/blob/master/notebooks/RtGender-Notebooks/RtGender_Annotations_Sentiment_Back_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RtGender - Annotations - Sentiment - Back Translation
[Source](https://github.com/bhadreshpsavani/ExploringSentimentalAnalysis/blob/main/SentimentalAnalysisWithDistilbert.ipynb)




In [21]:
try:
  from google.colab import drive
  drive.mount('/content/drive', force_remount=True)
  path = r'/content/drive/MyDrive/w266'
except ModuleNotFoundError:
  path = r'data'

Mounted at /content/drive


<a id='section01'></a>
## Load Modules

In [1]:
%%capture
!pip install transformers==4.1.1 

# Only needed for Back translation
!pip install sentencepiece==0.1.94
!pip install mosestokenizer==1.1.0
from transformers import MarianMTModel, MarianTokenizer

# needed for tokenization
from transformers import AutoTokenizer

In [2]:
%%capture
!pip install datasets
import datasets 
from datasets import load_dataset, Dataset, DatasetDict

In [3]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn import metrics
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
#from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import logging
logging.basicConfig(level=logging.ERROR)

In [4]:
# Setting up the device for GPU usage

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'

In [5]:
# # Setting up the device for TPUs
# %%capture
# # https://colab.research.google.com/github/pytorch/xla/blob/master/contrib/colab/getting-started.ipynb#scrollTo=3P6b3uqfzpDI
# import os
# assert os.environ['COLAB_TPU_ADDR'] 
# !pip install cloud-tpu-client==0.10 https://storage.googleapis.com/tpu-pytorch/wheels/torch_xla-1.9-cp37-cp37m-linux_x86_64.whl
# # imports the torch_xla package
# VERSION = "1.5"  #@param ["1.5" , "20200325", "nightly"]
# !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --version $VERSION

# import torch_xla
# import torch_xla.core.xla_model as xm

# # Creates a random tensor on xla:1 (a Cloud TPU core)
# device = xm.xla_device()



<a id='section02'></a>
## Import and Reshape Data

In [8]:
train_df = pd.read_csv('/content/drive/MyDrive/w266/train_oversampled.csv')
dev_df = pd.read_csv('/content/drive/MyDrive/w266/annotations_dev.csv')

print('train_shape: ',train_df.shape)
print('dev_shape: ',dev_df.shape)

train_shape:  (21184, 9)
dev_shape:  (2303, 9)


In [9]:
# there are NaNs in the dev dataset remove 
nan_values = dev_df[dev_df.isna().any(axis=1)] 
print(nan_values)

# return without missing values in response_text
dev_df.dropna(subset = ["response_text"], inplace=True)

print("Train shape", train_df.shape)
print("Dev shape", dev_df.shape)

      Unnamed: 0         source op_gender  ...   relevance label labels_4
830         2576  facebook_wiki         M  ...  Irrelevant     1        1
1664        2722  facebook_wiki         W  ...  Irrelevant     1        1

[2 rows x 9 columns]
Train shape (21184, 9)
Dev shape (2301, 9)


## Process Data

### Determine Max Length

In [10]:
# find the P99 of length for response_text and set that as the max length 
max_length = train_df['response_text'].astype(str).map(len).quantile(0.99).astype(int)
print(f"99th %tile of response_text length: {max_length}")

99th %tile of response_text length: 289


## Back Translate (skip after first run)
[source](https://dzlab.github.io/dltips/en/pytorch/text-augmentation/)

In [None]:
# Helper function to download data for a language
def download(model_name):
  tokenizer = MarianTokenizer.from_pretrained(model_name)
  model = MarianMTModel.from_pretrained(model_name)
  return tokenizer, model


# model that can translate from English to Romance languages
# this is a single model that can translate to any of the romance languages

target_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-en-ROMANCE')
target_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-en-ROMANCE')

# initialize models that can translate Romance languages to English.

en_tokenizer = MarianTokenizer.from_pretrained('Helsinki-NLP/opus-mt-ROMANCE-en')
en_model = MarianMTModel.from_pretrained('Helsinki-NLP/opus-mt-ROMANCE-en')

Beam search was timing out even with Colab Pro+. Setting a max beam eval of 2 per this [stackoverflow] (https://github.com/huggingface/transformers/issues/7324).

In [None]:
def translate(texts, model, tokenizer, language="fr"):
    # Prepare the text data into appropriate format for the model
    template = lambda text: f"{text}" if language == "en" else f">>{language}<< {text}"
    src_texts = [template(i) for i in texts]

    # Tokenize the texts
    encoded = tokenizer.prepare_seq2seq_batch(src_texts,return_tensors="pt",
                                              max_length = 150, 
                                              eval_beams=1, 
                                              eval_max_gen_length=150 
                                              )
                                       
    
    # Generate translation using model
    translated = model.generate(**encoded)

    # Convert the generated tokens indices back into text
    translated_texts = tokenizer.batch_decode(translated, skip_special_tokens=True)
    
    return translated_texts

In [None]:
def back_translate(texts, source_lang="en", target_lang="fr"):
    # Translate from source to target language
    fr_texts = translate(texts, target_model, target_tokenizer, 
                         language=target_lang)

    # Translate from target language back to source language
    back_translated_texts = translate(fr_texts, en_model, en_tokenizer, 
                                      language=source_lang)
    
    return back_translated_texts

In [None]:
#@title
# note that even a single text needs to be inside a list
# single_sentence = ['I would like to have a cup of coffee']

# aug_texts = back_translate(single_sentence, source_lang="en", target_lang="es")
# print(aug_texts)

### Test timing for Back translation

In [None]:
#import timeit

# train_df_aug = train_df.drop(['Unnamed: 0', 'source', 'op_gender', 'post_text', 'sentiment', 'relevance', 'label'], axis = 1)
# train_df_aug = train_df_aug[:10]


Time to run 10 examples using ColabPro+ with truncation at 99% of response_text length: 
* TPU: 1 loop, best of 5: 40.4 s per example
* GPU: 1 loop, best of 5: 31.2 s per example

But when I truncate to 150: 1 loop, best of 5: 14.5 s per example


In [None]:
# %%timeit
# es = []
# for i in train_df_aug['response_text']:
#   es.append(back_translate([i], source_lang='en', target_lang='es')[0])


### Run back Translation once.. 
save out and import for future runs.
only one language ran in 24 hours. Will back translate Spanish only

In [None]:
## run the first time only
train_df_aug = train_df.drop(['Unnamed: 0', 'source', 'op_gender', 'post_text', 'sentiment', 'relevance', 'label'], axis = 1)

train_df_aug['es_trans'] = train_df_aug['response_text'].apply(lambda x: back_translate([x], source_lang='en', target_lang='es')[0])
train_df_aug['fr_trans'] = train_df_aug['response_text'].apply(lambda x: back_translate([x], source_lang='en', target_lang='fr')[0])
train_df_aug['it_trans'] = train_df_aug['response_text'].apply(lambda x: back_translate([x], source_lang='en', target_lang='it')[0])
train_df_aug['pt_trans'] = train_df_aug['response_text'].apply(lambda x: back_translate([x], source_lang='en', target_lang='pt')[0])
train_df_aug['ro_trans'] = train_df_aug['response_text'].apply(lambda x: back_translate([x], source_lang='en', target_lang='ro')[0]) 
train_df_aug.to_csv('/content/drive/MyDrive/w266/back_translation_augmented_train_data.csv', index=False)

## Convert to HuggingFace format

In [11]:
#import 
train_df_aug = pd.read_csv('/content/drive/MyDrive/w266/back_translation_augmented_train_data.csv')
print('Back translated train shape: ', train_df_aug.shape)
# combine the translated (for the basic train dataset) with the oversampled version
train_df_aug = pd.merge(train_df, train_df_aug, on = ["labels_4", "response_text"])

Back translated train shape:  (10746, 7)


In [12]:
# reshape data
train_augmented_df = pd.concat([
  train_df_aug[['response_text','labels_4']],
  train_df_aug[['es_trans','labels_4']].rename(columns={'es_trans':'response_text'}),
  train_df_aug[['fr_trans','labels_4']].rename(columns={'fr_trans':'response_text'}),
  train_df_aug[['it_trans','labels_4']].rename(columns={'it_trans':'response_text'})
  # train_df_aug[['pt_trans','labels_4']].rename(columns={'pt_trans':'response_text'}),
  # train_df_aug[['ro_trans','labels_4']].rename(columns={'ro_trans':'response_text'})
  ])

In [13]:
print('Back translated train shape: ', train_augmented_df.shape)
print(train_augmented_df['labels_4'].value_counts(normalize=True))


Back translated train shape:  (88800, 2)
2    0.255495
1    0.252387
0    0.248018
3    0.244099
Name: labels_4, dtype: float64


In [14]:
# change to dataset to work with Huggingface transformer & remove unused columns
columns_to_remove = ['op_gender', 'source', 'Unnamed: 0', 'relevance', 'sentiment','post_text','label']

from datasets import load_dataset
train_dataset = Dataset.from_pandas(train_augmented_df)
dev_dataset = Dataset.from_pandas(dev_df)

dev_dataset = dev_dataset.remove_columns(column_names= columns_to_remove)
dev_dataset = dev_dataset.remove_columns(column_names= '__index_level_0__')


# rename labels_4 to labels
train_dataset = train_dataset.rename_column("labels_4", "label")
dev_dataset = dev_dataset.rename_column("labels_4", "label")

In [15]:
# combine into a DataDictionary for huggingface use
rtg_dataset = DatasetDict({
    'train': train_dataset,
    'dev': dev_dataset 
})

rtg_dataset

DatasetDict({
    train: Dataset({
        features: ['response_text', 'label', '__index_level_0__'],
        num_rows: 88800
    })
    dev: Dataset({
        features: ['response_text', 'label'],
        num_rows: 2301
    })
})

## Tokenize

In [16]:
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name, max_length = int(max_length))

def tokenize(batch):
    return tokenizer(batch["response_text"], padding=True, truncation=True)

In [17]:
rtg_encoded = rtg_dataset.map(tokenize, batched=True, batch_size=None)
rtg_encoded['train'].features

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

{'__index_level_0__': Value(dtype='int64', id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'label': Value(dtype='int64', id=None),
 'response_text': Value(dtype='string', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

# Model

In [28]:
import gc
try:
  del trainer
  del results
  del cr
  del model
  gc.collect()
  torch.cuda.empty_cache()
except: pass



In [29]:
from transformers import AutoModelForSequenceClassification
num_labels = 4
epochs = 2
iterations = 5
model = (AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels).to(device))

In [30]:
rtg_encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])

# check the type for each feature
rtg_encoded["dev"].features

{'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'label': Value(dtype='int64', id=None),
 'response_text': Value(dtype='string', id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

In [31]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1_weighted = f1_score(labels, preds, average="weighted")
    f1_macro = f1_score(labels, preds, average = 'macro')
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1_weighted, "f1_macro": f1_macro} 

In [34]:
from transformers import Trainer, TrainingArguments

batch_size = 8
logging_steps = len(rtg_encoded["train"]) // batch_size
training_args = TrainingArguments(output_dir="results",
                                  num_train_epochs=epochs,
                                  learning_rate=2e-5,
                                  per_device_train_batch_size=batch_size,
                                  per_device_eval_batch_size=batch_size,
                                  load_best_model_at_end=False,
                                 # metric_for_best_model="f1_macro",
                                 # weight_decay=0.01,
                                  evaluation_strategy="epoch",
                                #  save_strategy="epoch",
                                  disable_tqdm=False
                                  )

sentiment_mappings = {'Positive': 2, 'Mixed': 3, 'Neutral': 1, 'Negative':0}

In [35]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Nov 15 23:07:39 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 495.44       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    38W / 300W |  16075MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [36]:
from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

if ram_gb < 20:
  print('Not using a high-RAM runtime')
else:
  print('You are using a high-RAM runtime!')

Your runtime has 54.8 gigabytes of available RAM

You are using a high-RAM runtime!


In [37]:
from sklearn.metrics import classification_report

accuracy_list = []
weighted_f1_score_list = []
macro_f1_score_list = []
negative_f1_score = []
neutral_f1_score = []
mixed_f1_score = []
positive_f1_score = []


for i in range(iterations):
  try:
    del trainer
    del results
    del cr
  except: pass


  trainer = Trainer(model=model, 
                    args=training_args,
                    compute_metrics=compute_metrics,
                    train_dataset=rtg_encoded["train"],
                    eval_dataset=rtg_encoded["dev"]
                    )
  trainer.train()
  results = trainer.evaluate()

  # append macro metrics to lists
  accuracy_list.append(results.get('eval_accuracy'))
  weighted_f1_score_list.append(results.get("eval_f1"))
  macro_f1_score_list.append(results.get("eval_f1_macro"))

  trainer.predict(rtg_encoded["dev"])
  # append the class-level F1 scores
  outputs = trainer.predict(rtg_encoded["dev"])
  predictions = outputs.predictions.argmax(1)
  labels = rtg_encoded["dev"]['label']
  cr = classification_report(labels, predictions, digits=3, output_dict=True)
  negative_f1_score.append(cr.get('0').get("f1-score"))
  neutral_f1_score.append(cr.get('1').get("f1-score"))
  positive_f1_score.append(cr.get('2').get("f1-score"))
  mixed_f1_score.append(cr.get('3').get("f1-score"))


  print(f'---------------------------Iteration {i+1} Complete---------------------------\n')

Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.205062,1.955233,0.664494,0.646651,0.534359
2,0.081273,2.387282,0.653194,0.636807,0.531521


---------------------------Iteration 1 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.060521,2.702831,0.643199,0.632468,0.521587
2,0.044016,2.930515,0.641895,0.627413,0.517994


---------------------------Iteration 2 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.037191,2.975974,0.661452,0.646522,0.542509
2,0.04163,3.174435,0.647979,0.634124,0.529559


---------------------------Iteration 3 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.030316,3.223391,0.648848,0.635821,0.526099
2,0.048357,3.321557,0.644937,0.62991,0.522844


---------------------------Iteration 4 Complete---------------------------



Epoch,Training Loss,Validation Loss,Accuracy,F1,F1 Macro
1,0.026273,3.216458,0.657106,0.643313,0.535387
2,0.043329,3.460248,0.651021,0.635429,0.534987


---------------------------Iteration 5 Complete---------------------------



## Evaluate

In [38]:
import statistics

print("%15s %s (%s)" % ("","Mean", "StDev"))

print("-"*29)
print("Macro Scores")
print("-"*29)

print(f"%15s %s (%s)" %("Accuracy",
    round(statistics.mean(accuracy_list),3),
    round(statistics.stdev(accuracy_list),3)))
print(f"%15s %5s (%s)" %("Macro F1",
    round(statistics.mean(macro_f1_score_list),3),
    round(statistics.stdev(macro_f1_score_list),3)))
print(f"%15s %5s (%s)" %("Weighted F1",
    round(statistics.mean(weighted_f1_score_list),3),
    round(statistics.stdev(weighted_f1_score_list),3)))

print("-"*29)
print("Class Scores")
print("-"*29)

print(f"%15s %s (%s)" %("Positive",
    round(statistics.mean(positive_f1_score),3),
    round(statistics.stdev(positive_f1_score),3)))
print(f"%15s %5s (%s)" %("Neutral",
    round(statistics.mean(neutral_f1_score),3),
    round(statistics.stdev(neutral_f1_score),3)))
print(f"%15s %5s (%s)" %("Negative",
    round(statistics.mean(negative_f1_score),3),
    round(statistics.stdev(negative_f1_score),3)))
print(f"%15s %5s (%s)" %("Mixed",
    round(statistics.mean(mixed_f1_score),3),
    round(statistics.stdev(mixed_f1_score),3)))

                Mean (StDev)
-----------------------------
Macro Scores
-----------------------------
       Accuracy 0.648 (0.005)
       Macro F1 0.527 (0.007)
    Weighted F1 0.633 (0.004)
-----------------------------
Class Scores
-----------------------------
       Positive 0.791 (0.002)
        Neutral 0.545 (0.01)
       Negative 0.558 (0.01)
          Mixed 0.215 (0.023)


In [39]:
output_model_file = '/content/drive/MyDrive/w266/backtranslate_bert_rtgender_sentiment.bin'
output_vocab_file = './'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

All files saved
