<a href="https://colab.research.google.com/github/ipietri/w266_Final_Project/blob/master/notebooks/TRAC-2-notebooks/grayscaling/TRAC_2_Track_A_Grayscaling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# TRAC-2 - Track A w/ Grayscaling
---
[Code Source](https://github.com/ainagari/scalar_adjs)

* [BERT Knows Punta Cana is not just beautiful, it’s gorgeous: Ranking Scalar Adjectives with Contextualised Representations](https://aclanthology.org/2020.emnlp-main.598.pdf)
* [Scalar Adjective Identification and Multilingual Ranking](https://arxiv.org/abs/2105.01180)
* [Identifying and Ordering Scalar Adjectives Using Lexical Substitution](https://www.proquest.com/openview/aade435a5bbdcf41e2b8c24e648826cc/1.pdf?pq-origsite=gscholar&cbl=18750)
* [A Gold Standard for Scalar Adjectives](https://aclanthology.org/L16-1424/)


In [3]:
# Detect whether the notebook runs on Google Colab. Only the import is guarded,
# so an error raised while mounting Drive is not mistaken for "not on Colab".
try:
  from google.colab import drive
except ModuleNotFoundError:
  # Not on Colab: fall back to a local, relative data directory.
  path = r'data'
else:
  drive.mount('/content/drive', force_remount=True)
  path = r'/content/drive/MyDrive/w266'

Mounted at /content/drive


<a id='section01'></a>
# Load Modules

In [4]:
#!pip install -U nltk
import nltk

# Fetch the WordNet corpus; `nltk.corpus.wordnet` is imported further down.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [5]:
%%capture
!pip install -q transformers

In [6]:
%%capture
!pip install datasets
import datasets 
from datasets import load_dataset, Dataset, DatasetDict

In [7]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn import metrics
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
#from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import logging
# Ask the root logger to report only records at ERROR level or above.
logging.basicConfig(level=logging.ERROR)

In [8]:
# Pick the compute device: the GPU when CUDA is available, otherwise the CPU.

from torch import cuda

if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [9]:
from nltk.corpus import wordnet as wn
import gzip
import pickle
import numpy as np
import sys
from scipy.spatial.distance import cosine
from operator import itemgetter
from collections import defaultdict
#from pymagnitude import *
import argparse

import itertools

In [10]:
# import eda script from github
!git clone https://github.com/ainagari/scalar_adjs

Cloning into 'scalar_adjs'...
remote: Enumerating objects: 854, done.[K
remote: Counting objects: 100% (438/438), done.[K
remote: Compressing objects: 100% (189/189), done.[K
remote: Total 854 (delta 91), reused 358 (delta 45), pack-reused 416[K
Receiving objects: 100% (854/854), 13.47 MiB | 18.03 MiB/s, done.
Resolving deltas: 100% (129/129), done.


In [11]:
# import functions from scalar_adjs
import sys

# sys.path is a list of absolute path strings; add the clone directory only
# once so that re-running this cell does not accumulate duplicate entries.
_scalar_adjs_dir = '/content/scalar_adjs/'
if _scalar_adjs_dir not in sys.path:
  sys.path.append(_scalar_adjs_dir)

from read_scalar_datasets import read_scales


In [12]:
# List the working directory (quick check that the scalar_adjs clone is present).
!ls

drive  sample_data  scalar_adjs


In [13]:
# get the greyscaling augmentation script from our repo
# NOTE(review): the recorded output of this cell is an authentication failure
# ("could not read Username"), so the clone did not succeed. The augmentation
# notebook is instead run from Google Drive in the "Augment" section.
!git clone https://github.com/ipietri/w266_Final_Project
#sys.path.append('/content/')

Cloning into 'w266_Final_Project'...
fatal: could not read Username for 'https://github.com': No such device or address


<a id='section02'></a>
# Import and Preprocess Data

In [14]:
# Load the oversampled Sub-task A training data and the English dev split.
# Files are read from the data directory chosen at the top of the notebook
# (`path`) instead of a hard-coded Drive location, so the local fallback
# directory is honoured as well.
trac2_task_a = pd.read_csv(f'{path}/task_A_data_oversampled.csv')

print('TASK A: ',trac2_task_a.shape)


trac2_dev = pd.read_csv(f'{path}/trac2_eng_dev.csv')
print('dev: ',trac2_dev.shape)
print("TASK A unique sentiments: ", trac2_task_a['Sub-task A'].unique())

TASK A:  (10125, 2)
dev:  (1066, 4)
TASK A unique sentiments:  ['NAG' 'CAG' 'OAG']


In [15]:
# create a dev dataset for track a and rename columns
# The dev frame is an explicit copy of the two columns: renaming a slice of
# `trac2_dev` in place is what triggers pandas' SettingWithCopyWarning.
trac2_task_a_dev = (
    trac2_dev[['Text', 'Sub-task A']]
    .copy()
    .rename(columns={'Text': 'text', 'Sub-task A': 'label'})
)
trac2_task_a = trac2_task_a.rename(columns={'Text': 'text', 'Sub-task A': 'label'})

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [16]:
# Drop rows whose 'text' value is missing (NaN).
# The result is re-bound instead of using inplace=True: this keeps the cell
# re-runnable and avoids SettingWithCopyWarning on the dev frame.

# TRACK A
# train
trac2_task_a = trac2_task_a.dropna(subset = ['text'])

# dev
trac2_task_a_dev = trac2_task_a_dev.dropna(subset = ['text'])

print('train: ',trac2_task_a.shape)
print('dev: ',trac2_task_a_dev.shape)

train:  (10125, 2)
dev:  (1066, 2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [17]:
# Dummy variables

# task A
# task_a_labels = {'NAG':1, 'OAG': 2, 'CAG':0}
# trac2_task_a['label'] = trac2_task_a['label'].map(task_a_labels).astype(int)
# trac2_task_a_dev['label'] = trac2_task_a_dev['label'].map(task_a_labels).astype(int)



# Greyscale
Adapted from the scalar_adjs code: [Extract Relevant Text](https://github.com/ainagari/scalar_adjs/blob/master/extract_flickr_scalar.py)



In [18]:
import pickle
import pdb
import spacy
import os
import collections

# Only English data is handled in this notebook.
language_str = "en"

# spaCy pipeline used for English text.
nlp = spacy.load("en_core_web_sm")

### Identify the location of every word present in the three types of scales
For every example that contains at least one scale word, extract the matches and save them as a dictionary. Adapted from extract_flickr_scalar.py.

In [19]:
rankings = dict()
datanames = ["demelo", "crowd", "wilkinson"]

import dill

# Mapping indexed later as [dataset name][word] to obtain the milder words.
# NOTE(review): dill/pickle can execute arbitrary code while loading, so only
# load this file from a trusted location.
filename = f"{path}/scales_with_milder_option.pickle"
# The context manager closes the file handle once the object is loaded.
with open(filename, "rb") as pickle_file:
    scales_with_milder_option = dill.load(pickle_file)

In [20]:
# Gold rankings of every scalar-adjective dataset, keyed by dataset name.
datanames = ["demelo", "crowd", "wilkinson"]
rankings = {
    name: read_scales("/content/scalar_adjs/data/" + name + "/gold_rankings/")
    for name in datanames
}

# Every individual word that appears in any scale; tied entries are written
# as "a || b" and contribute each member separately.
my_words = {
    member
    for scales in rankings.values()
    for scale_key in scales
    for entry in scales[scale_key]
    for member in entry.split(" || ")
}

# One (initially empty) set per scale word.
word_sentence_dict = {scale_word: set() for scale_word in my_words}

def accepted_pos(pos):
    """Return True when `pos` is one of the accepted part-of-speech tags.

    The accepted tags are "ADJ", "ADV", "ADP", "VERB" and "DET"; any other
    value gives False.
    """
    return pos in ("ADJ", "ADV", "ADP", "VERB", "DET")


## Identify the position of each word that exists in any of the scales

  Since there are ties in our scales,
  the milder word may be one or more words.
  So if the original word is `foo || bar` and the
  milder word is `foolish || barish`, this yields the pairs
  {foo: foolish, foo: barish, bar: foolish, bar: barish}

In [21]:
# Build {dataset name: {str(scale): tuple of every word in that scale}},
# with tied entries ("a || b") flattened into their individual words.

import collections
scales_dict = collections.defaultdict(dict)

for dataname in datanames:
  per_dataset = rankings[dataname]
  for scale in per_dataset.values():
    flat_words = tuple(
        member for tied in scale for member in tied.split(" || ")
    )
    scales_dict[dataname][str(scale)] = flat_words

In [22]:
def locate_scale_word(test_col, label_col):
  '''
  Find the scale words that occur in one text, and where they occur.

  The text is stripped of apostrophes, lower-cased and split on whitespace, so
  only whole-token matches count (no partial matches inside longer words).

  Args:
    test_col: the text of a single example (a string). Any non-string value
      (e.g. a missing value) yields an empty result.
    label_col: the label of the example; accepted for call compatibility but
      not used.

  Returns:
    A nested defaultdict ``{dataset name: {word: {...}}}`` holding, for every
    scale word found in the text, ``'position'`` (index of its first
    occurrence among the tokens) and ``'milder_words'`` (the value of
    ``scales_with_milder_option[dataset name][word]``).

  Note:
    A matched word without an entry in ``scales_with_milder_option`` may raise
    KeyError, depending on the type of that mapping.
  '''
  more_test = collections.defaultdict(lambda: collections.defaultdict(dict))
  if not isinstance(test_col, str):
    return more_test

  # Tokenise once per call instead of once per scale word.
  # convert text into a list of words to avoid partial matches found using .find()
  sentence_words = test_col.replace("'", "").lower().split()

  # First index of every token, i.e. what list.index() would return.
  first_position = {}
  for idx, token in enumerate(sentence_words):
    first_position.setdefault(token, idx)

  for data_name in datanames:
    for scale_name, words in scales_dict[data_name].items():
      for word in words:
        pos = first_position.get(word)
        if pos is None:
          continue
        # assume only one word to be replaced
        more_test[data_name][word]['position'] = int(pos)
        more_test[data_name][word]['milder_words'] = scales_with_milder_option[data_name][word]

  return more_test

In [23]:
# For every training row, record which scale words occur in its text (and
# where), then export the frame as {row index: {column: value}}.

# task A
def _scale_words_in_row(row):
  return locate_scale_word(row['text'], row['label'])

trac2_task_a['new_col'] = trac2_task_a.apply(_scale_words_in_row, axis = 1)
dict_from_df_a = trac2_task_a.to_dict(orient='index')


# Augment

In [24]:
# Change into the Drive project folder so the augmentation notebook can be run by name.
%cd "/content/drive/MyDrive/w266"
# Execute the augmentation notebook; presumably this is what defines
# `augment_greyscaling` (it is not defined in this notebook) - verify.
%run grey_scale_augmentation.ipynb
# Unpacks three results (labels, augmented texts, example ids); they are
# combined column-wise into a DataFrame in the following cell.
labels_list, augmented_text_list, id_list = augment_greyscaling(dict_from_df_a, datanames, 'text', 'label')




/content/drive/MyDrive/w266


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [25]:
# append the augmented data plus labels

new_df = pd.DataFrame({'id':id_list, 'text':augmented_text_list, 
              'label':labels_list})
print("Number of augmented examples: ", len(new_df))

# add an example id column
trac2_task_a['id'] = trac2_task_a.index

# add labels indicating original vs augmented examples
trac2_task_a['is_og'] = 1
new_df['is_og'] = 0

# Stack the augmented rows on top of the original examples.
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; row indices are kept unchanged, as before.
train_df_aug = pd.concat([new_df, trac2_task_a])


Number of augmented examples:  11966


In [26]:
# Remove the helper column of located scale words before saving. The result is
# re-bound (no inplace=True) and errors='ignore' is used, so re-running this
# cell does not raise KeyError once the column is already gone.
train_df_aug = train_df_aug.drop(columns='new_col', errors='ignore')

In [27]:
# Write the combined (augmented + original) training set to Drive, then report its size.
augmented_train_csv = '/content/drive/MyDrive/w266/grey_scaled_augmented_oversampled_subtask_a_train_data.csv'
train_df_aug.to_csv(augmented_train_csv, index=False)
print("Total number of examples: ", len(train_df_aug))

Total number of examples:  22091
