<a href="https://colab.research.google.com/github/ipietri/w266_Final_Project/blob/master/notebooks/RtGender-Notebooks/Sentiment_ScalesGrayscaling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Create Sentiment Scales with Greyscaled Language
[Code Source](https://github.com/ainagari/scalar_adjs)

* [BERT Knows Punta Cana is not just beautiful, it’s gorgeous: Ranking Scalar Adjectives with Contextualised Representations](https://aclanthology.org/2020.emnlp-main.598.pdf)
* [Scalar Adjective Identification and Multilingual Ranking](https://arxiv.org/abs/2105.01180)
* [Identifying and Ordering Scalar Adjectives Using Lexical Substitution](https://www.proquest.com/openview/aade435a5bbdcf41e2b8c24e648826cc/1.pdf?pq-origsite=gscholar&cbl=18750)
* [A Gold Standard for Scalar Adjectives](https://aclanthology.org/L16-1424/)


In [1]:
# Choose the project data directory.
# When google.colab can be imported, mount Google Drive and use the project
# folder on the drive; when the import raises ModuleNotFoundError (i.e. the
# notebook is not running on Colab), fall back to the relative `data` folder.
try:
  from google.colab import drive
  drive.mount('/content/drive', force_remount=True)
  path = r'/content/drive/MyDrive/w266'
except ModuleNotFoundError:
  path = r'data'

Mounted at /content/drive


<a id='section01'></a>
## Load Modules

In [2]:
#!pip install -U nltk
import nltk

# Fetch the WordNet corpus; `nltk.corpus.wordnet` is imported further down.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [3]:
%%capture
!pip install -q transformers

In [4]:
%%capture
!pip install datasets
import datasets 
from datasets import load_dataset, Dataset, DatasetDict

In [5]:
# Importing the libraries needed
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn import metrics
import torch
import seaborn as sns
import transformers
import json
from tqdm import tqdm
#from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import logging
# Configure logging so that only records at ERROR level or above are emitted.
logging.basicConfig(level=logging.ERROR)

In [6]:
from nltk.corpus import wordnet as wn
import gzip
import pickle
import numpy as np
import sys
from scipy.spatial.distance import cosine
from operator import itemgetter
from collections import defaultdict
#from pymagnitude import *
import argparse

import itertools

In [7]:
# Clone the scalar_adjs repository from GitHub; later cells import
# `read_scalar_datasets` from it and read its `data/<dataset>/gold_rankings/` folders.
!git clone https://github.com/ainagari/scalar_adjs

Cloning into 'scalar_adjs'...
remote: Enumerating objects: 854, done.[K
remote: Counting objects: 100% (438/438), done.[K
remote: Compressing objects: 100% (189/189), done.[K
remote: Total 854 (delta 91), reused 358 (delta 45), pack-reused 416[K
Receiving objects: 100% (854/854), 13.47 MiB | 29.35 MiB/s, done.
Resolving deltas: 100% (129/129), done.


In [8]:
# Import functions from the cloned scalar_adjs repository.
import sys

# sys.path is a list of absolute path strings. Add the clone's directory only
# when it is not already present, so re-running this cell does not pile up
# duplicate entries.
scalar_adjs_dir = '/content/scalar_adjs/'
if scalar_adjs_dir not in sys.path:
    sys.path.append(scalar_adjs_dir)

from read_scalar_datasets import read_scales


# Greyscale
Adapted from the [scalar_adjs code](https://github.com/ainagari/scalar_adjs).



# [Extract Relevant Text](https://github.com/ainagari/scalar_adjs/blob/master/extract_flickr_scalar.py)

In [9]:
import pickle
import pdb
import spacy
import os

# Load the spaCy pipeline named "en_core_web_sm".
# NOTE(review): no cell visible here downloads this model — confirm it is
# already installed in the target environment.
nlp = spacy.load("en_core_web_sm")

language_str = "en" #set to english -- one dataset in English only

### Identify the location of every word present in the three types of scales
Extract and save, as a dictionary, every example that contains at least one scale word. Adapted from `extract_flickr_scalar.py`.

In [10]:
# Read the gold rankings of each dataset, keyed by dataset name.
datanames = ["demelo", "crowd", "wilkinson"]
rankings = {
    dataname: read_scales("/content/scalar_adjs/data/" + dataname + "/gold_rankings/")
    for dataname in datanames
}

# Collect every individual word that appears in any scale; an entry holding
# tied words ("a || b") contributes each of its words separately.
my_words = {
    single_word
    for dataset_scales in rankings.values()
    for ranked_entries in dataset_scales.values()
    for entry in ranked_entries
    for single_word in entry.split(" || ")
}

# Start every scale word off with an empty set.
word_sentence_dict = {word: set() for word in my_words}

def accepted_pos(pos):
    """Return True when `pos` is one of the accepted part-of-speech tags.

    The accepted tags are "ADJ", "ADV", "ADP", "VERB" and "DET"; any other
    value yields False.
    """
    return pos in ("ADJ", "ADV", "ADP", "VERB", "DET")



## Identify the position of the word which exists in any of the scales

## Create Scaled Dictionaries with Grey Scaling

In [11]:
# Build a nested dictionary: dataset name -> {str(scale): tuple of every word
# in that scale}. Entries holding tied words ("a || b") are split so the tuple
# lists each word individually, in scale order.

import collections

scales_dict = collections.defaultdict(dict)

for dataname in datanames:
  for scale in rankings[dataname].values():
    flattened_words = tuple(
        single_word
        for tied_entry in scale
        for single_word in tied_entry.split(" || ")
    )
    # The key is the string form of the scale, ties included.
    scales_dict[dataname][str(scale)] = flattened_words


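Since there are ties in our scales, both the original word and its milder
alternative may be one or more words. For example, if the original entry is
`foo || bar` and the milder entry is `foolish || barish`, every tied original
word is mapped to the full list of milder words:
`{foo: [foolish, barish], bar: [foolish, barish]}`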

In [16]:
import ast

# dataset name -> {word: list of all milder words on the same scale}
scales_with_milder_option = collections.defaultdict(lambda: collections.defaultdict(dict))

### iterate through the per-dataset scale dictionaries and create a master dictionary
for data_name in datanames:
  for scale_repr in scales_dict[data_name]:
    # Sample input:  scale_repr = "['harmful', 'toxic', 'deadly']"
    # Sample output: {'deadly': ['harmful', 'toxic'], 'toxic': ['harmful']}
    #
    # Each key of scales_dict is the string form of an ordered scale (mildest
    # entry first). Parse it back with ast.literal_eval instead of slicing off
    # brackets and stripping quotes, which would corrupt any word that itself
    # contains an apostrophe. Assumes the key is the repr of a list/tuple of
    # strings; literal_eval raises if it is not.
    # Every entry is then split into its tied words ("a || b" -> ['a', 'b']).
    intensity_levels = [entry.split(" || ") for entry in ast.literal_eval(scale_repr)]

    # Walk from the most extreme level down to the second-mildest one; the
    # mildest level has nothing milder than itself and is therefore skipped.
    for level_index in range(len(intensity_levels) - 1, 0, -1):
      milder_list = [
          milder_word
          for level in intensity_levels[:level_index]
          for milder_word in level
      ]
      for extreme_word in intensity_levels[level_index]:
        # Each tied word gets its own copy so the lists are not aliased.
        # NOTE: this is a plain assignment, so a word that occurs in several
        # scales of the same dataset keeps only the list assigned last.
        scales_with_milder_option[data_name][extreme_word] = list(milder_list)

In [17]:
# Display the complete nested mapping (dataset -> word -> milder words) for inspection.
scales_with_milder_option

defaultdict(<function __main__.<lambda>>,
            {'crowd': defaultdict(dict,
                         {'abundant': ['plenty', 'vast'],
                          'accurate': ['clear'],
                          'adequate': ['fine'],
                          'admirable': ['fine'],
                          'angry': ['upset'],
                          'appropriate': ['a-okay', 'okay', 'fine'],
                          'awesome': ['fine', 'well'],
                          'bankrupt': ['poor', 'indigent'],
                          'beautiful': ['fair', 'fairer', 'fairest'],
                          'better': ['good'],
                          'big': ['sizeable'],
                          'bizarre': ['odd'],
                          'breathtaking': ['cute',
                           'pretty',
                           'lovely',
                           'lovelier'],
                          'broad': ['general'],
                          'broader': ['general', 'broad'],
   

In [22]:
import os
import dill

# Write the pickle under `path` (set in the first cell: the Drive project
# folder on Colab, the local `data` folder otherwise) instead of a hardcoded
# Colab-only location, so the cell also works outside Colab.
os.makedirs(path, exist_ok=True)
scales_pickle_path = os.path.join(path, 'scales_with_milder_option.pickle')
with open(scales_pickle_path, 'wb') as handle:
    # dill is presumably used instead of pickle because the outer defaultdict's
    # default factory is a lambda, which the standard pickle module cannot serialise.
    dill.dump(scales_with_milder_option, handle, protocol=pickle.HIGHEST_PROTOCOL)