In [1]:
"""
Gather the utterances...and make emotion predictions...look at them...
"""

'\nGather the utterances...and make emotion predictions...look at them...\n'

In [3]:
%load_ext autoreload
%autoreload 1
%aimport prediction_models

import numpy as np 
import pandas as pd 
import json
from pandas.io.json import json_normalize
import nltk
import copy
import random
from collections import Counter
import operator
import math

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
# Load the collected dialogues; the printed output below shows 1030 records,
# each a dict with keys such as 'acts' and 'dialogue_id'.
# NOTE(review): absolute, machine-specific path -- point it at your own copy of the dump.
in_f = "/home/ICT2000/chawla/Documents/internship2020/git_repo/storage/airtable/all_data-1030-shuffled.json"

# The utterances contain emoji, so decode explicitly as UTF-8 instead of
# relying on whatever the platform's locale default encoding happens to be.
with open(in_f, encoding="utf-8") as f:
    all_data = json.load(f)

# Sanity check: number of records and the fields available on the first one.
print(len(all_data))
print(all_data[0].keys())

1030
dict_keys(['convo_is_finished', 'world_tag', 'bad_workers', 'acts', 'turns', 'workers', 'fpath', 'qualtrics', 'dialogue_id'])


In [15]:
def get_utterances(item):
    """Return the message texts of one dialogue record, in their original order.

    `item['acts']` is iterated and each act's 'text' value is kept unless it
    is exactly one of the control markers listed below (the comparison is
    case-sensitive).
    """
    control_markers = [
        'Submit-Deal',
        'Accept-Deal',
        'Reject-Deal',
        'Walk-Away',
        'Submit-Post-Survey',
    ]
    return [act['text'] for act in item['acts'] if act['text'] not in control_markers]

def remove_emo_from_utter(txt):
    """Return `txt` with every occurrence of the four emoticons below replaced by a single space."""
    cleaned = txt
    for emoticon in ('🙂', '😮', '☹️', '😡'):
        cleaned = cleaned.replace(emoticon, ' ')
    return cleaned

def remove_emoticons(all_utterances):
    """Return a new list with remove_emo_from_utter applied to every utterance.

    The input list is left untouched; order and length are preserved.
    """
    return [remove_emo_from_utter(utter) for utter in all_utterances]

def _size(corpus: dict) -> int:
    return sum(corpus.values())

def _log_odds(
    word: str,
    c1: dict,
    c2: dict,
    bg: dict,
    size1: int,
    size2: int,
    size3: int,
) -> float:
    numerator_1 = c1[word] + bg[word]
    numerator_2 = c2[word] + bg[word]
    denom_1 = size1 + size3 - numerator_1
    denom_2 = size2 + size3 - numerator_2
    raw_logodds = math.log(numerator_1 / denom_1) - math.log(
        numerator_2 / denom_2
    )
    
    variance = (1 / numerator_1) + (1 / numerator_2)
    return raw_logodds / math.sqrt(variance)

def get_log_odds(pred2utts):
    """Rank, for every label, the tokens that distinguish it from all other labels.

    Args:
        pred2utts: mapping from a predicted label to its list of utterance strings.

    Returns:
        dict mapping each label to a list of (token, score) tuples sorted by
        score, highest first. Scores come from _log_odds with the label's own
        tokens as corpus 1, the pooled tokens of every other label as corpus 2
        and the union of both as background. Only tokens that occur both under
        the label and under some other label are scored.
    """
    # Lower-cased NLTK tokens of all utterances, per label.
    pred2toks = {
        pred: [w.lower() for utt in utts for w in nltk.word_tokenize(utt)]
        for pred, utts in pred2utts.items()
    }

    pred2ratios = {}
    for pred, toks in pred2toks.items():
        # Pool the tokens of every other label into one contrast corpus.
        toks_rest = [
            tok
            for other_pred, other_toks in pred2toks.items()
            if other_pred != pred
            for tok in other_toks
        ]

        own_counts = Counter(toks)
        rest_counts = Counter(toks_rest)
        background_counts = Counter(toks + toks_rest)

        own_size = _size(own_counts)
        rest_size = _size(rest_counts)
        background_size = _size(background_counts)

        # Tokens present in all three count tables.
        shared_tokens = set(own_counts.keys())
        shared_tokens.intersection_update(set(rest_counts.keys()))
        shared_tokens.intersection_update(set(background_counts.keys()))

        scored = [
            (
                tok,
                _log_odds(
                    tok,
                    own_counts,
                    rest_counts,
                    background_counts,
                    own_size,
                    rest_size,
                    background_size,
                ),
            )
            for tok in shared_tokens
        ]
        pred2ratios[pred] = sorted(scored, key=operator.itemgetter(1), reverse=True)

    return pred2ratios

def print_stats(pred2utts):
    """Print a three-part summary of a label -> utterances mapping.

    1. The number of utterances stored under each label.
    2. The first five utterances stored under each label.
    3. The ten highest-scoring tokens per label as ranked by get_log_odds,
       with each score rounded to three decimals.
    """
    print("---Count of utterances---")
    for label, utts in pred2utts.items():
        print(label, len(utts))

    # First five entries of each list (no extra sampling is done here).
    print("----Sample Utterances----")
    for label, utts in pred2utts.items():
        sample = utts[:5]
        print(label, sample)

    print("----Top words for each label----")
    # label -> [(word, score), ...] already sorted best-first.
    ranked_by_label = get_log_odds(pred2utts)
    for label, ranked in ranked_by_label.items():
        print(f"----{label}----")
        for word, score in ranked[:10]:
            print(word, round(score, 3))
        
def get_pred2utts(modelname, num_utts, rem_emoticons):
    """Predict a label for a shuffled sample of utterances and group them by label.

    Reads the notebook-level `all_data` list, collects the utterances of every
    record with get_utterances, optionally strips emoticons, shuffles them and
    labels the first `num_utts` of them.

    Args:
        modelname: 'LIWC' or 'Emoticons'; selects the prediction_models class to use.
        num_utts: maximum number of (shuffled) utterances to label.
        rem_emoticons: if truthy, run remove_emoticons on the utterances
            before they are shuffled and labelled.

    Returns:
        dict mapping each predicted label to the list of utterances that
        received it, in shuffled order.

    Raises:
        ValueError: if `modelname` is not one of the supported names. This is
            checked first, before anything is shuffled or printed.
    """
    supported_models = ('LIWC', 'Emoticons')
    if modelname not in supported_models:
        raise ValueError(
            f"Unsupported modelname {modelname!r}; expected one of {supported_models}"
        )

    all_utterances = []
    for item in all_data:
        all_utterances += get_utterances(item)

    if rem_emoticons:
        all_utterances = remove_emoticons(all_utterances)

    # Shuffle so that truncating to `num_utts` takes a random subset.
    # Uses the global `random` state, so results vary unless it is seeded.
    random.shuffle(all_utterances)

    # Total utterances and average utterances per record.
    print(len(all_utterances), len(all_utterances)/len(all_data))

    if modelname == 'LIWC':
        model_obj = prediction_models.LIWCModel()
    else:  # 'Emoticons' -- the only other supported name
        model_obj = prediction_models.EmoticonModel()

    pred2utts = {}
    for utt in all_utterances[:num_utts]:
        pred = model_obj.predict(utt)
        pred2utts.setdefault(pred, []).append(utt)

    return pred2utts

In [20]:
# Model names that get_pred2utts knows how to build.
modelnames = ['LIWC', 'Emoticons']

modelname = 'Emoticons'  # alternative: 'LIWC'
num_utts = 100000  # upper bound on utterances to label
rem_emoticons = False  # leave emoticons in the text -- presumably the Emoticons model relies on them

pred2utts = get_pred2utts(modelname, num_utts, rem_emoticons)

11919 11.571844660194175


In [21]:
# Label counts, sample utterances and top-scoring words for the current `pred2utts`.
print_stats(pred2utts)

---Count of utterances---
Neutral 10055
Joy 1458
Sadness 241
Surprise 124
Anger 41
----Sample Utterances----
Neutral ['Perfect! What would you think about if we divided it up so you could have all 3 firewood then, and we could split water and food? ', "Would you be willing to give me 3 firewood, I'll give you 3 water or food, which ever you need more and we split the rest. What do you say?", 'Give 1 package of firewood, 2 packages of water and 1 package of food. Seal this deal Sir.', 'if you take all the water i will need all the firewood and two food. deal?', 'Great, that sounds fair to me. ']
Joy ['I am ok with that if i get two waters, just incase i have to put out the fire 🙂', 'That sounds great! I will submit the deal. Hope you have a nice day and best of luck on your camping trip! 🙂', "I can't accept that deal unfortunately. Our campsite requires substantial hiking with a lot of elevation gain. I will need the extra food and water to sustain the hiking. How about I take 2 food pa

In [22]:
# Build the EmotionTwitterDataModel from prediction_models; its predict() is used below.
model_obj = prediction_models.EmotionTwitterDataModel()



In [23]:
# Smoke test: classify one hand-written sentence to check that the model responds.
model_obj.predict('I hope this works!')

'joy'

In [24]:
# Label (up to `num_utts`) shuffled utterances with `model_obj` and bucket them by predicted label.
num_utts = 100000
rem_emoticons = True

# Flatten the utterances of every record into one list.
all_utterances = []
for item in all_data:
    all_utterances.extend(get_utterances(item))

if rem_emoticons:
    all_utterances = remove_emoticons(all_utterances)

# Uses the global `random` state, so the order differs between runs unless it is seeded.
random.shuffle(all_utterances)

# Total utterances and average utterances per record.
print(len(all_utterances), len(all_utterances)/len(all_data))

pred2utts = {}

for i, utt in enumerate(all_utterances[:num_utts]):
    # Progress marker every 500 utterances.
    if i % 500 == 0:
        print(i)

    pred = model_obj.predict(utt)
    pred2utts.setdefault(pred, []).append(utt)

11919 11.571844660194175
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500


In [25]:
# Same summary as before, now for the `pred2utts` built with `model_obj` in the cell above.
print_stats(pred2utts)

---Count of utterances---
joy 8576
sadness 512
anger 2019
love 94
fear 699
surprise 17
thirst 1
hunger 1
----Sample Utterances----
joy ['Deal with ok but I need 1 water.', 'Hello there, How would you feel about trading supplies in this manner. 2 Water for me and 2 food , while you get 1 Food , 1 Water , and 3 Firewood? ', 'Well we do have the starter kit, but ok then you can have 1 water, 2 firewood and all the food.  I feel generous today ', 'For sure.. I am just busy planning and getting supplies - do not want to forget anything.  ', 'funnny you but no. so what else will i be left with if i give you those. okay let say i get 2 food, 2 firewood and 1 water.']
sadness ['I am old. Old people need hydration, more than younger people. Are you younger?', 'Thank you very much. I am anemic so it helps to have all the firewood to keep warm. ', "O wow.  I'm sorry to here that. I can give you all my firewood and a food if you can give me all your water. ", 'Good! So I primarily need food, as ou

In [6]:
import transformers
transformers.__version__

'4.4.2'