In [9]:
%load_ext autoreload
%autoreload 1
%aimport prediction_models

import numpy as np 
import pandas as pd 
import json
from pandas.io.json import json_normalize
import nltk
import copy
import random
from collections import Counter
import operator
import math
from transformers import AutoTokenizer, AutoModelWithLMHead, AutoModelForSequenceClassification, pipeline, T5ForConditionalGeneration, T5Tokenizer
from scipy.special import softmax

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
def get_utterances(item):
    """Return the free-text utterances of one dialogue record.

    Walks ``item['acts']`` in order and keeps the ``'text'`` of every act
    whose text is not one of the fixed control markers listed below.
    """
    control_markers = ['Submit-Deal', 'Accept-Deal', 'Reject-Deal', 'Walk-Away', 'Submit-Post-Survey']
    return [act['text'] for act in item['acts'] if act['text'] not in control_markers]

def remove_emo_from_utter(txt):
    """Return ``txt`` with each occurrence of four specific emoji replaced by one space."""
    for emoji in ('🙂', '😮', '☹️', '😡'):
        txt = txt.replace(emoji, ' ')
    return txt

def remove_emoticons(all_utterances):
    """Apply ``remove_emo_from_utter`` to every utterance and return the results as a new list."""
    return [remove_emo_from_utter(utterance) for utterance in all_utterances]

In [4]:
# Load the dialogue dump that the rest of the notebook works on.
# NOTE(review): this is a machine-specific absolute path — confirm before running elsewhere.
in_f = "/home/ICT2000/chawla/Documents/internship2020/git_repo/storage/airtable/all_data-1030-shuffled.json" 
# Pass the encoding explicitly so the read does not depend on the platform's
# locale default (JSON interchange text is UTF-8).
with open(in_f, encoding="utf-8") as f:
    all_data = json.load(f)
    
# Quick sanity check: number of records and the fields of the first one.
print(len(all_data))
print(all_data[0].keys())

1030
dict_keys(['convo_is_finished', 'world_tag', 'bad_workers', 'acts', 'turns', 'workers', 'fpath', 'qualtrics', 'dialogue_id'])


In [8]:
# Checkpoint used for emotion prediction (a T5 model, judging by its name).
EMOTION_MODEL_NAME = "mrm8488/t5-base-finetuned-emotion"

tokenizer = AutoTokenizer.from_pretrained(EMOTION_MODEL_NAME)
# AutoModelWithLMHead is deprecated in transformers; load the T5 seq2seq class
# directly (T5ForConditionalGeneration is already imported in the first cell).
model = T5ForConditionalGeneration.from_pretrained(EMOTION_MODEL_NAME)

# Emotion words the rest of the notebook treats as valid predictions.
labels = ['joy', 'anger', 'sadness', 'love', 'fear', 'surprise']

# First token id the tokenizer produces for each label word, in `labels` order.
ixs = [tokenizer.encode(label)[0] for label in labels]

In [37]:
# NOTE(review): this cell reads utterance2pm / utterance2pred, which in the visible
# notebook are only assigned by a cell further down that itself reads median25s.
# Confirm the intended execution order before relying on a fresh-kernel run.

# Bucket every utterance's top-prediction probability under its predicted label.
# Predictions outside `labels` are filed under 'fear'.
l2l = {label: [] for label in labels}
for utt, pm in utterance2pm.items():
    pred = utterance2pred[utt]
    if pred not in labels:
        pred = 'fear'
    if pred in l2l:
        l2l[pred].append(pm)

# Despite the name, each value is the 25th percentile ('25%' row of describe())
# of the probabilities collected for that label.
median25s = {
    label: pd.Series(l2l[label]).describe()['25%']
    for label in labels
}

print(median25s)

{'joy': 0.8315099800331248, 'anger': 0.6590414741499082, 'sadness': 0.6939650287352179, 'love': 0.6823807518787013, 'fear': 0.41884136464940847, 'surprise': 0.7050954456329644}


In [39]:
def predict(text):
    """Return the most probable output token for ``text`` and its probability.

    The text is encoded with the notebook-level ``tokenizer`` and passed to the
    notebook-level ``model``'s ``generate`` with ``max_length=2``. The scores of
    the first entry of ``output['scores']`` (first row) are soft-maxed over the
    whole vocabulary and the arg-max token is decoded.

    Returns:
        tuple: ``(label, probability)``. ``label`` is whatever string the
        tokenizer decodes the arg-max id to, so it is not guaranteed to be one
        of the six emotion words; callers in this notebook check for that.
    """
    encoded = tokenizer.encode(text, return_tensors='pt')

    generated = model.generate(
        input_ids=encoded,
        max_length=2,
        output_scores=True,
        return_dict_in_generate=True,
    )

    # Presumably the logits of the first generated token for the single input — TODO confirm.
    first_step_logits = generated['scores'][0][0].numpy().tolist()
    probs = softmax(first_step_logits)

    best_id = np.argmax(probs)
    return tokenizer.decode([best_id]), probs[best_id]

def get_scores(text):
    """Return the soft-maxed score of each emotion label for ``text``.

    Runs the same encode / ``generate`` / softmax steps as ``predict`` and then
    reads the probability at one fixed vocabulary index per label.

    Returns:
        dict: label name -> probability, in the order joy, anger, sadness,
        love, fear, surprise.
    """
    encoded = tokenizer.encode(text, return_tensors='pt')

    generated = model.generate(
        input_ids=encoded,
        max_length=2,
        output_scores=True,
        return_dict_in_generate=True,
    )

    probs = softmax(generated['scores'][0][0].numpy().tolist())

    # Hard-coded vocabulary ids; presumably the first token id of each label
    # word for this tokenizer — verify against `ixs`.
    label_token_ids = (
        ('joy', 3922),
        ('anger', 11213),
        ('sadness', 24784),
        ('love', 333),
        ('fear', 2971),
        ('surprise', 4158),
    )

    return {name: probs[token_id] for name, token_id in label_token_ids}

In [38]:
# Upper bound on how many utterances get classified, and whether the emoji
# listed in remove_emo_from_utter are stripped first.
num_utts = 1000000
rem_emoticons = True

# Flatten all dialogues into one list of utterances.
all_utterances = [utt for item in all_data for utt in get_utterances(item)]

if rem_emoticons:
    all_utterances = remove_emoticons(all_utterances)

random.shuffle(all_utterances)

print(len(all_utterances), len(all_utterances)/len(all_data))

# Keyed by utterance text, so repeated utterances collapse into one entry.
utterance2pred = {}
utterance2pm = {}

# NOTE(review): median25s is read here, while the cell that builds it reads the
# two dicts filled below — confirm the intended execution order.
for idx, utterance in enumerate(all_utterances[:num_utts]):
    # Progress marker every 500 utterances.
    if idx % 500 == 0:
        print(idx)

    predicted, prob = predict(utterance)

    # Decoded tokens outside the known label set are treated as 'fear'.
    predicted = predicted if predicted in labels else 'fear'

    # A prediction below its label's 25th-percentile probability becomes 'neutral'.
    utterance2pred[utterance] = 'neutral' if prob < median25s[predicted] else predicted
    utterance2pm[utterance] = prob

print(len(utterance2pred))

# Label distribution after thresholding.
counter = Counter(utterance2pred.values())

print("------")
print(counter)

print("------")

# Summary statistics of the top-prediction probabilities.
pms = list(utterance2pm.values())
print(pd.Series(pms).describe())

with open("/home/ICT2000/chawla/Documents/internship2020/git_repo/storage/emotion_predictions.json", 'w') as fp:
    json.dump(utterance2pred, fp)

11919 11.571844660194175
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500
11611
------
Counter({'joy': 5752, 'neutral': 4521, 'anger': 539, 'fear': 497, 'sadness': 236, 'love': 56, 'surprise': 10})
------
count    11611.000000
mean         0.792558
std          0.207098
min          0.253872
25%          0.624068
50%          0.869988
75%          0.981004
max          0.999969
dtype: float64


In [42]:
# Compute the per-label scores (all six labels) for every utterance and save them.

# Upper bound on how many utterances get scored, and whether the emoji listed
# in remove_emo_from_utter are stripped first.
num_utts = 1000000
rem_emoticons = True

# Flatten all dialogues into one list of utterances.
all_utterances = [utt for item in all_data for utt in get_utterances(item)]

if rem_emoticons:
    all_utterances = remove_emoticons(all_utterances)

random.shuffle(all_utterances)

print(len(all_utterances), len(all_utterances)/len(all_data))

# Keyed by utterance text, so repeated utterances collapse into one entry.
utterance2scores = {}

for position, utterance in enumerate(all_utterances[:num_utts]):
    # Progress marker every 500 utterances.
    if position % 500 == 0:
        print(position)

    utterance2scores[utterance] = get_scores(utterance)

with open("/home/ICT2000/chawla/Documents/internship2020/git_repo/storage/emotion_model_scores.json", 'w') as fp:
    json.dump(utterance2scores, fp)

11919 11.571844660194175
0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
9500
10000
10500
11000
11500


In [41]:
# Preview only the first few entries: echoing the whole mapping writes one
# multi-line block per utterance into the saved notebook output.
dict(list(utterance2scores.items())[:5])

{'I am happy with the 2 firewood and 1 water, but would then prefer 2 food also if that if ok.': {'joy': 0.999083111704311,
  'anger': 5.738228812745169e-06,
  'sadness': 1.7258449013629365e-05,
  'love': 0.0006063432222956147,
  'fear': 7.557319995685185e-07,
  'surprise': 4.500916208545557e-06},
 'Okay friend. Which item do you need the most?': {'joy': 0.9467635936572711,
  'anger': 0.016739454568358018,
  'sadness': 0.005857130517080082,
  'love': 0.015529360114214907,
  'fear': 0.005516657564301295,
  'surprise': 0.001049752902521131},
 'No, that is a terrible offer, you get more of everything': {'joy': 0.0005463096819684481,
  'anger': 0.12334891168061868,
  'sadness': 0.8398108964331208,
  'love': 5.8613231273685474e-05,
  'fear': 0.03430896274257295,
  'surprise': 3.998537230471304e-05},
 'Correct. ': {'joy': 0.888298994444777,
  'anger': 0.02289916617680063,
  'sadness': 0.0003387536888299207,
  'love': 0.003450029080425222,
  'fear': 0.006314605672584641,
  'surprise': 0.00024