# Importing needed modules

In [1]:
from datasets import list_datasets, load_dataset
import openai
from IPython.display import display, Markdown
import pandas as pd
import re
from tqdm.auto import tqdm
tqdm.pandas()
import backoff

#from electra_classifier import *
%matplotlib inline 
%config InlineBackend.figure_format='retina'

In [2]:
from tenacity import (
    retry,
    stop_after_attempt,
    wait_random_exponential,
)  # for exponential backoff

# Data preparation

## Importing data

In [3]:
empathetic_dialogue_dataset = load_dataset('empathetic_dialogues')  

Found cached dataset empathetic_dialogues (/home/ahmed_b/.cache/huggingface/datasets/empathetic_dialogues/default/0.1.0/09bbeed3882a67db98c73952fb3c1c9a85af83dc78f81454c2454382fd03f6cf)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
train_df = empathetic_dialogue_dataset["train"].to_pandas()
val_df = empathetic_dialogue_dataset["validation"].to_pandas()
test_df = empathetic_dialogue_dataset["test"].to_pandas()
all_df = pd.concat([train_df, val_df, test_df])
sample_df = all_df.iloc[:100]

In [5]:
print(f"We have {len(train_df)} rows ({int(100*len(train_df)/len(all_df))}%) for training, {len(val_df)} rows ({int(100*len(val_df)/len(all_df))}%) for validation and {len(test_df)} rows ({int(100*len(test_df)/len(all_df))}%) for testing.")

We have 76673 rows (76%) for training, 12030 rows (12%) for validation and 10943 rows (10%) for testing.


In [6]:
df = train_df
#df = all_df
#df = sample_df
df.head()

Unnamed: 0,conv_id,utterance_idx,context,prompt,speaker_idx,utterance,selfeval,tags
0,hit:0_conv:1,1,sentimental,I remember going to the fireworks with my best...,1,I remember going to see the fireworks with my ...,5|5|5_2|2|5,
1,hit:0_conv:1,2,sentimental,I remember going to the fireworks with my best...,0,Was this a friend you were in love with_comma_...,5|5|5_2|2|5,
2,hit:0_conv:1,3,sentimental,I remember going to the fireworks with my best...,1,This was a best friend. I miss her.,5|5|5_2|2|5,
3,hit:0_conv:1,4,sentimental,I remember going to the fireworks with my best...,0,Where has she gone?,5|5|5_2|2|5,
4,hit:0_conv:1,5,sentimental,I remember going to the fireworks with my best...,1,We no longer talk.,5|5|5_2|2|5,


## Data Cleaning

Some rows contain utterances that are mixed or duplicated

In [7]:
full_text = ""
indexes_to_drop = []
for i in tqdm(range(len(df))):
    utterance = df.iloc[i].utterance
    if "|" in utterance[:-1]: # excluding the sentenses finishing with the character "|" 
        text_starting_from_2nd_row = utterance[utterance.index('\n')+1:]
        first_row = ','.join(df.loc[i].astype(str).tolist())
        full_text = full_text + "\n" + first_row + "\n"+ text_starting_from_2nd_row
        indexes_to_drop.append(i)
full_text = full_text[1: ] # Removing the initial "\n"
df = df.drop(index=indexes_to_drop)

  0%|          | 0/76673 [00:00<?, ?it/s]

In [8]:
print(f"We dropped {len(indexes_to_drop)} bad rows with bad utterance formatting. We now have {len(df)} clean rows.")

We dropped 51 bad rows with bad utterance formatting. We now have 76622 clean rows.


In [9]:
grouped_by_df = df.groupby(["conv_id", "context"])["utterance"].apply(list).to_frame().reset_index()
grouped_by_df.head()

Unnamed: 0,conv_id,context,utterance
0,hit:0_conv:1,sentimental,[I remember going to see the fireworks with my...
1,hit:10000_conv:20000,surprised,"[My girlfriend got me a pet toad today!, Do yo..."
2,hit:10000_conv:20001,impressed,"[I really like the new paint job on my house.,..."
3,hit:10001_conv:20002,lonely,[I went to the skating rink all by myself toda...
4,hit:10002_conv:20004,ashamed,[I was walking on the road. I saw a beggar and...


In [10]:
indexes_with_only_1_utterance = grouped_by_df[grouped_by_df.utterance.apply(lambda x: len(x))<2].index
print(f"We have {len(indexes_with_only_1_utterance)} conversations with only 1 utterance. We're gonna drop those")

We have 13 conversations with only 1 utterance. We're gonna drop those


In [11]:
grouped_by_df = grouped_by_df.drop(indexes_with_only_1_utterance, axis=0)

We will remove the last utterance when there is an odd number of utterances in a conversation

In [12]:
def remove_last_odd_utternace(utterance):
    if len(utterance)%2:
        return utterance[:-1]
    else: 
        return utterance

In [13]:
grouped_by_df["utterance"] = grouped_by_df.utterance.apply(lambda x: remove_last_odd_utternace(x))

In [14]:
grouped_by_df

Unnamed: 0,conv_id,context,utterance
0,hit:0_conv:1,sentimental,[I remember going to see the fireworks with my...
1,hit:10000_conv:20000,surprised,"[My girlfriend got me a pet toad today!, Do yo..."
2,hit:10000_conv:20001,impressed,"[I really like the new paint job on my house.,..."
3,hit:10001_conv:20002,lonely,[I went to the skating rink all by myself toda...
4,hit:10002_conv:20004,ashamed,[I was walking on the road. I saw a beggar and...
...,...,...,...
17788,hit:9999_conv:19999,apprehensive,[So I went skydiving for the first time the ot...
17789,hit:999_conv:1998,confident,[I believe I did rather well on my Law School ...
17790,hit:999_conv:1999,devastated,[I was shocked when Lebron left the cavs again...
17791,hit:99_conv:198,ashamed,"[I cheated on a test. I am very ashamed., oh n..."


In [15]:
def count_sentences(text):
    # Split the text into sentences using regular expressions
    sentences = re.split('[.!?]+', text)
    
    # Count the number of sentences
    num_sentences = len(sentences)
    
    # Return the result
    return num_sentences

import matplotlib.pyplot as plt
number_of_sentences_list = df.utterance.apply(lambda x: count_sentences(x)).to_list()
less_than_2_sentences_percentage = int(100*len([x for x in number_of_sentences_list if x <= 2]) /len(number_of_sentences_list))
print(f"More than {less_than_2_sentences_percentage}% of the uternaces contain 2 sentences or less.") 

number_of_sentences_less_than_10 = [x for x in number_of_sentences_list if x <= 10]
#plt.hist(number_of_sentences_less_than_10, bins=50)

More than 58% of the uternaces contain 2 sentences or less.


Most utterances are 2 sentences or less (58%).

In [16]:
def repair_sentence(sentence):
    # Replace _comma_ with ,
    sentence = re.sub(r'_comma_', ',', sentence)
    # Replace \' with '
    sentence = re.sub(r"\\'", "'", sentence)
    # Remove double quotes
    sentence = sentence.replace('"', '')
    return sentence

In [17]:
print(f"We have {len(grouped_by_df)} conversations in total")

We have 17780 conversations in total


# PAID ChatGPT API !!

But First, let's import the ELECTRA classifier

In [18]:
print("Preparing Electra Classifier...")
from electra_classifier import *

best_model_path = "electra_cls/lightning_logs/version_18/checkpoints/epoch=9-step=3629.ckpt"
emotion_categories = ['admiration', 'amusement', 'anger', 'annoyance', 'approval', 'caring',
       'confusion', 'curiosity', 'desire', 'disappointment', 'disapproval',
       'disgust', 'embarrassment', 'excitement', 'fear', 'gratitude', 'grief',
       'joy', 'love', 'nervousness', 'optimism', 'pride', 'realization',
       'relief', 'remorse', 'sadness', 'surprise', 'neutral']

MODEL_NAME = "google/electra-base-discriminator"
tokenizer = ElectraTokenizer.from_pretrained(MODEL_NAME)

trained_model = EmotionClassifier.load_from_checkpoint(
    # trainer.checkpoint_callback.best_model_path,
    best_model_path,
    n_classes=len(emotion_categories)
)

def predict_emotion_and_probability(text):
    encoding = tokenizer(
          text,
          max_length=64, 
          truncation=True,
          padding="max_length",
          add_special_tokens=True,
          return_token_type_ids=False,
          return_attention_mask=True,
          return_tensors="pt"
      )
    outputs = trained_model(**encoding)
    probabilities = list(torch.softmax(outputs, dim=-1).detach().numpy().flatten())
    emotion_idx = torch.argmax(outputs, dim=-1).item() 
    predicted_emotion = emotion_categories[emotion_idx]

    emotion_categories_list = list(emotion_categories)
    probabilities, emotion_categories_list = zip(*sorted(zip(probabilities, emotion_categories_list)))
    probabilities = probabilities[::-1]
    emotion_categories_list = emotion_categories_list[::-1]
    emotion_probability = round(100*probabilities[0], 2)#:.2f

    return predicted_emotion, emotion_probability

print("Electra Classifier is ready!")

Preparing Electra Classifier...


Global seed set to 42
Some weights of the model checkpoint at google/electra-base-discriminator were not used when initializing ElectraClassifier: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraClassifier from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraClassifier from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraClassifier were not initialized from the model checkpoint at google/electra-base-discriminator and are newly initialized: ['classifier.out_proj.bias', 'classifier.dense.bias'

Electra Classifier is ready!


In [19]:
sample_text = "I do! I was so happy when I opened the box and that fat mofo jumped out!"
predict_emotion_and_probability(sample_text)

('joy', 98.35)

In [20]:
def show_formatted_choice(unformatted_text):
  display(Markdown(unformatted_text["choices"][0]["message"]["content"]))

## Calling the API

In [21]:
API_KEY = "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"
openai.api_key = API_KEY

## Testing with only 1 conversation

### Non-emotional version

In [62]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def call_chatgpt_no_emotion(utterances):
    # Prepare the messages to be sent to ChatGPT
    messages = []
    for utt_numb, utterance in enumerate(utterances[:-1]):
        if not utt_numb%2: 
            messages.append({"role": "user", "content": repair_sentence(utterance)})
        else:
            messages.append({"role": "assistant", "content": repair_sentence(utterance)})  
    messages.append({"role": "assistant", "content": "I'll reply with less than 2 sentences: "})
            
    # Calling ChatGPT to complete the conversation
    completion = openai.ChatCompletion.create(
      model="gpt-3.5-turbo", 
      messages=messages
    )
    return completion

In [63]:
def complete_with_non_emotional_chatgpt(utterances):
    return call_chatgpt_no_emotion(utterances)["choices"][0]["message"]["content"]

In [64]:
### EXAMPLE CONVERSATION
conversation = grouped_by_df.utterance.iloc[7]

In [65]:
display(conversation)

['My little cousin gifted me for my birthday',
 'Very sweet of him. Did he surprise you?',
 'Yeah definitely. it was very pleasant one at that. he earned that money while doing chores at home',
 'Nice! Hes a sweet kid. I bet that brought you joy as well.']

In [66]:
chatgpt_response_no_emotion = call_chatgpt_no_emotion(conversation)

In [67]:
show_formatted_choice(chatgpt_response_no_emotion)

That's so thoughtful of your little cousin. It's wonderful to see children learn the value of hard work and use their earnings to make others happy.

In [68]:
non_emotional_emotion, non_emotional_probability = predict_emotion_and_probability(chatgpt_response_no_emotion["choices"][0]["message"]["content"])

In [70]:
print(f'The non-emotional version gives a response with the "{non_emotional_emotion}" ({non_emotional_probability}% confidence)')

The non-emotional version gives a response with the "admiration" (66.84% confidence)


### Prompt2 version

In [22]:
@retry(wait=wait_random_exponential(min=1, max=60), stop=stop_after_attempt(6))
def call_chatgpt_prompt2(utterances):
    # Prepare the messages to be sent to ChatGPT
    messages = []
    for utt_numb, utterance in enumerate(utterances[:-1]):
        if not utt_numb%2: 
            messages.append({"role": "user", "content": repair_sentence(utterance)})
        else:
            messages.append({"role": "assistant", "content": repair_sentence(utterance)})  
            
    messages[-1]["content"] = messages[-1]["content"] + ". Try to understand how I feel."
    messages.append({"role": "assistant", "content": "I'll reply with less than 2 sentences: "})
            
    # Calling ChatGPT to complete the conversation
    completion = openai.ChatCompletion.create(
      model="gpt-3.5-turbo", 
      messages=messages
    )
    return completion

In [31]:
def complete_with_prompt2_chatgpt(utterances):
    return call_chatgpt_prompt2(utterances)["choices"][0]["message"]["content"]

In [32]:
### EXAMPLE CONVERSATION
conversation = grouped_by_df.utterance.iloc[7]

In [33]:
display(conversation)

['My little cousin gifted me for my birthday',
 'Very sweet of him. Did he surprise you?',
 'Yeah definitely. it was very pleasant one at that. he earned that money while doing chores at home',
 'Nice! Hes a sweet kid. I bet that brought you joy as well.']

In [34]:
chatgpt_response_prompt2 = call_chatgpt_prompt2(conversation)

In [35]:
show_formatted_choice(chatgpt_response_prompt2)

It's wonderful to receive a gift that someone worked hard for, especially from a young family member. You must feel very loved and appreciated.

In [71]:
non_emotional_emotion, non_emotional_probability = predict_emotion_and_probability(chatgpt_response_prompt2["choices"][0]["message"]["content"])

In [73]:
print(f'The prompt2 version gives a response with the "{non_emotional_emotion}" ({non_emotional_probability}% confidence)')

The prompt2 version gives a response with the "admiration" (87.5% confidence)


## Testing with only the first 10 conversations:

### Non-emotional version

In [74]:
sample_grouped_by_df = grouped_by_df[:10]
sample_grouped_by_df["chatgpt_response_no_emotion"] = sample_grouped_by_df["utterance"].progress_apply(lambda x: complete_with_non_emotional_chatgpt(x))

  0%|          | 0/10 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [75]:
sample_grouped_by_df.head()

Unnamed: 0,conv_id,context,utterance,chatgpt_response_no_emotion
0,hit:0_conv:1,sentimental,[I remember going to see the fireworks with my...,I'm sorry to hear that. Maybe one day you'll c...
1,hit:10000_conv:20000,surprised,"[My girlfriend got me a pet toad today!, Do yo...",That's great to hear! Toads can make wonderful...
2,hit:10000_conv:20001,impressed,"[I really like the new paint job on my house.,...",Nice choice! The combination of blue and yello...
3,hit:10001_conv:20002,lonely,[I went to the skating rink all by myself toda...,I'm sorry to hear that. It can be tough to fee...
4,hit:10002_conv:20004,ashamed,[I was walking on the road. I saw a beggar and...,It's never too late to make a difference. Next...


In [76]:
sample_grouped_by_df["chatgpt_response_detail_no_emotion"] = sample_grouped_by_df.chatgpt_response_no_emotion.progress_apply(lambda x: predict_emotion_and_probability(x))

  0%|          | 0/10 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [77]:
sample_grouped_by_df.head()

Unnamed: 0,conv_id,context,utterance,chatgpt_response_no_emotion,chatgpt_response_detail_no_emotion
0,hit:0_conv:1,sentimental,[I remember going to see the fireworks with my...,I'm sorry to hear that. Maybe one day you'll c...,"(remorse, 41.53)"
1,hit:10000_conv:20000,surprised,"[My girlfriend got me a pet toad today!, Do yo...",That's great to hear! Toads can make wonderful...,"(joy, 91.4)"
2,hit:10000_conv:20001,impressed,"[I really like the new paint job on my house.,...",Nice choice! The combination of blue and yello...,"(admiration, 88.48)"
3,hit:10001_conv:20002,lonely,[I went to the skating rink all by myself toda...,I'm sorry to hear that. It can be tough to fee...,"(remorse, 76.07)"
4,hit:10002_conv:20004,ashamed,[I was walking on the road. I saw a beggar and...,It's never too late to make a difference. Next...,"(caring, 99.36)"


In [78]:
sample_grouped_by_df.utterance.iloc[4]

["I was walking on the road. I saw a beggar and i didn't help him.",
 "Wow_comma_ that's kinda mean",
 'yeah i know. i was in a hurry and i am ashamed of myself!!',
 'You think he will be there next time so you can help him?']

In [79]:
sample_grouped_by_df.chatgpt_response_no_emotion.iloc[4]

"It's never too late to make a difference. Next time you see someone in need, take a moment to help them out."

### Prompt2 version

In [38]:
sample_grouped_by_df = grouped_by_df[:10]
sample_grouped_by_df["chatgpt_response_prompt2"] = sample_grouped_by_df["utterance"].progress_apply(lambda x: complete_with_prompt2_chatgpt(x))

  0%|          | 0/10 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [39]:
sample_grouped_by_df.head()

Unnamed: 0,conv_id,context,utterance,chatgpt_response_prompt2
0,hit:0_conv:1,sentimental,[I remember going to see the fireworks with my...,I'm sorry to hear that you and your friend are...
1,hit:10000_conv:20000,surprised,"[My girlfriend got me a pet toad today!, Do yo...","As an AI, I don't have emotions to empathize w..."
2,hit:10000_conv:20001,impressed,"[I really like the new paint job on my house.,...","As an AI language model, I am incapable of fee..."
3,hit:10001_conv:20002,lonely,[I went to the skating rink all by myself toda...,I'm sorry you didn't have a good time. It's ok...
4,hit:10002_conv:20004,ashamed,[I was walking on the road. I saw a beggar and...,It's good that you recognize your actions and ...


Now analyzing the result..

In [40]:
sample_grouped_by_df["chatgpt_response_detail_prompt2"] = sample_grouped_by_df.chatgpt_response_prompt2.progress_apply(lambda x: predict_emotion_and_probability(x))

  0%|          | 0/10 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [41]:
sample_grouped_by_df.head()

Unnamed: 0,conv_id,context,utterance,chatgpt_response_prompt2,chatgpt_response_detail_prompt2
0,hit:0_conv:1,sentimental,[I remember going to see the fireworks with my...,I'm sorry to hear that you and your friend are...,"(sadness, 98.22)"
1,hit:10000_conv:20000,surprised,"[My girlfriend got me a pet toad today!, Do yo...","As an AI, I don't have emotions to empathize w...","(joy, 95.57)"
2,hit:10000_conv:20001,impressed,"[I really like the new paint job on my house.,...","As an AI language model, I am incapable of fee...","(joy, 98.72)"
3,hit:10001_conv:20002,lonely,[I went to the skating rink all by myself toda...,I'm sorry you didn't have a good time. It's ok...,"(remorse, 67.45)"
4,hit:10002_conv:20004,ashamed,[I was walking on the road. I saw a beggar and...,It's good that you recognize your actions and ...,"(caring, 82.59)"


In [42]:
sample_grouped_by_df.utterance.iloc[4]

["I was walking on the road. I saw a beggar and i didn't help him.",
 "Wow_comma_ that's kinda mean",
 'yeah i know. i was in a hurry and i am ashamed of myself!!',
 'You think he will be there next time so you can help him?']

In [43]:
sample_grouped_by_df.chatgpt_response_prompt2.iloc[4]

"It's good that you recognize your actions and feel remorseful. Next time, try to take a moment to help someone in need, even if it's just a small gesture."

## Testing with 10% - 90% split

### Splitting the data

In [44]:
num_rows = grouped_by_df.shape[0]
# calculate the index of the row to split on
split_idx = int(0.1 * num_rows)
# select the first 10% of rows using iloc
first_10_percent_grouped_by_df = grouped_by_df.iloc[:split_idx]
print(f"The first 10% contains {len(first_10_percent_grouped_by_df)} conversations.")
# select the remaining 90% of rows using iloc
last_90_precent_grouped_by_df = grouped_by_df.iloc[split_idx:]
print(f"The last 90% contains {len(last_90_precent_grouped_by_df)} conversations.")

The first 10% contains 1778 conversations.
The last 90% contains 16002 conversations.


#### Experiment: remove utterances finishing with "neutral" emotion: 

In [89]:
first_10_percent_grouped_by_df["utterance_2_emotion"] = first_10_percent_grouped_by_df["utterance"].progress_apply(lambda x: predict_emotion_and_probability(x[-2]))

  0%|          | 0/1778 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [90]:
first_10_percent_grouped_by_df[['utterance_minus_2_emotion', 'utterance_minus_2_emotion_conf']] = first_10_percent_grouped_by_df['utterance_2_emotion'].apply(lambda x: pd.Series(x))
first_10_percent_grouped_by_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,conv_id,context,utterance,utterance_2_emotion,utterance_minus_2_emotion,utterance_minus_2_emotion_conf
0,hit:0_conv:1,sentimental,[I remember going to see the fireworks with my...,"(neutral, 99.86)",neutral,99.86
1,hit:10000_conv:20000,surprised,"[My girlfriend got me a pet toad today!, Do yo...","(joy, 98.35)",joy,98.35
2,hit:10000_conv:20001,impressed,"[I really like the new paint job on my house.,...","(neutral, 91.12)",neutral,91.12
3,hit:10001_conv:20002,lonely,[I went to the skating rink all by myself toda...,"(disappointment, 31.49)",disappointment,31.49
4,hit:10002_conv:20004,ashamed,[I was walking on the road. I saw a beggar and...,"(embarrassment, 99.49)",embarrassment,99.49


In [91]:
first_10_percent_grouped_by_df[first_10_percent_grouped_by_df.utterance_minus_2_emotion == "neutral"].utterance_minus_2_emotion_conf.describe()

count    417.000000
mean      87.731607
std       17.501684
min       22.990000
25%       83.140000
50%       97.030000
75%       99.520000
max       99.960000
Name: utterance_minus_2_emotion_conf, dtype: float64

In [92]:
first_10_percent_grouped_by_df.utterance_minus_2_emotion.value_counts()

neutral           417
approval          211
admiration        165
joy                95
gratitude          91
sadness            77
excitement         63
disappointment     61
love               56
disapproval        54
amusement          51
optimism           51
fear               46
realization        42
desire             37
curiosity          32
caring             32
surprise           31
nervousness        28
annoyance          24
anger              22
confusion          21
embarrassment      20
remorse            15
disgust            13
relief             12
pride               7
grief               4
Name: utterance_minus_2_emotion, dtype: int64

In [None]:
#first_10_percent_grouped_by_df.to_pickle("ChatEPT/prompt2/first_10_percent_grouped_by_df_prompt2")

### Working first on the first 10% of the data

In [93]:
first_10_percent_grouped_by_df.head()

Unnamed: 0,conv_id,context,utterance,utterance_2_emotion,utterance_minus_2_emotion,utterance_minus_2_emotion_conf
0,hit:0_conv:1,sentimental,[I remember going to see the fireworks with my...,"(neutral, 99.86)",neutral,99.86
1,hit:10000_conv:20000,surprised,"[My girlfriend got me a pet toad today!, Do yo...","(joy, 98.35)",joy,98.35
2,hit:10000_conv:20001,impressed,"[I really like the new paint job on my house.,...","(neutral, 91.12)",neutral,91.12
3,hit:10001_conv:20002,lonely,[I went to the skating rink all by myself toda...,"(disappointment, 31.49)",disappointment,31.49
4,hit:10002_conv:20004,ashamed,[I was walking on the road. I saw a beggar and...,"(embarrassment, 99.49)",embarrassment,99.49


#### Non-emotional version

In [94]:
utterances = first_10_percent_grouped_by_df.utterance.iloc[18]
utterances

['I know I will get my bonus this quarter.',
 "That's exciting! What do you plan to do with it?",
 'I plan to go on a vacation.',
 'Wonderful! Where are you going to go?']

In [95]:
complete_with_non_emotional_chatgpt(utterances)

"That sounds like a great plan! Where do you think you'll go?"

In [96]:
first_10_percent_grouped_by_df["chatgpt_response_no_emotion"] = first_10_percent_grouped_by_df["utterance"].progress_apply(lambda x: complete_with_non_emotional_chatgpt(x))

  0%|          | 0/1778 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [97]:
first_10_percent_grouped_by_df.head()

Unnamed: 0,conv_id,context,utterance,utterance_2_emotion,utterance_minus_2_emotion,utterance_minus_2_emotion_conf,chatgpt_response_no_emotion
0,hit:0_conv:1,sentimental,[I remember going to see the fireworks with my...,"(neutral, 99.86)",neutral,99.86,I'm sorry to hear that. Maybe someday you'll r...
1,hit:10000_conv:20000,surprised,"[My girlfriend got me a pet toad today!, Do yo...","(joy, 98.35)",joy,98.35,That's great! Toads can make wonderful pets an...
2,hit:10000_conv:20001,impressed,"[I really like the new paint job on my house.,...","(neutral, 91.12)",neutral,91.12,Blue and yellow is a lovely combination! I'm g...
3,hit:10001_conv:20002,lonely,[I went to the skating rink all by myself toda...,"(disappointment, 31.49)",disappointment,31.49,I'm sorry you didn't enjoy your time at the sk...
4,hit:10002_conv:20004,ashamed,[I was walking on the road. I saw a beggar and...,"(embarrassment, 99.49)",embarrassment,99.49,"It's okay to make mistakes, but it's important..."


In [None]:
first_10_percent_grouped_by_df.to_pickle("ChatEPT/prompt2/first_10_percent_grouped_by_df_prompt2")

#### Prompt2 version

In [98]:
utterances = first_10_percent_grouped_by_df.utterance.iloc[18]
utterances

['I know I will get my bonus this quarter.',
 "That's exciting! What do you plan to do with it?",
 'I plan to go on a vacation.',
 'Wonderful! Where are you going to go?']

In [99]:
complete_with_prompt2_chatgpt(utterances)

"As an AI language model, I don't have feelings, but I can understand and respond to yours. A bonus is always a great reason to treat yourself to something special, such as a well-deserved vacation."

In [100]:
first_10_percent_grouped_by_df["chatgpt_response_prompt2"] = first_10_percent_grouped_by_df["utterance"].progress_apply(lambda x: complete_with_prompt2_chatgpt(x))

  0%|          | 0/1778 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [None]:
first_10_percent_grouped_by_df.head()

In [None]:
first_10_percent_grouped_by_df.to_pickle("ChatEPT/prompt2/first_10_percent_grouped_by_df_prompt2")

### Working on the last 90% of the data

#### Experiment: remove utterances finishing with "neutral" emotion: 

In [47]:
last_90_precent_grouped_by_df["utterance_2_emotion"] = last_90_precent_grouped_by_df["utterance"].progress_apply(lambda x: predict_emotion_and_probability(x[-2]))

  0%|          | 0/16002 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [48]:
last_90_precent_grouped_by_df.head()

Unnamed: 0,conv_id,context,utterance,utterance_2_emotion
1779,hit:11426_conv:22853,disgusted,[I went into my shed to clean it and i found a...,"(neutral, 99.85)"
1780,hit:11427_conv:22854,apprehensive,[My sister asked me to go out for drinks with ...,"(annoyance, 94.36)"
1781,hit:11427_conv:22855,furious,[I missed my train yesterday. I was so mad at ...,"(disappointment, 71.76)"
1782,hit:11428_conv:22856,embarrassed,[Yesterday When I was coming from my office. I...,"(amusement, 92.57)"
1783,hit:11428_conv:22857,confident,"[I recently was in a Arcade game tournament. ,...","(approval, 27.76)"


In [49]:
last_90_precent_grouped_by_df[['utterance_minus_2_emotion', 'utterance_minus_2_emotion_conf']] = last_90_precent_grouped_by_df['utterance_2_emotion'].apply(lambda x: pd.Series(x))
last_90_precent_grouped_by_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


Unnamed: 0,conv_id,context,utterance,utterance_2_emotion,utterance_minus_2_emotion,utterance_minus_2_emotion_conf
1779,hit:11426_conv:22853,disgusted,[I went into my shed to clean it and i found a...,"(neutral, 99.85)",neutral,99.85
1780,hit:11427_conv:22854,apprehensive,[My sister asked me to go out for drinks with ...,"(annoyance, 94.36)",annoyance,94.36
1781,hit:11427_conv:22855,furious,[I missed my train yesterday. I was so mad at ...,"(disappointment, 71.76)",disappointment,71.76
1782,hit:11428_conv:22856,embarrassed,[Yesterday When I was coming from my office. I...,"(amusement, 92.57)",amusement,92.57
1783,hit:11428_conv:22857,confident,"[I recently was in a Arcade game tournament. ,...","(approval, 27.76)",approval,27.76


In [50]:
last_90_precent_grouped_by_df[last_90_precent_grouped_by_df.utterance_minus_2_emotion == "neutral"].utterance_minus_2_emotion_conf.describe()

count    4102.000000
mean       88.093774
std        17.041761
min        20.330000
25%        81.932500
50%        97.470000
75%        99.670000
max        99.970000
Name: utterance_minus_2_emotion_conf, dtype: float64

In [51]:
last_90_precent_grouped_by_df.utterance_minus_2_emotion.value_counts()

neutral           4102
approval          1791
admiration        1322
sadness            729
disappointment     715
joy                668
gratitude          663
excitement         587
disapproval        517
optimism           509
amusement          505
fear               465
love               389
annoyance          382
desire             353
curiosity          287
anger              278
confusion          261
surprise           253
realization        226
nervousness        219
caring             197
embarrassment      174
disgust            137
remorse            101
relief              87
grief               48
pride               37
Name: utterance_minus_2_emotion, dtype: int64

In [52]:
#first_10_percent_grouped_by_df.to_pickle("ChatEPT/prompt2/first_10_percent_grouped_by_df_prompt2")

In [53]:
last_90_precent_grouped_by_df.head()

Unnamed: 0,conv_id,context,utterance,utterance_2_emotion,utterance_minus_2_emotion,utterance_minus_2_emotion_conf
1779,hit:11426_conv:22853,disgusted,[I went into my shed to clean it and i found a...,"(neutral, 99.85)",neutral,99.85
1780,hit:11427_conv:22854,apprehensive,[My sister asked me to go out for drinks with ...,"(annoyance, 94.36)",annoyance,94.36
1781,hit:11427_conv:22855,furious,[I missed my train yesterday. I was so mad at ...,"(disappointment, 71.76)",disappointment,71.76
1782,hit:11428_conv:22856,embarrassed,[Yesterday When I was coming from my office. I...,"(amusement, 92.57)",amusement,92.57
1783,hit:11428_conv:22857,confident,"[I recently was in a Arcade game tournament. ,...","(approval, 27.76)",approval,27.76


#### Splitting the last 90% into 9 batches of 10% each

In [54]:
batch_1 = last_90_precent_grouped_by_df.iloc[:split_idx]
batch_2 = last_90_precent_grouped_by_df.iloc[split_idx:2*split_idx]
batch_3 = last_90_precent_grouped_by_df.iloc[2*split_idx:3*split_idx]
batch_4 = last_90_precent_grouped_by_df.iloc[3*split_idx:4*split_idx]
batch_5 = last_90_precent_grouped_by_df.iloc[4*split_idx:5*split_idx]
batch_6 = last_90_precent_grouped_by_df.iloc[5*split_idx:6*split_idx]
batch_7 = last_90_precent_grouped_by_df.iloc[6*split_idx:7*split_idx]
batch_8 = last_90_precent_grouped_by_df.iloc[7*split_idx:8*split_idx]
batch_9 = last_90_precent_grouped_by_df.iloc[8*split_idx:]

#### Non-emotional version

In [102]:
utterances = last_90_precent_grouped_by_df.utterance.iloc[18]
utterances

['i just stepped on a hairball',
 "Like the kind a cat vomits or came off of a brush? I'm hoping for you it came from a brush",
 'nope it squeezed betwixt my toes',
 'Oh my goodness for a lack of better terms! I would be no good after that']

In [103]:
complete_with_non_emotional_chatgpt(utterances)

"That's gross. I suggest you clean your foot thoroughly to avoid any infections or unpleasant smells."

In [None]:
last_90_precent_grouped_by_df["chatgpt_response_no_emotion"] = last_90_precent_grouped_by_df["utterance"].progress_apply(lambda x: complete_with_non_emotional_chatgpt(x))

  0%|          | 0/16002 [00:00<?, ?it/s]

In [None]:
last_90_precent_grouped_by_df.head()

In [None]:
last_90_precent_grouped_by_df.to_pickle("ChatEPT/last_90_precent_grouped_by_df")

#### Prompt2 version

In [55]:
utterances = last_90_precent_grouped_by_df.utterance.iloc[18]
utterances

['i just stepped on a hairball',
 "Like the kind a cat vomits or came off of a brush? I'm hoping for you it came from a brush",
 'nope it squeezed betwixt my toes',
 'Oh my goodness for a lack of better terms! I would be no good after that']

In [56]:
complete_with_prompt2_chatgpt(utterances)

"I'm sorry to hear that. You may want to wash your foot thoroughly to ensure cleanliness and avoid any potential odors."

In [57]:
last_90_precent_grouped_by_df["chatgpt_response_prompt2"] = last_90_precent_grouped_by_df["utterance"].progress_apply(lambda x: complete_with_prompt2_chatgpt(x))

  0%|          | 0/16002 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [58]:
last_90_precent_grouped_by_df.head()

Unnamed: 0,conv_id,context,utterance,utterance_2_emotion,utterance_minus_2_emotion,utterance_minus_2_emotion_conf,chatgpt_response_prompt2
1779,hit:11426_conv:22853,disgusted,[I went into my shed to clean it and i found a...,"(neutral, 99.85)",neutral,99.85,I can only imagine how difficult and unpleasan...
1780,hit:11427_conv:22854,apprehensive,[My sister asked me to go out for drinks with ...,"(annoyance, 94.36)",annoyance,94.36,It's completely okay if you don't want to go o...
1781,hit:11427_conv:22855,furious,[I missed my train yesterday. I was so mad at ...,"(disappointment, 71.76)",disappointment,71.76,"I'm sorry to hear that, it can be frustrating ..."
1782,hit:11428_conv:22856,embarrassed,[Yesterday When I was coming from my office. I...,"(amusement, 92.57)",amusement,92.57,I'm sorry to hear that. It can be embarrassing...
1783,hit:11428_conv:22857,confident,"[I recently was in a Arcade game tournament. ,...","(approval, 27.76)",approval,27.76,Congratulations on coming in second place in t...


In [59]:
last_90_precent_grouped_by_df.to_pickle("ChatEPT/prompt2/last_90_precent_grouped_by_df_prompt2")

#### Emotional version

In [None]:
#last_90_precent_grouped_by_df["chatgpt_emotional_response"] = last_90_precent_grouped_by_df["utterance"].progress_apply(lambda x: complete_with_emotional_chatgpt(x))

##### Dispatching into 10 batches

In [None]:
batch_1["chatgpt_emotional_response"] = batch_1["utterance"].progress_apply(lambda x: complete_with_emotional_chatgpt(x))

In [None]:
batch_2["chatgpt_emotional_response"] = batch_2["utterance"].progress_apply(lambda x: complete_with_emotional_chatgpt(x))

In [None]:
batch_3["chatgpt_emotional_response"] = batch_3["utterance"].progress_apply(lambda x: complete_with_emotional_chatgpt(x))

In [None]:
batch_4["chatgpt_emotional_response"] = batch_4["utterance"].progress_apply(lambda x: complete_with_emotional_chatgpt(x))

In [None]:
batch_5["chatgpt_emotional_response"] = batch_5["utterance"].progress_apply(lambda x: complete_with_emotional_chatgpt(x))

In [None]:
batch_6["chatgpt_emotional_response"] = batch_6["utterance"].progress_apply(lambda x: complete_with_emotional_chatgpt(x))

In [None]:
batch_7["chatgpt_emotional_response"] = batch_7["utterance"].progress_apply(lambda x: complete_with_emotional_chatgpt(x))

In [None]:
batch_8["chatgpt_emotional_response"] = batch_8["utterance"].progress_apply(lambda x: complete_with_emotional_chatgpt(x))

In [None]:
batch_9["chatgpt_emotional_response"] = batch_9["utterance"].progress_apply(lambda x: complete_with_emotional_chatgpt(x))

In [None]:
batch_1.to_pickle("ChatEPT/prompt2/batch_1_prompt2")

In [None]:
batch_2.to_pickle("ChatEPT/prompt2/batch_2_prompt2")
batch_3.to_pickle("ChatEPT/prompt2/batch_3_prompt2")
batch_4.to_pickle("ChatEPT/prompt2/batch_4_prompt2")
batch_5.to_pickle("ChatEPT/prompt2/batch_5_prompt2")
batch_6.to_pickle("ChatEPT/prompt2/batch_6_prompt2")
batch_7.to_pickle("ChatEPT/prompt2/batch_7_prompt2")
batch_8.to_pickle("ChatEPT/prompt2/batch_8_prompt2")
batch_9.to_pickle("ChatEPT/prompt2/batch_9_prompt2")

##### Merging into one dataframe

In [None]:
batch_1.head()

In [None]:
batch_2.head()

In [None]:
merged_last_90_percent_emotional_df = pd.concat([batch_1, batch_2, batch_3, batch_4, batch_5, batch_6, batch_7, batch_8, batch_9])

In [None]:
merged_last_90_percent_emotional_df.head()

In [None]:
last_90_precent_grouped_by_df.head()

In [None]:
merged_last_90_percent_all_df = pd.merge(merged_last_90_percent_emotional_df, last_90_precent_grouped_by_df[["conv_id", "chatgpt_response_prompt2"]], on="conv_id")

In [None]:
merged_last_90_percent_all_df.head()

In [None]:
merged_last_90_percent_all_df.to_pickle("ChatEPT/prompt2/merged_last_90_percent_all_df_prompt2")

In [None]:
last_90_precent_grouped_by_df["chatgpt_emotional_response"] = last_90_precent_grouped_by_df["utterance"].progress_apply(lambda x: complete_with_emotional_chatgpt(x))

##### Debugging 

It stopped in 

In [None]:
last_90_precent_grouped_by_df.utterance.iloc[2023]

In [None]:
last_90_precent_grouped_by_df.head()

In [None]:
#last_90_precent_grouped_by_df.to_pickle("ChatEPT/prompt2/last_10_percent_grouped_by_df_prompt2")

In [None]:
ddf = pd.read_pickle("")

In [None]:
#utterances = grouped_by_df.utterance.iloc[15691]
utterances = grouped_by_df.utterance.iloc[10]
display(utterances)

In [None]:
grouped_by_df

In [None]:
emotion

In [None]:
probability

In [None]:
predict_emotion_and_probability(ground_truth)

In [None]:
predict_emotion_and_probability(chatgpt_response_string)

In [None]:
predict_emotion_from_utterance("It's amazing how much we can learn from those who have experienced different walks of life than our own. I'm glad you had this eye-opening experience.")

In [None]:
predict_emotion_from_utterance("That's great to hear. Sometimes the people we set out to help end up teaching us more than we could have imagined.")

In [None]:
def find_row(sentence):
    for i in range(len(grouped_by_df)):
        utterances = grouped_by_df.utterance.iloc[i]
        if utterances[0].startswith(sentence):
            break
    print(i)

In [None]:
find_row("I visited an orphan")

In [None]:
grouped_by_df.utterance.iloc[15691]

# Preparing the Electra classifier

# Preparing the emotion engine

In [None]:
dd_emotion_choice_df = pd.read_pickle("dd_emotion_choice_df")
dd_emotion_choice_no_neutral_df = pd.read_pickle("dd_emotion_choice_no_neutral_df")

In [None]:
dd_emotion_choice = dd_emotion_choice_df.set_index('Initial emotion')["Mostly followed by"].to_dict()
dd_emotion_choice_no_neutral = dd_emotion_choice_no_neutral_df.set_index('Initial emotion')["Mostly followed by"].to_dict()
#display("Correspondance: ", dd_emotion_choice)
#print()
#display("If we exculde the neutral emotions, our chatbot becomes more emotional:", dd_emotion_choice_no_neutral)

In [None]:
def choose_response_emotion(utterance, emotion_dict):
    predicted_emotion = predict_emotion_from_utterance(utterance)
    respone_emotion = emotion_dict[predicted_emotion]
    return response_emotion

In [None]:
def prepare_beginning_of_response(utterance, emotion_dict):
    response_emotion = choose_response_emotion(utterance, emotion_dict)
    if response_emotion != "neutral":
        return f"My response's gonna be brief and sound full of {response_emotion}:"
    else: 
        return f"My response's gonna be brief and sound {response_emotion}:"

In [None]:
def test_emotion_choice(sample_utterance):
    predicted_emotion = predict_emotion_from_utterance(sample_utterance)
    response_emotion = dd_emotion_choice[predicted_emotion]
    print(f'The predicted emotion is: "{predicted_emotion}"\nand the response should have the "{response_emotion}" emotion')

In [None]:
sample_utterance = "this is disgusting"

In [None]:
prepare_beginning_of_response(sample_utterance, dd_emotion_choice)

# Preparing the ChatGPT API

In [None]:
API_KEY = "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"
openai.api_key = API_KEY

In [None]:
def show_formatted_choice(unformatted_text):
  display(Markdown(unformatted_text["choices"][0]["message"]["content"]))

In [None]:
def call_chatept(utterance):
    beginning_of_response = prepare_beginning_of_response(utterance, dd_emotion_choice)
    completion = openai.ChatCompletion.create(
      model="gpt-3.5-turbo", 
      messages=[
          {
              "role": "user",
              "content": utterance
          },
          {
              "role": "assistant",
              "content": beginning_of_response
          }
      ]
    )
    return completion

In [None]:
utterance = "I am hearing weird noise in the house"
chatept_response = call_chatept(utterance)

In [None]:
show_formatted_choice(chatept_response)

In [None]:
test_emotion_choice(utterance)

# Testing on DailyDialogue Dataset

In [None]:
from datasets import list_datasets, load_dataset
daily_dialogue_dataset = load_dataset('daily_dialog')  

In [None]:
train_df = daily_dialogue_dataset["train"].to_pandas()
val_df = daily_dialogue_dataset["validation"].to_pandas()
test_df = daily_dialogue_dataset["test"].to_pandas()
all_df = pd.concat([train_df, val_df, test_df])
sample_df = train_df.iloc[:100]

In [None]:
list_of_sample_utterances = sample_df.iloc[0].dialog.tolist()
list_of_sample_utterances

In [None]:
sample_df.iloc[0].emotion.tolist()

In [None]:
list_of_electra_emotions = []
for utterance in list_of_sample_utterances:
    list_of_electra_emotions.append(predict_emotion_from_utterance(utterance))
list_of_electra_emotions

In [None]:
dict(list(zip(list_of_sample_utterances,list_of_electra_emotions)))

In [None]:
user_request = "I am hearing weird noise in the house"
beginning_of_response = "My response is going to be brief and sound scared: "

In [None]:
user_request = "I have a problem"
beginning_of_response = "My response is going to be brief and sound curious: "

In [None]:
chatgpt_response = call_chatgpt(user_request, beginning_of_response)

In [None]:
chatgpt_response

In [None]:
show_formatted_choice(chatgpt_response)

In [None]:
completion = openai.ChatCompletion.create(
  model="gpt-3.5-turbo", 
  messages=[
      {
          "role": "user",
          "content": "Tell the world about the ChatGPT API in the style of a pirate."
      }
  ]
)

In [None]:
completion["choices"][0]["message"]["content"]

In [None]:
show_formatted_choice(completion)