In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import contractions
import math
import re

In [2]:
##### pick llm-generated phrases
# Read the table holding, for every phrase, the llama and mistral prompt
# candidates and their scores, then preview the first few rows.
llm_csv_path = 'phrasedf2-scored_contexts.csv'
llmdf = pd.read_csv(llm_csv_path)
llmdf.head(4)

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,llama_prompt,llama_response,llama_story,llama_mem,llama_read,mistral_prompt,mistral_response,mistral_story,mistral_mem,mistral_read,original,mem_count_cer0,memorability,readability,words,llama_use_score,mistral_use_score
0,0,0,0,"You're really late, what happened?",Yes I am here actually.,Jane was worried about her friend Mark's absen...,6,1,Is everything alright there? I have a present ...,"Yes, I'm here actually.",Tom was waiting at the bus stop for his friend...,7,1,Yes I am here actually.,7,high,7,5,0.063138,0.216689
1,1,1,1,How did you manage to book such a high-end place?,That's a lot of dollars.,John and Emily were discussing their upcoming ...,5,1,"Hey Lisa, I have some exciting news!","Wow, that's a lot of dollars!",Michael just got a surprise inheritance from h...,10,1,That's a lot of dollars.,7,high,7,5,0.076006,0.059551
2,2,2,2,I hope you're not stuck in traffic?,Are you going to join us for lunch?,Samantha was excited to share her new business...,8,1,"I've got to take this call, it's urgent.",Are you going to join us for lunch?,Michael and Lisa were working together on a pr...,8,1,Are you going to join us for lunch?,8,high,8,8,0.064968,0.031671
3,3,3,3,Have you finalized the project plan?,And how would I be going for work?,Jane was getting ready for work and asked Mark...,5,2,What time will you be getting here for lunch?,And how would I be going for work?,Tom and Lisa were planning to meet up for lunc...,4,1,And how would I be going for work?,3,low,3,8,0.132081,0.270791


In [3]:
# Angular gap between the llama and mistral prompt scores.
# arccos turns each score into an angle in radians; the absolute difference of the
# two angles is then converted to degrees.
# NOTE: the column is called 'use_diff_rad' but the stored values are in degrees.
llama_angle = np.arccos(llmdf['llama_use_score'])
mistral_angle = np.arccos(llmdf['mistral_use_score'])
llmdf['use_diff_rad'] = np.degrees(abs(llama_angle - mistral_angle))

In [4]:
# prepare to process the whole set by calculating U' scores

# NLTK's English stop-word list, held as a set so membership checks are cheap
stop_words = {*stopwords.words('english')}

# helper function to strip punctuation from a string
def cleanstr(string):
    """Return `string` lowercased, trimmed and with ASCII punctuation removed.

    The input is lowercased, leading/trailing whitespace is stripped, and every
    character matched by the punctuation class below is deleted. Letters,
    digits, inner whitespace and the backslash character are left in place.
    """
    normalised = string.lower().strip()
    # the `,-.` span inside the class is a range that covers ',', '-' and '.'
    return re.sub(r"[!#\"$%&'()*+,-./:;<=>?@[\]^_`{|}~]", '', normalised)

# helper function to remove stopwords from a sentence
# Pipeline: lowercase -> expand contractions -> tokenize -> drop stop-word
# tokens -> rejoin with single spaces -> strip symbols via cleanstr
def remove_stopwords(text, stopwords):
    """Return `text` with stop words and punctuation removed.

    Parameters
    ----------
    text : str
        Sentence to filter.
    stopwords : collection of str
        Tokens to drop. Note that inside this function the parameter shadows
        the `stopwords` corpus module imported at the top of the notebook.

    Returns
    -------
    str
        The surviving tokens joined by spaces and passed through `cleanstr`.
    """
    lowered = text.lower()

    # contractions are expanded one whitespace-separated chunk at a time
    expanded = ' '.join(contractions.fix(chunk) for chunk in lowered.split())

    # tokenize, keeping only the tokens that are not stop words
    kept_tokens = [token for token in word_tokenize(expanded) if token not in stopwords]

    # symbols are removed last, after the tokens have been rejoined
    return cleanstr(" ".join(kept_tokens))


# helper function to select the best LLM prompt candidate to match a MobileEmail phrase
def pick_llm_prompt(row, stopwords):
    """Randomly choose the llama or the mistral prompt for one row, weighted by U' score.

    For each candidate prompt the adjusted score is

        U' = | USE_score / e ** (len(response_tokens) * commonality) |

    where `response_tokens` are the stop-word-free tokens of row['original'] and
    `commonality` is the fraction of distinct response tokens that also occur in
    the stop-word-free prompt. The prompt is then drawn with probability
    proportional to its U' score.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide 'llama_prompt', 'mistral_prompt', 'original',
        'llama_use_score' and 'mistral_use_score'.
    stopwords : collection of str
        Stop words forwarded to `remove_stopwords`.

    Returns
    -------
    dict
        {'final_prompt': <chosen prompt text>,
         'final_prompt_use_score': <U' score of the chosen prompt>}
    """
    def _content_tokens(sentence):
        # stop-word-free, symbol-free tokens of a sentence
        return word_tokenize(remove_stopwords(sentence, stopwords))

    # find the share of response words that also appear in each prompt
    llama_vocab = set(_content_tokens(row['llama_prompt']))
    mistral_vocab = set(_content_tokens(row['mistral_prompt']))
    tokenised_response = _content_tokens(row['original'])
    response_vocab = set(tokenised_response)

    # handle the chance that the sentence is made up of all stop words
    if not response_vocab:
        n_llama_common = 0
        n_mistral_common = 0
    else:
        n_llama_common = len(llama_vocab & response_vocab) / len(response_vocab)
        n_mistral_common = len(mistral_vocab & response_vocab) / len(response_vocab)

    # calculate U' score = USE_score/e^(response_length*CommonalityPct)
    response_length = len(tokenised_response)
    adj_llama_score = abs(row['llama_use_score'] / math.e ** (response_length * n_llama_common))
    adj_mistral_score = abs(row['mistral_use_score'] / math.e ** (response_length * n_mistral_common))

    # Turn the two scores into selection probabilities. When the total is zero,
    # NaN or infinite (e.g. both USE scores are 0 or missing) normalising would
    # yield NaN probabilities and np.random.choice would raise, so fall back to
    # an even draw between the two candidates.
    sum_score = adj_llama_score + adj_mistral_score
    if 0 < sum_score < float('inf'):
        probs = [adj_llama_score / sum_score, adj_mistral_score / sum_score]
    else:
        probs = [0.5, 0.5]

    # Choose elements with different probabilities
    sel_prompt = np.random.choice(['llama', 'mistral'], size=1, p=probs)[0]

    if sel_prompt == 'llama':
        return {'final_prompt': row['llama_prompt'], 'final_prompt_use_score': adj_llama_score}
    return {'final_prompt': row['mistral_prompt'], 'final_prompt_use_score': adj_mistral_score}

In [5]:
# get picking
# pick_llm_prompt draws from np.random.choice, so seed NumPy's global RNG right
# before the draw: re-running this cell (or the whole notebook) then selects
# the same prompts every time instead of a fresh random set.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
llmdf[['final_prompt','final_prompt_use_score']]=llmdf.apply(lambda x: pick_llm_prompt(x, stop_words), axis='columns', result_type='expand')
llmdf.head(2)

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,llama_prompt,llama_response,llama_story,llama_mem,llama_read,mistral_prompt,mistral_response,...,original,mem_count_cer0,memorability,readability,words,llama_use_score,mistral_use_score,use_diff_rad,final_prompt,final_prompt_use_score
0,0,0,0,"You're really late, what happened?",Yes I am here actually.,Jane was worried about her friend Mark's absen...,6,1,Is everything alright there? I have a present ...,"Yes, I'm here actually.",...,Yes I am here actually.,7,high,7,5,0.063138,0.216689,8.894675,"You're really late, what happened?",0.063138
1,1,1,1,How did you manage to book such a high-end place?,That's a lot of dollars.,John and Emily were discussing their upcoming ...,5,1,"Hey Lisa, I have some exciting news!","Wow, that's a lot of dollars!",...,That's a lot of dollars.,7,high,7,5,0.076006,0.059551,0.945032,"Hey Lisa, I have some exciting news!",0.059551


In [6]:
# save as JSON for high and low memorability mobileEmail phrases
# For each memorability level: rank rows by the chosen prompt's score (highest
# first), keep the top 45 and write prompt / phrase / score as JSON records.
export_cols = ['final_prompt','original','final_prompt_use_score']
for mem_level, out_path in (('high', 'llm_chats_high.json'), ('low', 'llm_chats_low.json')):
    level_rows = llmdf[llmdf['memorability']==mem_level]
    ranked_rows = level_rows.sort_values(by='final_prompt_use_score', ascending=False)
    ranked_rows[export_cols].head(45).to_json(out_path, orient='records')

In [7]:
# generate baseline set too
baselinedf = pd.read_csv('phrasedf.csv')
# keep only low/high memorability phrases of at least 5 words, ordered from the
# highest to the lowest mem_count_cer0, and only the columns that get exported
baselinedf = baselinedf[((baselinedf['memorability']=='low') | (baselinedf['memorability']=='high')) & (baselinedf['words']>=5)].sort_values(by='mem_count_cer0', ascending=False)[['llama_prompt','original','memorability', 'mem_count_cer0']] 
# The mapping must be passed as columns=...: a bare positional mapping is applied
# to the index labels, which left the 'llama_prompt' column un-renamed.
baselinedf = baselinedf.rename(columns={"llama_prompt":"final_prompt"})

# high: first 45 rows of the descending sort; low: last 46 rows
# (46 rather than 45 presumably leaves room for the manual removal noted below - confirm)
baselinedf[baselinedf['memorability']=='high'].head(45).to_json('baseline_chats_high.json', orient='records')
baselinedf[baselinedf['memorability']=='low'].tail(46).to_json('baseline_chats_low.json', orient='records') 
#remember to remove row with phone number as it's a bit weird to have that