In [23]:
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import re
from itertools import combinations
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
import contractions
import math

In [10]:
### Process P1 - generated phrase pairs.

# load simple LLM prompt generations made with P1
simpledf = pd.read_csv('phrasedf.csv')
# keep only those with a memorability classification of high or low.
simpledf = simpledf[(simpledf['memorability']=='high') | (simpledf['memorability']=='low')]
print(len(simpledf))

608


In [11]:
## load MobileEmail phrase memorability calculations according to Leiva's formula
## these were done once for successful generations using Llama, and once for Mistral
## therefore the same phrase is scored in both "runs" - these are mostly duplicated.

mistral_leiva = pd.read_csv('ranking-mistral_response.txt', sep=';')
llama_leiva = pd.read_csv('ranking-llama_response.txt', sep=';')

print(mistral_leiva.head(3))
print(llama_leiva.head(3))

   memorability_score  reprentativeness_score           sentence
0           10.169470                0.013376         I like it.
1           10.755305                0.018705  We have the data.
2           10.946415                0.021922  This is the crew.
   memorability_score  reprentativeness_score     sentence
0            9.224900                0.004709  back at you
1           10.002871                0.005758   On its way
2           10.169470                0.016176   I like it.


In [12]:
# helper function to clean strings from non alphabet entries
def cleanstr(string):
    string = string.lower()
    string = string.strip()
    #string = ''.join(letter for letter in string if letter.isalnum())
    string = re.sub(r"[!#\"$%&'()*+,-./:;<=>?@[\]^_`{|}~]", '', string)
    return string

In [13]:
# clean the mobileEmail phrase from each dataset and put clean sentences in new columns
mistral_leiva['sentence_clean'] = mistral_leiva['sentence'].apply(cleanstr)
llama_leiva['sentence_clean'] = llama_leiva['sentence'].apply(cleanstr)
simpledf['original_clean']=simpledf['original'].apply(cleanstr)

# example
mistral_leiva.head(3)

Unnamed: 0,memorability_score,reprentativeness_score,sentence,sentence_clean
0,10.16947,0.013376,I like it.,i like it
1,10.755305,0.018705,We have the data.,we have the data
2,10.946415,0.021922,This is the crew.,this is the crew


In [14]:
# merge the dataframes into one
leiva_join = simpledf.set_index('original_clean').join(mistral_leiva.set_index('sentence_clean'))
leiva_join.reset_index(inplace=True)
leiva_join.rename(columns={"memorability_score": "memorability_score_1"}, inplace=True)
leiva_join=leiva_join.drop(columns=['reprentativeness_score','sentence'])
leiva_join.head(3)
leiva_join=leiva_join.set_index('original_clean').join(llama_leiva.set_index('sentence_clean'))
leiva_join.rename(columns={"memorability_score": "memorability_score_2"}, inplace=True)
leiva_join=leiva_join.drop(columns=['reprentativeness_score','sentence'])
leiva_join.fillna(-1, inplace=True)

# helper function - in case a mobileEmail phrase was done by Llama and not by Mistral (and vice versa)
def calc_score(s1, s2):
    if s1==-1 or s2==-1:
        return max(s1, s2)
    else:
        return ((s1+s2)/2) #theoretically should be =S1 or =S2 but just to be safe...

# final computation of Leiva's memorability formula score for each mobileEmail phrase
leiva_join['final_leiva_memscore']=leiva_join.apply(lambda x: calc_score(x['memorability_score_1'], x['memorability_score_2']), axis=1)
# average llm-assessed memorability of a mobileEmail phrase
leiva_join['llm_mem']=(leiva_join['llama_mem']+leiva_join['mistral_mem']) / 2
leiva_join.head(3)

Unnamed: 0_level_0,Unnamed: 0,llama_prompt,llama_response,llama_mem,llama_read,mistral_prompt,mistral_response,mistral_mem,mistral_read,original,...,mistral_coherence_m,llama_coherence_l,mistral_coherence_g,llama_coherence_g,llama_co_score,mistral_co_score,memorability_score_1,memorability_score_2,final_leiva_memscore,llm_mem
original_clean,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
this will be hard,2,I just found out I need to give a presentation...,This will be hard.,8,2,Can you explain quantum mechanics to me in one...,This will be hard.,0,3,This will be hard.,...,low,low,low,medium,1.4,1.0,11.531672,11.531672,11.531672,4.0
how much volume,5,how much audio can be stored on this new 8gb m...,How much volume?,9,2,The water tank in the basement is overflowing.,How much volume?,7,2,How much volume?,...,low,low,high,low,1.4,1.8,14.596618,14.596618,14.596618,8.0
have a good evening,8,"I'm heading out to watch the sunset tonight, i...",Have a good evening.,8,2,Have you finished the project?,Have a good evening.,10,1,Have a good evening.,...,low,high,low,low,2.2,1.8,12.978467,12.978467,12.978467,9.0


In [17]:
# split Leiva-scored memorability into bins
leiva_join['memorability_leiva']=pd.qcut(leiva_join['final_leiva_memscore'], q=3, labels=['high','mid','low']) #reverse scoring
leiva_join.head(3)
print(len(leiva_join))

615


In [18]:
simpledf = leiva_join[['llama_prompt', 'mistral_prompt', 'original', 'memorability']]
simpledf = simpledf.rename(columns={"llama_prompt": "llama_prompt_s", "mistral_prompt": "mistral_prompt_s", "memorability": "memorability_s"})
simpledf.info()
simpledf.to_csv('simpledf.csv')

<class 'pandas.core.frame.DataFrame'>
Index: 615 entries, this will be hard to i will conference in for thursdays customer service meeting
Data columns (total 4 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   llama_prompt_s    615 non-null    object
 1   mistral_prompt_s  615 non-null    object
 2   original          615 non-null    object
 3   memorability_s    615 non-null    object
dtypes: object(4)
memory usage: 24.0+ KB


In [19]:
#### Process P3-generated phrases

In [82]:
# load P3-generated phrases with USE scorings
storydf = pd.read_csv('phrasedf2-scored.csv')
storydf = storydf[['llama_prompt', 'mistral_prompt', 'original', 'memorability', 'llama_story', 'mistral_story']]
storydf = storydf.rename(columns={"llama_prompt": "llama_prompt_n", "mistral_prompt": "mistral_prompt_n", "memorability": "memorability_n"})
storydf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 402 entries, 0 to 401
Data columns (total 6 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   llama_prompt_n    402 non-null    object
 1   mistral_prompt_n  402 non-null    object
 2   original          402 non-null    object
 3   memorability_n    402 non-null    object
 4   llama_story       402 non-null    object
 5   mistral_story     402 non-null    object
dtypes: object(6)
memory usage: 19.0+ KB


In [83]:
merged = simpledf.join(storydf.set_index('original'), on='original')
merged.dropna(inplace=True)
merged.reset_index(inplace=True)
merged

Unnamed: 0,original_clean,llama_prompt_s,mistral_prompt_s,original,memorability_s,llama_prompt_n,mistral_prompt_n,memorability_n,llama_story,mistral_story
0,yes i am here actually,are you free to meet up today?,Do you live at 123 Main Street?,Yes I am here actually.,high,"You're really late, what happened?",Is everything alright there? I have a present ...,high,Jane was worried about her friend Mark's absen...,Tom was waiting at the bus stop for his friend...
1,thats a lot of dollars,I just saw the price of my dream car and it's ...,"I just sold my old car for $25,000.",That's a lot of dollars.,high,How did you manage to book such a high-end place?,"Hey Lisa, I have some exciting news!",high,John and Emily were discussing their upcoming ...,Michael just got a surprise inheritance from h...
2,are you going to join us for lunch,have you finalized the meeting time?,I've made reservations for six at the new Ital...,Are you going to join us for lunch?,high,I hope you're not stuck in traffic?,"I've got to take this call, it's urgent.",high,Samantha was excited to share her new business...,Michael and Lisa were working together on a pr...
3,and how would i be going for work,I'm feeling pretty stressed about the project ...,Have you checked the traffic on your usual com...,And how would I be going for work?,low,Have you finalized the project plan?,What time will you be getting here for lunch?,low,Jane was getting ready for work and asked Mark...,Tom and Lisa were planning to meet up for lunc...
4,further out will fax you price target slide,can you provide me with the latest sales figures?,Have you seen the latest stock market trend fo...,Further out - will fax you price target slide.,low,Do you know when Ryan will get here?,Do you have a moment to discuss this promising...,low,Emily was waiting for Ryan at the coffee shop....,John was analyzing the stock market trends for...
...,...,...,...,...,...,...,...,...,...,...
396,i will take a look at this today,can you review the sales report for last quarter?,Can you check if the company's financial repor...,I will take a look at this today.,high,Do you think I'll be able to get your input on...,Lisa: Have you had a chance to go over the kit...,high,John needed help with his new project at work....,Tom and Lisa were planning to renovate their k...
397,yes still need it when are you back,"I'm still looking for the USB drive, did you b...",I forgot my phone at home and I need to make s...,Yes still need it - when are you back?,low,Do you think we can still make it?,"Hey Lisa, I've finished with the documents you...",low,John and Emily had been planning their vacatio...,Tom had borrowed some important documents from...
398,on mahonia margining i spoke with mike garberding,Can you confirm the meeting with Mike Garberding?,Have you read the latest research on plant spe...,"On Mahonia margining, I spoke with Mike Garber...",low,"I've been thinking about our college days, wha...","Hello Mike, what beautiful flowers you have here!",low,We had driven for hours when I met Mike Garber...,While taking a stroll along the Mahonia border...
399,will you come get me,I'm stuck on the side of the road with a flat ...,"I forgot my keys at home, can you bring them t...",Will you come get me?,high,How's your day going?,Can you bring me my wallet?,high,"Emma was stuck on the mountain trail, her phon...",John had an accident and his car was towed awa...


In [84]:
merged.to_csv('merged.csv')

In [None]:
# Send the saved merged file to the USE score script for processing (not shown here)
# Read back the results

In [40]:
scored = pd.read_csv('all_scored_storied.csv')
scored.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 401 entries, 0 to 400
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Unnamed: 0.1      401 non-null    int64  
 1   Unnamed: 0        401 non-null    int64  
 2   original_clean    401 non-null    object 
 3   llama_prompt_s    401 non-null    object 
 4   mistral_prompt_s  401 non-null    object 
 5   original          401 non-null    object 
 6   memorability_s    401 non-null    object 
 7   llama_prompt_n    401 non-null    object 
 8   mistral_prompt_n  401 non-null    object 
 9   memorability_n    401 non-null    object 
 10  llama_story       401 non-null    object 
 11  mistral_story     401 non-null    object 
 12  llama_use_s       401 non-null    float64
 13  mistral_use_s     401 non-null    float64
 14  llama_use_n       401 non-null    float64
 15  mistral_use_n     401 non-null    float64
dtypes: float64(4), int64(2), object(10)
memory u

In [41]:
# remove redundant fields
scored = scored[['original', 'memorability_n', 'llama_prompt_s', 'llama_prompt_n', 
                 'mistral_prompt_s', 'mistral_prompt_n', 'llama_use_s', 'llama_use_n', 'mistral_use_s', 'mistral_use_n']]
scored.head(5)

Unnamed: 0,original,memorability_n,llama_prompt_s,llama_prompt_n,mistral_prompt_s,mistral_prompt_n,llama_use_s,llama_use_n,mistral_use_s,mistral_use_n
0,Yes I am here actually.,high,are you free to meet up today?,"You're really late, what happened?",Do you live at 123 Main Street?,Is everything alright there? I have a present ...,0.345601,0.063146,0.211158,0.216734
1,That's a lot of dollars.,high,I just saw the price of my dream car and it's ...,How did you manage to book such a high-end place?,"I just sold my old car for $25,000.","Hey Lisa, I have some exciting news!",0.263285,0.076007,0.205592,0.059551
2,Are you going to join us for lunch?,high,have you finalized the meeting time?,I hope you're not stuck in traffic?,I've made reservations for six at the new Ital...,"I've got to take this call, it's urgent.",0.178498,0.064963,0.121953,0.031671
3,And how would I be going for work?,low,I'm feeling pretty stressed about the project ...,Have you finalized the project plan?,Have you checked the traffic on your usual com...,What time will you be getting here for lunch?,0.073331,0.132067,0.204906,0.270786
4,Further out - will fax you price target slide.,low,can you provide me with the latest sales figures?,Do you know when Ryan will get here?,Have you seen the latest stock market trend fo...,Do you have a moment to discuss this promising...,0.144428,0.091344,0.152306,0.099654


In [42]:
### adjust the USE scores -> U'

In [43]:
# define set of stop words
stop_words = set(stopwords.words('english')) 

# helper function to remove stopwords from a sentence
# 1. make lowercase
# 2. expand contractions
# 3. tokenize 
# 4. remove stopword tokens
# 5. rejoin
# 6. remove any symbols
def remove_stopwords(text, stopwords):

    #lowercase
    text = text.lower()
    
    #expand contractions
    expanded_words = []    
    for word in text.split():
      expanded_words.append(contractions.fix(word))          
    text = ' '.join(expanded_words)

    # tokenize
    word_tokens = word_tokenize(text) 
    filtered_sentence = [] 
      
    for w in word_tokens: 
        if w not in stopwords: 
            filtered_sentence.append(w)
    text= " ".join(filtered_sentence)
    
    #remove symbols
    text = cleanstr(text)
    
    #print(text)
    
    return text

# helper function to derive U' scoring
def adjust_llm_prompt(row, prompt_col, orig_col, use_col, stopwords):

    #find % of common words in prompt and response
    filtered_prompt = remove_stopwords(row[prompt_col], stopwords)
    filtered_prompt = word_tokenize(filtered_prompt)
    tokenised_response = remove_stopwords(row[orig_col], stopwords)
    tokenised_response = word_tokenize(tokenised_response)
    
    #handle the chance that sentence is made up of all stop words
    if len(tokenised_response)==0:
        n_common=0
    else:
        n_common = len(set(filtered_prompt)&set(tokenised_response)) / len(set(tokenised_response))
    #print(n_common)
    
    #calculate U' score = USE_score/e^(response_length*CommonalityPct)
    adj_score = abs(row[use_col]/math.e**(len(tokenised_response)*n_common))
        
    #print("prompt - response:", [row[prompt_col], row[orig_col]])
    #print("use_score:", row[use_col])
    #print("adj_score:", adj_score)
    #print('-------')
    
    return adj_score


In [44]:
# compute U' scores for llama and mistral P1 (_s) and P3 (_n) phrase pair prompts
scored['llama_use_s_adj']=scored.apply(lambda x: adjust_llm_prompt(x, 'llama_prompt_s', 'original', 'llama_use_s', stop_words), axis='columns', result_type='expand')
scored['llama_use_n_adj']=scored.apply(lambda x: adjust_llm_prompt(x, 'llama_prompt_n', 'original', 'llama_use_n', stop_words), axis='columns', result_type='expand')
scored['mistral_use_s_adj']=scored.apply(lambda x: adjust_llm_prompt(x, 'mistral_prompt_s', 'original', 'mistral_use_s', stop_words), axis='columns', result_type='expand')
scored['mistral_use_n_adj']=scored.apply(lambda x: adjust_llm_prompt(x, 'mistral_prompt_n', 'original', 'mistral_use_n', stop_words), axis='columns', result_type='expand')

In [45]:
scored.head(2)

Unnamed: 0,original,memorability_n,llama_prompt_s,llama_prompt_n,mistral_prompt_s,mistral_prompt_n,llama_use_s,llama_use_n,mistral_use_s,mistral_use_n,llama_use_s_adj,llama_use_n_adj,mistral_use_s_adj,mistral_use_n_adj
0,Yes I am here actually.,high,are you free to meet up today?,"You're really late, what happened?",Do you live at 123 Main Street?,Is everything alright there? I have a present ...,0.345601,0.063146,0.211158,0.216734,0.345601,0.063146,0.211158,0.216734
1,That's a lot of dollars.,high,I just saw the price of my dream car and it's ...,How did you manage to book such a high-end place?,"I just sold my old car for $25,000.","Hey Lisa, I have some exciting news!",0.263285,0.076007,0.205592,0.059551,0.263285,0.076007,0.205592,0.059551


In [46]:
# helper function for automating statistical analyses
def statprint(df, keylist):
    #descriptive statistics
    print("C", "Mean", "Std")
    for k in keylist: 
        print(k, round(stats.tmean(df[k]), 3), round(stats.tstd(df[k]),3))
    print("-----------")
    
    normalities = {}
    allnormal = True

    sets = []
    for k in keylist:
        sets.append(df[k])
    
    for k in keylist:
        normalities[k] = stats.normaltest(df[k]).pvalue
        if normalities[k] < 0.05:
            allnormal=False
        #print(stats.normaltest(testdf[k])) #normal p>0.05
    
    #anova or friedman?
    if allnormal:
        toprint = "ANOVA "
        stat = stats.f_oneway(*sets)
    else:
        toprint = "FRIEDMAN "
        stat = stats.friedmanchisquare(*sets)
        
    print(toprint+str(round(stat.statistic, 3)) + ", p="+str(round(stat.pvalue,3)))
    print("-----------")
    
    if stat.pvalue<0.05: # go to paired tests
    
        #paired sample tests
        
        combos = list(combinations(keylist, 2))
        padj = 0.05/len(combos)
        
        for combo in combos:
            if normalities[combo[0]]>0.05 and normalities[combo[1]]>0.05:
                stat = stats.ttest_rel(df[combo[0]],df[combo[1]])
                toprint = "T-test "
                toprint += "("+str(combo[0]) +", "+ str(combo[1]) + "): t="+ str(round(stat.statistic, 3)) + ", p="+str(round(stat.pvalue,3))
            else:
                stat = stats.wilcoxon(df[combo[0]],df[combo[1]])
                toprint = "Wilcoxon "
                toprint += "("+str(combo[0]) +", "+ str(combo[1]) + "): Z="+ str(round(stat.zstatistic, 3)) + ", p="+str(round(stat.pvalue,3))
            
            if stat.pvalue<padj:
                toprint+=" *"
            print(toprint)
        
        print ("bf-adj p:", round(0.05/len(combos),4))

In [47]:
# statistics for USE score comparisons
statprint(scored, ['llama_use_s', 'llama_use_n', 'mistral_use_s', 'mistral_use_n'])

C Mean Std
llama_use_s 0.225 0.111
llama_use_n 0.136 0.087
mistral_use_s 0.197 0.112
mistral_use_n 0.181 0.106
-----------
FRIEDMAN 188.68, p=0.0
-----------
Wilcoxon (llama_use_s, llama_use_n): Z=-12.932, p=0.0 *
Wilcoxon (llama_use_s, mistral_use_s): Z=-5.653, p=0.0 *
Wilcoxon (llama_use_s, mistral_use_n): Z=-7.024, p=0.0 *
Wilcoxon (llama_use_n, mistral_use_s): Z=-9.535, p=0.0 *
Wilcoxon (llama_use_n, mistral_use_n): Z=-7.108, p=0.0 *
Wilcoxon (mistral_use_s, mistral_use_n): Z=-2.203, p=0.028
bf-adj p: 0.0083


In [48]:
# statistics for U' score comparisons
statprint(scored, ['llama_use_s_adj', 'llama_use_n_adj', 'mistral_use_s_adj', 'mistral_use_n_adj'])

C Mean Std
llama_use_s_adj 0.148 0.085
llama_use_n_adj 0.123 0.076
mistral_use_s_adj 0.138 0.082
mistral_use_n_adj 0.13 0.075
-----------
FRIEDMAN 22.733, p=0.0
-----------
Wilcoxon (llama_use_s_adj, llama_use_n_adj): Z=-4.601, p=0.0 *
Wilcoxon (llama_use_s_adj, mistral_use_s_adj): Z=-2.37, p=0.018
Wilcoxon (llama_use_s_adj, mistral_use_n_adj): Z=-3.216, p=0.001 *
Wilcoxon (llama_use_n_adj, mistral_use_s_adj): Z=-2.895, p=0.004 *
Wilcoxon (llama_use_n_adj, mistral_use_n_adj): Z=-1.68, p=0.093
Wilcoxon (mistral_use_s_adj, mistral_use_n_adj): Z=-1.286, p=0.199
bf-adj p: 0.0083
