In [1]:
import json
from openai import OpenAI
import regex
import os
import string
import pandas as pd

In [2]:
# LLM server details: each model is served from its own endpoint and is
# reached through an OpenAI client pointed at that endpoint's base URL.
llama_model = "lmstudio-community/Meta-Llama-3-8B-Instruct-GGUF"
mistral_model = "TheBloke/Mistral-7B-Instruct-v0.2-GGUF"

server1 = "http://150.140.142.84:1234/v1"  # 3070
server2 = "http://150.140.142.83:1234/v1"

# server1 is paired with the Llama model, server2 with the Mistral model.
llama_client = OpenAI(api_key="lm-studio", base_url=server1)
mistral_client = OpenAI(api_key="lm-studio", base_url=server2)


In [3]:
## prompts used in Studies 1-2
# All three templates are filled in with str.format(); single-brace names are
# placeholders and the doubled braces ({{ }}) are escapes that render as
# literal JSON braces in the final prompt.

# prompt1 -- placeholder: {phrase}. Asks for a two-turn conversation whose
# response is the given phrase; expected reply is a JSON object with the
# fields 'opening' and 'response'.
# NOTE(review): the example output below has a trailing comma after the
# "response" value, which is not strict JSON; a model that copies it will
# produce text that json.loads() rejects. The wording also contains typos
# ("an mobile", "ONLY than"). Left untouched here because the exact prompt
# text determines model behaviour -- confirm before changing.
prompt1 = '''Write an mobile messaging conversation between two persons. The first person
                opens the conversation with either a question or a statement.
                The second person responds with the phrase "{phrase}".
                The first person's phrase must have high contextual relevance to the answer.
                The dialogue must consist only of one opening phrase and one response.
                Your response must be a single JSON object with two fields: 'opening', 'response'.
                You must not include any commentary in your response, ONLY than the JSON object.
                    
                EXAMPLE PHRASE: "this is a classroom test"
                EXAMPLE OUTPUT:
                {{
                    "opening":"what is this document?",
                    "response":"this is a classroom test",
                }}'''

# prompt2 -- placeholders: {mistral_prompt}, {llama_prompt}, {original}.
# Asks for a "high"/"medium"/"low" coherence rating of two conversations that
# share the same response ({original}); expected reply is a JSON object with
# the fields 'conversation1' and 'conversation2'.
prompt2 =  '''Provide a categorisation of "high", "medium" or "low" to assess the logical coherence of the following 
                two conversations.
                Your response must be a single JSON object with two fields: 'conversation1' and 'conversation2'.
                You must not include any commentary in your response, ONLY than the JSON object, as per the example 
                that follows.
                
                Conversation 1:
                Person 1: {mistral_prompt}
                Person 2: {original}
                
                Conversation 2:
                Person 1: {llama_prompt}
                Person 2: {original}
                
                EXAMPLE OUTPUT:
                {{"conversation1": "medium", "conversation2":"high"}}'''

# prompt3 -- placeholder: {response_phrase} (used twice). Asks for a short
# narrative first and then a two-turn conversation derived from it; expected
# reply is a JSON object with the fields 'narrative', 'opening', 'response'.
prompt3 = '''Write a short narrative of no more than 4 sentences, about two persons, based on the phrase "{response_phrase}"
                Then, based on that narrative, write a conversation between the two persons in the narrative.
                The first person opens the conversation with either a question or a statement, based on the narrative's details.
                The second person must respond with the phrase "{response_phrase}".
                The opening phrase must be written so the response phrase follows logically from it.
                The opening phrase MUST NOT contain any of the words in the response phrase.
                The dialogue must consist only of one opening phrase and one response.
                Your response must be a single JSON object with three fields: 'narrative', 'opening', 'response'.
                You must not include any commentary in your response, provide ONLY the JSON object.
                You must follow the JSON structure in the example precisely.
    
                EXAMPLE RESPONSE PHRASE: "I am in a meeting just now."
                EXAMPLE GENERATED NARRATIVE: "Jane wanted to talk to Mark about their daughter, Mary, 
                        who was facing some trouble at school. However, Mark was at work and engaged 
                        in a serious meeting."
                EXAMPLE OUTPUT JSON:
                {{
                    "narrative": "Jane wanted to talk to Mark about their daughter, Mary, who was facing some trouble at school. However, Mark was at work and engaged in a serious meeting.", 
                    "opening":"Can I call you briefly to talk about Mary?",
                    "response":"I am in a meeting just now."
                }}'''

In [4]:
# helper function to send query to an LLM server
def send_query(client, model, prompt, history, temperature):
    """Send a single chat-completion request and return the reply text.

    The request's message list is, in order: a fixed system instruction,
    every entry of ``history``, and finally ``prompt`` as a user message.

    Args:
        client: object exposing ``chat.completions.create(...)``.
        model: model identifier forwarded to the server.
        prompt: text of the final user message.
        history: iterable of earlier message dicts, inserted between the
            system instruction and the user message.
        temperature: sampling temperature forwarded to the server.

    Returns:
        The ``message.content`` of the first choice in the completion.
    """
    system_message = {
        "role": "system",
        "content": "You are an English Natural Language Processing AI tool. Return your answers only as JSON objects",
    }
    user_message = {"role": "user", "content": prompt}
    conversation = [system_message, *history, user_message]

    reply = client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=temperature,
    )
    first_choice = reply.choices[0]
    return first_choice.message.content

# process an LLM response by extracting the JSON object present in it.
def procresult(result):
    """Extract the first JSON object embedded in an LLM response.

    The text is scanned left to right; at every ``{`` a JSON object is
    decoded with the standard-library decoder, and the first one that parses
    is returned. Because a real JSON parser finds the object's end, braces
    inside string values (e.g. ``"see you :}"``) do not break extraction,
    and any commentary before or after the object is ignored.

    Args:
        result: raw response text; any non-string value (e.g. ``None``)
            is treated as "no JSON found".

    Returns:
        The decoded object (a ``dict``), or ``False`` when no valid JSON
        object is present. ``False`` is kept as the failure sentinel because
        callers compare the return value against it.
    """
    if not isinstance(result, str):
        return False

    decoder = json.JSONDecoder()
    start = result.find('{')
    while start != -1:
        try:
            # raw_decode parses one JSON document from the start of the
            # string and tolerates trailing text after it.
            parsed, _end = decoder.raw_decode(result[start:])
            return parsed
        except ValueError:
            # json.JSONDecodeError is a ValueError: this brace did not open a
            # valid object, so move on to the next candidate.
            start = result.find('{', start + 1)
    return False

# read enron metadata
def read_enron_md(path):
    """Load the tab-separated metadata file at ``path`` into a DataFrame."""
    metadata = pd.read_csv(filepath_or_buffer=path, sep='\t')
    return metadata


In [5]:
#### Study 1 ####

In [6]:
def generate_study1_phrase_pairs(corpus, generators, generator_models, generator_prompt, n_phrases,
                                 temperature=0.8, max_retries=None):
    """Generate one opening phrase per generator model for each corpus phrase.

    For each of the first ``n_phrases`` entries of ``corpus`` the prompt
    template is filled with the phrase and sent to every generator in turn.
    A response that does not contain a parseable JSON object is re-requested.
    Phrases that fail for any other reason are reported on stdout and skipped.

    Args:
        corpus: indexable sequence of phrases.
        generators: clients, one per model, passed to ``send_query``.
        generator_models: model identifiers, parallel to ``generators``;
            each identifier is also used as a key in the result dicts.
        generator_prompt: ``str.format`` template with a ``{phrase}`` field.
        n_phrases: number of phrases to process; capped at ``len(corpus)``
            so that asking for more phrases than exist does not fail.
        temperature: sampling temperature forwarded to ``send_query``.
        max_retries: maximum number of re-requests per generator and phrase
            after an unparseable response; ``None`` retries indefinitely.

    Returns:
        A list of dicts, one per successfully processed phrase, mapping each
        model identifier to its generated ``'opening'`` text plus the key
        ``'original'`` holding the corpus phrase. On ``KeyboardInterrupt``
        the results gathered so far are returned.
    """

    def _generate_opening(client, model, prompt, label):
        # Query one generator until its reply parses, then return the opening.
        failures = 0
        while True:
            parsed = procresult(send_query(client, model, prompt, history=[], temperature=temperature))
            if parsed is not False:
                return parsed['opening']
            failures += 1
            if max_retries is not None and failures > max_retries:
                raise RuntimeError("no parseable JSON from " + str(model) + " after "
                                   + str(failures) + " attempts")
            # "\r" returns the cursor to the line start so the next message
            # overwrites this transient status.
            print('retrying ' + label, end="\r")

    results = []
    total = min(n_phrases, len(corpus))

    for i in range(total):
        phrase = corpus[i]
        try:
            prompt = generator_prompt.format(phrase=phrase)

            # one opening per generator, keyed by model identifier
            result = {}
            for position, (client, model) in enumerate(zip(generators, generator_models), start=1):
                result[model] = _generate_opening(client, model, prompt, str(position))
            result['original'] = phrase

            print(f"{i + 1}/{total}:", ' | '.join(str(value) for value in result.values()))

            results.append(result)

        except KeyboardInterrupt:
            # allow a manual stop without losing what was generated so far
            return results

        except Exception as e:
            print(f"problem in {phrase}")
            print(e)

    return results

In [7]:
# Load the Enron metadata and keep the 'text' of the rows that pass the
# mem_count_cer0 (> 7) and words (5 to 9 inclusive) filters.
md_path = 'metadata.txt'
df_md = read_enron_md(md_path)
selected_rows = (df_md['mem_count_cer0'] > 7) & (df_md['words'] >= 5) & (df_md['words'] <= 9)
corpus = list(df_md.loc[selected_rows, 'text'])
print(f'read {len(corpus)} phrases')

# Generate phrase pairs; the final argument limits the run to 3 phrases for
# test purposes.
generated = generate_study1_phrase_pairs(
    corpus,
    [llama_client, mistral_client],
    [llama_model, mistral_model],
    prompt1,
    3,
)

# Save the generated pairs as JSON under the top-level key 'phrases'.
gendict = {'phrases': generated}
with open('study1-phrasepairs.json', 'w') as f:
    json.dump(gendict, f)

read 128 phrases
1/3: it's that time of year again, where we traditionally go out for lunch together | Can I join you for lunch? | Are you going to join us for lunch?
2/3: I just got my project results back from you, thanks for the quick turnaround. | Can you please provide some information about the project timeline? | Thanks for the quick turnaround.
3/3: can you call me tomorrow if possible? | Can you tell me the weather forecast for tomorrow in London? | Please call tomorrow if possible.
