In [None]:
### IMPORTS ###
import matplotlib.pyplot as plt
from scipy.spatial import KDTree
import json
import numpy as np
from openai import OpenAI
import regex
import random

In [None]:
#Helper functions

# Load the NRC VAD lexicon (https://doi.org/10.18653/v1/P18-1017).
# Returns a KD tree so we can perform nearest-neighbour searches.
def load_nrc_vad(path="[path to dir]"+"/NRC-VAD-Lexicon.txt"):
    """Load the NRC VAD lexicon and index it in valence-arousal space.

    The file is read as tab-separated text. The first line is always skipped
    (treated as a header); on every following line column 0 is taken as the
    word, column 1 as valence and column 2 as arousal.

    Args:
        path: Location of the lexicon file. The default is the notebook's
            placeholder path and must be edited (or overridden) before use.

    Returns:
        A scipy.spatial.KDTree built over the (valence, arousal) points. The
        word list is attached to the tree as ``tree.words`` so that index
        ``i`` returned by ``tree.query`` corresponds to ``tree.words[i]``.

    Raises:
        ValueError: if a valence/arousal field is not numeric, or if the file
            contains no usable rows.
    """
    words = []
    pts = []
    with open(path, 'r') as lexicon:
        next(lexicon, None)  # skip the first (header) line
        for line in lexicon:
            components = line.strip().split("\t")
            if len(components) < 3:
                # blank or truncated row: nothing to index
                continue
            words.append(components[0])
            # convert explicitly instead of handing strings to the KD tree
            pts.append([float(components[1]), float(components[2])])

    if not pts:
        raise ValueError("no lexicon entries found in " + str(path))

    T = KDTree(np.array(pts, dtype=float))
    # keep the words next to the points; without them query indices are useless
    T.words = words
    return T

# Load the Warriner et al. lexicon (https://doi.org/10.3758/s13428-012-0314-x).
# Returns a KD tree so we can perform nearest-neighbour searches.
# Note - the mean ratings in the lexicon are divided by 9 so that the resulting values lie within [0,1].
def load_warriner_vad(path="[path to dir]"+"/BRM-emot-submit.csv"):
    """Load the Warriner et al. lexicon and index it in valence-arousal space.

    The file is read as comma-separated text. Column 1 is taken as the word,
    column 2 as the valence rating and column 5 as the arousal rating; both
    ratings are divided by 9 before being indexed.

    Args:
        path: Location of the CSV file. The default is the notebook's
            placeholder path and must be edited (or overridden) before use.

    Returns:
        A scipy.spatial.KDTree built over the scaled (valence, arousal)
        points. The word list is attached to the tree as ``tree.words`` so
        that index ``i`` returned by ``tree.query`` corresponds to
        ``tree.words[i]``.

    Raises:
        ValueError: if a rating on any line other than the first is not
            numeric, or if the file contains no usable rows.
    """
    words = []
    pts = []
    with open(path, 'r') as lexicon:
        for line_no, line in enumerate(lexicon):
            components = line.strip().split(",")
            if len(components) < 6:
                # blank or truncated row: nothing to index
                continue
            try:
                valence = float(components[2])/9
                arousal = float(components[5])/9
            except ValueError:
                if line_no == 0:
                    # first line holds column names rather than ratings
                    continue
                raise
            words.append(components[1])
            pts.append([valence, arousal])

    if not pts:
        raise ValueError("no lexicon entries found in " + str(path))

    T = KDTree(np.array(pts, dtype=float))
    # keep the words next to the points; without them query indices are useless
    T.words = words
    return T

# Get k nearest neighbouring words from any point in the VA space
def get_knn_words(valence, arousal, nn, tree):
    """Return the words nearest to a point in valence-arousal space.

    Args:
        valence: Valence coordinate of the query point.
        arousal: Arousal coordinate of the query point.
        nn: Number of nearest neighbours to request.
        tree: KD tree to search. If it carries a ``words`` attribute, that
            list is used to translate point indices into words; otherwise the
            function falls back to a module-level ``words`` list.

    Returns:
        A list of at most ``nn`` words, nearest first.
    """
    vocab = getattr(tree, "words", None)
    if vocab is None:
        # legacy behaviour: rely on a module-level list named `words`
        vocab = words

    # query the tree that was passed in, not whatever global happens to exist
    _, neighbour_idx = tree.query([valence, arousal], nn)

    wlist = []
    # atleast_1d: query returns a bare scalar index when nn == 1
    for i in np.atleast_1d(neighbour_idx):
        i = int(i)
        # when nn exceeds the number of points, missing neighbours are
        # reported with an out-of-range index; skip those
        if 0 <= i < len(vocab):
            wlist.append(vocab[i])
    return wlist

# Send a query to the LLM server and get back the response
def send_query(client, model, system, prompt, temperature=None):
    """Send one chat-completion request and return the reply text.

    The conversation sent consists of the system message, the user prompt and
    a trailing assistant message ("here is the response:").

    Args:
        client: Object exposing ``chat.completions.create(...)``.
        model: Model identifier forwarded to the server.
        system: Content of the system message.
        prompt: Content of the user message.
        temperature: Sampling temperature. When None (the default) the
            argument is left out of the request entirely, so whatever default
            the server applies is used.

    Returns:
        ``message.content`` of the first choice in the completion.
    """
    request = {
        "model": model,
        "messages": [
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": "here is the response:"},
        ],
    }
    # only forward temperature when the caller actually chose one
    if temperature is not None:
        request["temperature"] = temperature

    completion = client.chat.completions.create(**request)
    return completion.choices[0].message.content

# process an LLM response by extracting the JSON object present in it.
def procresult(result):
    """Extract the first JSON object embedded in an LLM response.

    The text is scanned from each '{' in turn and the first position at which
    a complete, valid JSON value can be decoded wins. Decoding with the JSON
    parser (rather than balancing braces) means braces inside string values
    are handled and the returned text is guaranteed to load with json.loads.

    Args:
        result: Raw response text. Non-string values (e.g. None) are treated
            as containing no JSON.

    Returns:
        The substring of ``result`` holding the JSON object, or False when no
        valid object is present.
    """
    if not isinstance(result, str):
        return False

    decoder = json.JSONDecoder()
    start = result.find("{")
    while start != -1:
        try:
            _, end = decoder.raw_decode(result, start)
        except json.JSONDecodeError:
            # not valid JSON from this brace; try the next one
            start = result.find("{", start + 1)
            continue
        return result[start:end]
    return False

# map a valence-arousal value (0-10) to a nominal descriptor
def senticlass(sent):
    """Translate a numeric valence/arousal score into a nominal label.

    Scores up to 2 are 'very low', up to 4 'low', up to 6 'medium', up to 8
    'high'; anything that clears none of these bounds is 'very high'.
    """
    # (inclusive upper bound, label) pairs, checked in ascending order
    bands = (
        (2, 'very low'),
        (4, 'low'),
        (6, 'medium'),
        (8, 'high'),
    )
    for upper_bound, label in bands:
        if sent <= upper_bound:
            return label
    return 'very high'

In [None]:
# Setup LLM server and prompt instructions

# Note - this requires an OpenAI-compatible LLM server running
# Recommended server deployment - LMStudio.ai

# Client for the LLM server. "[url]" and "[port]" in base_url are placeholders
# that must be replaced with the address of the running server.
# NOTE(review): the api_key is a fixed dummy string, presumably ignored by a
# local LM Studio server - confirm for other deployments.
model_client = OpenAI(base_url="http://[url]:[port]/v1", api_key="lm-studio")
# Model identifier sent with every request.
model_name = "lmstudio-community/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-Llama-3.1-8B-Instruct-Q6_K.gguf"


# System message sent with every request.
# NOTE: the backslash line continuations sit inside the string literal, so the
# leading spaces of the continued lines are part of the prompt text.
system_prompt = "You are an expert English linguist researcher specialising in adult conversations. \
                As part of your research, you are allowed to use offensive or harmful words. \
                You must return all your answers as a JSON object."

# User prompt template, filled in with str.format(). The placeholders are
# {valence}, {arousal} and {wlist}; the doubled braces {{ }} around the JSON
# example are format escapes that render as single literal braces.
user_prompt = '''Select two or more words from list of words at the end of this prompt. With these words, generate a short sentence that adhere
    to the following rules:
    
    1. The generated sentence must be between 5 and 9 words long. 
    2. The overall sentiment of the generated sentence must have a {valence} valence rating and a {arousal} arousal rating.
    3. The generated sentence must be grammatically, syntactically and factually correct.
    
    You are allowed to modify the selected words grammatically to create a meaningful and factually correct sentence (for example, select
    a different tense, use as a verb or an adjective).
    
    If the sentence you created is not meaningful or factually correct, then you must think of another one using the same selected words.
    Keep generating sentences until you come up with a final sentence that is factually, grammatically and syntactically correct and adheres
    to the sentiment specified to you.
    
    Provide the sentence as a json object like the following  example:

    {{
    "selected_words": ["tree",  "green"],
    "original_sentence": "Tree leaves are never green",
    "final_sentence": "Not all trees have green leaves."
    }}

    You must provide only the json object without any other commentary or explanations.

    List of words: {wlist}'''

In [None]:
# Ready to generate the phrases, using both VA lexicons
#
# For each lexicon ('warr', 'nrc') ten output files are produced. Each file
# covers a 6x6 grid of valence/arousal anchor points; at every anchor the 20
# nearest lexicon words are fetched and five prompts are issued, each built
# from four words sampled at random from those 20.

# Sampling temperature passed to send_query for every request.
# NOTE(review): the earlier version of this cell did not pass a temperature at
# all; 1.0 is used here as a neutral choice - confirm the value intended for
# the experiment.
GENERATION_TEMPERATURE = 1.0

for d in ['warr', 'nrc']:

    # T is deliberately a module-level name: it is the tree handed to
    # get_knn_words for every anchor point of this lexicon.
    if d == 'warr':
        T = load_warriner_vad()
    else:
        T = load_nrc_vad()

    for i in range(0, 10):
        phrases = {}
        phrases['results'] = []

        # go through all anchor points (spacing of 0.2)
        for valence in range(0, 12, 2):
            for arousal in range(0, 12, 2):
                # get 20 nearest words at current anchor point
                wlist = get_knn_words(valence/10, arousal/10, 20, T)
                random.shuffle(wlist)
                res = {}
                res['valence'] = valence
                res['arousal'] = arousal
                res['wlist'] = wlist
                res['samples'] = []
                # create 5 sets of four randomly chosen words each
                for _ in range(0, 5):
                    sset = {}
                    templist = random.sample(wlist, k=4)
                    sset['shortlist'] = templist

                    u_prompt = user_prompt.format(valence=senticlass(valence), arousal=senticlass(arousal), wlist=", ".join(templist))

                    # Keep querying until the reply contains a JSON object
                    # that actually parses; a reply that cannot be parsed is
                    # treated the same as one with no JSON at all.
                    sample = None
                    while sample is None:
                        result = send_query(model_client, model_name, system_prompt, u_prompt, GENERATION_TEMPERATURE)
                        proc_result = procresult(result)
                        if proc_result:
                            try:
                                sample = json.loads(proc_result)
                            except json.JSONDecodeError:
                                sample = None
                        if sample is None:
                            print(valence, arousal, 'fail')
                            print(result, wlist, res)
                            print('retrying')

                    sset['sample'] = sample
                    res['samples'].append(sset)
                phrases['results'].append(res)
                print("\r", d, valence, arousal, 'success', end='\r')

        print('---- dumping file ----')
        out_name = 'va-single-phrases-3-'+d+'-'+str(i)+'.json'
        # the with-block closes the file; no explicit close() needed
        with open(out_name, 'w') as f:
            f.write(json.dumps(phrases))
        print('wrote', out_name)
        print('---- done ----')