In [1]:
from langchain import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain_openai import ChatOpenAI
from dotenv import load_dotenv
import os
import pickle
from utils import Text, apply_history
from datetime import datetime
from tqdm import tqdm

from multiprocessing import Process, Manager

In [5]:
#load text data
num_texts = 94

# NOTE: unpickling can execute arbitrary code; only load these files from a trusted source.
# Context managers make sure both file handles are closed right after loading.
with open(f"/workspace/data/ARTS_only_texts_{num_texts}.pkl", "rb") as texts_file:
    data = pickle.load(texts_file)
# NOTE(review): the file name encodes num_texts*24 — presumably the number of pairs; confirm.
with open(f"/workspace/data/determined_pairs_{num_texts*24}.pkl", "rb") as pairs_file:
    determined_pairs = pickle.load(pairs_file)

# One Text object per row of `data`, keyed by the row index; the first field of each row
# is handed to Text as its content.
texts = {t_id : Text(t_id, text[0]) for t_id, text in data.iterrows()}

In [6]:
# Load the workspace .env file into the process environment, then look the key up
# (None when the variable is not set).
load_dotenv("/workspace/.env")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

In [7]:
# Prompt for one pairwise comparison. The placeholders {TEXT_A} and {TEXT_B} are filled in
# per request, and the model is asked to reply with a single letter (A or B).
# The string is sent to the model verbatim, so any edit to it changes the experiment.
prompt_template = """
I'm going to present you with two texts and I want you to decide which one is simpler. 
       The following guidelines should be taken into account for the decision: 
       Imagine you are writing an exam where you are allowed to google and where the task is to understand the two given texts.
       Which of the two texts: generates less cognitive load?, can you understand more quickly?, are you more confident to answer questions about?, is easier for you to reformulate without changing the meaning?
       Both Texts are delimited by ```

       Text A:
       ```
       {TEXT_A}
       ```

       Text B: 
       ```
       {TEXT_B}
       ```

       The answer should be either A or B, depending on which of the texts is easier to understand. 
       Please answer without any further text, just one letter.
"""


In [8]:
# Chat model used for the comparisons; the name is also embedded in the history file names.
# Swap the commented line in to run against the alternative model.
model_name = "gpt-4-1106-preview"
#model_name = "gpt-3.5-turbo-1106"

In [9]:
# A temperature of 0 keeps the model's answers as non-creative as possible.
prompt = PromptTemplate(
    input_variables=["TEXT_A", "TEXT_B"],
    template=prompt_template,
)
model = ChatOpenAI(
    model=model_name,
    temperature=0,
)
output_parser = StrOutputParser()

# Pipeline: fill in the prompt, query the chat model, hand back the reply as a string.
chain = prompt | model | output_parser

In [10]:
def regen_history(history):
    """Re-query the model for every unresolved comparison in ``history``.

    An entry is unresolved when its winner field (``val[1]``) is ``-1``. Each
    unresolved entry is re-submitted through ``handle_request`` in its own
    process; every other entry is copied over unchanged.

    Relies on the notebook-level names ``manager``, ``determined_pairs``,
    ``texts`` and ``handle_request``.

    Args:
        history: mapping from match id to ``((i_a, i_b), winner, timestamp)``.

    Returns:
        A plain ``dict`` with the copied entries plus whatever the worker
        processes wrote for the re-submitted match ids.
    """
    gpt_history = manager.dict()
    procs = []

    for key, val in history.items():
        if val[1] == -1:
            # All unresolved entries are launched at once; there is no cap on the
            # number of concurrent processes here.
            i_a, i_b = determined_pairs[key]
            proc = Process(target=handle_request, args=(i_a, i_b, texts, gpt_history, key))
            procs.append(proc)
            proc.start()
        else:
            # Already resolved: keep the existing entry as is.
            gpt_history[key] = val

    # Wait for every worker before converting the shared dict.
    for proc in procs:
        proc.join()

    return dict(gpt_history)

In [11]:
def regen_coin_flip(history):
    """Resolve every unresolved comparison in ``history`` by a coin flip.

    An entry is unresolved when its winner field (``val[1]``) is ``-1``. For
    such entries the winner is replaced by one of the two ids of the pair
    (``val[0]``), chosen uniformly at random; pair and timestamp are kept.

    Args:
        history: mapping from match id to ``((i_a, i_b), winner, timestamp)``.
            It is modified in place.

    Returns:
        The same ``history`` object that was passed in.
    """
    # Imported locally because the notebook's import cell does not bring in `random`.
    import random

    for key, val in history.items():
        if val[1] == -1:
            pair = val[0]
            # Only the value of an existing key is replaced, so the dict does not
            # change size while it is being iterated.
            history[key] = (pair, pair[random.randint(0, 1)], val[2])
    return history

In [12]:
def check_broken_history(history, verbose=False):
    """Count the entries of ``history`` whose winner field is ``-1``.

    Args:
        history: mapping from match id to a tuple whose second element is the winner.
        verbose: when true, also print the pair stored in ``determined_pairs``
            for each such entry.

    Returns:
        The number of entries with winner ``-1``.
    """
    broken_ids = [match_id for match_id, entry in history.items() if entry[1] == -1]
    if verbose:
        for match_id in broken_ids:
            first, second = determined_pairs[match_id]
            print(first, second)
    return len(broken_ids)

In [13]:
def handle_request(i_a, i_b, texts, gpt_history, match_id):
    """Ask the model to compare two texts and record the outcome.

    Uses the notebook-level ``chain`` to send both texts to the model.

    Args:
        i_a: id of the text presented as "Text A".
        i_b: id of the text presented as "Text B".
        texts: mapping from text id to an object exposing ``get_text()``.
        gpt_history: mapping that receives the result; when run in a worker
            process this is the shared manager dict.
        match_id: key under which the result is stored.

    Stores ``((i_a, i_b), winner, timestamp)`` in ``gpt_history[match_id]``;
    ``winner`` is ``-1`` when the reply is neither "A" nor "B".
    """
    # Timestamp is taken before the request is sent.
    system_time = datetime.now().strftime("%H:%M:%S")

    a = texts[i_a].get_text()
    b = texts[i_b].get_text()
    res = chain.invoke({"TEXT_A": a, "TEXT_B":  b})

    # Ignore surrounding whitespace/newlines so replies such as "A\n" are not
    # discarded as invalid.
    answer = res.strip().lower()

    # NOTE(review): reply "A" records i_b and reply "B" records i_a. Since the prompt asks
    # which text is simpler, confirm this mapping matches what downstream code expects.
    if answer == 'a':
        winner = i_b
    elif answer == 'b':
        winner = i_a
    else:
        winner = -1

    entry = ((i_a, i_b), winner, system_time)
    gpt_history[match_id] = entry

In [14]:
# Multiprocessing manager; its dict() proxies are what the worker processes write results into.
manager = Manager()

In [None]:
# Maximum number of worker processes (and therefore requests) running at the same time.
num_processes = 30
procs = []

# Shared dict the workers write their results into.
gpt_history = manager.dict()
open_indices = list(range(len(determined_pairs)))

with tqdm(total = len(determined_pairs)) as pbar:
    while open_indices:
        # Start a fresh batch each round so finished processes are not joined again
        # and their handles can be released.
        procs = []
        for _ in range(min(num_processes, len(open_indices))):
            current_index = open_indices.pop()
            i_a, i_b = determined_pairs[current_index]
            proc = Process(target=handle_request, args=(i_a, i_b, texts, gpt_history, current_index))
            procs.append(proc)
            proc.start()

        # Wait for the whole batch; the bar advances as each worker finishes.
        for proc in procs:
            proc.join()
            pbar.update(1)


# Convert the manager proxy into a plain dict for later use.
gpt_history = dict(gpt_history)

In [None]:
#load history
gpt_history = pickle.load(open(f"/workspace/{model_name}-{num_texts}_history.pkl", "rb"))

In [18]:
# Persist the current history; the context manager guarantees the file is flushed and closed.
path = f"/workspace/{model_name}-{num_texts}_2_history.pkl"
with open(path, "wb") as history_file:
    pickle.dump(gpt_history, history_file)
print(f"saved history at {path}")

saved history at /workspace/gpt-4-1106-preview-94_2_history.pkl
