In [1]:
import json
import os
import sys
sys.path.insert(0, '..')
from utility import REPO_ROOT_PATH, get_GPT_response, client
from tqdm import tqdm
import ast
import numpy as np
import math


In [2]:
def eds_check(text):
    """Ask the chat model whether a single message is related to EDS.

    Args:
        text: The user message to classify.

    Returns:
        Whatever ``get_GPT_response`` returns for the request. The system
        prompt asks the model to answer with a bare ``True``/``False`` and
        nothing else (the reply is not parsed or validated here).
    """
    # Spelled out piece by piece so the trailing spaces and the indentation
    # that are part of the prompt text stay visible.
    instructions = (
        "You are an expert in Ehlers-Danlos syndrome (EDS). \n"
        "    Given the user message, you analyse it and see if it is related to EDS or not.\n"
        "    If it is related to EDS, return True, else return False. \n"
        "    Do not return any other explanation, only return the boolean.\n"
        "    "
    )
    model_name = os.environ.get('MODEL_NAME')
    return get_GPT_response(text, instructions, model_name, temperature=0.3)


def batch_eds_check(text_batch):
    """Classify a batch of messages as EDS-related or not in one chat call.

    The batch is JSON-encoded into a single user message and sent together
    with a system prompt that asks for one ``'True'``/``'False'`` verdict per
    element. The model named by the ``MODEL_NAME`` environment variable is used.

    Args:
        text_batch: JSON-serialisable sequence of message strings.

    Returns:
        A list holding the verdicts found in the model's reply, with genuine
        booleans converted to the strings ``'True'``/``'False'`` so callers can
        compare against ``'True'`` either way. ``None`` is returned when the
        reply cannot be parsed as a Python list/tuple literal. The list is NOT
        guaranteed to have the same length as ``text_batch``; callers should
        check that before aligning verdicts with messages.
    """
    system_prompt = (
        "You are an expert in Ehlers-Danlos syndrome (EDS).\n"
        "    Given the array of user messages, you analyse every element of the array and see if each element of the array is related to EDS or not.\n"
        "    If it is related to EDS, return True, else return False. \n"
        "    Do not return any other explanation, only return a list of booleans.\n"
        "    For example:\n"
        '    if the input is ["what are the symptoms of EDS", "what is the capital of france"]\n'
        "    your response should be an array as follows:\n"
        "    ['True', 'False']\n"
        "    "
    )

    # The system instruction goes first: that is the conventional message
    # order, and some chat back-ends reject or ignore a trailing system turn.
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": json.dumps(text_batch)},
    ]

    completion = client.chat.completions.create(
        model=os.environ.get('MODEL_NAME'),
        messages=messages,
        max_tokens=1000,
    )

    try:
        verdicts = ast.literal_eval(completion.choices[0].message.content)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError,
            IndexError, AttributeError):
        # Unparseable, empty or missing reply: keep the best-effort contract
        # (None) without also swallowing KeyboardInterrupt/SystemExit.
        return None

    if not isinstance(verdicts, (list, tuple)):
        # A lone literal (e.g. a single bool or a dict) cannot be matched to the batch.
        return None

    # The prompt says "booleans" but shows strings; accept both spellings.
    return [str(verdict) if isinstance(verdict, bool) else verdict for verdict in verdicts]
    





In [3]:
# The `eds_data` directory under the repository root; the data paths used
# below are resolved against it.
DATA_PATH = os.path.join(REPO_ROOT_PATH, 'eds_data')


In [4]:
# Sub-path, relative to DATA_PATH, of the folder this notebook reads its input
# files from and writes its verified output under.
reddit_data_path = 'reddit_data/reddit_data_2'


In [5]:
# Names of every entry in the input folder; no filtering happens at this point.
reddit_dir = os.path.join(DATA_PATH, reddit_data_path)
files = os.listdir(reddit_dir)


In [6]:
# Collect the records of each input file into one flat list.
sel_data = []
# Sorted so the record order does not depend on the arbitrary order in which
# os.listdir reports the entries.
for file in sorted(files):
    file_path = os.path.join(DATA_PATH, reddit_data_path, file)
    # This notebook later writes its result into a `verified_data` sub-folder
    # of this same directory; open() on a directory raises, so anything that
    # is not a regular file is skipped to keep the cell re-runnable.
    if not os.path.isfile(file_path):
        continue
    # JSON is UTF-8 by specification; do not depend on the platform default.
    with open(file_path, 'r', encoding='utf-8') as f:
        sel_data.extend(json.load(f))


In [12]:
%%time

batch_size = 50
# Plain list slicing: unlike np.array_split it copes with an empty `sel_data`
# (zero sections is an error there) and keeps the records as ordinary dicts.
sel_data_batch = [
    sel_data[start:start + batch_size]
    for start in range(0, len(sel_data), batch_size)
]
nbatch = len(sel_data_batch)

verified_data = []
skipped_batches = 0  # batches whose model reply could not be used
for data_batch in tqdm(sel_data_batch):
    data_batch_text = [record['instruction'] for record in data_batch]
    response = batch_eds_check(data_batch_text)
    # Exactly one verdict per message is required; with any other length the
    # verdicts cannot be matched to the messages, so the batch is skipped and
    # counted instead of being silently dropped or mis-aligned.
    if (
        not isinstance(response, (list, tuple))
        or len(response) != len(data_batch_text)
    ):
        skipped_batches += 1
        continue
    # Accept both the string 'True' (as in the prompt's example) and a real boolean.
    verified_data.extend(
        text
        for text, verdict in zip(data_batch_text, response)
        if verdict is True or verdict == 'True'
    )

print(f'{skipped_batches} of {nbatch} batches skipped (unusable model reply)')
    
    
    


100%|█████████████████████████████████████████| 165/165 [09:06<00:00,  3.31s/it]

CPU times: user 4.2 s, sys: 257 ms, total: 4.46 s
Wall time: 9min 6s





In [46]:
# Keep the full records whose instruction text is present in `verified_data`.
# Membership is tested against a set so each lookup is constant-time instead
# of a scan over the whole `verified_data` list for every record.
verified_instructions = set(verified_data)
verified_data_json = [
    item for item in sel_data if item['instruction'] in verified_instructions
]
            
    
        

In [50]:
# Persist the verified records as JSON under the input folder's `verified_data` sub-folder.
verified_dir = os.path.join(DATA_PATH, reddit_data_path, 'verified_data')
# open(..., 'w') does not create missing parent directories, so make sure the
# target folder exists before writing (no error if it is already there).
os.makedirs(verified_dir, exist_ok=True)
with open(os.path.join(verified_dir, 'reddit_data_2_verified.json'), 'w') as f:
    json.dump(verified_data_json, f)
    

In [8]:
# %%time

# batch_size = 100
# nbatch = math.ceil(len(sel_data)/batch_size)
# sel_data_batch = np.array_split(sel_data, nbatch)

# verified_data = []
# for index, data_batch in tqdm(enumerate(sel_data_batch)):
#     data_batch_text = list(map(lambda x:x['instruction'], data_batch))
#     response = batch_eds_check(data_batch_text)
#     if response:
#         try:
#             data_batch_text_sel = np.array(data_batch_text)[np.where(np.array(response) == 'True')[0]]
#             verified_data.extend(data_batch_text_sel)
#         except:
#             for item in data_batch_text:
#                 response = eds_check(item)
#                 if response == 'True':
#                     verified_data.extend([item])
#     else:
#         for item in data_batch_text:
#             response = eds_check(item)
#             if response == 'True':
#                 verified_data.extend([item])
    
    


In [9]:
# %%time

# import ast

# system_prompt = '''You are an expert in Ehlers-Danlos syndrome (EDS).
# Given the array of user messages, you analyse every element of the array and see if each element of the array is related to EDS or not.
# If it is related to EDS, return True, else return False. 
# Do not return any other explanation, only return an array of booleans.
# For example:
# if the input is ['what are the symptoms of EDS', 'what is the capital of france']
# your response should be:
# ['True', 'False']
# '''

# promptsArray = sel_data

# stringifiedPromptsArray = json.dumps(promptsArray)

# # print(promptsArray)

# prompts = [
#     {
#     "role": "user",
#     "content": stringifiedPromptsArray
# }
# ]

# batchInstruction = {
#     "role":"system",
#     "content":system_prompt
# }

# prompts.append(batchInstruction)

# print("ChatGPT: ")
# stringifiedBatchCompletion = client.chat.completions.create(model=os.environ.get('MODEL_NAME'),
#                                          messages=prompts,
#                                          max_tokens=1000)
# batchCompletion = ast.literal_eval(stringifiedBatchCompletion.choices[0].message.content)




In [10]:


# system_prompt = 'you are a helpful assistant'


# promptsArray = ["Hello world, from", "How are you B", "I am fine. W", "The  fifth planet from the Sun is "]

# stringifiedPromptsArray = json.dumps(promptsArray)

# print(promptsArray)

# prompts = [
#     {
#     "role": "user",
#     "content": stringifiedPromptsArray
# }
# ]

# batchInstruction = {
#     "role":"system",
#     "content":"Complete every element of the array. Reply with an array of all completions."
# }

# prompts.append(batchInstruction)

# print("ChatGPT: ")
# stringifiedBatchCompletion = client.chat.completions.create(model=os.environ.get('MODEL_NAME'),
#                                          messages=prompts,
#                                          max_tokens=1000)
# batchCompletion = json.loads(stringifiedBatchCompletion.choices[0].message.content)
# print(batchCompletion)
