In [None]:
#import libraries
import os 
import io
import openai
from transformers.models.imagegpt.modeling_imagegpt import IMAGEGPT_INPUTS_DOCSTRING
from transformers import GPT2Tokenizer
import pandas as pd 
from pandas.io import json
from numpy import nan
import time
import csv
import shutil
import datetime
import pytz
import re
import matplotlib.pyplot as plt
import numpy as np
import random
import multiprocessing

In [None]:
# Display the CPU count reported by multiprocessing; as the last expression of
# the cell its value is shown as the cell output. A multiprocessing.Pool is
# created later in this notebook.
multiprocessing.cpu_count()

In [None]:
# Take the API key from the OPENAI_API_KEY environment variable so the secret
# never has to be written into the notebook. If the variable is unset, the
# original placeholder string is kept and must be replaced before running.
openai.api_key = os.environ.get("OPENAI_API_KEY", "OPENAI_API_KEY")
# Label embedded in the output directory name built by output_path().
section_no = "Final_Prompts"

In [None]:
# Generate All the variations 
import itertools
import random

def get_combinations(prompt_paths, model_engines, datasets, folds_LLL, folds_HPRD50, folds_IEPA):
    """Build a shuffled list of (prompt, model, dataset, fold) tuples.

    For every dataset in ``datasets`` the Cartesian product of
    ``prompt_paths`` x ``model_engines`` x that dataset's folds is generated.
    The combined list is shuffled in place with ``random.shuffle`` before it
    is returned, so the order differs between calls unless the ``random``
    module is seeded.

    Args:
        prompt_paths: iterable of prompt identifiers.
        model_engines: iterable of model names.
        datasets: iterable of dataset names; each must be one of
            ``'LLL'``, ``'HPRD50'`` or ``'IEPA'``.
        folds_LLL: folds paired with the ``'LLL'`` dataset.
        folds_HPRD50: folds paired with the ``'HPRD50'`` dataset.
        folds_IEPA: folds paired with the ``'IEPA'`` dataset.

    Returns:
        list of 4-tuples ``(prompt, model_engine, dataset, fold)``.

    Raises:
        KeyError: if ``datasets`` contains a name other than the three above.
    """
    # Each dataset has its own fold list.
    folds_by_dataset = {'LLL': folds_LLL, 'HPRD50': folds_HPRD50, 'IEPA': folds_IEPA}

    combinations = [
        (prompt, engine, name, fold)
        for name in datasets
        for prompt, engine, fold in itertools.product(
            prompt_paths, model_engines, folds_by_dataset[name]
        )
    ]

    # Randomise the processing order.
    random.shuffle(combinations)
    return combinations

In [None]:
#get user input
def user_input(file_path):
    """Read and return the full text of the file at ``file_path``.

    Args:
        file_path: path of the text file to read (opened in text mode with
            the platform default encoding).

    Returns:
        The file contents as a single string.

    Raises:
        FileNotFoundError: if ``file_path`` does not exist.
        Exception: any other error raised while opening or reading the file.

    In both failure cases a short message is printed first and the original
    exception is then re-raised. Previously the function fell through to
    ``return query`` with ``query`` unbound, which replaced the real error
    with a misleading ``UnboundLocalError``.
    """
    try:
        with open(file_path, 'r') as file:
            return file.read()
    except FileNotFoundError:
        print(f"{file_path} not found.")
        raise
    except Exception as e:
        print(f"An error occurred: {e}")
        raise

In [None]:
#define output path
def output_path(Run_no, dataset, temperature, prompt_type, model_engine):
    """Create (if needed) and return the output directory for one run.

    The directory has the form
    ``Output/<model_engine>_<section_no>_<temperature>/Prompt<prompt_type>/<dataset>/``
    followed by ``<dataset>_T<temperature>_<prompt_type>_Run<Run_no>/``,
    where ``section_no`` is the module-level label.

    Args:
        Run_no: run number; converted with ``str``.
        dataset: dataset name (string).
        temperature: sampling temperature; converted with ``str``.
        prompt_type: prompt variant name (string).
        model_engine: model name (string).

    Returns:
        The directory path as a string ending in ``/``.
    """
    temperature_label = str(temperature)

    # Top-level folder structure shared by all runs of this configuration.
    root_dir = "".join((
        "Output/", model_engine, "_", section_no, "_", temperature_label,
        "/Prompt", str(prompt_type), "/", dataset, "/",
    ))
    # Leaf folder that is unique per run.
    run_dir = "".join((
        dataset, "_T", temperature_label, "_", prompt_type, "_Run", str(Run_no), "/",
    ))

    target_dir = os.path.join(root_dir, run_dir)
    os.makedirs(target_dir, exist_ok=True)
    return target_dir

In [None]:
#count number of tokens
def count_input_tokens(folder_path, output_token_margin=200):
    """Return a token budget derived from the largest ``.txt`` file in a folder.

    Every file directly inside ``folder_path`` whose name ends in ``.txt`` is
    read as UTF-8 and tokenized with the pretrained ``'gpt2'`` tokenizer. The
    result is the largest token count found plus ``output_token_margin``.

    The original implementation ran a second pass that re-read and
    re-tokenized every file only to compute a value that was never used; that
    pass has been removed. The returned value is unchanged.

    Args:
        folder_path: directory containing the input ``.txt`` files.
        output_token_margin: number of tokens added on top of the largest
            input (default 200, the previously hard-coded value).

    Returns:
        int: ``max_input_tokens + output_token_margin``; equal to
        ``output_token_margin`` when the folder has no ``.txt`` files.
    """
    # NOTE(review): counts come from the GPT-2 tokenizer; the chat models
    # requested elsewhere in this file may tokenize differently, so treat the
    # result as an approximation -- confirm if exact budgets matter.
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    input_max_token = 0

    for file_name in os.listdir(folder_path):
        # Only text files are considered.
        if not file_name.endswith(".txt"):
            continue
        with open(os.path.join(folder_path, file_name), "r", encoding='utf-8') as f:
            num_tokens = len(tokenizer.encode(f.read()))
        input_max_token = max(input_max_token, num_tokens)

    return input_max_token + output_token_margin

In [None]:
#ChatGPT Completion
def get_completion(BACKOFF_OCCURRED_, model, query, Sentences, max_tokens, temperature):
    """Request one chat completion, retrying on rate-limit / service errors.

    The prompt is ``query`` followed by ``Sentences`` (each on its own
    indented line) sent as a single user message. On ``RateLimitError``,
    ``ServiceUnavailableError`` or ``APIError`` the call is retried without an
    upper bound on the number of attempts; before each retry the function
    sleeps ``min(5 + 5 * tries**2, 60)`` seconds. While it sleeps the shared
    flag is set to ``True``, and it is reset to ``False`` afterwards.

    Args:
        BACKOFF_OCCURRED_: shared flag object exposing ``get_lock()`` and a
            writable ``value`` attribute.
        model: model name passed to the API.
        query: instruction text placed first in the prompt.
        Sentences: input text placed after ``query``.
        max_tokens: ``max_tokens`` value passed to the API.
        temperature: ``temperature`` value passed to the API.

    Returns:
        tuple ``(message, input_token, output_token, time_f, time_stamp, tries)``:
        the reply text, prompt and completion token counts reported by the
        API, the duration in seconds of the successful call, the current time
        in the ``US/Central`` zone as a string, and the number of failed
        attempts that preceded the successful one.
    """
    # The whitespace below is part of the text sent to the model; keep it exact.
    prompt = f"\n        {query}\n        {Sentences}\n        "
    messages = [{"role": "user", "content": prompt}]

    tries = 0
    while True:
        started = time.time()
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=messages,
                max_tokens=max_tokens,
                temperature=temperature,
            )
        except (openai.error.RateLimitError, openai.error.ServiceUnavailableError, openai.error.APIError) as e:
            tries += 1
            # Quadratic backoff, capped at 60 seconds.
            delay = min(5 + 5 * (tries ** 2), 60)

            # Raise the shared flag for the duration of the sleep.
            with BACKOFF_OCCURRED_.get_lock():
                BACKOFF_OCCURRED_.value = True
            time.sleep(delay)
            with BACKOFF_OCCURRED_.get_lock():
                BACKOFF_OCCURRED_.value = False
            print("Backoff Released\n")
            continue

        # Only the successful attempt is timed.
        time_f = time.time() - started
        break

    usage = response['usage']
    message = response['choices'][0]['message']['content']
    output_token = usage['completion_tokens']
    input_token = usage['prompt_tokens']

    # Timestamp of completion, expressed in US/Central time.
    central_now = datetime.datetime.now(pytz.utc).astimezone(pytz.timezone("US/Central"))
    time_stamp = str(central_now)
    return message, input_token, output_token, time_f, time_stamp, tries

In [None]:
#call chatGPT
def call_ChatGPT(BACKOFF_OCCURRED_, current_folds, Implementation_base_path_input, Implementation_base_path_output, Run, total_max_tokens, temperature, query, dataset, model):
    """Process one fold file: query the model, then save the reply and a timing row.

    The fold file ``current_folds`` is read from
    ``Implementation_base_path_input``. The reply is written to
    ``<Run>_<current_folds>`` inside ``Implementation_base_path_output``, and
    one comma-separated row is appended to ``<temperature>_time_track.csv``
    in the same directory. Any exception during an attempt is printed and the
    attempt is repeated, up to ``RETRY_COUNT`` (5) times with a one-second
    pause between attempts.

    Fix: the final "all retries failed" message used to be attached to the
    ``try`` statement (``try ... except ... else``). Because the ``try`` body
    ends in ``break``, that branch could never run. It is now the ``else``
    clause of the ``for`` loop, so it runs exactly when no attempt succeeded.

    Args:
        BACKOFF_OCCURRED_: shared flag object with a ``value`` attribute; while
            it is truthy this function waits before sending anything.
        current_folds: file name of the fold to process.
        Implementation_base_path_input: directory holding the fold files.
        Implementation_base_path_output: directory receiving the outputs.
        Run: run number, used in the output file name and the timing row.
        total_max_tokens: ``max_tokens`` value forwarded to ``get_completion``.
        temperature: sampling temperature forwarded to ``get_completion``.
        query: prompt text forwarded to ``get_completion``.
        dataset: dataset name recorded in the timing row.
        model: model name forwarded to ``get_completion``.

    Returns:
        None.
    """
    RETRY_COUNT = 5
    input_file_path = os.path.join(Implementation_base_path_input, f'{current_folds}')
    output_file_path = os.path.join(Implementation_base_path_output, f'{Run}_{current_folds}')
    time_track_path = os.path.join(Implementation_base_path_output, f'{temperature}_time_track.csv')

    # Hold off while the shared backoff flag is raised; the random 1-5 s pause
    # means waiting callers do not all re-check at the same moment.
    while BACKOFF_OCCURRED_.value:
        print("\nBackoff occurred! Pausing all threads for a set duration...")
        sleep_time = random.randint(1, 5)
        time.sleep(sleep_time)

    for attempt in range(RETRY_COUNT):
        try:
            with open(input_file_path) as f:
                Sentences = f.read()
            message, input_token, output_token, time_f, time_stamp, tries = get_completion(BACKOFF_OCCURRED_, model, query, Sentences=Sentences, max_tokens=total_max_tokens, temperature=temperature)

            # A reply whose last line contains "Done" is reported as complete.
            last_line = message.strip().split('\n')[-1]
            status = "Complete" if "Done" in last_line else "Possibly Incomplete"

            print(f"Fold ={current_folds}, Run= {Run}, Temperature={temperature}, {status}. output_file_path: {output_file_path}")

            with open(time_track_path, "a") as f:
                print(dataset, ',', Run, ',', current_folds, ',', temperature, ',', input_token, ',', output_token, ',', time_f, ',', time_stamp, ',', tries, file=f)

            with open(output_file_path, "w") as f:
                print(message, file=f)

            break

        except Exception as e:  # Deliberately broad: any failure triggers a retry.
            print(f"Error occurred: {e}. Retrying {attempt+1}/{RETRY_COUNT}. output_file_path: {output_file_path}")
            time.sleep(1)

    else:  # for-else: reached only when no attempt hit `break`, i.e. all attempts failed.
        print(f"All {RETRY_COUNT} retries failed for fold {current_folds} at Run {Run}. output_file_path: {output_file_path}")


In [None]:
#execution code
# Flag object passed to call_ChatGPT / get_completion: get_completion sets it
# to True while it sleeps after an API error, and call_ChatGPT polls it before
# sending a request.
# NOTE(review): whether worker processes actually share this one object
# depends on the multiprocessing start method in use -- confirm on the target
# platform.
BACKOFF_OCCURRED_ = multiprocessing.Value('b', False)  # typecode 'b' is a signed char, used here as a 0/1 flag

def execute_code(run_no, prompt_type, model_engine, dataset, current_folds):
    """Run one (run, prompt, model, dataset, fold) task end to end.

    Resolves the prompt file for ``prompt_type``, reads it, prepares the
    output directory, computes the token budget for the dataset's fold
    directory and then calls ``call_ChatGPT`` with temperature ``0.0``.

    Args:
        run_no: run number, used in output paths.
        prompt_type: ``'BASE'`` (one prompt file for all datasets) or
            ``'WD'`` / ``'WND'`` (one prompt file per dataset).
        model_engine: model name passed to the API.
        dataset: dataset name; selects ``Datasets/<dataset>/10fold/``.
        current_folds: file name of the fold to process.

    Raises:
        ValueError: if ``prompt_type`` is not one of the three known values.
            Previously this case left ``prompt_path`` unbound and failed with
            an ``UnboundLocalError``.
    """
    print(run_no, prompt_type, model_engine, dataset, current_folds)

    if prompt_type == 'BASE':
        prompt_path = f"Prompts/Final_Prompts/P60_S3_{prompt_type}.txt"
    elif prompt_type in ('WD', 'WND'):
        # Both variants use the same dataset-specific file naming scheme.
        prompt_path = f"Prompts/Final_Prompts/P60_S3_{prompt_type}_{dataset}.txt"
    else:
        raise ValueError(
            f"Unknown prompt_type {prompt_type!r}; expected 'BASE', 'WD' or 'WND'."
        )

    temperature = 0.0
    query = user_input(prompt_path)
    Implementation_base_path_input = 'Datasets/' + dataset + '/10fold/'
    Implementation_base_path_output = output_path(run_no, dataset, temperature, prompt_type, model_engine)
    total_max_tokens = count_input_tokens(Implementation_base_path_input)
    # BACKOFF_OCCURRED_ is the module-level flag; it is only read here, so no
    # `global` declaration is required.
    call_ChatGPT(BACKOFF_OCCURRED_, current_folds, Implementation_base_path_input, Implementation_base_path_output, run_no, total_max_tokens, temperature, query, dataset, model=model_engine)



In [None]:
#main function
if __name__ == '__main__':
    # Experiment grid: every prompt variant is run against every model and dataset.
    prompt_types = ['BASE', 'WD', 'WND']
    model_engines = ["gpt-3.5-turbo-0613", "gpt-4-0613"]
    datasets = ["LLL", "HPRD50", "IEPA"]

    # LLL and HPRD50 each have ten fold files: fold1.txt ... fold10.txt.
    folds_LLL = [f"fold{i}.txt" for i in range(1, 11)]
    folds_HPRD50 = list(folds_LLL)
    # IEPA has ten folds, each stored as part files numbered 1 to 3.
    folds_IEPA = [
        f"fold{i}_part{j}.txt"
        for i, j in itertools.product(range(1, 11), range(1, 4))
    ]

    # Size of the worker pool created for each run.
    no_of_workers = 20

    # Ten runs; each run gets a freshly shuffled task list and its own pool.
    for Run_no in range(1, 11):
        all_combinations = get_combinations(
            prompt_paths=prompt_types,
            model_engines=model_engines,
            datasets=datasets,
            folds_LLL=folds_LLL,
            folds_HPRD50=folds_HPRD50,
            folds_IEPA=folds_IEPA,
        )
        print(f"Total number of combinations: {len(all_combinations)}")

        # Prefix every task with the run number to match execute_code's signature.
        args = [(Run_no, *combo) for combo in all_combinations]

        with multiprocessing.Pool(no_of_workers) as pool:
            pool.starmap(execute_code, args)
            time.sleep(1)

print("Done")