In [None]:
import os 
import io
import openai
from transformers.models.imagegpt.modeling_imagegpt import IMAGEGPT_INPUTS_DOCSTRING
from transformers import GPT2Tokenizer
import pandas as pd 
from pandas.io import json
from numpy import nan
import time
import csv
import shutil
import datetime
import pytz
import re
import matplotlib.pyplot as plt
import numpy as np
import multiprocessing
import itertools
import random

In [None]:
# Show how many CPUs the machine reports (rendered as this cell's output).
multiprocessing.cpu_count()

In [None]:
# Take the API key from the OPENAI_API_KEY environment variable so the real
# secret never has to be written into (or committed with) the notebook.
# The original placeholder literal is kept as the fallback, so behaviour is
# unchanged when the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "OPENAI_API_KEY")
# Prompt-section label; output_path() embeds it in the output directory name.
section_no = "PROTEIN_Prompts"

In [None]:
# Generate all the variations
def get_combinations(prompt_paths, model_engines, datasets, folds_LLL, folds_HPRD50, folds_IEPA):
    """Return every (prompt, model, dataset, fold) tuple in random order.

    Args:
        prompt_paths: Iterable of prompt identifiers.
        model_engines: Iterable of model names.
        datasets: Dataset names to include; each must be 'LLL', 'HPRD50' or 'IEPA'.
        folds_LLL: Fold file names belonging to the 'LLL' dataset.
        folds_HPRD50: Fold file names belonging to the 'HPRD50' dataset.
        folds_IEPA: Fold file names belonging to the 'IEPA' dataset.

    Returns:
        A list of 4-tuples, shuffled in place with ``random.shuffle``.

    Raises:
        KeyError: If ``datasets`` contains a name other than the three above.
    """
    folds_by_dataset = {'LLL': folds_LLL, 'HPRD50': folds_HPRD50, 'IEPA': folds_IEPA}
    # Each dataset is only crossed with its own fold list.
    combinations = [
        combo
        for name in datasets
        for combo in itertools.product(prompt_paths, model_engines, [name], folds_by_dataset[name])
    ]
    random.shuffle(combinations)
    return combinations

In [None]:
def user_input(file_path):
    """Read and return the whole text of the prompt file at ``file_path``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents as a single string.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist. The problem is
            printed first, then the exception is re-raised.
        Exception: Any other error from opening or reading the file is
            printed and re-raised unchanged.
    """
    try:
        with open(file_path, 'r') as file:
            return file.read()
    except FileNotFoundError:
        print(f"{file_path} not found.")
        # Re-raise the real error: falling through used to end in an
        # UnboundLocalError on the never-assigned result variable.
        raise
    except Exception as e:
        print(f"An error occurred: {e}")
        raise

In [None]:
def output_path(Run_no, dataset, temperature, prompt_no, model_engine):
    """Create (if missing) and return the output directory for one run.

    The directory has the form
    ``Output/<model_engine>_<section_no>_<temperature>/<prompt_no>/<dataset>/<dataset>_T<temperature>_<prompt_no>_Run<Run_no>/``
    where ``section_no`` is the module-level setting.

    Args:
        Run_no: Run index, converted with ``str``.
        dataset: Dataset name (string).
        temperature: Sampling temperature, converted with ``str``.
        prompt_no: Prompt identifier (string).
        model_engine: Model name (string).

    Returns:
        The directory path as a string ending in ``/``.
    """
    temperature_label = str(temperature)
    parent_dir = "".join([
        "Output/", model_engine, "_", section_no, "_", temperature_label,
        "/", str(prompt_no), "/", dataset, "/",
    ])
    run_dir = "".join([
        dataset, "_T", temperature_label, "_", prompt_no, "_Run", str(Run_no), "/",
    ])
    target_dir = os.path.join(parent_dir, run_dir)
    os.makedirs(target_dir, exist_ok=True)
    return target_dir

In [None]:
def count_input_tokens(folder_path, tokens_per_line=10, base_tokens=30):
    """Derive the completion-token budget from the ``.txt`` files in a folder.

    Every ``.txt`` file in ``folder_path`` is read once as UTF-8. Its line
    count and GPT-2 token count are printed, and the budget is computed from
    the largest line count:
    ``max_lines * tokens_per_line + base_tokens``.

    NOTE: the token counts come from the GPT-2 tokenizer and are only printed;
    the returned budget depends on line counts alone.

    Args:
        folder_path: Directory containing the input ``.txt`` files.
        tokens_per_line: Output tokens allowed per input line (default 10).
        base_tokens: Fixed number of tokens added to the budget (default 30).

    Returns:
        The budget as an int (``base_tokens`` when there are no ``.txt`` files).
    """
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    max_num_input_lines = 0
    # (file_name, num_tokens) pairs kept so the summary pass below does not
    # have to re-read and re-tokenise every file.
    token_counts = []

    for file_name in os.listdir(folder_path):
        if not file_name.endswith(".txt"):
            continue
        with open(os.path.join(folder_path, file_name), "r", encoding='utf-8') as f:
            file_contents = f.read()

        # Same number readlines() would give: one per newline, plus one for a
        # final line that lacks a trailing newline.
        num_input_line = file_contents.count("\n")
        if file_contents and not file_contents.endswith("\n"):
            num_input_line += 1
        print("num_input_line:", num_input_line)

        num_tokens = len(tokenizer.encode(file_contents))
        max_num_input_lines = max(max_num_input_lines, num_input_line)
        print(f"{file_name}: {num_tokens} tokens")
        token_counts.append((file_name, num_tokens))

    total_max_tokens = max_num_input_lines * tokens_per_line + base_tokens

    # Per-file summary: input tokens plus the shared output budget.
    for file_name, num_tokens in token_counts:
        total = num_tokens + total_max_tokens
        print(f"{file_name}: Input: {num_tokens} tokens, Output: {total_max_tokens} : Total: {total}")
    print(total_max_tokens)
    return total_max_tokens

In [None]:
# ChatGPT completion with retry/backoff
def get_completion(BACKOFF_OCCURRED_, model, query, Sentences, max_tokens,  temperature):
    """Send one chat-completion request, retrying on transient API errors.

    The prompt is ``query`` followed by ``Sentences``, each on its own
    indented line. On ``RateLimitError``, ``ServiceUnavailableError`` or
    ``APIError`` the call is retried without any upper limit on attempts.
    Before each retry the function sleeps ``min(5 + 5 * tries**2, 60)``
    seconds, and ``BACKOFF_OCCURRED_`` is set truthy for the duration of
    that sleep.

    Args:
        BACKOFF_OCCURRED_: Object offering ``get_lock()`` and a writable
            ``value`` attribute, used as a "backing off" flag.
        model: Model name passed to the API.
        query: Instruction text placed first in the prompt.
        Sentences: Input text placed after the instruction.
        max_tokens: Completion token limit passed to the API.
        temperature: Sampling temperature passed to the API.

    Returns:
        Tuple ``(message, input_token, output_token, time_f, time_stamp, tries)``:
        reply text, prompt token count, completion token count, duration in
        seconds of the successful call, current US/Central time as a string,
        and the number of failed attempts.
    """
    pad = " " * 8  # the prompt lines carry the same eight-space indent as before
    prompt = f"\n{pad}{query}\n{pad}{Sentences}\n{pad}"
    max_backoff = 60
    tries = 0

    while True:
        started_at = time.time()
        try:
            response = openai.ChatCompletion.create(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=max_tokens,
                temperature=temperature,
            )
        except (openai.error.RateLimitError, openai.error.ServiceUnavailableError, openai.error.APIError):
            tries += 1
            wait_seconds = min(5 + 5*(tries ** 2), max_backoff)
            # Raise the flag while sleeping, lower it again afterwards.
            with BACKOFF_OCCURRED_.get_lock():
                BACKOFF_OCCURRED_.value = True
            time.sleep(wait_seconds)
            with BACKOFF_OCCURRED_.get_lock():
                BACKOFF_OCCURRED_.value = False
            print("Backoff Released\n")
        else:
            time_f = time.time() - started_at
            break

    usage = response['usage']
    message = response['choices'][0]['message']['content']
    output_token = usage['completion_tokens']
    input_token = usage['prompt_tokens']

    # Timestamp the result in US/Central local time.
    now_central = datetime.datetime.now(pytz.utc).astimezone(pytz.timezone("US/Central"))
    time_stamp = str(now_central)
    return message, input_token, output_token, time_f, time_stamp, tries

In [None]:
import os
import time

def call_ChatGPT(BACKOFF_OCCURRED_, current_folds, Implementation_base_path_input, Implementation_base_path_output, Run, total_max_tokens, temperature, query, dataset, model):
    """Run one fold through the model and write the reply plus a timing row.

    The function first waits while ``BACKOFF_OCCURRED_.value`` is truthy.
    It then makes up to ``RETRY_COUNT`` attempts to read the fold file, call
    ``get_completion``, append a row to ``<temperature>_time_track.csv`` and
    write the reply to ``<Run>_<current_folds>`` in the output directory.
    Any exception in an attempt is printed and followed by a one-second
    pause; if every attempt fails a final message is printed.

    Args:
        BACKOFF_OCCURRED_: Flag object with a ``value`` attribute; also
            forwarded to ``get_completion``.
        current_folds: File name of the fold inside the input directory.
        Implementation_base_path_input: Directory holding the fold files.
        Implementation_base_path_output: Directory receiving the outputs.
        Run: Run index, used in the output file name and the timing row.
        total_max_tokens: Completion token limit for the request.
        temperature: Sampling temperature for the request.
        query: Instruction text of the prompt.
        dataset: Dataset name recorded in the timing row.
        model: Model name for the request.

    Returns:
        None.
    """
    RETRY_COUNT = 5
    input_file_path = os.path.join(Implementation_base_path_input, f'{current_folds}')
    output_file_path = os.path.join(Implementation_base_path_output, f'{Run}_{current_folds}')
    time_track_path = os.path.join(Implementation_base_path_output, f'{temperature}_time_track.csv')

    # Hold off while the backoff flag is raised; the random delay staggers
    # the moment the waiting workers resume.
    while BACKOFF_OCCURRED_.value:
        print("\nBackoff occurred! Pausing all threads for a set duration...")
        sleep_time = random.randint(1, 5)
        time.sleep(sleep_time)

    for attempt in range(RETRY_COUNT):
        try:
            # UTF-8, matching how count_input_tokens reads these same files.
            with open(input_file_path, encoding='utf-8') as f:
                Sentences = f.read()
            message, input_token, output_token, time_f, time_stamp, tries = get_completion(BACKOFF_OCCURRED_, model, query, Sentences=Sentences, max_tokens=total_max_tokens, temperature=temperature)

            # The reply counts as complete when its last line contains "Done".
            last_line = message.strip().split('\n')[-1]
            status = "Complete" if "Done" in last_line else "Possibly Incomplete"

            print(f"Fold ={current_folds}, Run= {Run}, Temperature={temperature}, {status}. output_file_path: {output_file_path}")

            with open(time_track_path, "a") as f:
                print(dataset, ',', Run, ',', current_folds, ',', temperature, ',', input_token, ',', output_token, ',', time_f, ',', time_stamp, ',', tries, file=f)

            with open(output_file_path, "w") as f:
                print(message, file=f)

            break

        except Exception as e:  # Catch general exceptions. Be specific if you know which exceptions to expect
            print(f"Error occurred: {e}. Retrying {attempt+1}/{RETRY_COUNT}. output_file_path: {output_file_path}")
            time.sleep(1)

    else:
        # for/else: reached only when the loop ran out without a `break`,
        # i.e. every attempt failed. (This clause was previously attached to
        # the `try`, where it could never execute.)
        print(f"All {RETRY_COUNT} retries failed for fold {current_folds} at Run {Run}. output_file_path: {output_file_path}")

In [None]:
import random
import multiprocessing
import time

# Setting up the shared variable
# execute_code passes this flag to call_ChatGPT, which waits while it is truthy;
# get_completion sets it around each backoff sleep.
BACKOFF_OCCURRED_ = multiprocessing.Value('b', False)  # typecode 'b' is a signed char, used here as a 0/1 flag

def execute_code(run_no, prompt_no, model_engine, dataset, current_folds):
    """Process one (run, prompt, model, dataset, fold) work item.

    Loads the prompt text, works out the input and output directories,
    computes the token budget for the dataset's input folder and hands
    everything to ``call_ChatGPT``. The temperature is fixed at 0.0.

    Args:
        run_no: Run index.
        prompt_no: Prompt identifier; the prompt is read from
            ``Prompts/PROTEIN_Prompts/<prompt_no>.txt``.
        model_engine: Model name.
        dataset: Dataset name (string), used to locate the input folder.
        current_folds: File name of the fold to process.
    """
    temperature = 0.0
    query = user_input(f"Prompts/PROTEIN_Prompts/{prompt_no}.txt")

    input_dir = "".join(['Datasets/PROTEIN_DATA/', dataset, '/PROTEIN_splitted_all_sentences/input/'])
    print(input_dir)
    output_dir = output_path(run_no, dataset, temperature, prompt_no, model_engine)
    token_budget = count_input_tokens(input_dir)

    # The module-level flag is only read here, so no `global` statement is needed.
    call_ChatGPT(
        BACKOFF_OCCURRED_,
        current_folds,
        input_dir,
        output_dir,
        run_no,
        token_budget,
        temperature,
        query,
        dataset,
        model=model_engine,
    )

if __name__ == '__main__':
    prompt_no = ['Prompt15']
    model_engines = ["gpt-3.5-turbo-0613", "gpt-4-0613"]
    datasets = ["LLL", "HPRD50", "IEPA"]

    # LLL: folds 4 and 9 are stored as part1/part2 files, the others as single files.
    folds_LLL = (
        [f"fold{i}.txt" for i in range(1, 11) if i not in (4, 9)]
        + [f"part{j}_fold{k}.txt" for k in (4, 9) for j in (1, 2)]
    )

    # HPRD50: only fold 4 is stored as part1/part2 files.
    folds_HPRD50 = (
        [f"fold{i}.txt" for i in range(1, 11) if i != 4]
        + [f"part{j}_fold4.txt" for j in (1, 2)]
    )

    # IEPA: every fold is stored as part1/part2 files; fold 1 also has a part3.
    folds_IEPA = [f"part{j}_fold{i}.txt" for i in range(1, 11) for j in (1, 2)]
    folds_IEPA.append("part3_fold1.txt")

    # Fixed pool size (not derived from the CPU count).
    no_of_workers = 20

    for Run_no in range(1, 11):
        # Fresh, reshuffled list of work items for this run.
        all_combinations = get_combinations(prompt_no, model_engines, datasets, folds_LLL, folds_HPRD50, folds_IEPA)
        print(f"Total number of combinations: {len(all_combinations)}")

        # Prefix each combination with the run number to form the starmap arguments.
        args = [(Run_no, *combo) for combo in all_combinations]
        print("\n\n\n", args)

        with multiprocessing.Pool(no_of_workers) as pool:
            pool.starmap(execute_code, args)
            time.sleep(1)