In [None]:
import os 
import io
from transformers.models.imagegpt.modeling_imagegpt import IMAGEGPT_INPUTS_DOCSTRING
from transformers import GPT2Tokenizer
import pandas as pd 
from pandas.io import json
from numpy import nan
import time
import csv
import shutil
import datetime
import pytz
import re
import matplotlib.pyplot as plt
import numpy as np
import time

In [None]:
import json
import torch
from transformers import (AutoTokenizer,
                          AutoModelForCausalLM,
                          BitsAndBytesConfig,
                          pipeline)


In [None]:
# Hugging Face access token passed to the `from_pretrained` calls below.
# Read it from the HF_TOKEN environment variable so the secret does not have to
# be stored in the notebook; the literal placeholder is kept only as a fallback
# so existing workflows that edit this line keep working.
HF_TOKEN = os.environ.get("HF_TOKEN", "your_token")

In [None]:
# `model_name` is the checkpoint identifier handed to the `from_pretrained`
# calls; `model_engine` is the short label used when building output folders.
model_name, model_engine = "huggingface_model_name", "model_engine_spec"

In [None]:
# Quantization settings passed to the model loader: 4-bit weights in the "nf4"
# format with double quantization enabled, and bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    load_in_4bit=True,
)

In [None]:
# Tokenizer for the checkpoint named by `model_name`, fetched with HF_TOKEN.
tokenizer = AutoTokenizer.from_pretrained(model_name, token=HF_TOKEN)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Count the visible CUDA devices and give each the same memory budget string;
# the resulting {device_index: budget} mapping is passed as `max_memory`.
n_gpus = torch.cuda.device_count()
print("N GPUS: ", n_gpus)
max_memory = "40960MB"

# Load the causal LM with the 4-bit quantization config defined earlier.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    token=HF_TOKEN,
    quantization_config=bnb_config,
    device_map="auto",
    max_memory=dict.fromkeys(range(n_gpus), max_memory),
)

In [None]:
# Text-generation pipeline wrapping the loaded model and tokenizer.
# `do_sample` and `top_p` are intentionally not set here, so they keep whatever
# defaults the pipeline/model provide.
# NOTE(review): the 0.1 below is independent of the `temperature` variable that
# labels the output folders — confirm that this is intended.
text_generator = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=4096,
    temperature=0.1,
)

In [None]:
def get_response_LLAMA(prompt):
  """Run `prompt` through the module-level `text_generator` pipeline.

  Returns the "generated_text" entry of the first sequence the pipeline yields.
  """
  return text_generator(prompt)[0]["generated_text"]

In [None]:
################## User input in txt file #####################
def user_input(file_path='prompt_path', dataset="dataset_name", temperature=0.0):
    """Load the query text from a file and bundle it with the run settings.

    Args:
        file_path: Path of the text file that holds the query/prompt. Defaults
            to 'prompt_path'.
        dataset: Dataset label; returned unchanged. Defaults to "dataset_name".
        temperature: Temperature setting; returned unchanged. Defaults to 0.0.

    Returns:
        A ``(dataset, temperature, query)`` tuple, where ``query`` is the whole
        content of ``file_path``.

    Raises:
        OSError: If ``file_path`` cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 (as the dataset files are read in the
    # token-count helper) instead of relying on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        query = file.read()

    # Echo the settings so the configuration is visible in the cell output.
    print(dataset)
    print(temperature)
    print(query)

    return dataset, temperature, query

# Resolve the run configuration once, at module level.
dataset, temperature, query = user_input()

# Map the dataset label to its input directory. An unknown label is rejected
# explicitly: otherwise the print below fails with a bare NameError on a fresh
# kernel, or silently reuses a path left over from an earlier run of this cell.
if dataset == "dataset_name":
    Implementation_base_path_input = 'dataset_path'
else:
    raise ValueError(f"No input path configured for dataset {dataset!r}")

print(Implementation_base_path_input)

In [None]:
#count_input_tokens(Implementation_base_path_input)

from transformers import GPT2Tokenizer
from math import ceil

# Directory scanned by the token-count helper below; same as the dataset input path.
folder_path = Implementation_base_path_input

def count_input_tokens_no_use(folder_path, token_limit=16000, tokens_per_line=100, part_size=4000):
    """Estimate the token budget of every ``.txt`` file in ``folder_path``.

    For each text file, the input tokens are counted with the GPT-2 tokenizer
    and the output size is estimated as ``tokens_per_line`` tokens per input
    line. A file is reported as large when input + estimated output exceeds
    ``token_limit``.

    Prints ``folder_path``, one line per large file, and finally three totals:
    the number of large files, the number of other text files, and the summed
    number of parts (``ceil(estimated_output / part_size)`` for a large file,
    1 for any other text file).

    Args:
        folder_path: Directory whose ``.txt`` files are inspected.
        token_limit: Threshold on input + estimated output tokens (default 16000).
        tokens_per_line: Estimated output tokens per input line (default 100).
        part_size: Estimated output tokens per part (default 4000).

    Returns:
        List of the file names classified as large, in ``os.listdir`` order.
    """
    print(folder_path)
    # Counting-only tokenizer, named distinctly so it is not mistaken for the
    # module-level `tokenizer` that belongs to the generation model.
    gpt2_tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

    large_token_files = []
    large_count = 0
    small_count = 0
    parts = 0

    for file_name in os.listdir(folder_path):
        # Only text files take part in the estimate.
        if not file_name.endswith(".txt"):
            continue

        # A single read supplies both the line count and the full text.
        with open(os.path.join(folder_path, file_name), "r", encoding='utf-8') as f:
            lines = f.readlines()
        file_contents = "".join(lines)

        num_tokens = len(gpt2_tokenizer.encode(file_contents))
        num_output_tokens = len(lines) * tokens_per_line
        total_token = num_tokens + num_output_tokens

        if total_token > token_limit:
            parts_needed = ceil(num_output_tokens / part_size)
            print(f"{file_name}: {total_token} tokens \t parts needed: {parts_needed}")
            large_count += 1
            parts += parts_needed
            large_token_files.append(file_name)
        else:
            small_count += 1
            parts += 1

    print(large_count)
    print(small_count)
    print(parts)
    return large_token_files
              
            
# Scan the dataset folder and show which files exceed the estimated token budget.
large_token_files = count_input_tokens_no_use(folder_path=folder_path)
print(large_token_files)


In [None]:
# 10 fold
def call_LLM(Implementation_base_path_input, Implementation_base_path_output, Run, temperature, query, n_folds=10):
    """Generate and store the model response for every fold file of one run.

    For each fold ``k`` in ``1..n_folds`` this reads ``fold{k}.txt`` from the
    input directory, builds a prompt from ``query`` and the fold text, sends it
    to ``get_response_LLAMA`` and appends the response to
    ``{Run + 1}_fold_{k}.txt`` inside the output directory.

    Args:
        Implementation_base_path_input: Directory holding ``fold1.txt`` ...
            ``fold{n_folds}.txt``.
        Implementation_base_path_output: Directory that receives the output
            files; a trailing path separator is optional.
        Run: Zero-based run index. It is displayed and used in file names as
            ``Run + 1``.
        temperature: Only logged here; it is not forwarded to the generator.
        query: Instruction text placed before the fold content in the prompt.
        n_folds: Number of fold files to process (default 10).

    Raises:
        OSError: If a fold file cannot be read or an output file cannot be
            written.
    """
    # One-based run number, used consistently for logs and file names.
    run_number = Run + 1

    print("Run no: ", run_number)
    print("Temperature: ", temperature)
    print("Query:", query)
    # `dataset` is read from module scope; it is not a parameter of this function.
    print("Dataset: ", dataset)

    for fold_no in range(1, n_folds + 1):
        fold_path = os.path.join(Implementation_base_path_input, f'fold{fold_no}.txt')
        # Explicit UTF-8, matching how the same files are read for token counting.
        with open(fold_path, encoding='utf-8') as f:
            Sentences = f.read()

        # The whitespace inside this literal is part of the text sent to the
        # model, so it is kept exactly as is.
        prompt = f"""
        {query}
        {Sentences}
        Output:
                """

        message = get_response_LLAMA(prompt)

        # A response whose final line contains "Done" is treated as complete.
        last_line = message.strip().split('\n')[-1]
        status = "Complete" if "Done" in last_line else "Possibly Incomplete"
        print(f"Fold = {fold_no}, Run = {run_number}, Temperature = {temperature}, {status}")

        # Join directory and file name as separate components so the result is
        # correct whether or not the directory ends with a separator.
        out_path = os.path.join(
            Implementation_base_path_output, f'{run_number}_fold_{fold_no}.txt'
        )
        # Append mode: a repeated run adds to an existing file instead of
        # replacing it. UTF-8 keeps non-ASCII model output writable everywhere.
        with open(out_path, "a", encoding='utf-8') as f:
            print(message, file=f)


In [None]:
import os

def output_path(Run_no, dataset, temperature):
    """Create (if missing) and return the output directory for one run.

    The directory has the form
    ``Output/<model_engine>/<dataset>_T<temperature>/<dataset>_T<temperature>_Run<Run_no>/``
    where ``model_engine`` is read from module scope.
    """
    # Shared "<dataset>_T<temperature>" fragment used at both directory levels.
    setting_tag = dataset + "_T" + str(temperature)
    parent_dir = "Output/" + model_engine + "/" + setting_tag + '/'
    run_dir = os.path.join(parent_dir, setting_tag + "_Run" + str(Run_no) + '/')

    # exist_ok=True: an already existing directory is not an error.
    os.makedirs(run_dir, exist_ok=True)

    return run_dir


In [None]:
# Names of the regular files (directories are skipped) in the input folder.
input_file_names = [
    entry
    for entry in os.listdir(Implementation_base_path_input)
    if os.path.isfile(os.path.join(Implementation_base_path_input, entry))
]

file_count = len(input_file_names)

print("Number of files in the folder:", file_count)

In [None]:
# Execute three runs. `Run` is the zero-based index handed to call_LLM, while
# `R` is the one-based number used to name the run's output directory.
for Run in range(3):
    R = Run + 1
    Implementation_base_path_output = output_path(
        Run_no=R, dataset=dataset, temperature=temperature
    )
    print("Implementation base path input:", Implementation_base_path_input)
    print("Implementation base path output:", Implementation_base_path_output)
    call_LLM(
        Implementation_base_path_input=Implementation_base_path_input,
        Implementation_base_path_output=Implementation_base_path_output,
        Run=Run,
        temperature=temperature,
        query=query,
    )

