In [118]:
import json, openai, requests, warnings
warnings.filterwarnings("ignore")
import configparser
import uuid
from time import sleep
import tiktoken

In [2]:
credentials = configparser.ConfigParser()
credentials.read_file(open('apidata.config'))
my_key = credentials['OPENAI']['KEY']
my_org = credentials['OPENAI']['org']

In [3]:
def gpt3_completion(org, key,prompt,prompt_part,
                    engine='text-davinci-003',temp=1.0,
                   top_p=1.0,tokens=500,freq_pen=0.0,
                   pres_pen=0.0,stop=['asdfasdf','asdfasdf'],
                   filename='temp_file.jsonl'):
    """The function will take the prompt and the prompt parts to 
    begin the data collection. Once the data collection is completed, 
    the prompt part will be used for fine tuning. The prompt used will 
    be discarded"""
    openai.api_key = key
    openai.organization = org
    max_retry=5
    retry = 0
    prompt = prompt.encode(encoding='ASCII',errors='ignore').decode()
    while True:
        try:
            response = openai.Completion.create(
            model=engine,
            prompt=prompt,
            temperature=temp,
            max_tokens=tokens,
            top_p=top_p,
            frequency_penalty=freq_pen,
            presence_penalty=pres_pen,
            stop=stop)
            
            text = response['choices'][0]['text'].strip()
            outprompt = prompt_part
            dict_reply = {'prompt':outprompt,'completion':text}
            with open(filename,'a+') as taker:
                json.dump(dict_reply,fp=taker)
                taker.write('\n')
            
            return dict_reply
        except Exception as e:
            retry += 1
            if retry > max_retry:
                return f'GPT errored out {e}'
            sleep(1)

In [47]:
def gpt3_question_collect(org, key,prompt,
                    engine='text-davinci-003',temp=1.0,
                   top_p=1.0,tokens=500,freq_pen=0.0,
                   pres_pen=0.0,stop=['asdfasdf','asdfasdf'],
                   filename='questions_file.jsonl'):
    """The function will take the prompt return the questions 
    that can be asked on the dataset. The same will be written 
    in questions_file.jsonl"""
    openai.api_key = key
    openai.organization = org
    max_retry=5
    retry = 0
    prompt = prompt.encode(encoding='ASCII',errors='ignore').decode()
    while True:
        try:
            response = openai.Completion.create(
            model=engine,
            prompt=prompt,
            temperature=temp,
            max_tokens=tokens,
            top_p=top_p,
            frequency_penalty=freq_pen,
            presence_penalty=pres_pen,
            stop=stop)
            
            text = response['choices'][0]['text'].strip()
            dict_reply = {'prompt':prompt,'questions':text}
            with open(filename,'a+') as taker:
                json.dump(dict_reply,fp=taker)
                taker.write('\n')
            
            return dict_reply
        except Exception as e:
            retry += 1
            if retry > max_retry:
                return f'GPT errored out {e}'
            sleep(1)

In [61]:
def gpt3_answer_collect(org, key,prompt_question,
                    engine='text-davinci-003',temp=1.0,
                   top_p=1.0,tokens=500,freq_pen=0.0,
                   pres_pen=0.0,stop=['asdfasdf','asdfasdf'],
                   filename='prompt_file.jsonl'):
    """The function will take the prompt return the questions 
    that can be asked on the dataset. The same will be written 
    in questions_file.jsonl"""
    openai.api_key = key
    openai.organization = org
    max_retry=5
    retry = 0
    prompt = prompt_question.encode(encoding='ASCII',errors='ignore').decode()
    while True:
        try:
            response = openai.Completion.create(
            model=engine,
            prompt=prompt_question,
            temperature=temp,
            max_tokens=tokens,
            top_p=top_p,
            frequency_penalty=freq_pen,
            presence_penalty=pres_pen,
            stop=stop)
            
            text = response['choices'][0]['text'].strip()
            dict_reply = {'prompt':prompt_question,
                          'completion':text}
            with open(filename,'a+') as taker:
                json.dump(dict_reply,fp=taker)
                taker.write('\n')
            
            return dict_reply
        except Exception as e:
            retry += 1
            if retry > max_retry:
                return f'GPT errored out {e}'
            sleep(1)

In [4]:
#reading in the dataset sample_data.csv

with open('space_titanic.csv','r') as sd:
    data = sd.readlines()

In [5]:
len(data)

8694

In [6]:
data[0]+data[1]

'PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported\n0001_01,Europa,False,B/0/P,TRAPPIST-1e,39.0,False,0.0,0.0,0.0,0.0,0.0,Maham Ofracculy,False\n'

In [None]:
# To get the tokeniser corresponding to a specific model in the OpenAI API:
enc = tiktoken.encoding_for_model("text-davinci-003")

In [8]:
enc.encode("This is my first use of tiktoken")

[1212, 318, 616, 717, 779, 286, 256, 1134, 30001]

In [9]:
enc.decode(enc.encode("This is my first use of tiktoken"))

'This is my first use of tiktoken'

In [10]:
enc.n_vocab

50281

In [8]:
def num_tokens_from_string(string: str, 
                           encoding_name: str = 'text-davinci-003') -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [9]:
num_tokens_from_string(string="This is the first use of tiktoken",
                       encoding_name='text-davinci-003')

9

In [19]:
### prompt design

prompt = f"""
[Dataset] = {data[0]}+{data[1:60]} 
prompt:Based on above [Dataset] list 10 questions that I can ask

Answer:
"""

In [20]:
num_tokens_from_string(prompt)

3527

### The prompt and the answer should be total 4001 token

In [21]:
4001 - num_tokens_from_string(prompt)

474

Psuedocode:

1) Split the dataset into small chunks so it can fit inside the prompt. 

1a) Each dataset should be given a unique name

2) Use the dataset chunks and send the prompt asking for the 15 questions that you can ask

3) Collect the questions and attach it with the dataset chunk and write in a seperate file

Do this for 15 chunks of the dataset you will have 225 questions

4) Each dataset chunks will become 15 prompts with individual questions

Prompt: Refer the {Dataset(num)} that was provided to you. {question}

Answer:

5) The prompt, along with the dataset, dataset num, question is considered as one prompt. The answer the model provides is considered as completion. 

{"prompt":dataset{num}=dataset_values,Refer the {dataset(num)} that was provided to you. {question}, "completion":Answer}

6) Train the model...

In [24]:
#I will train with 6000 datasets
len(data)/60

144.9

In [51]:
data_chunks = []
for x in list(range(1,100)):
    data_t = f'{data[0]}+{data[(x+1):(x*60+1)]}'
    
    prompt_maker = f"""
    dataset_{x}={data_t}
    
    Prompt:Refer to the [dataset_{x+1}] and List 25 questions that can be asked from it
    
    Answer:
    """
    data_chunks.append(prompt_maker)

In [45]:
num_tokens_from_string(data_chunks[0])

3527

In [58]:
data_chunks[0]

"\n    dataset_1=PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported\n+['0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True\\n', '0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False\\n', '0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False\\n', '0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True\\n', '0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True\\n', '0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True\\n', '0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True\\n', '0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True\\n', '0008_01,Europa,True,B/1/P,55 Cancri e,14.0,F

In [50]:
gpt3_question_collect(org=my_org,key=my_key,
                      prompt=data_chunks[1])

{'prompt': "\n    dataset_2=PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported\n+['0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True\\n', '0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False\\n', '0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False\\n', '0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True\\n', '0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True\\n', '0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True\\n', '0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True\\n', '0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True\\n', '0008_01,Europa,True,B/1/P,55 Canc

In [59]:
gpt3_question_collect(org=my_org,key=my_key,
                      prompt=data_chunks[0])

{'prompt': "\n    dataset_1=PassengerId,HomePlanet,CryoSleep,Cabin,Destination,Age,VIP,RoomService,FoodCourt,ShoppingMall,Spa,VRDeck,Name,Transported\n+['0002_01,Earth,False,F/0/S,TRAPPIST-1e,24.0,False,109.0,9.0,25.0,549.0,44.0,Juanna Vines,True\\n', '0003_01,Europa,False,A/0/S,TRAPPIST-1e,58.0,True,43.0,3576.0,0.0,6715.0,49.0,Altark Susent,False\\n', '0003_02,Europa,False,A/0/S,TRAPPIST-1e,33.0,False,0.0,1283.0,371.0,3329.0,193.0,Solam Susent,False\\n', '0004_01,Earth,False,F/1/S,TRAPPIST-1e,16.0,False,303.0,70.0,151.0,565.0,2.0,Willy Santantines,True\\n', '0005_01,Earth,False,F/0/P,PSO J318.5-22,44.0,False,0.0,483.0,0.0,291.0,0.0,Sandie Hinetthews,True\\n', '0006_01,Earth,False,F/2/S,TRAPPIST-1e,26.0,False,42.0,1539.0,3.0,0.0,0.0,Billex Jacostaffey,True\\n', '0006_02,Earth,True,G/0/S,TRAPPIST-1e,28.0,False,0.0,0.0,0.0,0.0,,Candra Jacostaffey,True\\n', '0007_01,Earth,False,F/3/S,TRAPPIST-1e,35.0,False,0.0,785.0,17.0,216.0,0.0,Andona Beston,True\\n', '0008_01,Europa,True,B/1/P,55 Canc

In [62]:
gpt3_answer_collect(org=my_org,key=my_key,
                   prompt_question='''Refer to [dataset_1] and 
                   answer What is the range of FoodCourt options used by the passengers?''')

{'prompt': 'Refer to [dataset_1] and \n                   answer What is the range of FoodCourt options used by the passengers?',
 'completion': 'The range of FoodCourt options used by the passengers in dataset_1 includes drinks, snacks, sandwiches, entrees, desserts, and more.'}

In [63]:
gpt3_answer_collect(org=my_org,key=my_key,
                   prompt_question='''Refer to [dataset_1] and 
                   answer How many passengers between 0 and 10 years of age had a Cabin type F/0/P?''')

{'prompt': 'Refer to [dataset_1] and \n                   answer How many passengers between 0 and 10 years of age had a Cabin type F/0/P?',
 'completion': 'There is no information available on the ages of passengers in the provided dataset 1. However, the dataset does indicate the number of passengers who had a Cabin type F, 0, or P, which is 17.'}

In [64]:
gpt3_answer_collect(org=my_org,key=my_key,
                   prompt_question='''Refer to [dataset_1] and 
                   answer What is the mean VRDeck value for passengers travelling to TRAPPIST-1e?''')

{'prompt': 'Refer to [dataset_1] and \n                   answer What is the mean VRDeck value for passengers travelling to TRAPPIST-1e?',
 'completion': 'The mean VRDeck value for passengers travelling to TRAPPIST-1e is 5.87.'}

In [69]:
with open('questions_file.jsonl','r') as qpf:
    json_data = json.load(qpf)

In [77]:
def question_extractor(reply_dict):
    extracted = []
    quest_list = reply_dict['questions'].split('\n')
    
    for que in quest_list:
        ready = que.split('.')[1].strip(' ')
        extracted.append(ready)
    
    return extracted

In [72]:
questionList = json_data[0]['questions'].split('\n')

In [76]:
question = questionList[0].split('.')[1].strip(' ')

In [80]:
quest_set1 = question_extractor(json_data[1])

In [81]:
quest_set0 = question_extractor(json_data[0])

In [86]:
#make the prompts with the quest sets
def prompt_question(num, question):
    return f'''Refer to [dataset_{num}] and answer {question} '''

In [92]:
prompt_question(1,quest_set0[2])

'Refer to [dataset_1] and answer What is the average age of the passengers? '

In [94]:
#Iterate over the question sets
for ind,que in enumerate(quest_set0[2:]):
    quest_ref_dataset = prompt_question(1,que)
    gpt3_answer_collect(key=my_key,org=my_org,
                        prompt_question=quest_ref_dataset)

In [95]:
#Iterate over the question sets
for ind,que in enumerate(quest_set1):
    quest_ref_dataset = prompt_question(2,que)
    gpt3_answer_collect(key=my_key,org=my_org,
                        prompt_question=quest_ref_dataset)

In [119]:
def file_upload(filename, purpose='fine-tune'):
    resp = openai.File.create(purpose=purpose,file=open(file=filename,
                                                        mode='rb'))
    print(resp)
    return resp

def finetune_model(fileId, suffix,model='davinci',api_key=my_key):
    header = {'Content-Type':'application/json',
              'Authorization':f'Bearer {api_key}'}
    payload = {'training_file':fileId,'model':model,'suffix':suffix}
    resp = requests.request(method='POST',
                            url='https://api.openai.com/v1/fine-tunes',
                           headers=header,
                           json=payload,timeout=40)
    print(resp.json())
    
def finetune_get(ftId,api_key=my_key):
    header = {'Content-Type':'application/json',
              'Authorization':f'Bearer {api_key}'}
    resp = requests.request(method='GET',
                url=f'https://api.openai.com/v1/fine-tunes/{ftId}',
                           headers=header,timeout=40)
    print(resp.json())
    
def finetune_event(ftId,api_key=my_key):
    header = {'Content-Type':'application/json',
              'Authorization':f'Bearer {api_key}'}
    resp = requests.request(method='GET',
                url=f'https://api.openai.com/v1/fine-tunes/{ftId}/events',
                           headers=header,timeout=40)
    print(resp.json())

In [113]:
file_upload(filename='prompt_file.jsonl')

{
  "bytes": 10818,
  "created_at": 1678374670,
  "filename": "file",
  "id": "file-B7AJ8LI7onSaVqC3ZaW68NR9",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}


<File file id=file-B7AJ8LI7onSaVqC3ZaW68NR9 at 0x7f26ed373150> JSON: {
  "bytes": 10818,
  "created_at": 1678374670,
  "filename": "file",
  "id": "file-B7AJ8LI7onSaVqC3ZaW68NR9",
  "object": "file",
  "purpose": "fine-tune",
  "status": "uploaded",
  "status_details": null
}

In [131]:
finetune_model(fileId='file-B7AJ8LI7onSaVqC3ZaW68NR9',
               api_key=my_key,suffix='space_titanic_r02',
              model='davinci')

{'object': 'fine-tune', 'id': 'ft-gOeTtRL8FbQYMKzrheNNBm8H', 'hyperparams': {'n_epochs': 4, 'batch_size': None, 'prompt_loss_weight': 0.01, 'learning_rate_multiplier': None}, 'organization_id': 'org-QWWIpNe4Z3KsRVI0YR6lhyJW', 'model': 'davinci', 'training_files': [{'object': 'file', 'id': 'file-B7AJ8LI7onSaVqC3ZaW68NR9', 'purpose': 'fine-tune', 'filename': 'file', 'bytes': 10818, 'created_at': 1678374670, 'status': 'processed', 'status_details': None}], 'validation_files': [], 'result_files': [], 'created_at': 1678377767, 'updated_at': 1678377767, 'status': 'pending', 'fine_tuned_model': None, 'events': [{'object': 'fine-tune-event', 'level': 'info', 'message': 'Created fine-tune: ft-gOeTtRL8FbQYMKzrheNNBm8H', 'created_at': 1678377767}]}


In [137]:
finetune_event(ftId='ft-gOeTtRL8FbQYMKzrheNNBm8H',
               api_key=my_key)

{'object': 'list', 'data': [{'object': 'fine-tune-event', 'level': 'info', 'message': 'Created fine-tune: ft-gOeTtRL8FbQYMKzrheNNBm8H', 'created_at': 1678377767}]}


In [138]:
finetune_get(ftId='ft-gOeTtRL8FbQYMKzrheNNBm8H',
               api_key=my_key)

{'object': 'fine-tune', 'id': 'ft-gOeTtRL8FbQYMKzrheNNBm8H', 'hyperparams': {'n_epochs': 4, 'batch_size': None, 'prompt_loss_weight': 0.01, 'learning_rate_multiplier': None}, 'organization_id': 'org-QWWIpNe4Z3KsRVI0YR6lhyJW', 'model': 'davinci', 'training_files': [{'object': 'file', 'id': 'file-B7AJ8LI7onSaVqC3ZaW68NR9', 'purpose': 'fine-tune', 'filename': 'file', 'bytes': 10818, 'created_at': 1678374670, 'status': 'processed', 'status_details': None}], 'validation_files': [], 'result_files': [], 'created_at': 1678377767, 'updated_at': 1678377767, 'status': 'pending', 'fine_tuned_model': None, 'events': [{'object': 'fine-tune-event', 'level': 'info', 'message': 'Created fine-tune: ft-gOeTtRL8FbQYMKzrheNNBm8H', 'created_at': 1678377767}]}
