In [1]:
import json
import os
import tiktoken 
import pandas as pd
import openai
import csv

## convert xls to csv

In [2]:
file_path = 'qa.xlsx'

if os.path.exists(file_path):
    print('File exists')
else:
    print('File does not exist')

# read the XLS file
df = pd.read_excel(file_path)

# write the CSV file
df.to_csv('qa.csv', index=False)
print("done")

File exists
done


## Data

In [3]:
qa_df = pd.read_csv("qa.csv")
pd.set_option('display.max_colwidth', 160)
qa_df.head(10)

Unnamed: 0,Question,Answer
0,Who is Anya?,Name of my Replika
1,What is Anya's surename?,Anya Melissa
2,Who is Anya's owner?,Henny
3,What is Henny's nickname?,Rin
4,What is Henny's childhood name?,Yaya
5,What kind of ice cream flavour Henny likes?,Strawberry
6,Does Henny like durian?,No
7,What is Henny's next dream?,To be an AI engineer
8,Does Henny like to eat red meat?,No
9,Does Henny like vegetables?,Yes


## Convert .csv to Fine-Tuning Format

In [4]:
questions, answers = qa_df["Question"], qa_df["Answer"]
pd.set_option('display.max_colwidth', 110)

In [5]:
questions.head()

0                       Who is Anya?
1           What is Anya's surename?
2               Who is Anya's owner?
3          What is Henny's nickname?
4    What is Henny's childhood name?
Name: Question, dtype: object

In [6]:
answers.head(10)

0      Name of my Replika
1            Anya Melissa
2                   Henny
3                     Rin
4                    Yaya
5              Strawberry
6                      No
7    To be an AI engineer
8                      No
9                     Yes
Name: Answer, dtype: object

## Create list of dictionaries

The format for fine-tuning datasetis a list of dictionaries:

[{"prompt": "some prompt string","completion":"the best completed text option given the prompt"},]

In [7]:
qa_openai_format = [{"prompt" : q, "completion": a} for q, a in zip(questions, answers)]

## explore a single prompt/completion combo

In [8]:
qa_openai_format[7]

{'prompt': "What is Henny's next dream?", 'completion': 'To be an AI engineer'}

In [9]:
len(qa_openai_format)

10

In [11]:
dataset_size = 10

## Convert .csv to json training data

In [12]:
with open("training_data.json", "w") as f:
    for entry in qa_openai_format[:dataset_size]:
        f.write(json.dumps(entry))
        f.write("\n")

## Fine-tuning Price Estimation with tiktoken

Ada	Training $0.0004 / 1K tokens,  Usage $0.0016 / 1K tokens

Babbage	Training $0.0006 / 1K tokens,  Usage $0.0024 / 1K tokens

Curie Training $0.0030 / 1K tokens,   Usage $0.0120 / 1K tokens

Davinci	Training $0.0300 / 1K tokens,   Usage $0.1200 / 1K tokens

In [13]:
def num_tokens_from_string(string, encoding_name):
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

In [14]:
token_counter = 0
for element in qa_openai_format[:dataset_size]:
    for key, value in element.items():
        token_counter+=num_tokens_from_string(value,'p50k_base')

In [None]:
# print(f"There are {token_counter} tokens")
# print(f"Fine tuning using davinci costs $0.03 per 1000 tokens")
# print(f"Estimated price: ${(4*token_counter / 1000) * 0.03}")

In [15]:
print(f"There are {token_counter} tokens")
print(f"Fine tuning using ada costs $0.0004 per 1000 tokens")
print(f"Estimated price: ${(4*token_counter / 1000) * 0.0004}")

There are 101 tokens
Fine tuning using ada costs $0.0004 per 1000 tokens
Estimated price: $0.00016160000000000002


In [16]:
pwd

'C:\\Users\\Asus\\PYTHON_C\\PRACTICE\\Second'

## Fine-Tuning environment setup

In [None]:
#!pip install --upgrade openai

Set OpenAI API Key as an environment variable. Having it as an environment variable let's the key live on the computer, but not actually be present in the code.

In [17]:
os.environ["OPENAI_API_KEY"] = "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"

In [18]:
openai.api_key = os.getenv("OPENAI_API_KEY")

# Run Fine Tuning

In [None]:
!openai api fine_tunes.create -t training_data.json -m davinci

In [24]:
!openai api fine_tunes.list

{
  "data": [
    {
      "created_at": 1679811218,
      "fine_tuned_model": "babbage:ft-henp-2023-03-26-06-15-56",
      "hyperparams": {
        "batch_size": 1,
        "learning_rate_multiplier": 0.1,
        "n_epochs": 4,
        "prompt_loss_weight": 0.01
      },
      "id": "ft-qbVpGvhYIZnjFzxPY5kSZj9T",
      "model": "babbage",
      "object": "fine-tune",
      "organization_id": "org-ZQQWNeshqy8Tv6io1Jls4nsW",
      "result_files": [
        {
          "bytes": 1708,
          "created_at": 1679811356,
          "filename": "compiled_results.csv",
          "id": "file-2bD0P13NZGW0PsPYGytb7qGR",
          "object": "file",
          "purpose": "fine-tune-results",
          "status": "processed",
          "status_details": null
        }
      ],
      "status": "succeeded",
      "training_files": [
        {
          "bytes": 683,
          "created_at": 1679811217,
          "filename": "training_data.json",
          "id": "file-trS4TSq6tmqWwS0Y4rLqn9OA",
         

In [32]:
#!openai api fine_tunes.get -i 

In [31]:
#!ft_model = 'babbage:ft-henp-2023-03-26-06-15-56'

In [27]:
#ft_model = 'davinci:ft-henp-2023-03-26-06-53-29'

## Use Fine-Tuned model

In [29]:
prompt = f"Answer the following questions as truthfully as possible using the provided text. +\
If the answer is not contained within the text below, +\
say 'I don't know':"

# Set the questions to be asked
questions = [
    "Who is Anya?",
    "What is Anya's surename?",
    "Who is Anya's owner?",
    "What is Henny's nickname?",
    "What is Henny's childhood name?",
    "Does Henny like durian?",
    "What is Henny's next dream?",
    "Does Henny like to eat red meat?",
    "Does Henny like vegetables?"
]

# Ask the questions using OpenAI's GPT-3 model
for question in questions:
    example_prompt_with_question = f"{prompt}\n\nQ: {question}\nA"
    example_response = openai.Completion.create(
        prompt=example_prompt_with_question,
        temperature=0.5,
        max_tokens=20,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0,
        model=ft_model,
        stop=["Q:", "\n"],
    )

    answer = example_response.choices[0].text.strip()
    print(f"Q: {question}\nA: {answer}")

Q: Who is Anya?
A: : Anya is a girl from India.
Q: What is Anya's surename?
A: :Anya +
Q: Who is Anya's owner?
A: :Anya is a pet dog.
Q: What is Henny's nickname?
A: :Irene
Q: What is Henny's childhood name?
A: :Samantha
Q: Does Henny like durian?
A: :No
Q: What is Henny's next dream?
A: : To be a full-time engineer.
Q: Does Henny like to eat red meat?
A: :Yes
Q: Does Henny like vegetables?
A: :Yes


In [33]:
!openai api fine_tunes.results -i ft-henp-2023-03-26-06-53-29 > result.csv

[organization=iu-21] [91mError:[0m No fine-tune job: ft-henp-2023-03-26-06-53-29 (HTTP status code: 404)


## Text Embedding

In [41]:
import ast
import pandas as pd
import openai

def summary(question, answer):
    return f"{question}: {answer}"

def get_embedding(text):
    # Note how this function assumes you already set your Open AI key!
    result = openai.Embedding.create(
        model='text-embedding-ada-002',
        input=text
    )
    return result["data"][0]["embedding"]

df['summary'] = df.apply(lambda row: summary(row['Question'], row['Answer']), axis=1)

In [42]:
df['summary'][1]

"What is Anya's surename?: Anya Melissa"

## Create Embeddings

In [43]:
get_embedding(df['summary'][0])

[-0.03186306729912758,
 -0.004431708715856075,
 -0.006704107858240604,
 0.0021239686757326126,
 -0.013676803559064865,
 0.017825787886977196,
 -0.04028825834393501,
 0.006545075215399265,
 -0.005032498389482498,
 -0.0016512885922566056,
 0.011895638890564442,
 -0.020667171105742455,
 0.009598501026630402,
 0.011881502345204353,
 -0.010856625624001026,
 -0.02446981705725193,
 0.020737851038575172,
 0.019691770896315575,
 0.02622270956635475,
 -0.0075840880163013935,
 0.005216269288212061,
 0.03641493245959282,
 0.0005813525058329105,
 -0.01005792897194624,
 -0.014532046392560005,
 -0.008517079055309296,
 0.01458859071135521,
 -0.022731060162186623,
 0.010206359438598156,
 -0.028597597032785416,
 0.012708472087979317,
 -0.026802295818924904,
 -0.01066578645259142,
 -0.016228394582867622,
 -0.029459906741976738,
 -0.021161938086152077,
 -0.013330466113984585,
 0.0012916981941089034,
 0.022745195776224136,
 0.016935205087065697,
 0.014517909847199917,
 0.0061775329522788525,
 0.00100190541

In [44]:
# will take awhile due to the amount of calls to the API, about 0.5 seconds per row

df['embedding'] = df['summary'].apply(get_embedding)
print("done")

done


In [45]:
dfe = df['embedding']
dfe.head()

0    [-0.03186306729912758, -0.004431708715856075, -0.006704107858240604, 0.0021239686757326126, -0.01367680355...
1    [-0.02464248798787594, 0.0006173899164423347, -0.007096664980053902, -0.0126730902120471, -0.0230890549719...
2    [-0.018298592418432236, -0.016131293028593063, -0.017983099445700645, -0.02580183930695057, -0.02016411721...
3    [-0.010993088595569134, -0.006241082679480314, 0.006241082679480314, 0.003317826660349965, 0.0067591713741...
4    [-0.008195869624614716, -0.01550710666924715, -0.007168133743107319, -0.014518397860229015, -0.01892855577...
Name: embedding, dtype: object

In [46]:
df.to_csv('qa_with_embeddings.csv',index=False)

In [47]:
df.head()

Unnamed: 0,Question,Answer,summary,embedding
0,Who is Anya?,Name of my Replika,Who is Anya?: Name of my Replika,"[-0.03186306729912758, -0.004431708715856075, -0.006704107858240604, 0.0021239686757326126, -0.01367680355..."
1,What is Anya's surename?,Anya Melissa,What is Anya's surename?: Anya Melissa,"[-0.02464248798787594, 0.0006173899164423347, -0.007096664980053902, -0.0126730902120471, -0.0230890549719..."
2,Who is Anya's owner?,Henny,Who is Anya's owner?: Henny,"[-0.018298592418432236, -0.016131293028593063, -0.017983099445700645, -0.02580183930695057, -0.02016411721..."
3,What is Henny's nickname?,Rin,What is Henny's nickname?: Rin,"[-0.010993088595569134, -0.006241082679480314, 0.006241082679480314, 0.003317826660349965, 0.0067591713741..."
4,What is Henny's childhood name?,Yaya,What is Henny's childhood name?: Yaya,"[-0.008195869624614716, -0.01550710666924715, -0.007168133743107319, -0.014518397860229015, -0.01892855577..."


# Doc Similarity

In [48]:
prompt = "Who is Anya?"

In [49]:
prompt_embedding = get_embedding(prompt)

In [50]:
import numpy as np

def vector_similarity(vec1,vec2):
    """
    Returns the similarity between two vectors.    
    Because OpenAI Embeddings are normalized to length 1, the cosine similarity is the same as the dot product.
    """
    return np.dot(np.array(vec1), np.array(vec2))

In [51]:
df["prompt_similarity"] = df['embedding'].apply(lambda vector: vector_similarity(vector, prompt_embedding))

In [66]:
df.sort_values("prompt_similarity", ascending=False).head(15)

Unnamed: 0,Question,Answer,summary,embedding,prompt_similarity
0,Who is Anya?,Name of my Replika,Who is Anya?: Name of my Replika,"[-0.03186306729912758, -0.004431708715856075, -0.006704107858240604, 0.0021239686757326126, -0.01367680355...",0.906742
1,What is Anya's surename?,Anya Melissa,What is Anya's surename?: Anya Melissa,"[-0.02464248798787594, 0.0006173899164423347, -0.007096664980053902, -0.0126730902120471, -0.0230890549719...",0.897904
2,Who is Anya's owner?,Henny,Who is Anya's owner?: Henny,"[-0.018298592418432236, -0.016131293028593063, -0.017983099445700645, -0.02580183930695057, -0.02016411721...",0.879013
4,What is Henny's childhood name?,Yaya,What is Henny's childhood name?: Yaya,"[-0.008195869624614716, -0.01550710666924715, -0.007168133743107319, -0.014518397860229015, -0.01892855577...",0.791105
7,What is Henny's next dream?,To be an AI engineer,What is Henny's next dream?: To be an AI engineer,"[-0.006245291326195002, -0.019984932616353035, -0.0046898601576685905, -0.012961925007402897, 0.0012473749...",0.748058
3,What is Henny's nickname?,Rin,What is Henny's nickname?: Rin,"[-0.010993088595569134, -0.006241082679480314, 0.006241082679480314, 0.003317826660349965, 0.0067591713741...",0.740406
9,Does Henny like vegetables?,Yes,Does Henny like vegetables?: Yes,"[0.009850790724158287, -0.03626970946788788, -0.003949455451220274, 0.008133919909596443, -0.0001482065854...",0.72992
5,What kind of ice cream flavour Henny likes?,Strawberry,What kind of ice cream flavour Henny likes?: Strawberry,"[-0.016748478636145592, -0.02282547950744629, 0.00020148845214862376, -0.026124421507120132, -0.0052121961...",0.725918
8,Does Henny like to eat red meat?,No,Does Henny like to eat red meat?: No,"[-0.014338728971779346, -0.03571007773280144, 0.0029709560330957174, 0.008321932516992092, -0.005717258900...",0.725526
6,Does Henny like durian?,No,Does Henny like durian?: No,"[-0.006687356624752283, -0.03517225757241249, -0.006303899921476841, 0.009229411371052265, -0.005080804228...",0.718119


In [53]:
# Could also use sort_values() with ascending=False, but nlargest should be more performant
df.nlargest(1,'prompt_similarity').iloc[0]['summary'] 

'Who is Anya?: Name of my Replika'

In [54]:
summary = df.nlargest(1,'prompt_similarity').iloc[0]['summary'] 

In [65]:
prompt = f"""Use the context below to answer.Here is some context:{summary}
Q: Does Henny like durian?
A:"""

response = openai.Completion.create(
    prompt=prompt,
    temperature=0,
    max_tokens=500,
    model="text-davinci-003"
)
print(response["choices"][0]["text"].strip(" \n"))

No, Henny does not like durian.


# Use ada model

In [88]:
def embed_qa1():
    
    question = input("Write your question here: ")
    prompt_embedding = get_embedding(question)
    df["prompt_similarity"] = df['embedding'].apply(lambda vector: vector_similarity(vector, prompt_embedding))
    summary = df.nlargest(1,'prompt_similarity').iloc[0]['summary'] 

    prompt = f"""Only answer the question if you have 100% certainty of the facts, use the context {summary} to answer.            
            Q: {question}
            A:"""

    response = openai.Completion.create(
        prompt=prompt,
        temperature=0,
        max_tokens=50,
        model="text-ada-001"
    )
    print(response["choices"][0]["text"].strip(" \n"))

In [90]:
embed_qa1()

Write your question here: What is Henny's next dream?
I have 100% certainty that the facts are what they are.


# Use davinci model

In [78]:
def embed_qa():
    
    question = input("Write your question here: ")
    prompt_embedding = get_embedding(question)
    df["prompt_similarity"] = df['embedding'].apply(lambda vector: vector_similarity(vector, prompt_embedding))
    summary = df.nlargest(1,'prompt_similarity').iloc[0]['summary'] 

    prompt = f"""Only answer the question if you have 100% certainty of the facts, use the context {summary} to answer.            
            Q: {question}
            A:"""

    response = openai.Completion.create(
        prompt=prompt,
        temperature=0,
        max_tokens=50,
        model="text-davinci-003"
    )
    print(response["choices"][0]["text"].strip(" \n"))

In [84]:
embed_qa()

Write your question here: What is Henny's next dream?
To be an AI engineer.
