# Build a Custom OpenAI Chatbot with ML-Driven Prompt Engineering

## Step 1

### Loading the Data with `pandas`

In [1]:
import os
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")

In [2]:
ukraine_prompt = """
Question: When did Russia invade Ukraine?
Answer: 
"""

ukraine_answer = openai.Completion.create(
    model="gpt-3.5-turbo-instruct",
    prompt=ukraine_prompt
)["choices"][0]["text"]
print(ukraine_answer)

Russia invaded Ukraine on February 20, 2014, with troops entering the


In [3]:
twitter_prompt = """
Question: "Who owns Twitter?"
Answer:
"""
initial_twitter_answer = openai.Completion.create(
    model="gpt-3.5-turbo-instruct",
    prompt=twitter_prompt,
    max_tokens=150
)["choices"][0]["text"].strip()
print(initial_twitter_answer)

As of July 2021, Twitter is publicly owned and traded on the New York Stock Exchange. Therefore, the company does not have a single owner, but its shareholders collectively own the company. However, the co-founder and CEO, Jack Dorsey, owns the largest individual stake in the company at approximately 11% of the outstanding shares.


In [4]:
import requests

response = requests.get("https://en.wikipedia.org/w/api.php?action=query&prop=extracts&exlimit=1&titles=2022&explaintext=1&formatversion=2&format=json")

In [5]:
response.json()["query"]["pages"][0]["extract"].split("\n")

['2022 (MMXXII) was a common year starting on Saturday of the Gregorian calendar, the 2022nd year of the Common Era (CE) and Anno Domini (AD) designations, the 22nd  year of the 3rd millennium and the 21st century, and the  3rd   year of the 2020s decade.  ',
 'The year saw the removal of nearly all COVID-19 restrictions and the reopening of international borders in most countries, while the global rollout of COVID-19 vaccines continued. The global economic recovery from the pandemic continued, though many countries experienced an ongoing inflation surge; in response, many central banks raised their interest rates to landmark levels. The world population reached eight billion people in 2022. The year also witnessed numerous natural disasters, including two devastating Atlantic hurricanes (Fiona and Ian), and the most powerful volcano eruption of the century so far. The later part of the year also saw the first public release of ChatGPT by OpenAI starting an arms race in artificial inte

In [6]:
import pandas as pd

In [7]:
df = pd.DataFrame()
df["text"] = response.json()["query"]["pages"][0]["extract"].split("\n")
df

Unnamed: 0,text
0,2022 (MMXXII) was a common year starting on Sa...
1,The year saw the removal of nearly all COVID-1...
2,2022 was also dominated by wars and armed conf...
3,
4,
...,...
254,
255,== Nobel Prizes ==
256,
257,


In [8]:
# remvoe blank lines
df = df[df["text"].str.len() > 0]
df

Unnamed: 0,text
0,2022 (MMXXII) was a common year starting on Sa...
1,The year saw the removal of nearly all COVID-1...
2,2022 was also dominated by wars and armed conf...
5,== Events ==
8,=== January ===
...,...
248,== Demographics ==
249,The world population was estimated to have rea...
252,== Deaths ==
255,== Nobel Prizes ==


In [9]:
df = df[~df["text"].str.startswith("==")]
df

Unnamed: 0,text
0,2022 (MMXXII) was a common year starting on Sa...
1,The year saw the removal of nearly all COVID-1...
2,2022 was also dominated by wars and armed conf...
9,January 1 – The Regional Comprehensive Econom...
10,January 2 – Abdalla Hamdok resigns as Prime Mi...
...,...
242,December 21–December 26 – A major winter storm...
243,December 24 – 2022 Fijian general election: Th...
244,December 29 – Brazilian football legend Pelé d...
245,December 31 – Former Pope Benedict XVI dies at...


In [10]:
from dateutil.parser import parse

prefix = ""
for (i, row) in df.iterrows():
    # If the row already has " - ", it already has the needed date prefix
    if " – " not in row["text"]:
        try:
            # If the row's text is a date, set it as the new prefix
            parse(row["text"])
            prefix = row["text"]
        except:
            # If the row's text isn't a date, add the prefix
            row["text"] = prefix + " – " + row["text"]
df = df[df["text"].str.contains(" – ")]
df.tail(20)

Unnamed: 0,text
223,November 15 – The 2022 G20 Bali summit in Bali...
224,"November 16 – NASA launches Artemis 1, the fir..."
225,November 19 – The 2022 Malaysian general elect...
226,November 19–November 26 – The 2022 Central Ame...
227,November 20–December 18 – The 2022 FIFA World ...
228,November 20 – 2022 Nepalese general election: ...
229,November 21 – A 5.6 earthquake strikes near Ci...
230,"November 30 – OpenAI releases ChatGPT, an arti..."
234,December 2 – The G7 and Australia join the EU ...
235,December 5 – The National Ignition Facility ac...


In [11]:
df.reset_index(inplace=True, drop=True)

In [12]:
df

Unnamed: 0,text
0,– 2022 (MMXXII) was a common year starting on...
1,– The year saw the removal of nearly all COVI...
2,– 2022 was also dominated by wars and armed c...
3,January 1 – The Regional Comprehensive Econom...
4,January 2 – Abdalla Hamdok resigns as Prime Mi...
...,...
180,December 21–December 26 – A major winter storm...
181,December 24 – 2022 Fijian general election: Th...
182,December 29 – Brazilian football legend Pelé d...
183,December 31 – Former Pope Benedict XVI dies at...


In [13]:
df.to_csv("text.csv")

### Creating an Embeddings Index with `openai.Embedding`

In [14]:
import pandas as pd
df = pd.read_csv("text.csv", index_col=0)
df

Unnamed: 0,text
0,– 2022 (MMXXII) was a common year starting on...
1,– The year saw the removal of nearly all COVI...
2,– 2022 was also dominated by wars and armed c...
3,January 1 – The Regional Comprehensive Econom...
4,January 2 – Abdalla Hamdok resigns as Prime Mi...
...,...
180,December 21–December 26 – A major winter storm...
181,December 24 – 2022 Fijian general election: Th...
182,December 29 – Brazilian football legend Pelé d...
183,December 31 – Former Pope Benedict XVI dies at...


In [15]:
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
response = openai.Embedding.create(
    input=df["text"].tolist(),
    model=EMBEDDING_MODEL_NAME
)

In [16]:
type(response)

openai.openai_object.OpenAIObject

In [17]:
response.keys()

dict_keys(['object', 'data', 'model', 'usage'])

In [18]:
type(response["data"])

list

In [19]:
response["data"][0]

<OpenAIObject embedding at 0x10cb83390> JSON: {
  "object": "embedding",
  "index": 0,
  "embedding": [
    4.099189391126856e-05,
    -0.01798599772155285,
    -0.017182154580950737,
    -0.012315132655203342,
    -0.009627281688153744,
    -0.001225390238687396,
    -0.00828335527330637,
    0.013162937015295029,
    -0.02260809764266014,
    -0.0020425787661224604,
    0.03011900931596756,
    0.024404184892773628,
    -0.018940560519695282,
    -0.016843032091856003,
    -0.0020912489853799343,
    0.004553019534796476,
    0.01848839968442917,
    -0.004596979822963476,
    0.021201370283961296,
    0.009696361608803272,
    0.002344020176678896,
    0.009746601805090904,
    -0.0020645589102059603,
    -0.014067260548472404,
    0.010010362602770329,
    0.01754639483988285,
    0.0045090592466294765,
    -0.004983201622962952,
    0.03589663282036781,
    -0.022947218269109726,
    -0.00015572505071759224,
    -0.010858166962862015,
    -0.01505950465798378,
    -0.0169309526681

In [20]:
response["data"][0]["embedding"]

[4.099189391126856e-05,
 -0.01798599772155285,
 -0.017182154580950737,
 -0.012315132655203342,
 -0.009627281688153744,
 -0.001225390238687396,
 -0.00828335527330637,
 0.013162937015295029,
 -0.02260809764266014,
 -0.0020425787661224604,
 0.03011900931596756,
 0.024404184892773628,
 -0.018940560519695282,
 -0.016843032091856003,
 -0.0020912489853799343,
 0.004553019534796476,
 0.01848839968442917,
 -0.004596979822963476,
 0.021201370283961296,
 0.009696361608803272,
 0.002344020176678896,
 0.009746601805090904,
 -0.0020645589102059603,
 -0.014067260548472404,
 0.010010362602770329,
 0.01754639483988285,
 0.0045090592466294765,
 -0.004983201622962952,
 0.03589663282036781,
 -0.022947218269109726,
 -0.00015572505071759224,
 -0.010858166962862015,
 -0.01505950465798378,
 -0.016930952668190002,
 0.003570195287466049,
 -0.02896348387002945,
 -0.013903980143368244,
 -0.0017536975210532546,
 0.004832480568438768,
 -0.01227745320647955,
 0.012139292433857918,
 0.02115113101899624,
 0.0128238154

In [21]:
len(response["data"][0]["embedding"])

1536

In [22]:
embeddings = [data["embedding"] for data in response["data"]]
embeddings

[[4.099189391126856e-05,
  -0.01798599772155285,
  -0.017182154580950737,
  -0.012315132655203342,
  -0.009627281688153744,
  -0.001225390238687396,
  -0.00828335527330637,
  0.013162937015295029,
  -0.02260809764266014,
  -0.0020425787661224604,
  0.03011900931596756,
  0.024404184892773628,
  -0.018940560519695282,
  -0.016843032091856003,
  -0.0020912489853799343,
  0.004553019534796476,
  0.01848839968442917,
  -0.004596979822963476,
  0.021201370283961296,
  0.009696361608803272,
  0.002344020176678896,
  0.009746601805090904,
  -0.0020645589102059603,
  -0.014067260548472404,
  0.010010362602770329,
  0.01754639483988285,
  0.0045090592466294765,
  -0.004983201622962952,
  0.03589663282036781,
  -0.022947218269109726,
  -0.00015572505071759224,
  -0.010858166962862015,
  -0.01505950465798378,
  -0.016930952668190002,
  0.003570195287466049,
  -0.02896348387002945,
  -0.013903980143368244,
  -0.0017536975210532546,
  0.004832480568438768,
  -0.01227745320647955,
  0.01213929243385

In [23]:
df["embeddings"] = embeddings
df

Unnamed: 0,text,embeddings
0,– 2022 (MMXXII) was a common year starting on...,"[4.099189391126856e-05, -0.01798599772155285, ..."
1,– The year saw the removal of nearly all COVI...,"[-0.010697541758418083, -0.023004746064543724,..."
2,– 2022 was also dominated by wars and armed c...,"[-0.009626180864870548, -0.015301118604838848,..."
3,January 1 – The Regional Comprehensive Econom...,"[-0.0005404727999120951, -0.024158069863915443..."
4,January 2 – Abdalla Hamdok resigns as Prime Mi...,"[-0.015138540416955948, 0.0011573187075555325,..."
...,...,...
180,December 21–December 26 – A major winter storm...,"[-0.024877460673451424, -0.023879770189523697,..."
181,December 24 – 2022 Fijian general election: Th...,"[-0.011605652049183846, -0.009253676049411297,..."
182,December 29 – Brazilian football legend Pelé d...,"[-0.007616951130330563, 0.004034563899040222, ..."
183,December 31 – Former Pope Benedict XVI dies at...,"[0.023607414215803146, 0.0077504320070147514, ..."


In [24]:
df.to_csv("embeddings.csv")

## Step 2

### Finding Relevant Data with Cosine Similarity

In [25]:
import numpy as np
import pandas as pd

df = pd.read_csv("embeddings.csv", index_col=0)
df["embeddings"] = df["embeddings"].apply(eval).apply(np.array)
df

Unnamed: 0,text,embeddings
0,– 2022 (MMXXII) was a common year starting on...,"[4.099189391126856e-05, -0.01798599772155285, ..."
1,– The year saw the removal of nearly all COVI...,"[-0.010697541758418083, -0.023004746064543724,..."
2,– 2022 was also dominated by wars and armed c...,"[-0.009626180864870548, -0.015301118604838848,..."
3,January 1 – The Regional Comprehensive Econom...,"[-0.0005404727999120951, -0.024158069863915443..."
4,January 2 – Abdalla Hamdok resigns as Prime Mi...,"[-0.015138540416955948, 0.0011573187075555325,..."
...,...,...
180,December 21–December 26 – A major winter storm...,"[-0.024877460673451424, -0.023879770189523697,..."
181,December 24 – 2022 Fijian general election: Th...,"[-0.011605652049183846, -0.009253676049411297,..."
182,December 29 – Brazilian football legend Pelé d...,"[-0.007616951130330563, 0.004034563899040222, ..."
183,December 31 – Former Pope Benedict XVI dies at...,"[0.023607414215803146, 0.0077504320070147514, ..."


In [26]:
question = "When did Russia invade Ukraine?"

In [27]:
from openai.embeddings_utils import get_embedding

In [28]:
EMBEDDING_MODEL_NAME = "text-embedding-ada-002"
question_embeddings = get_embedding(question, engine=EMBEDDING_MODEL_NAME)
question_embeddings

[0.0016092117875814438,
 -0.019320106133818626,
 0.0034623160026967525,
 -0.013989541679620743,
 -0.025224534794688225,
 0.0019989614374935627,
 -0.013683481141924858,
 -0.02472718432545662,
 -0.013415677472949028,
 -0.021271245554089546,
 0.02238071709871292,
 0.024612411856651306,
 -0.00902242586016655,
 -0.011834361590445042,
 -0.006341203115880489,
 -0.01049534510821104,
 0.010520849376916885,
 -0.003924595657736063,
 0.0334116667509079,
 -0.018695231527090073,
 -0.01435936614871025,
 -0.016246743500232697,
 0.0033156618010252714,
 0.0013342350721359253,
 -0.014792952686548233,
 0.006765225436538458,
 0.013708986341953278,
 -0.029126813635230064,
 0.015379569493234158,
 -0.01438487134873867,
 -0.011171229183673859,
 -0.022929076105356216,
 -0.02053159847855568,
 -0.016195733100175858,
 -0.03514601290225983,
 -0.032365959137678146,
 0.009137198328971863,
 -0.009392249397933483,
 0.015124518424272537,
 -0.004074438009411097,
 0.009398626163601875,
 0.017904574051499367,
 -0.004600480

In [29]:
from openai.embeddings_utils import distances_from_embeddings

In [30]:
distances = distances_from_embeddings(question_embeddings, df["embeddings"].tolist(), distance_metric="cosine")
distances

[0.2909613892583721,
 0.27673005769757675,
 0.1853606978911877,
 0.28970156573242467,
 0.2616243455596384,
 0.23042806436673502,
 0.21798036234680718,
 0.17785755720157825,
 0.26210795110120444,
 0.26298015395705576,
 0.2869219081741987,
 0.25380893298863894,
 0.26446127216404014,
 0.262418316187513,
 0.23039939745450222,
 0.2729059590211441,
 0.2554111481803091,
 0.24660419296416458,
 0.20029308894592757,
 0.2614078624977876,
 0.25022493536684487,
 0.25328838774444773,
 0.28309850999376207,
 0.12138305461205379,
 0.14701911834980363,
 0.13766803967927976,
 0.13844884944578917,
 0.1798042408273689,
 0.271475278109114,
 0.1259842810702715,
 0.15641151851980217,
 0.19414521627500747,
 0.16554700161521674,
 0.12620714465184235,
 0.10917997938278867,
 0.13501440729875203,
 0.15473786511549736,
 0.1472285357088602,
 0.24189311380847667,
 0.2612882115143508,
 0.21810348076039188,
 0.26763825865411306,
 0.2699269372949805,
 0.1939912210153606,
 0.2546676622891242,
 0.12697175399869598,
 0.235

In [31]:
df["distances"] = distances
df

Unnamed: 0,text,embeddings,distances
0,– 2022 (MMXXII) was a common year starting on...,"[4.099189391126856e-05, -0.01798599772155285, ...",0.290961
1,– The year saw the removal of nearly all COVI...,"[-0.010697541758418083, -0.023004746064543724,...",0.276730
2,– 2022 was also dominated by wars and armed c...,"[-0.009626180864870548, -0.015301118604838848,...",0.185361
3,January 1 – The Regional Comprehensive Econom...,"[-0.0005404727999120951, -0.024158069863915443...",0.289702
4,January 2 – Abdalla Hamdok resigns as Prime Mi...,"[-0.015138540416955948, 0.0011573187075555325,...",0.261624
...,...,...,...
180,December 21–December 26 – A major winter storm...,"[-0.024877460673451424, -0.023879770189523697,...",0.259998
181,December 24 – 2022 Fijian general election: Th...,"[-0.011605652049183846, -0.009253676049411297,...",0.247962
182,December 29 – Brazilian football legend Pelé d...,"[-0.007616951130330563, 0.004034563899040222, ...",0.287856
183,December 31 – Former Pope Benedict XVI dies at...,"[0.023607414215803146, 0.0077504320070147514, ...",0.293258


In [32]:
df.to_csv("distances.csv")

## Shortest Distance

In [33]:
import pandas as pd
df = pd.read_csv("distances.csv", index_col=0)
df

Unnamed: 0,text,embeddings,distances
0,– 2022 (MMXXII) was a common year starting on...,[ 4.09918939e-05 -1.79859977e-02 -1.71821546e-...,0.290961
1,– The year saw the removal of nearly all COVI...,[-0.01069754 -0.02300475 -0.00018612 ... -0.01...,0.276730
2,– 2022 was also dominated by wars and armed c...,[-0.00962618 -0.01530112 0.01076647 ... -0.00...,0.185361
3,January 1 – The Regional Comprehensive Econom...,[-0.00054047 -0.02415807 -0.00532435 ... -0.00...,0.289702
4,January 2 – Abdalla Hamdok resigns as Prime Mi...,[-0.01513854 0.00115732 -0.02068717 ... -0.00...,0.261624
...,...,...,...
180,December 21–December 26 – A major winter storm...,[-0.02487746 -0.02387977 0.00331537 ... 0.00...,0.259998
181,December 24 – 2022 Fijian general election: Th...,[-0.01160565 -0.00925368 -0.02326271 ... -0.00...,0.247962
182,December 29 – Brazilian football legend Pelé d...,[-0.00761695 0.00403456 0.00835572 ... 0.00...,0.287856
183,December 31 – Former Pope Benedict XVI dies at...,[ 0.02360741 0.00775043 -0.01279436 ... 0.00...,0.293258


In [34]:
current_shortest = df.iloc[0]["distances"]
current_shortest_index = 0
current_shortest, current_shortest_index

(0.2909613892583721, 0)

In [35]:
for index, distance in enumerate(df["distances"].values):
    if distance < current_shortest:
        current_shortest = distance
        current_shortest_index = index
current_shortest, current_shortest_index

(0.1091799793827886, 34)

In [36]:
df.iloc[34]["text"]

'March 2 – Russian invasion of Ukraine: Russia captures its first large city, the Black Sea port of Kherson, as shelling intensifies across many parts of Ukraine, including civilian areas.'

In [37]:
df.sort_values(by="distances")

Unnamed: 0,text,embeddings,distances
34,March 2 – Russian invasion of Ukraine: Russia ...,[ 0.00069361 -0.01837528 0.01264494 ... 0.01...,0.109180
56,April 3 – Russian invasion of Ukraine: As Russ...,[-0.01213233 -0.01239847 0.00537484 ... 0.01...,0.111454
163,November 11 – Russian invasion of Ukraine: Ukr...,[-0.01230958 -0.01390749 0.01656015 ... 0.02...,0.115620
135,September 21 – Russian invasion of Ukraine: Fo...,[-0.02564049 -0.02205057 0.013261 ... 0.00...,0.116937
155,October 29 – Russian invasion of Ukraine: In r...,[-0.00990201 -0.03036184 0.01550558 ... 0.01...,0.117836
...,...,...,...
55,March 31 – Expo 2020 closes in Dubai after a 6...,[-0.00312684 -0.04655766 -0.00169539 ... -0.00...,0.292121
183,December 31 – Former Pope Benedict XVI dies at...,[ 0.02360741 0.00775043 -0.01279436 ... 0.00...,0.293258
162,"November 11 – The cryptocurrency exchange FTX,...",[ 0.00279875 -0.02541494 0.00015377 ... 0.00...,0.293906
170,November 20 – 2022 Nepalese general election: ...,[-0.00423679 -0.00072679 -0.00159202 ... -0.00...,0.295495


In [38]:
df.iloc[56]["text"]

"April 3 – Russian invasion of Ukraine: As Russia's forces retreat from areas near Kyiv, it is accused by Ukraine of war crimes, amid mounting evidence of indiscriminate civilian killings, including the Bucha massacre."

In [39]:
df.iloc[163]["text"]

'November 11 – Russian invasion of Ukraine: Ukrainian forces recapture Kherson, the only regional capital to be taken by Russia since the start of the war.'

In [40]:
df.sort_values(by="distances").to_csv("distances_sorted.csv")

## Step 3

### Tokenizing with `tiktoken`

In [41]:
import tiktoken

ModuleNotFoundError: No module named 'tiktoken'

In [None]:
tokenizer = tiktoken.get_encoding("cl100k_base")

In [None]:
tokenizer

In [None]:
tokenizer.encode("This is a question")

In [None]:
question = "When did Russia invade Ukraine?"

In [None]:
tokenizer.encode(question)

In [None]:
len(tokenizer.encode(question))

### Composing a Custom Text Prompt

In [None]:
prompt_template = """
Answer the question based on the context below, and if the question
can't be answered based on the context, say "I don't know"

Context:

{}

---

Question: {}
Answeer:"""

In [None]:
question = "When did Russia invade Ukraine?"

In [None]:
print(prompt_template.format("context", question))

In [None]:
max_token_count = 1000

In [None]:
import tiktoken
tokenizer = tiktoken.get_encoding("cl100k_base")
tokenizer

In [None]:
tokenizer.encode(question)

In [None]:
len(tokenizer.encode(question))

In [None]:
current_token_count = len(tokenizer.encode(prompt_template)) + len(tokenizer.encode(question))
current_token_count

In [None]:
context = []

In [None]:
import pandas as pd
df = pd.read_csv("distances_sorted.csv", index_col=0)
df

In [None]:
for text in df["text"].values:
    text_token_count = len(tokenizer.encode(text))
    current_token_count += text_token_count
    if current_token_count <= max_token_count:
        context.append(text)
    else:
        break

In [None]:
context

In [None]:
print(prompt_template.format(context, question))

In [None]:
print(prompt_template.format("\n\n###\n\n".join(context), question))

In [None]:
import os
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")

In [None]:
prompt = prompt_template.format("\n\n###\n\n".join(context), question)

openai.Completion.create(
    model="gpt-3.5-turbo-instruct",
    prompt=prompt,
)["choices"][0]["text"]

## Step 4

### Getting a Custom Q&A Response with `openai.Completion`

In [None]:
import os
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")

In [None]:
import pandas as pd
df = pd.read_csv("distances_sorted.csv", index_col=0)

In [None]:
import tiktoken

COMPLETION_MODEL_NAME = "gpt-3.5-turbo-instruct"

def answer_question(
    question, df, max_prompt_tokens=1800,max_answer_tokens=150
):
    tokenizer = tiktoken.get_encoding("cl100k_base")
    
    prompt_template =  """
Answer the question based on the context below, and if the question
can't be answered based on the context, say "I don't know"

Context: 

{}

---

Question: {}
Answer:"""
    
    current_token_count = len(tokenizer.encode(prompt_template)) + len(tokenizer.encode(question))
    context = []
    for text in df["text"].values:
        text_token_count = len(tokenizer.encode(text))
        current_token_count += text_token_count
        if current_token_count <= max_prompt_tokens:
            context.append(text)
        else:
            break
    prompt = prompt_template.format("\n\n###\n\n".join(context), question)
    
    try:
        response = openai.Completion.create(
            model=COMPLETION_MODEL_NAME,
            prompt=prompt,
            max_tokens=max_answer_tokens
        )
        return response["choices"][0]["text"]
    except Exception as e:
        print(e)
        return ""

In [None]:
custom_ukraine_answer = answer_question("When did Russia invade Ukraine?", df)
print(custom_ukraine_answer)

In [None]:
custom_twitter_answer = answer_question("Who owns Twitter?", df)
print(custom_twitter_answer)