In [1]:

import os
import pandas as pd
import tiktoken
import openai
import numpy as np
import os
from openai.embeddings_utils import distances_from_embeddings
import time

openai.api_key = os.getenv('OPEN_AI_KEY') #Add your API key here

COMPLETIONS_MODEL = "text-davinci-003"
EMBEDDING_MODEL = "text-embedding-ada-002"

In [2]:
tokenizer = tiktoken.get_encoding("cl100k_base")

df = pd.read_csv('merged_columns_file.csv', index_col=None )
df.columns = ['title','heading', 'content']
df.head()

Unnamed: 0,title,heading,content
0,nan Preface,Usage Survey and Feedback We would love to hea...,nan Preface Usage Survey and Feedback We would...
1,nan Preface,About this Book Editors\n• Kimberly Ernstmeye...,nan Preface About this Book Editors\n• Kimber...
2,nan Preface,Licensing/Terms of Use This textbook is licens...,nan Preface Licensing/Terms of Use This textbo...
3,nan Standards & Conceptual Approach,External Standards American Nurses Association...,nan Standards & Conceptual Approach External S...
4,nan Standards & Conceptual Approach,Conceptual Approach The Open RN Nursing Skills...,nan Standards & Conceptual Approach Conceptual...


In [3]:
df['n_tokens'] = df.content.apply(lambda x: len(tokenizer.encode(x)))
df.head()

Unnamed: 0,title,heading,content,n_tokens
0,nan Preface,Usage Survey and Feedback We would love to hea...,nan Preface Usage Survey and Feedback We would...,59
1,nan Preface,About this Book Editors\n• Kimberly Ernstmeye...,nan Preface About this Book Editors\n• Kimber...,3003
2,nan Preface,Licensing/Terms of Use This textbook is licens...,nan Preface Licensing/Terms of Use This textbo...,713
3,nan Standards & Conceptual Approach,External Standards American Nurses Association...,nan Standards & Conceptual Approach External S...,503
4,nan Standards & Conceptual Approach,Conceptual Approach The Open RN Nursing Skills...,nan Standards & Conceptual Approach Conceptual...,549


In [4]:
max_tokens = 500

# Function to split the text into chunks of a maximum number of tokens
def split_into_many(text, max_tokens = max_tokens):

    # Split the text into sentences
    sentences = text.split('. ')

    # Get the number of tokens for each sentence
    n_tokens = [len(tokenizer.encode(" " + sentence)) for sentence in sentences]
    
    chunks = []
    tokens_so_far = 0
    chunk = []

    # Loop through the sentences and tokens joined together in a tuple
    for sentence, token in zip(sentences, n_tokens):

        # If the number of tokens so far plus the number of tokens in the current sentence is greater 
        # than the max number of tokens, then add the chunk to the list of chunks and reset
        # the chunk and tokens so far
        if tokens_so_far + token > max_tokens:
            chunks.append(". ".join(chunk) + ".")
            chunk = []
            tokens_so_far = 0

        # If the number of tokens in the current sentence is greater than the max number of 
        # tokens, go to the next sentence
        if token > max_tokens:
            continue

        # Otherwise, add the sentence to the chunk and add the number of tokens to the total
        chunk.append(sentence)
        tokens_so_far += token + 1

    return chunks
    

shortened = []

# Loop through the dataframe
for row in df.iterrows():

    # If the text is None, go to the next row
    if row[1]['content'] is None:
        continue

    # If the number of tokens is greater than the max number of tokens, split the text into chunks
    if row[1]['n_tokens'] > max_tokens:
        shortened += split_into_many(row[1]['content'])
    
    # Otherwise, add the text to the list of shortened texts
    else:
        shortened.append( row[1]['content'] )

In [5]:
df['content'] = pd.DataFrame(shortened, columns = ['content'])
df['n_tokens'] = df.content.apply(lambda x: len(tokenizer.encode(x)))
df.head()

Unnamed: 0,title,heading,content,n_tokens
0,nan Preface,Usage Survey and Feedback We would love to hea...,nan Preface Usage Survey and Feedback We would...,59
1,nan Preface,About this Book Editors\n• Kimberly Ernstmeye...,nan Preface About this Book Editors\n• Kimber...,338
2,nan Preface,Licensing/Terms of Use This textbook is licens...,"Julie Teeter, DNP, RN, CNE, Gateway Technical ...",489
3,nan Standards & Conceptual Approach,External Standards American Nurses Association...,"Joseph’s Hospitals\n• Sheri Johnson, UW Popul...",486
4,nan Standards & Conceptual Approach,Conceptual Approach The Open RN Nursing Skills...,"David’s\nSchool of Nursing, Round Rock, TX\n• ...",401


In [6]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    result = openai.Embedding.create(
      model=model,
      input=text
    )
    time.sleep(7)
    return result["data"][0]["embedding"]

In [14]:
df['embeddings'] = df.content.apply(lambda x: get_embedding(x))
df.head()


APIError: Internal server error {
    "error": {
        "message": "Internal server error",
        "type": "auth_subrequest_error",
        "param": null,
        "code": "internal_error"
    }
}
 500 {'error': {'message': 'Internal server error', 'type': 'auth_subrequest_error', 'param': None, 'code': 'internal_error'}} {'Date': 'Tue, 07 Mar 2023 20:09:22 GMT', 'Content-Type': 'application/json; charset=utf-8', 'Content-Length': '166', 'Connection': 'keep-alive', 'Vary': 'Origin', 'X-Request-Id': 'e2f44905b2ff381cf4fb9eae3b281641', 'Strict-Transport-Security': 'max-age=15724800; includeSubDomains'}

In [None]:
df.to_csv('processed/embeddings.csv')

In [None]:
df=pd.read_csv('processed/embeddings.csv', index_col=0)
df.head()
df['embeddings'] = df['embeddings'].apply(eval).apply(np.array)

In [None]:
df.head()

Unnamed: 0,title,heading,content,n_tokens,embeddings
0,nan Preface,Usage Survey and Feedback We would love to hea...,nan Preface Usage Survey and Feedback We would...,59,"[-0.017017267644405365, 0.0022193468175828457,..."
1,nan Preface,About this Book Editors\n• Kimberly Ernstmeye...,nan Preface About this Book Editors\n• Kimber...,338,"[0.010825054720044136, -0.006509614177048206, ..."
2,nan Preface,Licensing/Terms of Use This textbook is licens...,"Julie Teeter, DNP, RN, CNE, Gateway Technical ...",489,"[0.012713366188108921, -0.01143588125705719, 0..."
3,nan Standards & Conceptual Approach,External Standards American Nurses Association...,"Joseph’s Hospitals\n• Sheri Johnson, UW Popul...",486,"[0.004502557683736086, 0.004485967569053173, 0..."
4,nan Standards & Conceptual Approach,Conceptual Approach The Open RN Nursing Skills...,"David’s\nSchool of Nursing, Round Rock, TX\n• ...",401,"[-0.014607994817197323, 0.01296240370720625, 0..."


In [None]:

from openai.embeddings_utils import distances_from_embeddings

In [None]:
def create_context(
    question, df, max_len=1800, size="ada"
):
    """
    Create a context for a question by finding the most similar context from the dataframe
    """

    # Get the embeddings for the question
    q_embeddings = openai.Embedding.create(input=question, engine='text-embedding-ada-002')['data'][0]['embedding']

    # Get the distances from the embeddings
    df['distances'] = distances_from_embeddings(q_embeddings, df['embeddings'].values, distance_metric='cosine')


    returns = []
    cur_len = 0

    # Sort by distance and add the text to the context until the context is too long
    for i, row in df.sort_values('distances', ascending=True).iterrows():
        
        # Add the length of the text to the current length
        cur_len += row['n_tokens'] + 4
        
        # If the context is too long, break
        if cur_len > max_len:
            break
        
        # Else add it to the text that is being returned
        returns.append(row["content"])

    # Return the context
    return "\n\n###\n\n".join(returns)

def answer_question(
    df,
    model="text-davinci-003",
    question="Am I allowed to publish model outputs to Twitter, without a human review?",
    max_len=1800,
    size="ada",
    debug=False,
    max_tokens=150,
    stop_sequence=None
):
    """
    Answer a question based on the most similar context from the dataframe texts
    """
    context = create_context(
        question,
        df,
        max_len=max_len,
        size=size,
    )
    # If debug, print the raw model response
    if debug:
        print("Context:\n" + context)
        print("\n\n")

    try:
        # Create a completions using the questin and context
        response = openai.Completion.create(
            prompt=f"Answer the question based on the context below, and if the question can't be answered based on the context, say \"I don't know\"\n\nContext: {context}\n\n---\n\nQuestion: {question}\nAnswer:",
            temperature=0,
            max_tokens=max_tokens,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=stop_sequence,
            model=model,
        )
        return response["choices"][0]["text"].strip()
    except Exception as e:
        print(e)
        return ""

In [None]:
print(answer_question(df, question="What is Infusion by Pump?"))

Infusion by pump is a type of new technology used to continuously deliver subcutaneous insulin. It works by closely mimicking the body’s normal release of insulin. Insulin doses are delivered through a flexible plastic tube called a catheter. With the aid of a small needle, the catheter is inserted through the skin into the fatty tissue and is taped in place.
