In [7]:
# i will use the same txt as in the "processing large documents" notebook

import tiktoken
import pandas as pd
from openai.embeddings_utils import get_embedding, cosine_similarity


In [8]:
# Open the file in read mode
with open('my_life_wagner.txt', 'r', encoding="utf-8") as file:
    # Read the contents of the file into a variable
    full_txt = file.read()



EMBEDDING_CTX_LENGTH = 8000  # the maximum for text-embedding-ada-002 is 8191
EMBEDDING_ENCODING = 'cl100k_base'

def split_paragraphs(paragraphs):
    encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
    max_token_length = EMBEDDING_CTX_LENGTH

    split_paragraphs_list = []

    for paragraph in paragraphs:
        paragraph_tokens = encoding.encode(paragraph)
        
        if len(paragraph_tokens) <= max_token_length:
            split_paragraphs_list.append(paragraph)
        else:
            sentences = paragraph.split('.')
            current_chunk = ""
            current_chunk_tokens = []
            
            for sentence in sentences:
                sentence_tokens = encoding.encode(sentence)
                
                if len(current_chunk_tokens) + len(sentence_tokens) <= max_token_length:
                    if current_chunk:
                        current_chunk += '.'
                        current_chunk_tokens.append(encoding.encode('.'))
                    current_chunk += sentence
                    current_chunk_tokens.extend(sentence_tokens)
                else:
                    split_paragraphs_list.append(current_chunk)
                    current_chunk = sentence
                    current_chunk_tokens = sentence_tokens
            
            if current_chunk:
                split_paragraphs_list.append(current_chunk)
    
    return split_paragraphs_list

# Example narrative text
narrative_text = full_txt

# Split the text into paragraphs using empty lines as separators
paragraphs = narrative_text.split('\n\n')

# Filter paragraphs with 100 characters or more and no [ and ] characters
filtered_paragraphs = []
for paragraph in paragraphs:
    if '[' not in paragraph and ']' not in paragraph:
        if len(paragraph) >= 100:
            filtered_paragraphs.append(paragraph)

# Split paragraphs based on token length
split_paragraphs_list = split_paragraphs(filtered_paragraphs)

print (len(filtered_paragraphs), len(split_paragraphs_list))


941 941


In [9]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

df = pd.DataFrame(split_paragraphs_list, columns=["doc"])
df["embedding"] = df.doc.apply(lambda x: get_embedding(x, engine=embedding_model))
df.head()

Unnamed: 0,doc,embedding
0,\nI was born at Leipzig on the 22nd of May 181...,"[-0.006653045304119587, 0.0030317874625325203,..."
1,"My father, Friedrich Wagner, was at the time o...","[0.016334692016243935, 0.006687802262604237, -..."
2,My uncle subsequently exercised no small influ...,"[-0.012458647601306438, -0.0385390929877758, 0..."
3,"\nHow deeply the homeless artist, hard pressed...","[0.010684369131922722, -0.0035003472585231066,..."
4,In this onerous undertaking he was favoured by...,"[-0.006251770071685314, -0.0062179770320653915..."


In [10]:
# Calculate total number of tokens
total_tokens = sum(df.doc.apply(lambda x: len(tiktoken.get_encoding(EMBEDDING_ENCODING).encode(x))))

# Calculate the cost based on cost rate
cost_per_1000_tokens = 0.0001
cost = (total_tokens / 1000) * cost_per_1000_tokens
cost

0.027874000000000003

In [12]:
df.to_csv("my_life_embeddings.csv")

In [14]:
def find_closest_doc(q_embedding, df, n=3):
    
    # create another column in the df and store the similarity
    df["similarity"] = df.embedding.apply(lambda x: cosine_similarity(x, q_embedding))
    
    # now we sort the df based on the new created column 
    results = df.sort_values("similarity", ascending = False).head(n)["doc"].tolist()
 
    return results

    
# now we put all in one cell to play around 

while True:
    question = input("Your query:")
    q_embedding = get_embedding(question, engine=embedding_model)
    print ("\n")
    for result in find_closest_doc(q_embedding, df, 3):
        print (result,"\n")

Your query:where he speaks of dogs and other small animals


The sense of contentment involuntarily aroused by our passage through
the fruitful Courland in the luxuriant month of July, and by the sweet
illusion that now at last I had cut myself loose from a hateful
existence, to enter upon a new and boundless path of fortune, was
disturbed from its very outset by the miserable inconveniences
occasioned by the presence of a huge Newfoundland dog called Robber.
This beautiful creature, originally the property of a Riga merchant,
had, contrary to the nature of his race, become devotedly attached to
me. After I had left Riga, and during my long stay in Mitau, Robber
incessantly besieged my empty house, and so touched the hearts of my
landlord and the neighbours by his fidelity, that they sent the dog
after me by the conductor of the coach to Mitau, where I greeted him
with genuine effusion, and swore that, in spite of all difficulties, I
would never part with him again. Whatever might happ

Your query:mathematics 


Luckily these fantastic humours, merging from the gruesome into the
mawkish, were counteracted and balanced by more serious influences
undergone at school at the hands of my teachers and schoolfellows. Even
there, it was chiefly the weird that aroused my keenest interest. I can
hardly judge whether I had what would be called a good head for study.
I think that, in general, what I really liked I was soon able to grasp
without much effort, whereas I hardly exerted myself at all in the
study of subjects that were uncongenial. This characteristic was most
marked in regard to arithmetic and, later on, mathematics. In neither
of these subjects did I ever succeed in bringing my mind seriously to
bear upon the tasks that were set me. In the matter of the Classics,
too, I paid only just as much attention as was absolutely necessary to
enable me to get a grasp of them; for I was stimulated by the desire to
reproduce them to myself dramatically. In this way Greek particu

Your query:artificial intelligence


During my walks, which I now took absolutely alone, I thought ever more
deeply—and much to the relief of my mind—over my ideas concerning that
state of human society for which the boldest hopes and efforts of the
socialists and communists, then busily engaged in constructing their
system, offered me but the roughest foundation. These efforts could
begin to have some meaning and value for me only when they had attained
to that political revolution and reconstruction which they aimed at;
for it was only then that I, in my turn, could start my reforms in art. 

On this occasion I had listened to the conversation of these two men on
philosophy and philosophers, which made a tremendous impression on me.
I remember that Weiss was an absent-minded man, with a hasty and abrupt
manner of speaking; he had an interesting and pensive expression which
impressed me immensely. I recollect how, on being accused of a want of
clearness in his writing and style, he ju

KeyboardInterrupt: Interrupted by user