In [6]:
# Source material: volume 1 of Richard Wagner's autobiography "My Life",
# available here as a plain-text file.

# The `with` block closes the file handle as soon as the text has been read.
with open('my_life_wagner.txt', mode='r', encoding="utf-8") as file:
    full_txt = file.read()

# Preview the first 300 characters as a quick sanity check of the load.
print(full_txt[:300])

﻿MY LIFE

by Richard Wagner



PART I
1813-1842


I was born at Leipzig on the 22nd of May 1813, in a room on the second
floor of the ‘Red and White Lion,’ and two days later was baptized at
St. Thomas’s Church, and christened Wilhelm Richard.

My father, Friedrich Wagner, was at the time of my birt


In [7]:
# Size of the loaded book, in characters.
len(full_txt)

1249453

We divide the input text into chunks and embed each chunk individually. We can then either use the chunk embeddings separately or combine them in some way, such as averaging them (weighted by the size of each chunk).

In [20]:
import tiktoken

# The text-embedding-ada-002 model has a context length of 8191 tokens
# with the cl100k_base encoding; going over that limit causes an error.
# (Written as a comment: a bare string placed after an import is not a
# docstring, just an expression that is evaluated and thrown away.)
EMBEDDING_CTX_LENGTH = 8191
EMBEDDING_ENCODING = 'cl100k_base'

# Tokenize the whole book once to compare its size against the limit.
encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)
len(encoding.encode(full_txt))

286941

We now need to make sure our chunks have no more than 8191 tokens.
Since this is a narrative text, let's see if we can group it by paragraphs and see what happens.

In [21]:
# A blank line (two consecutive newlines) separates paragraphs in the text.
paragraphs = full_txt.split(sep='\n\n')

# Peek at the first ten pieces to see what the split produced.
paragraphs[:10]

['\ufeffMY LIFE',
 'by Richard Wagner',
 '',
 'PART I\n1813-1842',
 '\nI was born at Leipzig on the 22nd of May 1813, in a room on the second\nfloor of the ‘Red and White Lion,’ and two days later was baptized at\nSt. Thomas’s Church, and christened Wilhelm Richard.',
 'My father, Friedrich Wagner, was at the time of my birth a clerk in the\npolice service at Leipzig, and hoped to get the post of Chief Constable\nin that town, but he died in the October of that same year. His death\nwas partly due to the great exertions imposed upon him by the stress of\npolice work during the war troubles and the battle of Leipzig, and\npartly to the fact that he fell a victim to the nervous fever which was\nraging at that time. As regards his father’s position in life, I learnt\nlater that he had held a small civil appointment as toll collector at\nthe Ranstädt Gate, but had distinguished himself from those in the same\nstation by giving his two sons a superior education, my father,\nFriedrich, study

In [22]:
# Keep only the paragraphs that reach a minimum length; this drops the title,
# author line, empty strings and headings seen at the top of the split output.

min_paragraph_chars = 100
filtered_paragraphs = [
    text
    for text in paragraphs
    if min_paragraph_chars <= len(text)
]

filtered_paragraphs[:10]


['\nI was born at Leipzig on the 22nd of May 1813, in a room on the second\nfloor of the ‘Red and White Lion,’ and two days later was baptized at\nSt. Thomas’s Church, and christened Wilhelm Richard.',
 'My father, Friedrich Wagner, was at the time of my birth a clerk in the\npolice service at Leipzig, and hoped to get the post of Chief Constable\nin that town, but he died in the October of that same year. His death\nwas partly due to the great exertions imposed upon him by the stress of\npolice work during the war troubles and the battle of Leipzig, and\npartly to the fact that he fell a victim to the nervous fever which was\nraging at that time. As regards his father’s position in life, I learnt\nlater that he had held a small civil appointment as toll collector at\nthe Ranstädt Gate, but had distinguished himself from those in the same\nstation by giving his two sons a superior education, my father,\nFriedrich, studying law, and the younger son, Adolph, theology.',
 'My uncle subseq

In [24]:
# Drop the notes (marked as [x] in the text) in addition to the short fragments.

# A paragraph is kept only if it contains neither '[' nor ']' and is at least
# `min_paragraph_chars` characters long.

min_paragraph_chars = 100

filtered_paragraphs = []
for paragraph in paragraphs:
    # Any square bracket disqualifies the whole paragraph.
    if '[' in paragraph or ']' in paragraph:
        continue
    # Too short to be narrative text.
    if len(paragraph) < min_paragraph_chars:
        continue
    filtered_paragraphs.append(paragraph)

filtered_paragraphs[:10]

['\nI was born at Leipzig on the 22nd of May 1813, in a room on the second\nfloor of the ‘Red and White Lion,’ and two days later was baptized at\nSt. Thomas’s Church, and christened Wilhelm Richard.',
 'My father, Friedrich Wagner, was at the time of my birth a clerk in the\npolice service at Leipzig, and hoped to get the post of Chief Constable\nin that town, but he died in the October of that same year. His death\nwas partly due to the great exertions imposed upon him by the stress of\npolice work during the war troubles and the battle of Leipzig, and\npartly to the fact that he fell a victim to the nervous fever which was\nraging at that time. As regards his father’s position in life, I learnt\nlater that he had held a small civil appointment as toll collector at\nthe Ranstädt Gate, but had distinguished himself from those in the same\nstation by giving his two sons a superior education, my father,\nFriedrich, studying law, and the younger son, Adolph, theology.',
 'My uncle subseq

In [32]:
# let's write a function that checks whether each chunk has the right token length
# if so, keep it as it is
# if not, split it into N pieces, always keeping sentences whole

import tiktoken

# Maximum number of tokens accepted per embedding input.
EMBEDDING_CTX_LENGTH = 8191
# Name of the tiktoken encoding used to count tokens.
EMBEDDING_ENCODING = 'cl100k_base'

def _split_keeping_separator(text, separator):
    """Split ``text`` on ``separator``, leaving the separator attached to each piece.

    Concatenating the returned pieces reproduces ``text`` exactly, so no
    punctuation or whitespace is lost when the pieces are regrouped later.
    """
    parts = text.split(separator)
    pieces = [part + separator for part in parts[:-1]]
    if parts[-1]:
        pieces.append(parts[-1])
    return pieces


def _pack_within_limit(text, fits, separators):
    """Cut ``text`` into consecutive chunks, each as large as ``fits`` allows.

    Args:
        text: The string to cut up.
        fits: Callable returning True when a candidate chunk is small enough.
        separators: Separators to try in order, coarsest first. When a piece is
            still too large after the last separator, it is cut per character.

    Returns:
        A non-empty list of strings whose concatenation equals ``text``.
    """
    # A single character cannot be divided further, so it is returned as is
    # even if it does not fit; this also guarantees the recursion terminates.
    if fits(text) or len(text) <= 1:
        return [text]

    if separators:
        pieces = _split_keeping_separator(text, separators[0])
        finer_separators = separators[1:]
    else:
        pieces = list(text)
        finer_separators = ()

    chunks = []
    current = ""
    for piece in pieces:
        # Measure the joined candidate rather than summing per-piece counts:
        # tokenizing a concatenation is not guaranteed to give the same count.
        if fits(current + piece):
            current += piece
            continue
        if current:
            chunks.append(current)
            current = ""
            if fits(piece):
                current = piece
                continue
        # The piece alone is too large: cut it with the next finer separator
        # and keep its last fragment open so following pieces can join it.
        sub_chunks = _pack_within_limit(piece, fits, finer_separators)
        chunks.extend(sub_chunks[:-1])
        current = sub_chunks[-1]

    if current:
        chunks.append(current)
    return chunks


def split_paragraphs(paragraphs, max_token_length=None, encoding=None):
    """Return the paragraphs as chunks that each fit within a token limit.

    A paragraph that already fits is kept unchanged. A longer one is cut at
    sentence boundaries ('.'), and the sentences are regrouped greedily into
    the largest chunks that still fit. Each sentence keeps its closing period,
    so joining the chunks of a paragraph gives back the original paragraph.
    A sentence that is too long on its own is cut at line breaks, then at
    spaces, and finally per character.

    Args:
        paragraphs: Iterable of strings to check and, if needed, split.
        max_token_length: Maximum tokens per chunk. Defaults to
            ``EMBEDDING_CTX_LENGTH``.
        encoding: Object with an ``encode(text)`` method returning a sequence
            of tokens. Defaults to ``tiktoken.get_encoding(EMBEDDING_ENCODING)``.

    Returns:
        A list of strings in the original order.

    Raises:
        ValueError: If ``max_token_length`` is smaller than 1.
    """
    if max_token_length is None:
        max_token_length = EMBEDDING_CTX_LENGTH
    if max_token_length < 1:
        raise ValueError("max_token_length must be at least 1")
    if encoding is None:
        encoding = tiktoken.get_encoding(EMBEDDING_ENCODING)

    def fits(text):
        return len(encoding.encode(text)) <= max_token_length

    split_paragraphs_list = []
    for paragraph in paragraphs:
        split_paragraphs_list.extend(
            _pack_within_limit(paragraph, fits, ('.', '\n', ' '))
        )

    return split_paragraphs_list

# The narrative text to chunk: the full book loaded earlier.
narrative_text = full_txt

# Blank lines separate paragraphs.
paragraphs = narrative_text.split('\n\n')

# Keep a paragraph only if it has no '[' or ']' and is at least 100 characters long.
filtered_paragraphs = []
for paragraph in paragraphs:
    if '[' in paragraph or ']' in paragraph:
        continue
    if len(paragraph) < 100:
        continue
    filtered_paragraphs.append(paragraph)

# Break up any paragraph that exceeds the token limit.
split_paragraphs_list = split_paragraphs(filtered_paragraphs)

split_paragraphs_list[:10]

# these chunks are now ready to be embedded XD

['\nI was born at Leipzig on the 22nd of May 1813, in a room on the second\nfloor of the ‘Red and White Lion,’ and two days later was baptized at\nSt. Thomas’s Church, and christened Wilhelm Richard.',
 'My father, Friedrich Wagner, was at the time of my birth a clerk in the\npolice service at Leipzig, and hoped to get the post of Chief Constable\nin that town, but he died in the October of that same year. His death\nwas partly due to the great exertions imposed upon him by the stress of\npolice work during the war troubles and the battle of Leipzig, and\npartly to the fact that he fell a victim to the nervous fever which was\nraging at that time. As regards his father’s position in life, I learnt\nlater that he had held a small civil appointment as toll collector at\nthe Ranstädt Gate, but had distinguished himself from those in the same\nstation by giving his two sons a superior education, my father,\nFriedrich, studying law, and the younger son, Adolph, theology.',
 'My uncle subseq