# This notebook creates a jsonl file of text chunks with vector embeddings to upload to a vector database from a directory of .txt files, using batch processing to reduce cost of embedding

### Splitting all documents into a series of chunks saved in a list

In [2]:
import json
import tiktoken
from tqdm import tqdm
import os

def split_into_chunks(directory: str, encoding_name: str, chunk_size: int, overlap: int) -> list:
    all_chunks = []
    files = [f for f in os.listdir(directory) if f.endswith(".md")]
    chunk_id = 0  # Initialize chunk ID counter
    for filename in tqdm(files, desc="Splitting files into chunks"):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            encoding = tiktoken.get_encoding(encoding_name)
            tokens = encoding.encode(content)
            start = 0
            while start < len(tokens):
                end = start + chunk_size
                if end > len(tokens):
                    end = len(tokens)
                chunk_data = {
                    "id": str(chunk_id),  # Assign the current chunk ID
                    "filename": filename,
                    "chunk_start_index": start,
                    "chunk_end_index": end,
                    "raw_string": encoding.decode(tokens[start:end]),
                    "values": [],
                    "sparse_values": []
                }
                all_chunks.append(chunk_data)
                start += (chunk_size - overlap)
                chunk_id += 1  # Increment chunk ID for the next chunk
    return all_chunks

# Usage
directory = "/Users/adamhunter/Documents/misc/actualism_chat/af_knowledge_base2"
chunks = split_into_chunks(directory, "cl100k_base", 800, 400)
print(f"Total chunks created: {len(chunks)}")

Splitting files into chunks: 100%|██████████| 1281/1281 [00:07<00:00, 175.32it/s]

Total chunks created: 29018





### Begin with counting the tokens in your chunks, checking the cost before you embed

In [3]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

def count_tokens_in_chunks(chunks: list, encoding_name: str) -> int:
    total_tokens = 0
    for chunk in tqdm(chunks, desc="Counting tokens in chunks"):
        raw_text = chunk['raw_string']
        total_tokens += num_tokens_from_string(raw_text, encoding_name)
    return total_tokens

# Assuming 'chunks' is the list of chunks created from the previous function
total_tokens = count_tokens_in_chunks(chunks, "cl100k_base")
print(f"Total tokens in chunks: {total_tokens}")


Counting tokens in chunks: 100%|██████████| 29018/29018 [00:10<00:00, 2827.94it/s]

Total tokens in chunks: 22215464





In [4]:
def calculate_embedding_cost(num_tokens: int, price_per_million_tokens: float, batch_processing: bool = False) -> float:
    """Calculate the cost of embedding a given number of tokens, with an option for batch processing.
    
    Args:
    num_tokens (int): The number of tokens to be embedded.
    price_per_million_tokens (float): The price per million tokens for embedding.
    batch_processing (bool): If True, applies a discount for batch processing.
    
    Returns:
    float: The total cost of embedding the tokens.
    """
    if batch_processing:
        price_per_million_tokens /= 2
    cost = (num_tokens / 1_000_000) * price_per_million_tokens
    return cost

# Price per million tokens for text-embedding-3-large
price_per_million_tokens = 0.13

# Calculate the embedding cost for the total tokens with batch processing option
batch_processing = True  # Set to True to apply batch processing discount
embedding_cost = calculate_embedding_cost(total_tokens, price_per_million_tokens, batch_processing = False)
print(f"Cost to embed {total_tokens} tokens: ${embedding_cost:.2f}")


Cost to embed 22215464 tokens: $2.89


### Check a sample chunk

In [5]:
chunks[1]

{'id': '1',
 'filename': 'richard---abditorium---psyche.md',
 'chunk_start_index': 400,
 'chunk_end_index': 1200,
 'raw_string': ' a princess loved by Cupid; 2. soul (the immaterial essence,  animating principle, or actuating cause of an individual life; the spiritual principle embodied in human beings, all rational and spiritual  beings, or the universe); self. *(© 1994 Merriam-Webster, Inc).*  **Psychotherapy:**  \x95 [Dictionary Definition]: \x91psychotherapy: the treatment of disorders of emotion or personality by psychological methods;  formerly, the treatment of disease by psychic or hypnotic influence; psychotherapist: a specialist in or practitioner of psychotherapy\x92. *(Oxford Dictionary).*    ---   **Psychic Network:** The following is the specific sense in which I use the adjective psychic in terms such as \x91psychic network\x92 and \x91psychic  currents\x92/\x91psychic energies\x92. Vis.:  \x95 psychic (adj.): of or pertaining to the human mind or psyche. *(Oxford Dictio

### Fitting a sparse vector encoder on the knowledge base

In [26]:
from pinecone_text.sparse import BM25Encoder
import pickle

corpus = [chunk['raw_string'] for chunk in chunks]

# Assuming bm25 is your pre-fit BM25Encoder object
bm25 = BM25Encoder()
bm25.fit(corpus)  # Fit the encoder on your corpus

# Save the fitted encoder to a file
with open('bm25_encoder.pkl', 'wb') as f:
    pickle.dump(bm25, f)


  0%|          | 0/29018 [00:00<?, ?it/s]

In [20]:
for chunk in chunks:
    raw_text = chunk['raw_string']
    # Encode the document as a sparse vector
    doc_sparse_vector = bm25.encode_documents([raw_text])  # Ensure raw_text is passed as a list if required

    # Since doc_sparse_vector is already in the correct format, directly add it to the chunk
    chunk['sparse_values'] = doc_sparse_vector

# Now each chunk in `chunks` has a new key 'sparse_values' containing the sparse embedding vector

In [21]:
import json

# Save chunks to a JSONL file for backup
with open('chunks_backup.jsonl', 'w') as file:
    for chunk in chunks:
        json_record = json.dumps(chunk)
        file.write(json_record + '\n')


In [24]:
from openai import OpenAI
from tqdm import tqdm

# Initialize the OpenAI client with your API key
client = OpenAI()

def to_dense_vector_openAI(text, client, model):
    """Generate a dense vector for a given text using OpenAI's embedding model."""
    response = client.embeddings.create(
        model=model,
        input=text,
        encoding_format="float",  # Using float for direct use in Python
        dimensions=1024
    )
    # Extract the dense vector values from the response
    return response.data[0].embedding  # Assuming single text input for simplicity

model = "text-embedding-3-small"  

# Iterate over each chunk and generate a dense embedding only if 'values' is empty
for chunk in tqdm(chunks, desc="Generating dense embeddings"):
    if not chunk['values']:  # Check if 'values' is empty
        raw_text = chunk['raw_string']
        dense_vector = to_dense_vector_openAI(raw_text, client, model)
        chunk['values'] = dense_vector

# Now each chunk in `chunks` that had an empty 'values' list has a new key 'values' containing the dense embedding vector


Generating dense embeddings: 100%|██████████| 29018/29018 [2:39:40<00:00,  3.03it/s]  


In [25]:
# Save chunks with dense embeddings to a JSONL file
with open('embedded_chunks.jsonl', 'w') as file:
    for chunk in chunks:
        json_record = json.dumps(chunk)
        file.write(json_record + '\n')
