In [None]:
def open_file(filepath):
    """
    Read a text file and return its full contents as a string.

    The file is decoded as UTF-8; bytes that cannot be decoded are dropped
    (``errors="ignore"``) instead of raising.

    Args:
        filepath: Path of the file to read.

    Returns:
        str: The decoded contents of the file.
    """
    with open(filepath, "r", encoding="utf-8", errors="ignore") as handle:
        contents = handle.read()
    return contents
    
def save_file(filepath, content):
    """
    Write ``content`` to ``filepath`` as UTF-8 text.

    The file is opened in ``'w'`` mode, so an existing file at that path is
    overwritten.

    Args:
        filepath: Destination path.
        content: Text to write.
    """
    with open(filepath, 'w', encoding='utf-8') as handle:
        handle.write(content)

In [None]:
import tiktoken

def count_openai_tokens(input_string: str, model: str = "gpt-4o") -> int:
    """
    Calculate the total number of OpenAI tokens a string will consume.

    Args:
        input_string (str): The string to tokenize.
        model (str): The OpenAI model whose tokenizer is used.
                     Defaults to "gpt-4o".

    Returns:
        int: The total number of tokens consumed by the string.
    """
    # Load the tokenizer for the specified model
    encoding = tiktoken.encoding_for_model(model)

    # Disable the special-token check so that text which happens to contain a
    # marker such as "<|endoftext|>" is tokenized as ordinary text instead of
    # raising ValueError. This matches how tiktoken_len measures chunk length.
    tokens = encoding.encode(input_string, disallowed_special=())

    # Return the token count
    return len(tokens)

In [None]:
# Spot check: read a single file (presumably one chunk produced by an earlier
# split run - confirm) and display its token count under the default model.
text = open_file("../docs/truein/split/truein.txt_Part_105")
# Last expression of the cell, so the notebook displays the returned count.
count_openai_tokens(text)

In [None]:
from openai import OpenAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
import uuid

# Shared OpenAI client used by get_embedding.
# NOTE(review): no credentials are passed here; presumably they are picked up
# from the environment - confirm before running on a fresh machine.
client = OpenAI()

# Load the gpt-4o tokenizer; tiktoken_len uses it to measure chunk length.
# `tiktoken` is imported in an earlier cell, so that cell must be run first.
encoding = tiktoken.encoding_for_model("gpt-4o")

def get_embedding(text, model="text-embedding-3-small"):
    """
    Request an embedding for ``text`` through the module-level OpenAI ``client``.

    Args:
        text: The text to embed. Newlines are replaced with spaces before
            the request is sent.
        model: Name of the embedding model passed to the API.

    Returns:
        The ``embedding`` attribute of the first item in the response data.
    """
    # Flatten the text onto a single line before embedding it.
    single_line = text.replace("\n", " ")
    response = client.embeddings.create(input=[single_line], model=model)
    return response.data[0].embedding

# create the length function
def tiktoken_len(text):
    """
    Return the number of tokens the module-level ``encoding`` produces for ``text``.

    The special-token check is turned off (``disallowed_special=()``), so
    special-token markers appearing in the text are encoded as ordinary text
    rather than rejected.
    """
    return len(encoding.encode(text, disallowed_special=()))

# Shared splitter used by the split_file_to_* functions. Chunk size and overlap
# are measured in tokens (via tiktoken_len), not characters.
# NOTE(review): tiktoken_len counts with the gpt-4o tokenizer, while the limit
# quoted below is for text-embedding-3-small - confirm the two tokenizers agree
# closely enough that 7500 leaves sufficient headroom.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=7500, # the maximum for text-embedding-3-small is 8191
    chunk_overlap=100,  # tokens shared between consecutive chunks
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""],  # tried in order: paragraph, line, word, character
)

# Each split document needs its own id; a random UUID provides one.
def get_a_uuid():
    """Return a freshly generated random (version 4) UUID as a string."""
    new_id = uuid.uuid4()
    return str(new_id)

In [None]:
import os
import pandas as pd

def split_file_to_df(input_dir: str, category: str, output_dir: str) -> pd.DataFrame:
    """
    Split every ``.txt`` file in ``input_dir`` into chunks, embed each chunk,
    save each chunk to ``output_dir`` and collect the results in a DataFrame.

    Chunking is done with the module-level ``text_splitter``. Parts are
    numbered from 1, and the same number is used in the row's ``title`` and in
    the saved file name (``<name>_Part_<n>.txt``) so the two can be matched.

    Args:
        input_dir (str): Directory scanned (non-recursively) for ``.txt`` files.
        category (str): Value stored in the ``category`` column of every row.
        output_dir (str): Directory the chunk files are written to. It is
            created if it does not exist.

    Returns:
        pd.DataFrame: One row per chunk with the columns ``id``, ``title``,
        ``content``, ``sourcefile``, ``content_tokens``, ``category`` and
        ``contentVector``.
    """
    # save_file does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    records = []

    # Sorted so the row order is reproducible from one run to the next.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".txt"):
            continue

        # Read the whole file first so it is not held open during the
        # (potentially slow) embedding requests below.
        with open(os.path.join(input_dir, filename), "r", encoding='UTF-8') as f:
            text = f.read()

        stem = filename[:-4]  # remove the .txt extension from the filename
        documents = text_splitter.create_documents([text])

        for part_number, document in enumerate(documents, start=1):
            part_name = f"{stem}_Part_{part_number}"
            content = document.page_content
            records.append({
                "id": get_a_uuid(),  # generate a random uuid for the document
                "title": part_name,  # same name as the chunk file, minus ".txt"
                "content": content,
                "sourcefile": filename,
                "content_tokens": count_openai_tokens(content, "gpt-4o"),
                "category": category,
                "contentVector": get_embedding(content),
            })
            save_file(os.path.join(output_dir, f"{part_name}.txt"), content)

    df = pd.DataFrame(records)

    # Get the total number of rows
    print("Total number of rows:", len(df))

    return df

In [None]:
import os
import pandas as pd

def split_file_to_files(input_dir: str, output_dir: str):
    """
    Split every ``.txt`` file in ``input_dir`` into chunks and write each chunk
    to ``output_dir`` as ``<name>_Part_<n>.txt`` (``n`` starts at 1).

    Chunking is done with the module-level ``text_splitter``. The total number
    of chunks written is printed when the run finishes.

    Args:
        input_dir (str): Directory scanned (non-recursively) for ``.txt`` files.
        output_dir (str): Directory the chunk files are written to. It is
            created if it does not exist.
    """
    # save_file does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    total_chunks = 0

    # Sorted so files are processed in a reproducible order.
    for filename in sorted(os.listdir(input_dir)):
        if not filename.endswith(".txt"):
            continue

        # Read the whole file up front so it is closed before chunks are written.
        with open(os.path.join(input_dir, filename), "r", encoding='UTF-8') as f:
            text = f.read()

        stem = filename[:-4]  # remove the .txt extension from the filename
        # create chunks of smaller documents based on text_splitter parameters
        documents = text_splitter.create_documents([text])

        for part_number, document in enumerate(documents, start=1):
            save_file(
                os.path.join(output_dir, f"{stem}_Part_{part_number}.txt"),
                document.page_content,
            )

        total_chunks += len(documents)

    # Report the total number of chunks written
    print("Total number of rows:", total_chunks)

In [None]:
# Split every .txt file directly under ../docs/northside/ into chunk files
# written to ../docs/northside/split/.
split_file_to_files("../docs/northside/", "../docs/northside/split/")