In [55]:
import os
from dotenv import load_dotenv, find_dotenv
import tiktoken
import openai
from langchain.schema import Document
from langchain.document_transformers import DoctranTextTranslator
from langchain.llms import OpenAI
from langchain.callbacks import get_openai_callback
from langchain.text_splitter import MarkdownHeaderTextSplitter
import nest_asyncio

a = load_dotenv(find_dotenv())  # read local .env file
opena_api_key = os.environ['OPENAI_API_KEY']

In [19]:
def num_tokens_from_content(content, model:str = "gpt-3.5-turbo") -> int:
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        encoding = tiktoken.get_encoding("cl100k_base")
    num_tokens = len(encoding.encode(content))
    return num_tokens

# Read file

In [63]:
source_txt_filepath = "../docs/EN_Toward an Ontology for Third Generation Systems Thinking.txt"

with open(source_txt_filepath, "r") as f:
    source_text = f.read()

filesize = len(source_text)
wordssize = len(source_text.split("\n"))
number_of_tokens = num_tokens_from_content(source_text)

print(f"Filesize:\n{filesize} bytes\n{wordssize} words\nNumber of tokens (GPT3.5): {number_of_tokens}")

Filesize:
42830 bytes
599 words
Number of tokens (GPT3.5): 9541


# Split file into chunks

In [41]:
headers_to_split_on = [
    ("#", "Header 1"),
    # ("##", "Header 2"),
    # ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)
md_header_splits = markdown_splitter.split_text(source_text)

print(f"Splits into {len(md_header_splits)} chunks")

for item in md_header_splits:
    size = len(item.page_content.split("\n"))
    number_of_tokens = num_tokens_from_content(item.page_content)

    print(f"{item.metadata} - Chunk size: {size} bytes. Tokens (GPT3.5): {number_of_tokens}")



Splits into 10 chunks
{} - Chunk size: 10 bytes. Tokens (GPT3.5): 105
{'Header 1': '1. Introduction'} - Chunk size: 26 bytes. Tokens (GPT3.5): 363
{'Header 1': '2. Systems Ontology and Epistemology'} - Chunk size: 55 bytes. Tokens (GPT3.5): 815
{'Header 1': '3. The first-generation approach'} - Chunk size: 49 bytes. Tokens (GPT3.5): 747
{'Header 1': '4. The second-generation approach'} - Chunk size: 52 bytes. Tokens (GPT3.5): 839
{'Header 1': '5. Continuous development'} - Chunk size: 73 bytes. Tokens (GPT3.5): 1136
{'Header 1': '6. Scaleless descriptions of physical systems'} - Chunk size: 65 bytes. Tokens (GPT3.5): 998
{'Header 1': '7. Constructivism'} - Chunk size: 97 bytes. Tokens (GPT3.5): 1496
{'Header 1': '8. Thermodynamics of system evolution'} - Chunk size: 42 bytes. Tokens (GPT3.5): 664
{'Header 1': '9. Conclusion'} - Chunk size: 121 bytes. Tokens (GPT3.5): 2300


In [59]:
nest_asyncio.apply()

# documents = [Document(page_content=source_text)]
documents = [md_header_splits[0]]

doc_translator = DoctranTextTranslator(
        openai_api_key=opena_api_key, 
        language='russian',
        # openai_api_model="gpt-3.5-turbo-16k"
        openai_api_model="gpt-3.5-turbo"
        )

# translated_document = await doc_translator.atransform_documents(documents)

In [None]:
loop = asncio.get_event_loop()

In [60]:
await doc_translator.atransform_documents(documents)

TypeError: object Document can't be used in 'await' expression

In [46]:
translated_document

NameError: name 'translated_document' is not defined

In [None]:
dest_txt_filepath = "../docs/RU_Toward an Ontology for Third Generation Systems Thinking.txt"

with open(dest_txt_filepath, "w") as f:
    f.write(translated_document)

In [35]:
for item in md_header_splits:
    filesize = len(item.page_content.split("\n"))
    number_of_tokens = num_tokens_from_content(item.page_content)

    print(item.metadata)
    print(f"Filesize: {filesize} bytes\nNumber of tokens (GPT3.5): {number_of_tokens}")

{}
Filesize: 10 bytes
Number of tokens (GPT3.5): 105
{'Header 1': '1. Introduction'}
Filesize: 26 bytes
Number of tokens (GPT3.5): 363
{'Header 1': '2. Systems Ontology and Epistemology'}
Filesize: 55 bytes
Number of tokens (GPT3.5): 815
{'Header 1': '3. The first-generation approach'}
Filesize: 49 bytes
Number of tokens (GPT3.5): 747
{'Header 1': '4. The second-generation approach'}
Filesize: 52 bytes
Number of tokens (GPT3.5): 839
{'Header 1': '5. Continuous development'}
Filesize: 73 bytes
Number of tokens (GPT3.5): 1136
{'Header 1': '6. Scaleless descriptions of physical systems'}
Filesize: 65 bytes
Number of tokens (GPT3.5): 998
{'Header 1': '7. Constructivism'}
Filesize: 97 bytes
Number of tokens (GPT3.5): 1496
{'Header 1': '8. Thermodynamics of system evolution'}
Filesize: 42 bytes
Number of tokens (GPT3.5): 664
{'Header 1': '9. Conclusion'}
Filesize: 121 bytes
Number of tokens (GPT3.5): 2300


In [28]:
type(md_header_splits[2])

langchain.schema.document.Document

In [29]:
a = md_header_splits[2]

In [33]:
a.page_content

'An ontology is a way to express what a subject area represents by defining\na set of concepts and categories as well as how the concepts and categories are\nrelated to each other. An ontology should indicate the objects that would be good\nto distinguish in the world for reliable active/embodied inference within the\nsubject area [2]. We formulate the task of creating an ontology in terms of\nattention management, and вЂњexplicit formal specificationвЂќ which here indicates\nthat these objects of attention are not spontaneously singled out but according to\nsome explicit model as ontology specified by using another model as its formalism\n(such as one of the foundational ontologies).\nWe resolve the issue of creating an ontology according to Popperian\nepistemology [3]: objects in an ontology appear by guessing, the acceptability of\nthese guesses for judgments about the world is questioned, but guesses that\nsurvive criticism are вЂњtaken seriously.вЂќ Good guesses about how the worl

In [34]:
a.metadata

{'Header 1': '2. Systems Ontology and Epistemology'}