In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to the project sources are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from getpass import getpass

from docu_bot.document_loaders.git_document_loader import GitDocumentLoader
from docu_bot.document_loaders.zip_document_loader import ZipDocumentLoader
from docu_bot.document_loaders.utils import LoadedRepositoriesAndFiles
from docu_bot.retrievals.document_retrival import DocumentRetrieval
from docu_bot.constants import PROMPTS
from docu_bot.stores.utils import LoadedVectorStores, create_vector_store_from_document_loader
from docu_bot.stores.docstore import DocumentStore
from docu_bot.stores.vectorstore import MultiVectorStore
from docu_bot.chat.answer_openai import prepare_retriever, get_documents, rag, RETRIEVAL_TYPES

In [None]:
# Ask for the key interactively so it is never hard-coded in the notebook source.
API_KEY = getpass(prompt="Enter your OpenAI API key: ")

### Test document loading

In [3]:
# Cache object handed to both loaders exercised in this section.
cache = LoadedRepositoriesAndFiles()

# Loader pointed at the `main` branch of the GPT-Czech-Poet repository.
document_loader = GitDocumentLoader(
    repo_path="https://github.com/jinymusim/GPT-Czech-Poet.git",
    branch="main",
    loaded_repositories_and_files=cache,
)
documents = document_loader.load()

# Show only the first ten entries to keep the cell output manageable.
documents[:10]

[Document(metadata={'ItemId': 'https://github.com/jinymusim/GPT-Czech-Poet/blob/main/co_dal.txt', 'source': 'git'}, page_content='Do dat fonetika (Do radku)\n\nEarly stopping na validation loss. + Pridat test set\n\nKdybychom delali dalsi sloky, tak spojit Unicode a Syllable tokenizer (Dohromady pod 28 000 tokenu) \na kodovat Syllable s posunem o len(Unicode). Kontext Syllable tokenizrem, aktualni sloka Unicodem.'),
 Document(metadata={'ItemId': 'https://github.com/jinymusim/GPT-Czech-Poet/blob/main/README.md', 'source': 'git'}, page_content='# GPT Czech Poet\nLearning models for Czech Poet Generation\n\n## Getting training data\nTraining data needs to be separately downloaded from github project `https://github.com/versotym/corpusCzechVerse`  \nThis project by Institute of Czech Literature, Czech Academy of Sciences, incorporates 1~305 books of poetry.  \nFor use, all modeling scripts automatically converts data to needed format.  \n\n## GPT Models\nFor learning Czech GPT Models use s

In [4]:
# NOTE(review): `temp_file` is an absolute, machine-specific Windows path, so this
# cell only runs on the original author's machine. Consider a path derived from
# the repository location instead.
zip_document_loader = ZipDocumentLoader(
    temp_file=r"C:\Users\qwert\Git\Docu-Bot\requirements.txt",
    loaded_repositories_and_files=cache,
)
zip_document = zip_document_loader.load()

# Display what the loader returned.
zip_document

[Document(metadata={'ItemId': 'c:\\users\\qwert\\git\\docu-bot\\data\\document_cache\\requirements\\requirements.txt', 'source': 'user'}, page_content="#\n# This file is autogenerated by pip-compile with Python 3.11\n# by the following command:\n#\n#    pip-compile '.\\requirements.in'\n#\naiofiles==23.2.1\n    # via gradio\naiohappyeyeballs==2.4.3\n    # via aiohttp\naiohttp==3.10.10\n    # via\n    #   langchain\n    #   langchain-community\naiosignal==1.3.1\n    # via aiohttp\nannotated-types==0.7.0\n    # via pydantic\nanyio==4.6.2.post1\n    # via\n    #   gradio\n    #   httpx\n    #   openai\n    #   starlette\n    #   watchfiles\nasgiref==3.8.1\n    # via opentelemetry-instrumentation-asgi\nattrs==24.2.0\n    # via aiohttp\nbackoff==2.2.1\n    # via posthog\nbcrypt==4.2.0\n    # via chromadb\nbuild==1.2.2.post1\n    # via chromadb\ncachetools==5.5.0\n    # via google-auth\ncertifi==2024.8.30\n    # via\n    #   httpcore\n    #   httpx\n    #   kubernetes\n    #   requests\nchar

### Test document vectorization and retrieval

In [5]:
# Fresh store and cache instances for the vectorization/retrieval test.
# `cache` is rebound here, replacing the instance created in the
# document-loading section above.
vectorstores = LoadedVectorStores()
docstore = DocumentStore()
cache = LoadedRepositoriesAndFiles()

In [6]:
# Loader for the `main` branch of the GPT-Czech-Poet repository, bound to the
# cache created for this section.
document_loader = GitDocumentLoader(
    repo_path="https://github.com/jinymusim/GPT-Czech-Poet.git",
    branch="main",
    loaded_repositories_and_files=cache,
)

# Build a vector store from the loader, handing in the shared `vectorstores`
# registry and `docstore`.
chroma_vector = create_vector_store_from_document_loader(
    document_loader=document_loader,
    vectorstores=vectorstores,
    docstore=docstore,
)

# NOTE(review): this reaches into the private `_vectorstores` attribute; prefer a
# public accessor on LoadedVectorStores if one exists.
multi_vector_store = MultiVectorStore(
    chroma_vectors=list(vectorstores._vectorstores.values()),
)

# Retriever over the combined vector store, configured with min_score=0.1 and k=5.
document_retrieval = DocumentRetrieval(
    vectorstore=multi_vector_store,
    docstore=docstore,
    search_kwargs={"min_score": 0.1, "k": 5},
)

In [7]:
# Smoke-test the retriever with a sample question and display the returned documents.
# NOTE(review): this calls the underscore-prefixed method directly and passes
# run_manager=None; if DocumentRetrieval exposes a public query method, prefer
# it here — TODO confirm.
document_retrieval._get_relevant_documents("What is Czech Poet?", run_manager=None)

[Document(metadata={'ItemId': 'https://github.com/jinymusim/GPT-Czech-Poet/blob/main/README.md', 'source': 'git', 'doc_id': '5bc8f45a-8190-4905-9452-e15d5587f135', 'sub_docs': [Document(metadata={'ItemId': 'https://github.com/jinymusim/GPT-Czech-Poet/blob/main/README.md', 'doc_id': '5bc8f45a-8190-4905-9452-e15d5587f135', 'source': 'git', 'score': 0.26036688131935415}, page_content='parser.add_argument("--prompt_ending", default=True, type=bool, help="Ending of Verse is prompted into training data")')]}, page_content='parser.add_argument("--prompt_ending", default=True, type=bool, help="Ending of Verse is prompted into training data")'),
 Document(metadata={'ItemId': 'https://github.com/jinymusim/GPT-Czech-Poet/blob/main/PoetGen\\corpusCzechVerse\\README.md', 'source': 'git', 'doc_id': '0d125fb5-515c-42c3-abe2-e1d317292d57', 'sub_docs': [Document(metadata={'ItemId': 'https://github.com/jinymusim/GPT-Czech-Poet/blob/main/PoetGen\\corpusCzechVerse\\README.md', 'doc_id': '0d125fb5-515c-42c

In [8]:
# Save the document store. Presumably this persists it so that a later
# DocumentStore() can pick the documents up again — TODO confirm.
docstore.save()

### Test Question Answering

In [9]:
# Re-create the stores and the cache for the question-answering test.
# NOTE(review): nothing is loaded into these new instances in this section, yet
# the cells below read cached repositories from `cache`. Presumably the
# constructors restore previously saved state — confirm, since otherwise the
# following cells fail on a clean checkout.
vectorstores = LoadedVectorStores()
docstore = DocumentStore()
cache = LoadedRepositoriesAndFiles()

In [None]:
# Question answering without an explicit `retrieval_type`.
# The question and model name are defined once so that the retriever, the
# retrieval call and `rag` cannot drift apart.
question = "What is Czech Poet?"
model_type = "Llama-3.2-1B"

# `cache` was re-created for this section, so it may know no repository at all.
# Fail with an actionable message instead of a bare IndexError on `[0]`.
repositories = cache.get_cached_repositories()
if not repositories:
    raise RuntimeError(
        "No cached repositories found; load a repository "
        "(see the vectorization section above) before running this cell."
    )

retriver = prepare_retriever(
    long_branches=cache.get_cached_repo_branches(repositories[0]),
    zip_files=[],
    docstore=docstore,
    loaded_vectorstores=vectorstores,
    model_type=model_type,
    api_key=API_KEY,
    search_kwargs={"min_score": 0.5, "k": 10},
)
documents = get_documents(question, retriver)

# Last expression of the cell: the conversation returned by `rag` is displayed.
rag(
    [{"role": "user", "content": question}],
    documents,
    model_type,
    API_KEY,
    prompt=PROMPTS.INPUT_PROMPT,
)

[{'role': 'user', 'content': 'What is Czech Poet?'},
 {'role': 'assistant',
  'content': 'According to the text, "Czech Poet" is a learning model that generates text based on input data.'}]

In [None]:
# Question answering with RETRIEVAL_TYPES.RERANK and a lower score threshold (0.1).
# The question and model name are defined once so that the retriever, the
# retrieval call and `rag` cannot drift apart.
question = "What is Czech Poet?"
model_type = "Llama-3.2-1B"

# Fail with an actionable message instead of a bare IndexError on `[0]` when
# the cache knows no repository.
repositories = cache.get_cached_repositories()
if not repositories:
    raise RuntimeError(
        "No cached repositories found; load a repository "
        "(see the vectorization section above) before running this cell."
    )

retriver = prepare_retriever(
    long_branches=cache.get_cached_repo_branches(repositories[0]),
    zip_files=[],
    docstore=docstore,
    loaded_vectorstores=vectorstores,
    model_type=model_type,
    api_key=API_KEY,
    search_kwargs={"min_score": 0.1, "k": 10},
    retrieval_type=RETRIEVAL_TYPES.RERANK,
)
documents = get_documents(question, retriver)

# Last expression of the cell: the conversation returned by `rag` is displayed.
rag(
    [{"role": "user", "content": question}],
    documents,
    model_type,
    API_KEY,
    prompt=PROMPTS.INPUT_PROMPT,
)

[{'role': 'user', 'content': 'What is Czech Poet?'},
 {'role': 'assistant',
  'content': 'I cannot provide a direct quote from the provided text as it is not a source of information. However, I can provide a definition of "Czech Poet" based on general knowledge.\n\n"Czech Poet" typically refers to a person who writes poetry.'}]

In [None]:
# Question answering with RETRIEVAL_TYPES.QUERY_ALTERATION.
# The question and model name are defined once so that the retriever, the
# retrieval call and `rag` cannot drift apart.
question = "What is Czech Poet?"
model_type = "Llama-3.2-1B"

# Fail with an actionable message instead of a bare IndexError on `[0]` when
# the cache knows no repository.
repositories = cache.get_cached_repositories()
if not repositories:
    raise RuntimeError(
        "No cached repositories found; load a repository "
        "(see the vectorization section above) before running this cell."
    )

retriver = prepare_retriever(
    long_branches=cache.get_cached_repo_branches(repositories[0]),
    zip_files=[],
    docstore=docstore,
    loaded_vectorstores=vectorstores,
    model_type=model_type,
    api_key=API_KEY,
    search_kwargs={"min_score": 0.5, "k": 10},
    retrieval_type=RETRIEVAL_TYPES.QUERY_ALTERATION,
)
documents = get_documents(question, retriver)

# Last expression of the cell: the conversation returned by `rag` is displayed.
rag(
    [{"role": "user", "content": question}],
    documents,
    model_type,
    API_KEY,
    prompt=PROMPTS.INPUT_PROMPT,
)

[{'role': 'user', 'content': 'What is Czech Poet?'},
 {'role': 'assistant',
  'content': 'Czech Poet is a project that aims to generate poetry based on a large corpus of Czech texts. The project uses machine learning models to analyze the structure and grammar of the text, and then generates poetry based on that analysis.\n\nThe project consists of several components:\n\n1. **Corpus of Czech Verse**: A large corpus of Czech texts, which includes books, articles, and other written works.\n2. **Tokenizer**: A custom tokenizer that breaks down the text into smaller units, such as st'}]

#### Continue Conversation

In [None]:
# Two-turn conversation: ask a question, then send a follow-up together with
# the history returned by the first `rag` call.
# The questions and model name are defined once so that every call below
# agrees on them.
question = "What is Czech Poet?"
follow_up = "How do I use it?"
model_type = "Llama-3.2-1B"

# Fail with an actionable message instead of a bare IndexError on `[0]` when
# the cache knows no repository.
repositories = cache.get_cached_repositories()
if not repositories:
    raise RuntimeError(
        "No cached repositories found; load a repository "
        "(see the vectorization section above) before running this cell."
    )

retriver = prepare_retriever(
    long_branches=cache.get_cached_repo_branches(repositories[0]),
    zip_files=[],
    docstore=docstore,
    loaded_vectorstores=vectorstores,
    model_type=model_type,
    api_key=API_KEY,
    search_kwargs={"min_score": 0.5, "k": 10},
)
documents = get_documents(question, retriver)

# First turn: keep the returned message list as the conversation history.
messages = rag(
    [{"role": "user", "content": question}],
    documents,
    model_type,
    API_KEY,
    prompt=PROMPTS.INPUT_PROMPT,
)

# NOTE(review): the follow-up turn reuses the documents retrieved for the first
# question; confirm that retrieving again for the follow-up is not intended.
rag(
    messages + [{"role": "user", "content": follow_up}],
    documents,
    model_type,
    API_KEY,
    prompt=PROMPTS.INPUT_PROMPT,
)

[{'role': 'user', 'content': 'What is Czech Poet?'},
 {'role': 'assistant',
  'content': 'According to the document, Czech Poet refers to a learning model that is used for generating text in Czech (the official language of the Czech Republic).'},
 {'role': 'user', 'content': 'How do I use it?'},
 {'role': 'assistant',
  'content': "Unfortunately, I don't have any specific instructions or guidelines on how to use a specific Czech Poet learning model. However, I can provide some general guidance on how to use a machine learning model for text generation, which is likely the Czech Poet model in this case.\n\nHere are some possible ways to use the Czech Poet learning model:\n\n1. **Text input**: You can input a text prompt or a portion of text to the Czech Poet model to generate a response. You can use"}]