## Get File

In [1]:
# %pip install domolibrary_extensions

In [23]:
from math import ceil
import process_files as pf

def chunk_into_n(ls, chunk_size):
  return [ls[i * chunk_size:(i + 1) * chunk_size] for i in range((len(ls) + chunk_size - 1) // chunk_size )] 

file_ls = pf.crawl_directory(file_path = "./content")
chunk_ls = chunk_into_n(file_ls, 5)
chunk_ls

[['./content/s_article_1500000572542/index.html',
  './content/s_article_360056669354/index.html',
  './content/s_article_360043427513/index.html',
  './content/s_article_360043431553/index.html',
  './content/s_article_1500003263261/index.html'],
 ['./content/s_article_360042928494/index.html',
  './content/s_article_360042933854/index.html',
  './content/s_article_360042923674/index.html',
  './content/s_article_1500003956542/index.html',
  './content/s_article_360043436053/index.html'],
 ['./content/s_article_360044876614/index.html',
  './content/s_article_360060598333/index.html',
  './content/s_article_360060507713/index.html',
  './content/s_article_360042930034/index.html',
  './content/s_article_9337403057943/index.html'],
 ['./content/s_article_360042927114/index.html',
  './content/s_article_360043434613/index.html',
  './content/s_article_4420450699415/index.html',
  './content/s_article_360047553253/index.html',
  './content/s_article_360043430973/index.html'],
 ['./conten

In [24]:

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_community.chat_models import ChatOllama

summarize_docs_chain = (
    ChatPromptTemplate.from_template("Summarize the following document for retrieval with LLMs.  Emphasize hyptheticals that could be answered with this content:\n\n{doc}")
    | ChatOllama(
        model="mistral:instruct",
        api_key="not needed",
        # base_url="http://localhost:11434/v1",
        max_retries=0,
        stream= True
    )
    | StrOutputParser()
)

# summarize_docs_chain.invoke("i really like apples, apples consist of pears and oranges")

In [25]:
from langchain.docstore.document import Document

def load_document(file_path)-> Document:
    """returns content complete with metadata for document"""
    soup = None
    try:
        soup = pf.extract_content_soup(file_path)
        res = pf.extract_article(soup)
        res['metadata'].update({
            "file_path" : file_path,
        })

        res.update({"summary" : summarize_docs_chain.invoke(res['page_content'])})

        return res

    except Exception as e:
        print(e, file_path)
        return False

docs = [load_document(file_path) for file_path in chunk_ls[0]]
docs = [doc for doc in docs if doc]
docs

[{'page_content': 'Intro\n-----\n\n\nRetail Express is a cloud-based POS, inventory, and eCommerce software that provides inventory-based Australian and NZ retailers with a central platform. It helps to manage sales and services, inventory and logistics, marketing, and loyalty, business intelligence practices, and much more across multiple online stores. As a cloud solution, Retail Express gives you the flexibility to manage your business anytime, anywhere, via any internet-enabled devices. Reports and dashboards give users valuable insight into business performance. The Retail Express Web Store API and the IPS API are the web services that allow e-commerce web sites to communicate with Retail Express point of sale and stock control software. This enables the synchronization of products, pricing, purchase orders, customers, internal transfers, and other business-specific information to help reduce the amount of administration required to maintain an online e-commerce website and an IPS

In [4]:
from langchain_community.embeddings import OllamaEmbeddings

persist_directory = './chroma_db'
embedding_function = OllamaEmbeddings(model="nomic-embed-text")

In [16]:
from typing import List
from langchain_chroma import Chroma
from langchain.storage import InMemoryByteStore
from langchain.retrievers.multi_vector import MultiVectorRetriever

vectorstore_summary = Chroma(   
    collection_name="summaries",
    persist_directory=persist_directory,
    embedding_function=embedding_function,
)

retriever = MultiVectorRetriever(
    vectorstore=vectorstore_summary,
    docstore=InMemoryByteStore(),
    id_key="article_number",
)

def summarize_chunk(cdocs : List[Document]):
    summary_docs = [
        Document(page_content=doc["summary"], metadata=doc["metadata"]) for doc in cdocs
    ]

    retriever.vectorstore.add_documents(summary_docs, ids  = [doc.metadata['article_number'] for doc in summary_docs])
    retriever.docstore.mset(
        list(
            zip(
                [doc["metadata"]["article_number"] for doc in docs],
                [Document(**doc) for doc in docs],
            )
        )
    )

In [6]:
for x in range(len(vectorstore_summary.get()["ids"])):
    print(vectorstore_summary.get()["metadatas"][x])
    # doc = vectorstore_summary.get()["metadatas"][x]
    # source = doc["source"]

{'article_number': '000003250', 'article_total_view_count': 6143, 'file_path': './content/s_article_1500000572542/index.html', 'first_published_date': '2022-10-24', 'language': 'English', 'title': 'Retail Express Connector', 'url': 'https://domo-support.domo.com/s/article/1500000572542'}
{'article_number': '000003740', 'article_total_view_count': 12571, 'file_path': './content/s_article_360043431553/index.html', 'first_published_date': '2022-10-24', 'language': 'English', 'title': 'Marketo Connector', 'url': 'https://domo-support.domo.com/s/article/360043431553'}
{'article_number': '000004330', 'article_total_view_count': 12302, 'file_path': './content/s_article_1500003263261/index.html', 'first_published_date': '2022-10-24', 'language': 'English', 'title': 'Support for Screen Readers on Dashboards', 'url': 'https://domo-support.domo.com/s/article/1500003263261'}
{'article_number': '000004372', 'article_total_view_count': 13708, 'file_path': './content/s_article_360043427513/index.html

In [7]:
print('done')

done


In [8]:
sub_docs = vectorstore_summary.similarity_search('how do i reset my password')
sub_docs

[Document(page_content=" This article provides instructions on how to change your Domo password and troubleshoot potential issues in different scenarios: direct sign-on with user settings, forgotten or expired passwords, and changing a password before it expires.\n\n1. Change password in user settings:\n   - Access the user Settings page by selecting your profile picture and choosing Settings.\n   - Under Security, select Change your password.\n   - Enter current and new passwords, then save or cancel the changes.\n\n2. Change a forgotten or expired password:\n   - If you forget or if it expires, enter your organization's domo URL and select Forgot password? to initiate the password reset process.\n   - Enter your email address and request a reset link.\n   - Open the email message, change your password, and save the changes.\n\n3. Change your password before it expires:\n   - If password expiration is enabled, you'll receive a prompt to change your password before it expires.\n   - En

In [9]:
# pip install langchain_chroma --upgrade

# Test

In [10]:
query = "can i reset my password if my organization uses SSO?"
sub_docs = retriever.get_relevant_documents(query,k=1)

from pprint import pprint
pprint(
    sub_docs[0].__dict__
)

{'metadata': {'article_number': '000004372',
              'article_total_view_count': 13708,
              'file_path': './content/s_article_360043427513/index.html',
              'first_published_date': '2022-10-24',
              'language': 'English',
              'title': 'Change Your Password',
              'url': 'https://domo-support.domo.com/s/article/360043427513'},
 'page_content': 'Intro\n'
                 '-----\n'
                 '\n'
                 '*Note:** \xa0If your organization uses single sign-on (SSO), '
                 'you cannot change your password using the steps described in '
                 'this article. Instead, contact your internal IT team to '
                 'change your password. If you have SSO configured, the login '
                 'screen displays as in this example:\n'
                 '\n'
                 '\n'
                 'This article describes how to change your Domo password and '
                 'troubleshoot potential is

In [11]:
from langchain_community.chat_models.ollama import ChatOllama as Ollama

rag_client = Ollama(
    model="mistral:instruct",
    verbose=True,
    temperature = 0
)

In [12]:
# %pip install langchain_experimental --upgrade

In [13]:
example_q = """What Brian's height?

Source: 1
Information: Suzy is 6'2"

Source: 2
Information: Jeremiah is blonde

Source: 3
Information: Brian is 3 inches shorted than Suzy"""

In [14]:
from typing import List, Optional
from langchain_experimental.llms.ollama_functions import OllamaFunctions

from langchain_core.pydantic_v1 import BaseModel, Field

class cited_answer(BaseModel):
    """Answer the user question based only on the given sources, and cite the sources used."""

    answer: str = Field(
        ...,
        description="The answer to the user question, which is based only on the given sources.",
    )
    citations: List[int] = Field(
        ...,
        description="The integer IDs of the SPECIFIC sources which justify the answer.",
    )


model = OllamaFunctions(model="mistral:instruct", verbose = True)

model = rag_client.bind(functions = [cited_answer])

model.invoke(example_q)

TypeError: Object of type ModelMetaclass is not JSON serializable

In [None]:
from langchain.output_parsers.openai_tools import JsonOutputKeyToolsParser

output_parser = JsonOutputKeyToolsParser(key_name="cited_answer", first_tool_only=True)



rag_client_func.invoke(example_q)

In [None]:
from operator import itemgetter
from typing import List


from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import (
    RunnableLambda,
    RunnableParallel,
    RunnablePassthrough,
)

from urban_waffle import client as uwc


def format_docs(docs: List[Document]) -> str:
    """Convert Documents to a single string.:"""
    formatted = [
        f"Article Title: {doc.metadata['title']}\nArticle Snippet: {doc.page_content}"
        for doc in docs
    ]
    return "\n\n" + "\n\n".join(formatted)


format_chain = (
    itemgetter("docs") | RunnableLambda(format_docs)
)

# subchain for generating an answer once we've done retrieval
answer_chain = uwc.rag_prompt | rag_client_func | StrOutputParser()
# complete chain that calls wiki -> formats docs to string -> runs answer subchain -> returns just the answer and retrieved docs.

citation_chain = (
    RunnableParallel(question=RunnablePassthrough(), docs=retriever)
    .assign(context=format_chain)
    .assign(answer=answer_chain)
    .pick(["answer", "docs"])
)

In [None]:
citation_chain.invoke("can i reset my password if my company uses SSO?")

In [None]:
from urban_waffle import client as uwc

rag_chain = uwc.generate_rag_chain(retriever = retriever, rag_prompt = uwc.rag_prompt)

In [None]:
rag_chain.invoke("can i reset my password if my company uses SSO?")