In [None]:
import os
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings)
from langchain.document_loaders import ConfluenceLoader
from langchain.text_splitter import CharacterTextSplitter, TokenTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from atlassian import Confluence
from InstructorEmbedding import INSTRUCTOR
import getpass
import json
import pickle
import uuid
import chromadb
import re
from chromadb.config import Settings
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

In [None]:
# NOTE: the four assignments below are blank placeholders and must be filled in before running.
# Path to a JSON file with the API keys; later cells read its 'atlassian' and 'openai' entries.
tokens = 
# Base URL of the Confluence site, passed to both the REST client and the loader.
confluence_url = 
# Confluence user name, passed to both the REST client and the loader.
user = 
# Key of the Confluence space to load.
space_key = 

# Chat model name handed to ChatOpenAI.
foundation_model = "gpt-3.5-turbo"
# Directory for a persisted Chroma store.
# NOTE(review): only referenced from commented-out code further down.
vector_store = r"./Documents/chroma_db/"

In [None]:
# Parse the JSON key file configured in `tokens`.
with open(tokens, 'r') as file:
    key_dict = json.load(file)

# Keep the Atlassian token for the Confluence client/loader and export the
# OpenAI key as the OPENAI_API_KEY environment variable.
atlassian_token = key_dict['atlassian']
os.environ["OPENAI_API_KEY"] = key_dict['openai']

In [None]:
# Direct Atlassian REST client, authenticated with the API token as password.
# A later cell uses it to inspect a single page.
confluence = Confluence(url=confluence_url,
               username=user,
               password=atlassian_token)

In [None]:
# Chat model used by the RetrievalQA chain built further down.
llm = ChatOpenAI(model_name=foundation_model,
                 temperature=0)

In [None]:
# LangChain loader for the Confluence site, using the configured user and API token.
loader = ConfluenceLoader(
    url=confluence_url,
    username = user,
    api_key= atlassian_token)

# Load the pages of the configured space as LangChain documents.
# NOTE(review): `limit=100` presumably sets the page size per request rather
# than a cap on the total — confirm against the loader documentation.
documents = loader.load(
    space_key=space_key,
    limit=100)

In [None]:
stop = stopwords.words('english')

def clean_text(text: str, stop_words=None) -> str:
    """Normalise raw page text for embedding.

    The text is lower-cased, URLs and every character that is neither
    alphanumeric nor whitespace are removed, stop words are dropped and the
    remaining words are joined with single spaces.

    Args:
        text: Raw text to clean.
        stop_words: Optional iterable of words to drop. When omitted, the
            module-level ``stop`` list (NLTK English stop words) is used.

    Returns:
        The cleaned, single-line text.
    """
    if stop_words is None:
        stop_words = stop  # module-level NLTK stop word list
    excluded = set(stop_words)

    text = text.lower()
    # Whitespace (including newlines) is deliberately NOT stripped here: it is
    # the word separator consumed by split() below. Removing newlines at this
    # stage would fuse the last word of a line with the first word of the next.
    # NOTE(review): `@\[` matches a literal "@[" — the mention pattern was
    # probably meant to be `@[A-Za-z0-9]+`; left unchanged, confirm intent.
    text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z\s])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    # split()/join() collapses every run of whitespace into one space, so no
    # separate newline or multi-space clean-up is needed afterwards.
    return " ".join(word for word in text.split() if word not in excluded)

In [None]:
# Replace each document's text with its cleaned form (in place).
for document in documents:
    document.page_content = clean_text(document.page_content)

In [None]:
# Save locally as a checkpoint: pickles the `documents` list into the current
# working directory.
# NOTE(review): the topic-modelling cell further down reads
# '../algo_confluence_text.pkl', which is a different file from this one.
with open('algo_confluence.pkl', 'wb') as pickle_file:
    pickle.dump(documents, pickle_file)

In [None]:
# Stage 1: character-based split of the loaded documents (chunk_size=100, no overlap).
text_splitter = CharacterTextSplitter(chunk_size=100,
                                      chunk_overlap=0)
texts = text_splitter.split_documents(documents)

# Stage 2: re-split the stage-1 chunks by token count (chunk_size=1000,
# overlap=10), counting tokens with the cl100k_base encoding.
token_splitter = TokenTextSplitter(chunk_size=1000,
                                  chunk_overlap=10,
                                  encoding_name="cl100k_base") # encoding used by text-embedding-ada-002
tokenized_text = token_splitter.split_documents(texts)

In [None]:
# Example of Confluence page metadata: fetch one page by id through the REST
# client and display what it returns.
# NOTE(review): `expand=True` — `expand` presumably expects a comma-separated
# string of fields to expand; confirm against the atlassian-python-api docs.
confluence.get_page_by_id('261470',
                          expand=True,
                          status=None,
                          version=None)

## Embedding options

In [None]:
## 1. BERT (open source)
# embedding_function = SentenceTransformerEmbeddings(model_name="all-MiniLM-L6-v2")

## 2. OpenAI (paid API) -- the option currently active
embedding_function = OpenAIEmbeddings()

## 3. Instructor (instruction-finetuned text embeddings) (open source)
# model = INSTRUCTOR('hkunlp/instructor-large')
# instruction = "Represent the document for retrieval: "
# instruction_pairs = [[instruction, i.page_content] for i in tokenized_text]
# customized_embeddings = model.encode(instruction_pairs)


## Set up vector DB to store embeddings

#### Local disk

In [None]:
# Build a Chroma collection from the chunked documents, embedding them with
# the active `embedding_function`. No persist_directory is passed, so nothing
# is written to `vector_store`.
# To reuse a store that was persisted on disk instead:
# if os.path.exists(vector_store):
#     vectordb = Chroma(persist_directory=vector_store, embedding_function=embedding_function)
vectordb = Chroma.from_documents(documents=tokenized_text, embedding=embedding_function)

In [None]:
# Take the id of the first stored entry to use as a lookup example.
sample_key = vectordb.get()['ids'][0]
print(sample_key)

In [None]:
# Look up the stored record for the sample id.
vectordb.get(sample_key)

In [None]:
# Same lookup, requesting the 'embeddings' field.
vectordb.get(sample_key, include=['embeddings'])

In [None]:
# Length of the sample's embedding vector.
len(vectordb.get(sample_key, include=['embeddings'])['embeddings'][0])

#### Docker

In [None]:
# Clone the Chroma repository over SSH (the git@github.com URL needs GitHub SSH access).
!git clone git@github.com:chroma-core/chroma.git
# Then, presumably from inside the clone, start the server with:
# docker-compose up -d --build

In [None]:
# Or via docker

# Connect to the Chroma server and start from an empty database
# (the client is created with allow_reset=True so that reset() is permitted).
client = chromadb.HttpClient(settings=Settings(allow_reset=True))
client.reset()  # resets the database
collection = client.create_collection("test_collection")

# For langchain: wrap the server-side collection and add the chunks THROUGH the
# wrapper, so they are embedded with `embedding_function`. Adding them with
# `collection.add(...)` would embed them with the collection's default
# embedding model instead, and queries issued through the wrapper (which embeds
# the question with `embedding_function`) would not match the stored vectors.
vectordb = Chroma(client=client,
                  collection_name="test_collection",
                  embedding_function=embedding_function,
                 )
vectordb.add_documents(tokenized_text)

## Prompt engineering and querying LLM

In [None]:
# Retriever over the vector store, configured with k=4 results per query.
retriever = vectordb.as_retriever(search_kwargs={"k":4})
# Question-answering chain ("stuff" chain type) combining the LLM and the retriever.
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type="stuff",
                                 retriever=retriever)

In [None]:
# Prompt text with {context} and {question} placeholders.
custom_prompt_template = """You are a Confluence chatbot answering questions. 
                            Use the following pieces of context to answer the question at the end. 
                            If you don't know the answer, say that you don't know, don't try to make up an answer.

                            {context}

                            Question: {question}
                            Helpful Answer:
                            """

# Wrap the text in a PromptTemplate declaring its two input variables.
CUSTOMPROMPT = PromptTemplate(
    template=custom_prompt_template, input_variables=["context", "question"]
)

# Swap the prompt of the already-built `qa` chain for the custom one.
qa.combine_documents_chain.llm_chain.prompt = CUSTOMPROMPT

In [None]:
# Example question; `question` is reused by the ConfluenceQA cell further down.
question = "Please explain how features are calculated in the lost sales project"

# Run the retrieval QA chain and show its answer.
answer = qa.run(question)
print(answer)

----------------

In [None]:
# Configuration for ConfluenceQA, which reads "url", "user", "api_key",
# "space_key" and, optionally, "llm_name". The space must be stored under
# "space_key": that is the key get_chunk_documents() looks up.
config_dict = {
    "llm_name": "gpt-3.5-turbo",
    "url": 'https://c-b4web.atlassian.net/',
    "user": 'jackm@cb4.com',
    "api_key": atlassian_token,
    "space_key": 'ALGO'
}


class ConfluenceQA:
    """Question answering over a Confluence space using retrieval.

    Pipeline: load the pages of a space, split them into chunks, embed the
    chunks into a Chroma vector store and answer questions with a
    ``RetrievalQA`` chain on top of that store.

    Recognised ``config`` keys (accepted aliases in parentheses):

    * ``url`` (``confluence_url``): base URL of the Confluence site; required.
    * ``user`` (``username``): Confluence user name.
    * ``api_key``: Confluence API token.
    * ``space_key`` (``space``): key of the space to ingest.
    * ``llm_name``: chat model name; defaults to ``"gpt-3.5-turbo"``.
    * ``embedding``: embedding object; defaults to ``OpenAIEmbeddings()``.
    * ``persist_directory``: optional directory of a persisted Chroma store.
    """

    def __init__(self,
                 config: dict = None):
        """Create the embedding, the chat model and the Confluence loader.

        Raises:
            KeyError: if the config holds neither ``url`` nor ``confluence_url``.
        """
        # A fresh dict per instance instead of a shared mutable default.
        self.config = config if config is not None else {}
        self.embedding = self.init_embeddings()
        self.llm = self.init_model()
        # Historical attribute name for the model object; kept for compatibility.
        self.llm_name = self.llm
        self.loader = ConfluenceLoader(
            url=self._config_value('url', 'confluence_url'),
            username=self._config_value('user', 'username', required=False),
            api_key=self._config_value('api_key', required=False))

        self.vectordb = None
        self.texts = None
        self.retriever = None
        self.qa = None

    def _config_value(self, *keys, required: bool = True):
        """Return the value stored under the first of ``keys`` present in the config.

        Raises ``KeyError`` (naming the first key) when none of them is present
        and ``required`` is true; returns ``None`` otherwise.
        """
        for key in keys:
            if key in self.config:
                return self.config[key]
        if required:
            raise KeyError(keys[0])
        return None

    def init_embeddings(self):
        """Return the configured ``embedding`` object, or ``OpenAIEmbeddings()``."""
        embedding = self.config.get('embedding')
        return embedding if embedding is not None else OpenAIEmbeddings()

    def init_model(self):
        """Return a ``ChatOpenAI`` model for ``llm_name`` (default "gpt-3.5-turbo")."""
        model_name = self.config.get('llm_name') or "gpt-3.5-turbo"
        return ChatOpenAI(model_name=model_name,
                          temperature=0)

    def init_models(self):
        """Plural-spelling alias: (re)create the chat model and store it on ``self.llm``."""
        self.llm = self.init_model()
        self.llm_name = self.llm
        return self.llm

    def get_chunk_documents(self) -> None:
        """Load the configured space and store its chunks in ``self.texts``.

        Raises:
            KeyError: if the config holds neither ``space_key`` nor ``space``.
        """
        documents = self.loader.load(space_key=self._config_value('space_key', 'space'),
                                     limit=100)

        # First pass: character-based split.
        char_splitter = CharacterTextSplitter(chunk_size=100,
                                              chunk_overlap=0)
        char_chunks = char_splitter.split_documents(documents)
        # Second pass: token-based split of the first-pass chunks.
        token_splitter = TokenTextSplitter(chunk_size=1000,
                                           chunk_overlap=10,
                                           encoding_name="cl100k_base")  # This is the encoding for text-embedding-ada-002
        self.texts = token_splitter.split_documents(char_chunks)

    def vector_db_confluence_docs(self, force_reload: bool = False) -> None:
        """Create ``self.vectordb``.

        When the config names a ``persist_directory`` that already exists and
        ``force_reload`` is false, the persisted store is opened and nothing is
        re-embedded. Otherwise the chunks are embedded into a new store (loading
        them first if ``get_chunk_documents`` has not run yet), which is written
        to ``persist_directory`` when one is configured.
        """
        persist_directory = self.config.get('persist_directory')
        if persist_directory and not force_reload and os.path.isdir(persist_directory):
            # NOTE(review): an existing but empty directory yields an empty
            # store; pass force_reload=True to rebuild it.
            self.vectordb = Chroma(persist_directory=persist_directory,
                                   embedding_function=self.embedding)
            return

        if self.texts is None:
            self.get_chunk_documents()

        store_kwargs = {"persist_directory": persist_directory} if persist_directory else {}
        self.vectordb = Chroma.from_documents(documents=self.texts,
                                              embedding=self.embedding,
                                              **store_kwargs)

    def retreival_qa_chain(self):
        """Build the retriever (k=4) and the "stuff" RetrievalQA chain on the vector store."""
        self.retriever = self.vectordb.as_retriever(search_kwargs={"k":4})
        self.qa = RetrievalQA.from_chain_type(llm=self.llm,
                                              chain_type="stuff",
                                              retriever=self.retriever)

    def answer_confluence(self, question: str) -> str:
        """Answer ``question`` with the QA chain.

        Raises:
            RuntimeError: if the chain has not been built yet.
        """
        if self.qa is None:
            raise RuntimeError("QA chain is not initialised; call run_qa_setup() first.")
        return self.qa.run(question)

    def run_qa_setup(self) -> None:
        """Run the whole setup: load and chunk pages, build the store, build the chain."""
        self.get_chunk_documents()
        self.vector_db_confluence_docs()
        self.retreival_qa_chain()

    @staticmethod
    def clean_text(text: str, stop_words=None) -> str:
        """Lower-case ``text``, strip URLs/punctuation and drop stop words.

        Args:
            text: Raw text to clean.
            stop_words: Optional iterable of words to drop. When omitted, the
                module-level ``stop`` list is used.

        Returns:
            The remaining words joined by single spaces.
        """
        if stop_words is None:
            stop_words = stop  # module-level stop word list
        excluded = set(stop_words)

        text = text.lower()
        # Whitespace (including newlines) is kept here because it separates the
        # words for split() below; stripping newlines would fuse the last word
        # of a line with the first word of the next one.
        text = re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z\s])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
        return " ".join(word for word in text.split() if word not in excluded)

In [None]:
# Configuration passed to ConfluenceQA in the next cell. "space_key" is
# required: get_chunk_documents() looks it up to know which space to load.
config_dict = {
    "llm_name": "gpt-3.5-turbo",
    "url": 'https://c-b4web.atlassian.net/',
    "user": 'jackm@cb4.com',
    "api_key": atlassian_token,
    "space_key": 'ALGO'}

In [None]:
# Build the pipeline from `config_dict`, run its setup, then answer the
# `question` defined in an earlier cell.
confluence_qa = ConfluenceQA(config=config_dict)
confluence_qa.run_qa_setup()
result = confluence_qa.answer_confluence(question)

In [None]:
# app.py

import streamlit as st
import os
import json
import time
from dotenv import load_dotenv

# Import the ConfluenceQA class
from confluence_qa import ConfluenceQA

# Read the OpenAI key from a Hyperplane secret file whose location depends on
# is_jhub(). On any failure (missing package, file or key) the error is printed
# and load_dotenv() is used as a fallback.
try:
    from hyperplane.utils import is_jhub
    if is_jhub():
        openaiKeyFile = '/root/.secret/openai_key.json'
    else:
        openaiKeyFile = '/etc/hyperplane/secrets/openai_key.json'
    with open(openaiKeyFile) as f:
        os.environ["OPENAI_API_KEY"] = json.load(f)['openai_key']
except Exception as e:
    print(e)
    load_dotenv()

# Page-level Streamlit settings.
st.set_page_config(
    page_title='Q&A Bot for Confluence Page',
    page_icon='⚡',
    layout='wide',
    initial_sidebar_state='auto',
)
# Make sure both session keys exist: "config" holds the submitted form values,
# "confluence_qa" the ingested QA object (None until a space has been loaded).
if "config" not in st.session_state:
    st.session_state["config"] = {}
if "confluence_qa" not in st.session_state:
    st.session_state["confluence_qa"] = None

@st.cache_resource
def load_confluence(config):
    """Build a ready-to-query ConfluenceQA object for ``config``.

    Creates the object, then initialises its embeddings, models, vector store
    and retrieval chain, in that order. Decorated with ``st.cache_resource``.
    """
    qa_client = ConfluenceQA(config=config)
    qa_client.init_embeddings()
    qa_client.init_models()
    qa_client.vector_db_confluence_docs()
    qa_client.retreival_qa_chain()
    return qa_client

# Sidebar form collecting the Confluence connection settings.
with st.sidebar.form(key='Form1'):
    st.markdown('## Add your configs')
    confluence_url = st.text_input("paste the confluence URL",
                                   "https://templates.atlassian.net/wiki/")
    username = st.text_input(label="confluence username",
                             help="leave blank if confluence page is public",
                             type="password")
    space_key = st.text_input(label="confluence space",
                              help="Space of Confluence",
                              value="RD")
    api_key = st.text_input(label="confluence api key",
                            help="leave blank if confluence page is public",
                            type="password")
    submitted1 = st.form_submit_button(label='Submit')

    if submitted1 and confluence_url and space_key:
        # Hardcoded for https://templates.atlassian.net/wiki/ and space RD:
        # that combination uses the "chroma_db" directory to avoid multiple
        # OpenAI calls.
        is_demo_space = (confluence_url == "https://templates.atlassian.net/wiki/"
                         and space_key == "RD")
        config = {
            "persist_directory": "chroma_db" if is_demo_space else None,
            "confluence_url": confluence_url,
            # Blank credentials are passed on as None.
            "username": username or None,
            "api_key": api_key or None,
            "space_key": space_key,
        }
        st.session_state["config"] = config

        with st.spinner(text="Ingesting Confluence..."):
            st.session_state["confluence_qa"] = load_confluence(config)
        st.write("Confluence Space Ingested")
        

st.title("Confluence Q&A Demo")

question = st.text_input('Ask a question', "How do I make a space public?")

# Answer on demand; without an ingested space, ask the user to load one first.
if st.button('Get Answer', key='button2'):
    with st.spinner(text="Asking LLM..."):
        confluence_qa = st.session_state.get("confluence_qa")
        if confluence_qa is None:
            st.write("Please load Confluence page first.")
        else:
            result = confluence_qa.answer_confluence(question)
            st.write(result)

# Zero-shot topic modeling

In [None]:
import pickle
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired

# Predefined topic labels handed to BERTopic as its zero-shot topic list.
zeroshot_topic_list = ["Lost sales", "Forecast", "C-retail"]

# Topic model combining the zero-shot labels (similarity threshold 0.85) with a
# minimum topic size of 15 and a KeyBERT-inspired representation.
topic_model = BERTopic(
    embedding_model="thenlper/gte-small", 
    min_topic_size=15,
    zeroshot_topic_list=zeroshot_topic_list,
    zeroshot_min_similarity=.85,
    representation_model=KeyBERTInspired()
)

# SECURITY: pickle.load can execute arbitrary code from the file; only load
# pickles you created yourself.
# NOTE(review): this reads '../algo_confluence_text.pkl', not the
# 'algo_confluence.pkl' checkpoint written earlier; `docs` is presumably a list
# of strings — confirm.
with open(r'../algo_confluence_text.pkl', 'rb') as text:
    docs = pickle.load(text)
    
# Fit the model on the documents and get a topic assignment per document.
topics, probs = topic_model.fit_transform(docs)

# Summary table of the topics found.
topic_model.get_topic_info()

## DSPy

In [None]:
import json

# `dspy` is referenced directly by every following cell of this section
# (dspy.Signature, dspy.Module, dspy.evaluate, ...), so import it here.
import dspy

# Read the OpenAI key from the JSON key file configured in `tokens` and export
# it as the OPENAI_API_KEY environment variable.
with open(tokens, 'r') as file:
    key_dict = json.load(file)

os.environ["OPENAI_API_KEY"] = key_dict['openai']

In [None]:
class GenerateAnswer(dspy.Signature):
    """Answer questions with short factoid answers."""

    # Inputs: the context to draw on and the question to answer.
    context = dspy.InputField(desc="may contain relevant facts")
    question = dspy.InputField()
    # Output: the answer; `desc` describes its expected length.
    answer = dspy.OutputField(desc="often between 1 and 5 words")

In [None]:
class RAG(dspy.Module):
    """Retrieve-then-answer module: fetch passages for a question, then generate an answer from them."""

    def __init__(self, num_passages=3):
        """Set up a retriever for ``num_passages`` passages and the answer generator."""
        super().__init__()

        self.retrieve = dspy.Retrieve(k=num_passages)
        # Chain-of-thought generator driven by the GenerateAnswer signature.
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)

    def forward(self, question):
        """Return a ``dspy.Prediction`` carrying the retrieved context and the answer."""
        passages = self.retrieve(question).passages
        generated = self.generate_answer(context=passages, question=question)
        return dspy.Prediction(context=passages, answer=generated.answer)

In [None]:
from dspy.teleprompt import BootstrapFewShot

def validate_context_and_answer(example, pred, trace=None):
    """Validation metric for the teleprompter.

    Checks that the predicted answer is correct (exact match) and that the
    retrieved context actually contains that answer (passage match). Both
    checks are always evaluated; ``trace`` is accepted but not used.
    """
    exact_match = dspy.evaluate.answer_exact_match(example, pred)
    passage_match = dspy.evaluate.answer_passage_match(example, pred)
    return exact_match and passage_match

# Set up a basic teleprompter, which will compile our RAG program using the
# validation metric defined above.
teleprompter = BootstrapFewShot(metric=validate_context_and_answer)

# Compile!
# NOTE(review): `trainset` is not defined in any visible cell, and no DSPy
# language/retrieval model is configured in the visible cells either — both
# presumably have to be set up before this cell can run.
compiled_rag = teleprompter.compile(RAG(), trainset=trainset)

In [None]:
# Ask any question you like to this simple RAG program.
my_question = "What castle did David Gregory inherit?"

# Get the prediction. This contains `pred.context` and `pred.answer`.
pred = compiled_rag(my_question)

# Print the question, the answer and the contexts (each cut to its first 200 characters).
print(f"Question: {my_question}")
print(f"Predicted Answer: {pred.answer}")
print(f"Retrieved Contexts (truncated): {[c[:200] + '...' for c in pred.context]}")