# Data preprocessing
## Exp1: Code doesn't exist anymore; I overwrote it with the Exp2 code.
Tried loading the FAQ text and the course text into separate search DBs, without any preprocessing. FAQ questions were not answered correctly, due to poor search results and a mismatch in how the text was chunked.
## Exp2: Code below
FAQ data: Preprocessed the FAQ documents with GPT to collect QnA pairs. Only the questions are loaded into the search DB. When a student asks a question, we search the questions DB and use GPT to personalize the stored answer to the student's question.
Course data: Loaded into the search engine as overlapping chunks.

In [None]:
# load the model required for search
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
# Embedding model handed to the Chroma stores that the preprocessing cell
# builds (Chroma.from_documents / Chroma.from_texts).
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

In [None]:
# FAQ parser class
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from pydantic import BaseModel, Field
from enum import Enum
from langchain_openai import ChatOpenAI
from typing import List

import os
from dotenv import load_dotenv
load_dotenv()

# One question/answer pair extracted from an FAQ document.
# The Field descriptions reach the LLM through the format instructions of the
# PydanticOutputParser built from QnAList, so they are effectively prompt text.
# NOTE(review): documented with comments rather than a class docstring, since
# a docstring may be copied into the generated schema - confirm.
class QnA(BaseModel):
    question: str = Field(description="one specific question from the given 'faq document'")
    answer: str = Field(description="answer to the above question from the given 'faq document'. Do not generate an answer, simply copy-paste the entire the text of the answer as-is.")

# Top-level structure the LLM is asked to return: every QnA pair found in one
# FAQ document. FAQProcessor builds its PydanticOutputParser from this model.
class QnAList(BaseModel):
    faq: List[QnA] = Field(description="list all the question and answer pairs from the given 'document'")

class FAQProcessor():
    """Turns a raw (possibly noisy) FAQ document into structured QnA pairs.

    The document is embedded into a fixed prompt, sent to the chat model and
    the reply is parsed into a ``QnAList``.
    """

    def __init__(self):
        # The parser comes first because the prompt embeds its format
        # instructions.
        self.parser = PydanticOutputParser(pydantic_object=QnAList)

        self.model = ChatOpenAI(
            model_name='gpt-3.5-turbo',
            openai_api_key=os.getenv("OPENAI_API_KEY"),
            openai_organization=os.getenv("OPENAI_ORGANIZATION"),
        )

        # Fallback parser: asks the same model to repair a reply that the
        # strict parser rejected.
        self.fix_parser = OutputFixingParser.from_llm(parser=self.parser, llm=self.model)

        self.prompt = PromptTemplate(
            template =  '''
                You are a bot helping with text parsing. 
                Given an 'FAQ document', parse the list of question and answer pairs.\n
                The 'FAQ document' can be noisy, with some unrelated text, make sure to ignore this unrealted text.
                
                {format_instructions}\n
                
                ***
                'FAQ document' : {faq_doc}
                ***


                I am reminding you again, do not add any new questions or facts in asnwers that are not given in the 'FAQ document'.
            ''',
            input_variables=["faq_doc"],
            partial_variables={
                "format_instructions": self.parser.get_format_instructions(),
            },
        )
        self.search_conf_thresh = 1

    def parse(self, faq_doc):
        """Ask the model for the QnA pairs in ``faq_doc`` and return them parsed.

        The strict parser is tried first; if it raises, the reply is handed to
        the output-fixing parser instead.
        """
        prompt_text = self.prompt.format_prompt(faq_doc=faq_doc).to_string()
        reply = self.model([HumanMessage(prompt_text)])

        try:
            return self.parser.parse(reply.content)
        except Exception:
            return self.fix_parser.parse(reply.content)
        
# Shared FAQProcessor instance; the preprocessing loop calls fp.parse() on
# every FAQ document.
fp = FAQProcessor()

In [None]:
#load all the contents of the course
from langchain.document_loaders import DirectoryLoader
from langchain.document_loaders import TextLoader
from langchain.docstore.document import Document
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from bs4 import BeautifulSoup, Comment  
import re
import os
import json
import csv
import shutil
import logging 
def remove_html_tags(text):
    """Return ``text`` with everything between ``<`` and the next ``>`` removed.

    ``re.DOTALL`` lets a tag span several lines (e.g. an ``<img`` whose
    attributes are wrapped onto the next line); without it such tags would be
    left in the text. Entities such as ``&amp;`` are not decoded.
    """
    return re.sub(r'<.*?>', '', text, flags=re.DOTALL)
    
def remove_html_comments(html_content):
    """Return ``html_content`` re-serialised by BeautifulSoup without comments.

    The markup is parsed with ``html.parser``, every ``Comment`` node is
    detached from the tree, and the remaining tree is converted back to a
    string.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # ``string=`` is the current name of the filter formerly spelled ``text=``.
    for comment in soup.find_all(string=lambda node: isinstance(node, Comment)):
        comment.extract()

    return str(soup)

def split_to_sections(html_content):
    """Split raw HTML into sections, each starting at an ``h1``-``h6`` header.

    A section runs from the start of one header up to the start of the next
    one; the final section runs to the end of the document. Text before the
    first header is not part of any section, and a document without headers
    yields an empty list. HTML comments are stripped from every section.

    Returns:
        list[str]: the sections, in document order.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # Offsets in the raw HTML at which each header starts.
    boundaries = []
    search_from = 0
    for header in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        # Search forward from the previous match so that two headers with
        # identical markup are not both mapped to the first occurrence.
        position = html_content.find(str(header), search_from)
        if position == -1:
            # The parser's serialisation differs from the raw text, so the
            # header cannot be located; it stays inside the previous section.
            continue
        boundaries.append(position)
        search_from = position + 1

    # Closing sentinel so the last header's section is kept as well.
    boundaries.append(len(html_content))

    return [
        remove_html_comments(html_content[start:end])
        for start, end in zip(boundaries, boundaries[1:])
    ]

def extract_image_info(html):
    """Collect image URLs/titles from ``html`` and strip those images from it.

    ``<img>`` tags use their ``alt`` text as title; ``<figure>`` tags use the
    text of their ``<figcaption>`` (empty string when absent) and the ``src``
    of the first ``<img>`` inside. Each distinct ``src`` is recorded once, and
    only tags that contributed a record are removed from the markup.

    Returns:
        tuple: ``(image_data, modified_html)`` where ``image_data`` is a list
        of ``{'url': ..., 'title': ...}`` dicts and ``modified_html`` is the
        re-serialised markup.
    """
    soup = BeautifulSoup(html, 'html.parser')

    image_data = []
    seen_sources = set()

    for node in soup.find_all(['img', 'figure']):
        if node.name == 'img':
            image = node
            title = node.get('alt', '')
        elif node.name == 'figure':
            caption = node.find('figcaption')
            title = caption.get_text() if caption else ''
            image = node.find('img')
            if not image:
                continue
        else:
            continue

        url = image.get('src')
        if url in seen_sources:
            # Already recorded; the tag is left in the markup.
            continue

        image_data.append({'url': url, 'title': title})
        seen_sources.add(url)
        # Detach the whole <img> / <figure> from the tree.
        node.extract()

    return image_data, str(soup)

def read_html(html_filename):
    """Return the whole content of ``html_filename``, decoded as UTF-8."""
    with open(html_filename, mode='r', encoding='utf-8') as handle:
        return handle.read()

def parse_html(html_content):
    """Split ``html_content`` into sections, with and without images.

    Returns:
        tuple: ``(sections_with_images, sections_without_images)``. The first
        list is the output of ``split_to_sections``; the second holds the same
        sections after ``extract_image_info`` removed the images, each
        prefixed with a newline.
    """
    sections_with_images = split_to_sections(html_content)

    # [TODO] Solve for images later: the image info returned by
    # extract_image_info (index 0) is currently discarded.
    sections_without_images = [
        '\n{}'.format(extract_image_info(section)[1])
        for section in sections_with_images
    ]
    return sections_with_images, sections_without_images


# Location of the coursework. Every *.html file below this directory is loaded
# as raw text.
course_dir = '''raw_webcrawl_data'''
loader = DirectoryLoader(course_dir, glob="**/*.html", loader_cls=TextLoader)
documents = loader.load()
print('Loaded {} documents'.format(len(documents)))

## output
faq_processed_dir = '{}_faq_processed'.format(course_dir)
os.makedirs(faq_processed_dir, exist_ok=True)
# Everything persisted below lives under ./chroma; make sure it exists before
# the first file is written there.
chroma_root = './chroma'
os.makedirs(chroma_root, exist_ok=True)


#save the full text in a different DB for QnA on it
all_sections = []
all_sections_html = {}
course_db = {}
qna_dict = {}
docs_procesessed_cnt = 0
section_procesessed_cnt = 0
qna_cnt = 0
faq_cnt = 0
for doc_cnt, doc in enumerate(documents):
    source = doc.metadata['source']
    if 'faq' not in source and 'FAQ' not in source:
        # Regular course page: split into header-delimited sections.
        sections_with_images, sections_without_images = parse_html(doc.page_content)
        docs_procesessed_cnt += 1
        for section_cnt, section in enumerate(sections_without_images):
            # save text without html tags for retrieval purposes. embedding model doesnt do good job with html tags
            all_sections.append(Document(page_content=remove_html_tags(section), metadata={"source": source, "split": section_cnt}))

            # save text in html format for equations, since equations are written with html+latex tags. ChatGPT does a good job reading and comprehending this.
            source_sections = course_db.setdefault(source, {})
            if section_cnt not in source_sections:
                source_sections[section_cnt] = sections_with_images[section_cnt]
                section_procesessed_cnt += 1
    else:
        # load document, and compute QnA pairs using gpt
        faq_list = fp.parse(doc.page_content)
        _, faq_file = os.path.split(source)
        faq_cnt += 1
        # NOTE: FAQ files with the same base name in different folders write
        # to the same CSV.
        csv_path = os.path.join(faq_processed_dir, faq_file.replace('.html', '.csv'))
        # newline='' is what the csv module expects; utf-8 keeps non-ASCII
        # course text writable regardless of the platform default encoding.
        with open(csv_path, 'w', newline='', encoding='utf-8') as csvfile:
            faqwriter = csv.writer(csvfile)
            for faq in faq_list.faq:
                faqwriter.writerow([faq.question, faq.answer])
                qna_dict[faq.question] = [faq.answer]
                qna_cnt += 1

# save the documents in a dict
with open(os.path.join(chroma_root, 'course_db'), 'w') as course_db_file:
    json.dump(course_db, course_db_file)
print('Processed {} documents, with {} sections'.format(docs_procesessed_cnt, section_procesessed_cnt))

# split the rest of the documents into chunks that can be handled by embedding model
text_splitter = RecursiveCharacterTextSplitter(chunk_size=512, chunk_overlap=256, length_function=len, is_separator_regex=False,)

split_docs = text_splitter.split_documents(all_sections)
print('Split the {} sections into {} splits'.format(len(all_sections), len(split_docs)))

# load split coursework and FAQ into separate Chroma search dbs
db_dir = "./chroma/db"
# Start from an empty directory; ignore_errors covers the first run, when the
# directory does not exist yet.
shutil.rmtree(db_dir, ignore_errors=True)
os.makedirs(db_dir, exist_ok=True)
db = Chroma.from_documents(split_docs, embeddings, collection_name="course", persist_directory=db_dir)
print('Loaded the splits to search engine')


# save the QnA dict
with open(os.path.join(chroma_root, 'qna_dict'), 'w') as qna_dict_file:
    json.dump(qna_dict, qna_dict_file)
db_faq_dir = "./chroma/db_faq"
shutil.rmtree(db_faq_dir, ignore_errors=True)
os.makedirs(db_faq_dir, exist_ok=True)
# Only the questions are indexed; answers are looked up in qna_dict.
db_faq = Chroma.from_texts(list(qna_dict.keys()), embeddings, collection_name="faq", persist_directory=db_faq_dir)
print('Processed {} FAQ documents, with {} QnA pairs, and loaded it to search engine.'.format(faq_cnt, qna_cnt))


In [None]:
# Quickly test if search is working
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

query = "Why use such a simplistic model?"

# Re-open both persisted stores with a fresh embedding function.
test_embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
test_db = Chroma(
    persist_directory="./chroma/db",
    embedding_function=test_embedding_function,
    collection_name="course",
)
test_db_faq = Chroma(
    persist_directory="./chroma/db_faq",
    embedding_function=test_embedding_function,
    collection_name="faq",
)

results = test_db.similarity_search_with_score(query, k=3)
results_faq = test_db_faq.similarity_search_with_score(query, k=3)


# search in faq
print('Retrieved {} FAQ docs'.format(len(results_faq)))
for i, (hit, score) in enumerate(results_faq):
    print('\nDocument {}'.format(i))
    print('Score: {}'.format(score))  # lower the score, more relevant is the answer
    print('Chapter: {}'.format(hit.metadata))
    print('Section: {}'.format(hit.page_content))


print('-------------------------------------------------------')

# search in course work
print('Retrieved {} docs'.format(len(results)))
for i, (hit, score) in enumerate(results):
    print('\nDocument {}'.format(i))
    print('Score: {}'.format(score))  # lower the score, more relevant is the answer
    print('Chapter: {}'.format(hit.metadata))
    print('Section: {}'.format(hit.page_content))


# FAQ Bot v1
Step1: Search in a list of Qs, to find the nearest Q and personalize the answer with GPT.

Step2: If no good FAQ is found, search in the course content and answer with GPT.

Step3: If no good answer is found, send a static message  - I dont think I know the answer for this, let me check with the professor.

This works fine. But sometimes it feels like the answer could have been more well-rounded, because the FAQ doesn't have enough context and information. Hence I tried v2 below.

In [None]:
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from pydantic import BaseModel, Field
from enum import Enum
from langchain_openai import ChatOpenAI

import os
from dotenv import load_dotenv
load_dotenv()

# Verdict the LLM returns alongside its answer; FAQBot only passes an answer
# on when the status is YES.
# NOTE(review): the values are full sentences, presumably so the format
# instructions shown to the model spell out what each choice means - confirm.
class IsAnswerable(Enum):
    YES = "YES - the given 'question' can be confidently answered using the given 'context'"
    NO = "NO - the given 'question' cannot be answered with the given 'context'"  

# Structured reply expected from the LLM: an answerability verdict plus the
# answer text. FAQBot builds its PydanticOutputParser from this model.
# NOTE(review): `status` has an empty description, so the model gets no
# guidance for that field beyond the enum values - confirm this is intended.
class AnswerStatus(BaseModel):
    status: IsAnswerable = Field(description="")
    answer: str = Field(description="answer the student's 'question' based solely on the given 'context'.")

class FAQBot():
    """FAQ bot v1: answer from the FAQ first, then fall back to the coursework.

    Relies on the module-level ``db_faq`` and ``db`` Chroma stores and on
    ``qna_dict`` created by the preprocessing cell.
    """

    def __init__(self):

        self.model = ChatOpenAI(
            model_name='gpt-3.5-turbo',
            openai_api_key=os.getenv("OPENAI_API_KEY"),
            openai_organization=os.getenv("OPENAI_ORGANIZATION"),
        )

        self.parser = PydanticOutputParser(pydantic_object=AnswerStatus)
        # Fallback parser: asks the model to repair a reply the strict parser
        # rejected.
        self.fix_parser = OutputFixingParser.from_llm(parser=self.parser, llm=self.model, max_retries=3)
        self.prompt = PromptTemplate(
            template =  '''
                You're a helpful teaching assistant for a technical course on {course}. You will only answer student's 'question' based on the given 'context' of the course.\n
                The 'context' can be noisy, with some unrelated text. Make sure to ignore this unrelated text while answering. 
                Provide as detailed answer as possible, with sufficient examples that are part of the 'context' text.
                
                {format_instructions}\n
                
                ***
                'query' : {question}
                ***
                
                $$$
                'context' : {context}
                $$$

                I am reminding you again, you are a teaching assistant, do not add any facts into the answer that is not given in the 'context'
            ''',
            input_variables=["question", "context", "course"],
            partial_variables={
                "format_instructions": self.parser.get_format_instructions(),
            },
        )
        # Search hits are kept only when their score is below this value
        # (lower score = closer match).
        self.search_conf_thresh = 1

    def _ask_model(self, question, context, verbose):
        """Send one question/context pair to the model and return the parsed reply."""
        prompt_string = self.prompt.format_prompt(question=question, context=context, course='Distributed Algorithms').to_string()

        if verbose:
            print(prompt_string)
        response = self.model([HumanMessage(prompt_string)])

        if verbose:
            print('\t\t\tRaw GPT response: {}\n'.format(response))

        try:
            parsed = self.parser.parse(response.content)
        except Exception:
            # Strict parsing failed; let the fixing parser retry.
            parsed = self.fix_parser.parse(response.content)

        if verbose:
            print('\t\t\tfinal response: {}\n'.format(parsed))
        return parsed

    def _confident_answer(self, question, context, verbose):
        """Return the model's answer for ``context``, or None when there is none.

        None is returned when the context is empty or the model does not
        report the question as answerable.
        """
        if not context:
            return None
        parsed = self._ask_model(question, context, verbose)
        if parsed is not None and parsed.status == IsAnswerable.YES:
            return parsed.answer
        return None

    def get_completion_from_messages(self, question, verbose=True):
        """Answer ``question`` from the FAQ, else the coursework, else a static excuse.

        Args:
            question: the student's question.
            verbose: print search hits, prompts and raw model replies.

        Returns:
            str: the model's answer, or the static fallback message when
            neither source produced a confident answer.
        """
        # Step 1: search the FAQ questions.
        if verbose:
            print('Search in FAQ')
        results_faq = db_faq.similarity_search_with_score(question, k=3)

        if verbose:
            print('\tanswers retrieved')
        retrieved_answers = ''
        for doc, score in results_faq:
            if verbose:
                print('\t\ttext: {}\n\tconf:{}\n'.format(doc.page_content, score))
            if score < self.search_conf_thresh:
                # collect the corresponding answers of the qna pair for gpt
                retrieved_answers += ' Question:{}\n Answer:{}\n'.format(doc.page_content, qna_dict[doc.page_content])

        answer = self._confident_answer(question, retrieved_answers, verbose)
        if answer is not None:
            return answer

        # Step 2: the FAQ gave no confident answer (including the case where
        # no FAQ hit passed the threshold), so search the coursework.
        if verbose:
            print('Search in coursework')
        results_course = db.similarity_search_with_score(question, k=5)

        if verbose:
            print('\tanswers retrieved')
        retrieved_answers = ''
        for doc, score in results_course:
            if verbose:
                print('\t\ttext: {}\n\tconf:{}\n'.format(doc.page_content, score))
            if score < self.search_conf_thresh:
                retrieved_answers += ' {}'.format(doc.page_content)

        answer = self._confident_answer(question, retrieved_answers, verbose)
        if answer is not None:
            return answer

        # Step 3: nothing usable was found.
        return ''' I dont think I know the answer for this, let me check with the professor.'''
        
# Bot instance used by the interactive loop in the next cell.
fb = FAQBot( )

In [None]:
# Have the viva
# Interactive session: read a question, print the bot's reply; an empty line
# ends the session.
while True:
    query = input("\nUser: ").strip()
    print()
    if not query:
        break
    answer = fb.get_completion_from_messages(query)
    print('bot answer: {}'.format(answer), '----------------------\n', sep='\n')

# FAQ Bot v2
Step1: Search both in FAQ (Search in a list of Qs, to find the nearest Q and personalize the answer with GPT) and coursework.

Step2: Answer based on both FAQ and coursework.

Step3: If no good answer is found, send a static message  - I dont think I know the answer for this, let me check with the professor.

I am not sure if this is better than v1. Need to test with more examples

In [None]:
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from pydantic import BaseModel, Field
from enum import Enum
from langchain_openai import ChatOpenAI

import os
from dotenv import load_dotenv
load_dotenv()

# Verdict the LLM returns alongside its answer; FAQBot only passes an answer
# on when the status is YES.
# NOTE(review): the values are full sentences, presumably so the format
# instructions shown to the model spell out what each choice means - confirm.
class IsAnswerable(Enum):
    YES = "YES - the given 'question' can be confidently answered using the given 'context'"
    NO = "NO - the given 'question' cannot be answered with the given 'context'"  

# Structured reply expected from the LLM: an answerability verdict plus the
# answer text. FAQBot builds its PydanticOutputParser from this model.
# NOTE(review): `status` has an empty description, so the model gets no
# guidance for that field beyond the enum values - confirm this is intended.
class AnswerStatus(BaseModel):
    status: IsAnswerable = Field(description="")
    answer: str = Field(description="answer the student's 'question' based solely on the given 'context'.")

class FAQBot():
    """FAQ bot v2: answer from FAQ hits and coursework snippets combined.

    Relies on the module-level ``db_faq`` and ``db`` Chroma stores and on
    ``qna_dict`` created by the preprocessing cell.
    """

    def __init__(self):
        self.parser = PydanticOutputParser(pydantic_object=AnswerStatus)

        self.model = ChatOpenAI(
            model_name='gpt-3.5-turbo',
            openai_api_key=os.getenv("OPENAI_API_KEY"),
            openai_organization=os.getenv("OPENAI_ORGANIZATION"),
        )

        # Fallback parser: asks the model to repair a reply the strict parser
        # rejected.
        self.fix_parser = OutputFixingParser.from_llm(parser=self.parser, llm=self.model, max_retries=3)

        self.prompt = PromptTemplate(
            template =  '''
                You're a helpful teaching assistant for a technical course on {course}. You will only answer student's 'question' based on the given 'context' of the course.\n
                The 'context' can be noisy, with some unrelated text. Make sure to ignore this unrelated text while answering. 
                
                Apply chain of thoughts, and think before you answer. The answer you are providing is it really helping the student to understand the concept completely?
                
                {format_instructions}\n
                
                ***
                'query' : {question}
                ***
                
                $$$
                'context' : {context}
                $$$

                I am reminding you again, you are a teaching assistant, do not add any facts into the answer that is not given in the 'context'
            ''',
            input_variables=["question", "context", "course"],
            partial_variables={
                "format_instructions": self.parser.get_format_instructions(),
            },
        )
        # Search hits are kept only when their score is below this value.
        self.search_conf_thresh = 1

    def get_completion_from_messages(self, question, verbose=True):
        """Answer ``question`` from FAQ and coursework hits in a single model call.

        Returns the model's answer when it reports the question as
        answerable; otherwise the static fallback message.
        """
        context = ''

        ## search in faq
        if verbose:
            print('Search in FAQ')
        faq_hits = db_faq.similarity_search_with_score(question, k=3)

        ### save only the high confidence search results
        if verbose:
            print('\tanswers retrieved')
        for doc, score in faq_hits:
            if verbose:
                print('\t\ttext: {}\n\t\tChapter: {}\n\t\tconf:{}\n'.format(doc.page_content, doc.metadata, score))
            if score < self.search_conf_thresh:
                # pair the matched question with its stored answer
                context += ' Question:{}\n Answer:{}\n'.format(doc.page_content, qna_dict[doc.page_content])

        ## search in coursework
        if verbose:
            print('Search in coursework')
        course_hits = db.similarity_search_with_score(question, k=5)

        ### save only the high confidence search results
        if verbose:
            print('\tanswers retrieved')
        for doc, score in course_hits:
            if verbose:
                print('\t\ttext: {}\n\t\tChapter: {}\n\t\tconf:{}\n'.format(doc.page_content, doc.metadata, score))
            if score < self.search_conf_thresh:
                context += ' {}'.format(doc.page_content)

        # Nothing passed the threshold in either store: no model call.
        if not len(context):
            return ''' I dont think I know the answer for this, let me check with the professor.'''

        prompt_string = self.prompt.format_prompt(question=question, context=context, course = 'Distributed Algorithms').to_string()
        if verbose:
            print(prompt_string)

        response = self.model([HumanMessage(prompt_string)])
        if verbose:
            print('\t\t\tRaw GPT response: {}\n'.format(response))

        try:
            parsed = self.parser.parse(response.content)
        except Exception:
            parsed = self.fix_parser.parse(response.content)

        if verbose:
            print('\t\t\tfinal response: {}\n'.format(parsed))

        if parsed != None and parsed.status == IsAnswerable.YES:
            return parsed.answer
        return ''' I dont think I know the answer for this, let me check with the professor.'''
                                
        
# Bot instance used by the interactive loop in the next cell.
fb = FAQBot( )

In [None]:
# Have the viva
# Interactive session: read a question, print the bot's reply; an empty line
# ends the session.
while True:
    query = input("\nUser: ").strip()
    print()
    if not query:
        break
    answer = fb.get_completion_from_messages(query)
    print('bot answer: {}'.format(answer), '----------------------\n', sep='\n')

# FAQ Bot v3

Step1: Search both in FAQ (Search in a list of Qs, to find the nearest Q and personalize the answer with GPT) and coursework.

Step2: Answer based on both FAQ and coursework. Unlike v1 and v2, for the output of the coursework search we use an extended version of the snippets, i.e., we extend the context by ~512 characters on either end of each retrieved snippet. This is to give GPT more context to answer.
#TODO: This extended context still contains unrelated text, since we are arbitrarily choosing ~512 characters. In the future, before indexing the coursework, we need to parse the documents using cheaper offline models to identify the sections and sub-sections in a document, and use the entire section/sub-section as the context instead of chunks with arbitrary start and end indices. During this preprocessing, we can also identify tables/images relevant to these sections, and show them when we generate an answer from the relevant section.

Step3: If no good answer is found, send a static message  - I dont think I know the answer for this, let me check with the professor.

I am not sure if this is better than v1. Need to test with more examples

In [None]:
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.prompts import PromptTemplate
from langchain.output_parsers import PydanticOutputParser, OutputFixingParser
from pydantic import BaseModel, Field
from enum import Enum
from langchain_openai import ChatOpenAI
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma
import json

import os
from dotenv import load_dotenv
load_dotenv()

# Verdict the LLM returns alongside its answer; FAQBot only passes an answer
# on when the status is YES.
# NOTE(review): the values are full sentences, presumably so the format
# instructions shown to the model spell out what each choice means - confirm.
class IsAnswerable(Enum):
    YES = "YES - the given 'question' can be confidently answered using the given 'context'"
    NO = "NO - the given 'question' cannot be answered with the given 'context'"  

# Structured reply expected from the LLM: an answerability verdict plus the
# answer text (requested as HTML in this version). FAQBot builds its
# PydanticOutputParser from this model.
# NOTE(review): `status` has an empty description, so the model gets no
# guidance for that field beyond the enum values - confirm this is intended.
class AnswerStatus(BaseModel):
    status: IsAnswerable = Field(description="")
    answer: str = Field(description="answer the student's 'question' based solely on the given 'context'. Answer only in HTML format, include images if relevant, and use math style for equations. ")

class FAQBot():
    """FAQ bot v3: answers from FAQ hits plus extended coursework sections.

    Unlike v1/v2 this class is self-contained: it opens the persisted Chroma
    stores and the JSON side files written by the preprocessing cell.
    """

    def __init__(self, course='Distributed Algorithms'):
        """Load the model, the search stores and the lookup tables.

        Args:
            course: course name inserted into the prompt.
        """
        self.course = course

        self.model = ChatOpenAI(
            model_name='gpt-3.5-turbo',
            openai_api_key=os.getenv("OPENAI_API_KEY"),
            openai_organization=os.getenv("OPENAI_ORGANIZATION"),
        )

        embedding_function = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
        self.db = Chroma(persist_directory="./chroma/db", embedding_function=embedding_function, collection_name="course")
        self.db_faq = Chroma(persist_directory="./chroma/db_faq", embedding_function=embedding_function, collection_name="faq")
        # question -> stored answer(s)
        with open('./chroma/qna_dict') as qna_file:
            self.qna_dict = json.load(qna_file)
        # source path -> {section index (a string after the JSON round trip)
        # -> section HTML}
        with open('./chroma/course_db') as course_file:
            self.course_db = json.load(course_file)

        self.parser = PydanticOutputParser(pydantic_object=AnswerStatus)
        # Fallback parser: asks the model to repair a reply the strict parser
        # rejected.
        self.fix_parser = OutputFixingParser.from_llm(parser=self.parser, llm=self.model, max_retries=3)
        self.prompt = PromptTemplate(
            template =  '''
                You're a helpful teaching assistant for a technical course on {course}. You will only answer student's 'question' based on the given 'context' of the course.\n
                The 'context' is a combination of two things - 1) Previous question and answers on the {course} that are similar to the student's question, and 2) some snippets of text from the course contents that are relevant to the student's 'question'.
                
                {format_instructions}\n
                
                ***
                'query' : {question}
                ***
                
                $$$
                'context' : {context}
                $$$
                I am reminding you again, you are a teaching assistant, do not add any facts into the answer that is not given in the 'context'. 
                Answer only in HTML format, include images if relevant, and use math style for equations.
            ''',
            input_variables=["question", "context", "course"],
            partial_variables={
                "format_instructions": self.parser.get_format_instructions(),
            },
        )
        # Search hits are kept only when their score is below this value.
        self.search_conf_thresh = 1
        self.excuse_me_msg = '''<p>I dont think I know the answer for this, let me check with the professor.</p>'''

    def _extended_context(self, source, split, neighboring_sections):
        """Join section ``split`` of ``source`` with its neighbours on both sides.

        Covers ``split - neighboring_sections`` through
        ``split + neighboring_sections`` inclusive; indices that do not exist
        are skipped. Each section is prefixed with a newline.
        """
        sections = self.course_db.get(source, {})
        window = range(split - neighboring_sections, split + neighboring_sections + 1)
        return ''.join(
            '\n{}'.format(sections[str(ind)]) for ind in window if str(ind) in sections
        )

    def ask_question(self, question, verbose=False):
        """Answer ``question`` in HTML from FAQ hits and coursework sections.

        Args:
            question: the student's question.
            verbose: print search hits, the prompt and raw model replies.

        Returns:
            str: the model's answer when it reports the question as
            answerable, otherwise ``self.excuse_me_msg``.
        """
        retrieved_answers = ''

        ## search in faq
        if verbose:
            print('Search in FAQ')
        results_faq = self.db_faq.similarity_search_with_score(question, k=3)

        ### save only the high confidence search results
        if verbose:
            print('\tanswers retrieved')

        is_faq_title_printed = False
        for doc, score in results_faq:
            if verbose:
                print('\t\ttext: {}\n\t\tChapter: {}\n\t\tconf:{}\n'.format(doc.page_content, doc.metadata, score))

            if score < self.search_conf_thresh:
                if not is_faq_title_printed:
                    retrieved_answers += '''Question and Answers from the past that are similar to the student's question\n-----------------\n'''
                    is_faq_title_printed = True
                # collect the corresponding answers of the qna pair for gpt
                retrieved_answers += ' Question:{}\n Answer:{}\n'.format(doc.page_content, self.qna_dict[doc.page_content])

        ## search in coursework
        if verbose:
            print('Search in coursework')
        results = self.db.similarity_search_with_score(question, k=5)

        ### save only the high confidence search results
        if verbose:
            print('\tanswers retrieved')

        is_snippet_title_printed = False
        max_chapters = 3
        neighboring_sections = 2 # + or -
        chapter_cnt = 0
        seen_chapters = set()
        for doc, score in results:
            source = doc.metadata['source']
            split = doc.metadata['split']
            source_sections = self.course_db.get(source, {})
            if verbose:
                print('\t\ttext: {}\n\t\tChapter: {}\n\t\tSection: {}\n\t\tconf:{}\n'.format(doc.page_content, source, split, score))
                print(source_sections.keys())
            if score >= self.search_conf_thresh:
                continue

            if not is_snippet_title_printed:
                retrieved_answers += '''\n$$$$$$$$$$\nSnippets of text from the course that are relevant to the student's question\n-----------------\n'''
                is_snippet_title_printed = True

            # Each source contributes at most once, up to max_chapters sources.
            if source in seen_chapters or chapter_cnt >= max_chapters:
                continue

            extended_context = self._extended_context(source, split, neighboring_sections)
            retrieved_answers += '\n Relevant text snippet {}: {}\n\n '.format(chapter_cnt, extended_context)
            if verbose:
                html_str = source_sections.get(str(split), '')
                print('\n\t\tlength:({}, {})'.format(len(html_str), len(extended_context)))

            seen_chapters.add(source)
            chapter_cnt += 1
            # Stop adding sources once the accumulated context passes 2000
            # characters.
            if len(retrieved_answers) > 2000:
                if verbose:
                    print('retrieved_answers length greater than 2000 : {}'.format(len(retrieved_answers)))
                break

        #### if there is atleast one search result ask GPT to answer
        if not retrieved_answers:
            return self.excuse_me_msg

        prompt_string = self.prompt.format_prompt(question=question, context=retrieved_answers, course=self.course).to_string()

        if verbose:
            print(prompt_string)
        response = self.model([HumanMessage(prompt_string)])

        if verbose:
            print('\t\t\tRaw GPT response: {}\n'.format(response))

        try:
            faq_response = self.parser.parse(response.content)
        except Exception:
            # Strict parsing failed; let the fixing parser retry.
            faq_response = self.fix_parser.parse(response.content)

        if verbose:
            print('\t\t\tfinal response: {}\n'.format(faq_response))

        if faq_response is not None and faq_response.status == IsAnswerable.YES:
            return faq_response.answer
        return self.excuse_me_msg
                                
        
# Bot instance used by the interactive loop in the next cell.
fb = FAQBot( )

In [None]:
# Have the viva
# Interactive session: read a question, print the bot's reply; an empty line
# ends the session.
while True:
    query = input("\nUser: ").strip()
    print()
    if not query:
        break
    answer = fb.ask_question(query)
    print('bot answer: {}'.format(answer), '----------------------\n', sep='\n')

In [None]:
# html_filename = 'tutorial/mani/DISTRIBUTED_SYSTEM_MODELS/DistributedSystemModels.html'
# Path of a single course page; nothing in the visible cells reads it.
# NOTE(review): presumably left over from ad-hoc testing of the HTML helpers - confirm.
html_filename = 'tutorial/mani/Byzantine/ByzantineOral.html'