# Script Overview

This script performs a question-answering task based on a set of FAQ documents for DataTalksClub's courses. 
Here's a breakdown of what it does:

1. **Loading Documents**: It loads documents from a JSON file containing FAQ data for various courses.

2. **Creating Elasticsearch Index**: It creates an Elasticsearch index with appropriate settings and mappings 
   to store the FAQ documents.

3. **Indexing Documents**: It indexes the loaded documents into the Elasticsearch index.

4. **Retrieving Documents**: It retrieves the documents most relevant to a user's question from Elasticsearch, 
   filtered to the `data-engineering-zoomcamp` course.

5. **Building Context**: It builds a context string from the retrieved documents.

6. **Building Prompt**: It constructs a prompt for OpenAI's chat completion API (default model `gpt-3.5-turbo`), 
   incorporating the user's question and the context retrieved from Elasticsearch.

7. **Asking OpenAI**: It generates an answer using OpenAI's chat completion API based on the constructed prompt.

8. **QA Bot**: It combines the functionalities of retrieving documents, building context, building a prompt, and 
   asking OpenAI into a single question-answering bot.

9. **Main Execution**: It executes the QA bot on a predefined user question, retrieves an answer, and prints it.


In [None]:
import json
from elasticsearch import Elasticsearch
from openai import OpenAI
from tqdm.auto import tqdm

In [None]:
def load_documents(file_path):
    """
    Load FAQ documents from a JSON file and flatten them into a single list.

    The file must contain a list of course entries, each holding a 'course'
    name and a 'documents' list. Every document is returned as a new
    dictionary carrying the name of the course it was listed under.

    Parameters:
    file_path (str): Path to the JSON file containing documents.

    Returns:
    list: List of dictionaries containing document information, each with
        a 'course' key set to the name of its course.

    Raises:
    KeyError: If a course entry lacks the 'course' or 'documents' key.
    """
    # Decode explicitly as UTF-8 (the standard JSON encoding) so the result
    # does not depend on the platform's default locale encoding.
    with open(file_path, 'rt', encoding='utf-8') as file:
        documents_data = json.load(file)

    documents = []
    for course_info in documents_data:
        course_name = course_info['course']
        # Build new dicts instead of mutating the parsed JSON structure.
        documents.extend(
            {**doc, 'course': course_name} for doc in course_info['documents']
        )

    return documents

In [None]:
def create_index(es, index_name):
    """
    Create the Elasticsearch index used to store the FAQ documents.

    The index is configured with one shard and no replicas. The 'text',
    'section' and 'question' fields are mapped as full-text fields and
    'course' is mapped as a keyword.

    Parameters:
    es: Elasticsearch client.
    index_name (str): Name of the index to be created.

    Returns:
    dict: Response from Elasticsearch.
    """
    shard_settings = {"number_of_shards": 1, "number_of_replicas": 0}

    # Free-text fields share the same mapping; 'course' is stored as a
    # keyword so it can be matched exactly.
    field_mappings = {
        field: {"type": "text"} for field in ("text", "section", "question")
    }
    field_mappings["course"] = {"type": "keyword"}

    request_body = {
        "settings": shard_settings,
        "mappings": {"properties": field_mappings},
    }
    return es.indices.create(index=index_name, body=request_body)

In [None]:
def index_documents(es, index_name, documents):
    """
    Send every document to Elasticsearch, one index request per document.

    Progress is displayed with a tqdm progress bar while the documents
    are being sent.

    Parameters:
    es: Elasticsearch client.
    index_name (str): Name of the index where documents will be indexed.
    documents (list): List of dictionaries containing document information.

    Returns:
    None
    """
    progress = tqdm(documents)
    for document in progress:
        es.index(index=index_name, body=document)

In [None]:
def retrieve_documents(es, query, index_name="course-questions", max_results=5,
                       course="data-engineering-zoomcamp"):
    """
    Retrieve documents from Elasticsearch based on a query.

    Runs a 'best_fields' multi-match over the 'question', 'text' and
    'section' fields, with 'question' boosted by a factor of 3, and
    optionally restricts the results to a single course.

    Parameters:
    es: Elasticsearch client.
    query (str): Query string.
    index_name (str): Name of the index to search in.
    max_results (int): Maximum number of results to retrieve.
    course (str or None): Course whose documents should be searched. Defaults
        to "data-engineering-zoomcamp"; pass None to search across all
        courses.

    Returns:
    list: List of dictionaries containing retrieved documents (the
        '_source' of every hit, in the order Elasticsearch returned them).
    """
    bool_query = {
        "must": {
            "multi_match": {
                "query": query,
                "fields": ["question^3", "text", "section"],
                "type": "best_fields"
            }
        }
    }
    # The course restriction is a filter clause: it narrows the candidate
    # set without contributing to the relevance score.
    if course is not None:
        bool_query["filter"] = {"term": {"course": course}}

    search_query = {
        "size": max_results,
        "query": {"bool": bool_query}
    }

    response = es.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
def build_context(documents):
    """
    Render the retrieved documents as one context string.

    Each document becomes a three-line entry (section, question, answer);
    entries are separated by a blank line and surrounding whitespace is
    stripped from the final result.

    Parameters:
    documents (list): List of dictionaries containing document information;
        each must provide the 'section', 'question' and 'text' keys.

    Returns:
    str: Context string (empty when no documents are given).
    """
    entries = [
        f"Section: {doc['section']}\nQuestion: {doc['question']}\nAnswer: {doc['text']}"
        for doc in documents
    ]
    return "\n\n".join(entries).strip()

In [None]:
def build_prompt(user_question, context):
    """
    Assemble the prompt sent to the chat model.

    The prompt consists of fixed instructions, followed by the user's
    question and the FAQ context. Leading and trailing whitespace is
    stripped from the assembled text.

    Parameters:
    user_question (str): User's question.
    context (str): Context string.

    Returns:
    str: Prompt string.
    """
    prompt_lines = [
        "You're a course teaching assistant.",
        "Answer the user QUESTION based on CONTEXT - the documents retrieved from our FAQ database.",
        "Don't use other information outside of the provided CONTEXT.",
        "",
        f"QUESTION: {user_question}",
        "",
        "CONTEXT:",
        "",
        f"{context}",
    ]
    return "\n".join(prompt_lines).strip()

In [None]:
def ask_openai(prompt, client, model="gpt-3.5-turbo"):
    """
    Send the prompt as a single user message to the chat completions API.

    Parameters:
    prompt (str): Prompt string.
    client: OpenAI client.
    model (str): Model name.

    Returns:
    str: Content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


In [None]:
def qa_bot(user_question, es, openai_client):
    """
    Answer a question from the FAQ documents stored in Elasticsearch.

    Chains the pipeline steps: retrieve matching documents, turn them into
    a context string, wrap question and context in a prompt, and ask the
    chat model for the answer.

    Parameters:
    user_question (str): User's question.
    es: Elasticsearch client.
    openai_client: OpenAI client.

    Returns:
    str: Generated answer.
    """
    retrieved = retrieve_documents(es, user_question)
    prompt = build_prompt(user_question, build_context(retrieved))
    return ask_openai(prompt, openai_client)

In [None]:
# Load documents
documents = load_documents('./DocDataExtraction/documents.json')

# Elasticsearch setup
es = Elasticsearch("http://localhost:9200")
index_name = "course-questions"

# Create and populate the index only when it does not exist yet.
# - Creating it explicitly applies the intended mappings ('course' as a
#   keyword); if the first write auto-created the index instead, those
#   mappings would not be applied.
# - Indexing only on first creation keeps re-runs of this cell from adding
#   a duplicate copy of every document.
# To rebuild from scratch, delete the index first and re-run this cell.
if not es.indices.exists(index=index_name):
    create_index(es, index_name)
    index_documents(es, index_name, documents)

# OpenAI client
openai_client = OpenAI()

In [None]:
# Example question to put to the FAQ bot
user_question = (
    "How to run a dbt-core project as an Airflow Task Group "
    "on Google Cloud Composer using a service account JSON key?"
)

# Run the full pipeline (retrieve -> context -> prompt -> model) and show the reply
answer = qa_bot(user_question, es, openai_client)
print(answer)
