# Building a Chatbot for an Online Course out of FAQ Documents

In [1]:
import requests 
from pprint import pprint

from openai import OpenAI
from elasticsearch import Elasticsearch



## Step 1: Acquire FAQ Data & Explore

In [2]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

In [3]:
print(type(documents_raw))
print(len(documents_raw))
pprint(documents_raw[0].keys())
pprint(documents_raw[0]['documents'][0])

<class 'list'>
3
dict_keys(['course', 'documents'])
{'question': 'Course - When will the course start?',
 'section': 'General course-related questions',
 'text': 'The purpose of this document is to capture frequently asked '
         'technical questions\n'
         'The exact day and hour of the course will be 15th Jan 2024 at 17h00. '
         "The course will start with the first  “Office Hours'' live.1\n"
         'Subscribe to course public Google Calendar (it works from Desktop '
         'only).\n'
         'Register before the course starts using this link.\n'
         'Join the course Telegram channel with announcements.\n'
         "Don’t forget to register in DataTalks.Club's Slack and join the "
         'channel.'}


In [4]:
# denormalize data to unify into one data type

documents = []

for course in documents_raw:
    
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [5]:
print("Number of FAQs: ", len(documents), "\n")

print("Structure of an FAQ: ")
pprint(documents[0])


Number of FAQs:  948 

Structure of an FAQ: 
{'course': 'data-engineering-zoomcamp',
 'question': 'Course - When will the course start?',
 'section': 'General course-related questions',
 'text': 'The purpose of this document is to capture frequently asked '
         'technical questions\n'
         'The exact day and hour of the course will be 15th Jan 2024 at 17h00. '
         "The course will start with the first  “Office Hours'' live.1\n"
         'Subscribe to course public Google Calendar (it works from Desktop '
         'only).\n'
         'Register before the course starts using this link.\n'
         'Join the course Telegram channel with announcements.\n'
         "Don’t forget to register in DataTalks.Club's Slack and join the "
         'channel.'}


## Step 2: Build a text search engine 

In [6]:
es = Elasticsearch("http://localhost:9200")

info = es.info()

print(type(info))
print(info.keys())
pprint(info['version'])


<class 'elastic_transport.ObjectApiResponse'>
dict_keys(['name', 'cluster_name', 'cluster_uuid', 'version', 'tagline'])
{'build_date': '2022-10-04T07:17:24.662462378Z',
 'build_flavor': 'default',
 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73',
 'build_snapshot': False,
 'build_type': 'docker',
 'lucene_version': '9.3.0',
 'minimum_index_compatibility_version': '7.0.0',
 'minimum_wire_compatibility_version': '7.17.0',
 'number': '8.4.3'}


In [8]:
# Settings for elasticsearch index

index_name = "course-questions"

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

response = es.indices.create(index=index_name, body=index_settings)

In [None]:
pprint(response)

In [12]:
# Load documents (questions) into ElasticSearch

from tqdm.auto import tqdm

for doc in tqdm(documents):
    es.index(index=index_name, document=doc)

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:01<00:00, 551.74it/s]


In [13]:
# Let's see what we've got:

indices = es.cat.indices(format="json")
pprint(indices)

print("----")

# Count documents in a specific index
count = es.count(index=index_name, body={"query": {"match_all": {}}})

# Output the count
print("Number of documents in the index:", count['count'])

ListApiResponse([{'health': 'green', 'status': 'open', 'index': 'course-questions', 'uuid': 'ZKQ8YpScSYyW7ZOYkZPDkQ', 'pri': '1', 'rep': '0', 'docs.count': '948', 'docs.deleted': '2844', 'store.size': '2.6mb', 'pri.store.size': '2.6mb'}])
----
Number of documents in the index: 948


In [10]:
index_info = es.indices.get(index="course-questions")
pprint(index_info)

response = es.delete_by_query(index="course-questions", body={"query": {"match_all": {}}})

# print(response)


ObjectApiResponse({'course-questions': {'aliases': {}, 'mappings': {'properties': {'course': {'type': 'keyword'}, 'question': {'type': 'text'}, 'section': {'type': 'text'}, 'text': {'type': 'text'}}}, 'settings': {'index': {'routing': {'allocation': {'include': {'_tier_preference': 'data_content'}}}, 'number_of_shards': '1', 'provided_name': 'course-questions', 'creation_date': '1719969884844', 'number_of_replicas': '0', 'uuid': 'ZKQ8YpScSYyW7ZOYkZPDkQ', 'version': {'created': '8040399'}}}}})


### Now you're set up for RAG: Retrieval!

In [14]:
query = "How do I execute a command in a running docker container?"


In [15]:
def retrieve_documents(query, index_name="course-questions", max_results=5):
    es = Elasticsearch("http://localhost:9200")
    
    search_query = {
        "size": max_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }
    
    response = es.search(index=index_name, body=search_query)
    documents = [hit['_source'] for hit in response['hits']['hits']]
    print([hit['_score'] for hit in response['hits']['hits']])
    pprint(documents)
    return documents

In [16]:
docs = retrieve_documents(query)

[75.54128, 43.922554, 38.684105, 38.33403, 35.94081]
[{'course': 'data-engineering-zoomcamp',
  'question': 'PGCLI - running in a Docker container',
  'section': 'Module 1: Docker and Terraform',
  'text': 'In case running pgcli  locally causes issues or you do not want to '
          'install it locally you can use it running in a Docker container '
          'instead.\n'
          'Below the usage with values used in the videos of the course for:\n'
          'network name (docker network)\n'
          'postgres related variables for pgcli\n'
          'Hostname\n'
          'Username\n'
          'Port\n'
          'Database name\n'
          '$ docker run -it --rm --network pg-network '
          'ai2ys/dockerized-pgcli:4.0.1\n'
          '175dd47cda07:/# pgcli -h pg-database -U root -p 5432 -d ny_taxi\n'
          'Password for root:\n'
          'Server: PostgreSQL 16.1 (Debian 16.1-1.pgdg120+1)\n'
          'Version: 4.0.1\n'
          'Home: http://pgcli.com\n'
          'root@

## 3. Run your LLM

In [17]:
from openai import OpenAI
client = OpenAI()

completion = client.chat.completions.create(
  model=str("gpt-3.5-turbo").encode("ascii"),
  messages=[
    {"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
    {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
  ]
)

print(completion.choices[0].message)

UnicodeEncodeError: 'ascii' codec can't encode character '\u201c' in position 7: ordinal not in range(128)