In [10]:
import os

def loadFAQs(directory_path):
    """Load FAQ sections from every ``.txt`` file directly inside a directory.

    Each file is decoded as UTF-8 and split on the literal ``=====``
    separator. Every piece is whitespace-stripped, and pieces that are empty
    after stripping (e.g. after a trailing separator, or from an empty file)
    are dropped.

    Args:
        directory_path: Path of the directory to scan (not recursive).

    Returns:
        A dict mapping each file name without its extension to the list of
        non-empty section strings, in the order they appear in the file.
        Files are visited in sorted name order, so the dict order is the same
        on every run and platform.

    Raises:
        OSError: If the directory cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    faqs = {}

    # os.listdir() returns entries in arbitrary order; sorting keeps anything
    # derived from the iteration order (such as positional ids) reproducible.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".txt"):  # assuming FAQs are in .txt files
            continue

        file_path = os.path.join(directory_path, filename)

        # Explicit encoding: the platform default is not UTF-8 everywhere, and
        # the FAQ text contains non-ASCII characters such as curly quotes.
        with open(file_path, encoding="utf-8") as f:
            raw_faq = f.read()

        filename_without_ext = os.path.splitext(filename)[0]  # remove .txt extension

        stripped_sections = (text.strip() for text in raw_faq.split('====='))
        faqs[filename_without_ext] = [text for text in stripped_sections if text]

    return faqs

In [11]:
# Store data from txt-docs: loadFAQs returns a dict keyed by file name
# (extension removed) whose values are lists of section strings.
faqs = loadFAQs('./txt-docs')
# Bare expression so the notebook renders the loaded dict as the cell output.
faqs


{'faq': ['Who are you and what can you do?\n\nI am a RAG based chatbot engine based on OCI Generative AI Service and Oracle Database 23ai. I can answer questions about Oracle Cloud (OCI) and especially about the Free Trial and Always Free programs.',
  'What is Oracle Cloud Free Tier?\n\nOracle Cloud Free Tier allows you to sign up for an Oracle Cloud account which provides a number of Always Free services and a Free Trial with US$300 of free credit to use on all eligible Oracle Cloud Infrastructure services for up to 30 days. The Always Free services are available for an unlimited period of time. The Free Trial services may be used until your US$300 of free credits are consumed or the 30 days has expired, whichever comes first.',
  'Who should use Oracle Cloud Free Tier?\n\nOracle Cloud Free Tier services are for everyone. Whether you’re a developer building and testing applications, a startup founder creating new systems with the intention of scaling later, an enterprise looking to t

In [12]:
# Flatten the per-file section lists into one list of chunk records.
# 'text' is the section prefixed with its source file name; 'path' keeps
# that file name on its own.
docs = [
    {'text': ' | '.join((source, chunk)), 'path': source}
    for source, chunks in faqs.items()
    for chunk in chunks
]

# Sample the resulting data
docs[:2]


[{'text': 'faq | Who are you and what can you do?\n\nI am a RAG based chatbot engine based on OCI Generative AI Service and Oracle Database 23ai. I can answer questions about Oracle Cloud (OCI) and especially about the Free Trial and Always Free programs.',
  'path': 'faq'},
 {'text': 'faq | What is Oracle Cloud Free Tier?\n\nOracle Cloud Free Tier allows you to sign up for an Oracle Cloud account which provides a number of Always Free services and a Free Trial with US$300 of free credit to use on all eligible Oracle Cloud Infrastructure services for up to 30 days. The Always Free services are available for an unlimited period of time. The Free Trial services may be used until your US$300 of free credits are consumed or the 30 days has expired, whichever comes first.',
  'path': 'faq'}]

In [13]:
# Connect to the Oracle Database 23ai
import os

import oracledb

# Credentials and DSN are read from the environment so real secrets never
# have to be written into the notebook. The fallbacks are the original
# local demo values, so an unconfigured local setup keeps working.
un = os.environ.get("ORACLE_USER", "vector")
pw = os.environ.get("ORACLE_PASSWORD", "vector")
cs = os.environ.get("ORACLE_DSN", "localhost/FREEPDB1")

# Raises an oracledb exception if the database is unreachable or the
# credentials are rejected.
connection = oracledb.connect(user=un, password=pw, dsn=cs)


In [14]:
table_name = 'faqs'

with connection.cursor() as cursor:
    # Create the table (skipped by the database when it already exists):
    #   id      - numeric primary key
    #   payload - CLOB constrained to hold JSON
    #   vector  - VECTOR column for the embedding
    create_table_sql = f"""
    CREATE TABLE IF NOT EXISTS {table_name} (
        id NUMBER PRIMARY KEY,
        payload CLOB CHECK (payload IS JSON),
        vector VECTOR
    )"""

    # No try/except here: a handler that only re-raises adds nothing, so any
    # oracledb.DatabaseError simply propagates to the caller unchanged.
    cursor.execute(create_table_sql)

# Commit every subsequent statement on this connection automatically.
connection.autocommit = True


In [15]:
from sentence_transformers import SentenceTransformer
# Instantiate the sentence-transformers model named 'all-MiniLM-L12-v2';
# `encoder` is the object whose .encode() turns text into embedding vectors.
# NOTE(review): presumably the weights are downloaded on first use and cached
# locally afterwards - confirm for offline environments.
encoder = SentenceTransformer('all-MiniLM-L12-v2')


In [16]:
import array

# Wrap every chunk in a record: its position in `docs` becomes the id, the
# chunk text is what gets embedded, and the whole chunk dict is the payload.
data = [
    {"id": position, "vector_source": doc['text'], "payload": doc}
    for position, doc in enumerate(docs)
]

# Strings to embed, in the same order as `data`
texts = [str(record['vector_source']) for record in data]

# Encode everything in one call, 32 texts per batch, with a progress bar
embeddings = encoder.encode(texts, batch_size=32, show_progress_bar=True)

# Attach each embedding to its record as a float32 ("f") array
for row, embedding in zip(data, embeddings):
    row['vector'] = array.array("f", embedding)


Batches: 100%|██████████| 7/7 [00:04<00:00,  1.57it/s]


In [17]:
# Insert the chunks + vectors in the database
import json

with connection.cursor() as cursor:
    # Empty the table first so the insert below starts from a clean slate
    cursor.execute(f"truncate table {table_name}")

    # One bind tuple per record: (id, payload serialized to JSON text, vector)
    prepared_data = [
        (record['id'], json.dumps(record['payload']), record['vector'])
        for record in data
    ]

    # Positional binds :1, :2, :3 line up with the tuple fields above
    insert_sql = f"""INSERT INTO {table_name} (id, payload, vector)
        VALUES (:1, :2, :3)"""

    # Send all rows in a single batched call
    cursor.executemany(insert_sql, prepared_data)

connection.commit()


In [18]:
with connection.cursor() as cursor:
    # Read back every column of every row in the table
    query = f"SELECT * FROM {table_name}"

    # For a query, execute() hands back the cursor itself, so the fetch can
    # be chained directly onto it.
    rows = cursor.execute(query).fetchall()

    # Show only the first five rows to keep the cell output short
    for row in rows[:5]:
        print(row)


(0, {'text': 'faq | Who are you and what can you do?\n\nI am a RAG based chatbot engine based on OCI Generative AI Service and Oracle Database 23ai. I can answer questions about Oracle Cloud (OCI) and especially about the Free Trial and Always Free programs.', 'path': 'faq'}, array('f', [-0.02923165075480938, -0.02979695051908493, 0.009940098971128464, -0.000592286407481879, -0.010172951966524124, -0.06049307435750961, -0.021404225379228592, 0.04285281151533127, -0.057239823043346405, 0.06151881068944931, -0.03209104761481285, -0.06717190891504288, -0.019725656136870384, -0.012832336127758026, -0.01560217048972845, -0.04707515612244606, 0.06046362221240997, 0.06774024665355682, -0.014224204234778881, -0.07412220537662506, -0.07555384933948517, 0.0347326435148716, -0.00773048447445035, -0.13549751043319702, -0.06386523693799973, -0.038585904985666275, 0.008361537009477615, 0.011630968190729618, -0.024588435888290405, -0.050505947321653366, 0.002089392626658082, 0.015346381813287735, 0.0