#### Setup and Configuration

In [1]:
import re
import numpy as np
import faiss
import openai
from tqdm.auto import tqdm
import requests
import yaml

#### Load OpenAI API Key

In [2]:
# Parse the YAML credentials file; safe_load only builds plain Python objects.
with open("../secrets/credentials.yml", "r") as credentials_file:
    config = yaml.safe_load(credentials_file)

# The value stored under this key is passed as the API key to the
# embedding requests made later in the notebook.
OPENAI_CREDENTIALS = config["OPENAI_CREDENTIALS"]

#### Read and Preprocess Markdown File

In [3]:
# Path to Markdown File
path_to_markdown_file = "../data/raw/mock_markdown.md"

# Read Markdown File
with open(path_to_markdown_file, "r", encoding="utf-8") as file:
    markdown_text = file.read()

# Split into sections by headers (levels 1-3).
# The pattern is anchored on line starts (re.MULTILINE) rather than on a leading
# "\n", so a header on the very first line and headers that directly follow one
# another are detected too. Only the header line itself is captured: the newline
# after it stays with the body, so "header + body" reproduces the file text
# exactly instead of gluing the header onto the first body line.
sections = re.split(r"^(#{1,3} .*)", markdown_text, flags=re.MULTILINE)

# With one capture group, re.split returns [preamble, header, body, header, body, ...],
# so the list length is always odd and sections[i + 1] always exists.
processed_sections = [sections[0]]
for i in range(1, len(sections), 2):
    processed_sections.append(sections[i] + sections[i + 1])

# Drop whitespace-only chunks (e.g. an empty preamble before the first header);
# they carry no content worth embedding.
processed_sections = [section for section in processed_sections if section.strip()]

# Preview the first and last section as a sanity check.
if processed_sections:
    print(processed_sections[0])
    print(processed_sections[-1])
else:
    print("No non-empty sections found in", path_to_markdown_file)

# Introduction to AI

Artificial intelligence (AI) has rapidly become a key technology in many industries, revolutionizing processes and efficiency.

# Conclusion
The rapid advancement of AI presents both opportunities and challenges. As we continue to explore the boundaries of what AI can achieve, it is crucial to address the ethical implications and ensure the technology is used for the betterment of society.


#### Generate Embeddings for Each Section

In [4]:
def query_openai_embedding(api_key, text, model="text-embedding-ada-002", timeout=30):
    """
    Queries OpenAI's embedding model for a single text and returns the embedding.

    The function never raises for network or API failures; every failure is
    reported as a ``{"error": "<message>"}`` dict so callers can tell success
    from failure with ``isinstance(result, dict)``.

    :param api_key: OpenAI API key.
    :param text: Text to generate embedding for.
    :param model: Embedding model to use.
    :param timeout: Seconds to wait for the HTTP request before giving up
        (passed to ``requests.post``); without it a stalled connection would
        block forever.
    :return: An embedding as a float32 numpy array or a structured error message.
    """
    url = "https://api.openai.com/v1/embeddings"
    headers = {"Authorization": f"Bearer {api_key}"}
    payload = {
        "input": text,
        "model": model,
    }

    try:
        response = requests.post(url, headers=headers, json=payload, timeout=timeout)
    except requests.RequestException as e:
        return {"error": f"Connection error: {e}"}

    if response.status_code == 200:
        try:
            data = response.json()
            return np.array(data["data"][0]["embedding"], dtype="float32")
        except (ValueError, KeyError, IndexError, TypeError) as e:
            # A 200 whose body is not the expected JSON shape is still a failure;
            # report it through the same error contract instead of crashing.
            return {"error": f"Malformed response from embeddings API: {e!r}"}

    error_message = f"HTTP Error {response.status_code}"
    try:
        error_details = response.json().get("error", {})
        message = error_details.get("message", "An unspecified error occurred")
    except (ValueError, AttributeError):
        # Body is not JSON, or the JSON / its "error" field is not an object.
        message = "Error details unavailable"
    error_message += f": {message}"
    return {"error": error_message}

#### Usage

In [5]:
# Embed every section, remembering which sections actually succeeded.
embeddings = []
embedded_sections = []
for text in tqdm(processed_sections, desc="Embedding sections"):
    embedding = query_openai_embedding(OPENAI_CREDENTIALS, text)
    if not isinstance(embedding, dict):
        embeddings.append(embedding)
        embedded_sections.append(text)
    else:
        # The helper signals failure with an {"error": ...} dict.
        print("Error retrieving embedding:", embedding["error"])

# Later cells look up section text by embedding row number, so the section list
# must contain exactly the sections that were embedded. Skipping a failed section
# without this would shift every following row onto the wrong text.
# (Re-run the preprocessing cell to retry sections that failed here.)
processed_sections = embedded_sections

# Convert the list of embeddings into a numpy array
embeddings = np.array(embeddings)
embeddings

array([[-0.01386116, -0.0071984 ,  0.00642108, ..., -0.01926976,
        -0.02738266, -0.01477566],
       [ 0.00907874, -0.02031003,  0.00869254, ..., -0.01133259,
        -0.01924642, -0.01218096],
       [-0.00515267, -0.03167649,  0.00684401, ..., -0.01012179,
         0.00393006, -0.00900079],
       ...,
       [ 0.00073092, -0.01179264,  0.03521476, ..., -0.00540034,
        -0.03646777, -0.00526329],
       [-0.0086527 , -0.02715379,  0.01826967, ..., -0.02409384,
        -0.01860395, -0.0146183 ],
       [-0.00052276, -0.03423951,  0.0165895 , ..., -0.02817942,
        -0.01378671, -0.01450634]], dtype=float32)

#### Create FAISS Index

In [6]:
# Create FAISS Index
# The index needs a 2-D (n_vectors, dimension) matrix. If every embedding request
# failed, `embeddings` is an empty 1-D array and `shape[1]` would only produce an
# opaque IndexError, so fail with an actionable message instead.
if embeddings.ndim != 2 or embeddings.shape[0] == 0:
    raise ValueError(
        "No embeddings available to index; check the embedding errors reported above."
    )

dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)  # flat index searched by L2 distance
index.add(embeddings)

#### Querying

In [7]:
# Query
query_text = "Artificial Intelligence"
top_k = 2  # number of nearest sections to display

query_embedding = query_openai_embedding(OPENAI_CREDENTIALS, query_text)
# The helper reports failures as an {"error": ...} dict; converting that dict to a
# float array would raise a confusing TypeError, so surface the real cause.
if isinstance(query_embedding, dict):
    raise RuntimeError(f"Could not embed the query: {query_embedding['error']}")
query_embedding = np.array(query_embedding, dtype="float32")

# Search the FAISS index
# Never ask for more neighbours than the index holds (but at least one).
k = max(1, min(top_k, index.ntotal))
distances, indices = index.search(query_embedding.reshape(1, -1), k)

# Display Top Similar Sections
print("Top similar sections to the query:")
for idx in indices[0]:
    if idx < 0:
        # A negative id marks a missing result; indexing with it would silently
        # print the last section instead.
        continue
    print(f"\nSection {idx+1}: {processed_sections[idx][:150]}...")

Top similar sections to the query:

Section 1: # Introduction to AI

Artificial intelligence (AI) has rapidly become a key technology in many industries, revolutionizing processes and efficiency.
...

Section 2: ## History of AI
The concept of artificial intelligence has been around for centuries, but it wasn't until the 20th century that it became a field of ...


## Connect with me 🌐
<div align="center">
  <a href="https://www.linkedin.com/in/labrijisaad/">
    <img src="https://img.shields.io/badge/LinkedIn-%230077B5.svg?&style=for-the-badge&logo=linkedin&logoColor=white" alt="LinkedIn" style="margin-bottom: 5px;"/>
  </a>
  <a href="https://github.com/labrijisaad">
    <img src="https://img.shields.io/badge/GitHub-100000?style=for-the-badge&logo=github&logoColor=white" alt="GitHub" style="margin-bottom: 5px;"/>
  </a>
</div>