In [31]:
import requests
from bs4 import BeautifulSoup
import pickle
import os


def fetch_or_load_content(url, pickle_filename="response.pkl"):
    """
    Fetches content from the URL if the pickle file does not exist. 
    Otherwise, loads content from the pickle file.
    
    Args:
        url (str): The URL to fetch content from.
        pickle_filename (str): The filename of the pickle file to save/load the content.
    
    Returns:
        BeautifulSoup: Parsed HTML content as a BeautifulSoup object.
        str: Page title.
    """
    # Check if the pickle file exists
    if os.path.exists(pickle_filename):
        # Load the saved HTML content from the pickle file
        with open(pickle_filename, "rb") as file:
            response_text = pickle.load(file)
        print("Loaded content from pickle.")
    else:
        # Make the HTTP request
        response = requests.get(url)
        response_text = response.text

        # Save the response to the pickle file
        with open(pickle_filename, "wb") as file:
            pickle.dump(response_text, file)
        print("Fetched content from the web and saved to pickle.")

    # Parse HTML content
    soup = BeautifulSoup(response_text, "html.parser")
    title = soup.title.string if soup.title else "Documentation"
    
    return soup, title


In [28]:
def html_to_markdown(element):
    # Replace certain tags with markdown equivalents
    text = element.get_text(separator="\n")
    text = re.sub(r'\n+', '\n', text) 
    return f"# {title}\n\n{text}"

In [51]:
import os

def load_page_generator(docs_uris):
    for uri in docs_uris:
        url = host + uri

        file_path = f"docs/{uri.split('.')[0]}.json"

        if os.path.exists(file_path):
            print(f"{uri} exists.")
            continue

        print(url)

        soup, title = fetch_or_load_content(url, f"req/{uri}.pkl")
        
        # Extract relevant parts by targeting main body
        main_content = soup.find("div", {"id": "content"})        
        
        soup = BeautifulSoup(response.text, "html.parser")
        plain_text = soup.get_text(separator="\n")

        main_content = soup.find(id="content")
        main_content
    
        yield plain_text

    # Save to file
    # with open(f"{uri}.txt", "w") as f:
    #     f.write(plain_text)
    

In [None]:
def generate_tags(text):
    raise Exception("")

def generate_target_outcomes(text):
    raise Exception("")

def generate_summary(text):
    raise Exception("")

def generate_questions_this_answers(text):
    raise Exception("")

In [119]:
from datetime import datetime
import json
import re


def extract_text_between_markers(text):
    match = re.search(r'\[\[(.*?)\]\]', text)
    return match.group(1) if match else "_"

def extract_role(text):
    return re.findall(r'\[role="([^"]+)"\]', text)

def includes_code(text):
    return "--------------------------------------------------" in text


elastic_host = "https://www.elastic.co/guide/en/elasticsearch/reference/current/"

def transform_documentation_page_to_doc(source):
    doc_title = extract_text_between_markers(source),
    
    doc = {
        "meta": {
            "timestamp": datetime.utcnow().isoformat(),
            "size": len(source),
            
            "url": elastic_host + doc_title[0] + ".html",
            # "hash": hash_text(source),
            "type": "documentation",
            "role": extract_role(source),
            "has_code": includes_code(source),
            "title": doc_title[0],
            "version": "8.15",

            # "tag": generate_tags(source),
            # "outcomes": generate_target_outcomes(source),
            # "summary": generate_summary(source),
            # "questions": generate_questions_this_answers(source)
        },
        "doc": source,
    }
    
    return json.dumps(doc, indent=4)


def define_documentation_doc_title(content):
    return extract_text_between_markers(content)
    

In [129]:
import json
from datetime import datetime
import os


api_specification_details = """

The specification contains:

* The _name_ of the API (`indices.create`), which usually corresponds to the client calls
* Link to the documentation at the <http://elastic.co> website.

  **IMPORANT:** This should be a _live_ link. Several downstream ES clients use
  this link to generate their documentation. Using a broken link or linking to
  yet-to-be-created doc pages can break the [Elastic docs
  build](https://github.com/elastic/docs#building-documentation).
* `stability` indicating the state of the API, has to be declared explicitly or YAML tests will fail
    * `experimental` highly likely to break in the near future (minor/patch), no bwc guarantees.
    Possibly removed in the future.
    * `beta` less likely to break or be removed but still reserve the right to do so
    * `stable` No backwards breaking changes in a minor
* Request URL: HTTP method, path and parts
* Request parameters
* Request body specification

**NOTE**
If an API is stable but it response should be treated as an arbitrary map of key values please notate this as followed

```json
{
  "api.name": {
    "stability" : "stable",
    "response": {
      "treat_json_as_key_value" : true
    }
  }
}
```

## Type definition
In the documentation, you will find the `type` field, which documents which type every parameter will accept.

#### Querystring parameters
| Type  | Description  |
|---|---|
| `list`  | An array of strings *(represented as a comma separated list in the querystring)* |
| `date` | A string representing a date formatted in ISO8601 or a number representing milliseconds since the epoch *(used only in ML)*   |
| `time` | A numeric or string value representing duration |
| `string` | A string value  |
| `enum` | A set of named constants *(a single value should be sent in the querystring)*  |
| `int` | A signed 32-bit integer with a minimum value of -2<sup>31</sup> and a maximum value of 2<sup>31</sup>-1.  |
| `double` | A [double-precision 64-bit IEEE 754](https://en.wikipedia.org/wiki/Floating-point_arithmetic) floating point number, restricted to finite values.  |
| `long` | A signed 64-bit integer with a minimum value of -2<sup>63</sup> and a maximum value of 2<sup>63</sup>-1. *(Note: the max safe integer for JSON is 2<sup>53</sup>-1)* |
| `number` | Alias for `double`. *(deprecated, a more specific type should be used)*  |
| `boolean` | Boolean fields accept JSON true and false values  |

{
  "documentation" : {
    "description": "Parameters that are accepted by all API endpoints.",
    "url": "https://www.elastic.co/guide/en/elasticsearch/reference/current/common-options.html"
  },
  "params": {
    "pretty": {
      "type": "boolean",
      "description": "Pretty format the returned JSON response.",
      "default": false
    },
    "human": {
      "type": "boolean",
      "description": "Return human readable values for statistics.",
      "default": true
    },
    "error_trace": {
      "type": "boolean",
      "description": "Include the stack trace of returned errors.",
      "default": false
    },
    "source": {
      "type": "string",
      "description": "The URL-encoded request definition. Useful for libraries that do not accept a request body for non-POST requests."
    },
    "filter_path": {
      "type": "list",
      "description": "A comma-separated list of filters used to reduce the response."
    }
  }
}



"""





def transform_api_spec_to_doc(api_spec, elastic_host="https://www.elastic.co"):
    """Extracts relevant info from an API specification JSON file."""

    spec_as_json = json.loads(api_spec)
    
    api_name = list(spec_as_json.keys())[0]
    source = spec_as_json[api_name]
    
    # Extracting main details
    doc_title = source.get("documentation", {}).get("description", "")
    doc_url = source.get("documentation", {}).get("url", "")
    stability = source.get("stability", "")
    response_key_value = source.get("response", {}).get("treat_json_as_key_value", False)
    visibility = source.get("visibility", "public")
    url_paths = source.get("url", {}).get("paths", [])
    params = source.get("params", {})

    
    # Organizing parameters with their type descriptions
    param_types = {param: params[param].get("type", "unknown") for param in params}
    
    # Structuring the output document
    doc = {
        "meta": {
            "timestamp": datetime.utcnow().isoformat(),
            "api_name": api_name,
            "stability": stability,
            "visibility": visibility,
            "main_component": api_name if len(api_name.split(".")) == 1 else api_name.split(".")[0],
            "url": doc_url,
            "elastic_url": f"{elastic_host}/{api_name.replace('.', '/')}.html",
            "treat_json_as_key_value": response_key_value,
            "title": doc_title,
            "paths": url_paths,
            "parameter_types": param_types,
            "doc_version": "8.15",
        },
        "doc": source,
    }
    
    return json.dumps(doc, indent=4)


def define_api_spec_doc_title(content):
    spec_as_json = json.loads(content)
    return list(spec_as_json.keys())[0]



In [133]:
import os



def read_file_with_fallback(file_path):
    try:
        # Attempt reading with UTF-8 encoding
        with open(file_path, 'r', encoding='utf-8') as f:
            return f.read()
    except UnicodeDecodeError:
        # Fallback to reading as binary and decoding manually
        with open(file_path, 'rb') as f:
            return f.read().decode('utf-8', errors='replace')  # Replace un-decodable chars

def create_doc_from_file(filepath, transform, create_doc_filemame):
    content = read_file_with_fallback(filepath)
    processed_content = transform(content)
    doc_filename = create_doc_filemame(content)

    output_path = f"docs/{doc_filename}.json"
    with open(output_path, 'w') as output_file:
        output_file.write(processed_content)

def process_files(folder_path, transform, create_doc_filemame):
    for root, _, files in os.walk(folder_path):
        for each in files:
            file_path = os.path.join(root, each)
            create_doc_from_file(file_path, transform, create_doc_filemame)


documentation_folder_path = "data/documentation"
process_files(
    documentation_folder_path, 
    transform_documentation_page_to_doc,
    define_documentation_doc_title,
)

rest_api_doc_folder_path = "data/documentation/rest-api"
process_files(
    rest_api_doc_folder_path,
    transform_api_spec_to_doc,
    define_api_spec_doc_title,
)

In [53]:
import requests
from bs4 import BeautifulSoup
import pickle
import os


host = "https://www.elastic.co/guide/en/elasticsearch/reference/current/"


docs_uris = [each[1] for each in toc_links]


for plain_text, uri in zip(load_page_generator(docs_uris), docs_uris):
    file_path = f"docs/{uri.split('.')[0]}.json"
    
    content = parse_content_to_docs(plain_text, f"{host}/{uri}")

    with open(file_path, "w") as file:
        file.write(content)
    # break
    
    


elasticsearch-intro.html exists.
elasticsearch-intro-what-is-es.html exists.
elasticsearch-intro-deploy.html exists.
documents-indices.html exists.
es-ingestion-overview.html exists.
search-analyze.html exists.
scalability.html exists.
quickstart.html exists.
getting-started.html exists.
full-text-filter-tutorial.html exists.
setup.html exists.
run-elasticsearch-locally.html exists.
install-elasticsearch.html exists.
targz.html exists.
zip-windows.html exists.
deb.html exists.
rpm.html exists.
docker.html exists.
settings.html exists.
important-settings.html exists.
secure-settings.html exists.
auditing-settings.html exists.
circuit-breaker.html exists.
modules-cluster.html exists.
misc-cluster-settings.html exists.
ccr-settings.html exists.
modules-discovery-settings.html exists.
modules-fielddata.html exists.
health-diagnostic-settings.html exists.
ilm-settings.html exists.
data-stream-lifecycle-settings.html exists.
index-management-settings.html exists.
recovery.html exists.
indexi

In [49]:
len(docs_uris)

1584

In [54]:
from bs4 import BeautifulSoup
import requests

# Example HTML content or fetch from a URL
url = "https://www.elastic.co/guide/en/elasticsearch/reference/current/paginate-search-results.html"
html_content = requests.get(url).text

# Parse the HTML content
soup = BeautifulSoup(html_content, "html.parser")




In [56]:
print(soup.html)

<html lang="en-us">
<head>
<meta charset="utf-8"/>
<title>Paginate search results | Elasticsearch Guide [8.15] | Elastic</title>
<meta class="elastic" content="Paginate search results | Elasticsearch Guide [8.15]" name="content"/>
<link href="index.html" rel="home" title="Elasticsearch Guide [8.15]"/>
<link href="search-your-data.html" rel="up" title="The search API"/>
<link href="sort-search-results.html" rel="prev" title="Sort search results"/>
<link href="search-fields.html" rel="next" title="Retrieve selected fields from a search"/>
<meta class="elastic" content="8.15" name="product_version">
<meta class="elastic" content="Elasticsearch" name="product_name">
<meta class="elastic" content="documentation" name="website_area"/>
<meta content="Learn/Docs/Elasticsearch/Reference/8.15" name="DC.type"/>
<meta content="Elasticsearch" name="DC.subject"/>
<meta content="8.15" name="DC.identifier"/>
<meta content="text/html; charset=utf-8" http-equiv="content-type"/>
<meta content="IE=edge" h

In [63]:
main_content = soup.find(id="content")
main_content.contents

# soup2.text


['\n',
 <div class="content-wrapper">
 <section id="guide" lang="en">
 <div class="container-fluid">
 <div class="row pb-3">
 <div class="col-12 order-2 col-md-4 order-md-1 col-lg-3 h-almost-full-md sticky-top-md" id="left_col">
 <!-- The TOC is appended here -->
 </div>
 <div class="col-12 order-1 col-md-8 order-md-2 col-lg-7 order-lg-2 guide-section" id="middle_col">
 <!-- start body -->
 <div class="navheader">
 <span class="prev">
 <a href="sort-search-results.html">« Sort search results</a>
 </span>
 <span class="next">
 <a href="search-fields.html">Retrieve selected fields from a search »</a>
 </span>
 </div>
 <div class="book" lang="en">
 <div class="titlepage">
 <div class="breadcrumbs">
 <span class="breadcrumb-link"><a href="/guide/">Elastic Docs</a></span>
 <span class="chevron-right">›</span><span class="breadcrumb-link"><a href="index.html">Elasticsearch Guide [8.15]</a></span>
 <span class="chevron-right">›</span><span class="breadcrumb-link"><a href="search-with-elastics

In [24]:
import requests
from bs4 import BeautifulSoup

# Define the URL and headers for the request
url = "https://www.elastic.co/guide/en/elasticsearch/reference/current/toc.html"
headers = {
    "sec-ch-ua-platform": "\"Windows\"",
    "Referer": "https://www.elastic.co/guide/en/elasticsearch/reference/current/search-with-elasticsearch.html",
    "sec-ch-ua": "\"Chromium\";v=\"130\", \"Google Chrome\";v=\"130\", \"Not?A_Brand\";v=\"99\"",
    "sec-ch-ua-mobile": "?0",
    "X-Requested-With": "XMLHttpRequest",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36",
    "Accept": "*/*",
    "DNT": "1"
}

# Fetch the TOC HTML content
response = requests.get(url, headers=headers)
soup = BeautifulSoup(response.text, "html.parser")

# Find all TOC links
toc_links = []
for link in soup.select(".toc a[href]"):
    title = link.get_text(strip=True)
    href = link["href"]
    toc_links.append((title, href))

# Display the TOC links
for title, href in toc_links:
    print(f"{title}: {href}")

Elasticsearch basics: elasticsearch-intro.html
What is Elasticsearch?: elasticsearch-intro-what-is-es.html
Run Elasticsearch: elasticsearch-intro-deploy.html
Indices and documents: documents-indices.html
Add data to Elasticsearch: es-ingestion-overview.html
Search and analyze data: search-analyze.html
Get ready for production: scalability.html
Quick starts: quickstart.html
Basics: Index and search using APIs: getting-started.html
Basics: Full-text search and filtering: full-text-filter-tutorial.html
Set up Elasticsearch: setup.html
Run Elasticsearch locally: run-elasticsearch-locally.html
Installing Elasticsearch: install-elasticsearch.html
Install Elasticsearch from archive on Linux or MacOS: targz.html
Install Elasticsearch with.zipon Windows: zip-windows.html
Install Elasticsearch with Debian Package: deb.html
Install Elasticsearch with RPM: rpm.html
Install Elasticsearch with Docker: docker.html
Configuring Elasticsearch: settings.html
Important Elasticsearch configuration: importa

In [26]:
toc_links

[('Elasticsearch basics', 'elasticsearch-intro.html'),
 ('What is Elasticsearch?', 'elasticsearch-intro-what-is-es.html'),
 ('Run Elasticsearch', 'elasticsearch-intro-deploy.html'),
 ('Indices and documents', 'documents-indices.html'),
 ('Add data to Elasticsearch', 'es-ingestion-overview.html'),
 ('Search and analyze data', 'search-analyze.html'),
 ('Get ready for production', 'scalability.html'),
 ('Quick starts', 'quickstart.html'),
 ('Basics: Index and search using APIs', 'getting-started.html'),
 ('Basics: Full-text search and filtering', 'full-text-filter-tutorial.html'),
 ('Set up Elasticsearch', 'setup.html'),
 ('Run Elasticsearch locally', 'run-elasticsearch-locally.html'),
 ('Installing Elasticsearch', 'install-elasticsearch.html'),
 ('Install Elasticsearch from archive on Linux or MacOS', 'targz.html'),
 ('Install Elasticsearch with.zipon Windows', 'zip-windows.html'),
 ('Install Elasticsearch with Debian Package', 'deb.html'),
 ('Install Elasticsearch with RPM', 'rpm.html')

In [22]:
html_content




In [14]:
# Parse HTML and convert to plain text
soup = BeautifulSoup(response.text, "html.parser")
plain_text = soup.get_text(separator="\n")

In [None]:
print(plain_text)