# Ingest jambonz.org verb docs
This notebook parses the Markdown source files for the verb pages published under https://www.jambonz.org/docs/webhooks and adds each one as a document to a Vectara corpus.

In [97]:
import re
import json
import requests
import os
from getpass import getpass

In [66]:
# Vectara account settings read by the ingestion cells below.
customer_id = 730456851 # vectara customer id
corpus_id = 3  # vectara corpus id
# Prompt for the key at run time so it is not hard-coded in the notebook.
api_key = getpass("Enter your Vectara API key: ")

In [111]:
class MarkdownParser:
    """Turn one verb markdown file into a Vectara index request.

    The file is expected to contain an H1 heading (the verb name), a first
    sentence (used as the description), optional prelude text, an optional
    pipe-delimited property table, optional follow-up text, and an optional
    ``<p class="flex">`` footer that is excluded from the output.
    """

    _H1_RE = re.compile(r'^# (.+)', re.MULTILINE)
    # First blank line, then everything up to the first period.
    _DESCRIPTION_RE = re.compile(r'^\n(.+?)\.', re.MULTILINE | re.DOTALL)
    # Header row, separator row, then one or more data rows.
    _TABLE_RE = re.compile(
        r'^\| (.+?) \|\n\| (.+?) \|\n((?:\| .+? \|\n)+)',
        re.MULTILINE | re.DOTALL,
    )
    _FOOTER_RE = re.compile(r'\n<p class="flex">')

    @staticmethod
    def _split_row(row):
        """Split one table line into stripped cells, ignoring the outer pipes."""
        row = row.strip()
        # Drop exactly one leading/trailing pipe so that cells line up with the
        # headers, which are captured without their outer pipes.
        if row.startswith('|'):
            row = row[1:]
        if row.endswith('|'):
            row = row[:-1]
        return [cell.strip() for cell in row.split('|')]

    def parse(self, file_path, customer, corpus_id):
        """Parse ``file_path`` and return the request body for the index API.

        Args:
            file_path: Path of the markdown file to read (decoded as UTF-8).
            customer: Vectara customer id, stored under ``customerId``.
            corpus_id: Vectara corpus id, stored under ``corpusId``.

        Returns:
            A dict with ``customerId``, ``corpusId`` and ``document``. The
            document carries a ``prelude`` section, a ``verb properties``
            section (JSON-encoded list of row dicts) when a table is found,
            and a ``detail`` section when follow-up text is present.

        Raises:
            ValueError: If the file has no H1 heading or no first sentence.
        """
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Extract H1 header
        h1_match = self._H1_RE.search(content)
        if h1_match is None:
            raise ValueError(f"{file_path}: no H1 ('# ...') heading found")
        h1 = h1_match.group(1)

        # Extract the first sentence as the brief description
        description_match = self._DESCRIPTION_RE.search(content)
        if description_match is None:
            raise ValueError(f"{file_path}: no description sentence found")
        description = description_match.group(1).strip() + '.'

        # The footer, when present, bounds every text section.
        footer_match = self._FOOTER_RE.search(content)
        body_end = footer_match.start() if footer_match else len(content)

        # Prelude: everything after the first sentence until the first pipe.
        # Without any pipe, str.find returns -1, which must not be used as a
        # slice bound; the prelude then runs to the end of the body.
        prelude_start = description_match.end()
        first_pipe = content.find('|', prelude_start)
        prelude_end = first_pipe if first_pipe != -1 else body_end
        prelude_text = content[prelude_start:prelude_end].strip()

        sections = [
            {
                "metadataJson": json.dumps({"type": "prelude"}),
                "text": prelude_text
            }
        ]

        # Extract table of values if present
        table_match = self._TABLE_RE.search(content)
        if table_match:
            headers = [header.strip() for header in table_match.group(1).split('|')]
            # zip() stops at the shorter sequence, so surplus cells are ignored.
            table_data = [
                dict(zip(headers, self._split_row(row)))
                for row in table_match.group(3).split('\n') if row
            ]
            sections.append({
                "metadataJson": json.dumps({"type": "verb properties"}),
                "text": json.dumps(table_data)
            })

            # Follow-up text starts after the blank line that ends the table.
            # The search begins on the table's own trailing newline so that a
            # blank line directly after the table is recognised instead of
            # skipping the first follow-up paragraph.
            blank_line = content.find('\n\n', table_match.end() - 1)
            follow_up_start = blank_line + 2 if blank_line != -1 else table_match.end()
        else:
            follow_up_start = prelude_end

        # Extract follow-up text if present
        follow_up_text = content[follow_up_start:body_end].strip()
        if follow_up_text:
            sections.append({
                "metadataJson": json.dumps({"type": "detail"}),
                "text": follow_up_text
            })

        # Format data for Vectara
        vectara_data = {
            "documentId": f"verb:{h1}",
            "title": f"The jambonz {h1} verb",
            "description": description,
            "metadataJson": json.dumps({"verb": h1}),
            "sections": sections
        }

        return {
            # Use the argument, not a notebook-level global.
            "customerId": customer,
            "corpusId": corpus_id,
            "document": vectara_data
        }


In [108]:
def upload_to_vectara(data, customer_id, api_key, timeout=30):
    """POST one index request to the Vectara v1 index endpoint and print the reply.

    Args:
        data: JSON-serialisable request body (as produced by MarkdownParser.parse).
        customer_id: Vectara customer id, sent in the ``customer-id`` header.
        api_key: Vectara API key, sent in the ``x-api-key`` header.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the notebook indefinitely.
    """
    url = "https://api.vectara.io/v1/index"

    headers = {
        'Content-Type': 'application/json',
        'Accept': 'application/json',
        'x-api-key': api_key,
        'customer-id': str(customer_id)
    }

    payload = json.dumps(data)
    response = requests.post(url, headers=headers, data=payload, timeout=timeout)
    # The body is printed as-is; status codes such as ALREADY_EXISTS are
    # reported inside it, so no exception is raised on them here.
    print(response.text)

In [109]:
def process_files(root_folder, file_names, customer_id, corpus_id):
    """Parse every listed markdown file and upload the result to Vectara.

    Files are handled in the order given. The API key is read from the
    module-level ``api_key`` variable, so that must be defined before calling.
    """
    md_parser = MarkdownParser()

    for name in file_names:
        print(f"Processing {name}")
        request_body = md_parser.parse(
            os.path.join(root_folder, name), customer_id, corpus_id
        )
        upload_to_vectara(request_body, customer_id, api_key)



In [115]:
# NOTE(review): this parser instance is not used by the call below;
# process_files creates its own MarkdownParser.
parser = MarkdownParser()
# NOTE(review): absolute path on the author's machine — adjust before running elsewhere.
root_folder = '/Users/dhorton/beachdog-enterprises/beachdog-networks/git/jambones.org/git/hosted-cpaas/next-static-site/markdown/docs/webhooks'
# Markdown files under root_folder to ingest, processed in the order listed.
file_names = [
  'conference.md',
  'hangup.md',
  'leave.md',
  'lex.md',
  'redirect.md',
  'tag.md',
  'queue-notifications.md',
  'dialogflow.md',
  'dtmf.md',
  'message.md',
  'pause.md',
  'play.md',
  'rasa.md',
  'sip-decline.md',
  'sip-refer.md',
  'sip-request.md',
  'gather.md',
  'transcribe.md',
  'say.md',
  'listen.md',
  'dequeue.md',
  'enqueue.md',
  'overview.md',
  'dial.md',
  'recognizer.md',
  'config.md'
]
# Parse each file and upload it using the customer and corpus ids set earlier.
process_files(root_folder, file_names, customer_id, corpus_id)

Processing conference.md
{"status":{"code":"ALREADY_EXISTS", "statusDetail":"", "cause":null}, "quotaConsumed":{"numChars":"81", "numMetadataChars":"198"}}
Processing hangup.md
{"status":{"code":"ALREADY_EXISTS", "statusDetail":"", "cause":null}, "quotaConsumed":{"numChars":"87", "numMetadataChars":"200"}}
Processing leave.md
{"status":{"code":"ALREADY_EXISTS", "statusDetail":"", "cause":null}, "quotaConsumed":{"numChars":"71", "numMetadataChars":"183"}}
Processing lex.md
{"status":{"code":"ALREADY_EXISTS", "statusDetail":"", "cause":null}, "quotaConsumed":{"numChars":"82", "numMetadataChars":"202"}}
Processing redirect.md
{"status":{"code":"ALREADY_EXISTS", "statusDetail":"", "cause":null}, "quotaConsumed":{"numChars":"139", "numMetadataChars":"254"}}
Processing tag.md
{"status":{"code":"ALREADY_EXISTS", "statusDetail":"", "cause":null}, "quotaConsumed":{"numChars":"162", "numMetadataChars":"272"}}
Processing queue-notifications.md
{"status":{"code":"ALREADY_EXISTS", "statusDetail":""