In [None]:
# %pip install unstructured  # uncomment to install; %pip targets the running kernel's environment

In [5]:
from langchain.document_loaders import UnstructuredMarkdownLoader
import markdown
from pathlib import Path

# Accumulator filled by the loader cells below: each appended entry is a dict
# with `source`, `metadata` and `contents` keys.
docs = []

# Adding OceanBase (OB) docs to our `docs` list

In [6]:
# Local checkout of the OceanBase docs; adjust this path for your machine.
markdown_path = "/Users/ivantang/git_repos/oceanbase-doc/en-US"
for path in Path(markdown_path).rglob('*.md'):
    try:
        # read_text() closes the file handle; the explicit encoding avoids
        # depending on the platform's locale default.
        contents = path.read_text(encoding="utf-8")
        # A new parser instance is created for every file.
        md = markdown.Markdown(extensions=['meta'])
        # The rendered HTML is discarded: convert() is run only so that the
        # `meta` extension fills in md.Meta.
        md.convert(contents)
        docs.append(dict(source=str(path), metadata=md.Meta, contents=contents))
    except Exception as e:
        # Best effort: report which file failed and continue with the rest.
        print(f"{path}: {e}")


# Adding Yugabyte docs to our `docs` list

In [7]:
# Walk the local Yugabyte docs checkout and record every markdown file.
for path in Path("/Users/ivantang/git_repos/yugabyte-db/docs/content/stable").rglob('*.md'):
    try:
        # read_text() closes the file handle; the explicit encoding avoids
        # depending on the platform's locale default.
        contents = path.read_text(encoding="utf-8")
        # A new parser instance is created for every file.
        md = markdown.Markdown(extensions=['meta'])
        # The rendered HTML is discarded: convert() is run only so that the
        # `meta` extension fills in md.Meta.
        md.convert(contents)
        docs.append(dict(source=str(path), metadata=md.Meta, contents=contents))
    except Exception as e:
        # Best effort: report which file failed and continue with the rest.
        print(f"{path}: {e}")

# Adding CockroachDB docs to our `docs` list

In [8]:
# Walk the local CockroachDB docs checkout (v23.2) and record every markdown file.
for path in Path("/Users/ivantang/git_repos/cockroach-docs/src/current/v23.2").rglob('*.md'):
    try:
        # read_text() closes the file handle; the explicit encoding avoids
        # depending on the platform's locale default.
        contents = path.read_text(encoding="utf-8")
        # A new parser instance is created for every file.
        md = markdown.Markdown(extensions=['meta'])
        # The rendered HTML is discarded: convert() is run only so that the
        # `meta` extension fills in md.Meta.
        md.convert(contents)
        docs.append(dict(source=str(path), metadata=md.Meta, contents=contents))
    except Exception as e:
        # Best effort: report which file failed and continue with the rest.
        print(f"{path}: {e}")

# Adding PingCAP's TiDB docs to our `docs` list

In [10]:
# Walk the local TiDB docs checkout and record every markdown file.
for path in Path("/Users/ivantang/git_repos/tidb-docs").rglob('*.md'):
    try:
        # read_text() closes the file handle; the explicit encoding avoids
        # depending on the platform's locale default.
        contents = path.read_text(encoding="utf-8")
        # A new parser instance is created for every file.
        md = markdown.Markdown(extensions=['meta'])
        # The rendered HTML is discarded: convert() is run only so that the
        # `meta` extension fills in md.Meta.
        md.convert(contents)
        docs.append(dict(source=str(path), metadata=md.Meta, contents=contents))
    except Exception as e:
        # Best effort: report which file failed and continue with the rest.
        print(f"{path}: {e}")

In [12]:
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Model identifier and encoding options handed to the BGE embedding wrapper.
model_name = "BAAI/bge-small-en"
encode_kwargs = dict(normalize_embeddings=True)  # set True to compute cosine similarity
model_norm = HuggingFaceBgeEmbeddings(model_name=model_name, encode_kwargs=encode_kwargs)

In [13]:
import pandas as pd

# One row per collected markdown file, built from the records in `docs`.
df = pd.DataFrame(docs)
# Expand each record's metadata dict into separate columns and attach them
# to the original rows (DataFrame.join matches on the row index).
metadata_columns = pd.json_normalize(df['metadata'])
df2 = df.join(metadata_columns)
# Disabled: rewrite local OceanBase paths into GitHub URLs.
# df2['source'] = df2['source'].str.replace('/Users/ivantang/git_repos/oceanbase-doc/en-US/', 'https://github.com/oceanbase/oceanbase-doc/tree/V4.1.0/en-US/')

In [14]:
import re
import markdown

def remove_metadata(markdown_text):
    """Return ``markdown_text`` without a leading ``---``-delimited metadata block.

    The block is recognised only at the very start of the text: an opening
    ``---`` line, at least one character of metadata, and a closing ``---``
    line that is terminated by a newline. Whitespace (including blank lines)
    directly after the closing delimiter is consumed as well. Text that does
    not start with such a block is returned unchanged.
    """
    front_matter = re.compile(r'^---\s*\n(.+?)\s*\n---\s*\n', re.DOTALL)
    found = front_matter.match(markdown_text)
    if found is None:
        return markdown_text
    # Keep only what follows the closing delimiter.
    return markdown_text[found.end():]


# Try the stripper on the document in the row labelled 0.
markdown_content = remove_metadata(df2.loc[0, 'contents'])

In [15]:
# Strip the leading metadata block from every document's text, then drop the
# raw `metadata` column (its keys were already expanded into columns when
# `df2` was built). errors='ignore' lets this cell be re-run after the column
# is gone instead of failing with a KeyError.
df2 = (
    df2
    .assign(contents=df2['contents'].apply(remove_metadata))
    .drop(columns=['metadata'], errors='ignore')
)

In [18]:
from langchain.document_loaders.dataframe import DataFrameLoader
from langchain.text_splitter import TokenTextSplitter

# Splitter settings: size of each chunk and the overlap between neighbouring chunks.
chunk_size, chunk_overlap = 1000, 100

# Each DataFrame row becomes one document whose text comes from `contents`.
loader = DataFrameLoader(df2, page_content_column='contents')
x = loader.load_and_split(
    TokenTextSplitter(chunk_overlap=chunk_overlap, chunk_size=chunk_size)
)

In [39]:
len(x)

10890

In [19]:
from langchain.vectorstores.faiss import FAISS
from langchain.embeddings.openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv
import sys
import openai
import backoff

# Load variables from a local .env file into the process environment.
load_dotenv()

# Azure OpenAI settings, exposed under the OPENAI_* variable names.
# NOTE(review): the embeddings created below are a HuggingFace model, so these
# OpenAI settings look unused by the visible cells — confirm before removing.
os.environ["OPENAI_API_TYPE"] = "azure"
os.environ["OPENAI_API_VERSION"] = "2023-05-15"
# os.environ only accepts strings, so assigning the None returned by
# os.getenv() for an unset variable raises TypeError. Copy each Azure value
# only when it is actually present.
for target_var, source_var in (
    ("OPENAI_API_BASE", "AZURE_OAI_ENDPOINT"),
    ("OPENAI_API_KEY", "AZURE_OAI_KEY"),
):
    source_value = os.getenv(source_var)
    if source_value is None:
        print(f"{source_var} is not set; leaving {target_var} unchanged")
    else:
        os.environ[target_var] = source_value

# Embedding model used to build the vector store.
model_name = "BAAI/bge-small-en"
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    encode_kwargs=encode_kwargs
)

In [None]:
%%time

@backoff.on_exception(backoff.expo, openai.error.RateLimitError)
def instantiate_with_backoff(**kwargs):
    """Build a FAISS store by forwarding ``kwargs`` to ``FAISS.from_documents``.

    The decorator retries the call with exponential backoff whenever it raises
    ``openai.error.RateLimitError``.
    """
    store = FAISS.from_documents(**kwargs)
    return store


# NOTE(review): `embeddings` is a HuggingFace model in this notebook, so the
# OpenAI rate-limit retry looks like a leftover — confirm it is still wanted.
vector_store = instantiate_with_backoff(embedding=embeddings, documents=x)

In [42]:
# Write the vector store built above to the local folder ./ob_vector_store_v3.
vector_store.save_local('./ob_vector_store_v3')