# Create Vector Index with LangChain

In [1]:
from langchain.embeddings import HuggingFaceEmbeddings

embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-mpnet-base-v2")

## Read the documents

In [2]:
from os import path
from glob import glob


source_path = path.expanduser("~/iCloud/nvAlt/")
all_files = glob(path.join(source_path, "**/*.txt"), recursive=True)

In [12]:
from concurrent.futures import ThreadPoolExecutor
from langchain.docstore.document import Document
from langchain.document_loaders import UnstructuredFileLoader
from tqdm import tqdm

def load_document(file_path: str) -> Document:
    return UnstructuredFileLoader(file_path).load()[0]

documents = []
with ThreadPoolExecutor() as executor:
    with tqdm(total=len(all_files), desc="Loading documents") as pbar:
        for document in executor.map(load_document, all_files):
            documents.append(document)
            pbar.update

Loading documents:   0%|                                                                               | 0/192 [00:04<?, ?it/s]


## Create text splitter for our Markdown documents

In [13]:
from langchain.text_splitter import MarkdownHeaderTextSplitter

headers_to_split_on = [
    ("#", "Header1"),
    ("##", "Header2"),
    ("###", "Header3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

## Chunk the documents using a text splitter

In [15]:
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    
    chunk_size = 200
    chunk_overlap = 20
    
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    documents = text_splitter.split_documents(documents)

# Langchain PGVector

## Persist Vector Index to PGVector storage

In [16]:
from langchain.vectorstores.pgvector import PGVector

CONNECTION_STRING = "postgresql+psycopg2://mercury:m3ssenger@localhost:5432/nvalt_vector_db"
COLLECTION_NAME = "markdown_notes"

db = PGVector.from_documents(
    embedding=embeddings,
    documents=documents,
    collection_name=COLLECTION_NAME,
    connection_string=CONNECTION_STRING,
)

In [17]:
db.similarity_search_with_score("Create a video with ffmpeg.")

[(Document(page_content='f image2 "img\n\n\n\n%03d.bmp"\n\n##  Create a video from a sequence of images\n\n$> ffmpeg\n\n\n\nf image2\n\n\n\ni foo\n\n\n\n%03d.jpeg\n\n\n\nr 12\n\n\n\ns WxH foo.avi', metadata={'source': '/Users/kevinkirkup/iCloud/nvAlt/FFmpeg Commands.txt'}),
  0.30965732893083764),
 (Document(page_content='# Overview\n\n# Why should I care?\n\n# Demo / Setup\n\n## Install ffmpeg via HomeBrew\n\n## Download source content', metadata={'source': '/Users/kevinkirkup/iCloud/nvAlt/Blogx- June 15, 2013 HTTP Live Streaming Presentation.txt'}),
  0.36481841107398383),
 (Document(page_content='$> ffmpeg\n\n\n\nloop 1\n\n\n\nr 30.0\n\n\n\ni 2014\n\n\n\n09\n\n\n\n05_10\n\n\n\n34\n\n\n\n59\n\n\n\nAM_TestImage.png\n\n\n\nt 00:00:30\n\n\n\nvcodec libx264\n\n\n\npreset fast\n\n\n\npix_fmt yuv420p\n\n\n\ns 1024x600 blah.ts', metadata={'source': '/Users/kevinkirkup/iCloud/nvAlt/FFmpeg Commands.txt'}),
  0.40763599572376585),
 (Document(page_content='## Home Movie Video Camera Convertion\