# Lesson 3: Vectorstores and embeddings

# Vectorstore ingestion

In [1]:
// Import environment variables from .env file for secure API access

import "dotenv/config";

[Module: null prototype] { default: {} }

In [2]:
// Import the OpenAIEmbeddings class for generating text embeddings
import { OpenAIEmbeddings } from "@langchain/openai";

// Create an embeddings client with default settings; later cells reuse this instance.
// NOTE(review): presumably authenticates with the API key loaded from .env above — confirm.
const embeddings = new OpenAIEmbeddings();

// Embed one sample sentence. The awaited result is the cell's final expression,
// so the notebook prints the returned vector (an array of numbers).
await embeddings.embedQuery("This is some sample text");

[
   -0.010393001,   0.0024292984, -0.00073567877,  -0.010911339,    -0.01144936,
    0.022924965,   -0.014644682,   0.0017452893,  -0.017505383,   -0.019303162,
    0.005166975,    0.034170926,   -0.012210463,  0.0019372054,    0.004691285,
    0.013155282,    0.024591519,   0.0018027001,   0.004596147,  -0.0062331758,
  -0.0051505715, -0.00069098035,   -0.008208108,   0.013975437,   -0.008962651,
  -0.0040417225,  -0.0007192757,   -0.019657468,  0.0040154774,  -0.0017288862,
    0.016048787,   -0.021862045, -0.00078570825,  -0.022295086,   0.0062561403,
    0.007040208,   -0.011259085,   -0.013417731,   0.

In [6]:
// Import the similarity helpers from the 'ml-distance' package;
// `similarity.cosine` is used in the following cells to compare embedding vectors.
import { similarity } from "ml-distance";

// Embed two deliberately unrelated sentences with the `embeddings` instance created
// in an earlier cell, so that their similarity can be measured afterwards:
//   - vector1:         a question about vectors in machine learning
//   - unrelatedVector: a fact about parrots
// The two requests do not depend on each other, so they are started together with
// Promise.all instead of waiting for the first to finish before sending the second.
// Promise.all preserves input order, so the destructured names line up with the texts.
const [vector1, unrelatedVector] = await Promise.all([
  embeddings.embedQuery("What are vectors useful for in machine learning?"),
  embeddings.embedQuery("A group of parrots is called a pandemonium."),
]);

In [4]:
// Cosine similarity between the machine-learning question and the unrelated parrot sentence.
similarity.cosine(vector1, unrelatedVector);

0.6961084034193643

In [7]:
// Embed a statement that is topically related to the question behind 'vector1'.
const similarVector = await embeddings.embedQuery(
    "Vectors are representations of information."
);

// Calculate the cosine similarity between 'vector1' and 'similarVector'.
// Values closer to 1 indicate more similar embeddings, so this score should come out
// higher than the one computed for the unrelated parrot sentence.
similarity.cosine(vector1, similarVector);


0.8588060436065783

In [8]:
// 'pdf-parse' is never referenced directly in this cell.
// NOTE(review): presumably imported because PDFLoader needs it as a peer dependency —
// confirm before removing.
import * as parse from "pdf-parse";

// Import PDFLoader for loading PDF files from the filesystem.
import { PDFLoader } from "langchain/document_loaders/fs/pdf";

// Import RecursiveCharacterTextSplitter for splitting text into smaller chunks.
import { RecursiveCharacterTextSplitter } from "langchain/text_splitter";

// Initialize PDFLoader with the path to the PDF document (relative to the notebook's working directory).
const loader = new PDFLoader("./data/MachineLearning-Lecture01.pdf");

// Load the PDF document using the loader; the result is the list of raw documents to split.
const rawCS229Docs = await loader.load();

// Initialize the text splitter with specified chunk size and overlap.
const splitter = new RecursiveCharacterTextSplitter({
  chunkSize: 128,  // Maximum size of each text chunk (presumably measured in characters — confirm).
  chunkOverlap: 0, // No overlap between chunks.
});

// Split the loaded PDF documents into small chunks; these chunks are what gets embedded and stored later.
const splitDocs = await splitter.splitDocuments(rawCS229Docs);


In [9]:
// Import MemoryVectorStore for storing vectors in memory.
import { MemoryVectorStore } from "langchain/vectorstores/memory";

// Create an in-memory vector store, passing it the 'embeddings' model instance created earlier.
// No documents are added here; that happens in a later cell.
const vectorstore = new MemoryVectorStore(embeddings);

In [10]:
// Add the split PDF chunks to the in-memory vector store.
// NOTE(review): presumably each chunk is embedded using the 'embeddings' instance that was
// passed to the store's constructor — confirm against the library documentation.
await vectorstore.addDocuments(splitDocs);


In [11]:
// Ask the vector store for the 4 stored chunks closest to the question.
const retrievedDocs = await vectorstore.similaritySearch("What is deep learning?", 4);

// Keep only the text of each hit, leaving the metadata out of the display.
const pageContents = retrievedDocs.map(({ pageContent }) => pageContent);

// Trailing expression: the notebook prints the text of the chunks
// that best matched the query.
pageContents


[
  "piece of research in machine learning, okay?",
  "are using a learning algorithm, perhaps without even being aware of it.",
  "some of my own excitement about machine learning to you.",
  "of the class, and then we'll start to talk a bit about machine learning."
]

# Retrievers

In [12]:
// Wrap the vector store in a retriever so it can be queried through `invoke`
// (used in the next cell).
const retriever = vectorstore.asRetriever();

In [13]:
// Run the same "What is deep learning?" query through the retriever.
// It returns the same chunks as the similarity search above, this time displayed
// as full Document objects (pageContent plus metadata).
await retriever.invoke("What is deep learning?")

[
  Document {
    pageContent: [32m"piece of research in machine learning, okay?"[39m,
    metadata: {
      source: [32m"./data/MachineLearning-Lecture01.pdf"[39m,
      pdf: {
        version: [32m"1.10.100"[39m,
        info: {
          PDFFormatVersion: [32m"1.4"[39m,
          IsAcroFormPresent: [33mfalse[39m,
          IsXFAPresent: [33mfalse[39m,
          Title: [32m""[39m,
          Author: [32m""[39m,
          Creator: [32m"PScript5.dll Version 5.2.2"[39m,
          Producer: [32m"Acrobat Distiller 8.1.0 (Windows)"[39m,
          CreationDate: [32m"D:20080711112523-07'00'"[39m,
          ModDate: [32m"D:20080711112523-07'00'"[39m
        },
        metadata: Metadata { _metadata: [36m[Object: null prototype][39m },
        totalPages: [33m22[39m
      },
      loc: { pageNumber: [33m8[39m, lines: { from: [33m2[39m, to: [33m2[39m } }
    }
  },
  Document {
    pageContent: [32m"are using a learning algorithm, perhaps without even being aw