In [2]:
import pandas as pd
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS,DistanceStrategy
from langchain_community.document_loaders import DataFrameLoader
from langchain_community.embeddings import HuggingFaceEmbeddings

In [3]:
# Notebook configuration: everything a reader may want to tune lives here.
# Both paths are relative, so they resolve against the kernel's working
# directory (presumably the notebook's own folder — confirm before running).

# CSV read by pd.read_csv below; its 'Resume' column supplies the text to index.
DATA_PATH = "../data/main_data/synthetic-resumes.csv"
# Location handed to FAISS.save_local when the finished index is persisted.
FAISS_PATH = "../vectorstore"
# Model identifier passed to HuggingFaceEmbeddings as `model_name`.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

In [4]:
# Read the resume table and wrap it in a loader that treats the "Resume"
# column as the document text (the remaining columns are left to the loader).
df = pd.read_csv(DATA_PATH)
loader = DataFrameLoader(df, page_content_column="Resume")

# Splitter settings: chunks of at most 1024 units with a 500-unit overlap
# between neighbours (units are characters under the splitter's default
# length function — TODO confirm if a custom length function is ever set).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1024, chunk_overlap=500)

# Embedding model, pinned to the CPU.
embedding = HuggingFaceEmbeddings(
    model_name=EMBEDDING_MODEL,
    model_kwargs=dict(device="cpu"),
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the documents from the DataFrame-backed loader; `document` holds the
# full collection returned by `loader.load()`, not a single document.
document = loader.load()
# Break the loaded documents into chunks using the splitter configured above;
# `document_chunk` is what gets embedded and indexed afterwards.
document_chunk = text_splitter.split_documents(document)

In [9]:
# Use the distance-strategy enum from the module the FAISS wrapper itself
# relies on. NOTE(review): the `DistanceStrategy` name re-exported at the
# `langchain_community.vectorstores` package level appears to belong to a
# different vector store, so its COSINE member may not be recognised by FAISS
# — confirm against the installed langchain_community version. The alias keeps
# the notebook-wide `DistanceStrategy` name untouched for other cells.
from langchain_community.vectorstores.utils import (
    DistanceStrategy as FaissDistanceStrategy,
)

# Embed every chunk and build the FAISS index with cosine as the declared
# distance strategy.
vectorstore_db = FAISS.from_documents(
    document_chunk,
    embedding,
    distance_strategy=FaissDistanceStrategy.COSINE,
)

# Persist the index under FAISS_PATH so later notebooks can reload it
# instead of re-embedding the whole corpus.
vectorstore_db.save_local(FAISS_PATH)