In [7]:
from langchain_community.embeddings import SentenceTransformerEmbeddings
from langchain_community.document_loaders import UnstructuredFileLoader, DirectoryLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Qdrant
import os
from dotenv import load_dotenv
from qdrant_client import QdrantClient

In [8]:
load_dotenv()

# Connection settings are read from the environment (load_dotenv() above
# pulls them in from a local .env file when one is present).
qdrant_url = os.getenv("Qdrant_HOST")
if not qdrant_url:
    # Fail fast with a clear message: per the qdrant-client documentation a
    # client built without url/host targets "localhost", so a missing variable
    # would otherwise go unnoticed until a later cell fails.
    raise RuntimeError(
        "Qdrant_HOST is not set; define it in the environment or in the .env file."
    )

# The API key is passed through as-is (None when the variable is unset).
client = QdrantClient(
    url=qdrant_url,
    api_key=os.getenv("Qdrant_API_KEY"),
)

In [9]:
from qdrant_client.http import models

load_dotenv()

# Vector layout for the collection: 768-dimensional vectors compared with
# cosine distance.
# NOTE(review): 768 presumably has to match the output size of the embedding
# model used when texts are added — confirm against the model card.
vector_config = models.VectorParams(
    distance=models.Distance.COSINE,
    size=768,
)

# (Re)create the collection named by the Qdrant_Collection_name environment
# variable. The call is the last expression of the cell, so its return value
# is what the notebook renders as the cell output.
# NOTE(review): "recreate" suggests any existing collection with this name is
# replaced — confirm before running against a collection holding real data.
client.recreate_collection(
    vectors_config=vector_config,
    collection_name=os.getenv("Qdrant_Collection_name"),
)

True

In [10]:
load_dotenv()

# Embedding model used to turn text chunks into vectors.
embeddings = SentenceTransformerEmbeddings(
    model_name="NeuML/pubmedbert-base-embeddings",
)

# LangChain vector-store wrapper bound to the existing Qdrant client and to
# the collection named by the Qdrant_Collection_name environment variable.
vector_store = Qdrant(
    embeddings=embeddings,
    collection_name=os.getenv("Qdrant_Collection_name"),
    client=client,
)


In [11]:
def get_chunks(text: str, chunk_size: int = 1500, chunk_overlap: int = 200) -> list:
    """Split ``text`` into chunks on newline boundaries.

    Args:
        text: The raw text to split.
        chunk_size: Value passed to the splitter as ``chunk_size``, measured
            with ``len``. Defaults to 1500, the value previously hard-coded.
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``.
            Defaults to 200, the value previously hard-coded.

    Returns:
        Whatever ``CharacterTextSplitter.split_text`` returns for ``text``
        (a list of chunk strings).
    """
    text_splitter = CharacterTextSplitter(
        separator='\n',
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    return text_splitter.split_text(text)

text_directory = './data2'

# Read each .txt file in the directory and split it with get_chunks.
# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# make the order in which chunks are collected and uploaded reproducible
# across runs and machines.
all_chunks = []
for text_file in sorted(os.listdir(text_directory)):
    if not text_file.endswith('.txt'):
        continue
    file_path = os.path.join(text_directory, text_file)
    with open(file_path, 'r', encoding='utf-8') as file:
        text_content = file.read()
    # Chunk after the file has been closed; the handle is not needed for splitting.
    file_chunks = get_chunks(text_content)
    all_chunks.extend(file_chunks)

# Add every chunk to the vector store. This is the last expression of the
# cell, so the value it returns is what the notebook renders as output.
vector_store.add_texts(all_chunks)

['ab72aad91e624c148c92381b93778fa4',
 '3c440faeab5444b5945b98d432b5e022',
 'fed41bdf15f24aa1aebd28f49d461cb2',
 'b7a0df88d2764d9ca4e5b61d7aabb8a3',
 'bf70a376748f4fad945187cf1b0ff6af',
 'e2d0fc79f756405597caa66f93d21fc4',
 'eec76a71350a4aa5829e54c93b646d8e',
 '7a004e735d804d928950c8eff9311a47',
 '4ee35dff858348f1a59238d41720f742',
 'a84b972a6ecd4533b29aa9e063b847e8',
 '88970ecc53b84ebda59faf5e7c0efa79',
 'c73e665748d24f728b47f579c97bc49d',
 '38da5d5486a84ef893c16bcd874914e7',
 '8d5cfddc8c5e4fbf97e0f9325589398c',
 'd31b25a8dc19460994e8982e0ef3e971',
 'df9fc82aeb04493394b97a9352b51521',
 'bbce997932bb4ad6ba76e2ae867cae59',
 '316da1016dba4753b743b65ea83fd3cd',
 'f9087647eabe4e218533fcebd8dc8997',
 'e6eb1b369a374dfcb7b9fc702242cf6e',
 '0c9ae7df82984222853d148e77fe9feb',
 '43d67c32a5f2480e854dd157336146df',
 '5f26ab939df74695b324eaa887688561',
 '0c046fa8486b4008af23ba3dc4f75e0a',
 '2552763d37ca42ee98f96f4ff8706620',
 '31a006e001b742e8b2eb168c39275b19',
 '29fcb3932cd74fb1887b712b418de985',
 