In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import WebBaseLoader
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModel
import os
from langchain_community.embeddings import HuggingFaceEmbeddings

# Tokenizer used to measure text length in tokens when chunking. It is loaded
# from the same checkpoint name that the embedding model is configured with
# further down in this file.
tokenizer = AutoTokenizer.from_pretrained("WhereIsAI/UAE-Large-V1")

# Directory containing the raw input files (one document per file).
website_data_path = '/kaggle/input/raw-website-data'

# Read every regular file in the directory into memory, one string per file.
docs_list = []
# os.listdir returns entries in arbitrary order; sorting makes the document
# order (and therefore the chunk order downstream) reproducible across runs.
for filename in sorted(os.listdir(website_data_path)):
    file_path = os.path.join(website_data_path, filename)
    # Skip sub-directories and anything else that is not a regular file.
    if os.path.isfile(file_path):
        # Decode explicitly instead of relying on the platform/locale default.
        # NOTE(review): assumes the input files are UTF-8 encoded — confirm.
        with open(file_path, 'r', encoding='utf-8') as f:
            docs_list.append(f.read())
            
# Build a splitter whose length function is backed by `tokenizer`, configured
# with a chunk size of 1000 and an overlap of 50.
# NOTE(review): 1000 tokens may exceed the maximum input length of the
# embedding model (presumably 512 for this checkpoint), in which case the tail
# of each chunk would be truncated at embedding time — confirm.
text_splitter = CharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer,
    chunk_size=1000,
    chunk_overlap=50,
)

# Split each raw document string; the result holds one list of chunk strings
# per entry of `docs_list`, in the same order.
documents = list(map(text_splitter.split_text, docs_list))

# Collapse the per-document lists into a single flat list of chunk strings.
flattened_documents = []
for doc_chunks in documents:
    flattened_documents.extend(doc_chunks)

class Document:
    """Lightweight container for one text chunk.

    Instances carry a ``page_content`` string and a ``metadata`` dict; lists of
    them are handed to ``PineconeVectorStore.from_documents`` later in this
    file.

    Args:
        text: The chunk text, stored as ``page_content``.
        metadata: Optional mapping of extra fields (for example the source
            file name). It is copied, so later changes to the caller's dict do
            not leak into the document. Defaults to an empty dict.
    """

    def __init__(self, text: str, metadata: "dict | None" = None):
        self.page_content = text
        # Always give each instance its own dict so documents never share
        # (and accidentally mutate) the same metadata object.
        self.metadata = {} if metadata is None else dict(metadata)

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}("
            f"page_content={self.page_content!r}, metadata={self.metadata!r})"
        )

# Wrap every chunk string in a Document so it carries the `page_content` and
# `metadata` attributes.
documents_with_attributes = list(map(Document, flattened_documents))

# Embedding model configuration: checkpoint name, the device it is loaded on
# (a CUDA GPU is required as written), and encode options (embeddings are left
# un-normalized).
model_name = "WhereIsAI/UAE-Large-V1"
model_kwargs = dict(device='cuda')
encode_kwargs = dict(normalize_embeddings=False)

# Embedding wrapper passed to the vector store in the next cell.
hf = HuggingFaceEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=model_kwargs,
    model_name=model_name,
)

In [None]:
from langchain_pinecone import PineconeVectorStore

# Name of the target Pinecone index.
# NOTE(review): presumably the index must already exist and Pinecone
# credentials must be configured in the environment — confirm.
index_name = 'aipi-chatbot'

# Hand the chunk documents and the embedding wrapper to the Pinecone vector
# store; the returned store object is kept in `docsearch`.
docsearch = PineconeVectorStore.from_documents(
    documents=documents_with_attributes,
    embedding=hf,
    index_name=index_name,
)