In [1]:
# NEW
from langchain_community.document_loaders import PyPDFLoader, DirectoryLoader, CSVLoader
from langchain_community.embeddings import HuggingFaceEmbeddings

#from langchain.document_loaders import PyPDFLoader, DirectoryLoader, CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
#from langchain.embeddings import HuggingFaceEmbeddings
from typing import List
from langchain.schema import Document
from langchain_community.document_loaders import DataFrameLoader
import pandas as pd



  from .autonotebook import tqdm as notebook_tqdm


In [10]:

# Extract Data From PDF Directory
def load_pdf_file(data):
    loader = DirectoryLoader(
        data,
        glob="*.pdf",
        loader_cls=PyPDFLoader
    )
    documents = loader.load()
    return documents

# Extract Data From CSV
def load_csv_file(file_path):
    df = pd.read_csv(file_path, encoding="utf-8")  # try utf-8-sig if utf-8 fails
    loader = DataFrameLoader(df, page_content_column=df.columns[0])  # use first column as text
    documents = loader.load()
    return documents

# Keep minimal metadata
def filter_to_minimal_docs(docs: List[Document]) -> List[Document]:
    minimal_docs: List[Document] = []
    for doc in docs:
        src = doc.metadata.get("source")
        minimal_docs.append(
            Document(
                page_content=doc.page_content,
                metadata={"source": src}
            )
        )
    return minimal_docs

# Split into text chunks
def text_split(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=800, chunk_overlap=50
    )
    text_chunks = text_splitter.split_documents(extracted_data)
    return text_chunks

# Multilingual embeddings (English + Tamil)
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(
        model_name='intfloat/multilingual-e5-large'  # 1024 dimensions
    )
    return embeddings


In [11]:
system_prompt = (
    "You are a Cyber Safety Assistant for question-answering tasks. "
    "Use the following pieces of retrieved context to answer "
    "the question. If you don't know the answer, say that you "
    "don't know. Use three sentences maximum and keep the "
    "answer concise."
    "\n\n"
    "{context}"
)

In [12]:
from dotenv import load_dotenv
import os
from src.helper import load_pdf_file, load_csv_file, filter_to_minimal_docs, text_split, download_hugging_face_embeddings
from pinecone import Pinecone
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore

load_dotenv()

PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
OPENAI_API_KEY = os.environ.get('OPENAI_API_KEY')

os.environ["PINECONE_API_KEY"] = PINECONE_API_KEY
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY



In [13]:

# Load English PDFs
eng_pdfs = load_pdf_file(data='D:\CyChat\CyChat\data\d-en')

# Load Tamil PDFs
tamil_pdfs = load_pdf_file(data='D:\CyChat\CyChat\data\d-ta')

# Load CSV (Tamil/English)
csv_docs = load_csv_file(file_path='D:\CyChat\CyChat\data\d-ta\csv\data-ta.csv')

# Merge all docs
all_docs = eng_pdfs + tamil_pdfs + csv_docs
filter_data = filter_to_minimal_docs(all_docs)
text_chunks = text_split(filter_data)

In [14]:
# Embeddings
embeddings = download_hugging_face_embeddings()

# Pinecone
pc = Pinecone(api_key=PINECONE_API_KEY)
index_name = "cychat"

# Create Pinecone index if missing
if not pc.has_index(index_name):
    pc.create_index(
        name=index_name,
        dimension=1024,   # multilingual-e5-large
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

index = pc.Index(index_name)



In [None]:
# Upload documents
docsearch = PineconeVectorStore.from_documents(
    documents=text_chunks,
    index_name=index_name,
    embedding=embeddings,
    batch_size=50     # smaller batch
)

TypeError: langchain_pinecone.vectorstores.PineconeVectorStore.add_texts() got multiple values for keyword argument 'batch_size'