In [1]:
from langchain import hub
from langchain_chroma import Chroma
from langchain_community.document_loaders import WebBaseLoader
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter,TokenTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_openai import AzureChatOpenAI, AzureOpenAIEmbeddings
import os
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from uuid import uuid4
from dotenv import load_dotenv
load_dotenv()

USER_AGENT environment variable not set, consider setting it to identify your requests.
  from tqdm.autonotebook import tqdm


True

In [2]:
AZURE_OPENAI_ENDPOINT = os.getenv('AZURE_OPENAI_ENDPOINT')
AZURE_OPENAI_DEPLOYMENT_ID = os.getenv('AZURE_OPENAI_DEPLOYMENT_ID')
AZURE_OPENAI_KEY = os.getenv('AZURE_OPENAI_KEY')
AZURE_API_VERSION = os.getenv('AZURE_API_VERSION')
PINECONE_API_KEY = os.getenv('PINECONE_API_KEY')
pc = Pinecone(api_key=PINECONE_API_KEY)

In [3]:
llm = AzureChatOpenAI(
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
            azure_deployment=AZURE_OPENAI_DEPLOYMENT_ID,
            api_version=AZURE_API_VERSION,
            api_key=AZURE_OPENAI_KEY,
            temperature=0.0,
            verbose=True,
        )

embedding_llm = AzureOpenAIEmbeddings(
            azure_endpoint=AZURE_OPENAI_ENDPOINT,
            azure_deployment='embedding-ada-crayon',
            api_key=AZURE_OPENAI_KEY,
            api_version=AZURE_API_VERSION,
        )

# using code base setting for uploading to pinecone

In [4]:
import time

index_name = "ocbc-cv-gpt"  # change if desired

existing_indexes = [index_info["name"] for index_info in pc.list_indexes()]

if index_name not in existing_indexes:
    pc.create_index(
        name=index_name,
        dimension=1536,
        metric="cosine",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
    while not pc.describe_index(index_name).status["ready"]:
        time.sleep(1)

index = pc.Index(index_name)

In [5]:
index_name = "ocbc-cv-gpt"  # change if desired
index = pc.Index(index_name)

In [6]:
import os
from langchain_community.document_loaders import Docx2txtLoader
from langchain_community.document_loaders import UnstructuredWordDocumentLoader

# Folder containing the DOCX files
folder_path = r"C:\Users\san\Documents\Documents\OCBC\Github_ocbc_workspace\OCBC_CV_RAG\job_pdf"

# Get a list of all DOCX files in the folder
docx_files = [os.path.join(folder_path, file) for file in os.listdir(folder_path) if file.endswith('.docx')]

all_pages = []

for docx_file in docx_files:
    loader = UnstructuredWordDocumentLoader(docx_file)
    pages = loader.load_and_split()
    all_pages.extend(pages)  # Combine all pages into a single list

# Now, 'all_pages' contains the split pages from all the DOCX files in the folder


In [7]:
all_pages

[Document(metadata={'source': 'C:\\Users\\san\\Documents\\Documents\\OCBC\\Github_ocbc_workspace\\OCBC_CV_RAG\\job_pdf\\JD ON_Software_Engineer 1.docx'}, page_content='Job Description\n\nDATA JABATAN :\n\nJOB ID OPS540, OPS489, OPS536,OPS519,OPS526 NAMA JABATAN Software Engineer GRUP Operations & Technology REPORT TO POSITION Platform Lead Engineer FUNGSI UTAMA Menjalankan proses development, testing dan release sistem teknologi informasi sesuai dengan pedoman Agile Devops untuk memberikan value bisnis, serta memastikan kualitas deliverables secara konsisten dilaksanakan dengan penuh tanggungjawab\n\nTANGGUNG JAWAB/ TUGAS POKOK PEKERJAAN\n\nMemastikan penggunaan framework yang telah ditetapkan dalam pelaksanaan project dengan tetap mengacu pada Peraturan OJK sebagai landasan utama.\n\nMelaksanakan swakelola dan bekerja lintas fungsi dalam menyelesaikan pekerjaan sesuai backlog yang telah ditentukan.\n\nMenghasilkan produk bernilai lebih melalui proses development, testing, dan release 

In [8]:
all_pages_final = []

for doc in all_pages:
    filename = doc.metadata.get('source').split('\\')[-1]  # Extract the filename
    doc.page_content = f"Filename: {filename}\n\n{doc.page_content}"  # Embed the filename
    
    all_pages_final.append(doc) 

In [9]:
pc = Pinecone(api_key=PINECONE_API_KEY)
text_splitter = TokenTextSplitter(chunk_size=500, chunk_overlap=200)
splits = text_splitter.split_documents(all_pages)

In [10]:
splits

[Document(metadata={'source': 'C:\\Users\\san\\Documents\\Documents\\OCBC\\Github_ocbc_workspace\\OCBC_CV_RAG\\job_pdf\\JD ON_Software_Engineer 1.docx'}, page_content='Filename: JD ON_Software_Engineer 1.docx\n\nJob Description\n\nDATA JABATAN :\n\nJOB ID OPS540, OPS489, OPS536,OPS519,OPS526 NAMA JABATAN Software Engineer GRUP Operations & Technology REPORT TO POSITION Platform Lead Engineer FUNGSI UTAMA Menjalankan proses development, testing dan release sistem teknologi informasi sesuai dengan pedoman Agile Devops untuk memberikan value bisnis, serta memastikan kualitas deliverables secara konsisten dilaksanakan dengan penuh tanggungjawab\n\nTANGGUNG JAWAB/ TUGAS POKOK PEKERJAAN\n\nMemastikan penggunaan framework yang telah ditetapkan dalam pelaksanaan project dengan tetap mengacu pada Peraturan OJK sebagai landasan utama.\n\nMelaksanakan swakelola dan bekerja lintas fungsi dalam menyelesaikan pekerjaan sesuai backlog yang telah ditentukan.\n\nMenghasilkan produk bernilai lebih melal

In [11]:
pc = Pinecone(api_key=PINECONE_API_KEY)
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
vector_store = PineconeVectorStore(index=index, embedding=embedding_llm)
vector_store.add_documents(documents=splits)

['ec61e95b-ee4a-4d4d-b55b-a3bcde3147ac',
 '46543571-875c-41ed-acec-626fee61898e',
 'e3df8c90-fb2e-4603-9e90-7eebd603ace7',
 '98cc7ca3-c0a3-4e2f-89d3-6036ed05583a',
 'ba78cb09-72b7-4f8e-aa2d-4cc8dcb92a12',
 'b83c20bb-758c-4f52-ba2f-b5eaa2500fb1',
 '075f426c-ec39-4bd5-a63f-9c4b282baa35',
 '4596dfbd-20da-4b3b-a949-331f1245d9c5',
 '228b34c6-a9db-41de-8616-131c5decc895',
 'ba09be68-1c02-4f17-83a1-aaee9797dbf3',
 '8543091f-00ed-4b67-85a6-34390a431935',
 '8a06bdc8-ef44-437e-a5cd-4923b2ddd33c',
 '552f7379-68b6-4d65-86aa-540a33157de8',
 '1f069825-6dbf-40b1-acff-6eab26a004c4']