In [2]:
import os
from PyPDF2 import PdfReader

In [3]:
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF as one string.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``PdfReader``.

    Returns:
        The ``extract_text()`` result of each page, concatenated in page
        order with nothing inserted between pages.
    """
    pdf = PdfReader(pdf_path)
    page_texts = [pg.extract_text() for pg in pdf.pages]
    return "".join(page_texts)

In [5]:

# Oracle Identity Governance PDFs to ingest; the paths are relative to the
# notebook's working directory.
pdf_files = [
    "PDFs/administering-oracle-identity-governance.pdf",
    "PDFs/developing-and-customizing-applications-oracle-identity-governance.pdf",
    "PDFs/help-topics-oracle-identity-governance.pdf",
    "PDFs/performing-self-service-tasks-oracle-identity-governance.pdf",
    "PDFs/reference-oracle-identity-governance-docker-and-kubernetes.pdf",
    "PDFs/release-notes-oracle-identity-management.pdf",
    "PDFs/upgrading-oracle-identity-manager.pdf",
]

# Full text of each PDF, in the same order as pdf_files.
all_texts = list(map(extract_text_from_pdf, pdf_files))


In [6]:
# Dump the extracted text of every PDF into one UTF-8 file for inspection.
# Each section starts with a numbered header and ends with an 80-character rule.
output_file = "all_texts.txt"
section_rule = "\n" + "=" * 80 + "\n"
with open(output_file, mode='w', encoding='utf-8') as f:
    for pdf_number, pdf_text in enumerate(all_texts, start=1):
        f.write(f"Text from PDF {pdf_number}:\n\n")
        f.write(pdf_text)
        f.write(section_rule)

In [7]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def chunk_text(text, chunk_size=500, chunk_overlap=50):
    """Split ``text`` with a ``RecursiveCharacterTextSplitter``.

    Args:
        text: The string to split.
        chunk_size: Forwarded to the splitter as ``chunk_size`` (default 500).
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap`` (default 50).

    Returns:
        Whatever ``split_text`` of the configured splitter returns for ``text``.
    """
    # Separators handed to the splitter: paragraph break, line break, then
    # sentence-ending punctuation.
    boundaries = ["\n\n", "\n", ".", "!", "?"]
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=boundaries,
    )
    return RecursiveCharacterTextSplitter(**splitter_options).split_text(text)


In [8]:
# One list of chunks per PDF, in the same order as all_texts (default chunking settings).
chunks = list(map(chunk_text, all_texts))

In [10]:
from sentence_transformers import SentenceTransformer

# Loaded SentenceTransformer models keyed by model name. create_embeddings is
# called once per PDF, so without this cache the weights would be reloaded on
# every call.
_EMBEDDING_MODELS = {}


def create_embeddings(chunks, model_name='all-MiniLM-L12-v2'):
    """Embed a collection of text chunks with a sentence-transformers model.

    Args:
        chunks: Iterable of strings to embed.
        model_name: Name passed to ``SentenceTransformer``; defaults to the
            model this notebook has always used.

    Returns:
        A list with one embedding per chunk, in input order. An empty input
        yields an empty list without loading the model.
    """
    texts = list(chunks)
    if not texts:
        return []

    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = _EMBEDDING_MODELS[model_name] = SentenceTransformer(model_name)

    # A single batched encode call instead of one call per chunk; iterating the
    # batched result gives one vector per input text.
    return list(model.encode(texts))


In [11]:
# One list of embeddings per PDF, parallel to `chunks`.
all_embeddings = list(map(create_embeddings, chunks))




In [12]:
import pandas as pd

def save_embeddings(embeddings, output_file):
    """Write nested embeddings to a CSV file, one vector per row.

    Args:
        embeddings: Iterable of groups, each an iterable of vectors that
            expose ``tolist()``.
        output_file: Destination handed to ``DataFrame.to_csv``.
    """
    rows = []
    for group in embeddings:
        for vector in group:
            rows.append(vector.tolist())

    # index=False: the CSV holds only the vector components, no row labels.
    pd.DataFrame(rows).to_csv(output_file, index=False)


In [13]:
# Persist every embedding (all PDFs flattened) to a CSV next to the notebook.
save_embeddings(embeddings=all_embeddings, output_file="embeddings.csv")

In [14]:
def save_chunks_to_file(chunks, output_file):
    """Write a human-readable listing of every chunk, grouped by PDF.

    Args:
        chunks: Sequence of chunk lists, one list per PDF.
        output_file: Path of the UTF-8 text file to (over)write.
    """
    with open(output_file, 'w', encoding='utf-8') as out:
        for pdf_number, pdf_chunks in enumerate(chunks, start=1):
            out.write(f"PDF {pdf_number} has {len(pdf_chunks)} chunks.\n")
            out.writelines(
                f"\nChunk {chunk_number}:\n{body}\n\n"
                for chunk_number, body in enumerate(pdf_chunks, start=1)
            )
                
# Write the per-PDF chunk listing to a text file for manual inspection.
save_chunks_to_file(chunks=chunks, output_file="all_chunks.txt")


In [21]:
from langchain_core.documents import Document

def chunks_to_docs_wrapper(chunk_data):
    """Build a ``Document`` from a chunk record.

    Args:
        chunk_data: Mapping with the keys ``'text'``, ``'id'`` and ``'link'``.

    Returns:
        A ``Document`` whose ``page_content`` is the ``'text'`` value and whose
        metadata carries the ``'id'`` and ``'link'`` values.
    """
    body = chunk_data['text']
    meta = {"id": chunk_data['id'], "link": chunk_data['link']}
    return Document(page_content=body, metadata=meta)

# (pdf_index, chunk) pairs for every chunk of every PDF. The first element is
# the position of the source PDF in `chunks`, not a page number.
flat_chunks = [(pdf_index, chunk) for pdf_index, sublist in enumerate(chunks) for chunk in sublist]

# One Document per chunk. Previously every chunk of a PDF got the same integer
# id (the PDF index) and a link reading "Page N". Each chunk now gets a unique
# string id "<pdf_index>-<chunk_index>" and a link that names the PDF, using
# the same 1-based "PDF n" numbering as the text dumps in this notebook.
# NOTE(review): unique ids matter if the vector store derives row keys from
# metadata["id"]; confirm against the installed langchain_community version.
docs = [
    chunks_to_docs_wrapper({
        'id': f"{pdf_index}-{chunk_index}",
        'link': f"PDF {pdf_index + 1}",
        'text': chunk,
    })
    for pdf_index, sublist in enumerate(chunks)
    for chunk_index, chunk in enumerate(sublist)
]

In [None]:
def save_docs_to_file(docs, output_file):
    """Write each document's metadata and content to a text file.

    Args:
        docs: Iterable of objects exposing ``metadata`` and ``page_content``.
        output_file: Path of the UTF-8 text file to (over)write.
    """
    rule = "\n" + "=" * 80 + "\n"  # separator written after every document
    with open(output_file, 'w', encoding='utf-8') as out:
        for number, document in enumerate(docs, start=1):
            out.write(f"Document {number}:\n")
            out.write(f"Metadata: {document.metadata}\n")
            out.write(f"Content:\n{document.page_content}\n")
            out.write(rule)

# Write every Document (metadata + content) to a text file for manual inspection.
save_docs_to_file(docs=docs, output_file="all_docs.txt")

In [22]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import OracleVS
from langchain_community.vectorstores.utils import DistanceStrategy
import oracledb
import time 
from dotenv import load_dotenv

In [23]:
# Embedding model wrapper that is later handed to OracleVS.from_documents.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L12-v2"
model_4db = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [24]:
# Read the database settings from the environment (load_dotenv pulls in a
# local .env file if one exists).
load_dotenv()

username = os.getenv("DB_USERNAME")
password = os.getenv("DB_PASSWORD")
dsn = os.getenv("DB_DSN")
COMPARTMENT_OCID = os.getenv("COMPARTMENT_OCID")

print("The database user name is:", username)

try:
    conn23ai = oracledb.connect(user=username, password=password, dsn=dsn)
except Exception as e:
    # Show why the connection failed and stop here. Swallowing the error would
    # leave `conn23ai` undefined, so later cells would fail with an unrelated
    # NameError instead.
    print("Connection failed!")
    print(f"Reason: {e}")
    raise
else:
    print("Connection successful!")

The database user name is: admin
Connection successful!


In [25]:
# Embed every Document with model_4db and store the vectors in the
# "OIG_vector" table, using cosine distance; time the whole operation.
# NOTE(review): the recorded run of this cell failed with DPY-4011 (connection
# closed during executemany). Inserting in smaller batches may help; confirm
# the root cause first.
s1time = time.time()
knowledge_base = OracleVS.from_documents(
    docs,
    model_4db,
    client=conn23ai,
    table_name="OIG_vector",
    distance_strategy=DistanceStrategy.COSINE,
)
s2time = time.time()

elapsed_sec = round(s2time - s1time, 1)
print(f"Vectorizing and inserting chunks duration: {elapsed_sec} sec.")

2024-08-28 09:21:15,442 - ERROR - An unexpected error occurred: DPY-4011: the database or network closed the connection
Help: https://python-oracledb.readthedocs.io/en/latest/user_guide/troubleshooting.html#dpy-4011
Traceback (most recent call last):
  File "/home/opc/.conda/envs/oig/lib/python3.9/site-packages/langchain_community/vectorstores/oraclevs.py", line 54, in wrapper
    return func(*args, **kwargs)
  File "/home/opc/.conda/envs/oig/lib/python3.9/site-packages/langchain_community/vectorstores/oraclevs.py", line 672, in add_texts
    cursor.executemany(
  File "/home/opc/.conda/envs/oig/lib/python3.9/site-packages/oracledb-2.4.0b1-py3.9-linux-aarch64.egg/oracledb/cursor.py", line 751, in executemany
    self._impl.executemany(
  File "src/oracledb/impl/thin/cursor.pyx", line 218, in oracledb.thin_impl.ThinCursorImpl.executemany
  File "src/oracledb/impl/thin/protocol.pyx", line 440, in oracledb.thin_impl.Protocol._process_single_message
  File "src/oracledb/impl/thin/protocol.

RuntimeError: Failed due to a DB issue: Unexpected error: DPY-4011: the database or network closed the connection
Help: https://python-oracledb.readthedocs.io/en/latest/user_guide/troubleshooting.html#dpy-4011

In [26]:
# Debug aid: show which connection settings were picked up from the environment.
username = os.getenv("DB_USERNAME")
password = os.getenv("DB_PASSWORD")
dsn = os.getenv("DB_DSN")
print(username)
# Never print the secret itself, because cell output is saved with the
# notebook. Report only whether it is set.
print("DB_PASSWORD is set" if password else "DB_PASSWORD is NOT set")
print(dsn)

admin
[REDACTED]
(description= (address=(protocol=tcps)(port=1521)(host=adb.us-chicago-1.oraclecloud.com))(connect_data=(service_name=ged76958061af0b_oigchatbot_medium.adb.oraclecloud.com))(security=(ssl_server_dn_match=yes)))


In [28]:
import oracledb
import os

print("Oracledb version:", oracledb.__version__)

# Credentials come from the environment instead of being hard-coded in the
# notebook. The variables are the same DB_* settings the earlier connection
# cell reads after load_dotenv(); a missing one raises KeyError naming it.
# NOTE(review): the password that was previously committed here should be rotated.
username = os.environ["DB_USERNAME"]
password = os.environ["DB_PASSWORD"]
dsn = os.environ["DB_DSN"]

try:
    conn23ai = oracledb.connect(user=username, password=password, dsn=dsn)
except Exception as e:
    # Show the reason and stop, rather than continuing with no usable connection.
    print("Connection failed!")
    print(f"Reason: {e}")
    raise
else:
    print("Connection successful!")

Oracledb version: 2.4.0b1
Connection successful!
