In [115]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQA

In [209]:
import os
from dotenv import load_dotenv

load_dotenv() # will search for .env file in local folder and load variables 

OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')

Cohere_API_token = os.getenv("COHERE_API_KEY")


In [210]:
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ["COHERE_API_KEY"] =Cohere_API_token

In [211]:
loader = PyPDFLoader("Annual-Report-2024-14-32.pdf")

In [212]:
documents = loader.load()

In [213]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [214]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

In [215]:
text_splits=text_splitter.split_documents(documents)
print(len(text_splits))

180


In [216]:
chunks = text_splitter.split_documents(documents)

In [217]:
from langchain_community.embeddings import OpenAIEmbeddings

In [218]:
from langchain_community.vectorstores import Chroma

In [219]:
embeddings = OpenAIEmbeddings(openai_api_key=openai_api_key)

In [220]:
vectorstore = Chroma.from_documents(chunks, embeddings, persist_directory="./chroma_db")

In [221]:
retriever_vectordb = vectorstore.as_retriever(search_kwargs={"k": 5})

In [222]:
# Hybrid Retrieval
keyword_retriever = BM25Retriever.from_documents(text_splits)
keyword_retriever.k =  5
ensemble_retriever = EnsembleRetriever(retrievers=[retriever_vectordb,keyword_retriever],
                                       weights=[0.5, 0.5])

In [223]:
query

'What is PSX?'

In [225]:
len(ensemble_retriever.get_relevant_documents(query))

6

In [57]:
# query="What is PSX?"
# docs_rel=ensemble_retriever.get_relevant_documents(query)
# docs_rel

In [88]:
from langchain.retrievers.document_compressors import CohereRerank
from langchain.retrievers import ContextualCompressionRetriever

In [89]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA
llm = ChatOpenAI(model_name="gpt-4o-mini")


In [90]:
query = "What is PSX?"

In [227]:
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain_cohere import CohereRerank
from langchain_community.llms import Cohere

# llm = Cohere(temperature=0)
# reranking

compressor = CohereRerank(model='rerank-v3.5')
compression_retriever = ContextualCompressionRetriever(
    base_compressor=compressor, base_retriever=ensemble_retriever
)

In [228]:
compressed_docs = compression_retriever.get_relevant_documents(query)
# compressed_docs

In [260]:
compressed_docs

[Document(metadata={'producer': 'iLovePDF', 'creationdate': '', 'creator': 'PyPDF', 'total_pages': 19, 'page_label': '4', 'source': 'Annual-Report-2024-14-32.pdf', 'page': 3, 'moddate': '2025-06-26T06:56:39+00:00', 'relevance_score': 0.7539928}, page_content='PSX Developments and Activities\nKey Achievements Organization Wide\nAs a frontline regulator, Pakistan Stock Exchange (PSX) continues to perform robustly for the development of the capital market \nin line with global standards and to bring innovative products and services for the beneﬁt of investors, issuers, market participants, \nand other stakeholders. In FY 2024, PSX continued to make headway in technology, listings, regulatory, and other areas with new'),
 Document(metadata={'producer': 'iLovePDF', 'creator': 'PyPDF', 'creationdate': '', 'moddate': '2025-06-26T06:56:39+00:00', 'source': 'Annual-Report-2024-14-32.pdf', 'total_pages': 19, 'page': 13, 'page_label': '14', 'relevance_score': 0.5709301}, page_content='compared to

In [168]:
def query_expansion(query):
    
    query_expansion_template = PromptTemplate(
            input_variables=["query"],
            template="""You are a search query expansion expert. Your task is to see if there is any improvement required
            in the qiven query. You can make the query in a more understable way
            Include relevant synonyms and related terms to improve retrieval.
            Return only the expanded query without any explanations or additional text.
            If you feel that the query is perfect then you can return the query as it is

            Original query: {query}

            Expanded query:"""
    )
    prompt = query_expansion_template.format(query=query)
    # model for query expansion
    query_expansion_model = ChatOpenAI(
            model_name="gpt-4o-mini",
            temperature=0.8  # Lower temperature for more focused query expansion
    )
    expanded_query = query_expansion_model.predict(prompt)
    return expanded_query
    

In [169]:
user_query = query_expansion(query)

In [170]:
user_query

'What is PSX (Pakistan Stock Exchange)?'

In [171]:

response_generation_template = PromptTemplate(
            input_variables=["query", "context"],
            template="""You are a helpful AI assistant. Use the following context to answer the user's query.
            Be clear, concise, and accurate. I don't have answer to this question if the user's query is not in the context

            Context:
            {context}

            User Query: {query}

            Response:"""
)

prompt = response_generation_template.format(
            query=user_query,
            context=compressed_docs
)


In [172]:
prompt

"You are a helpful AI assistant. Use the following context to answer the user's query.\n            Be clear, concise, and accurate. I don't have answer to this question if the user's query is not in the context\n\n            Context:\n            [Document(metadata={'producer': 'iLovePDF', 'total_pages': 19, 'page_label': '4', 'moddate': '2025-06-26T06:56:39+00:00', 'creator': 'PyPDF', 'creationdate': '', 'source': 'Annual-Report-2024-14-32.pdf', 'page': 3, 'relevance_score': 0.7539928}, page_content='PSX Developments and Activities\\nKey Achievements Organization Wide\\nAs a frontline regulator, Pakistan Stock Exchange (PSX) continues to perform robustly for the development of the capital market \\nin line with global standards and to bring innovative products and services for the beneﬁt of investors, issuers, market participants, \\nand other stakeholders. In FY 2024, PSX continued to make headway in technology, listings, regulatory, and other areas with new'), Document(metadata={'

In [183]:
response_model = ChatOpenAI(
            model_name="gpt-4o-mini",
            temperature=0.7,  # Higher temperature for more creative responses
            logprobs=True
)
response = response_model.invoke(prompt)

                    logprobs was transferred to model_kwargs.
                    Please confirm that logprobs is what you intended.


In [185]:
response.content

'The Pakistan Stock Exchange (PSX) is a frontline regulator and a key player in the development of the capital market in Pakistan. It aims to align its operations with global standards while introducing innovative products and services for the benefit of investors, issuers, market participants, and other stakeholders.'

In [186]:
response.response_metadata

{'token_usage': {'completion_tokens': 57,
  'prompt_tokens': 600,
  'total_tokens': 657,
  'completion_tokens_details': {'accepted_prediction_tokens': 0,
   'audio_tokens': 0,
   'reasoning_tokens': 0,
   'rejected_prediction_tokens': 0},
  'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}},
 'model_name': 'gpt-4o-mini',
 'system_fingerprint': 'fp_34a54ae93c',
 'finish_reason': 'stop',
 'logprobs': {'content': [{'token': 'The',
    'bytes': [84, 104, 101],
    'logprob': -0.026056190952658653,
    'top_logprobs': []},
   {'token': ' Pakistan',
    'bytes': [32, 80, 97, 107, 105, 115, 116, 97, 110],
    'logprob': -5.180879816180095e-05,
    'top_logprobs': []},
   {'token': ' Stock',
    'bytes': [32, 83, 116, 111, 99, 107],
    'logprob': 0.0,
    'top_logprobs': []},
   {'token': ' Exchange',
    'bytes': [32, 69, 120, 99, 104, 97, 110, 103, 101],
    'logprob': 0.0,
    'top_logprobs': []},
   {'token': ' (', 'bytes': [32, 40], 'logprob': 0.0, 'top_logprobs': []},
   

In [175]:
response.strip()

'The Pakistan Stock Exchange (PSX) is the frontline regulator of the capital market in Pakistan. It aims to develop the market in line with global standards and provides innovative products and services for the benefit of investors, issuers, market participants, and other stakeholders.'

In [187]:
import numpy as np

def calculate_weighted_confidence_score(llm_results_metadata):
    """
    Calculate the weighted confidence score from the given LLM results metadata containing log probabilities.

    Args:
        llm_results_metadata (dict): The metadata object containing logprobs information.

    Returns:
        float: The weighted confidence score calculated from the log probabilities.
    """
    # Extract log probabilities
    log_probs = [item['logprob'] for item in llm_results_metadata['logprobs']['content']]

    # Convert log probabilities to probabilities
    probabilities = np.exp(log_probs)

    # Get indices of top 5 probabilities
    sorted_indices = np.argsort(log_probs)[-5:]  

    # Calculate joint probability for all tokens
    joint_probability_all = np.prod(probabilities)

    # Calculate joint probability for top 5 tokens
    top_probabilities = [probabilities[i] for i in sorted_indices]
    joint_probability_top_5 = np.prod(top_probabilities)

    # Weighted confidence score (70% from top 5, 30% from all tokens)
    confidence_score = round((0.7 * joint_probability_top_5 + 0.3 * joint_probability_all) * 100, 2)

    return confidence_score

In [193]:
llm_results_metadata= response.response_metadata
int(calculate_weighted_confidence_score(llm_results_metadata))

70

In [195]:
calculate_weighted_confidence_score(llm_results_metadata)

np.float64(70.01)