### **Import Required Libraries**

In [None]:
# from pathlib import Path
# import sys
# PROJECT_ROOT=Path.cwd().parent
# sys.path.append(str(PROJECT_ROOT))

In [169]:
from pypdf import PdfReader
import yaml
from pathlib import Path
import os
from src.services.clean_text import clean_document
from src.services.llm_services import llm_configuration
from user_prompts.Question_prompt import QUESTION_PROMPT_TEMPLATE
from user_prompts.Anwser_prompt import ANWSER_PROMPT_TEMPLATE
import re
import json
import random

### **Load the PDF File**

In [170]:
## Load the yaml file
def load_yaml(file_path):
    """Read a YAML file and return its parsed content.

    Args:
        file_path: Path (str or path-like) of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.
    """
    ## Decode explicitly as UTF-8 so the result does not depend on the
    ## platform's default text encoding
    with open(file_path,'r',encoding='utf-8') as file:
        config=yaml.safe_load(file)

    return config


##Load the config file
## config.yaml is looked up one level above the current working directory,
## so this cell expects the notebook to be run from a sub-folder of the project root.
config=load_yaml(Path.cwd().parent/"config.yaml")



In [171]:
## Extract the text of every page of a PDF
def read_pdf(data_path):
    """Return the text of all pages of the PDF at ``data_path``.

    Each page's extracted text is followed by a newline, and the pages are
    concatenated in document order.
    """
    pdf_pages=PdfReader(data_path).pages
    return "".join(pdf_page.extract_text() + "\n" for pdf_page in pdf_pages)

##Load the data
pdf_dir=Path.cwd().parent/config['raw_data_path']
pdf_dir.mkdir(parents=True, exist_ok=True)

## Sort the matches so the "first" PDF is the same file on every run
pdf_file=sorted(pdf_dir.glob("*.pdf"))

## The directory may have just been created (or simply be empty): fail with a
## clear message instead of an IndexError on pdf_file[0]
if not pdf_file:
    raise FileNotFoundError(f"No PDF files found in {pdf_dir}")

## Read the PDF file (only the first PDF in the directory is used)
document=read_pdf(pdf_file[0])

## Number of pages and characters in the document
print("Loading Document...")
print("="*60)
print(f"Number of pages in the document:{len(PdfReader(pdf_file[0]).pages)}")
print(f"Number of characters in the document: {len(document)}")
print("="*60)

Loading Document...
Number of pages in the document:142
Number of characters in the document: 640074


### **Clean Unnecessary Things**

In [172]:
## clean_document is given a one-element list holding the raw PDF text.
## The next cell reads cleaned_document[0], so it presumably returns one
## cleaned string per input document — confirm against src.services.clean_text.
cleaned_document=clean_document([document])

In [173]:
## Number of characters left after cleaning
## (index 0 is the single document that was passed to clean_document)
print(f"Number of characters in the cleaned document: {len(cleaned_document[0])}")

Number of characters in the cleaned document: 151814


### **Chunking Strategy**
#### **Fixed Chunking Strategy**

In [174]:
## define function for chunking the text
def chunk_text(text,chunk_size=config['chunk_size'],overlap_size=config['overlap_size']):
    """Split ``text`` into fixed-size character windows that overlap.

    Args:
        text: String to split.
        chunk_size: Maximum number of characters per chunk
            (defaults to ``config['chunk_size']``).
        overlap_size: Number of characters shared by two consecutive chunks
            (defaults to ``config['overlap_size']``).

    Returns:
        List of chunk strings in document order; an empty list for empty text.

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap_size`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size<=0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    ## An overlap >= chunk_size gives a step <= 0, i.e. a loop that never ends
    if not 0<=overlap_size<chunk_size:
        raise ValueError(
            f"overlap_size must satisfy 0 <= overlap_size < chunk_size, "
            f"got overlap_size={overlap_size}, chunk_size={chunk_size}"
        )

    step=chunk_size-overlap_size
    chunks=[]
    start=0
    while start < len(text):
        end=start+chunk_size
        chunks.append(text[start:end])
        ## Once a window reaches the end of the text, any further window would
        ## only repeat characters already contained in this chunk
        if end>=len(text):
            break
        start+=step
    return chunks


# Create chunks from the cleaned document, using the chunk size and overlap
# that chunk_text takes from config by default
text_chunks=chunk_text(cleaned_document[0])

### **Generation Loop**

#### **Step A Question Generation**

In [175]:
## Report how many chunks the chunking step produced
print(
    "Chunks created...",
    "="*60,
    f"Total number of chunks created:{len(text_chunks)}",
    "="*60,
    sep="\n",
)

Chunks created...
Total number of chunks created:113


#### **AI model configuration**

In [176]:
## Build the LLM client from the loaded config.
## Later cells read .model_name from this object and call .invoke(prompt) on it.
ai_model_config=llm_configuration(config)

In [177]:
## Show which model the client is configured to use
print(
    "="*60,
    f"AI Model Name:{ai_model_config.model_name}",
    "="*60,
    sep="\n",
)

AI Model Name:gpt-4o-mini


In [178]:
## Ask the AI model for questions about a single chunk
def generate_qa_pairs(chunk,chunk_id):
    """Generate questions for one text chunk.

    The chunk is inserted into ``QUESTION_PROMPT_TEMPLATE`` at the
    ``{text_chunk}`` placeholder and the prompt is sent to ``ai_model_config``.

    Args:
        chunk: Text of the chunk.
        chunk_id: Identifier stored alongside the chunk in the result.

    Returns:
        Dict with the keys ``"Chunk ID"``, ``"Chunk"`` and
        ``"Gnerated Questions"`` (the model reply, stripped of surrounding
        whitespace). How many questions the reply holds is decided by the
        prompt template, which is defined outside this notebook.
    """
    question_prompt=QUESTION_PROMPT_TEMPLATE.replace("{text_chunk}",chunk)
    llm_reply=ai_model_config.invoke(question_prompt)

    return {
        "Chunk ID":chunk_id,
        "Chunk":chunk,
        "Gnerated Questions":llm_reply.content.strip(),
    }


In [179]:
## Collect the generated questions of every chunk, in chunk order
qa_pairs=[]
for idx,text_chunk in enumerate(text_chunks):
    chunk_questions=generate_qa_pairs(text_chunk,idx)
    qa_pairs.append(chunk_questions)


#### **Step B Answer Generation**

In [180]:
def parse_numbered_answers(answer_text, expected=10):
    """Parse a numbered list ("1. ...", "2. ...") into ``{number: answer}``.

    Args:
        answer_text: Raw text holding the numbered answers.
        expected: Numbers ``1..expected`` are always present in the result;
            numbers that were not found map to an empty string.

    Returns:
        Dict mapping each answer number (int) to its stripped text. When a
        number occurs more than once, the last occurrence wins.
    """
    answers = {}

    # An answer runs until the next line that starts a new numbered item
    # (optionally indented) or until the end of the text. The dot of the next
    # item must be followed by whitespace or the end of the text, so a wrapped
    # line that begins with a decimal such as "3.5" does not cut the answer short.
    pattern = r"(\d+)\.\s+(.*?)(?=\n\s*\d+\.(?:\s|\Z)|\Z)"
    matches = re.findall(pattern, answer_text, re.S)

    for num, ans in matches:
        answers[int(num)] = ans.strip()

    for i in range(1, expected + 1):
        answers.setdefault(i, "")

    return answers

In [181]:
###(optional) drop the "N." numbering prefix of a question
def remove_leading_numbers(question: str) -> str:
    """Return ``question`` without a leading "<digits>." prefix.

    Whitespace before the number and after the dot is removed as well. A
    string that does not start with such a prefix is returned unchanged.
    """
    prefix = re.match(r"\s*\d+\.\s*", question)
    if prefix is None:
        return question
    return question[prefix.end():]

In [182]:
## call the AI model to answer the generated questions of each chunk
def generate_anwsers(chunk_id,chunk,generated_questions):
    """Answer the generated questions of one chunk and pair them up.

    Args:
        chunk_id: Identifier of the chunk, copied into the result.
        chunk: Text of the chunk the questions were generated from.
        generated_questions: Question text, one question per line, normally
            numbered "1. ...", "2. ...".

    Returns:
        ``{"chunk_id": chunk_id, "qa_pairs": [...]}`` where every entry of
        ``qa_pairs`` has the keys ``"question_id"``, ``"question"`` and
        ``"answer"``. The answer is an empty string when the model reply
        holds no answer with that number.
    """
    ## Fill both placeholders in one pass, so placeholder-looking text inside
    ## the chunk (or the questions) is never substituted a second time
    prompt_fields={"text_chunk":chunk,"questions":generated_questions}
    prompt=re.sub(
        r"\{(text_chunk|questions)\}",
        lambda match: prompt_fields[match.group(1)],
        ANWSER_PROMPT_TEMPLATE,
    )

    ## call the AI client
    response=ai_model_config.invoke(prompt)
    ## Get the answers for each generated question
    raw_anwsers=response.content.strip()

    ## Parse answers into {answer number: answer text}
    parsed_answers=parse_numbered_answers(raw_anwsers)

    ## Non-empty lines of the question text
    question_lines=[
        line.strip()
        for line in generated_questions.split("\n")
        if line.strip()
    ]

    ## Pair questions with answers by the question's own number, not by its
    ## line position: a preamble line or a wrapped question would otherwise
    ## shift every following question onto the wrong answer
    numbered_questions=[]
    for line in question_lines:
        number_match=re.match(r"(\d+)\.",line)
        if number_match:
            numbered_questions.append(
                [int(number_match.group(1)),remove_leading_numbers(line)]
            )
        elif numbered_questions:
            ## continuation of the previous (wrapped) question
            numbered_questions[-1][1]+=" "+line
        ## lines before the first numbered question are preamble and skipped

    ## No numbering at all: fall back to the position of each line
    if not numbered_questions:
        numbered_questions=[
            [position,remove_leading_numbers(line)]
            for position,line in enumerate(question_lines,start=1)
        ]

    qa_pairs=[
        {
            "question_id":number,
            "question":question,
            "answer":parsed_answers.get(number,"")
        }
        for number,question in numbered_questions
    ]

    return{
        "chunk_id":chunk_id,
        "qa_pairs":qa_pairs
    }


In [183]:
## Answer the generated questions of every chunk, in chunk order
final_qa_pair=[]
for pair in qa_pairs:
    chunk_answers=generate_anwsers(
        chunk_id=pair["Chunk ID"],
        chunk=pair["Chunk"],
        generated_questions=pair["Gnerated Questions"],
    )
    final_qa_pair.append(chunk_answers)

### **Save to JSON file**

In [188]:
## Write the nested Q/A dataset (one entry per chunk) as a single JSON file
save_path=Path.cwd().parent/config["artifacts"]
## Create the output directory first; open() does not create missing folders
save_path.mkdir(parents=True,exist_ok=True)
with open(save_path/"qa_dataset.json","w",encoding="utf-8") as f:
    json.dump(final_qa_pair,f,indent=2,ensure_ascii=False)

### **Split the Data into Training and Testing**

In [189]:
## flatten the dataset: one record per question instead of one per chunk
def flattend_qa_dataset(final_qa_pair):
    """Turn the per-chunk Q/A structure into a flat list of records.

    Args:
        final_qa_pair: Iterable of dicts with the keys ``"chunk_id"`` and
            ``"qa_pairs"``; each Q/A pair has ``"question_id"``,
            ``"question"`` and ``"answer"``.

    Returns:
        List of dicts with the keys ``"chunk_id"``, ``"question_id"``,
        ``"question"`` and ``"anwser"`` (the output key is spelled this way).
    """
    records=[]
    for entry in final_qa_pair:
        owner_id=entry["chunk_id"]
        records.extend(
            {
                "chunk_id":owner_id,
                "question_id":pair["question_id"],
                "question":pair["question"],
                "anwser":pair["answer"],
            }
            for pair in entry["qa_pairs"]
        )
    return records

## Flatten the per-chunk results into one record per question
flattend_data=flattend_qa_dataset(final_qa_pair)

In [190]:
##divide the dataset into train and testing
def train_test_split(data,train_ratio=config["train_ratio"],seed=42):
    """Shuffle ``data`` reproducibly and split it into train and test parts.

    Args:
        data: Sequence of records. It is not modified.
        train_ratio: Fraction of the records that goes into the training
            part (defaults to ``config["train_ratio"]``).
        seed: Seed of the shuffle, so the same split is produced on every run.

    Returns:
        Tuple ``(train_data, test_data)`` of two lists.

    Raises:
        ValueError: If ``train_ratio`` is not between 0 and 1.
    """
    if not 0<=train_ratio<=1:
        raise ValueError(f"train_ratio must be between 0 and 1, got {train_ratio}")

    ## Shuffle a copy with a private generator: the caller's list keeps its
    ## order and the global random state is left untouched. For a given seed
    ## the order is the same as random.seed(seed) + random.shuffle would give.
    shuffled=list(data)
    random.Random(seed).shuffle(shuffled)

    split_idx=int(len(shuffled)*train_ratio) ##index where the test part starts

    train_data=shuffled[:split_idx]
    test_data=shuffled[split_idx:]

    return train_data,test_data

## Split the flattened Q/A records into train and test parts, using the
## function's defaults (ratio from config["train_ratio"], seed 42)
train_data,test_data=train_test_split(flattend_data)


In [191]:
##write records in JSON Lines format (one JSON document per line)
def save_jsonl(data, file_path):
    """Write every item of ``data`` as one JSON line to ``file_path``.

    The file is (over)written as UTF-8 and non-ASCII characters are kept
    as they are instead of being escaped.
    """
    with open(file_path, "w", encoding="utf-8") as out_file:
        out_file.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in data
        )

##save both splits next to each other in the configured output folder
final_dir=Path.cwd().parent/config["final"]
## Create the folder first; save_jsonl opens the files directly and would
## fail if the folder does not exist yet
final_dir.mkdir(parents=True,exist_ok=True)
save_jsonl(data=train_data,file_path=final_dir/"train.jsonl")
save_jsonl(data=test_data,file_path=final_dir/"test.jsonl")