In [1]:
from langchain_community.document_loaders import PDFMinerLoader
import pandas as pd
import os
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.platypus import SimpleDocTemplate, Paragraph, PageBreak
import re

In [2]:
import json
# Read the HotpotQA answer-generation task file and parse its JSON content.
with open('task170_hotpotqa_answer_generation.json', 'r', encoding='utf-8') as task_file:
    data = json.loads(task_file.read())

In [3]:
# Keep only the list of task instances.
# The isinstance guard makes this cell safe to re-run: once `data` has been
# narrowed to the list, indexing it with the string key "Instances" a second
# time would raise a TypeError.
if isinstance(data, dict):
    data = data["Instances"]

# Number of examples loaded.
len(data)

6500

In [4]:
# Give every example separate 'context' and 'question' fields.
for example in data:
    # Cut at the LAST "Question:" so that any earlier occurrence inside the
    # passages stays part of the context.
    passages, query = example['input'].rsplit("Question:", 1)
    example.update(context=passages, question=query)

# Spot-check the context extracted for one example.
data[1504]['context']

'Context_1 : Central Park in the Dark is a music composition by Charles Ives for chamber orchestra. It was composed in 1906 and has been paired with "The Unanswered Question" as part of “Two Contemplations” and with "Hallowe’en" and "The Pond" in “Three Outdoor Scenes.” Context_2 : United States v. Barker, 15 U.S. 395 (1817), was a case decided by the United States Supreme Court upholding the common law tradition that private citizens may not demand costs from the federal government. The case involved a motion for costs filed against the United States Government and resolved the previously unanswered question of whether courts could award costs against the United States federal government. The Court\'s opinion read, in its entirety, "[t]he United States never pay costs." Jurists have remarked that Chief Justice John Marshall\'s six-word opinion is one of the shortest Supreme Court cases ever written. Context_3 : Charles Edward Ives ( ; October 20, 1874May 19, 1954) was an American mode

In [5]:
# Show the question extracted for the same spot-check example (index 1504).
data[1504]['question']

' The Unanswered Question is a ballet made from work by a man who was regarded as what?'

In [6]:
# Re-check the number of examples after adding the 'context'/'question' fields.
len(data)

6500

In [7]:
def clean_context_text(context_text):
    """Return ``context_text`` without its ``Context_N : `` / ``fact_N : `` markers.

    The two marker patterns are removed one after the other, then leading
    and trailing whitespace is stripped from the result.
    """
    stripped = context_text
    for marker_pattern in (r'Context_\d+ : ', r'fact_\d+ : '):
        stripped = re.sub(marker_pattern, '', stripped)
    return stripped.strip()
# Strip the passage markers from every example's context, in place.
for example in data:
    example['context'] = clean_context_text(example['context'])

In [8]:
import random
def clean_paragraph_text(text):
    """Strip markup-like tags and control characters from ``text``.

    Args:
        text: The raw string to sanitise.

    Returns:
        ``text`` with every ``<...>`` span removed, control characters
        deleted, and leading/trailing whitespace stripped. Tab, line feed and
        carriage return are kept, as are all printable characters.
    """
    # Drop anything that looks like an HTML/XML tag.
    without_tags = re.sub(r'<[^>]+>', '', text)
    # Delete only real control characters: C0 (except \t, \n, \r), DEL, and the
    # C1 range \x80-\x9f. The range deliberately stops at \x9f: going up to
    # \xff would also delete Latin-1 letters such as 'é', 'ñ' or 'ü' and
    # silently corrupt names in the text.
    without_controls = re.sub(r'[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]', '', without_tags)
    return without_controls.strip()


def create_single_context_pdf(filename, context_text, data, i):
    """Render one context string as a letter-sized PDF written to ``filename``.

    The text is passed through ``clean_paragraph_text`` and laid out as a
    single ``Paragraph`` in the sample "BodyText" style. ``data`` and ``i``
    are accepted but not used by this function.
    """
    def _no_page_decoration(canvas, doc):
        # Page callback that draws nothing on the page.
        return None

    # Sample "BodyText" style with alignment 0 (left) and wordWrap 'LTR'.
    paragraph_style = getSampleStyleSheet()["BodyText"]
    paragraph_style.alignment = 0
    paragraph_style.wordWrap = 'LTR'

    # The cleaned context is the document's only flowable.
    flowables = [Paragraph(clean_paragraph_text(context_text), style=paragraph_style)]

    document = SimpleDocTemplate(filename, pagesize=letter)
    document.build(
        flowables,
        onFirstPage=_no_page_decoration,
        onLaterPages=_no_page_decoration,
    )




In [9]:
# Write one PDF per context for the first 50 examples.
pdf_dir = "pdfs"

# The target folder has to exist before the PDFs are written; create it up
# front so the cell also works from a fresh checkout.
os.makedirs(pdf_dir, exist_ok=True)

# Slice instead of testing `i < 50` on every one of the examples. No Canvas is
# created here: create_single_context_pdf builds the document itself, so the
# extra Canvas object was never used or saved.
for i, example in enumerate(data[:50]):
    filename = f"{pdf_dir}/example_{i}_context.pdf"
    create_single_context_pdf(filename, example['context'], data, i)

In [None]:
import PyPDF2
import os

def merge_pdfs(input_dir, output_dir, batch_size=10):
    """Merge the PDFs in ``input_dir`` into files of ``batch_size`` PDFs each.

    Files ending in ``.pdf`` are ordered by the integer found between the
    first and second underscore of their name (``example_12_context.pdf`` ->
    12). Each consecutive group of ``batch_size`` files is concatenated page
    by page and written to ``output_dir`` as ``context_<first>_<last>.pdf``,
    where ``<first>`` and ``<last>`` are the nominal positions of the batch in
    the sorted listing (a shorter final batch keeps the full nominal range).

    Args:
        input_dir: Directory containing the PDFs to merge.
        output_dir: Directory for the merged PDFs; created if missing.
        batch_size: Number of input PDFs per merged file. Must be >= 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1, or a PDF name has a
            non-integer second underscore-separated field.
        IndexError: If a PDF name contains no underscore.
    """
    # Reject sizes that could never complete a batch instead of silently
    # writing everything into one mislabelled file.
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size!r}')

    os.makedirs(output_dir, exist_ok=True)

    # Sort numerically on the index embedded in the filename so that
    # example_10 comes after example_9 rather than after example_1.
    pdf_files = sorted(
        (f for f in os.listdir(input_dir) if f.endswith('.pdf')),
        key=lambda name: int(name.split('_')[1]),
    )

    # Walk the sorted list one batch at a time; slicing handles the shorter
    # final batch without any "is this the last file" check.
    for start in range(0, len(pdf_files), batch_size):
        pdf_writer = PyPDF2.PdfWriter()
        for pdf_file in pdf_files[start:start + batch_size]:
            pdf_reader = PyPDF2.PdfReader(os.path.join(input_dir, pdf_file))
            for page in pdf_reader.pages:
                pdf_writer.add_page(page)

        # Derive the range from batch_size (not a literal 10) so the name
        # stays correct for any batch size; identical to before for 10.
        output_filename = f'context_{start}_{start + batch_size - 1}.pdf'
        output_path = os.path.join(output_dir, output_filename)

        with open(output_path, 'wb') as out_pdf:
            pdf_writer.write(out_pdf)

    print(f'Merged PDFs saved to: {output_dir}')
# Example usage:
# NOTE(review): these are absolute, machine-specific paths, while the
# per-example PDFs are written to the relative "pdfs/" folder — confirm both
# refer to the same directory before running on another machine.
input_directory = 'D:/Local-RAG-Assistant-Chatbot/eval/pdfs'
output_directory = 'D:/Local-RAG-Assistant-Chatbot/eval/merged_pdfs'

# exist_ok=True replaces the separate exists() check, so there is no window
# between checking for the directory and creating it.
os.makedirs(output_directory, exist_ok=True)

merge_pdfs(input_directory, output_directory)