In [1]:
import json
import os
import glob
import subprocess
import sys
import textwrap

from reportlab.lib.pagesizes import A4
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.enums import TA_RIGHT
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont

# --- Installation of Required Libraries ---

def install_package(package):
    """
    Install a single pip package using the interpreter running this notebook.

    The install runs as ``<sys.executable> -m pip install <package>``. If pip
    exits with a non-zero status, the failure is printed and the process is
    terminated via ``sys.exit(1)``.
    """
    pip_command = [sys.executable, "-m", "pip", "install", package]
    try:
        subprocess.check_call(pip_command)
    except subprocess.CalledProcessError as pip_error:
        print(f"Failed to install {package}: {pip_error}")
        sys.exit(1)

# Import the Persian shaping libraries, installing them on the fly when the
# first import attempt fails.
try:
    import arabic_reshaper
    from bidi.algorithm import get_display
except ImportError:
    # At least one of the two libraries is missing: install both, then retry.
    print("⏳ Installing required text processing libraries...")
    for _required_package in ("arabic-reshaper", "python-bidi"):
        install_package(_required_package)
    import arabic_reshaper
    from bidi.algorithm import get_display
    print("✅ Successfully installed text processing libraries.")
else:
    print("✅ Text processing libraries are already installed.")

print("Libraries loaded.")


✅ Text processing libraries are already installed.
Libraries loaded.


In [2]:
# --- Font Setup ---

def setup_persian_font(local_font_path="/home/liamirali/.local/share/fonts/shamsiCalendarFonts/Vazirmatn.ttf"):
    """
    Registers a Persian font for ReportLab and returns the registered font name.

    Lookup order:
      1. ``local_font_path`` -- if the file exists and registers cleanly it is
         registered as 'Vazirmatn'.
      2. A copy of 'Vazirmatn-Regular.ttf' in the system temp directory,
         downloaded first when it is not already there; registered as
         'Vazirmatn-Regular'.
      3. 'Helvetica' is returned (nothing is registered) when the download or
         the registration of the downloaded file fails.

    Args:
        local_font_path: Path to a local Vazirmatn TTF file. Defaults to the
            location this notebook was originally written against; pass a
            different path when running on another machine.

    Returns:
        The font name to use in ParagraphStyle definitions.
    """
    import tempfile
    import urllib.request

    font_name = "Vazirmatn"

    if os.path.exists(local_font_path):
        print(f"✅ Local font found at: {local_font_path}")
        try:
            pdfmetrics.registerFont(TTFont(font_name, local_font_path))
            print(f"✅ Font '{font_name}' registered successfully.")
            return font_name
        except Exception as e:
            # Fall through to the download path below.
            print(f"❌ Could not register local font: {e}")
    else:
        print("⚠️ Local font not found at the specified path.")

    # Fallback to downloading if local font fails
    dl_font_name = "Vazirmatn-Regular"
    font_filename = f"{dl_font_name}.ttf"
    font_path = os.path.join(tempfile.gettempdir(), font_filename)

    if not os.path.exists(font_path):
        print(f"⏳ Attempting to download {dl_font_name} font...")
        font_url = f"https://github.com/rastikerdar/vazirmatn/releases/download/v33.003/{font_filename}"
        # Download to a side file and only move it into place on success, so an
        # interrupted transfer never leaves a truncated file at font_path that a
        # later run would mistake for a valid cached font.
        partial_path = font_path + ".part"
        try:
            urllib.request.urlretrieve(font_url, partial_path)
            os.replace(partial_path, font_path)
            print("✅ Font downloaded successfully.")
        except Exception as e:
            try:
                if os.path.exists(partial_path):
                    os.remove(partial_path)
            except OSError:
                # Best effort only: a leftover .part file is never read back.
                pass
            print(f"❌ Error downloading font: {e}")
            print("Falling back to default Helvetica font.")
            # NOTE(review): Helvetica presumably lacks Persian glyphs, so the
            # PDF text may not render as intended -- confirm.
            return 'Helvetica'

    try:
        pdfmetrics.registerFont(TTFont(dl_font_name, font_path))
        print(f"✅ Downloaded font '{dl_font_name}' registered successfully.")
        return dl_font_name
    except Exception as e:
        print(f"❌ Could not register downloaded font: {e}")
        return 'Helvetica'

# Register the font and define styles
# Resolve the font once; every Persian paragraph style below is built on it.
PERSIAN_FONT_NAME = setup_persian_font()

from reportlab.lib.colors import darkred

styles = getSampleStyleSheet()

# Base style: right-aligned body text in the Persian font.
_persian_base_style = ParagraphStyle(
    name='PersianRight',
    alignment=TA_RIGHT,
    fontName=PERSIAN_FONT_NAME,
    fontSize=12,
    leading=20,
)
styles.add(_persian_base_style)

# Title style: inherits from the base style with larger size and leading.
styles.add(ParagraphStyle(name='PersianTitle', parent=_persian_base_style, fontSize=16, leading=24))

# Question style: inherits from the base style, dark red and slightly larger.
styles.add(ParagraphStyle(name='PersianQuestion', parent=_persian_base_style, textColor=darkred, fontSize=13))

print("Font setup complete.")


✅ Local font found at: /home/liamirali/.local/share/fonts/shamsiCalendarFonts/Vazirmatn.ttf
✅ Font 'Vazirmatn' registered successfully.
Font setup complete.


In [3]:
# --- Core Text Processing Logic ---

def process_persian_text(text):
    """
    Prepare a Persian string for rendering in a ReportLab paragraph.

    Steps, in order:
      1. Falsy input yields an empty string.
      2. The input is converted to ``str`` and split on whitespace; the word
         order is reversed and the words are re-joined with single spaces.
      3. The reversed string is passed through ``arabic_reshaper.reshape``.
      4. The reshaped string is passed through the bidi ``get_display``.

    No line breaking is done here; wrapping is left to ReportLab.
    """
    from bidi.algorithm import get_display
    import arabic_reshaper

    if not text:
        return ""

    # split() with no arguments collapses any run of whitespace, so the result
    # always uses single spaces between words.
    tokens = str(text).split()
    flipped = " ".join(reversed(tokens))

    # Reshape first, then run the bidi algorithm on the reshaped string.
    return get_display(arabic_reshaper.reshape(flipped))

# --- Test the text processing ---
# Sample sentence used to eyeball the output of the processing pipeline.
test_text = "این یک متن نمونه برای آزمایش است تا ببینیم آیا به درستی نمایش داده می‌شود یا خیر."
processed_text = process_persian_text(test_text)

# Show the raw input and the processed result one after the other.
print(f"Original Text:\n {test_text}")
print(f"\nProcessed Text for PDF:\n {processed_text}")


Original Text:
 این یک متن نمونه برای آزمایش است تا ببینیم آیا به درستی نمایش داده می‌شود یا خیر.

Processed Text for PDF:
 ﻦﯾﺍ ﮏﯾ ﻦﺘﻣ ﻪﻧﻮﻤﻧ ﯼﺍﺮﺑ ﺶﯾﺎﻣﺯﺁ ﺖﺳﺍ ﺎﺗ ﻢﯿﻨﯿﺒﺑ ﺎﯾﺁ ﻪﺑ ﯽﺘﺳﺭﺩ ﺶﯾﺎﻤﻧ ﻩﺩﺍﺩ ﺩﻮﺷﯽﻣ ﺎﯾ .ﺮﯿﺧ


In [4]:
# --- PDF Generation ---

def _to_pdf_markup(text):
    """Shape `text` with process_persian_text, then escape it for Paragraph markup."""
    from xml.sax.saxutils import escape

    # Escape AFTER shaping: the shaping step rearranges the string, so entities
    # inserted beforehand (e.g. "&amp;") could be scrambled by it.
    return escape(process_persian_text(text))


def create_pdf(jsonl_data, output_path):
    """
    Creates a PDF for a given set of interview data.

    Args:
        jsonl_data: Non-empty sequence of dicts. Each entry provides
            'question' and 'answer'; the first entry's 'persona' dict
            ('name', 'age', 'gender', 'background', 'traits', 'tone') is
            rendered once at the top of the document.
        output_path: Destination path of the PDF file.

    Raises:
        IndexError: If `jsonl_data` is empty.

    All data-derived text is escaped before it is handed to Paragraph, which
    parses XML-like markup; a literal '&', '<' or '>' in a name, question or
    answer would otherwise be read as markup.
    """
    if not jsonl_data:
        raise IndexError("create_pdf requires at least one interview entry")

    doc = SimpleDocTemplate(output_path, pagesize=A4, rightMargin=72, leftMargin=72, topMargin=72, bottomMargin=72)
    story = []

    # Persona Information (once at the top)
    persona = jsonl_data[0]['persona']
    story.append(Paragraph(_to_pdf_markup(f"مصاحبه با: {persona['name']}"), styles['PersianTitle']))
    story.append(Spacer(1, 12))

    # Define persona details as a dictionary to separate labels from values
    persona_details_dict = {
        "نام": persona['name'],
        "سن": persona['age'],
        "جنسیت": 'مرد' if persona['gender'] == 'male' else 'زن',
        "پیشینه": persona['background'],
        "ویژگی‌ها": ', '.join(persona['traits']),
        "لحن": persona['tone']
    }

    # Labels and values are processed separately so the <b> tags added below
    # never pass through the shaping/escaping step.
    for label, value in persona_details_dict.items():
        processed_label = _to_pdf_markup(f"{label}:")
        processed_value = _to_pdf_markup(str(value))

        # The value is placed first and the bolded label second, matching the
        # layout the processed text is written for.
        final_text = f"{processed_value} <b>{processed_label}</b>"

        story.append(Paragraph(final_text, styles['PersianRight']))

    story.append(Spacer(1, 24))

    # Questions and Answers
    for entry in jsonl_data:
        story.append(Paragraph(_to_pdf_markup(f"سوال: {entry['question']}"), styles['PersianQuestion']))
        story.append(Spacer(1, 6))
        story.append(Paragraph(_to_pdf_markup(entry['answer']), styles['PersianRight']))
        story.append(Spacer(1, 18))

    doc.build(story)
    print(f"✅ PDF successfully created at: {output_path}")

# --- Test with a sample file ---
def test_single_file():
    print("\\n--- Testing with a single file ---")
    test_file = "data/synthetic_elder_fa_20250813_003102_gpt-4o_0000.jsonl"
    output_pdf = "test_output.pdf"
    
    with open(test_file, 'r', encoding='utf-8') as f:
        jsonl_data = [json.loads(line) for line in f]
    
    create_pdf(jsonl_data, output_pdf)
    print("--- Single file test complete ---")

# Smoke test: render one sample interview file to a PDF.
test_single_file()


\n--- Testing with a single file ---
✅ PDF successfully created at: test_output.pdf
--- Single file test complete ---


In [5]:
# --- Batch Processing ---

def process_all_files(data_dir="data", output_dir="pdf_output"):
    """
    Processes all .jsonl files in a directory and creates a PDF for each.

    Args:
        data_dir: Directory searched (non-recursively) for ``*.jsonl`` files.
        output_dir: Directory the PDFs are written to; created if missing.

    Each PDF is named after its source file with the extension replaced by
    ``.pdf``. Files are handled in sorted order. A file that is empty, is not
    valid JSON, or fails for any other reason is reported and skipped so the
    rest of the batch still runs.
    """
    os.makedirs(output_dir, exist_ok=True)

    # glob returns matches in arbitrary order; sort for a reproducible run log.
    jsonl_files = sorted(glob.glob(os.path.join(data_dir, "*.jsonl")))

    if not jsonl_files:
        print(f"❌ No .jsonl files found in the '{data_dir}' directory.")
        return

    # A real newline: the previous "\\n" literal printed a backslash and an 'n'.
    print(f"\n--- Starting batch processing of {len(jsonl_files)} files ---")

    for file_path in jsonl_files:
        source_name = os.path.basename(file_path)
        print(f"Processing {source_name}...")

        output_filename = os.path.splitext(source_name)[0] + ".pdf"
        output_path = os.path.join(output_dir, output_filename)

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                # One JSON object per non-blank line.
                jsonl_data = [json.loads(line) for line in f if line.strip()]

            if jsonl_data:
                create_pdf(jsonl_data, output_path)
            else:
                print(f"⚠️ Skipping empty file: {file_path}")

        except json.JSONDecodeError:
            print(f"❌ Error decoding JSON in {file_path}. Skipping.")
        except Exception as e:
            # Deliberately broad: one bad file must not abort the whole batch.
            print(f"❌ An unexpected error occurred with {file_path}: {e}")

    print("--- Batch processing complete ---")

# Run the batch processing: one PDF per .jsonl file in 'data', written to 'pdf_output'.
process_all_files()


\n--- Starting batch processing of 9 files ---
Processing synthetic_elder_fa_20250813_003102_gpt-4o_0000.jsonl...
✅ PDF successfully created at: pdf_output/synthetic_elder_fa_20250813_003102_gpt-4o_0000.pdf
Processing synthetic_elder_fa_20250813_003358_gpt-4o_0000.jsonl...
✅ PDF successfully created at: pdf_output/synthetic_elder_fa_20250813_003358_gpt-4o_0000.pdf
Processing synthetic_elder_fa_20250813_003520_gemini-2.5-pro-preview-06-05_0000.jsonl...
✅ PDF successfully created at: pdf_output/synthetic_elder_fa_20250813_003520_gemini-2.5-pro-preview-06-05_0000.pdf
Processing synthetic_elder_fa_20250813_003520_gpt-4o_0000.jsonl...
✅ PDF successfully created at: pdf_output/synthetic_elder_fa_20250813_003520_gpt-4o_0000.pdf
Processing synthetic_elder_fa_20250813_003520_grok-3_0000.jsonl...
✅ PDF successfully created at: pdf_output/synthetic_elder_fa_20250813_003520_grok-3_0000.pdf
Processing synthetic_elder_fa_20250813_071909_gpt-4o_0000.jsonl...
✅ PDF successfully created at: pdf_output/