In [None]:
import nltk
from nltk.tokenize import sent_tokenize
import csv
from collections import defaultdict
import json
import re
import os
import time
from datetime import datetime
from openai import OpenAI
from langchain.prompts import PromptTemplate
from bs4 import BeautifulSoup


# ==============================================================================
# 1. UTILITY FUNCTIONS
# ==============================================================================

class OpenAIWrapper:
    """Bundle an OpenAI client with the model settings used for requests."""

    def __init__(self, client, model_name="gpt-5", temperature=0.1):
        # Plain attribute storage; the client is kept as given and not called here.
        self.client, self.model_name, self.temperature = client, model_name, temperature

    def get_config(self):
        """Return the model name, temperature and fixed system prompt as a dict."""
        config = dict(model=self.model_name, temperature=self.temperature)
        config["system_prompt"] = "You are a Talmud scholar expert in translation."
        return config

def daf_amud_generator():
    """Yield Daf/Amud references endlessly: '2a', '2b', '3a', '3b', ..."""
    page = 2  # numbering starts at daf 2
    while True:
        # Each daf has two sides, emitted in order before moving to the next daf.
        for side in ('a', 'b'):
            yield f"{page}{side}"
        page += 1

def strip_html(text):
    """Parse *text* as HTML and return only its text content.

    The text is extracted with BeautifulSoup's ``get_text`` using a single
    space as separator and ``strip=True``.
    """
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text(separator=" ", strip=True)

# Placeholder — actual cleaning is done during retrieval
def clean_translation_output(text):
    """Return *text* with leading and trailing whitespace removed."""
    trimmed = text.strip()
    return trimmed


# ==============================================================================
# 2. BATCH PREPARATION LOGIC (Top Level Function)
# ==============================================================================

def prepare_batch_file(full_tractate_data, promt_template, batch_file_path, model_name,
                       book_identifier, is_test, temperature=0.1,
                       max_completion_tokens=1024,
                       system_prompt="You are a Talmud scholar expert in translation."):
    """
    Prepares a .jsonl file for the OpenAI Batch API.

    One chat-completion request is written per non-empty Hebrew line.

    Args:
        full_tractate_data: dict with 'hebrew' and 'english' lists of pages.
            A page is either a list of lines or a single string.
        promt_template: string containing a ``{text}`` placeholder; it is
            filled with the HTML-stripped Hebrew line via ``str.format``.
        batch_file_path: destination path of the .jsonl file (its directory
            must already exist).
        model_name: model name placed in every request body.
        book_identifier: The clean name of the tractate (e.g., 'Bekhorot').
        is_test: when true, only the first two pages are processed.
        temperature: sampling temperature placed in every request body.
        max_completion_tokens: completion token cap placed in every request body.
        system_prompt: content of the system message in every request.

    Returns: total number of requests generated, full batch file path.
        On any error the pair ``(0, None)`` is returned after printing the error.
    """
    all_hebrew_pages = full_tractate_data.get('hebrew', [])
    all_english_pages = full_tractate_data.get('english', [])
    # NOTE: zip() stops at the shorter list, so Hebrew pages without a matching
    # English page are not submitted. The English text itself is not used here.
    passages_to_process = list(zip(all_hebrew_pages, all_english_pages))

    if is_test:
        passages_to_process = passages_to_process[:2]

    daf_amud_gen = daf_amud_generator()
    batch_requests = []

    try:
        for hebrew_page, _ in passages_to_process:
            # Advance once per page (even an empty one) so references stay aligned.
            daf_amud_key = next(daf_amud_gen)

            if isinstance(hebrew_page, str):
                hebrew_page = [hebrew_page]

            for line_idx, hebrew_line in enumerate(hebrew_page):
                hebrew_line_clean = strip_html(hebrew_line)

                # Blank lines are skipped, but line_idx still counts them, so the
                # custom_id keeps pointing at the line's position in the source page.
                if not hebrew_line_clean.strip():
                    continue

                user_content = promt_template.format(text=hebrew_line_clean)

                # --- Custom ID Format: [BOOK]_[DAF_AMUD]_[LINE_IDX] ---
                # Example: 'Bekhorot_2a_1'
                custom_id = f"{book_identifier}_{daf_amud_key}_{line_idx+1}"

                batch_requests.append({
                    "custom_id": custom_id,
                    "method": "POST",
                    "url": "/v1/chat/completions",
                    "body": {
                        "model": model_name,
                        "messages": [
                            {"role": "system", "content": system_prompt},
                            {"role": "user", "content": user_content}
                        ],
                        "temperature": temperature,
                        "max_completion_tokens": max_completion_tokens
                    }
                })

        # Write all requests to the .jsonl file (one JSON object per line)
        with open(batch_file_path, 'w', encoding='utf-8') as f:
            for req in batch_requests:
                f.write(json.dumps(req) + '\n')

        requests_generated = len(batch_requests)
        print(f"  > Prepared {requests_generated} requests.")
        return requests_generated, batch_file_path

    except Exception as e:
        print(f"Error preparing batch file: {e}")
        return 0, None


# ==============================================================================
# 3. BATCH SUBMISSION LOGIC (Top Level Function)
# ==============================================================================

def analyze_translations_batch_submit(
        llm,
        promt_template,
        file_name_prefix,
        input_file_path,
        is_test=True):
    """
    Build a batch input file for one tractate, upload it and create a batch job.

    Args:
        llm: object exposing ``model_name`` and an OpenAI ``client``.
        promt_template: prompt string with a ``{text}`` placeholder, passed on
            to ``prepare_batch_file``.
        file_name_prefix: prefix for the generated .jsonl file name.
        input_file_path: path of the tractate JSON file; its base name (without
            extension) is used as the book identifier.
        is_test: passed on to ``prepare_batch_file``.

    Returns:
        The batch job ID, or None if loading, preparation, upload or job
        creation failed (the error is printed).
    """
    # 1. Determine file names and identifiers
    base_file_name = os.path.splitext(os.path.basename(input_file_path))[0]
    starttime = datetime.now().strftime('%Y%m%d_%H%M%S')
    batch_dir = "batch_input_files"
    os.makedirs(batch_dir, exist_ok=True)
    batch_file_path = f"{batch_dir}/{file_name_prefix}{base_file_name}_{starttime}.jsonl"

    model_name = llm.model_name
    book_identifier = base_file_name  # e.g., 'Bekhorot'

    try:
        with open(input_file_path, 'r', encoding='utf-8') as f:
            full_tractate_data = json.load(f)
    except Exception as e:
        print(f"  > Error loading input data: {e}")
        return None

    # 2. PREPARE INPUT FILE (Calls the helper function)
    requests_generated, input_path = prepare_batch_file(
        full_tractate_data,
        promt_template,
        batch_file_path,
        model_name,
        book_identifier,
        is_test
    )

    if requests_generated == 0:
        print("  > No requests generated. Aborting batch submission.")
        return None

    # 3. UPLOAD FILE
    try:
        print(f"  > Uploading file: {os.path.basename(input_path)}")
        # Context manager so the handle is closed whether or not the upload succeeds.
        with open(input_path, "rb") as batch_fh:
            batch_input_file = llm.client.files.create(
                file=batch_fh,
                purpose="batch"
            )
        print(f"  > File uploaded successfully. File ID: {batch_input_file.id}")

    except Exception as e:
        print(f"  > Error uploading file to OpenAI: {e}")
        return None

    # 4. CREATE BATCH JOB
    try:
        batch_job = llm.client.batches.create(
            input_file_id=batch_input_file.id,
            endpoint="/v1/chat/completions",
            completion_window="24h"
        )
        job_id = batch_job.id
    except Exception as e:
        print(f"  > Error creating batch job: {e}")
        return None

    # Reporting is best-effort: once the job exists its ID must reach the caller,
    # so a problem while printing details (e.g. missing request counts) must not
    # turn a successful submission into a None result.
    try:
        print(f"  > BATCH JOB SUBMITTED:")
        print(f"    Job ID: {job_id}")
        print(f"    Status: {batch_job.status}")
        request_counts = getattr(batch_job, "request_counts", None)
        total_requests = getattr(request_counts, "total", "unknown")
        print(f"    Total Requests: {total_requests}")
    except Exception as e:
        print(f"  > Warning: could not report batch job details: {e}")

    return job_id


# ==============================================================================
# 4. MAIN EXECUTION BLOCK (Orchestration)
# ==============================================================================

if __name__ == "__main__":
    # Submit one OpenAI batch job per tractate JSON file found in INPUT_DIR and
    # save the resulting job IDs to a metadata file for the retrieval script.

    INPUT_DIR = './talmud_output'

    # Ensure VADER is downloaded.
    # nltk.data.find() raises LookupError when a resource is missing; that is the
    # exception to catch here (nltk.downloader has no DownloadError class, so
    # naming it would itself fail with AttributeError when the lookup misses).
    try:
        nltk.data.find('sentiment/vader_lexicon.zip')
    except LookupError:
        nltk.download('vader_lexicon')

    # Instantiate model client
    client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))
    llm = OpenAIWrapper(client, model_name="gpt-3.5-turbo")

    prompt = PromptTemplate(
                     input_variables=["text"],
                     template="""You are a Talmud scholar translating a tractate of the Talmud that contains both Hebrew and Aramaic.
                         Translate the following text into English word-for-word, maintaining the original style and format of Talmudic discourse.
                         Keep names and technical terms transliterated.
                         Preserve any quotes from Biblical verses and translate them.
                         Do not add any commentary or explanations. Your readers are also Talmud Scholars and do not require any Notes.

                         Talmudic text: {text}

                         English translation:"""
                 )
    # Only the raw template string is needed; it is filled with str.format later.
    promt_template = prompt.template

    # Maps input file name -> {"job_id", "input_file_path"} for successful submissions.
    submitted_jobs = {}

    # Collect the JSON files up front (directories and other files are skipped)
    # so the loop knows whether another tractate follows the current one.
    json_files = [
        name for name in os.listdir(INPUT_DIR)
        if name.endswith('.json') and not os.path.isdir(os.path.join(INPUT_DIR, name))
    ]

    for position, filename in enumerate(json_files, start=1):
        input_file_path = os.path.join(INPUT_DIR, filename)

        print(f"\n--- Processing {filename} ---")

        try:
            job_id = analyze_translations_batch_submit(
                llm,
                promt_template,
                file_name_prefix="gpt-3.5_batch_",
                input_file_path=input_file_path,
                is_test=False
            )

            if job_id:
                submitted_jobs[os.path.basename(input_file_path)] = {
                    "job_id": job_id,
                    "input_file_path": input_file_path
                }

            # Pause only between tractates; after the last one there is nothing
            # left to wait for.
            if position < len(json_files):
                pause_time_seconds = 600
                print(f"\n--- PAUSING for {pause_time_seconds / 60} minutes (10 min) before next tractate... ---")
                time.sleep(pause_time_seconds)

        except Exception as e:
            print(f"An unexpected error occurred while processing {filename}: {str(e)}")

    # --- FINAL STEP: Save Job IDs ---
    if submitted_jobs:
        print("\n--- All submissions complete. Saving Job IDs for retrieval. ---")

        metadata_dir = "batch_metadata"
        os.makedirs(metadata_dir, exist_ok=True)

        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        metadata_file = os.path.join(metadata_dir, f"submitted_jobs_metadata_{timestamp}.json")

        final_metadata = {
            "submission_time": timestamp,
            "model_config": llm.get_config(),
            "jobs": submitted_jobs
        }

        try:
            # Explicit encoding so the file does not depend on the platform default.
            with open(metadata_file, 'w', encoding='utf-8') as f:
                json.dump(final_metadata, f, indent=4)
            print(f"âœ… SUCCESS: Job IDs saved to: {metadata_file}")
            print("\nREMINDER: Wait 1-24 hours, then run the retrieval script.")
        except Exception as e:
            print(f"FATAL ERROR: Could not save metadata file! Error: {e}")
    else:
        print("\nNo jobs were successfully submitted.")

In [None]:
import pandas as pd
import os
from glob import glob

def combine_csv_files(folder_path, output_filename="combined_data.csv"):
    """
    Combines all CSV files within a specified folder into a single CSV file.

    The combined file is written to the current working directory. If that
    file already sits inside ``folder_path`` (e.g. from an earlier run with
    ``folder_path="."``) it is excluded from the inputs, so re-running does
    not duplicate rows. Input files are processed in sorted path order.

    Args:
        folder_path (str): The path to the folder containing the CSV files.
        output_filename (str): The name of the resulting combined CSV file.

    Returns:
        None. Progress and errors are reported with print().
    """
    output_path = os.path.join(os.getcwd(), output_filename)

    def _canonical(path):
        # Resolve symlinks / relative parts and normalise case for comparison.
        return os.path.normcase(os.path.realpath(path))

    output_key = _canonical(output_path)

    # 1. Find the input files, leaving out a previously written output file.
    #    Sorting makes the row order of the result deterministic.
    search_pattern = os.path.join(folder_path, "*.csv")
    csv_files = sorted(
        path for path in glob(search_pattern)
        if _canonical(path) != output_key
    )

    # Check if any CSV files were found
    if not csv_files:
        print(f"No CSV files found in: {folder_path}")
        return

    print(f"Found {len(csv_files)} CSV files to combine.")

    # 2. Read and store all dataframes; an unreadable file is reported and skipped.
    all_data = []
    for csv_path in csv_files:
        try:
            # Read the CSV file. We assume they all have the same columns.
            # You might need to adjust parameters like 'encoding' or 'sep'
            # if your files are not standard UTF-8/comma-separated.
            df = pd.read_csv(csv_path)
        except Exception as e:
            print(f"  - ERROR reading {os.path.basename(csv_path)}: {e}")
            continue
        all_data.append(df)
        print(f"  - Successfully read: {os.path.basename(csv_path)} ({len(df)} rows)")

    if not all_data:
        print("No data was successfully read to combine.")
        return

    # 3. Concatenate all dataframes and 4. write the result
    try:
        # pd.concat stacks the DataFrames on top of each other
        combined_df = pd.concat(all_data, ignore_index=True)
        combined_df.to_csv(output_path, index=False)

        print("\n--- COMBINATION COMPLETE ---")
        print(f"Total rows in combined file: {len(combined_df)}")
        print(f"Output saved to: {output_path}")

    except Exception as e:
        print(f"\n--- ERROR during combination/saving ---")
        print(f"An error occurred: {e}")

# --- USAGE INSTRUCTIONS ---
# Point `folder_to_process` at the directory that holds the CSV files, e.g.:
#   "."                                      -> same folder as this Jupyter notebook
#   "data"                                   -> a subfolder named 'data'
#   "/Users/username/Desktop/my_csv_files"   -> an absolute path (use your own)

# >>> Set your folder path here <<<
folder_to_process = "analysis_output"

# Merge every CSV found in that folder into one output file.
combine_csv_files(
    folder_path=folder_to_process,
    output_filename="all_combined_gpt35_translations.csv",
)