In [1]:
import os
from datasets import load_dataset, DatasetDict
import pandas as pd # Optional for inspection
import json

In [2]:
# --- 1. Configuration ---
# Hugging Face Hub identifier of the source dataset.
dataset_name = "paraloq/json_data_extraction"

# Split sizing. When both fixed counts are set they are used as-is; the ratio
# applies when either count is None, or as a fallback when the requested
# counts add up to more rows than the dataset holds.
train_split_ratio = 0.9
num_train_samples = 400
num_test_samples = 40

# Destination directory. If Drive is not mounted yet, run these first:
# from google.colab import drive
# drive.mount('/content/drive')
output_base_path = "/content/drive/MyDrive/"
# output_base_path = "./" # Save to current Colab directory

# JSONL file names and their full paths under the destination directory.
output_train_filename = "train_json_extraction.jsonl"
output_test_filename = "test_json_extraction.jsonl"
output_train_filepath, output_test_filepath = (
    os.path.join(output_base_path, jsonl_name)
    for jsonl_name in (output_train_filename, output_test_filename)
)

In [3]:
# --- 2. Load the Dataset ---
# Fetch the dataset named in the configuration cell and bind it to
# `original_dataset` for the cells that follow.
print(f"Loading dataset '{dataset_name}' from Hugging Face...")
try:
    # Request the 'default' configuration and its 'train' split.
    original_dataset = load_dataset(dataset_name, name="default", split="train")
    print(f"Dataset loaded with {len(original_dataset)} examples.")
    # Fewer than 500 rows only produces an advisory message; execution continues.
    if len(original_dataset) < 500:
        print("Warning: Dataset is small. Consider using a larger portion for training.")

except Exception as e:
    # Report the failure, then re-raise so the notebook stops here rather than
    # continuing without `original_dataset`.
    print(f"Error loading dataset: {e}")
    raise

Loading dataset 'paraloq/json_data_extraction' from Hugging Face...


README.md: 0.00B [00:00, ?B/s]

data.parquet:   0%|          | 0.00/967k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/484 [00:00<?, ? examples/s]

Dataset loaded with 484 examples.


In [4]:
# --- 3. Split the Dataset ---
# Percentages go through a format spec so float arithmetic on the ratio cannot
# leak into the message (1 - 0.9 would otherwise print as 9.999999999999998).
print(f"Splitting dataset ({train_split_ratio:.1%} train / {1 - train_split_ratio:.1%} test)...")

# Shuffle with a fixed seed before slicing so the split is reproducible.
shuffled_dataset = original_dataset.shuffle(seed=42)

if num_train_samples is None or num_test_samples is None:
    # Ratio mode: the first `train_split_ratio` share of the shuffled rows is
    # the training set, everything after it is the test set.
    split_point = int(len(shuffled_dataset) * train_split_ratio)
    train_sample = shuffled_dataset.select(range(split_point))
    test_sample = shuffled_dataset.select(range(split_point, len(shuffled_dataset)))
else:
    # Fixed-count mode: take the requested number of rows for each split.
    total_needed = num_train_samples + num_test_samples
    if total_needed > len(shuffled_dataset):
        # Not enough rows for the requested counts: fall back to a ratio split
        # over the whole dataset.
        print(f"Warning: Requested {total_needed} samples, but dataset only has {len(shuffled_dataset)}. Using all data.")
        num_train_samples = int(len(shuffled_dataset) * train_split_ratio)
        num_test_samples = len(shuffled_dataset) - num_train_samples

    # The two slices are adjacent and non-overlapping in the shuffled order.
    train_sample = shuffled_dataset.select(range(num_train_samples))
    test_sample = shuffled_dataset.select(range(num_train_samples, num_train_samples + num_test_samples))


print(f"Training set size: {len(train_sample)}")
print(f"Test set size: {len(test_sample)}")

Splitting dataset (90.0% train / 9.999999999999998% test)...
Training set size: 400
Test set size: 40


In [9]:
# --- 4. Define Formatting Function (Crucial Step) ---
def format_json_extraction_example(example):
    """
    Build one instruction-tuning record (prompt + completion) from a raw row.

    Args:
        example: Mapping with the keys 'text' (source passage, str), 'schema'
            (the JSON schema for this row, str) and 'item' (ground-truth
            extraction, either a dict or a JSON-encoded string).

    Returns:
        {"text": <formatted string>} on success, or {"text": None} when 'item'
        cannot be parsed or serialized as JSON, so the caller can filter the
        row out afterwards.
    """
    instruction = "Extract information from the following text based on the provided JSON schema. Output ONLY the valid JSON object."
    input_text = example['text'].strip()
    schema_text = example['schema'].strip()  # The schema is provided per example

    # Round-trip the ground truth through the json module so the completion is
    # guaranteed to be valid, compact JSON.
    try:
        item = example['item']
        ground_truth_json_obj = item if isinstance(item, dict) else json.loads(item)
        # ensure_ascii=False keeps non-ASCII characters literal instead of
        # \uXXXX escapes, so the target matches how the same characters appear
        # in the prompt text.
        ground_truth_json_str = json.dumps(
            ground_truth_json_obj,
            separators=(',', ':'),
            ensure_ascii=False,
        )
    except (json.JSONDecodeError, TypeError) as e:
        print(f"Warning: Skipping example due to invalid ground truth JSON: {e} - Data: {example['item']}")
        return {"text": None}  # Sentinel; rows with None are filtered out after mapping

    # Prompt (instruction, schema, text) sits between [INST] and [/INST];
    # the expected JSON follows as the completion.
    formatted_string = (
        f"<s>[INST] {instruction}\n\n"
        f"Schema:\n```json\n{schema_text}\n```\n\n"
        f"Text:\n'''\n{input_text}\n'''\n\n[/INST] "
        f"{ground_truth_json_str} </s>"
    )

    return {"text": formatted_string}

In [10]:
# --- 5. Apply Formatting ---
def _format_split(split):
    """Map a raw split to the single 'text' column and drop rows whose formatting failed."""
    mapped = split.map(
        format_json_extraction_example,
        remove_columns=split.column_names,
    )
    # Rows that could not be formatted carry text=None; discard them.
    return mapped.filter(lambda row: row['text'] is not None)


print("Applying formatting to train and test sets...")
formatted_train_dataset = _format_split(train_sample)
formatted_test_dataset = _format_split(test_sample)

print(f"Formatting complete. Final train size: {len(formatted_train_dataset)}, Final test size: {len(formatted_test_dataset)}")

Applying formatting to train and test sets...


Map:   0%|          | 0/400 [00:00<?, ? examples/s]

Filter:   0%|          | 0/400 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40 [00:00<?, ? examples/s]

Formatting complete. Final train size: 400, Final test size: 40


In [13]:
# --- 6. Save to JSONL Files ---
print(f"Saving formatted training data to: {output_train_filepath}")
try:
    # One JSON object per line; force_ascii=False keeps non-ASCII text readable.
    formatted_train_dataset.to_json(
        output_train_filepath,
        lines=True,
        force_ascii=False
    )
    print(f"Successfully saved {len(formatted_train_dataset)} training records.")
except Exception as e:
    print(f"Error saving training file: {e}")
    raise


def prepare_test_for_eval(example):
    """
    Build the evaluation fields for one raw test row.

    Args:
        example: Raw dataset row with the keys 'text', 'schema' and 'item'
            ('item' may be a dict or a JSON-encoded string).

    Returns:
        Dict with 'formatted_prompt' (the prompt up to and including the
        closing [/INST] marker, without the ground-truth completion),
        'ground_truth_json' (compact JSON string) and 'schema'. The first two
        are None when the ground truth cannot be parsed, so the row can be
        filtered out.
    """
    try:
        # Ensure the ground truth is a JSON object/dict, then serialize it to a
        # string so every row stores the same column type.
        item = example['item']
        gt_obj = json.loads(item) if isinstance(item, str) else item
        gt_json_str = json.dumps(gt_obj, separators=(',', ':'))

        formatted_text = format_json_extraction_example(example)['text']
        if formatted_text is None:
            prompt_only = None
        else:
            # The formatted training text ends with the expected answer and
            # "</s>". Keep only the part up to the last [/INST] so the
            # evaluation prompt does not leak the ground truth to the model.
            head, marker, _completion = formatted_text.rpartition("[/INST]")
            prompt_only = head + marker
        return {"formatted_prompt": prompt_only, "ground_truth_json": gt_json_str, "schema": example['schema']}
    except (json.JSONDecodeError, TypeError):
        # Return a dictionary with None values for consistency if parsing fails
        return {"formatted_prompt": None, "ground_truth_json": None, "schema": example['schema']}


print(f"Saving formatted test data to: {output_test_filepath}")
try:
    # The evaluation file needs the prompt, the 'schema' and the ground truth
    # (taken from the 'item' column). map() is called without remove_columns,
    # so the original columns are kept alongside the new ones.
    evaluation_test_dataset = test_sample.map(prepare_test_for_eval).filter(
        lambda x: x['formatted_prompt'] is not None  # Drop rows that failed formatting or parsing
    )

    evaluation_test_dataset.to_json(
        output_test_filepath, # Use the test filename
        lines=True,
        force_ascii=False
    )
    print(f"Successfully saved {len(evaluation_test_dataset)} test records (prepared for evaluation).")
except Exception as e:
    print(f"Error saving test file: {e}")
    raise

Saving formatted training data to: /content/drive/MyDrive/train_json_extraction.jsonl


Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Successfully saved 400 training records.
Saving formatted test data to: /content/drive/MyDrive/test_json_extraction.jsonl


Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Filter:   0%|          | 0/40 [00:00<?, ? examples/s]

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Successfully saved 40 test records (prepared for evaluation).


In [14]:
# --- 7. Confirmation ---
print("\nData preparation complete.")
# Optional: Check first line of train file
!head -n 1 {output_train_filepath}
# Optional: Check first line of test file
!head -n 1 {output_test_filepath}


Data preparation complete.
{"text":"<s>[INST] Extract information from the following text based on the provided JSON schema. Output ONLY the valid JSON object.\n\nSchema:\n```json\n{\"$schema\": \"http:\/\/json-schema.org\/draft-07\/schema#\", \"$defs\": {\"Party\": {\"type\": \"object\", \"properties\": {\"partyID\": {\"type\": \"number\"}, \"name\": {\"type\": \"string\"}, \"email\": {\"type\": \"string\", \"format\": \"email\"}, \"phone\": {\"type\": \"string\", \"pattern\": \"^\\\\d{3}-\\\\d{3}-\\\\d{4}$\"}, \"address\": {\"type\": \"string\"}, \"city\": {\"type\": \"string\"}, \"state\": {\"type\": \"string\", \"maxLength\": 2}, \"zip\": {\"type\": \"string\", \"pattern\": \"^\\\\d{5}$\"}}, \"required\": [\"partyID\", \"name\", \"email\", \"phone\", \"address\", \"city\", \"state\", \"zip\"]}, \"Product\": {\"type\": \"object\", \"properties\": {\"productID\": {\"type\": \"number\"}, \"name\": {\"type\": \"string\", \"minLength\": 3}, \"description\": {\"type\": \"string\"}, \"pr