# JSONL to Parquet Conversion

This notebook converts the normalized JSONL dataset into multiple split parquet files for efficient storage and faster loading.

In [7]:
import json
import os
from pathlib import Path
from typing import Optional

import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm

## Configuration

In [8]:
# Input JSONL file
INPUT_JSONL = "/home/larcanio/AIMO3_v2/data/datasets/Dataset_Full/bucketed/dataset_normalized.jsonl"

# Output directory for parquet files
OUTPUT_DIR = "/home/larcanio/AIMO3_v2/data/datasets/Dataset_Full/bucketed/parquet"

# Number of rows per parquet file (adjust based on memory and file size preferences)
ROWS_PER_FILE = 10000

# Compression codec: 'snappy', 'gzip', 'brotli', 'lz4', 'zstd', or None
COMPRESSION = 'snappy'

# Create output directory if it doesn't exist
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

print(f"Input file: {INPUT_JSONL}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Rows per file: {ROWS_PER_FILE}")
print(f"Compression: {COMPRESSION}")

Input file: /home/larcanio/AIMO3_v2/data/datasets/Dataset_Full/bucketed/dataset_normalized.jsonl
Output directory: /home/larcanio/AIMO3_v2/data/datasets/Dataset_Full/bucketed/parquet
Rows per file: 10000
Compression: snappy


## Load and Inspect JSONL Data

In [9]:
# Count total lines in JSONL file
with open(INPUT_JSONL, 'r') as f:
    total_lines = sum(1 for _ in f)

print(f"Total records in JSONL: {total_lines:,}")

Total records in JSONL: 71,798


In [10]:
# Load first record to inspect schema
with open(INPUT_JSONL, 'r') as f:
    first_record = json.loads(f.readline())

print("First record schema:")
for key, value in first_record.items():
    print(f"  {key}: {type(value).__name__} = {str(value)[:100]}")

First record schema:
  id: str = b13a2f24-fb66-43f5-867c-f09ecf6653ff
  source_license: str = MIT
  timestamp: str = 2026-02-06T20:39:18.551213
  domain: str = algebra
  level: int = 2
  text: str = Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How m
  solution: str = Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether
  expected_answer: str = 72
  objects: list = ['positive_integer']
  constraints: list = ['equality']
  reasoning_depth: str = shallow
  technique_transitions: int = 0
  reasoning_scope: str = local
  intermediate_reuse: str = none
  code: str = a = 48
b = a // 2
total = a + b
print(total)
  code_attempts: int = 1
  code_runtime_ms: int = 20
  code_generated_tokens: int = 21
  code_predicted_correct_answer: bool = True


## Convert JSONL to Multiple Parquet Files

We'll read the JSONL in chunks and write each chunk to a separate parquet file.

In [11]:
def read_jsonl_in_chunks(file_path: str, chunk_size: int):
    """
    Generator that yields chunks of records from a JSONL file.
    
    Args:
        file_path: Path to the JSONL file
        chunk_size: Number of records per chunk
        
    Yields:
        List of dictionaries (one chunk at a time)
    """
    chunk = []
    with open(file_path, 'r') as f:
        for line in f:
            chunk.append(json.loads(line.strip()))
            if len(chunk) >= chunk_size:
                yield chunk
                chunk = []
        
        # Yield remaining records
        if chunk:
            yield chunk

In [12]:
# Convert JSONL to parquet files
file_count = 0
total_records_written = 0

print("Converting JSONL to Parquet files...\n")

for chunk_idx, chunk in enumerate(tqdm(
    read_jsonl_in_chunks(INPUT_JSONL, ROWS_PER_FILE),
    total=(total_lines + ROWS_PER_FILE - 1) // ROWS_PER_FILE,
    desc="Processing chunks"
)):
    # Convert chunk to DataFrame
    df_chunk = pd.DataFrame(chunk)
    
    # Fix: Replace string 'null' with actual None/NaN values
    # This handles cases where JSON contains "null" as a string instead of null
    df_chunk = df_chunk.replace('null', None)
    
    # Generate output filename with zero-padded index
    output_file = os.path.join(OUTPUT_DIR, f"part_{chunk_idx:05d}.parquet")
    
    # Write to parquet
    df_chunk.to_parquet(
        output_file,
        engine='pyarrow',
        compression=COMPRESSION,
        index=False
    )
    
    file_count += 1
    total_records_written += len(df_chunk)

print(f"\n✓ Conversion complete!")
print(f"  Files created: {file_count}")
print(f"  Total records written: {total_records_written:,}")
print(f"  Output directory: {OUTPUT_DIR}")

Converting JSONL to Parquet files...



Processing chunks: 100%|██████████| 8/8 [00:13<00:00,  1.67s/it]


✓ Conversion complete!
  Files created: 8
  Total records written: 71,798
  Output directory: /home/larcanio/AIMO3_v2/data/datasets/Dataset_Full/bucketed/parquet





## Verify Parquet Files

In [13]:
# List all parquet files and their sizes
parquet_files = sorted(Path(OUTPUT_DIR).glob("*.parquet"))

print(f"Parquet files in {OUTPUT_DIR}:\n")
total_size_mb = 0

for pf in parquet_files[:5]:  # Show first 5
    size_mb = pf.stat().st_size / (1024 * 1024)
    total_size_mb += size_mb
    print(f"  {pf.name}: {size_mb:.2f} MB")

if len(parquet_files) > 5:
    print(f"  ... and {len(parquet_files) - 5} more files")
    for pf in parquet_files[5:]:
        total_size_mb += pf.stat().st_size / (1024 * 1024)

print(f"\nTotal files: {len(parquet_files)}")
print(f"Total size: {total_size_mb:.2f} MB")

Parquet files in /home/larcanio/AIMO3_v2/data/datasets/Dataset_Full/bucketed/parquet:

  part_00000.parquet: 6.90 MB
  part_00001.parquet: 11.72 MB
  part_00002.parquet: 12.19 MB
  part_00003.parquet: 87.94 MB
  part_00004.parquet: 103.21 MB
  ... and 3 more files

Total files: 8
Total size: 482.17 MB


In [14]:
# Read and verify first parquet file
if parquet_files:
    first_parquet = parquet_files[0]
    df_verify = pd.read_parquet(first_parquet)
    
    print(f"\nVerifying first parquet file: {first_parquet.name}")
    print(f"  Shape: {df_verify.shape}")
    print(f"  Columns: {list(df_verify.columns)}")
    print(f"\nFirst 3 rows:")
    display(df_verify.head(3))


Verifying first parquet file: part_00000.parquet
  Shape: (10000, 19)
  Columns: ['id', 'source_license', 'timestamp', 'domain', 'level', 'text', 'solution', 'expected_answer', 'objects', 'constraints', 'reasoning_depth', 'technique_transitions', 'reasoning_scope', 'intermediate_reuse', 'code', 'code_attempts', 'code_runtime_ms', 'code_generated_tokens', 'code_predicted_correct_answer']

First 3 rows:


Unnamed: 0,id,source_license,timestamp,domain,level,text,solution,expected_answer,objects,constraints,reasoning_depth,technique_transitions,reasoning_scope,intermediate_reuse,code,code_attempts,code_runtime_ms,code_generated_tokens,code_predicted_correct_answer
0,b13a2f24-fb66-43f5-867c-f09ecf6653ff,MIT,2026-02-06T20:39:18.551213,algebra,2.0,Natalia sold clips to 48 of her friends in Apr...,Natalia sold 48/2 = <<48/2=24>>24 clips in May...,72,[positive_integer],[equality],shallow,0.0,local,none,a = 48\nb = a // 2\ntotal = a + b\nprint(total),1,20,21,True
1,a60555cf-33b1-41ea-b7c8-cd8ca6c5329c,MIT,2026-02-06T20:39:18.551924,algebra,2.0,Weng earns $12 an hour for babysitting. Yester...,Weng earns 12/60 = $<<12/60=0.2>>0.2 per minut...,10,"[integer, real]",[equality],medium,0.0,local,single,# Goal: compute earnings for 50 minutes at $12...,1,22,72,True
2,ec181dc0-8cac-4f88-8cdf-c9383f7e569e,MIT,2026-02-06T20:39:18.552459,algebra,2.0,Betty is saving money for a new wallet which c...,"In the beginning, Betty has only 100 / 2 = $<<...",5,[integer],[equality],medium,0.0,local,single,# Goal: compute how much more money Betty need...,1,18,68,False


## Load All Parquet Files as Single DataFrame (Optional)

This demonstrates how to load all the parquet files back into a single DataFrame.

In [15]:
# Load all parquet files into a single DataFrame
# Warning: This may use significant memory for large datasets

# Uncomment to load all files:
# df_full = pd.read_parquet(OUTPUT_DIR)
# print(f"Full dataset shape: {df_full.shape}")
# print(f"Memory usage: {df_full.memory_usage(deep=True).sum() / (1024**2):.2f} MB")

print("To load all parquet files, uncomment the code above.")
print("\nAlternatively, use lazy loading with PyArrow or Dask for large datasets:")
print("  import pyarrow.parquet as pq")
print(f"  table = pq.read_table('{OUTPUT_DIR}')")
print("  df = table.to_pandas()")

To load all parquet files, uncomment the code above.

Alternatively, use lazy loading with PyArrow or Dask for large datasets:
  import pyarrow.parquet as pq
  table = pq.read_table('/home/larcanio/AIMO3_v2/data/datasets/Dataset_Full/bucketed/parquet')
  df = table.to_pandas()


## Statistics and Summary

In [16]:
# Calculate compression ratio by comparing to original JSONL file size
original_size_mb = Path(INPUT_JSONL).stat().st_size / (1024 * 1024)
compression_ratio = (1 - (total_size_mb / original_size_mb)) * 100

print("\n" + "="*60)
print("CONVERSION SUMMARY")
print("="*60)
print(f"Original JSONL file size:  {original_size_mb:.2f} MB")
print(f"Total Parquet files size:  {total_size_mb:.2f} MB")
print(f"Compression ratio:         {compression_ratio:.2f}%")
print(f"Number of parquet files:   {len(parquet_files)}")
print(f"Records per file:          ~{ROWS_PER_FILE:,}")
print(f"Total records:             {total_records_written:,}")
print("="*60)


CONVERSION SUMMARY
Original JSONL file size:  1113.63 MB
Total Parquet files size:  482.17 MB
Compression ratio:         56.70%
Number of parquet files:   8
Records per file:          ~10,000
Total records:             71,798


## Usage Examples

### Reading a single parquet file:
```python
df = pd.read_parquet('path/to/part_00000.parquet')
```

### Reading all parquet files in a directory:
```python
df = pd.read_parquet('path/to/output/directory/')
```

### Reading specific parquet files:
```python
import glob
files = glob.glob('path/to/output/directory/part_*.parquet')
df = pd.concat([pd.read_parquet(f) for f in files])
```

### Using PyArrow for lazy loading:
```python
import pyarrow.parquet as pq
table = pq.read_table('path/to/output/directory')
# Filter before converting to pandas
filtered = table.filter(pc.field('domain') == 'algebra')
df = filtered.to_pandas()
```

### Using Dask for large-scale parallel processing:
```python
import dask.dataframe as dd
ddf = dd.read_parquet('path/to/output/directory/*.parquet')
result = ddf.groupby('domain').size().compute()
```