In [None]:
#!/usr/bin/env python3
"""
Script to download the OpenWebMath dataset and save it locally
"""

import os
import json
from datasets import load_dataset
from tqdm import tqdm
import pandas as pd

def ensure_dir_exists(directory):
    """Create *directory* (including missing parents) if it doesn't exist.

    A "Created directory" message is printed only when the directory was
    absent before the call.

    Args:
        directory: Path of the directory to create.

    Raises:
        FileExistsError: If the path exists but is not a directory.
        OSError: For any other failure reported by ``os.makedirs``.
    """
    was_present = os.path.isdir(directory)
    # exist_ok=True avoids the check-then-create race: if something else
    # creates the directory between the check above and this call, we no
    # longer fail with FileExistsError.
    os.makedirs(directory, exist_ok=True)
    if not was_present:
        print(f"Created directory: {directory}")

def save_dataset_to_disk(dataset, output_dir, format="parquet"):
    """
    Save every split of the dataset to disk in the specified format.

    For each ``(split_name, split_data)`` pair in ``dataset.items()`` this
    writes three files under ``<output_dir>/<split_name>/``:

    * ``<split_name>.<format>``    - the full split
    * ``<split_name>_sample.json`` - the slice ``split_data[:5]`` for quick
      inspection
    * ``metadata.json``            - example count, column names and the
      string form of the split's features

    Args:
        dataset: Mapping of split name to split object. Each split must
            provide ``to_parquet``/``to_pandas``, ``len()``, slicing,
            ``column_names`` and ``features`` (presumably a Hugging Face
            ``DatasetDict``, as produced by ``load_dataset`` in ``main``).
        output_dir: Root directory for the output; created if missing.
        format: One of ``"parquet"``, ``"json"`` or ``"csv"``.

    Raises:
        ValueError: If ``format`` is not supported. This is checked before
            anything is written or any directory is created.
    """
    # Validate up front so a bad format never leaves half-created output
    # directories behind and is reported even for an empty dataset.
    if format not in ("parquet", "json", "csv"):
        raise ValueError(f"Unsupported format: {format}")

    ensure_dir_exists(output_dir)

    for split_name, split_data in dataset.items():
        split_dir = os.path.join(output_dir, split_name)
        ensure_dir_exists(split_dir)

        # The file extension is the format name for all supported formats.
        file_path = os.path.join(split_dir, f"{split_name}.{format}")
        if format == "parquet":
            # Written directly by the split object, without a pandas copy.
            split_data.to_parquet(file_path)
        elif format == "json":
            # One JSON record per line (JSON Lines layout).
            df = split_data.to_pandas()
            df.to_json(file_path, orient="records", lines=True)
        else:  # "csv" - the only remaining validated format
            df = split_data.to_pandas()
            df.to_csv(file_path, index=False)

        print(f"Saved {split_name} split to {file_path}")

        # Small sample file with just a few examples for quick inspection.
        sample_file_path = os.path.join(split_dir, f"{split_name}_sample.json")
        with open(sample_file_path, "w", encoding="utf-8") as f:
            json.dump(split_data[:5], f, indent=2)
        print(f"Saved sample of {split_name} split to {sample_file_path}")

        # Metadata; features are stringified because the features object
        # itself is not passed to json.dump.
        metadata_file = os.path.join(split_dir, "metadata.json")
        metadata = {
            "num_examples": len(split_data),
            "column_names": split_data.column_names,
            "features": str(split_data.features),
        }
        with open(metadata_file, "w", encoding="utf-8") as f:
            json.dump(metadata, f, indent=2)
        print(f"Saved metadata to {metadata_file}")

def main(output_dir="data"):
    """Download the OpenWebMath dataset and save every split as Parquet.

    Prints a short summary of each split (size, columns and, when the split
    is non-empty, its first example) before writing the files.

    Any exception raised while downloading or saving is caught here and
    reported with a printed message; it is not re-raised.

    Args:
        output_dir: Directory the dataset is written to. Defaults to
            ``"data"``.
    """
    print("Downloading OpenWebMath dataset...")
    try:
        dataset = load_dataset("open-web-math/open-web-math")
        print("Dataset downloaded successfully!")

        # Print dataset information
        print("\nDataset Information:")
        for split_name, split_data in dataset.items():
            num_examples = len(split_data)
            print(f"Split: {split_name}, Examples: {num_examples}")
            print(f"Columns: {split_data.column_names}")
            # Indexing an empty split would raise and abort the save below,
            # so only preview an example when there is one.
            if num_examples:
                print(f"First example: {split_data[0]}")

        # Save the dataset (in parquet format by default for efficiency)
        print("\nSaving dataset to disk...")
        save_dataset_to_disk(dataset, output_dir, format="parquet")

        print("\nDataset saved successfully!")
        print(f"Output directory: {output_dir}")
    except Exception as e:
        # Top-level boundary of the script: report the failure and return.
        print(f"Error downloading or saving the dataset: {e}")

# Script entry point: run the download only when this code is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()

In [None]:
#!/usr/bin/env python3
"""
Memory-efficient script to count samples in a large Parquet file
"""

import os
import pyarrow.parquet as pq
import time

def count_samples(file_path):
    """
    Count samples in a Parquet file using PyArrow (memory efficient).

    The row count and the number of row groups are taken from the file's
    metadata, so the data itself is not loaded. The file size, both counts
    and the elapsed time are printed.

    Args:
        file_path: Path to the Parquet file.

    Returns:
        The number of rows recorded in the file metadata, or ``None`` if
        the file does not exist or any error occurs (the error is printed).
    """
    try:
        # perf_counter is monotonic, so the elapsed time cannot be skewed
        # by system clock adjustments.
        start_time = time.perf_counter()

        # Check if file exists
        if not os.path.exists(file_path):
            print(f"Error: File not found at {file_path}")
            print(f"Absolute path: {os.path.abspath(file_path)}")
            return None

        # Report the size in GB once it reaches 1 GB, otherwise in MB.
        file_size_mb = os.path.getsize(file_path) / (1024 * 1024)
        file_size_gb = file_size_mb / 1024
        if file_size_gb >= 1:
            print(f"File size: {file_size_gb:.2f} GB")
        else:
            print(f"File size: {file_size_mb:.2f} MB")

        # read_metadata returns the file metadata directly, so no
        # ParquetFile object (and its open handle) is left for this
        # function to close.
        print("Reading Parquet metadata...")
        metadata = pq.read_metadata(file_path)
        count = metadata.num_rows
        # Number of row groups (useful for understanding file structure)
        num_row_groups = metadata.num_row_groups

        elapsed_time = time.perf_counter() - start_time

        # Print results
        print(f"\nResults for: {os.path.basename(file_path)}")
        print(f"Total samples: {count:,}")
        print(f"Number of row groups: {num_row_groups}")
        print(f"Time taken: {elapsed_time:.2f} seconds")

        return count

    except Exception as e:
        # Best-effort utility: report the problem and signal failure with
        # None rather than propagating.
        print(f"Error: {e}")
        return None

# Script entry point: count the rows of one Parquet file when this code is
# executed directly.
if __name__ == "__main__":
    # Relative to the current working directory; presumably the "train"
    # split written by the download script's parquet output - confirm.
    file_path = "data/train/train.parquet"
    count_samples(file_path)