In [None]:
# Move the kernel's working directory up one level.
# NOTE(review): the path is relative, so re-running this cell moves up one more level each time.
%cd ..

In [None]:
import pyarrow as pa
import pyarrow.parquet as pq

# Read the Arrow file with the random-access IPC file reader and
# materialize every record batch into a single in-memory table.
table = pa.ipc.open_file('/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4.arrow').read_all()

# Print the schema
print(table.schema)

In [None]:
# Second attempt: memory-map the same file and read it with the IPC *stream*
# reader instead of the file reader, again loading all batches into `table`.
with pa.memory_map('/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4.arrow') as source:
    table = pa.ipc.open_stream(source).read_all()

In [None]:
# The IPC file reader is random-access rather than iterable, so record batches
# are fetched by index using the batch count recorded in the file footer.
with pa.ipc.open_file('/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4.arrow') as reader:
    for batch_index in range(reader.num_record_batches):
        batch = reader.get_batch(batch_index)
        # Process each batch
        print(batch.schema)

In [None]:
def examine_file(file_path):
    """Print a quick diagnostic of the beginning of ``file_path``.

    Shows the first 8 bytes, whether they begin with the ``ARROW1`` magic
    string, and a space-separated hex dump of the first 100 bytes.
    """
    with open(file_path, 'rb') as stream:
        # One read covers both the 8-byte header and the 100-byte dump.
        leading = stream.read(100)

    header = leading[:8]
    print(f"First 8 bytes: {header}")

    # Report whether the header carries the Arrow magic string.
    has_magic = header[:6] == b'ARROW1'
    print(
        "File starts with Arrow magic number"
        if has_magic
        else "File does not start with Arrow magic number"
    )

    # Hex dump, two lowercase digits per byte.
    print("First 100 bytes as hex:")
    hex_pairs = ['{:02x}'.format(value) for value in leading]
    print(' '.join(hex_pairs))

# Inspect the leading bytes of the dataset file under investigation.
file_path = '/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4.arrow'
examine_file(file_path)

In [None]:
import pyarrow as pa
import os

def read_arrow_footer(file_path):
    """Inspect the trailer of an Arrow IPC file and report its footer.

    An Arrow IPC file ends with ``<footer><int32 footer length><b'ARROW1'>``.
    The trailing magic is the bare 6-byte string; only the magic at the start
    of the file is padded to 8 bytes. The function prints the footer length
    and, if the file can be opened by the IPC file reader, the schema and the
    number of record batches recorded in the footer.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        None. All findings are printed.
    """
    magic = b'ARROW1'
    trailer_size = 4 + len(magic)  # int32 footer length + trailing magic

    file_size = os.path.getsize(file_path)
    if file_size < trailer_size:
        # Too short to hold a trailer; also avoids seeking to a negative offset.
        print("File does not end with Arrow magic number")
        return

    with open(file_path, 'rb') as file:
        file.seek(file_size - trailer_size)
        trailer = file.read(trailer_size)

    # Check if the file ends with the Arrow magic number
    if trailer[-len(magic):] != magic:
        print("File does not end with Arrow magic number")
        return

    # The footer length is the little-endian int32 right before the magic
    footer_length = int.from_bytes(trailer[:4], byteorder='little', signed=True)
    print(f"Footer length: {footer_length}")

    if footer_length <= 0 or footer_length > file_size - trailer_size:
        print(f"Error parsing footer: implausible footer length {footer_length}")
        return

    # Let the IPC file reader decode the footer; it exposes the schema and the
    # record batch count that the footer stores.
    try:
        with pa.ipc.open_file(file_path) as reader:
            print(f"Schema: {reader.schema}")
            print(f"Number of record batches: {reader.num_record_batches}")
    except Exception as e:
        print(f"Error parsing footer: {e}")

# Check the trailer/footer of the dataset file under investigation.
file_path = '/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4.arrow'
read_arrow_footer(file_path)

In [None]:
import os

# Report the on-disk size of the dataset file, in bytes and in GiB (1024**3 bytes).
file_path = '/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4.arrow'
file_size = os.path.getsize(file_path)
print(f"File size: {file_size} bytes")
print(f"File size in GB: {file_size / (1024**3):.2f} GB")

In [None]:
import pyarrow as pa
# Show which pyarrow version this kernel is running.
print(pa.__version__)

In [None]:
import pyarrow as pa

def detailed_scan_arrow_file(file_path, chunk_size=1024*1024):  # 1MB chunks
    """Scan a file for IPC continuation markers and try to decode a message at each.

    The file is read in ``chunk_size`` pieces. Every occurrence of the 4-byte
    continuation indicator ``0xFFFFFFFF`` is treated as a candidate message
    start and handed to ``pa.ipc.read_message``. Decoded messages are printed
    and record batch messages are counted. Candidates that fail to decode are
    counted as exceptions, and only the first 10 failures are printed.

    Args:
        file_path: Path of the file to scan.
        chunk_size: Number of bytes read per iteration.

    Returns:
        None. All findings are printed.
    """
    marker = b'\xFF\xFF\xFF\xFF'
    overlap = len(marker) - 1

    with open(file_path, 'rb') as file:
        position = 0          # absolute offset of the current chunk
        carry = b''           # tail of the previous chunk, to catch straddling markers
        record_batch_count = 0
        exception_count = 0
        while True:
            chunk = file.read(chunk_size)
            if not chunk:
                break

            # Prepend the last few bytes of the previous chunk so a marker split
            # across a chunk boundary is still found. The carry is shorter than
            # the marker, so no match is ever reported twice.
            window = carry + chunk
            window_start = position - len(carry)

            # Look for the continuation indicator (0xFFFFFFFF)
            index = window.find(marker)
            while index != -1:
                offset = window_start + index
                try:
                    file.seek(offset)
                    message = pa.ipc.read_message(file)
                    print(f"Found message at position {offset}")
                    print(f"Message type: {message.type}")
                    print(f"Message metadata: {message.metadata}")
                    # pyarrow reports the message type as a plain string
                    if message.type == 'record batch':
                        record_batch_count += 1
                        print(f"Found record batch at position {offset}")
                        # We can't directly access num_rows and body_length, but we can print the metadata
                        print(f"Metadata: {message.metadata}")
                except Exception as e:
                    exception_count += 1
                    if exception_count <= 10:  # Limit the number of printed exceptions
                        print(f"Exception at position {offset}: {str(e)}")

                index = window.find(marker, index + 1)

            position += len(chunk)
            carry = window[-overlap:]
            # The probes above moved the file cursor; put it back at the start of
            # the next chunk so the sequential scan does not skip or repeat data.
            file.seek(position)

        print(f"Total record batches found: {record_batch_count}")
        print(f"Total exceptions encountered: {exception_count}")

# Run the marker-by-marker scan over the dataset file with the default chunk size.
file_path = '/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4.arrow'
detailed_scan_arrow_file(file_path)

In [None]:
def examine_file_ends(file_path, chunk_size=1024):
    """Print hex dumps of the first and last ``chunk_size`` bytes of a file.

    Files shorter than ``chunk_size`` are handled by dumping the whole file for
    both ends instead of seeking before the start of the file.

    Args:
        file_path: Path of the file to inspect.
        chunk_size: Number of bytes to dump from each end. The printed labels
            say "1KB", which matches the default value only.

    Returns:
        None. Both dumps are printed.
    """
    with open(file_path, 'rb') as file:
        # Read the first chunk
        start_chunk = file.read(chunk_size)
        print("First 1KB of the file:")
        print(start_chunk.hex())

        # Read the last chunk, clamping the start so short files do not
        # trigger a seek to a negative offset.
        file.seek(0, 2)  # Seek to the end to learn the file size
        file_size = file.tell()
        file.seek(max(file_size - chunk_size, 0))
        end_chunk = file.read()
        print("\nLast 1KB of the file:")
        print(end_chunk.hex())

# Dump both ends of the dataset file using the default chunk size.
file_path = '/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4.arrow'
examine_file_ends(file_path)

In [None]:
import pyarrow as pa
import struct

def analyze_file_content(file_path, chunk_size=1024*1024*10, num_chunks=100000):
    """Heuristically look for Arrow structures in the raw bytes of a file.

    For each of up to ``num_chunks`` chunks of ``chunk_size`` bytes, prints:
    the chunk-relative offsets of ``ARROW1`` and of the ``0xFFFFFFFF``
    continuation indicator, whether a few expected strings occur, and every
    4-byte-aligned position where a little-endian uint32 length prefix is
    followed by bytes that ``pa.ipc.read_message`` accepts.

    Args:
        file_path: Path of the file to analyze.
        chunk_size: Number of bytes per chunk.
        num_chunks: Maximum number of chunks to analyze.

    Returns:
        None. All findings are printed.
    """
    def find_all(haystack, needle):
        # Offsets of every (possibly overlapping) occurrence of needle.
        hits = []
        idx = haystack.find(needle)
        while idx != -1:
            hits.append(idx)
            idx = haystack.find(needle, idx + 1)
        return hits

    with open(file_path, 'rb') as file:
        position = 0
        for i in range(num_chunks):
            chunk = file.read(chunk_size)
            if not chunk:
                break

            print(f"\nAnalyzing chunk {i+1} at position {position}:")

            # Look for ARROW1 magic number (offsets are relative to this chunk)
            arrow1_positions = find_all(chunk, b'ARROW1')
            if arrow1_positions:
                print(f"Found ARROW1 magic number at positions: {arrow1_positions}")

            # Look for continuation indicator (0xFFFFFFFF)
            cont_positions = find_all(chunk, b'\xFF\xFF\xFF\xFF')
            if cont_positions:
                print(f"Found continuation indicators at positions: {cont_positions}")

            # Look for common strings
            for s in [b'"item"', b'"tokens"', b'"audio"', b'"index"']:
                if s in chunk:
                    print(f"Found {s} string in chunk")

            # Try every aligned uint32 as a length prefix of an encapsulated message.
            # Stopping at len(chunk) - 3 skips a trailing partial word.
            for j in range(0, len(chunk) - 3, 4):
                size = struct.unpack_from('<I', chunk, j)[0]
                if not (24 <= size <= chunk_size and j + 4 + size <= len(chunk)):
                    continue
                try:
                    message = pa.ipc.read_message(chunk[j:j+4+size])
                    print(f"Found valid message at position {position + j}")
                    print(f"Message type: {message.type}")
                    print(f"Message metadata: {message.metadata}")
                except Exception:
                    # Most candidates are arbitrary data, so decode failures are
                    # expected. Only Exception is caught so that Ctrl-C still
                    # interrupts a long scan.
                    pass

            position += len(chunk)

            if i == num_chunks - 1:
                print("\nReached the maximum number of chunks to analyze.")

# Run the heuristic content analysis over the dataset file with the default limits.
file_path = '/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4.arrow'
analyze_file_content(file_path)

In [None]:
import pyarrow as pa
import mmap
import struct

def scan_for_arrow_structures(file_path, chunk_size=50*1024*1024):  # 50MB chunks
    """Summarize Arrow-looking structures in a file, chunk by chunk.

    The file is memory-mapped and searched for the ``ARROW1`` magic string and
    the ``0xFFFFFFFF`` continuation indicator. Each indicator that does not lie
    inside an already decoded message is handed to ``pa.ipc.read_message``
    (through a regular file handle positioned at that offset). Decoded
    messages are counted by type. For every chunk the counts are printed,
    followed by the notable offsets in ascending order.

    Args:
        file_path: Path of the file to scan.
        chunk_size: Size in bytes of each reporting window. Must be positive.

    Returns:
        None. All findings are printed.

    Raises:
        ValueError: If ``chunk_size`` is not positive (the scan could never advance).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive number of bytes")

    magic = b'ARROW1'
    marker = b'\xFF\xFF\xFF\xFF'

    def find_starts(mm, needle, start, end):
        # Offsets in [start, end) where needle begins, overlapping matches
        # included. A match may extend past `end`, hence the widened limit.
        limit = min(end + len(needle) - 1, len(mm))
        hits = []
        idx = mm.find(needle, start, limit)
        while idx != -1:
            hits.append(idx)
            idx = mm.find(needle, idx + 1, limit)
        return hits

    with open(file_path, 'rb') as file:
        # mmap refuses to map an empty file, so report it and stop.
        file.seek(0, 2)
        if file.tell() == 0:
            print("File is empty; nothing to scan")
            return

        with mmap.mmap(file.fileno(), 0, access=mmap.ACCESS_READ) as mm:
            total_size = len(mm)
            chunk_start = 0
            parsed_until = 0  # end offset of the last successfully decoded message

            while chunk_start < total_size:
                chunk_end = min(chunk_start + chunk_size, total_size)

                arrow1_offsets = find_starts(mm, magic, chunk_start, chunk_end)
                marker_offsets = find_starts(mm, marker, chunk_start, chunk_end)

                # Try to decode a message at every indicator outside known messages
                messages = {}  # offset -> message type string
                for offset in marker_offsets:
                    if offset < parsed_until:
                        continue  # bytes inside a message that was already decoded
                    try:
                        file.seek(offset)
                        message = pa.ipc.read_message(file)
                    except Exception:
                        # The indicator bytes also occur in ordinary data, so a
                        # failed decode only means this was not a message start.
                        continue
                    if message is None:
                        continue
                    messages[offset] = message.type
                    # The handle now sits just past the message; skip its contents.
                    parsed_until = max(file.tell(), offset + 1)

                # pyarrow reports message types as lowercase strings
                message_types = list(messages.values())
                schema_count = message_types.count('schema')
                record_batch_count = message_types.count('record batch')

                print(f"Chunk {chunk_start // chunk_size + 1} ({chunk_start / 1000000:.2f}MB - {chunk_end / 1000000:.2f}MB):")
                print(f"  ARROW1 magic numbers found: {len(arrow1_offsets)}")
                print(f"  Continuation indicators found: {len(marker_offsets)}")
                print(f"  Valid messages found: {len(messages)}")
                print(f"    Schema messages: {schema_count}")
                print(f"    Record batch messages: {record_batch_count}")

                if arrow1_offsets or marker_offsets:
                    print("  Notable offsets:")
                    arrow1_set = set(arrow1_offsets)
                    marker_set = set(marker_offsets)
                    for offset in sorted(arrow1_set | marker_set):
                        if offset in arrow1_set:
                            print(f"    ARROW1 at {offset}")
                        if offset in marker_set:
                            print(f"    Continuation indicator at {offset}")
                        if offset in messages:
                            print(f"    Message at {offset}, type: {messages[offset]}")

                print()  # Empty line for readability between chunks

                chunk_start = chunk_end

# Run the memory-mapped structure scan over the dataset file with the default chunk size.
file_path = '/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4.arrow'
scan_for_arrow_structures(file_path)

In [None]:
import pyarrow as pa
import struct

def finalize_arrow_file(input_path, output_path):
    """Rewrite a possibly unterminated Arrow IPC stream as a complete IPC file.

    Record batches are read from ``input_path`` one at a time with the IPC
    stream reader and written to ``output_path`` with the IPC file writer,
    which emits the end-of-stream marker, footer and trailing magic itself.
    Reading stops at the normal end of the stream or at the first batch that
    cannot be decoded (for example because the input was cut short). Every
    batch read before that point is preserved.

    The schema is taken from the input stream rather than hardcoded.

    Args:
        input_path: Path of the stream (or footer-less file) to read.
        output_path: Path of the finalized Arrow IPC file to create.

    Returns:
        None. Progress is printed.
    """
    file_magic = b'ARROW1\x00\x00'
    batches_written = 0

    with open(input_path, 'rb') as source:
        # Per the Arrow format spec, an IPC *file* is the padded magic followed
        # by the stream format, so a file whose footer was never written can be
        # read as a stream once the 8 magic bytes are skipped.
        if source.read(len(file_magic)) != file_magic:
            source.seek(0)

        reader = pa.ipc.open_stream(source)

        with pa.ipc.new_file(output_path, reader.schema) as writer:
            while True:
                try:
                    batch = reader.read_next_batch()
                except StopIteration:
                    break  # clean end of stream
                except (pa.ArrowInvalid, OSError) as e:
                    # Truncated or corrupt tail: keep what was recovered so far.
                    print(f"Stopped reading after {batches_written} record batches: {e}")
                    break
                writer.write_batch(batch)
                batches_written += 1

    print(f"Finalized Arrow file written to {output_path}")
    print(f"Record batches written: {batches_written}")

# Use the function: read the original dataset file and write the result to a
# separate "_test" file, so the input itself is not overwritten.
input_path = '/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4.arrow'
output_path = '/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4_test.arrow'
finalize_arrow_file(input_path, output_path)

In [None]:
import pyarrow as pa

def scan_for_record_batches(file_path, chunk_size=1024*1024):  # 1MB chunks
    """Scan a file for record batch messages and print their position and size.

    The file is read in ``chunk_size`` pieces. Every occurrence of the 4-byte
    continuation indicator ``0xFFFFFFFF`` is treated as a candidate message
    start and handed to ``pa.ipc.read_message``. A schema message, once seen,
    is remembered so that later record batches can be decoded to report their
    row count. Candidates that fail to decode are skipped silently.

    Args:
        file_path: Path of the file to scan.
        chunk_size: Number of bytes read per iteration.

    Returns:
        None. All findings are printed.
    """
    marker = b'\xFF\xFF\xFF\xFF'
    overlap = len(marker) - 1

    with open(file_path, 'rb') as file:
        position = 0   # absolute offset of the current chunk
        carry = b''    # tail of the previous chunk, to catch straddling markers
        schema = None  # most recent schema decoded from the file, if any
        while True:
            chunk = file.read(chunk_size)
            if not chunk:
                break

            # The carry is shorter than the marker, so a match is never reported twice.
            window = carry + chunk
            window_start = position - len(carry)

            # Look for the continuation indicator (0xFFFFFFFF)
            index = window.find(marker)
            while index != -1:
                offset = window_start + index
                try:
                    file.seek(offset)
                    message = pa.ipc.read_message(file)
                    # pyarrow reports the message type as a plain string
                    if message.type == 'schema':
                        schema = pa.ipc.read_schema(message)
                    elif message.type == 'record batch':
                        print(f"Found record batch at position {offset}")
                        if schema is not None:
                            # The row count is only available after decoding the batch
                            num_rows = pa.ipc.read_record_batch(message, schema).num_rows
                            print(f"Number of rows: {num_rows}")
                        else:
                            print("Number of rows: unknown (no schema message seen yet)")
                        body = message.body
                        print(f"Body length: {body.size if body is not None else 0}")
                except Exception:
                    pass  # Ignore errors and continue searching

                index = window.find(marker, index + 1)

            position += len(chunk)
            carry = window[-overlap:]
            # The probes above moved the file cursor; put it back at the start of
            # the next chunk so the sequential scan does not skip or repeat data.
            file.seek(position)

# Run the record batch scan over the dataset file with the default chunk size.
file_path = '/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4.arrow'
scan_for_record_batches(file_path)

In [None]:
import pyarrow as pa

def read_arrow_file_in_chunks(file_path, chunk_size=1024*1024*1):  # 1MB read buffer
    """Lazily yield record batches from an Arrow IPC stream.

    The stream is opened once and batches are decoded one at a time, so memory
    use is bounded by the batch size rather than by the file size. Slicing the
    file into fixed-size byte ranges and opening each as its own stream does
    not work: only the start of the stream carries the schema, and batches do
    not line up with arbitrary byte boundaries.

    Reading stops at the end of the stream, or at the first batch that cannot
    be decoded, in which case the error is printed.

    Args:
        file_path: Path of the Arrow IPC stream to read.
        chunk_size: Size in bytes of the read buffer used for the underlying
            file. Values of 1 or less fall back to the default buffer size.

    Yields:
        The record batches of the stream, in order.
    """
    buffering = chunk_size if chunk_size > 1 else -1
    with open(file_path, 'rb', buffering=buffering) as file:
        try:
            reader = pa.ipc.open_stream(file)
        except Exception as e:
            print(f"Error processing chunk: {e}")
            return

        while True:
            try:
                batch = reader.read_next_batch()
            except StopIteration:
                return  # clean end of stream
            except Exception as e:
                # A decode failure cannot be skipped over: the position of the
                # next message is unknown, so stop here.
                print(f"Error processing chunk: {e}")
                return
            yield batch

file_path = '/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v4.arrow'

# Smoke test: pull batches from the generator and report on the first one only.
# Any error raised while iterating is printed instead of propagating.
try:
    for i, batch in enumerate(read_arrow_file_in_chunks(file_path)):
        print(f"Successfully read batch {i}")
        print(f"Batch schema: {batch.schema}")
        print(f"Batch row count: {batch.num_rows}")
        break  # Remove this to process all batches
except Exception as e:
    print(f"Error: {e}")

# Add Arrow

In [20]:
# Recursively collect all .arrow files inside a directory
import os
from datasets import Dataset

def get_all_arrows_files(directory):
    """Return the paths of all ``.arrow`` files below ``directory``.

    The directory tree is walked recursively. The result is sorted because
    ``os.walk`` yields entries in an order that depends on the filesystem;
    sorting keeps the order the same across runs and machines.

    Args:
        directory: Root directory to search.

    Returns:
        A sorted list of paths, each built by joining the containing directory
        with the file name. Empty if nothing matches or the directory does not
        exist.
    """
    arrow_paths = [
        os.path.join(root, name)
        for root, _dirs, names in os.walk(directory)
        for name in names
        if name.endswith('.arrow')
    ]
    arrow_paths.sort()
    return arrow_paths

In [21]:
# Collect every .arrow file under the voice_parts directory; the trailing
# expression displays how many were found.
all_arrow_files = get_all_arrows_files('/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/voice_parts')
len(all_arrow_files)

277

In [22]:
# Load each Arrow file with Dataset.from_file and collect the results in a list,
# in the same order as `all_arrow_files`.
all_ds = []
for arrow_file in all_arrow_files:
    ds = Dataset.from_file(arrow_file)
    all_ds.append(ds)

In [23]:
from datasets import concatenate_datasets
# Merge the per-file datasets into one.
# NOTE(review): this rebinds `all_ds` from the list built earlier to the merged
# result, so re-running only this cell passes the merged object back in —
# presumably not intended; confirm and re-run the list-building cell first.
all_ds = concatenate_datasets(all_ds)

In [24]:
# Display the length of the concatenated dataset.
len(all_ds)

332367

In [28]:
import os

# Credentials must not be embedded in the notebook: they end up in version
# control and in every shared copy. The token is read from the HF_TOKEN
# environment variable instead; when it is unset, None is passed and the
# library falls back to the locally stored login.
# The access token that used to be hardcoded on this line has been exposed and
# should be revoked.
all_ds.push_to_hub(
    "jan-hq/instruction-speech-v1.5-conversation",
    token=os.environ.get("HF_TOKEN"),
)

Uploading the dataset shards:   0%|          | 0/242 [00:00<?, ?it/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1374 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

Map:   0%|          | 0/1373 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/jan-hq/instruction-speech-v1.5-conversation/commit/bf008f37ce6d3879c3663bdf64da09bb93be4603', commit_message='Upload dataset (part 00004-of-00005)', commit_description='', oid='bf008f37ce6d3879c3663bdf64da09bb93be4603', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
all_indices = sorted(all_ds['index'])
all_indices

In [None]:
len(all_indices)

In [None]:
# Collect every entry of the sorted list whose predecessor is not exactly one
# less, i.e. the first index that follows each gap (duplicates are caught too).
# zip() pairs each element with its successor, so the final pair is compared as
# well; the earlier range(1, len(all_indices)-1) stopped one element short.
# NOTE(review): this records the index found *after* a gap, not the absent
# values themselves - confirm that is the intended set before using it to
# delete files.
missed_indices = [
    current
    for previous, current in zip(all_indices, all_indices[1:])
    if current - previous != 1
]

In [None]:
len(missed_indices)

In [None]:
# Best-effort deletion of the .wav file belonging to each entry of missed_indices.
# Any failure (e.g. the file is already gone) is printed and the loop continues.
import os
for missed_index in missed_indices:
    try:
        print("Remove", f'audio/audio_{missed_index}.wav')
        wav_path = f'/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/audio/audio_{missed_index}.wav'
        os.remove(wav_path)
    except Exception as err:
        print(err)

In [None]:
# Candidate index space: every integer in [0, 700000).
total_indices = range(700000)
# Remaining indices: candidates that do not appear in all_indices.
# The list is built from a set, so its element order is not guaranteed.
remaining_indices = list(set(total_indices) - set(all_indices))
len(remaining_indices)

In [None]:
with open("turn_0_processed.json", "w") as f:
    json.dump(all_indices, f)

In [None]:
len(all_indices)

In [None]:
import json

# Persist the still-unprocessed indices for the next turn.
# The earlier path repeated the data directory and the ".json" suffix
# (".../data//home/.../data/turn_1_processed.json.json"), which looks like a
# paste error; this is the location the turn-1 loader cell reads from.
with open('/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/turn_1_processed.json', 'w') as f:
    json.dump(remaining_indices, f)

## CHECK FILE INTEGRITY

In [None]:
%cd ..

In [None]:
# Play audio from path
import IPython.display as ipd
ipd.Audio(all_ds[20000]['path'])

In [None]:
import IPython.display as ipd

def get_audio_and_prompt(idx, audio_dir="/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/audio"):
    """Print the prompt stored at row ``idx`` of ``all_ds`` and return its wav path.

    ``idx`` is a positional row number in the global ``all_ds``; the file name
    is built from that row's ``"index"`` field rather than from ``idx`` itself.

    Args:
        idx: Row position in ``all_ds``.
        audio_dir: Directory that holds the ``audio_<index>.wav`` files.

    Returns:
        The joined path ``<audio_dir>/audio_<index>.wav`` (existence is not checked).
    """
    file_stem = all_ds[idx]["index"]
    prompt_text = all_ds[idx]["prompt"]
    print("Prompt: ", prompt_text)
    return os.path.join(audio_dir, f"audio_{file_stem}.wav")

def get_decoded_audio_and_prompt(idx, audio_tokenizer):
    """Print the prompt at row ``idx`` of ``all_ds`` and decode that row's tokens.

    Args:
        idx: Row position in the global ``all_ds``.
        audio_tokenizer: Object exposing ``decode(tokens)`` that returns a
            two-item ``(audio, sample_rate)`` result.

    Returns:
        The ``(audio, sample_rate)`` pair produced by ``audio_tokenizer.decode``.
    """
    print("Prompt: ", all_ds[idx]["prompt"])
    waveform, sample_rate = audio_tokenizer.decode(all_ds[idx]["tokens"])
    return waveform, sample_rate
    

In [None]:
ipd.Audio(get_audio_and_prompt(123456), rate=24_000)

In [None]:
from synthetic_data_pipeline import AudioTokenizer

tokenizer = AudioTokenizer(device="cuda:0")

In [None]:
audio, sr = get_decoded_audio_and_prompt(20000, tokenizer)
audio = audio.cpu().numpy()
ipd.Audio(audio, rate=sr)

In [None]:
def transform_batch_tokens(batch):
    """Render each row of token ids as ``<|sound_NNNN|>`` marker strings.

    Args:
        batch: Mapping whose ``'tokens'`` entry is a sequence of rows; each row
            is an iterable of token ids (formatted with ``:04``, so integer ids
            are zero-padded to width 4).

    Returns:
        ``{"sound_tokens": rows}`` where every row is a list of strings: the
        ``<|sound_start|>`` marker, one marker per id, then ``<|sound_end|>``.
    """
    rendered_rows = []
    for id_row in batch['tokens']:
        body = [f"<|sound_{num:04}|>" for num in id_row]
        # Bracket the sequence with explicit start/end sentinels.
        rendered_rows.append(["<|sound_start|>", *body, "<|sound_end|>"])
    return {"sound_tokens": rendered_rows}

transformed_dataset = dataset.map(
    transform_batch_tokens,
    batched=True,
    num_proc=56,
    batch_size=10000,
)

In [None]:
transformed_dataset

In [None]:
transformed_dataset[0]

In [None]:
transformed_dataset.push_to_hub("jan-hq/instruction-speech-no-audio")

In [None]:
import torch
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, get_cosine_schedule_with_warmup
import time
from datasets import Dataset, interleave_datasets
from trl import SFTTrainer
import multiprocessing
from datasets import load_dataset

def formatting_prompts_func(examples):
    """Render every conversation in the batch to plain text with the chat template.

    Uses the global ``tokenizer``; it must provide ``apply_chat_template``.

    Args:
        examples: Mapping with a ``"conversations"`` sequence of message lists.

    Returns:
        ``{"text": [...]}`` with one rendered (untokenized, no generation
        prompt) string per conversation.
    """
    rendered = []
    for convo in examples["conversations"]:
        rendered.append(
            tokenizer.apply_chat_template(convo, tokenize = False, add_generation_prompt = False)
        )
    return { "text" : rendered, }

# import glob
# from datasets import Dataset, concatenate_datasets
# arrow_files = glob.glob('/home/alandao/voice_data_process/audio/instruction-speech-v1/data/data*.arrow')

# dataset = concatenate_datasets([Dataset.from_file(arrow_file) for arrow_file in arrow_files])
dataset = load_dataset("jan-hq/instruction-speech-no-audio", num_proc=64, split="train")

dataset = dataset.select_columns(['prompt', 'answer', 'tokens'])
print(dataset)

def count_tokens(example):
    """Store the length of ``example['tokens']`` under ``'token_count'``.

    The mapping is modified in place and the same object is returned.
    """
    n_tokens = len(example['tokens'])
    example['token_count'] = n_tokens
    return example

dataset = dataset.map(count_tokens, num_proc=64)
print(dataset)

In [None]:
def transform_batch_tokens(batch):
    """Convert each row of token ids in ``batch['tokens']`` to marker strings.

    Every id becomes ``<|sound_NNNN|>`` (``:04`` formatting) and each row is
    wrapped in ``<|sound_start|>`` / ``<|sound_end|>`` sentinels.

    Returns:
        ``{"sound_tokens": rows}`` with one list of marker strings per input row.
    """
    def _wrap(token_ids):
        # Sentinel, then one marker per id, then the closing sentinel.
        markers = ["<|sound_start|>"]
        markers.extend([f"<|sound_{num:04}|>" for num in token_ids])
        markers.append("<|sound_end|>")
        return markers

    return {"sound_tokens": [_wrap(token_ids) for token_ids in batch['tokens']]}

transformed_dataset = dataset.map(
    transform_batch_tokens,
    batched=True,
    num_proc=56,
    batch_size=10000,
)
# Dataset 1
def create_conversations_sound(batch):
    """Build one two-turn chat per row: sound tokens as user, answer as assistant.

    Args:
        batch: Mapping with parallel sequences under ``'sound_tokens'`` (each
            an iterable of strings) and ``'answer'``.

    Returns:
        ``{"sound_convo": [...]}`` holding one ``[user, assistant]`` message
        list per row; the user content is the row's sound tokens concatenated.
    """
    paired_rows = zip(batch['sound_tokens'], batch['answer'])
    return {
        "sound_convo": [
            [
                {"role": "user", "content": "".join(token_strings)},
                {"role": "assistant", "content": reply},
            ]
            for token_strings, reply in paired_rows
        ]
    }


# Apply the transformation to create a new 'conversations' column
transformed_dataset_sound = transformed_dataset.map(
    create_conversations_sound,
    batched=True,
    num_proc=56,
    batch_size=10000,
)

def create_conversations(batch):
    """Build one text-only two-turn chat per row: prompt as user, answer as assistant.

    Args:
        batch: Mapping with parallel ``'prompt'`` and ``'answer'`` sequences.

    Returns:
        ``{"text_convo": [...]}`` holding one ``[user, assistant]`` message
        list per row.
    """
    chats = []
    for user_text, reply in zip(batch['prompt'], batch['answer']):
        chats.append([
            {"role": "user", "content": user_text},
            {"role": "assistant", "content": reply},
        ])
    return {"text_convo": chats}

# Apply the transformation to create a new 'conversations' column
transformed_dataset_sound = transformed_dataset_sound.map(
    create_conversations,
    batched=True,
    num_proc=56,
    batch_size=10000,
)

# Dataset 3
def create_conversations_transcribe(batch):
    """Build one transcription chat per row from its sound tokens and prompt text.

    The user turn asks for a transcription of the concatenated sound tokens;
    the assistant turn answers with the row's ``'prompt'`` text.

    Args:
        batch: Mapping with parallel ``'sound_tokens'`` and ``'prompt'`` sequences.

    Returns:
        ``{"sound_transcribe": [...]}`` holding one ``[user, assistant]``
        message list per row.
    """
    def _as_chat(token_strings, transcript):
        joined = ''.join(token_strings)
        return [
            {"role": "user", "content": f"Transcribe this given sound: {joined}"},
            {"role": "assistant", "content": f"This is a transcription: {transcript}"},
        ]

    rows = zip(batch['sound_tokens'], batch['prompt'])
    return {"sound_transcribe": [_as_chat(tokens, text) for tokens, text in rows]}


# Apply the transformation to create a new 'conversations' column
transformed_dataset_sound = transformed_dataset_sound.map(
    create_conversations_transcribe,
    batched=True,
    num_proc=56,
    batch_size=10000,
)


transformed_dataset_sound = transformed_dataset_sound.remove_columns([col for col in transformed_dataset_sound.column_names if col not in ['text_convo','sound_convo','prompt','answer','sound_transcribe']])
print(transformed_dataset_sound)

In [None]:
transformed_dataset_sound[0]["sound_transcribe"]

In [None]:
transformed_dataset_sound.push_to_hub("jan-hq/instruction-speech-conversation")

# NEW PIPELINE DS TRANSFORM

In [None]:
import json

with open("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/turn_1_processed.json") as f:
    turn_1_processed_data = json.load(f)

In [None]:
import os
import torch

token_dir = "/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens"

# Every "<n>.pt" file in token_dir contributes the integer n.
indices = []
for file in os.listdir(token_dir):
    if file.endswith(".pt"):
        indices.append(int(file.replace(".pt", "")))

In [None]:
type(turn_1_processed_data)

In [None]:
len(indices)

In [None]:
# Entries of turn_1_processed_data that are not present in `indices`.
remaining_indices = list(set(turn_1_processed_data) - set(indices))
# NOTE(review): this overwrites the same turn_1_processed.json that
# turn_1_processed_data was loaded from, replacing its contents with `indices`.
# Reloading the file and re-running this cell would then give an empty
# difference - confirm the overwrite is intended.
with open("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/turn_1_processed.json", "w") as f:
    json.dump(indices, f)

In [None]:
len(remaining_indices)

In [None]:
token_ids = torch.load("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens/2501.pt")

In [None]:
from datasets import load_dataset

dataset = load_dataset("jan-hq/instruction-speech-v1.5", split="train")
# Dataset.select picks rows by position, so this assumes row position i holds
# the example whose "index" field is i - TODO confirm.
sub_set = dataset.select(indices)

In [None]:
type(sub_set[0]['index'])

In [None]:
def create_sound_conversations(batch):
    """Pair each row's sound tokens (user turn) with its answer (assistant turn).

    Args:
        batch: Mapping with parallel ``'sound_tokens'`` (iterables of strings)
            and ``'answer'`` sequences.

    Returns:
        ``{"sound_convo": [...]}`` with one ``[user, assistant]`` message list
        per row; the user content is the concatenation of the row's tokens.
    """
    def _chat(token_strings, reply):
        return [
            {"role": "user", "content": "".join(token_strings)},
            {"role": "assistant", "content": reply},
        ]

    return {
        "sound_convo": [
            _chat(token_strings, reply)
            for token_strings, reply in zip(batch['sound_tokens'], batch['answer'])
        ]
    }

def create_text_conversations(batch):
    """Pair each row's text prompt (user turn) with its answer (assistant turn).

    Args:
        batch: Mapping with parallel ``'prompt'`` and ``'answer'`` sequences.

    Returns:
        ``{"text_convo": [...]}`` with one ``[user, assistant]`` message list
        per row.
    """
    return {
        "text_convo": [
            [
                {"role": "user", "content": user_text},
                {"role": "assistant", "content": reply},
            ]
            for user_text, reply in zip(batch['prompt'], batch['answer'])
        ]
    }

def create_conversations_transcribe(batch):
    """Create a transcription-style chat for every row of the batch.

    User turn: a transcription request followed by the row's concatenated
    sound tokens. Assistant turn: the row's ``'prompt'`` text presented as the
    transcription.

    Args:
        batch: Mapping with parallel ``'sound_tokens'`` and ``'prompt'`` sequences.

    Returns:
        ``{"sound_transcribe": [...]}`` with one ``[user, assistant]`` message
        list per row.
    """
    chats = []
    for token_strings, transcript in zip(batch['sound_tokens'], batch['prompt']):
        request = f"Transcribe this given sound: {''.join(token_strings)}"
        response = f"This is a transcription: {transcript}"
        chats.append([
            {"role": "user", "content": request},
            {"role": "assistant", "content": response},
        ])
    return {"sound_transcribe": chats}

In [None]:
def add_sound_tokens(
    batch,
    token_dir="/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens",
    audio_dir="/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_audio",
):
    """Attach sound-token strings and an audio file path to every row of a batch.

    For each value in ``batch["index"]`` the token ids are loaded from
    ``<token_dir>/<index>.pt`` with ``torch.load`` and rendered as
    ``<|sound_NNNN|>`` markers wrapped in ``<|sound_start|>`` /
    ``<|sound_end|>``. The audio path is only constructed, never opened.

    Args:
        batch: Mapping with an ``"index"`` sequence.
        token_dir: Directory holding the ``<index>.pt`` token files. Defaults
            to the location that was previously hard-coded.
        audio_dir: Directory used to build ``audio_<index>.wav`` paths.
            Defaults to the location that was previously hard-coded.

    Returns:
        ``{"sound_tokens": [...], "audio": [...]}`` with one entry per index.
    """
    sound_tokens = []
    audio = []

    for index in batch["index"]:
        # Assumes each .pt file holds an iterable of integer-like ids
        # (each is formatted with ":04") - TODO confirm against the writer.
        token_ids = torch.load(os.path.join(token_dir, f"{index}.pt"))
        audio_path = os.path.join(audio_dir, f"audio_{index}.wav")
        # Bracket the rendered ids with explicit start/end sentinels.
        tokens = ["<|sound_start|>"] + [f"<|sound_{num:04}|>" for num in token_ids] + ["<|sound_end|>"]
        audio.append(audio_path)
        sound_tokens.append(tokens)

    return {"sound_tokens": sound_tokens, "audio": audio}


In [None]:
converted = sub_set.map(add_sound_tokens, batched=True, num_proc=2)
converted[0]

In [None]:
converted_2 = converted.map(create_sound_conversations, batched=True, num_proc=2)
converted_2[0]

In [None]:
converted_3 = converted_2.map(create_text_conversations, batched=True, num_proc=2)
converted_3[0]

In [None]:
final = converted_3.map(create_conversations_transcribe, batched=True, num_proc=2)
final[0]

In [None]:
from datasets import Audio
audio_dataset = final.cast_column("audio", Audio())
audio_dataset[0]["audio"]

# NEW CODE V2

In [None]:
import csv 

# Map the integer in column 0 of every data row to the raw string in column 1.
processed_ids = {}
with open("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v2.csv", newline='') as csvfile:
    csv_reader = csv.reader(csvfile, delimiter=',', quotechar='"', escapechar="\\")

    # Discard the first row (header); the default keeps an empty file from raising.
    next(csv_reader, None)
    for row in csv_reader:
        processed_ids[int(row[0])] = row[1]

In [None]:
indices = list(processed_ids.keys())
indices[2500]

In [None]:
len(set(indices))

In [None]:
processed_ids[410986]

In [None]:
with open("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/turn_2_processed.json", "w") as f:
    json.dump(indices, f)

In [None]:
len(indices)

In [None]:
import json

# Read the input first and only then open the output for writing, so a missing
# or corrupt turn_2_processed.json cannot leave remaining_indices.json truncated.
with open("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/turn_2_processed.json", "r") as f2:
    turn_2_processed = json.load(f2)

# NOTE(review): if turn_2_processed.json was just written from `indices`, this
# difference is always empty - confirm which list the remaining set should be
# computed from.
with open("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/remaining_indices.json", "w") as f:
    json.dump(list(set(turn_2_processed) - set(indices)), f)

In [None]:
with open("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/remaining_indices.json", "r") as f:
    remaining_indices = json.load(f)

len(remaining_indices)

# New Code V3

In [None]:
import pandas as pd

df = pd.read_csv("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_v3.csv")
df

In [None]:
# get all index from pandas 
indices = df["index"].tolist()

In [None]:
indices

In [None]:
with open("turn_3_processed.json", "w") as f:
    json.dump(indices, f)

In [None]:
len(indices)

In [None]:
import json

remaining_path = "/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/remaining_indices.json"

# Load the previous remaining set BEFORE reopening the file for writing:
# mode "w" truncates the file and yields a write-only handle, so calling
# json.load on it fails after the stored data is already gone.
with open(remaining_path, "r") as f:
    old_remaining = json.load(f)

# NOTE(review): turn_3_processed.json is written with a relative path in an
# earlier cell - confirm it resolves to this absolute location.
with open("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/turn_3_processed.json", "r") as f2:
    turn_3_processed = json.load(f2)

# Whatever was still outstanding and has not been handled in turn 3.
remaining_indices = list(set(old_remaining) - set(turn_3_processed))

with open(remaining_path, "w") as f:
    json.dump(remaining_indices, f)

In [None]:
len(remaining_indices)

In [None]:
# Load the index lists recorded for turns 0-3 from the working directory.
loaded_turns = []
for turn_path in (
    "./turn_0_processed.json",
    "./turn_1_processed.json",
    "./turn_2_processed.json",
    "./turn_3_processed.json",
):
    with open(turn_path, "r") as f:
        loaded_turns.append(json.load(f))
turn_0, turn_1, turn_2, turn_3 = loaded_turns

# Everything in [0, 700000) not covered by an earlier turn is written out for turn 4.
total = range(700000)
with open("turn_4_processed.json", "w") as f:
    json.dump(list(set(total) - set(turn_0) - set(turn_1) - set(turn_2) - set(turn_3)), f)

In [None]:
len(turn_0)

In [None]:
len(turn_1)

In [None]:
len(turn_2)

In [33]:
import pandas as pd

# Load audio_tokens_1.csv from the new_audio_v4_2 folder into df_v2
df_v2 = pd.read_csv("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_audio_v4_2/audio_tokens_1.csv")

# Print the first rows of df_v2
print("DataFrame v2 head:")
print(df_v2.head())

DataFrame v2 head:
    index                                              audio  \
0  148870  [8.571102080168203e-05, 0.00014627441123593599...   
1  148871  [-6.811766797909513e-05, -9.268448047805578e-0...   
2  148872  [-0.0001802546757971868, -0.000301635649520903...   
3  148873  [-7.2003308559942525e-06, -1.5234570128086489e...   
4  148874  [0.00020438485080376267, 6.477542774518952e-05...   

                                              tokens  
0  [62, 913, 408, 913, 738, 601, 491, 687, 583, 5...  
1  [121, 913, 62, 424, 408, 913, 738, 424, 699, 6...  
2  [62, 913, 408, 518, 62, 424, 408, 544, 408, 54...  
3  [62, 913, 62, 424, 62, 424, 62, 424, 408, 544,...  
4  [865, 424, 62, 424, 106, 544, 393, 924, 393, 1...  


In [39]:
import os
import pandas as pd

# Set the path to the folder containing CSV files
folder_path = '/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_audio_v4_2/'

# os.listdir returns names in arbitrary order; sort them so the row order of
# the merged frame is reproducible from run to run.
csv_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.csv'))

# Individual dataframes, one per CSV file, in sorted file-name order
dfs = []

# Read each CSV file and append it to the list
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    df = pd.read_csv(file_path)
    dfs.append(df)

# Concatenate all dataframes into one frame with a fresh 0..n-1 index
merged_df = pd.concat(dfs, ignore_index=True)
merged_df.shape

(39950, 3)

In [38]:
import json

def open_json_file(file_path):
    """Load and return the JSON document stored at ``file_path``.

    Failures are reported on stdout instead of being raised: a missing file,
    invalid JSON, and any other exception each print a message.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The parsed JSON value, or ``None`` when the file could not be read or
        parsed.
    """
    parsed = None
    try:
        with open(file_path, 'r') as handle:
            parsed = json.load(handle)
    except FileNotFoundError:
        print(f"Error: The file {file_path} was not found.")
    except json.JSONDecodeError:
        print(f"Error: The file {file_path} is not a valid JSON file.")
    except Exception as e:
        print(f"An error occurred: {str(e)}")
    return parsed

# Try to read the failed-indices file produced alongside the v4_2 audio CSVs.
file_path = '/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_audio_v4_2/failed_indices_2.json'  # hard-coded input path
json_data = open_json_file(file_path)

# open_json_file returns None on any failure; a file whose content is the JSON
# literal null would also take the failure branch here.
if json_data is not None:
    print("Successfully opened and read the JSON file.")
    print("Contents:")
    print(json.dumps(json_data, indent=2))  # indent=2 for readable output
else:
    print("Failed to open or read the JSON file.")

Error: The file /home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_audio_v4_2/failed_indices_2.json is not a valid JSON file.
Failed to open or read the JSON file.
