In [None]:
from whisperspeech.pipeline import Pipeline

class TTSProcessor:
    def __init__(self, num_processes_per_gpu, num_gpus):
        """TTS Processor to generate TTS from prompts."""
        self.num_processes_per_gpu = num_processes_per_gpu
        self.num_gpus = num_gpus

    def distribute_pipe(self):
        """Distribute the pipeline."""
        self.pipes = []
        for gpu_id in range(self.num_gpus):
            for pipe_id in range(self.num_processes_per_gpu):
                self.pipes.append(Pipeline(s2a_ref="collabora/whisperspeech:s2a-q4-tiny-en+pl.model", ))


In [8]:
import torch

# random float32 tensor with normal distribution
x = torch.randn(10, 2, dtype=torch.float32).numpy()

In [25]:
import pyarrow as pa

# convert to pyarrow tensor
x_pa = pa.Tensor.from_numpy(x)
x_pa

<pyarrow.Tensor>
type: float
shape: (10, 2)
strides: (8, 4)

In [10]:
x_pa

<pyarrow.Tensor>
type: float
shape: (10, 2)
strides: (8, 4)

In [36]:
schema = pa.schema([pa.field("feature", pa.list_(pa.float32())), pa.field("label", pa.list_(pa.float32()))])

In [31]:
import torchaudio


tensor, sr = torchaudio.load("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_audio/audio_2499.wav")

In [32]:
tensor.squeeze(0).numpy()

array([-0.00079372, -0.00075542, -0.00076741, ...,  0.00136764,
       -0.00015275, -0.00142147], dtype=float32)

In [44]:
import pyarrow as pa
import torchaudio

# Define the schema
schema = pa.schema([
    pa.field("feature", pa.list_(pa.float32())),
    pa.field("label", pa.list_(pa.float32()))
])

# Load the audio file
tensor, sr = torchaudio.load("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_audio/audio_2499.wav")

In [53]:
import pyarrow as pa
import torchaudio

# Define the schema
schema = pa.schema([
    pa.field("feature", pa.list_(pa.float32())),
    pa.field("label", pa.list_(pa.float32()))
])

# Load an audio file
tensor, sr = torchaudio.load("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_audio/audio_2499.wav")
print(f"Loaded audio file with sample rate: {sr}")

# Open the file sink for writing
with pa.OSFile("data.arrow", "wb") as sink:
    writer = pa.ipc.RecordBatchFileWriter(sink, schema)
    print("Opened data.arrow for writing.")

    # Prepare the data as pyarrow arrays
    feature_array = pa.array([tensor.squeeze(0).numpy()], type=pa.list_(pa.float32()))
    label_array = pa.array([tensor.squeeze(0).numpy()], type=pa.list_(pa.float32()))
    print("Prepared pyarrow arrays for feature and label.")

    # Create a RecordBatch from the arrays
    batch = pa.RecordBatch.from_arrays([feature_array, label_array], schema)
    print("Created RecordBatch with schema:", schema)

    # Write the batch to the file
    writer.write_batch(batch)
    print("Wrote RecordBatch to data.arrow.")

    # Close the writer explicitly
    writer.close()
    print("Closed the writer.")


Loaded audio file with sample rate: 24000
Opened data.arrow for writing.
Prepared pyarrow arrays for feature and label.
Created RecordBatch with schema: feature: list<item: float>
  child 0, item: float
label: list<item: float>
  child 0, item: float
Wrote RecordBatch to data.arrow.
Closed the writer.


  batch = pa.RecordBatch.from_arrays([feature_array, label_array], schema)


In [2]:
import pyarrow as pa

# Open the Arrow file for reading
with pa.memory_map("/home/root/Workspace/synthetic_data_generation/sound_instruct_llama3/data/new_tokens_test.arrow", "r") as source:
    reader = pa.ipc.RecordBatchFileReader(source)
    print("Opened data.arrow for reading.")

    # Read the first RecordBatch from the file
    batch = reader.get_batch(0)

Opened data.arrow for reading.


In [7]:
len(batch['index'].to_pylist())

30

In [8]:
batch

pyarrow.RecordBatch
index: int64
audio: list<item: float>
  child 0, item: float
tokens: list<item: int64>
  child 0, item: int64
----
index: [39,26,13,39,26,13,0,26,27,13,...,29,40,29,0,40,14,30,41,1,30]
audio: [[-0.00029094573,-0.00034573989,0.000022817841,-0.00027235653,-0.00031429573,-0.00039280177,-0.00034648518,-0.00032828614,-0.00035278645,-0.000373924,...,-0.0025422256,-0.0010895681,0.0007264769,0.0010857449,-0.00083532487,-0.003013576,-0.0022790248,0.0010084701,0.0033206274,0.0029433377],[0.0025704913,0.0025136126,0.0014571494,-0.00037038358,-0.0007443017,-0.0006073907,-0.0004519045,-0.0005722932,-0.0007481215,-0.00013342877,...,-0.0000245303,-0.00002758873,-0.000046347614,-0.000029870982,-0.000030304207,-0.00005873055,-0.000045055498,-0.000027642718,-0.000021178023,-0.000029912522],...,[-0.000015874664,-0.00003266528,-0.00007935162,-0.00008547738,-0.00006313591,-0.00009888231,-0.00009041828,-0.000036883597,-0.00005379759,-0.00008361159,...,-0.00001311769,-0.000062267834,-0.00

In [9]:
len(batch['audio'].to_pylist())

30

In [None]:
audio