In [1]:
%%sh
# Print the current working directory; the relative data path used later is presumably resolved against it.
pwd

/home/ubuntu/whisper-dev/whisper-stream/development/notebooks


In [2]:
%load_ext autoreload

In [3]:
import jax

# Show how many devices JAX can see and the hardware kind of the first one.
jax.device_count(), jax.devices()[0].device_kind

(1, 'NVIDIA A10G')

In [4]:
from whisper_stream.core.helpers.data_loading import load_data_samples_from_path
from whisper_stream.projects.jax_pipelines import (
    JAXStreamingPipeline,
)
from whisper_stream.projects.jax_pipelines.constants import (
    JAXValidDtypesMapping,
    JAXScalarDType,
)

from whisper_stream.core.constants import WhisperValidCheckpoints, WhisperValidTasks

from whisper_stream.core.logger import LogLevelNames
from pathlib import Path
from time import time

%autoreload 2

In [5]:
# --- Model configuration ---
checkpoint: WhisperValidCheckpoints = "openai/whisper-large-v2"
# The model dtype is looked up by name in the project's dtype mapping.
model_dtype: JAXScalarDType = JAXValidDtypesMapping["BFLOAT16"]
log_level: LogLevelNames = "INFO"

# --- Decoding options ---
task: WhisperValidTasks = "transcribe"
language: str = "english"
return_timestamps: bool = True
batch_size: int = 1

# Sample audio directory, given relative to the working directory.
data_directory = Path("../../data")

# Keyword options reused for `initialize_pipeline` and for each pipeline call.
run_opts = dict(
    batch_size=batch_size,
    return_timestamps=return_timestamps,
    language=language,
    task=task,
)

# --- Build the pipeline object ---
pipeline = JAXStreamingPipeline(
    checkpoint=checkpoint,
    dtype=model_dtype,
    batch_size=batch_size,
    min_log_level=log_level,
)

In [6]:
# Load the two sample recordings from `data_directory` (binary_mode=True; annotated as raw bytes).
pipeline_data: bytes = load_data_samples_from_path(
    "audio_2.mp3", directory=data_directory, binary_mode=True
)  # short clip, ~4 s
pipeline_data_large: bytes = load_data_samples_from_path(
    "tryst.mp3", directory=data_directory, binary_mode=True
)  # long recording; original note reads "4:44s", presumably 4 min 44 s

In [8]:
# Initialize and warm up the pipeline with the same options that the inference calls use.
# NOTE(review): the effect of `use_experimental_cache=True` is defined by the library — confirm before relying on it.
%time pipeline.initialize_pipeline(**run_opts, use_experimental_cache=True)

event="Initializing openai/whisper-large-v2/<class 'jax.numpy.bfloat16'> pipeline" name='pipeline' version='0.0.5' python_version='3.11.5' platform_architecture="('64bit', 'ELF')" application='whisper_stream' level='info' timestamp='2023-09-12 17:33:08'




event="Compiling openai/whisper-large-v2/<class 'jax.numpy.bfloat16'> pipeline" name='pipeline' version='0.0.5' python_version='3.11.5' platform_architecture="('64bit', 'ELF')" application='whisper_stream' level='info' timestamp='2023-09-12 17:33:08'


event='Compilation done in 0.20s' name='pipeline' version='0.0.5' python_version='3.11.5' platform_architecture="('64bit', 'ELF')" application='whisper_stream' level='info' timestamp='2023-09-12 17:33:09'
CPU times: user 201 ms, sys: 67 µs, total: 201 ms
Wall time: 199 ms


In [19]:
# Sanity check after warm-up: transcribe the short clip once. The wall time should be
# close to that of the "small data" cell, which makes the identical call.
%time list(pipeline(pipeline_data, **run_opts))

event='ffmpeg conversion' time_taken='0.088s' name='pipeline' num_items=1 version='0.0.5' python_version='3.11.5' platform_architecture="('64bit', 'ELF')" application='whisper_stream' level='info' timestamp='2023-09-12 17:35:03'


CPU times: user 478 ms, sys: 558 ms, total: 1.04 s
Wall time: 444 ms


[[{'text': ' I know all the players of cricket.',
   'chunks': [{'timestamp': (0.0, 2.8),
     'text': ' I know all the players of cricket.'}]}]]

In [16]:
# Small data: one short clip (`pipeline_data`); the results are collected into a list and timed.
%time list(pipeline(pipeline_data, **run_opts))

event='ffmpeg conversion' time_taken='0.09s' name='pipeline' num_items=1 version='0.0.5' python_version='3.11.5' platform_architecture="('64bit', 'ELF')" application='whisper_stream' level='info' timestamp='2023-09-12 17:33:50'


CPU times: user 478 ms, sys: 563 ms, total: 1.04 s
Wall time: 447 ms


[[{'text': ' I know all the players of cricket.',
   'chunks': [{'timestamp': (0.0, 2.8),
     'text': ' I know all the players of cricket.'}]}]]

In [15]:
# Small data in a batch: the same short clip repeated 10 times in a single call.
# Note that `run_opts` still carries batch_size=1; how inputs are grouped is up to the pipeline.
%time list(pipeline([pipeline_data] * 10, **run_opts))

event='ffmpeg conversion' time_taken='0.3s' name='pipeline' num_items=10 version='0.0.5' python_version='3.11.5' platform_architecture="('64bit', 'ELF')" application='whisper_stream' level='info' timestamp='2023-09-12 17:33:43'
CPU times: user 4.95 s, sys: 5.42 s, total: 10.4 s
Wall time: 3.87 s


[[{'text': ' I know all the players of cricket.',
   'chunks': [{'timestamp': (0.0, 2.8),
     'text': ' I know all the players of cricket.'}]}],
 [{'text': ' I know all the players of cricket.',
   'chunks': [{'timestamp': (0.0, 2.8),
     'text': ' I know all the players of cricket.'}]}],
 [{'text': ' I know all the players of cricket.',
   'chunks': [{'timestamp': (0.0, 2.8),
     'text': ' I know all the players of cricket.'}]}],
 [{'text': ' I know all the players of cricket.',
   'chunks': [{'timestamp': (0.0, 2.8),
     'text': ' I know all the players of cricket.'}]}],
 [{'text': ' I know all the players of cricket.',
   'chunks': [{'timestamp': (0.0, 2.8),
     'text': ' I know all the players of cricket.'}]}],
 [{'text': ' I know all the players of cricket.',
   'chunks': [{'timestamp': (0.0, 2.8),
     'text': ' I know all the players of cricket.'}]}],
 [{'text': ' I know all the players of cricket.',
   'chunks': [{'timestamp': (0.0, 2.8),
     'text': ' I know all the play

In [19]:
# Chunkable data: one long recording (`pipeline_data_large`), presumably long enough
# for the pipeline to split it into chunks — TODO confirm against the library.
%time list(pipeline(pipeline_data_large, **run_opts))

event='ffmpeg conversion' time_taken='0.61s' name='pipeline' num_items=1 application='whisper_stream' python_version='3.11.5' platform_architecture="('64bit', 'ELF')" version='0.0.5' level='info' timestamp='2023-09-10 23:50:11'
CPU times: user 3.79 s, sys: 8.22 s, total: 12 s
Wall time: 2.38 s


[[{'text': ' long years ago we made a trick with this pinnip and now the time comes when we shall redeem our pledge, not only for in full measure but there is a potential aid. At the stroke of the midnight hour when the world sleeps in the hour when awake to light and freedom. At the stroke of the midnight hour, when the world sleeps, India will awake in light and freedom. A moment comes which comes but rarely in history. When we step out from the world to the new, when an agent, when the throne of a nation, kept out from the world to the new, when an agent, when the soon of a nation, lungs suppressed, fine utterance. It is fitting that at this solemn moment, we take the pledge of dedication to the service of India and her people and to the still larger cause of humanity. her people and to the still larger cause of humanity. At the grown of history India started on her unending quest. And trackless centuries of feud with her spying and the grandeur of her successes and her failures. Wh

In [20]:
# Chunkable data in batches: the long recording repeated 32 times in a single call.
# Note that `run_opts` still carries batch_size=1; how inputs are grouped is up to the pipeline.
%time list(pipeline([pipeline_data_large] * 32, **run_opts))

event='ffmpeg conversion' time_taken='6.1s' name='pipeline' num_items=32 application='whisper_stream' python_version='3.11.5' platform_architecture="('64bit', 'ELF')" version='0.0.5' level='info' timestamp='2023-09-10 23:50:25'
CPU times: user 2min 9s, sys: 4min 31s, total: 6min 40s
Wall time: 1min 12s


[[{'text': ' long years ago we made a trick with this pinnip and now the time comes when we shall redeem our pledge, not only for in full measure but there is a potential aid. At the stroke of the midnight hour when the world sleeps in the hour when awake to light and freedom. At the stroke of the midnight hour, when the world sleeps, India will awake in light and freedom. A moment comes which comes but rarely in history. When we step out from the world to the new, when an agent, when the throne of a nation, kept out from the world to the new, when an agent, when the soon of a nation, lungs suppressed, fine utterance. It is fitting that at this solemn moment, we take the pledge of dedication to the service of India and her people and to the still larger cause of humanity. her people and to the still larger cause of humanity. At the grown of history India started on her unending quest. And trackless centuries of feud with her spying and the grandeur of her successes and her failures. Wh

In [21]:
# One long recording followed by three short clips; that pattern is repeated four times (16 inputs).
mixed_mode_data: list[bytes] = 4 * [pipeline_data_large, *(3 * [pipeline_data])]

In [22]:
# Mixed data, consumed result by result as the pipeline yields them.
# The `smallest` strategy (described as the default, passed explicitly here) is
# expected to deliver the smaller files first, in larger batches.
# `time_taken` is the gap since the previous result (or since the call started,
# for the first one). It is printed in fixed-point seconds: the earlier `:.2`
# spec meant "2 significant digits" and switches to scientific notation once a
# gap reaches 100 s.
start: float = time()
for data in pipeline(mixed_mode_data, strategy="smallest", **run_opts):
    print({"num_items": len(data)})
    print({"data": data, "time_taken": f"{time() - start:.3f}s"})
    print("-" * 40)
    # Restart the clock so the next reading measures only the wait for the next result.
    start = time()

event='ffmpeg conversion' time_taken='1.3s' name='pipeline' num_items=16 application='whisper_stream' python_version='3.11.5' platform_architecture="('64bit', 'ELF')" version='0.0.5' level='info' timestamp='2023-09-10 23:51:43'
{'num_items': 1}
{'data': [{'text': ' I know all the players of cricket.', 'chunks': [{'timestamp': (0.0, 2.72), 'text': ' I know all the players of cricket.'}]}], 'time_taken': '1.4s'}
----------------------------------------
{'num_items': 1}
{'data': [{'text': ' I know all the players of cricket.', 'chunks': [{'timestamp': (0.0, 2.72), 'text': ' I know all the players of cricket.'}]}], 'time_taken': '0.067s'}
----------------------------------------
{'num_items': 1}
{'data': [{'text': ' I know all the players of cricket.', 'chunks': [{'timestamp': (0.0, 2.72), 'text': ' I know all the players of cricket.'}]}], 'time_taken': '0.068s'}
----------------------------------------
{'num_items': 1}
{'data': [{'text': ' I know all the players of cricket.', 'chunks': [{