In [1]:
! pip install -Uqq transformers datasets

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m61.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m469.0/469.0 KB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m70.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m21.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m22.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 KB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [7]:
import argparse
import base64
import os
import tempfile

import torch
from transformers import (
    AutoFeatureExtractor,
    AutoTokenizer,
    WhisperForConditionalGeneration,
    WhisperProcessor,
    pipeline,
)

In [3]:
# Checkpoint id for the "base" variant; a later cell assigns model_dir again.
model_dir = "parambharat/whisper-base-ml"

In [22]:
# Checkpoint id that Whisper.__init__ picks up through the notebook-level
# model_dir name.
model_dir = "thennal/whisper-medium-ml"

In [23]:
class Whisper:
    """Speech-to-text wrapper around a Whisper checkpoint.

    The instance keeps the tokenizer, feature extractor, processor and model
    in memory; ``predict_raw`` accepts a dict carrying base64-encoded audio and
    returns the output of the automatic-speech-recognition pipeline.
    """

    def __init__(self, model_name_or_path=None):
        """Load every artefact needed for transcription.

        Args:
            model_name_or_path: Checkpoint identifier or directory handed to
                the ``from_pretrained`` calls. When omitted, the notebook-level
                ``model_dir`` variable is used, which is what a plain
                ``Whisper()`` call has always done.
        """
        if model_name_or_path is None:
            model_name_or_path = model_dir

        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
        self.feature_extractor = AutoFeatureExtractor.from_pretrained(model_name_or_path)
        self.processor = WhisperProcessor.from_pretrained(model_name_or_path)
        # Prefer the GPU when one is visible to torch.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = WhisperForConditionalGeneration.from_pretrained(model_name_or_path).to(self.device)

    def predict_raw(self, payload):
        """Transcribe the audio carried in ``payload``.

        Args:
            payload: Mapping with a ``"wav_base64"`` key holding the audio file
                as a base64 string, and optionally a ``"language"`` key that is
                forwarded to ``processor.get_decoder_prompt_ids``.

        Returns:
            The pipeline result on success. ``{"inputerror": <message>}`` when
            the payload is missing or malformed. ``{"error": <message>}`` when
            any exception is raised while decoding or transcribing.
        """
        try:
            if payload is None:
                return {"inputerror": "JSON expected"}

            if "wav_base64" not in payload:
                return {"inputerror": "Missing key wav_base64 in payload."}

            afs = payload["wav_base64"]
            if not isinstance(afs, str):
                return {"inputerror": "Audio file to passed as input in base64 format"}

            lang = payload.get("language")
            # Echo the requested language (None when the caller gave none).
            print(lang)
            audio_bytes = base64.b64decode(afs)

            # The pipeline takes a CUDA device index, or -1 for CPU.
            dno = torch.cuda.current_device() if self.device == "cuda" else -1
            pipe = pipeline(
                task="automatic-speech-recognition",
                model=self.model,
                tokenizer=self.tokenizer,
                feature_extractor=self.feature_extractor,
                framework="pt",
                chunk_length_s=30,
                generate_kwargs={"max_new_tokens": 1024},
                device=dno,
            )

            config = self.model.config
            previous_ids = getattr(config, "forced_decoder_ids", None)

            with tempfile.NamedTemporaryFile() as a_file:
                a_file.write(audio_bytes)
                # The pipeline re-opens the file by name, so the bytes must
                # leave Python's write buffer first; otherwise a short clip is
                # seen as an empty or truncated file.
                a_file.flush()

                if lang:
                    config.forced_decoder_ids = self.processor.get_decoder_prompt_ids(
                        task="transcribe", language=lang
                    )
                try:
                    return pipe(a_file.name)
                finally:
                    # Undo the per-request language override so it does not
                    # leak into later calls that omit "language".
                    if lang:
                        config.forced_decoder_ids = previous_ids

        except Exception as error:
            # Best-effort contract: report the failure to the caller as data.
            return {"error": str(error)}

In [24]:
# Build the wrapper: loads the tokenizer, feature extractor, processor and
# model weights for the checkpoint named by model_dir.
model = Whisper()

Downloading (…)okenizer_config.json:   0%|          | 0.00/830 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/494k [00:00<?, ?B/s]

Downloading (…)main/normalizer.json:   0%|          | 0.00/52.7k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.11k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

Downloading (…)rocessor_config.json:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/3.06G [00:00<?, ?B/s]

In [8]:
from datasets import load_dataset
from datasets import get_dataset_split_names


In [9]:
# Load the "thennal/msc" dataset; the result is indexed by split name below.
dataset = load_dataset("thennal/msc")

Downloading readme:   0%|          | 0.00/1.57k [00:00<?, ?B/s]

Downloading and preparing dataset None/None to /root/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/219M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/204M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/1541 [00:00<?, ? examples/s]

Dataset parquet downloaded and prepared to /root/.cache/huggingface/datasets/thennal___parquet/thennal--msc-cc9d10989b2ac4bd/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [10]:
# Convert the train split to a pandas DataFrame and display its shape.
train_df = dataset["train"].to_pandas()
train_df.shape

(1541, 8)

In [15]:

def convert_to_base64(file):
    """Return the contents of ``file`` as a base64-encoded text string.

    Args:
        file: Path of the file to read; it is opened in binary mode.

    Returns:
        The base64 encoding of the file's bytes, decoded as UTF-8 text.
    """
    with open(file, "rb") as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, "utf-8")

In [18]:
# Base64-encode the sample recording so it can be placed in a request payload.
b64 = convert_to_base64("/content/007a1f8b-03e8-41d9-929c-b40873b8b229.webm")


In [25]:
%%time
# Request dict in the shape predict_raw expects. No "language" key is given,
# so payload.get("language") yields None inside the call.
payload = {"wav_base64": b64}
model.predict_raw(payload)

None
CPU times: user 3min 20s, sys: 544 ms, total: 3min 21s
Wall time: 3min 22s


{'text': 'വേറെ വിശേഷമൊന്നുമില്ല'}