In [None]:
!pip install --upgrade pip

!pip install --upgrade huggingface_hub

!pip install git+https://github.com/huggingface/transformers.git accelerate

!pip install torch torchvision torchaudio

!pip install "sagemaker>=2.69.0" "transformers==4.12.3" --upgrade
# using older dataset due to incompatibility of sagemaker notebook & aws-cli with > s3fs and fsspec to >= 2021.10
!pip install  "datasets==1.13" --upgrade

In [None]:
# Authenticate this notebook session with the Hugging Face Hub.
# NOTE(review): presumably needed for gated checkpoints loaded further down
# (e.g. the pyannote diarization pipeline) — confirm.
from huggingface_hub import notebook_login
notebook_login()

In [4]:
import datetime
import json
import os

import torch
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# Inference is pinned to CPU in full precision. The commented lines show how to
# switch to GPU / half precision when CUDA is available.
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
torch_dtype = torch.float32

model_id = "openai/whisper-large-v3"

# Load the speech-to-text model weights and move them to the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    # cache_dir="/Volumes/BACKUP/Coding/HUGGING_FACE/models"
)
model.to(device)

# The processor bundles the tokenizer and the audio feature extractor.
processor = AutoProcessor.from_pretrained(model_id)

# NOTE(review): Whisper's decoder context is limited (448 positions for the
# released checkpoints); newer transformers releases may reject
# max_new_tokens=1000 — confirm against the installed version.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=1000,
    chunk_length_s=30,
    # batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "cantonese"}
)


# this is the place you modify your input - the name of the mp3 file you want to run
result = pipe("source/trimmed_sample.mp3")

# Write the response to a JSON file named after the current date and time.
# ensure_ascii=False keeps the Chinese characters readable in the file instead
# of \uXXXX escapes, and the file is written explicitly as UTF-8 so the result
# does not depend on the platform's default encoding.
now = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
json_object = json.dumps(result, indent=4, ensure_ascii=False)
os.makedirs("output", exist_ok=True)  # a fresh checkout may not have the folder yet
with open('output/'+now+".json", "w", encoding="utf-8") as f:
    f.write(json_object)

# also print the result in the following output block
print(result)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'text': ' 其實都有佢嘅價值 可能會係同身份認同有關係 又或者可能佢會帶動到一個地方嘅文化旅遊 佢隱藏住同埋佢對於社會創造緊嘅價值 其實都係好重要嘅元素 史丹頓街一帶嘅唐樓活化工程已經完成 而市建局亦都話嚟緊會引入共同租住單位嘅共居模式 希望嚟時呢一度 就可以變成一個充滿文化特色 同活力嘅社區', 'chunks': [{'timestamp': (0.0, 1.4), 'text': ' 其實都有佢嘅價值'}, {'timestamp': (1.4, 4.76), 'text': ' 可能會係同身份認同有關係'}, {'timestamp': (4.76, 8.08), 'text': ' 又或者可能佢會帶動到一個地方嘅文化旅遊'}, {'timestamp': (8.08, 12.88), 'text': ' 佢隱藏住同埋佢對於社會創造緊嘅價值'}, {'timestamp': (12.88, 15.52), 'text': ' 其實都係好重要嘅元素'}, {'timestamp': (19.0, 22.6), 'text': ' 史丹頓街一帶嘅唐樓活化工程已經完成'}, {'timestamp': (22.6, 26.92), 'text': ' 而市建局亦都話嚟緊會引入共同租住單位嘅共居模式'}, {'timestamp': (26.92, 28.16), 'text': ' 希望嚟時呢一度'}, {'timestamp': (28.16, 30.08), 'text': ' 就可以變成一個充滿文化特色'}, {'timestamp': (30.08, 31.32), 'text': ' 同活力嘅社區'}]}


In [5]:
import json

import pandas as pd
from IPython.display import display

# Turn the per-chunk entries of the transcription result into a table with one
# row per chunk (timestamp and text), then render it with the rich display.
df = pd.json_normalize(result, record_path="chunks")
display(df)

Unnamed: 0,timestamp,text
0,"(0.0, 1.4)",其實都有佢嘅價值
1,"(1.4, 4.76)",可能會係同身份認同有關係
2,"(4.76, 8.08)",又或者可能佢會帶動到一個地方嘅文化旅遊
3,"(8.08, 12.88)",佢隱藏住同埋佢對於社會創造緊嘅價值
4,"(12.88, 15.52)",其實都係好重要嘅元素
5,"(19.0, 22.6)",史丹頓街一帶嘅唐樓活化工程已經完成
6,"(22.6, 26.92)",而市建局亦都話嚟緊會引入共同租住單位嘅共居模式
7,"(26.92, 28.16)",希望嚟時呢一度
8,"(28.16, 30.08)",就可以變成一個充滿文化特色
9,"(30.08, 31.32)",同活力嘅社區


In [None]:
# ALTERNATIVE - a version which uses an assistant model for the transcription
# !pip install "tokenizers>=0.14,<0.15"

import datetime
import json

import torch
from transformers import pipeline, AutoModelForCausalLM, AutoModelForSpeechSeq2Seq, AutoProcessor


# Prefer Apple's Metal backend (the original hard-coded choice), but fall back
# to CUDA or CPU so the cell also runs on machines without MPS.
if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
torch_dtype = torch.float32

# The assistant must match the main model's generation: distil-large-v3 is the
# Distil-Whisper checkpoint published for use with whisper-large-v3.
# distil-large-v2 targets whisper-large-v2 and is not a match here.
assistant_model_id = "distil-whisper/distil-large-v3"
assistant_model = AutoModelForCausalLM.from_pretrained(
    assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
assistant_model.to(device)

model_id = "openai/whisper-large-v3"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id, torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True
)
model.to(device)

# The processor bundles the tokenizer and the audio feature extractor.
processor = AutoProcessor.from_pretrained(model_id)


# NOTE(review): the Distil-Whisper checkpoints are documented as English-only,
# so assisted decoding is expected to help little for other languages —
# confirm before enabling the commented-out language override below.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=1,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
    # generate_kwargs={"assistant_model": assistant_model, "language": "cantonese"}
    generate_kwargs={"assistant_model": assistant_model}
)



In [None]:
import os

# this is the place you modify your input - the name of the mp3 file you want to run
result = pipe("source/sample.mp3")

# Write the response to a JSON file named after the current date and time.
# ensure_ascii=False keeps non-ASCII transcript text readable in the file
# instead of \uXXXX escapes, and the file is written explicitly as UTF-8 so the
# result does not depend on the platform's default encoding.
now = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
json_object = json.dumps(result, indent=4, ensure_ascii=False)
os.makedirs("whisper-transcript", exist_ok=True)  # the folder may not exist yet
with open('whisper-transcript/'+now+".json", "w", encoding="utf-8") as f:
    f.write(json_object)

# also print the result in the following output block
print(result)

In [None]:
!pip install pyannote.audio

In [None]:
import datetime
import os

from pyannote.audio import Pipeline

# Never hard-code the Hugging Face access token in the notebook. Read it from
# the HF_TOKEN environment variable when set; otherwise pass True so the token
# cached by a previous Hugging Face login is used.
hf_token = os.environ.get("HF_TOKEN")

# Named `diarization_pipeline` so it does not shadow the `pipeline` factory
# that the transcription cells import from transformers.
diarization_pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=hf_token if hf_token else True,
)

diarization = diarization_pipeline("source/OpenAIKeynote.mp3")

# dump the diarization output to disk using RTTM format
now = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
os.makedirs("whisper-transcript", exist_ok=True)  # the folder may not exist yet
with open('whisper-transcript/'+now+"_transcript.rttm", "w") as rttm:
    diarization.write_rttm(rttm)


In [None]:
# NOTE(review): looks like a leftover kernel sanity check — consider removing.
print(1 + 1)