In [None]:
!pip install --upgrade pip

!pip install --upgrade huggingface_hub

!pip install git+https://github.com/huggingface/transformers.git accelerate

!pip install torch torchvision torchaudio

!pip install "sagemaker>=2.69.0" "transformers==4.12.3" --upgrade
# using older dataset due to incompatibility of sagemaker notebook & aws-cli with > s3fs and fsspec to >= 2021.10
!pip install  "datasets==1.13" --upgrade

In [None]:
# Prompt for a Hugging Face access token inside the notebook.
# NOTE(review): presumably required for the gated pyannote pipeline loaded further
# down — confirm which cells actually need the login.
from huggingface_hub import notebook_login
notebook_login()

In [1]:
import torch
import json
import datetime

from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline

# --- Runtime configuration -------------------------------------------------
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
device = "cpu"
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
torch_dtype = torch.float32

model_id = "openai/whisper-large-v3"

# --- Load the Whisper model and its processor -------------------------------
# Weights and processor files are cached on an external volume (cache_dir).
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    cache_dir="/Volumes/BACKUP/Coding/HUGGING_FACE/models",
)
model.to(device)

processor = AutoProcessor.from_pretrained(
    model_id, cache_dir="/Volumes/BACKUP/Coding/HUGGING_FACE/processor"
)

# --- Build the speech-recognition pipeline ----------------------------------
# The audio is processed in 30 s chunks and timestamps are returned per chunk.
# NOTE(review): max_new_tokens=1000 may exceed what Whisper's decoder accepts in
# newer transformers releases — confirm, and lower it if generation is rejected.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=1000,
    chunk_length_s=30,
    # batch_size=16,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
    generate_kwargs={"language": "cantonese"},
)


# this is the place you modify your input - the name of the mp3 file you want to run
result = pipe("source/trimmed_sample.mp3")

# then it will write the response in a json file named as the current date time
import os  # local import: only needed for the output-directory handling below

output_dir = "output"
# open(..., "w") does not create missing directories, so make sure it exists first.
os.makedirs(output_dir, exist_ok=True)

now = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
# ensure_ascii=False keeps the Chinese transcript human-readable in the file
# instead of escaping every character as \uXXXX.
json_object = json.dumps(result, indent=4, ensure_ascii=False)
# Explicit UTF-8 so the non-ASCII text is written correctly regardless of the
# platform's default encoding.
with open(os.path.join(output_dir, now + ".json"), "w", encoding="utf-8") as f:
    f.write(json_object)

# also it will print out the result in the following output block
print(result)

  from .autonotebook import tqdm as notebook_tqdm
The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.
0it [00:00, ?it/s]
preprocessor_config.json: 100%|██████████| 340/340 [00:00<00:00, 53.3kB/s]
tokenizer_config.json: 100%|██████████| 283k/283k [00:00<00:00, 1.29MB/s]
vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 3.36MB/s]
tokenizer.json: 100%|██████████| 2.48M/2.48M [00:00<00:00, 5.53MB/s]
merges.txt: 100%|██████████| 494k/494k [00:00<00:00, 2.22MB/s]
normalizer.json: 100%|██████████| 52.7k/52.7k [00:00<00:00, 715kB/s]
added_tokens.json: 100%|██████████| 34.6k/34.6k [00:00<00:00, 35.2MB/s]
special_tokens_map.json: 100%|██████████| 2.07k/2.07k [00:00<00:00, 6.35MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


{'text': ' 其實都有佢嘅價值 可能會係同身份認同有關係 又或者可能佢會帶動到一個地方嘅文化旅遊 佢隱藏住同埋佢對於社會創造緊嘅價值 其實都係好重要嘅元素 史丹頓街一帶嘅唐樓活化工程已經完成 而市建局亦都話嚟緊會引入共同租住單位嘅共居模式 希望嚟時呢一度 就可以變成一個充滿文化特色 同活力嘅社區', 'chunks': [{'timestamp': (0.0, 1.4), 'text': ' 其實都有佢嘅價值'}, {'timestamp': (1.4, 4.76), 'text': ' 可能會係同身份認同有關係'}, {'timestamp': (4.76, 8.08), 'text': ' 又或者可能佢會帶動到一個地方嘅文化旅遊'}, {'timestamp': (8.08, 12.88), 'text': ' 佢隱藏住同埋佢對於社會創造緊嘅價值'}, {'timestamp': (12.88, 15.52), 'text': ' 其實都係好重要嘅元素'}, {'timestamp': (19.0, 22.6), 'text': ' 史丹頓街一帶嘅唐樓活化工程已經完成'}, {'timestamp': (22.6, 26.92), 'text': ' 而市建局亦都話嚟緊會引入共同租住單位嘅共居模式'}, {'timestamp': (26.92, 28.16), 'text': ' 希望嚟時呢一度'}, {'timestamp': (28.16, 30.08), 'text': ' 就可以變成一個充滿文化特色'}, {'timestamp': (30.08, 31.32), 'text': ' 同活力嘅社區'}]}


In [2]:
import pandas as pd
import json
from IPython.display import display


# Flatten the "chunks" entries of the transcription result into a table
# (one row per chunk, keeping its `timestamp` and `text` fields) and render it.
df = pd.json_normalize(result, record_path="chunks")
display(df)

Unnamed: 0,timestamp,text
0,"(0.0, 1.4)",其實都有佢嘅價值
1,"(1.4, 4.76)",可能會係同身份認同有關係
2,"(4.76, 8.08)",又或者可能佢會帶動到一個地方嘅文化旅遊
3,"(8.08, 12.88)",佢隱藏住同埋佢對於社會創造緊嘅價值
4,"(12.88, 15.52)",其實都係好重要嘅元素
5,"(19.0, 22.6)",史丹頓街一帶嘅唐樓活化工程已經完成
6,"(22.6, 26.92)",而市建局亦都話嚟緊會引入共同租住單位嘅共居模式
7,"(26.92, 28.16)",希望嚟時呢一度
8,"(28.16, 30.08)",就可以變成一個充滿文化特色
9,"(30.08, 31.32)",同活力嘅社區


In [3]:
# ALTERNATIVE - a version which uses an assistant model for the transcription
# !pip install "tokenizers>=0.14,<0.15"

import torch
import json
import datetime
from transformers import pipeline, AutoModelForCausalLM, AutoModelForSpeechSeq2Seq, AutoProcessor


# --- Runtime configuration -------------------------------------------------
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# Prefer Apple's Metal backend, but fall back to CPU so the cell still runs on
# machines where MPS is not available.
device = "mps" if torch.backends.mps.is_available() else "cpu"
# torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
torch_dtype = torch.float32

# --- Assistant (draft) model -------------------------------------------------
# The assistant is loaded as a decoder-only model and passed to generate() via
# `assistant_model` below. It should match the main checkpoint: distil-large-v2
# is paired with whisper-large-v2 in the Distil-Whisper docs, so use the
# large-v3 distillation here.
# NOTE(review): confirm distil-large-v3 is the intended assistant for this setup.
assistant_model_id = "distil-whisper/distil-large-v3"
assistant_model = AutoModelForCausalLM.from_pretrained(
    assistant_model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
)
assistant_model.to(device)

# --- Main model and processor -------------------------------------------------
model_id = "openai/whisper-large-v3"
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
    cache_dir="/Volumes/BACKUP/Coding/HUGGING_FACE/models",
)
model.to(device)

processor = AutoProcessor.from_pretrained(
    model_id, cache_dir="/Volumes/BACKUP/Coding/HUGGING_FACE/processor"
)

# --- Speech-recognition pipeline with assisted generation ---------------------
# The language is left unset here; the commented-out generate_kwargs line shows
# how to force Cantonese.
pipe = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    max_new_tokens=128,
    chunk_length_s=30,
    batch_size=1,
    return_timestamps=True,
    torch_dtype=torch_dtype,
    device=device,
    # generate_kwargs={"assistant_model": assistant_model, "language": "cantonese"}
    generate_kwargs={"assistant_model": assistant_model},
)



config.json: 100%|██████████| 2.29k/2.29k [00:00<00:00, 1.70MB/s]
model.safetensors: 100%|██████████| 1.51G/1.51G [00:14<00:00, 104MB/s] 
generation_config.json: 100%|██████████| 3.59k/3.59k [00:00<00:00, 9.54MB/s]
preprocessor_config.json: 100%|██████████| 340/340 [00:00<00:00, 93.0kB/s]
tokenizer_config.json: 100%|██████████| 283k/283k [00:00<00:00, 1.27MB/s]
vocab.json: 100%|██████████| 1.04M/1.04M [00:00<00:00, 54.2MB/s]
tokenizer.json: 100%|██████████| 2.48M/2.48M [00:00<00:00, 5.49MB/s]
merges.txt: 100%|██████████| 494k/494k [00:00<00:00, 113MB/s]
normalizer.json: 100%|██████████| 52.7k/52.7k [00:00<00:00, 21.1MB/s]
added_tokens.json: 100%|██████████| 34.6k/34.6k [00:00<00:00, 15.8MB/s]
special_tokens_map.json: 100%|██████████| 2.07k/2.07k [00:00<00:00, 1.25MB/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# this is the place you modify your input - the name of the mp3 file you want to run
# `pipe` is whichever pipeline object was built most recently (both the plain
# cell and the assistant-model cell assign to the same name).
result = pipe("source/sample.mp3")

# Optional (currently disabled): write the response to a JSON file named after
# the current date-time. Uncomment to enable; open(..., "w") requires the
# output/ folder to already exist.
# now = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
# json_object = json.dumps(result, indent=4)
# with open('output/'+now+".json", "w") as f:
#     f.write(json_object)

# also it will print out the result in the following output block
print(result)

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


{'text': '中上環近半山一帶除了讓人感覺寧靜之外這裡的建築物亦是新舊交融的不說你不知這裡曾經出現過一個叫做沙雅間的社區被稱為沙雅間的社區目前並沒有完整的文獻記錄資料相傳在十九世紀史丹頓街及必烈者市街附近一大華人聚居地所建造的30間石屋而得名隨著社會變遷石屋已經不復存在現時, 方位內仍有數座大約在1950年代建成的塘樓建築分別是在2019年確定成為二級歷史建築的史丹頓街88及90號以及評級有待評估的華賢方西塘樓建築群Kara,其實當初三十間的起源是怎樣的?其實我們如果找回資料的話最早我們是在1880年的政府憲報上見到三十間這個名字我們現在在香港的地圖上面其實我們已經很難看到三十間這個名字了知道這個名字的人,大概已經60歲或以上的人才會懂得用這個名字當時在這個位置應該建了大約30間屋的建築群如果肉眼看到的痕跡,可能只有三十間街坊盂蘭會這個招牌可能就是唯一我們可以反映到以前這裡真的叫三十間的一件事上環以前其實有很多華人聚居的地方盂蘭會就是每年七月的時候會舉辦盂蘭盛會的組織他們其實是一班居民組織出來的地方盂蘭盛會對於一個華人社會來說非常重要是超渡一些孤魂野鬼令商鋪、街坊可以安心一點的傳統習俗在街道佈局上,三十間社區的特色是怎樣的?如果想理解三十間的範圍我們應該由下面的士丹頓街開始計算那個其實是一個軸心然後一直打上去到上面半山的堅島範圍中間的一個範圍其實我們也可以理解為三十間這裡當中其實都有不少的地方全部都是一些只是人行車不能進的地方包括是一些荒和里例如维延方西成王街也是人行的楼梯有些人也会将永里街计算到沙雅间的范围里面所以可以说其实沙雅间是一个步行的小区这个地方其实保留了很多那个年代50年代至60年代三至四層高的堂樓建築群長遠如何保留30間社區的歷史故事保育最好的方法是活用所以30間盂蘭盛會如果可以繼續練習繼續實踐的話就會是最好的保育方法鄰近史丹頓街一帶的堂樓原本被劃入市區重建局在2003年提出的重建計劃當中其後因應社區人士提出保留建築物的訴求市建局在2020年放棄重建計劃研究保育和活化項目裡面的建築群除了市建局的活化計劃附近亦有多個活化歷史建築的項目包括前身為荷里活道前已分警察宿舍的原創房以及前身為別列遮市街市場的香港新聞博覽館30間的範圍內或外面其實有不少歷史建築物已經活化例如包括活化成為博覽館又或者可能是做了一些文化文創的地方但是我自己覺得其實沒有一個方法一定

In [None]:
!pip install pyannote.audio

In [None]:
import datetime
import os

from pyannote.audio import Pipeline

# SECURITY: never hard-code access tokens in a notebook. The token that used to
# be embedded here must be treated as leaked and revoked on huggingface.co.
# Read the token from the HF_TOKEN environment variable; if it is not set, fall
# back to True so the token stored by `notebook_login()` / `huggingface-cli login`
# is used instead.
hf_token = os.environ.get("HF_TOKEN") or True

# NOTE: this assignment rebinds the name `pipeline`, which the transcription
# cells import from transformers; those cells re-import it before use.
pipeline = Pipeline.from_pretrained(
    "pyannote/speaker-diarization-3.1",
    use_auth_token=hf_token,
)
# NOTE(review): pyannote.audio 3.x appears to return None (instead of raising)
# when the gated pipeline cannot be downloaded — confirm. Fail early with a
# clear message rather than with "'NoneType' object is not callable" below.
if pipeline is None:
    raise RuntimeError(
        "Could not load pyannote/speaker-diarization-3.1: check that the Hugging Face "
        "token is valid and that the model's user conditions have been accepted."
    )

# Run speaker diarization on the input recording.
diarization = pipeline("source/OpenAIKeynote.mp3")

# dump the diarization output to disk using RTTM format
output_dir = "whisper-transcript"
# open(..., "w") does not create missing directories, so make sure it exists first.
os.makedirs(output_dir, exist_ok=True)
now = datetime.datetime.now().strftime("%d-%m-%Y-%H-%M-%S")
with open(os.path.join(output_dir, now + "_transcript.rttm"), "w") as rttm:
    diarization.write_rttm(rttm)


In [None]:
# NOTE(review): looks like a leftover scratch / sanity-check cell — confirm
# whether it can be removed from the notebook.
print(1+1)