In [8]:
import whisperx
import gc 

device = "cuda" 
audio_file = "20201113_MYY_Buddhiman.mp3"
batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("kurianbenoy/vegam-whisper-medium-ml", device, compute_type=compute_type)

audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code="ml", device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token="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", device=device)
# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint .cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.0.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.0. Bad things might happen unless you revert torch to 1.x.
Detected language: ta (0.86) in first 30s of audio...
[{'text': 'ദൈവത്തിന് സ്തോത്രം ദൈവത്തിന് സ്തൂതി കർത്താവാന് മുഖത്വം സ്തോത്രം സ്തോത്രം സ്തോത്രം കർത്താവാ യേശക്രിസ്ത�', 'start': 0.009, 'end': 25.913}, {'text': 'അതിന് നിങ്ങൾ നിങ്ങൾത്തിനെ വിധേയരാക്കുന്നത് കൊണ്ടാണ് നന്ദി ഞാൻ പറഞ്ഞത് വധനം പറയുന്നത് സ്നേഹം കൊണ്ടാണ് നിങ�', 'start': 25.913, 'end': 54.002}, {'text': 'നിത്യജീവം പ്രാവിക്കാൻ ഇടയാകുന്നെങ്കിൽ അതൊരു ഭാഗ്യമാണ് അതാണ് വധനം തരുന്നതിൻറെ ലക്ഷ്യം നിത്യജീവൻ എവർലാസ്റ�', 'start': 54.002, 'end': 75.759}, {'text': 'പിന്നെ പിന്നെ ഓരോ നൂറ്റാണ്ടുകളിലും ദൈവമനുക്കൾ കർത്താൻറെ വേലയായി ഇതവര് ദൈവത്തിന് സ്തോദനം കോൺസ്റ്റൻഡ് റ�', 'start': 76.203, 'end': 93.063}, {'

Some weights of the model checkpoint at gvs/wav2vec2-large-xlsr-malayalam were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at gvs/wav2vec2-large-xlsr-malayalam and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN

[{'start': 0.009, 'end': 7.996, 'text': 'ദൈവത്തിന് സ്തോത്രം ദൈവത്തിന് സ്തൂതി കർത്താവാന് മുഖത്വം സ്തോത്രം സ്തോത്രം സ്തോത്രം കർത്താവാ യേശക്രിസ്ത�', 'words': [{'word': 'ദൈവത്തിന്', 'start': 0.009, 'end': 1.05, 'score': 0.189}, {'word': 'സ്തോത്രം', 'start': 1.11, 'end': 1.631, 'score': 0.356}, {'word': 'ദൈവത്തിന്', 'start': 2.071, 'end': 2.471, 'score': 0.335}, {'word': 'സ്തൂതി', 'start': 2.531, 'end': 3.012, 'score': 0.264}, {'word': 'കർത്താവാന്', 'start': 3.372, 'end': 3.893, 'score': 0.345}, {'word': 'മുഖത്വം', 'start': 3.913, 'end': 4.373, 'score': 0.291}, {'word': 'സ്തോത്രം', 'start': 4.753, 'end': 5.294, 'score': 0.378}, {'word': 'സ്തോത്രം', 'start': 5.394, 'end': 5.874, 'score': 0.401}, {'word': 'സ്തോത്രം', 'start': 5.955, 'end': 6.435, 'score': 0.422}, {'word': 'കർത്താവാ', 'start': 6.915, 'end': 7.416, 'score': 0.151}, {'word': 'യേശക്രിസ്ത�', 'start': 7.456, 'end': 7.996, 'score': 0.447}]}, {'start': 25.913, 'end': 33.335, 'text': 'അതിന് നിങ്ങൾ നിങ്ങൾത്തിനെ വിധേയരാക്കുന്നത് കൊണ്ടാണ

In [None]:
import whisperx
import gc 

device = "cuda" 
audio_file = "anil.wav"
batch_size = 16 # reduce if low on GPU mem
compute_type = "float16" # change to "int8" if low on GPU mem (may reduce accuracy)

# 1. Transcribe with original whisper (batched)
model = whisperx.load_model("kurianbenoy/vegam-whisper-medium-ml", device, compute_type=compute_type)

audio = whisperx.load_audio(audio_file)
result = model.transcribe(audio, batch_size=batch_size)
print(result["segments"]) # before alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model

# 2. Align whisper output
model_a, metadata = whisperx.load_align_model(language_code="ml", device=device)
result = whisperx.align(result["segments"], model_a, metadata, audio, device, return_char_alignments=False)

print(result["segments"]) # after alignment

# delete model if low on GPU resources
# import gc; gc.collect(); torch.cuda.empty_cache(); del model_a

# 3. Assign speaker labels
diarize_model = whisperx.DiarizationPipeline(use_auth_token="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", device=device)
# add min/max number of speakers if known
diarize_segments = diarize_model(audio)
# diarize_model(audio, min_speakers=min_speakers, max_speakers=max_speakers)

result = whisperx.assign_word_speakers(diarize_segments, result)
print(diarize_segments)
print(result["segments"]) # segments are now assigned speaker IDs

Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint .cache/torch/whisperx-vad-segmentation.bin`


No language specified, language will be first be detected for each audio file (increases inference time).
Model was trained with pyannote.audio 0.0.1, yours is 3.0.1. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.1.0. Bad things might happen unless you revert torch to 1.x.
Detected language: ta (0.52) in first 30s of audio...
[{'text': 'അവയിലൊന്ന് മരണാന്തരം പ്രസിദ്ധീകരിച്ച ഡണ്ണിന്റെ സമഹാരത്തിൽ ഉൾപ്പെടുത്തിയിരുന്നു', 'start': 0.145, 'end': 6.425}]
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize
Ignored unknown kwarg option normalize


Some weights of the model checkpoint at gvs/wav2vec2-large-xlsr-malayalam were not used when initializing Wav2Vec2ForCTC: ['wav2vec2.encoder.pos_conv_embed.conv.weight_g', 'wav2vec2.encoder.pos_conv_embed.conv.weight_v']
- This IS expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Wav2Vec2ForCTC from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Wav2Vec2ForCTC were not initialized from the model checkpoint at gvs/wav2vec2-large-xlsr-malayalam and are newly initialized: ['wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original1', 'wav2vec2.encoder.pos_conv_embed.conv.parametrizations.weight.original0']
You should probably TRAIN

[{'start': 0.165, 'end': 6.325, 'text': 'അവയിലൊന്ന് മരണാന്തരം പ്രസിദ്ധീകരിച്ച ഡണ്ണിന്റെ സമഹാരത്തിൽ ഉൾപ്പെടുത്തിയിരുന്നു', 'words': [{'word': 'അവയിലൊന്ന്', 'start': 0.165, 'end': 0.827, 'score': 0.362}, {'word': 'മരണാന്തരം', 'start': 1.168, 'end': 1.991, 'score': 0.275}, {'word': 'പ്രസിദ്ധീകരിച്ച', 'start': 2.131, 'end': 2.974, 'score': 0.56}, {'word': 'ഡണ്ണിന്റെ', 'start': 3.395, 'end': 3.997, 'score': 0.47}, {'word': 'സമഹാരത്തിൽ', 'start': 4.338, 'end': 5.281, 'score': 0.379}, {'word': 'ഉൾപ്പെടുത്തിയിരുന്നു', 'start': 5.562, 'end': 6.325, 'score': 0.643}]}]
                             segment  label     speaker     start       end  \
0  [ 00:00:00.008 -->  00:00:06.443]      0  SPEAKER_00  0.008489  6.443124   

   intersection     union  
0         0.763  6.434635  
[{'start': 0.165, 'end': 6.325, 'text': 'അവയിലൊന്ന് മരണാന്തരം പ്രസിദ്ധീകരിച്ച ഡണ്ണിന്റെ സമഹാരത്തിൽ ഉൾപ്പെടുത്തിയിരുന്നു', 'words': [{'word': 'അവയിലൊന്ന്', 'start': 0.165, 'end': 0.827, 'score': 0.362, 'speaker': 'SPEAKER

In [None]:
1 + 1

2