# WhisperX48_local: 用于本地运行WhisperX48的Jupyter Notebook脚本


In [8]:
import os
import ffmpeg
import subprocess
import torch
import whisperx
import time
from pathlib import Path
import sys


# --- Recognition settings ----------------------------------------------------
# The trailing "# @param" markers are form annotations kept from the notebook.
model_size = "small"  # @param ["base","small","medium", "large"]
language = "ja"  # @param {type:"string"}

# Extra options, currently disabled and not referenced by the code in this cell.
#sub_style = "default"  # @param ["default", "ikedaCN", "kaedeCN","sugawaraCN"]
#compression_ratio_threshold = 2.4 # @param {type:"number"}
#no_speech_threshold = 0.6 # @param {type:"number"}
#logprob_threshold = -1.0 # @param {type:"number"}
#condition_on_previous_text = "True" # @param ["True", "False"]

# --- Input / output locations ------------------------------------------------
# Default directory for the input audio file and the output subtitle file.
output_dir = "./files/"
# Name of the audio file to transcribe.
file_name = "sample1.wav"
# Plain string join (no separator is inserted), so output_dir must end with "/".
audio_file = f"{output_dir}{file_name}"

# Recognition pipeline: load the Whisper model, transcribe the audio file, then
# align the transcript against the audio to refine the timestamps.
device = "cuda"
torch.cuda.empty_cache()  # release unused cached GPU memory before loading models
print('加载whisper模型 Loading whisper model...')
model = whisperx.load_model(model_size, device)

# Original Whisper transcription.
# The timer starts here, so the reported duration covers transcription,
# alignment-model loading and alignment, but not the Whisper model load above.
tic = time.time()
print('识别中 Transcribe in progress...')
result = model.transcribe(audio_file, language=language)

# Load the alignment model and its metadata for the same language that was
# transcribed. This used to be hard-coded to "ja", which silently mismatched
# the `language` setting whenever that setting was changed.
print('加载调整模型 Load alignment model...')
#model_id = "jonatasgrosman/wav2vec2-large-xlsr-53-japanese"
model_a, metadata = whisperx.load_align_model(language_code=language, device=device)

# Align the Whisper segments against the audio.
print('调整识别结果 Align whisper output...')
result_aligned = whisperx.align(result["segments"], model_a, metadata, audio_file, device)

toc = time.time()
print('识别完毕 Done')
print(f'Time consumpution {toc-tic}s')

# Write the subtitles as an SRT file into output_dir.
from whisperx.utils import write_srt

# Build the name from file_name: audio_file already starts with output_dir, so
# joining it onto Path(output_dir) again pointed at a nested
# "<output_dir>/<output_dir>/..." location.
srt_path = Path(output_dir) / (file_name + ".srt")
with open(srt_path, "w", encoding="utf-8") as srt:
    # Write the aligned segments; the raw Whisper segments were written before,
    # which discarded the result of the alignment step.
    write_srt(result_aligned["segments"], file=srt)
print('字幕生成完毕 Subtitle generated!')

# Release unused cached GPU memory now that the run is finished.
torch.cuda.empty_cache()

加载whisper模型 Loading whisper model...
识别中 Transcribe in progress...
加载调整模型 Load alignment model...




调整识别结果 Align whisper output...
识别完毕 Done
Time consumpution 26.63913893699646s
字幕生成完毕 All done!


* Last modified 2023-03-07
* Author: ifeimi &#11046; Email me: yfwu0202 AT gmail.com

* Acknowledgements and copyright notice: 
This script relies on [whisperx](https://github.com/m-bain/whisperX), which improves on [OpenAI's whisper](https://github.com/openai/whisper) by providing more accurate timestamps, in particular at the word level. This is achieved by force-aligning the inaccurate timestamps generated by whisper using a speech model ([wav2vec2.0](https://huggingface.co/facebook/wav2vec2-large-960h-lv60-self), for example).

Part of this code was adapted from the [N46Whisper](https://github.com/Ayanaminn/N46Whisper) project under the [MIT license](https://github.com/ifeimi/WhisperX48/blob/main/LICENSE). Modifications were made to incorporate [whisperx](https://github.com/m-bain/whisperX).