論文  
https://cdn.openai.com/papers/whisper.pdf<br>
<br>
GitHub  
https://github.com/openai/whisper<br>
<br>
<a href="https://colab.research.google.com/github/kaz12tech/ai_demos/blob/master/Whisper_demo.ipynb" target="_blank"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 環境セットアップ

## GPU確認

In [None]:
# Confirm that a GPU is attached to this runtime before loading the model.
!nvidia-smi

## GitHubからコード取得

## ライブラリのインストール

In [None]:
%cd /content

!pip install git+https://github.com/openai/whisper.git@eff383b27b783e280c089475852ba83f20f64998

!pip install moviepy==0.2.3.5 imageio==2.4.1
!pip install yt-dlp

!pip install setuptools-rust

## ライブラリのインポート

In [None]:
import whisper

from yt_dlp import YoutubeDL
from moviepy.video.fx.resize import resize
from moviepy.editor import VideoFileClip, AudioFileClip

# テスト動画のセットアップ

In [None]:
# Colab form: the source video and the interval (in seconds) to cut out of it.
video_url = 'https://www.youtube.com/watch?v=o97upTCsRME' #@param {type:"string"}

# The form text below asks for the clip range in seconds and warns that ranges
# of 30 seconds or more are likely to run out of memory.
#@markdown 動画の切り抜き範囲(秒)を指定してください。\
#@markdown 30秒以上の場合OOM発生の可能性が高いため注意
start_sec = 8 #@param {type:"integer"}
end_sec = 19 #@param {type:"integer"}

# Clip boundaries consumed by the download/trim cell.
start_pt = start_sec
end_pt = end_sec

In [None]:
!mkdir test_files

download_resolution = 720
full_video_path = '/content/test_files/full_video_en.mp4'
input_clip_path = '/content/test_files/clip_video_en.mp4'
input_audio_path = '/content/test_files/audio_en.mp3'

# 動画ダウンロード
ydl_opts = {'format': f'best[height<={download_resolution}]', 'overwrites': True, 'outtmpl': full_video_path}
with YoutubeDL(ydl_opts) as ydl:
    ydl.download([video_url])

# 指定区間切り抜き
with VideoFileClip(full_video_path) as video:
    subclip = video.subclip(start_pt, end_pt)
    subclip.write_videofile(input_clip_path)

# 音声抽出
videoclip = VideoFileClip(input_clip_path)
audioclip = videoclip.audio
audioclip.write_audiofile(input_audio_path)

In [None]:
# Preview the trimmed clip inline, resized to a height of 420 for display.
clip = VideoFileClip(input_clip_path).fx(resize, height=420)
clip.ipython_display()

In [None]:
# Fetch the Japanese sample recording into the test directory
# (-c resumes a partial download, -O sets the output file path).
!wget -c http://pro-video.jp/voice/announce/mp3/001-sibutomo.mp3 \
      -O /content/test_files/audio_jp.mp3

In [None]:
# Play back the downloaded Japanese sample inline.
audioclip = AudioFileClip(filename='/content/test_files/audio_jp.mp3')
audioclip.ipython_display()

# Speech Recognition

In [None]:
# Load the Whisper checkpoint and report the device the model was placed on.
model_name = "large"
model = whisper.load_model(model_name)
print('model loaded.', model.device)

## English

In [None]:
# Load the English clip's audio and pad/trim it to the 30-second window.
audio = whisper.pad_or_trim(whisper.load_audio('/content/test_files/audio_en.mp3'))

# Build the log-Mel spectrogram and move it to the same device as the model.
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# Language detection: report the language with the highest probability.
_, probs = model.detect_language(mel)
detected_language = max(probs, key=probs.get)
print(f"Detected language: {detected_language}")

# Decode the audio with the default decoding options.
options = whisper.DecodingOptions()
result = whisper.decode(model, mel, options)

# Print the recognized text.
print(result.text)

## Japanese

In [None]:
# Load the Japanese sample audio and pad/trim it to the 30-second window.
audio = whisper.pad_or_trim(whisper.load_audio('/content/test_files/audio_jp.mp3'))

# Build the log-Mel spectrogram and move it to the same device as the model.
mel = whisper.log_mel_spectrogram(audio).to(model.device)

# Language detection: report the language with the highest probability.
_, probs = model.detect_language(mel)
detected_language = max(probs, key=probs.get)
print(f"Detected language: {detected_language}")

# Decode the audio with the default decoding options.
options = whisper.DecodingOptions()
result = whisper.decode(model, mel, options)

# Print the recognized text.
print(result.text)