In [1]:
# Install the notebook's third-party dependencies: faster-whisper (from its
# GitHub repository) and pytube.
# The openai/whisper install below is kept disabled; the import cell uses
# faster_whisper instead.
#!pip install git+https://github.com/openai/whisper.git
!pip install git+https://github.com/guillaumekln/faster-whisper.git
!pip install pytube

Collecting git+https://github.com/guillaumekln/faster-whisper.git
  Cloning https://github.com/guillaumekln/faster-whisper.git to /tmp/pip-req-build-lm78vs6y
  Running command git clone --filter=blob:none --quiet https://github.com/guillaumekln/faster-whisper.git /tmp/pip-req-build-lm78vs6y
  Resolved https://github.com/guillaumekln/faster-whisper.git to commit 5c17de17713f65929c7c33add3a9735ff75a945c
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting av==10.* (from faster-whisper==0.7.1)
  Using cached av-10.0.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (31.0 MB)
Collecting ctranslate2<4,>=3.17 (from faster-whisper==0.7.1)
  Using cached ctranslate2-3.17.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.2 MB)
Collecting huggingface_hub>=0.13 (from faster-whisper==0.7.1)
  Using cached huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
Collecting tokenizers==0.13.* (from faster-whisper==0.7.1)
  Using cached tokenizers-0.13.3-cp310-cp310-manyl

In [2]:
#import whisper
from faster_whisper import WhisperModel
from pytube import YouTube, Channel
import pandas as pd
from random import shuffle
import concurrent.futures as cf
import os
from tqdm import tqdm

In [3]:
from google.colab import drive
# Mount Google Drive at /content/drive so files under it can be read and written.
drive.mount("/content/drive")

# Directory (on the mounted drive) that holds video_links.csv and the
# per-video transcription CSV files. Kept with a trailing slash because paths
# are built from it by plain string concatenation.
db_dir = "/content/drive/MyDrive/kato-db/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
class Transcriber:
  '''
  Transcribes the videos listed in video_links.csv.

  For each target row the audio is downloaded with pytube, transcribed with
  faster-whisper, and written to "<index>.csv" under db_dir. The "transcribed"
  column of video_links.csv stores the model level used for that row; -1 marks
  a row that has not been transcribed yet.
  '''

  # Model level -> whisper model size name. The levels are the values compared
  # against the "transcribed" column (a higher level is treated as better).
  model_num_to_name = {
      -1: "none",
      0: "tiny",
      1: "base",
      2: "small",
      3: "medium",
      4: "large-v2"
  }

  def __init__(self, model = 4):
    '''
    Load video_links.csv from db_dir and the faster-whisper model.

    model: kept for backward compatibility; it currently has no effect (see
           the review note below).
    '''
    self.df_videos = pd.read_csv(db_dir + "video_links.csv")
    # NOTE(review): the level is pinned to 3 and the `model` argument is
    # ignored, exactly as before. The checkpoint loaded below is hard-coded as
    # well, so the level cannot simply follow `model` -- confirm which level
    # this checkpoint is supposed to be recorded as.
    self.model = 3
    self.whisper_model = WhisperModel('zh-plus/faster-whisper-large-v2-japanese-5k-steps', device="cuda", compute_type="float16")

  def run(self, mode = 0):
    '''
    Transcribe every target video, saving progress after each one so that an
    interrupted run can be resumed.

    mode:
      0 -> rows that have not been transcribed yet
      1 -> rows transcribed with a lower model level than self.model

    Raises ValueError for any other mode.
    '''
    if mode == 0:
      index_target = self.get_untranscribeds()
    elif mode == 1:
      index_target = self.get_improvables()
    else:
      # Previously an unknown mode fell through and failed later with an
      # UnboundLocalError on index_target.
      raise ValueError("mode must be 0 or 1, got {!r}".format(mode))

    csv_path = db_dir + "video_links.csv"
    for index in tqdm(index_target):
      # The targets are index labels, so look the row up by label (.at) for
      # both the read and the write.
      link = self.df_videos.at[index, "link"]
      self.transcribe(index, link, self.model)
      self.df_videos.at[index, "transcribed"] = self.model
      # index=False: the file is read back without index_col, so writing the
      # index would add one more "Unnamed: 0" column on every reload/save
      # cycle. Columns already present in the file are left untouched.
      self.df_videos.to_csv(csv_path, index = False)

      print("saved:", index)

  def _shuffled_indices(self, mask):
    '''Return the index labels of the rows selected by mask, in random order.'''
    indices = list(self.df_videos[mask].index)
    shuffle(indices)
    return indices

  def get_untranscribeds(self):
    '''
    Return the index labels of the rows whose "transcribed" value is -1
    (not transcribed yet). The list is shuffled.
    '''
    return self._shuffled_indices(self.df_videos["transcribed"] == -1)

  def get_improvables(self):
    '''
    Return the index labels of the rows whose "transcribed" level is lower
    than self.model (this includes untranscribed rows, which are -1).
    The list is shuffled.
    '''
    return self._shuffled_indices(self.df_videos["transcribed"] < self.model)

  def transcribe(self, index, link, model):
    '''
    Download the audio of one video, transcribe it as Japanese, and save the
    result as "<index>.csv" (columns: start, end, text) under db_dir.

    index: row label of the video; used to name the temporary audio file and
           the output CSV.
    link:  video URL passed to pytube.
    model: unused; the model loaded in __init__ is always used.

    Returns index. Raises RuntimeError if the video has no audio-only stream.
    video_links.csv itself is updated by run(), not here.
    '''
    # Download the audio track into the working directory.
    audio_file_name = str(index) + ".mp4"
    stream = YouTube(link).streams.filter(only_audio=True).first()
    if stream is None:
      raise RuntimeError("no audio-only stream available for: " + str(link))
    audio_file = stream.download(filename=audio_file_name)

    try:
      # faster-whisper returns the segments lazily, so they are consumed
      # inside the try block, while the audio file still exists.
      segments, _ = self.whisper_model.transcribe(audio_file, language="ja")
      transcription = [[segment.start, segment.end, segment.text] for segment in segments]
    finally:
      # Always delete the downloaded audio (to save disk space), even when
      # transcription fails part-way.
      os.remove(audio_file)

    # Save the transcription.
    transcription_file_name = db_dir + str(index) + ".csv"
    df_transcription = pd.DataFrame(data=transcription, columns=['start', 'end', 'text'])
    df_transcription.to_csv(transcription_file_name, index = False)

    return index

In [None]:
if __name__ == "__main__":
  # NOTE(review): model = 4 is passed here, but Transcriber.__init__ sets
  # self.model = 3 regardless of this argument.
  transcriber =  Transcriber(model = 4)
  # Default mode (0): process the rows whose "transcribed" value is -1.
  transcriber.run()

  0%|          | 0/672 [00:00<?, ?it/s]