In [None]:
# Mount Google Drive at /content/drive (Colab only); the other cells in this
# notebook read and write files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# YouTube のビデオ ID を指定して wav をダウンロードする
# https://github.com/yt-dlp/yt-dlp#embedding-yt-dlp
!pip install yt-dlp
from yt_dlp import YoutubeDL

video_id = "LjK_JqYF_t8" # CHANGEME

yt_dlp_opts = {
  'outtmpl': '/content/drive/MyDrive/opt/nekuro-chat/youtube-video/%(id)s.%(ext)s',
  'format': 'mp3/bestaudio/best',
  'postprocessors': [{
    'key': 'FFmpegExtractAudio',
    'preferredcodec': 'wav',
   }]
}

with YoutubeDL(yt_dlp_opts) as ydl:
  ydl.download(["https://www.youtube.com/watch?v=%s" % video_id])

In [None]:
# 無音部分でファイルを分割する
# https://github.com/jiaaro/pydub/
# https://github.com/jiaaro/pydub/blob/master/API.markdown#silencesplit_on_silence
!pip install pydub
from pydub import AudioSegment, silence

video_id = "LjK_JqYF_t8" # CHANGEME

file = AudioSegment.from_wav("/content/drive/MyDrive/opt/nekuro-chat/youtube-video/%s.wav" % video_id)

chunks = silence.split_on_silence(
  file,
  min_silence_len = 3000,
  silence_thresh = -40,
  seek_step = 1000,
)

for i, chunk in enumerate(chunks):
  print(chunk, chunk.duration_seconds) # ここのログを見ながらしきい値を調整する
  if 10 <= chunk.duration_seconds <= 20:
    print("export", chunk.duration_seconds)
    chunk.export("/content/drive/MyDrive/opt/nekuro-chat/youtube-video-chunk/%s_%04d.wav" % (video_id, i + 1), format="wav")

In [None]:
# 音声ファイルをテキストに変換する
# https://github.com/openai/whisper#python-usage
!pip install git+https://github.com/openai/whisper.git
import whisper
import os

video_id = "zMpnAQANC0Q" # CHANGEME

model = whisper.load_model("base")

for path in os.listdir("."):
  if not path.startswith("%s_" % video_id):
    continue

  print(path)

  result = model.transcribe(path)
  print(result["text"])

In [None]:
# japanrse single speaker speech dataset のテキストファイルのフォーマットを
# tacotron2 のフォーマットに変換して保存する。

!ls /content/drive/MyDrive/opt/nekuro-chat/japanese/japanese-single-speaker-speech-dataset

input_path = "/content/drive/MyDrive/opt/nekuro-chat/japanese/japanese-single-speaker-speech-dataset/transcript.txt"
output_path = "/content/drive/MyDrive/opt/nekuro-chat/japanese/japanese-single-speaker-speech-dataset/transcript_tacotron2.txt"

lines = []

with open(input_path) as file:
  for line in file.readlines():
    columns = line.split("|")
    wav_path = columns[0]
    ja_text = columns[1]
    alphabet_text = columns[2]
    duration_sec = columns[3]

    replacer = {
      '、': ',',
      '。': '.',
      '―': '',
      '？': '',
      '！': '',
      ' ': '',
    }

    if float(duration_sec) <= 5:
      continue

    for key in replacer:
      alphabet_text = alphabet_text.replace(key, replacer[key])

    lines.append("%s|%s\n" % (wav_path.replace("meian/", "/content/drive/MyDrive/opt/nekuro-chat/japanese/japanese-single-speaker-speech-dataset/meian_tacotron2/"), alphabet_text))

with open(output_path, "w") as file:
  file.writelines(lines)

!head /content/drive/MyDrive/opt/nekuro-chat/japanese/japanese-single-speaker-speech-dataset/transcript_tacotron2.txt
!wc  /content/drive/MyDrive/opt/nekuro-chat/japanese/japanese-single-speaker-speech-dataset/transcript_tacotron2.txt

In [None]:
# japanrse single speaker speech dataset のテキストファイルのフォーマットを
# tacotron2 のフォーマットに変換して保存する。

!mkdir -p /content/drive/MyDrive/opt/nekuro-chat/japanese/japanese-single-speaker-speech-dataset/meian_tacotron2

!pip install librosa
!pip install pysoundfile

import os
import librosa
import soundfile as sf

input_path = '/content/drive/MyDrive/opt/nekuro-chat/japanese/japanese-single-speaker-speech-dataset/meian'
output_path = '/content/drive/MyDrive/opt/nekuro-chat/japanese/japanese-single-speaker-speech-dataset/meian_tacotron2'

input_files = os.listdir(input_path)

for i, path in enumerate(input_files):
   print("%d/%d %s" % (i, len(input_files), path))
   y, sr = librosa.core.load("%s/%s" % (input_path, path), sr=22050, mono=True)
   sf.write("%s/%s" % (output_path, path), y, sr, subtype="PCM_16")