In [1]:
!pip install pyannote.audio openai-whisper ffmpeg torch



In [15]:
from google.colab import drive

def mount_google_drive():
    """Mount Google Drive at /content/drive and print a confirmation."""
    mount_point = '/content/drive'
    drive.mount(mount_point)
    print("Google Drive mounted successfully.")


mount_google_drive()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Google Drive mounted successfully.


In [16]:
import os

def list_media_files(folder_path):
    """Return the MP4/MP3 files located directly inside ``folder_path``.

    Extensions are matched case-insensitively, so ``clip.MP4`` is found as
    well as ``clip.mp4``. Only regular files are returned; sub-folders are
    not searched. The result is sorted by file name so that repeated runs
    process the files in the same order.

    Args:
        folder_path: Directory to scan.

    Returns:
        list[str]: ``folder_path`` joined with each matching file name;
        empty when nothing matches.

    Raises:
        OSError: Propagated from ``os.listdir`` (for example when the
            folder does not exist).
    """
    media_extensions = ('.mp4', '.mp3')
    candidates = (
        os.path.join(folder_path, name)
        for name in sorted(os.listdir(folder_path))
        if name.lower().endswith(media_extensions)
    )
    # A directory may carry a media-like name; only real files are usable.
    return [path for path in candidates if os.path.isfile(path)]

# Folder on Google Drive that holds the recordings to process.
target_folder = "/content/drive/My Drive/TranscriptionFolder"
media_files = list_media_files(target_folder)

# Report the files that were found, or say that there are none.
if media_files:
    print(f"対象ファイル: {media_files}")
else:
    print("対象のMP4またはMP3ファイルが見つかりませんでした。")


対象ファイル: ['/content/drive/My Drive/TranscriptionFolder/ZeroPlus_student-opinion-meeting.mp4', '/content/drive/My Drive/TranscriptionFolder/ZeroPlus_student-opinion-meeting.mp3']


In [17]:
import subprocess

def convert_mp4_to_mp3(mp4_file):
    """Extract the audio of an MP4 file into an MP3 file next to it.

    The output path is the input path with its final extension replaced by
    ``.mp3``; directory names containing ``.mp4`` are left untouched. An
    existing output file is overwritten (``-y``).

    Args:
        mp4_file: Path of the video file to convert. It may contain spaces,
            quotes or other shell metacharacters.

    Returns:
        str: Path of the MP3 file that ffmpeg was asked to write.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status,
            or the ``ffmpeg`` executable could not be found.
    """
    import os  # local import so the function does not rely on another cell

    stem, _extension = os.path.splitext(mp4_file)
    mp3_file = stem + ".mp3"

    # An argument list (no shell) passes the paths to ffmpeg verbatim, so no
    # quoting is needed and nothing in a file name is interpreted by a shell.
    command = ["ffmpeg", "-y", "-i", mp4_file, "-q:a", "0", "-map", "a", mp3_file]
    try:
        subprocess.run(command, check=True)
    except FileNotFoundError as exc:
        # With a shell, a missing ffmpeg surfaced as exit status 127; keep
        # reporting it as CalledProcessError for callers that catch that.
        raise subprocess.CalledProcessError(127, command) from exc

    print(f"Converted {mp4_file} to {mp3_file}")
    return mp3_file


In [27]:
from pyannote.audio.pipelines import SpeakerDiarization
from google.colab import userdata
from huggingface_hub import login

# Fetch the Hugging Face token stored under the key 'HUGGINGFACE_TOKEN'
# through google.colab's userdata, so it is not hardcoded in the notebook.
HUGGINGFACE_TOKEN = userdata.get('HUGGINGFACE_TOKEN')

# Log in to the Hugging Face Hub with that token.
# NOTE(review): presumably needed to download the pyannote pipeline used by
# diarize_audio below; confirm.
login(token=HUGGINGFACE_TOKEN)


def diarize_audio(file_path):
    """Run speaker diarization on an audio file.

    Loads the "pyannote/speaker-diarization@2.1" pipeline, applies it to
    ``file_path`` and flattens the result into plain dictionaries.

    Args:
        file_path: Path of the audio file to analyse.

    Returns:
        list[dict]: One entry per track yielded by the pipeline result, with
        the keys ``'start'`` and ``'end'`` (taken from the turn) and
        ``'speaker'`` (the label).
    """
    diarizer = SpeakerDiarization.from_pretrained("pyannote/speaker-diarization@2.1")
    annotation = diarizer(file_path)

    # The middle item yielded by itertracks is not needed here.
    return [
        {'start': turn.start, 'end': turn.end, 'speaker': label}
        for turn, _track, label in annotation.itertracks(yield_label=True)
    ]


In [23]:
import whisper
import torch

# Load the Whisper model on the CPU (change the model size if needed).
model = whisper.load_model("small", device="cpu")

# Cast every parameter of the model to FP32 (single-precision floating point).
for param in model.parameters():
    param.data = param.data.to(dtype=torch.float32)
    if param.grad is None:
        continue
    # A gradient that already exists is cast to FP32 as well.
    param.grad.data = param.grad.data.to(dtype=torch.float32)

def transcribe_with_speaker_segments(segments, audio_path, model):
    """Transcribe every diarized segment and label it with its speaker.

    For each segment, ffmpeg cuts the ``start``..``end`` range out of
    ``audio_path`` into a 16 kHz, mono, 16-bit PCM WAV file, which is then
    passed to ``model.transcribe``. The WAV files live in a temporary
    directory that is removed before the function returns.

    Args:
        segments: Iterable of dicts with the keys ``'start'``, ``'end'`` and
            ``'speaker'``, as produced by ``diarize_audio``.
        audio_path: Path of the source audio file. It may contain spaces
            (for example ``/content/drive/My Drive/...``).
        model: Object with a ``transcribe(path)`` method returning a mapping
            that has a ``'text'`` entry, such as a loaded Whisper model.

    Returns:
        str: For each segment, in order, two newlines followed by
        ``"{speaker}: {text}"``; an empty string when there are no segments.

    Raises:
        subprocess.CalledProcessError: ffmpeg exited with a non-zero status.
        FileNotFoundError: The ``ffmpeg`` executable could not be found.
    """
    import os
    import tempfile

    entries = []
    with tempfile.TemporaryDirectory() as work_dir:
        for index, segment in enumerate(segments):
            start = segment['start']
            end = segment['end']
            speaker = segment['speaker']

            # Cut this segment out of the source audio. An argument list
            # (no shell) keeps paths with spaces intact, and -y overwrites
            # a leftover file instead of waiting for confirmation.
            audio_segment_path = os.path.join(work_dir, f"segment_{index}.wav")
            command = [
                "ffmpeg", "-y",
                "-ss", str(start),
                "-to", str(end),
                "-i", audio_path,
                "-acodec", "pcm_s16le",
                "-ar", "16000",
                "-ac", "1",
                audio_segment_path,
            ]
            subprocess.run(command, check=True)

            # Transcribe the extracted segment.
            result = model.transcribe(audio_segment_path)
            entries.append(f"\n\n{speaker}: {result['text']}")

    return "".join(entries)


In [None]:
# Convert every MP4 to MP3 and collect the audio files to transcribe.
converted_mp3_files = []
for media_file in media_files:
    extension = os.path.splitext(media_file)[1].lower()
    if extension == ".mp4":
        try:
            audio_file = convert_mp4_to_mp3(media_file)
        except subprocess.CalledProcessError as e:
            print(f"Error converting {media_file}: {e}")
            continue
    elif extension == ".mp3":
        audio_file = media_file
    else:
        continue

    # An MP4 and an MP3 with the same stem resolve to the same audio path;
    # keep it once so the recording is not diarized and transcribed twice.
    if audio_file not in converted_mp3_files:
        converted_mp3_files.append(audio_file)

# Run the transcription.
if converted_mp3_files:
    for audio_file in converted_mp3_files:
        print(f"Processing: {audio_file}")
        # Split the audio into per-speaker segments.
        segments = diarize_audio(audio_file)
        # Transcribe each segment.
        transcribed_text = transcribe_with_speaker_segments(segments, audio_file, model)

        # Save the result next to the audio file. Only the final extension is
        # swapped, and UTF-8 is explicit because the transcript may contain
        # non-ASCII text.
        output_file = os.path.splitext(audio_file)[0] + ".txt"
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(transcribed_text)
        print(f"Saved transcription to: {output_file}")
else:
    print("テキスト化するファイルがありません。")


Converted /content/drive/My Drive/TranscriptionFolder/ZeroPlus_student-opinion-meeting.mp4 to /content/drive/My Drive/TranscriptionFolder/ZeroPlus_student-opinion-meeting.mp3
Processing: /content/drive/My Drive/TranscriptionFolder/ZeroPlus_student-opinion-meeting.mp3


pytorch_model.bin:   0%|          | 0.00/17.7M [00:00<?, ?B/s]

config.yaml:   0%|          | 0.00/318 [00:00<?, ?B/s]

INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.5.4 to v2.5.0.post0. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../root/.cache/torch/pyannote/models--pyannote--segmentation/snapshots/c4c8ceafcbb3a7a280c2d357aee9fbc9b0be7f9b/pytorch_model.bin`


Model was trained with pyannote.audio 0.0.1, yours is 3.3.2. Bad things might happen unless you revert pyannote.audio to 0.x.
Model was trained with torch 1.10.0+cu102, yours is 2.6.0+cu124. Bad things might happen unless you revert torch to 1.x.


hyperparams.yaml:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


embedding_model.ckpt:   0%|          | 0.00/83.3M [00:00<?, ?B/s]

mean_var_norm_emb.ckpt:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

classifier.ckpt:   0%|          | 0.00/5.53M [00:00<?, ?B/s]

label_encoder.txt:   0%|          | 0.00/129k [00:00<?, ?B/s]

