In [4]:
from moviepy.editor import VideoFileClip

# Paths to the input video and to the WAV file extracted from it
video_path = "video.mp4"
audio_output_path = "output_audio.wav"

# Extract the audio track from the video
video_clip = VideoFileClip(video_path)
try:
    audio_clip = video_clip.audio
    # `audio` is None when the video has no audio stream; fail with a clear
    # message instead of an AttributeError on the write call below.
    if audio_clip is None:
        raise ValueError(f"No audio track found in {video_path!r}")
    # Save the audio as WAV (16-bit little-endian PCM)
    audio_clip.write_audiofile(audio_output_path, codec="pcm_s16le")
finally:
    # Always release the reader resources held by the clip, even if the
    # export fails part-way through.
    video_clip.close()

chunk:   0%|          | 31/78806 [00:07<5:02:51,  4.34it/s, now=None]

MoviePy - Writing audio in output_audio.wav


chunk:   0%|          | 31/78806 [00:39<27:39:49,  1.26s/it, now=None]

MoviePy - Done.


In [1]:
import torch
import torchaudio
from transformers import Speech2TextProcessor, Speech2TextForConditionalGeneration
import srt
from datetime import timedelta
from moviepy.editor import VideoFileClip

# Sample rate (Hz) the audio is resampled to and that is declared to the processor
TARGET_SAMPLE_RATE = 16000
# Length of each transcription window, in seconds
CHUNK_SECONDS = 11
# Trailing fragments shorter than this (in seconds) are skipped: they cannot
# contain a usable utterance, and are presumably too short for the feature
# extractor to produce any frames (TODO confirm the exact minimum).
MIN_CHUNK_SECONDS = 0.1

# Use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the model (moved to the selected device) and its processor
model = Speech2TextForConditionalGeneration.from_pretrained("facebook/s2t-small-librispeech-asr").to(device)
processor = Speech2TextProcessor.from_pretrained("facebook/s2t-small-librispeech-asr")

# Load the audio file; the code below indexes it as (channels, samples)
audio_path = "output_audio.wav"  # Replace with your file path
waveform, sample_rate = torchaudio.load(audio_path)

# Downmix multi-channel audio (e.g. stereo extracted from a video) to mono.
# Without this, `squeeze()` leaves a 2-D array that the processor does not
# treat as a single utterance, and only one row's transcription is kept.
if waveform.size(0) > 1:
    waveform = waveform.mean(dim=0, keepdim=True)

# Resample to the target rate if the file uses a different one
if sample_rate != TARGET_SAMPLE_RATE:
    waveform = torchaudio.transforms.Resample(orig_freq=sample_rate, new_freq=TARGET_SAMPLE_RATE)(waveform)

# Chunk size in samples, plus the containers filled by the loop below
chunk_size = CHUNK_SECONDS * TARGET_SAMPLE_RATE
min_chunk_samples = int(MIN_CHUNK_SECONDS * TARGET_SAMPLE_RATE)
total_samples = waveform.size(1)
transcriptions = []  # plain text of every chunk, in order
segments = []        # srt.Subtitle entries with start/end times

# Transcribe chunk by chunk so memory use stays bounded for long recordings
for i in range(0, total_samples, chunk_size):
    chunk_end = min(i + chunk_size, total_samples)

    # 1-D mono samples for this window
    chunk = waveform[0, i:chunk_end]
    if chunk.numel() < min_chunk_samples:
        continue

    # Preprocess the audio chunk into model inputs on the selected device
    inputs = processor(chunk.numpy(), sampling_rate=TARGET_SAMPLE_RATE, return_tensors="pt").to(device)

    # Run inference without tracking gradients
    with torch.no_grad():
        generated_ids = model.generate(
            inputs["input_features"],
            attention_mask=inputs["attention_mask"],
            max_length=200,  # Adjust based on your VRAM availability
            num_beams=1
        )

    # Decode the generated token ids into text for this chunk
    transcription_chunk = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    transcriptions.append(transcription_chunk)

    # Subtitle timing is derived from the chunk's sample offsets
    start_time = timedelta(seconds=i / TARGET_SAMPLE_RATE)
    end_time = timedelta(seconds=chunk_end / TARGET_SAMPLE_RATE)
    segments.append(srt.Subtitle(index=len(segments) + 1, start=start_time, end=end_time, content=transcription_chunk))

    # Drop per-chunk tensors and release cached GPU memory before the next chunk
    del inputs, generated_ids
    if device == "cuda":
        torch.cuda.empty_cache()

# Combine all chunk texts and render the subtitle list as SRT
final_transcription = " ".join(transcriptions)
srt_content = srt.compose(segments)

# Save the SRT file; the encoding is explicit so the result does not depend on
# the platform's default locale encoding
srt_path = "output_subtitles.srt"
with open(srt_path, "w", encoding="utf-8") as f:
    f.write(srt_content)

print(f"Subtitles saved as {srt_path}")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of Speech2TextForConditionalGeneration were not initialized from the model checkpoint at facebook/s2t-small-librispeech-asr and are newly initialized: ['model.decoder.embed_positions.weights', 'model.encoder.embed_positions.weights']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Subtitles saved as output_subtitles.srt


In [None]:
import ffmpeg
# Burn the generated subtitles into the video (requires the ffmpeg executable
# to be installed and on PATH). `srt_path` comes from the transcription cell.
video_path = "video.mp4"  # Replace with your video file path
output_video_path = "video_with_subtitles.mp4"

import os
import subprocess

# Fail early with a clear message if an input is missing, rather than letting
# ffmpeg report it in the middle of its own log output
for required_path in (video_path, srt_path):
    if not os.path.exists(required_path):
        raise FileNotFoundError(f"Required input file not found: {required_path}")

# Pass the arguments as a list so no shell is involved: paths containing
# spaces or shell metacharacters are handed to ffmpeg verbatim.
# NOTE: the value after `subtitles=` is still parsed by ffmpeg's filtergraph
# syntax, so a path containing characters such as ':' or '\' needs ffmpeg
# filter escaping.
ffmpeg_command = [
    "ffmpeg",
    "-y",  # overwrite the output so re-running the cell does not stall on ffmpeg's overwrite prompt
    "-i", video_path,
    "-vf", f"subtitles={srt_path}",
    output_video_path,
]

# check=True raises CalledProcessError on a non-zero exit status, so the
# success message below is only printed when ffmpeg actually succeeded
subprocess.run(ffmpeg_command, check=True)

print(f"Final video with subtitles saved as {output_video_path}")