## Connect to Google Drive

In [2]:
# Mount Google Drive into the Colab filesystem; the data folder used by this notebook
# lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Install dependencies

In [1]:
# moviepy is imported below for audio extraction and subtitle rendering.
# imageio is not imported directly in this notebook (presumably pinned for moviepy
# compatibility -- TODO confirm).
!pip install moviepy==2.0.0.dev2 --quiet
!pip install imageio==2.25.1 --quiet

In [2]:
# ImageMagick is needed by the subtitle cells (moviepy renders TextClip text with it).
# -y answers the confirmation prompt, since a notebook cell has no interactive stdin;
# apt-get is used because `apt` does not promise a stable interface for scripted use.
!apt-get install -y --quiet imagemagick

Reading package lists...
Building dependency tree...
Reading state information...
imagemagick is already the newest version (8:6.9.10.23+dfsg-2.1ubuntu11.7).
0 upgraded, 0 newly installed, 0 to remove and 24 not upgraded.


In [3]:
# Relax ImageMagick's security policy by rewriting every "none" to "read,write".
# Edit in place with `sed -i`: with `cat file | sed ... > file` the shell truncates
# policy.xml before `cat` gets to read it, which typically leaves the file empty.
!sed -i 's/none/read,write/g' /etc/ImageMagick-6/policy.xml

In [4]:
# pysrt reads and writes the .srt files handled in the translation and subtitle cells.
!pip install pysrt==1.1.2 --quiet

In [5]:
# Whisper is installed from the head of its GitHub repository, so the version is not
# pinned and may differ between runs.
!pip install git+https://github.com/openai/whisper.git -q

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone


## Define Data Folder

In [6]:
# Root folder on the mounted Drive; the video/, audio/, srt/, models/ and subtitle/
# sub-folders used below are all derived from it.
data_path = "/content/drive/MyDrive/project2/"

## Extract Audio

In [None]:
import os
import moviepy.editor as mp

video_path = data_path + 'video/'
audio_path = data_path + 'audio/'

os.makedirs(audio_path, exist_ok=True)

# Sort so the processing order is the same on every run (os.listdir order is arbitrary).
mp4_files = sorted(f for f in os.listdir(video_path) if f.endswith('.mp4'))
for mp4_file in mp4_files:
    video = mp.VideoFileClip(video_path + mp4_file)
    try:
        audio = video.audio
        if audio is None:
            # A clip without an audio track has nothing to export.
            print(f'Skipping {mp4_file}: no audio track')
            continue
        # Same base name as the video, with the extension swapped to .mp3.
        audio.write_audiofile(audio_path + os.path.splitext(mp4_file)[0] + '.mp3')
    finally:
        # Release the readers held by the clip even if the export fails.
        video.close()

MoviePy - Writing audio in /content/drive/MyDrive/project2/audio/喜剧_1.mp3




MoviePy - Done.
MoviePy - Writing audio in /content/drive/MyDrive/project2/audio/喜剧_6.mp3




MoviePy - Done.
MoviePy - Writing audio in /content/drive/MyDrive/project2/audio/女装衣橱_2.mp3




MoviePy - Done.
MoviePy - Writing audio in /content/drive/MyDrive/project2/audio/女装衣橱_7.mp3




MoviePy - Done.
MoviePy - Writing audio in /content/drive/MyDrive/project2/audio/女装衣橱_9.mp3




MoviePy - Done.
MoviePy - Writing audio in /content/drive/MyDrive/project2/audio/房地产_0.mp3




MoviePy - Done.
MoviePy - Writing audio in /content/drive/MyDrive/project2/audio/房地产_1.mp3




MoviePy - Done.
MoviePy - Writing audio in /content/drive/MyDrive/project2/audio/新生儿玩具_1.mp3




MoviePy - Done.
MoviePy - Writing audio in /content/drive/MyDrive/project2/audio/新生儿玩具_3.mp3




MoviePy - Done.


## Transcribe and export to SRT

In [7]:
import os
import whisper
from datetime import timedelta
from whisper.utils import get_writer
from collections import defaultdict as ddict

srt_path = data_path + 'srt/'
os.makedirs(srt_path, exist_ok=True)

model_name = "large"
model_path = data_path + 'models/'
audio_path = data_path + 'audio/'
# Whisper's own SRT writer, pointed at the folder created above. It replaces an earlier
# hand-rolled formatter that truncated segment timestamps to whole seconds.
writer = get_writer('srt', srt_path)

model = whisper.load_model(model_name, download_root=model_path)

# Only the .mp3 exports are transcribed; any other entry in the folder (sub-directories,
# stray files) would otherwise be handed to the model as well. Sorted for a stable order.
audio_files = sorted(f for f in os.listdir(audio_path) if f.endswith('.mp3'))
for audio_file in audio_files:
    transcript = model.transcribe(audio_path + audio_file)
    # ddict(int) makes every writer option that is looked up read as 0.
    writer(transcript, audio_path + audio_file, ddict(int))

## Translate to Vietnamese

In [None]:
# `translators` provides the translate_text() call used below; --upgrade pulls the
# latest release, so the installed version is not pinned.
!pip install translators --upgrade -q

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.2/41.2 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.8/79.8 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.8/52.8 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m110.5/110.5 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.3/134.3 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for PyExecJS (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fol

In [None]:
import translators as ts
from pysrt import SubRipFile

def translate_srt_file(input_file, output_file, input_language, output_language):
    subs = SubRipFile.open(input_file, encoding='utf-8')
    for sentence in subs:
        sentence.text = ts.translate_text(sentence.text, translator='google', to_language='vi')
    subs.save(output_file, 'utf-8')

In [None]:
path = srt_path
translate_srt_file(path + '喜剧_1.srt', path + '喜剧_1_translated.srt', 'zh-cn', 'en')

## Create subtitle

In [113]:
import sys
import pysrt
from moviepy.editor import VideoFileClip, TextClip, CompositeVideoClip


def time_to_seconds(time_obj):
    """Convert a time object into a number of seconds.

    ``time_obj`` must expose ``hours``, ``minutes``, ``seconds`` and ``milliseconds``
    attributes; the milliseconds contribute the fractional part of the result.
    """
    whole_seconds = time_obj.hours * 3600 + time_obj.minutes * 60 + time_obj.seconds
    return whole_seconds + time_obj.milliseconds / 1000


def create_subtitle_clips(subtitles, videosize,fontsize=24, font='Arial', color='yellow', debug = False):
    """Build one positioned text clip per subtitle entry.

    Args:
        subtitles: Iterable of entries exposing ``start``, ``end`` and ``text``.
        videosize: ``(width, height)`` of the target video.
        fontsize, font, color: Forwarded to ``TextClip``.
        debug: Accepted but not used by this function.

    Returns:
        A list of text clips. Each one is captioned within 3/4 of the video width on a
        black background, starts at the entry's start time, lasts until its end time,
        and is horizontally centred at 4/5 of the video height.
    """
    def _clip_for(entry):
        # Timing of the entry, expressed in seconds.
        begins_at = time_to_seconds(entry.start)
        lasts_for = time_to_seconds(entry.end) - begins_at

        frame_width, frame_height = videosize

        caption = TextClip(
            entry.text,
            fontsize=fontsize,
            font=font,
            color=color,
            bg_color = 'black',
            size=(frame_width*3/4, None),
            method='caption',
        )
        timed_caption = caption.set_start(begins_at).set_duration(lasts_for)
        return timed_caption.set_position(('center', frame_height* 4 / 5))

    return [_clip_for(entry) for entry in subtitles]


In [None]:
subtitle_path = data_path + 'subtitle/'
os.makedirs(subtitle_path, exist_ok=True)

english_font = "Arial"
chinese_font = "Songti-SC-Black"
vietnamese_font = "Helvetica"

# Pair each video with the SRT file of the same base name. Zipping the two sorted
# directory listings mispairs files as soon as the folders differ in content, e.g. once
# a *_translated.srt file has been written into srt_path.
for mp4_file in sorted(f for f in os.listdir(video_path) if f.endswith('.mp4')):
    base_name = os.path.splitext(mp4_file)[0]
    srt_file = base_name + '.srt'
    if not os.path.exists(srt_path + srt_file):
        print(f'Skipping {mp4_file}: no matching {srt_file} in {srt_path}')
        continue

    video = VideoFileClip(video_path + mp4_file)
    try:
        subtitle = pysrt.open(srt_path + srt_file, encoding='utf-8')
        output_video_file = base_name + '_subtitled.mp4'

        subtitle_clips = create_subtitle_clips(subtitle, video.size, font=english_font)
        # The source video is the bottom layer; the subtitle clips are drawn on top of it.
        final_video = CompositeVideoClip([video] + subtitle_clips)
        final_video.write_videofile(subtitle_path + output_video_file)
    finally:
        # Release the source clip's readers even when rendering fails.
        video.close()

In [114]:
video_path = data_path + 'video/'
audio_path = data_path + 'audio/'
subtitle_path = data_path + 'subtitle/'
srt_path = data_path + 'srt/'

os.makedirs(subtitle_path, exist_ok=True)

english_font = "Arial"
chinese_font = "Songti-SC-Black"
vietnamese_font = "FreeMono"

# Name the clip once; the source video, its translated SRT and the output file name are
# all derived from it instead of repeating the literal.
source_video_file = '喜剧_1.mp4'
base_name = os.path.splitext(source_video_file)[0]
output_video_file = base_name + '_subtitled.mp4'

video = VideoFileClip(video_path + source_video_file)
try:
    subtitle = pysrt.open(srt_path + base_name + '_translated.srt', encoding='utf-8')

    subtitle_clips = create_subtitle_clips(subtitle, video.size, font=vietnamese_font)
    # The source video is the bottom layer; the subtitle clips are drawn on top of it.
    final_video = CompositeVideoClip([video] + subtitle_clips)
    final_video.write_videofile(subtitle_path + output_video_file)
finally:
    # Release the source clip's readers even when rendering fails.
    video.close()

FreeMono
Moviepy - Building video /content/drive/MyDrive/project2/subtitle/喜剧_1_subtitled.mp4.
MoviePy - Writing audio in 喜剧_1_subtitledTEMP_MPY_wvf_snd.mp3




MoviePy - Done.
Moviepy - Writing video /content/drive/MyDrive/project2/subtitle/喜剧_1_subtitled.mp4





Moviepy - Done !
Moviepy - video ready /content/drive/MyDrive/project2/subtitle/喜剧_1_subtitled.mp4


In [115]:
from IPython.display import HTML
from base64 import b64encode
# Read the rendered file inside a context manager so the handle is closed immediately
# instead of being left to the garbage collector.
with open(subtitle_path + output_video_file, 'rb') as rendered_file:
    mp4 = rendered_file.read()
# Embed the bytes as a base64 data URL inside the <video> tag's source.
data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=400 controls>
      <source src="%s" type="video/mp4">
</video>
""" % data_url)

In [5]:
import whisper


# NOTE(review): this cell rebinds `mp4` (the raw video bytes in the preview cell above)
# to a file name, and `model` to a different Whisper checkpoint than the "large" one
# loaded in the transcription section.
mp4 = "WoodyMeetsBuzz.mp4"
# "small" checkpoint, with the relative 'models' folder as the download root.
model = whisper.load_model("small", download_root='models')
# The video file path is passed to transcribe() directly, without a separate audio export.
transcript = model.transcribe(mp4)

100%|███████████████████████████████████████| 461M/461M [00:09<00:00, 52.4MiB/s]


In [10]:
from pprint import pprint

# Pretty-print one transcription segment (index 4) to inspect its fields.
pprint(transcript['segments'][4])

{'avg_logprob': -0.14527519929756239,
 'compression_ratio': 1.4115044247787611,
 'end': 31.0,
 'id': 4,
 'no_speech_prob': 0.0971389040350914,
 'seek': 2900,
 'start': 29.0,
 'temperature': 0.0,
 'text': " Why don't they answer?",
 'tokens': [50364, 1545, 500, 380, 436, 1867, 30, 50464]}


In [12]:
from whisper.utils import get_writer
from collections import defaultdict as ddict


# Build an SRT writer; output_dir is the empty string here.
writer = get_writer(output_format='srt', output_dir='')
# `audio_path` is a keyword argument of the writer call and receives the file name held
# in `mp4`; every option looked up in the defaultdict(int) reads as 0.
writer(transcript, audio_path=mp4, options=ddict(int))