In [1]:
import os
import shutil
import subprocess

import pandas as pd
from moviepy.editor import *
from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
from numpy import asarray

In [2]:
# Point moviepy at the system-wide ffmpeg when one is actually on PATH;
# otherwise leave moviepy's own ffmpeg setting untouched.
# A system ffmpeg is necessary for using HW acceleration.
if shutil.which("ffmpeg") is not None:
    try:
        from moviepy.config import change_settings
    except ImportError as exc:
        # Best effort only: without this helper moviepy keeps its default binary.
        print("Could not reconfigure moviepy's ffmpeg binary: " + str(exc))
    else:
        change_settings({"FFMPEG_BINARY": "ffmpeg"})

In [3]:
# Location prefix of the input files. This can be empty if the video file and
# its videopipe output are at the same location as the code.
path = ''
# Base name shared by the video file and its detection output.
v_name = 'HIGH_LIGHTS_I_SNOWMAGAZINE_I_SANDER_26'
# File-name suffix identifying the text-detection result for `v_name`.
task = '_frame_text_detection_datamodel'
# NOTE(review): presumably the video resolution in pixels; not referenced by
# the cells visible here - confirm it is still needed.
w, h = 1920, 1080
# NOTE(review): not referenced by the cells visible here - confirm usage.
RESIZE_DIM = 640

In [4]:
## Read the text detection JSON (one JSON document per line).

# os.path.join supplies the separator itself, so `path` works with or without
# a trailing slash (plain concatenation silently produced a wrong file name).
detections_path = os.path.join(path, v_name, v_name + task + '.json')
text = pd.read_json(detections_path, lines = True)

# Keep only the entries of the first record's `data` field that contain text.
text_detected = [f for f in text.data[0] if len(f['text']) > 0]

# `dimension_idx` of every entry that contains text.
frames = [f['dimension_idx'] for f in text_detected]

In [15]:
## Read the video file with moviepy.

# Resolve the video relative to `path` (the same prefix used for the detection
# JSON) instead of always looking in the current working directory.
clip = VideoFileClip(os.path.join(path, v_name + '.mp4'))

audio = clip.audio
if audio is None:
    # Fail with a clear message rather than an AttributeError on None below.
    raise ValueError('The video ' + v_name + '.mp4 has no audio track to export.')

# Write audio to file
audio.write_audiofile(v_name + '_audio.mp3')

fps = clip.fps
# Seconds covered by a single frame.
frame_duration = 1 / fps

MoviePy - Writing audio in HIGH_LIGHTS_I_SNOWMAGAZINE_I_SANDER_26_audio.mp3


                                                                        

MoviePy - Done.




In [6]:
def get_frame(clip, frame_number):
    """Return frame `frame_number` of `clip` as a PIL image.

    The frame index is converted to a time by multiplying it with the
    module-level `frame_duration` before asking the clip for the frame.
    """
    seconds = frame_number * frame_duration
    pixels = clip.get_frame(seconds)
    return Image.fromarray(pixels)

In [7]:
def draw_text(texts, img):
    """Draw a red box and a "<text> (conf: <confidence>)" label per detection.

    Args:
        texts: mapping whose values are themselves mappings holding, in this
            order, left, top, width, height, confidence and the detected text.
        img: PIL image to annotate; it is modified in place.

    Returns:
        The same `img` object, for convenience.
    """
    # One drawing context serves every box; no need to rebuild it per detection.
    draw = ImageDraw.Draw(img)
    for key in texts:
        # NOTE(review): relies on the value order of each entry - confirm the
        # detection schema always yields left, top, width, height, conf, text.
        left, top, width, height, conf, detected = texts[key].values()
        draw.rectangle(((left, top), ((left + width), (top + height))), outline = 'red')
        label = detected + " (conf: " + str(conf) + ")"
        # `font` is the module-level font object.
        draw.text((left, top), label, font=font, fill = 'red')
    return img

In [11]:
# Set how many text-annotated frames are included in each intermediate video
# clip. A lower number will result in more video clips using less memory but
# more disk space. A higher number will result in fewer video clips using more
# memory but less disk space.
# NOTE(review): -1 was documented as "everything in one clip"; confirm that the
# chunking loop honours it before relying on that value.
text_limit = 100

# NOTE(review): the font file must be resolvable by PIL, presumably from
# /usr/share/fonts/truetype - confirm on the target machine.
font = ImageFont.truetype("Lato-Bold.ttf", 20)

# Each annotated still is shown for exactly one frame of the source video.
duration_t = frame_duration
# Time in seconds up to which the video has already been rendered.
prev_t = 0

# List file naming the intermediate clips; it is written by the chunk loop.
f = open('text_detection.txt', 'w')

def get_text_clips(text_detected, text_limit=100, timestamp=0):
    """Build the clip sequence for up to `text_limit` text detections.

    For each detection the untouched video between `timestamp` and the
    detected frame is added as a subclip, followed by a still image of the
    frame with its text boxes drawn on it. After the last entry of
    `text_detected` the remainder of the video is appended.

    Uses the module-level `clip`, `frame_duration` and `duration_t`.

    Args:
        text_detected: detections to process, each with 'dimension_idx' (the
            frame index) and 'text'; processing stops after `text_limit` items.
        text_limit: maximum number of detections to turn into stills.
        timestamp: time in seconds up to which the video is already covered.

    Returns:
        (clips, timestamp): the list of clips in playback order and the time
        in seconds up to which the video is covered afterwards.
    """
    clips = []
    last_index = len(text_detected) - 1
    for index, detection in enumerate(text_detected):
        if index == text_limit:
            break

        frame_idx = detection['dimension_idx']
        img = get_frame(clip, frame_idx)
        t = frame_idx * frame_duration

        draw_text(detection['text'], img)

        # Compare with a half-frame tolerance: k * d + d and (k + 1) * d can
        # differ by a rounding error, and exact inequality then inserted a
        # near-zero-length subclip between consecutive detected frames.
        if t - timestamp > frame_duration / 2:
            clips.append(clip.subclip(timestamp, t))

        clips.append(ImageClip(asarray(img), duration = duration_t))
        img.close()
        timestamp = t + duration_t

        # Positional check instead of comparing whole detection dicts.
        if index == last_index:
            # Only append a tail when some video is actually left.
            if clip.duration - timestamp > frame_duration / 2:
                clips.append(clip.subclip(timestamp, clip.duration))
            timestamp = clip.duration

    return clips, timestamp

# Render the annotated video in chunks and record every chunk file in the
# already opened list file `f`.

# A non-positive limit means "everything in a single chunk".
chunk_size = text_limit if text_limit > 0 else max(len(text_detected), 1)
# Ceiling division: an exact multiple of chunk_size must not produce a trailing
# empty chunk (concatenating an empty clip list fails). Always render at least
# one chunk so a video without any detected text still yields output.
chunk_count = max(1, -(-len(text_detected) // chunk_size))

try:
    for i in range(chunk_count):
        clips, prev_t = get_text_clips(text_detected[i * chunk_size:], chunk_size, prev_t)
        if not clips:
            # Nothing to annotate: pass the remaining video through unchanged.
            clips = [clip.subclip(prev_t, clip.duration)]
            prev_t = clip.duration

        chunk_path = 'text_detection_' + str(i) + '.mp4'
        chunk = concatenate_videoclips(clips)

        # Try hw acceleration, else use cpu
        try:
            chunk.write_videofile(chunk_path, fps=fps, codec='h264_nvenc', audio=False, logger=None, preset='3')
        except Exception:
            try:
                chunk.write_videofile(chunk_path, fps=fps, codec='libx264', audio=False, logger=None, preset='ultrafast')
            except Exception as err:
                # Chain the cause so the real encoder error stays visible.
                raise Exception('An error occured while writing the video file.') from err
        f.write('file ' + chunk_path + '\n')
finally:
    # Close the list file even when rendering fails part-way through.
    f.close()

In [12]:
final_path = v_name + '_text_detection.mp4'
audio_path = v_name + '_audio.mp3'

# Remove any existing output files so ffmpeg does not stop to ask about
# overwriting them.
for stale_path in ('output.mp4', final_path):
    if os.path.exists(stale_path):
        os.remove(stale_path)

# Concatenate all the files listed in text_detection.txt into one clip.
# Argument lists (no shell) keep file names with spaces or shell metacharacters
# intact; check=True stops here if ffmpeg fails, before anything is deleted.
subprocess.run(
    ['ffmpeg', '-f', 'concat', '-safe', '0', '-i', 'text_detection.txt',
     '-c', 'copy', 'output.mp4'],
    check=True,
)

# Add the audio to the final clip.
subprocess.run(
    ['ffmpeg', '-i', 'output.mp4', '-i', audio_path,
     '-c:v', 'copy', '-c:a', 'aac', '-map', '0:v:0', '-map', '1:a:0',
     '-shortest', final_path],
    check=True,
)

# Delete all the subclips. Their names are taken from the list file itself, so
# exactly the files that were written get removed.
with open('text_detection.txt') as listing:
    chunk_paths = [line[len('file '):].strip() for line in listing if line.startswith('file ')]
for chunk_path in chunk_paths:
    if os.path.exists(chunk_path):
        os.remove(chunk_path)

# Delete the text_detection.txt list file.
os.remove('text_detection.txt')

# Delete the audio file.
os.remove(audio_path)