In [51]:
import requests
from readability.readability import Document
import pandas as pd
import bs4
from bs4 import Tag, BeautifulSoup
import hashlib
import os
from os import path
import sh
import multiprocessing
import docopt


class Extractor(object):
    """
    Extract text/ images sequences from a web page's main body
    """
    def __init__(self, base_url):
        self.cur_text = ''
        self.result = []
        self.base_url = base_url

    def get_abs_url(self, url):
        if url.startswith('http://') or url.startswith('https://'):
            return url
        else:
            return '%s/%s' % (self.base_url, url)

    def recursive_extract_text_image(self, obj):
        # reference:
        # http://stackoverflow.com/questions/20590624/python-beautifulsoup-div-text-and-img-attributes-in-correct-order
        for child in obj.children:
            if isinstance(child, Tag):
                #result.append(child.get('alt', ''))
                self.recursive_extract_text_image(child)
                if child.name == 'img':
                    self.result.append(('text', self.cur_text))
                    self.cur_text = ''
                    self.result.append(('image',
                                        self.get_abs_url(child['src'])
                    ))
            else:
                if len(child.strip()) > 0:
                    self.cur_text += ' ' + child.strip() + ' '

    def html_to_asset_list(self, html):
        """
        :param html: The html content in str
        :return: The extracted list of text/ image sequence
        """
        bs_obj = BeautifulSoup(html, 'html.parser')
        self.result = []
        self.cur_text = ''
        self.recursive_extract_text_image(bs_obj)
        self.result.append(('text', self.cur_text))
        return self.result


#global_pool = None

class Converter(object):
    def __init__(self, num_pools=4):
        #global global_pool
        #global_pool = multiprocessing.Pool(num_pools)
        pass

    def execute(self, command):
        return os.system(command)

    def execute_all(self, commands):
        #global global_pool
        #return global_pool.map(self.execute, commands)
        print(commands)
        for c in commands:
            os.system(c)

    def string2hash(self, s):
        m = hashlib.sha256()
        m.update(s.encode('utf-8'))
        return m.hexdigest()[:16]

    def get_audio_length(self, local_src):
        filename = local_src + '.wav'
        # Caveats: can only deal with < 60s audios
        # | grep Duration | cut -f1 -d, | cut -f4 -d:
        seconds = sh.soxi('-D', filename)
        return seconds.strip()

    def get_screen_play(self, url):
        """Download webpage and analyze basic sequence

        :param url:
        :return:
        """
        res = requests.get(url)
        html = res.content.decode('utf-8')
        # Analyze basic sequence
        readable_article = Document(html).summary()
        self.readable_article = readable_article
        readable_title = Document(html).title()
        self.readable_title = readable_title
        base_url = path.dirname(res.request.url)
        result = Extractor(base_url).html_to_asset_list(readable_article)
        #print(result)
        df_screenplay = pd.DataFrame(result, columns=['type', 'content'])
        df_screenplay['local_src'] = df_screenplay['content'].apply(lambda x: self.string2hash(x))
        image_selector = (df_screenplay['type'] == 'image')
        df_screenplay.loc[image_selector, 'filename'] = df_screenplay.loc[
            image_selector, 'content'].apply(lambda x: path.basename(x))
        df_screenplay.loc[image_selector, 'extname'] = df_screenplay.loc[
            image_selector, 'filename'].apply(lambda x: path.splitext(x)[1])
        df_screenplay = df_screenplay.fillna('')
        df_screenplay['download_name'] = df_screenplay['local_src'] + df_screenplay['extname']
        df_screenplay['converted_name'] = df_screenplay['local_src'] + '.png'

        self.df_screenplay = df_screenplay
        return df_screenplay

    def get_png_images(self):
        """Download images and convert to .png
        :return:
        """
        commands = []
        for (i, r) in self.df_screenplay.iterrows():
            if r['type'] == 'image':
                commands.append('wget {content} -O {download_name}'.format(**r))
        self.execute_all(commands)
        commands = []
        for (i, r) in self.df_screenplay.iterrows():
            if r['type'] == 'image':
                commands.append('convert {download_name} -geometry 600x400! {converted_name}'.format(**r))
        self.execute_all(commands)

    def text_to_speech(self, rate, voice):
        """ Generate audio via say (m4a) and convert to (wav)

        :return:
        """
        commands = []
        for (i, r) in self.df_screenplay.iterrows():
            if r['type'] == 'text':
                #commands.append('say --output-file={local_src}.m4a --voice=daniel --rate=220 --progress --file-format=m4af "{content}"'.format(**r))
                #commands.append('say --output-file={local_src}.m4a -v Ting-Ting --rate=300 --progress --file-format=m4af "{content}"'.format(**r))
                commands.append('say --output-file={local_src}.m4a -v {voice} --rate={rate} --progress --file-format=m4af "{content}"'.format(rate=rate, voice=voice, **r))
        self.execute_all(commands)
        # Convert to .wav
        commands = []
        for (i, r) in self.df_screenplay.iterrows():
            if r['type'] == 'text':
                commands.append('avconv -i {local_src}.m4a -y {local_src}.wav'.format(**r))
        self.execute_all(commands)
        # Analyze audio duration
        text_selector = (self.df_screenplay['type'] == 'text')
        self.df_screenplay.loc[text_selector, 'duration'] = self.df_screenplay.loc[text_selector, 'local_src'].apply(self.get_audio_length)

    def organise_scenes(self):
        """ Organise scenes
        From:

        :return:
        """
        scenes = []
        df_sp_orged = self.df_screenplay.reset_index()
        # Group the sequence
        df_sp_orged['group'] = df_sp_orged['index'].apply(lambda x: int((x + 1) / 2))
        for (gname, group) in df_sp_orged.groupby('group'):
            if len(group[group['type'] == 'image']) == 0:
                fn_image = 'default-image.png'
            else:
                fn_image = group[group['type'] == 'image']['converted_name'].values[0]

            if len(group[group['type'] == 'text']) == 0:
                duration = 1.53
                fn_audio = 'default-audio.mp4'
            else:
                duration = group[group['type'] == 'text']['duration'].values[0]
                fn_audio = group[group['type'] == 'text']['local_src'].values[0] + '.m4a'
            scenes.append(('%04d' % gname, fn_image, duration, fn_audio))
        df_scenes = pd.DataFrame(scenes, columns=['group', 'fn_image', 'duration', 'fn_audio'])

        df_scenes['fn_video_only'] = 'group' + df_scenes['group'] + '.mp4'
        df_scenes['fn_video'] = 'group' + df_scenes['group'] + '-a.mp4'
        # Following was used to solve non integer fps problem the conflicts with stanrdard
        # Now we already use output parameter to work around.
        #df_scenes['duration'] = df_scenes['duration'].apply(lambda x: int(np.ceil(float(x))))
        df_scenes['fn_image_resized'] = 'resized-' + df_scenes['fn_image']
        df_scenes['fn_audio_only'] = 'group' + df_scenes['group'] + '-audio.m4a'
        # To avoid too short clips
        df_scenes = df_scenes[df_scenes['duration'].apply(lambda x: float(x) > 0.1)]

        self.df_sp_orged = df_sp_orged
        self.df_scenes = df_scenes

    def prepare_default_assets(self):
        os.system('cp -f default/* .')

    def images_to_videos(self, screen_size):
        commands = []
        for (i, r) in self.df_scenes.iterrows():
            commands.append('convert {fn_image} -resize {screen_size} {fn_image_resized}'.format(screen_size=screen_size, **r))
        self.execute_all(commands)
        commands = []
        for (i, r) in self.df_scenes.iterrows():
            commands.append('ffmpeg -f image2 -r 1/{duration} -i {fn_image_resized} -qscale:v 1 -copyts -vcodec mpeg4 -y -r 25 {fn_video_only}'.format(**r))
        self.execute_all(commands)

    def videos_add_audio(self):
        commands = []
        for (i, r) in self.df_scenes.iterrows():
            commands.append('cp {fn_audio} {fn_audio_only}'.format(**r))
        self.execute_all(commands)
        commands = []
        for (i, r) in self.df_scenes.iterrows():
            commands.append('ffmpeg -i {fn_video_only} -i {fn_audio} -qscale:v 1 -copyts -vcodec copy -acodec copy -y {fn_video}'.format(**r))
            #commands.append('ffmpeg -i {fn_video_only} -i {fn_audio} -map 0:0 -map 1 -vcodec copy -acodec copy -y {fn_video}'.format(**r))
        self.execute_all(commands)

    def assemble_output(self, fn_output):
        open('playlist.txt', 'w').write('\n'.join(list(self.df_scenes['fn_video'].apply(lambda x: "file '%s'" % x))))
        os.system('ffmpeg -f concat -i playlist.txt -c copy -y %s' % fn_output)

    def convert(self, url, fn_output, rate=220, voice='Ting-Ting', screen_size='600x400!'):
        self.get_screen_play(url)
        self.get_png_images()
        self.text_to_speech(rate, voice)
        self.organise_scenes()
        self.prepare_default_assets()
        self.images_to_videos(screen_size)
        self.videos_add_audio()
        self.assemble_output(fn_output)


In [52]:
url='http://project.initiumlab.com/news2video/case1/index.html'
url = 'https://theinitium.com/article/20151127-mainland-government-officials-suicide-map/'

In [53]:
c = Converter()
#c.convert(url, 'out.mp4')
c.get_screen_play(url)

Unnamed: 0,type,content,local_src,filename,extname,download_name,converted_name
0,text,,e3b0c44298fc1c14,,,e3b0c44298fc1c14,e3b0c44298fc1c14.png
1,image,https://d3rhr7kgmtrq1v.cloudfront.net/media/d4...,e900fabec7ee6e02,jpg,,e900fabec7ee6e02,e900fabec7ee6e02.png
2,text,圖：端傳媒設計部 依據媒體公開報導，自2015年10月23日至11月23日，30天內，發...,2e4ed770b69aa2c3,,,2e4ed770b69aa2c3,2e4ed770b69aa2c3.png
3,image,https://d3rhr7kgmtrq1v.cloudfront.net/media/36...,9fdd97ae72c9ff1b,jpg,,9fdd97ae72c9ff1b,9fdd97ae72c9ff1b.png
4,text,圖：端傳媒設計部 儘管如此，我們仍然能從這份不完全統計裏窺見一些規律。比如，被媒體報導的...,d4599df7083eb5fd,,,d4599df7083eb5fd,d4599df7083eb5fd.png
5,image,https://d3rhr7kgmtrq1v.cloudfront.net/media/21...,2f851004b480a4bc,jpg,,2f851004b480a4bc,2f851004b480a4bc.png
6,text,圖：端傳媒設計部,04f567efa5d15822,,,04f567efa5d15822,04f567efa5d15822.png
7,image,https://d3rhr7kgmtrq1v.cloudfront.net/media/ed...,40ef9c5bfb693cf6,jpg,,40ef9c5bfb693cf6,40ef9c5bfb693cf6.png
8,text,圖：端傳媒設計部 根據蒐集到的新聞報導顯示，40至60歲的官員是「非正常死亡」的「高危人...,e3a9683990511c32,,,e3a9683990511c32,e3a9683990511c32.png
9,image,https://d3rhr7kgmtrq1v.cloudfront.net/media/0f...,24d7a6edce2e8ea5,jpg,,24d7a6edce2e8ea5,24d7a6edce2e8ea5.png


In [54]:
c.readable_title

'圖解中國官員非正常死亡，基層官員超過2/3 | 端傳媒 Initium Media'

In [73]:
os.system(
    'say --output-file=title.m4a -v Ting-Ting --rate=180 --progress --file-format=m4af "%s"' % c.readable_title
)
os.system(
    'avconv -i title.m4a -y title.wav'
)

0

In [74]:
length = c.get_audio_length('title')

In [75]:
c.get_png_images()

['wget https://d3rhr7kgmtrq1v.cloudfront.net/media/d47a1245af8e4b818a1bd78ea0b82c23.jpg?imageView2/1/w/1080/h/720/format/jpg -O e900fabec7ee6e02', 'wget https://d3rhr7kgmtrq1v.cloudfront.net/media/365f3167248143e6aaf7444cc651b178.jpg?imageView2/1/w/1080/h/720/format/jpg -O 9fdd97ae72c9ff1b', 'wget https://d3rhr7kgmtrq1v.cloudfront.net/media/21b8d3a88f8047a7b414afb07520418a.jpg?imageView2/1/w/1080/h/720/format/jpg -O 2f851004b480a4bc', 'wget https://d3rhr7kgmtrq1v.cloudfront.net/media/ed1f7e0844ff4ef698a434ca1740996c.jpg?imageView2/1/w/1080/h/721/format/jpg -O 40ef9c5bfb693cf6', 'wget https://d3rhr7kgmtrq1v.cloudfront.net/media/0f93f762b76a4ae59764ec2a21f05d0b.jpg?imageView2/1/w/1080/h/720/format/jpg -O 24d7a6edce2e8ea5', 'wget https://d3rhr7kgmtrq1v.cloudfront.net/media/fad736b805154eceb3e03a86416737d6.jpg?imageView2/1/w/1080/h/720/format/jpg -O 971ae5ee810ce74c']
['convert e900fabec7ee6e02 -geometry 600x400! e900fabec7ee6e02.png', 'convert 9fdd97ae72c9ff1b -geometry 600x400! 9fdd97ae7

In [76]:
df_images = c.df_screenplay[c.df_screenplay['type'] == 'image']
os.system('rm -f image*.png')
order=1
for (i, r) in df_images.iterrows():
    os.system('cp %s image%02d.png' % (r['converted_name'], order))
    order += 1

In [77]:
length

'7.801905'

In [78]:
per_image_length = 1.0 * float(length) / len(df_images)

In [79]:
os.system('ffmpeg -f image2 -r 1/{length} -i image%02d.png -vcodec mpeg4 -r 25 -y movie.mp4'.format(length=per_image_length))

0

In [80]:
os.system('ffmpeg -i movie.mp4 -i title.m4a -vcodec copy -acodec copy -y output.mp4')

0