# Pre-process talks

Convert each talk's HTML content to Markdown and extract key information (year, month, title, author, and author role).

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
import os

from bs4 import BeautifulSoup
from markdownify import MarkdownConverter
from tqdm import tqdm
from urllib.parse import urljoin, urlparse

In [None]:
# config
# Directory whose entries are loaded as JSON records carrying 'url' and 'html' keys.
input_dir = '../data/raw'
# Directory that receives one pre-processed JSON file per input that has a title and content.
output_dir = '../data/pre_process'
# Parser name handed to BeautifulSoup when parsing each talk's HTML.
bs_parser = 'html.parser'

In [None]:
class MyConverter(MarkdownConverter):
    """
    MarkdownConverter that resolves links against a base URL and keeps paragraph ids.

    Accepts the same options as MarkdownConverter plus an optional ``base_url``
    keyword (default ``''``). Every ``href`` found on an ``<a>`` element is joined
    with that base URL before conversion, and every ``<p>`` element with a
    non-empty ``id`` gets an ``<a name="..."></a>`` anchor prepended to its text.
    """
    def __init__(self, *args, **kwargs):
        # All arguments, including base_url, are still forwarded to the base class.
        super().__init__(*args, **kwargs)
        self.base_url = kwargs.get('base_url', '')

    def convert_a(self, el, text, *args, **kwargs):
        """Join the link's href with the base url, then convert as usual."""
        # NOTE(review): the remaining hook argument(s) are forwarded untouched so
        # this override does not depend on what the base class calls them or on
        # whether they arrive positionally or by keyword - confirm against the
        # installed markdownify version.
        if 'href' in el.attrs:
            el['href'] = urljoin(self.base_url, el['href'])
        return super().convert_a(el, text, *args, **kwargs)

    def convert_p(self, el, text, *args, **kwargs):
        """Prefix the paragraph text with a named anchor when it has a non-empty id."""
        _id = el.get('id')
        if _id:
            # Raw HTML anchor, so the paragraph id remains addressable in the output.
            text = f'<a name="{_id}"></a>{text}'
        return super().convert_p(el, text, *args, **kwargs)

    
# Create shorthand method for custom conversion
def md(html, **options):
    """Convert *html* with a MyConverter built from the given keyword options."""
    converter = MyConverter(**options)
    return converter.convert(html)


def _clean(text):
    """Replace non-breaking space with normal space"""
    return text.replace(' ', ' ')


def get_talk_info(url, html):
    """
    Build the record for one talk from its URL and raw HTML.

    The year and month are the elements at index 3 and 4 of the URL path split
    on '/'; unpacking fails with ValueError if that slice does not hold exactly
    two items. Title, author and author role are the cleaned text of the first
    matching element each, or '' when nothing matches. Content is the
    'article div.body-block' element converted to markdown (links resolved
    against *url*) and cleaned, or '' when that element is absent.
    """
    segments = urlparse(url).path.split('/')
    year, month = segments[3:5]

    soup = BeautifulSoup(html, bs_parser)

    def _text_of(selector):
        # Cleaned text of the first match for the CSS selector, '' if none.
        node = soup.select_one(selector)
        return _clean(node.text) if node else ''

    body = soup.select_one('article div.body-block')
    if body:
        content = _clean(md(str(body), base_url=url))
    else:
        content = ''

    info = {'year': year, 'month': month, 'url': url}
    info['title'] = _text_of('article header h1')
    info['author'] = _text_of('article p.author-name')
    info['author_role'] = _text_of('article p.author-role')
    info['content'] = content
    return info


def write_talk_info(path, talk_info):
    """Write *talk_info* to *path* as UTF-8 JSON, 2-space indented, non-ASCII kept as-is."""
    with open(path, mode='w', encoding='utf-8') as out_file:
        json.dump(talk_info, out_file, indent=2, ensure_ascii=False)

In [None]:
# Pre-process every raw talk file and write the result under the same file name.
# Create the destination up front so the first write cannot fail on a missing directory.
os.makedirs(output_dir, exist_ok=True)

# Sorted so that progress and log messages come out in a reproducible order.
for filename in tqdm(sorted(os.listdir(input_dir))):
    path = os.path.join(input_dir, filename)
    # Sub-directories and other non-file entries cannot be opened as JSON; skip them.
    if not os.path.isfile(path):
        continue
    with open(path, encoding='utf8') as f:
        data = json.load(f)
    talk_info = get_talk_info(data['url'], data['html'])
    # A talk without a title or body is not written at all.
    if not talk_info['title'] or not talk_info['content']:
        print("Missing title or content - skipping", filename)
        continue
    # A missing author is only reported; the talk is still written.
    if not talk_info['author']:
        print("Missing author", filename)
    out_path = os.path.join(output_dir, filename)
    write_talk_info(out_path, talk_info)