In [47]:
import logging
import os
import re
from urllib.parse import urlparse, parse_qs
import tempfile


from bs4 import BeautifulSoup
import requests

from tessa_chef import make_request, get_parsed_html_from_url, make_fully_qualified_url
from tessa_chef import get_license, get_text

from le_utils.constants import content_kinds, file_formats, licenses

import pprint
pp = pprint.PrettyPrinter(indent=4, width=80)

In [41]:
# Module landing page passed to download_module in a later cell.
# NOTE(review): the EN / AR tags presumably mean the English and Arabic
# editions of the module -- confirm. Swap the commented line in to try the other.
test_module = 'http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=81108' # EN
# test_module = 'http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=82912' # AR

In [50]:

# Raw strings are used so that ``\(`` and ``\)`` reach the regex engine as
# escaped parentheses instead of being parsed as (invalid) string escapes.

# Matches an inline ``background-image:url(...)`` declaration; group 1 is
# everything between the opening parenthesis and the last ``)`` on the line.
BG_IMG_RE = re.compile(r"background-image:url\((.*)\)")
# Matches a ``sendFacebook(a, b, ...)`` call; group 1 is everything after the
# second comma up to the last ``)``.
SEND_FACEBOOK_RE = re.compile(r"sendFacebook\([^,]*,[^,]*,(.*)\)")


def download_module(module_url):
    """
    Scrape a module landing page and return its table of contents.

    The page is fetched with `get_parsed_html_from_url`. The module id comes
    from the `id` query parameter of `module_url`, the title from the page's
    `<head><title>` with the known site/collection markers removed, and the
    sections from the `ul.oucontent-tree-current-section` list.

    Args:
        module_url: URL of the module page; must carry an `id` query parameter.

    Returns:
        dict with keys `kind` ('TessaModuleContentsDict'), `source_id`,
        `title` and `children`. Each child is a section dict with `title`,
        `href` and `children`; each of those children is a subsection dict
        with `title` and `href`. A section without a nested list gets an
        empty `children` list.

    Raises:
        KeyError: if `module_url` has no `id` query parameter.
    """
    print('Scapring module...')
    doc = get_parsed_html_from_url(module_url)
    # NOTE(review): the directory is created and reported but nothing is
    # written to it yet; presumably reserved for the commented-out zip export
    # that follows this function -- confirm.
    destination = tempfile.mkdtemp()
    print('destination=', destination)

    source_id = parse_qs(urlparse(module_url).query)['id'][0]

    # Strip the site and collection markers from the page title.
    title = doc.select_one("head title").text
    for marker in ('OLCreate:', 'TESSA_ARABIC', 'TESSA_Eng', 'TESSA_Fr'):
        title = title.replace(marker, '')
    title = title.strip()

    contents_dict = dict(
        kind='TessaModuleContentsDict',
        source_id=source_id,
        title=title,
        children=[],
    )

    toc_ul = doc.find('ul', class_='oucontent-tree-current-section')
    for section_li in toc_ul.find_all('li', recursive=False):
        section_href = section_li.find('a')['href']
        accesshide_span = section_li.find('span', class_='accesshide')
        subsections_ul = section_li.find('ul')

        # find() returns None when the section has no nested list; treat that
        # as "no subsections" instead of failing on None.find_all.
        if subsections_ul is None:
            subsection_lis = []
        else:
            subsection_lis = subsections_ul.find_all('li')

        # Detach the accesshide span and the nested list before reading the
        # title so that their text is not part of it.
        if accesshide_span:
            accesshide_span.extract()
        if subsections_ul is not None:
            subsections_ul.extract()

        section_dict = dict(
            title=get_text(section_li),
            href=section_href,
            children=[],
        )
        contents_dict['children'].append(section_dict)

        for subsection_li in subsection_lis:
            subsection_href = subsection_li.find('a')['href']
            subaccesshide_span = subsection_li.find('span', class_='accesshide')
            if subaccesshide_span:
                subaccesshide_span.extract()
            section_dict['children'].append(dict(
                title=get_text(subsection_li),
                href=subsection_href,
            ))

    # TODO: write str(doc) to os.path.join(destination, "index.html") once the
    # HTML export is wired up; that step was previously placed after the
    # return statement and never ran.
    return contents_dict

#     zip_path = create_predictable_zip(destination)

#     module_dict = dict(
#         kind='HTML5AppNode',
#         source_id=source_id,
#         title=title,
#         license=get_license(licenses.CC_BY_NC_SA, copyright_holder='TESSA'),
#         description='fake descri',
#         files=[{'file_type':'HTMLZipFile', 'path':zip_path}],
#     )
#     return module_dict

                


def download_page(page_url):
    """Placeholder for scraping one module page: prints a notice and returns None."""
    print('Scapring module page...')
    return None
    

In [51]:
# Scrape the test module and pretty-print the value download_module returns.
toc = download_module(test_module)
pprint.pprint(toc)

Scapring module...
destination= /var/folders/k3/r74jr38d56v717n39fd073f80000gn/T/tmpzd5zpg36
{'children': [{'children': [{'href': 'http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=81108&section=1.1',
                             'title': '1. Organising pupils in groups and '
                                      'pairs'},
                            {'href': 'http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=81108&section=1.2',
                             'title': '2. Finding out what pupils think and '
                                      'feel'},
                            {'href': 'http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=81108&section=1.3',
                             'title': '3. Respecting differences'},
                            {'href': 'http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=81108&section=1.4',
                             'title': 'Resource 1: Similarities and '
                                      'd

In [53]:
# Sample section URLs for trying out get_section_filename. The second
# assignment overwrites the first, so only the `section=2` URL is in effect.
sec_url = 'http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=81108&section=1.3'

sec_url = 'http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=81108&section=2'

def get_section_filename(sec_url):
    """
    Build an HTML filename from the `section` query parameter of `sec_url`.

    Dots in the section number become underscores, e.g. `section=1.3`
    yields 'section_1_3.html'. Raises KeyError when the URL has no
    `section` parameter.
    """
    query_params = parse_qs(urlparse(sec_url).query)
    section_number = query_params['section'][0]
    safe_number = '_'.join(section_number.split('.'))
    return 'section_' + safe_number + '.html'

# Display the filename derived from the sample URL.
get_section_filename(sec_url)

'section_2.html'

In [28]:

def passa():
    """
    Holding area for asset-download code; takes no arguments and returns None.

    NOTE(review): the body reads `doc`, `destination` and `download_file`,
    none of which are defined or imported in the visible part of this
    notebook, so calling this as-is would presumably raise NameError --
    confirm before use. The comments mention africanstorybook.org, so the
    code looks carried over from another scraper -- TODO confirm.
    """

    def download_assets(selector, attr, middleware=None):
        """
        For each node matching `selector`, download the URL held in `attr`
        and rewrite `attr` to the local filename "<index>_<basename of URL>".

        `middleware` is forwarded to `download_file` as `middleware_callbacks`.
        """
        nodes = doc.select(selector)
        for i, node in enumerate(nodes):
            url = make_fully_qualified_url(node[attr])
            # The enumerate index prefix keeps filenames distinct when two
            # URLs share the same basename.
            filename = "%s_%s" % (i, os.path.basename(url))
            node[attr] = filename
            download_file(url, destination, request_fn=make_request, filename=filename, middleware_callbacks=middleware)

    def js_middleware(content, url, **kwargs):
        """Return `content` with `window.localStorage` swapped for a stub object."""
        # Polyfill window.localStorage as iframes can't access localStorage.
        return content.replace("window.localStorage",
                "({setItem: function(){}, removeItem: function(){}})")

    # Download all static assets.
    # TODO(davidhu): Also download fonts referenced in http://www.africanstorybook.org/css/app.css
    download_assets("img[src]", "src")  # Images
    download_assets("link[href]", "href")  # CSS
    download_assets("script[src]", "src", middleware=js_middleware) # JS

    # Download all background images, e.g. <div style="background-image:url()">
    # (africanstorybook.org uses these for the main picture found on each page
    # of the storybook.)
    bg_img_nodes = doc.select("div[style*=\"background-image:url(\"]")
    for i, node in enumerate(bg_img_nodes):
        style = node["style"]
        match = BG_IMG_RE.search(style)
        if not match:
            continue

        url = make_fully_qualified_url(match.group(1))
        filename = "%s_%s" % (i, os.path.basename(url))
        # Point the inline style at the local copy before downloading it.
        node["style"] = BG_IMG_RE.sub("background-image:url(%s)" % filename, style)
        download_file(url, destination, request_fn=make_request, filename=filename)

    # Hide the African Storybook header nav bar.
    header = doc.select_one("#headerBar")
    if header:
        header["style"] = "display: none;"
