In [8]:
import logging
import os
import re
from urllib.parse import urlparse, parse_qs
import tempfile


from bs4 import BeautifulSoup
import jinja2
import requests

from tessa_chef import make_request, get_parsed_html_from_url, make_fully_qualified_url
from tessa_chef import get_license, get_text
from tessa_chef import get_section_filename, create_predictable_zip, download_file

from le_utils.constants import content_kinds, file_formats, licenses

import pprint
# Pretty-printer instance (4-space indent, 80-column width).
# None of the cells visible in this notebook call it yet.
pp = pprint.PrettyPrinter(indent=4, width=80)

In [9]:
# URL of the module page passed to download_module() in the driver cell.
# Keep exactly one assignment active; the commented-out lines are alternative
# modules (the trailing tags are presumably language codes -- TODO confirm).
test_module = 'http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=81108' # EN
# test_module = 'http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=82912' # AR
# test_module = 'http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=82514' # FR

In [10]:

def download_module(module_url):
    """
    Scrape one module: build its table of contents and download its pages.

    The page at `module_url` is parsed and a fresh temporary directory is
    created (its path is printed, not returned). An `index.html` rendered from
    the `module_index.html` jinja2 template is written into that directory,
    and every section and subsection page linked from the module's contents
    tree is saved there through `download_page`.

    Args:
        module_url: URL of the module page. Its query string must contain an
            `id` parameter, which becomes the module's `source_id`.

    Returns:
        dict with keys `kind` ('TessaModuleContentsDict'), `source_id`,
        `title` and `children`. Each child is a section dict (`kind`, `title`,
        `href`, `filename`, `children`); each of its children is a subsection
        dict (`kind`, `title`, `href`, `filename`).

    Raises:
        KeyError: if `module_url` has no `id` query parameter.
        AttributeError: if the page lacks a `<title>` element or the
            `ul.oucontent-tree-current-section` contents tree.
    """
    print('Scrapring module @ url =', module_url)
    doc = get_parsed_html_from_url(module_url)
    destination = tempfile.mkdtemp()
    print('destination=', destination)

    source_id = parse_qs(urlparse(module_url).query)['id'][0]
    raw_title = doc.select_one("head title").text
    # Strip the site prefix and the per-language course tags from the <title>.
    title = raw_title.replace('OLCreate:', '')\
            .replace('TESSA_ARABIC', '')\
            .replace('TESSA_Eng', '')\
            .replace('TESSA_Fr', '')\
            .strip()

    contents_dict = dict(
        kind='TessaModuleContentsDict',
        source_id=source_id,
        title=title,
        children=[],
    )

    section_lis = doc.find('ul', class_='oucontent-tree-current-section').find_all('li', recursive=False)
    for section_index, section_li in enumerate(section_lis):
        section_link = section_li.find('a')
        section_href = section_link['href']
        # TODO: special case for first section --- since it doesn't save section in filename
        #       manually call download_page with filename section_1.html with contents of current page
        if section_index == 0:
            section_filename = 'section_1.html'
        else:
            section_filename = get_section_filename(section_href)

        # A section without subsections has no nested <ul>; find() then
        # returns None and must not be dereferenced.
        subsections_ul = section_li.find('ul')
        accesshide_span = section_li.find('span', class_='accesshide')
        if subsections_ul is not None:
            subsection_lis = subsections_ul.find_all('li')
        else:
            subsection_lis = []

        # Detach the hidden span and the nested list so that only the
        # section's own text is left when the title is read.
        if accesshide_span:
            accesshide_span.extract()
        if subsections_ul is not None:
            subsections_ul.extract()
        section_title = get_text(section_li)

        section_dict = dict(
            kind='TessaModuleSection',
            title=section_title,
            href=section_href,
            filename=section_filename,
            children=[],
        )
        contents_dict['children'].append(section_dict)

        for subsection_li in subsection_lis:
            subsection_link = subsection_li.find('a')
            subsection_href = subsection_link['href']
            subsection_filename = get_section_filename(subsection_href)
            subaccesshide_span = subsection_li.find('span', class_='accesshide')
            if subaccesshide_span:
                subaccesshide_span.extract()
            subsection_title = get_text(subsection_li)
            subsection_dict = dict(
                kind='TessaModuleSubsection',
                title=subsection_title,
                href=subsection_href,
                filename=subsection_filename,
            )
            section_dict['children'].append(subsection_dict)

    # Read the template through a context manager so the handle is closed.
    with open('../chefdata/templates/module_index.html', encoding='utf-8') as template_file:
        module_index_tmpl = jinja2.Template(template_file.read())
    index_contents = module_index_tmpl.render(module=contents_dict)
    # Explicit UTF-8: titles may be Arabic or French, and the platform's
    # default encoding is not guaranteed to be able to represent them.
    with open(os.path.join(destination, "index.html"), "w", encoding='utf-8') as f:
        f.write(index_contents)

    # download the html content from each section/subsection
    for section in contents_dict['children']:
        download_page(section['href'], destination, section['filename'], section)
        for subsection in section['children']:
            download_page(subsection['href'], destination, subsection['filename'], subsection)

    # TODO: package `destination` with create_predictable_zip() and return an
    #       HTML5AppNode dict (source_id, title, license, HTMLZipFile path).
    #       That packaging step is not wired in yet, so callers receive the
    #       table-of-contents dict.
    return contents_dict



def download_page(page_url, destination, filename, page_info):
    """
    Download one section/subsection page together with its static assets.

    The page at `page_url` is parsed. Every `img[src]`, `link[href]` and
    `script[src]` target is downloaded into `destination`, and the attribute
    is rewritten to the local file name. The modified document is then
    written to `destination/filename`.

    Args:
        page_url: URL of the page to fetch.
        destination: directory that receives the page and its assets.
        filename: name of the HTML file to create inside `destination`.
        page_info: the section/subsection dict supplied by the caller;
            accepted for interface compatibility but not used yet.

    Returns:
        None.
    """
    print('Scrapring section/subsectino...', filename)
    doc = get_parsed_html_from_url(page_url)

    # TODO: edit doc so only contentdiv remains
    # TODO: decide on a per-page source_id (page id + filename, or the
    #       `section` query parameter) and record it together with the page
    #       title in `page_info`.

    def download_assets(selector, attr, middleware=None):
        """Fetch every node matching `selector` and point `attr` at the local copy."""
        print('donwloading assets', selector)
        for i, node in enumerate(doc.select(selector)):
            url = make_fully_qualified_url(node[attr])
            # The index prefix keeps names unique within one selector.
            # NOTE(review): basename() keeps any query string that follows the
            # last '/', so names may contain '?' -- confirm download_file and
            # the HTML viewer cope with that.
            asset_filename = "%s_%s" % (i, os.path.basename(url))
            node[attr] = asset_filename
            download_file(url, destination, request_fn=make_request, filename=asset_filename, middleware_callbacks=middleware)

    def js_middleware(content, url, **kwargs):
        """Pass-through hook for JS assets; returns `content` unchanged."""
        return content

    # Download all static assets.
    # TODO(davidhu): Also download fonts referenced in http://www.africanstorybook.org/css/app.css
    download_assets("img[src]", "src")  # Images
    download_assets("link[href]", "href")  # CSS
    download_assets("script[src]", "src", middleware=js_middleware) # JS

    # Explicit UTF-8: pages may contain Arabic or French text, and the
    # platform's default encoding is not guaranteed to be able to represent it.
    with open(os.path.join(destination, filename), "w", encoding='utf-8') as f:
        f.write(str(doc))


In [11]:
# Scrape the selected test module; download_module() returns its
# table-of-contents dict (sections under 'children', subsections nested below).
toc = download_module(test_module)

# Uncomment to print the scraped section/subsection outline:
# for section in toc['children']:
#     print('  - section:', section['title'], section['filename'])
#     for subsection in section['children']:
#         print('    - subsection:', subsection['title'], subsection['filename'])


Scrapring module @ url = http://www.open.edu/openlearncreate/mod/oucontent/view.php?id=81108
destination= /var/folders/k3/r74jr38d56v717n39fd073f80000gn/T/tmpn5wu59c1
Scrapring section/subsectino... section_1.html


KeyboardInterrupt: 

In [5]:

def passa():


