In [2]:
from collections import defaultdict
import json
import os

import requests
import ricecooker
from bs4 import BeautifulSoup

from tessa_chef import get_modtype, get_resource_info
from tessa_chef import process_language_page
from tessa_chef import get_text, create_subpage_node
from tessa_chef import  TESSA_LANG_URL_MAP, TESSA_HOME_URL

from tessa_chef import DATA_DIR, TREES_DATA_DIR, CRAWLING_STAGE_OUTPUT_TPL, SCRAPING_STAGE_OUTPUT_TPL

import pprint
# Shared pretty-printer (4-space indent, 80-column width) for dumping nested structures.
pp = pprint.PrettyPrinter(indent=4, width=80)

In [3]:
from tessa_chef import TessaChef
# Chef instance used by the debugging cells below.
# NOTE(review): the variable has the same name as the `tessa_chef` module. Later
# `from tessa_chef import ...` statements still resolve the module (imports go
# through sys.modules), but the shared name is easy to misread.
tessa_chef = TessaChef()

## Explore crawling with the following debugging steps

In [4]:
# Language code used for this debugging session; also reused by later cells to
# build the crawl-output file names.
DEBUG_LANG = 'en'
# Run the crawl for that language. The first positional argument is passed as
# None; the second is an options dict carrying the language code.
tessa_chef.crawl(None, {'lang': DEBUG_LANG})




------- JULY 28 SPECIAL OUTPUT FOR CRAWLING REVIEW ------
crawling lang= en starting at http://www.open.edu/openlearncreate/course/view.php?id=2042



Processing TESSA page  EN     num of unfiltered activity links: 22
Adding oucontent Primary Curriculum framework
Adding oucontent Secondary Science Curriculum framework
Append description: The subject resources are divi..


Adding subpage Life Skills (primary)
http://www.open.edu/openlearncreate/mod/subpage/view.php?id=66754
 - Recognizd standard module structure. Taking whole module:
   Content (oucontent): Module 1: Personal development – how self-esteem impacts on learning
    - skipping other li Read or download individual sections of 
 - Recognizd standard module structure. Taking whole module:
   Content (oucontent): Module 2: Exploring social development
    - skipping other li Read or download individual sections of 
 - Recognizd standard module structure. Taking whole module:
   Content (oucontent): Module 3: Community issues

should never be here >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
 - NEW! Recognizd standard module structure. Taking whole module:
   Content (oucontent): Working with Pupils: A Guide for Teachers
   total oucontent sections skipped in prev module = 1
   total oucontent sections skipped in prev module = 0
   total oucontent sections skipped in prev module = 0
{'kind': 'TessaSubpage', 'url': 'http://www.open.edu/openlearncreate/mod/subpage/view.php?id=66699', 'source_id': 'topic_66699', 'title': 'Handbooks and toolkits for teachers and teacher educators', 'lang': 'en', 'description': '', 'children': [{'kind': 'TessaSubject', 'title': 'A handbook designed to support teacher educators to use TESSA materials in courses and programmes. It is for lecturers, advisers, professional development leaders, policy makers and other colleagues working in teacher education. Working with teachers includes the handbook for teachers, Working with pupils.', 'children': [{'url': 'http

In [1]:
# Titles of the "additional resources" activity as they appear in several
# languages.
# NOTE(review): presumably mirrors a list used by the crawler to recognise these
# activities -- confirm against tessa_chef.
ADDITIONAL_RESOURCES_TITLES = [
    'Additional resources',
    'Autres ressources',
    'موارد المواد',
    'Nyenzo za zaidi',
    # Deliberately not listed: 'Download the complete Pan-Africa English library'
    # (that activity is kept because PDFs can be handled).
]

# Print each title between angle brackets so its exact extent is visible.
for title in ADDITIONAL_RESOURCES_TITLES:
    print('<%s>' % title)

<Additional resources>
<Autres ressources>
<موارد المواد>
<Nyenzo za zaidi>


In [5]:
# Load the *unfiltered* crawl output for DEBUG_LANG and print an outline of it
# (tree -> categories -> resources -> children).
# The file name is the regular crawling-stage name with '.json' replaced by
# '_unfiltered.json'.
unfiltered_filename = CRAWLING_STAGE_OUTPUT_TPL.format(DEBUG_LANG).replace('.json', '_unfiltered.json')

# Read as UTF-8 explicitly: without `encoding`, open() uses the locale's
# preferred encoding, which is platform-dependent and can fail on non-ASCII text.
with open(os.path.join(TREES_DATA_DIR, unfiltered_filename), encoding='utf-8') as json_file:
    web_resource_tree = json.load(json_file)

print('----')
print('WEB RESOURCE TREE:', 'title:', web_resource_tree['title'], '  len(children) =', len(web_resource_tree['children']))
print('      description:', web_resource_tree['description'][0:60]+'..')
for category in web_resource_tree['children']:
    # Only the first 60 characters of each description are shown.
    print('   - Category   title:', category['title'])
    print('           desciption:', category['description'][0:60]+'..')

    for resource in category['children']:
        # Nodes without a 'kind' key fall back to their 'type' value; the value
        # is written back onto the node, so the loaded tree is updated in place.
        if 'kind' not in resource:
            resource['kind'] = resource['type']
        print('      - Resource (%s):' % resource['kind'], resource['title'])
        for child in resource['children']:
            print('         - Child (%s):' % child['kind'], child['title'])

print('\n\n')



----
WEB RESOURCE TREE: title: TESSA (EN)   len(children) = 4
      description: TESSA materials have been created and developed by experts ..
   - Category   title: Curriculum framework
           desciption: An outline of all the TESSA modules can be found in the fol..
      - Resource (oucontent): Primary Curriculum framework
      - Resource (oucontent): Secondary Science Curriculum framework
   - Category   title: Subject resources
           desciption:  The subject resources are divided into six subject areas wh..
      - Resource (TessaSubpage): Life Skills (primary)
         - Child (TessaModule): Module 1: Personal development – how self-esteem impacts on learning
         - Child (TessaModule): Module 2: Exploring social development
         - Child (TessaModule): Module 3: Community issues and citizenship
      - Resource (TessaSubpage): Literacy (primary)
         - Child (TessaModule): Module 1: Reading and writing for a range of purposes
         - Child (TessaModule

In [6]:
from tessa_chef import get_parsed_html_from_url

# Load the crawl output for DEBUG_LANG and print an outline of it, one level
# deeper than a plain resource/child listing
# (tree -> categories -> resources -> children -> subchildren).
crawl_output_path = os.path.join(TREES_DATA_DIR, CRAWLING_STAGE_OUTPUT_TPL.format(DEBUG_LANG))

# Read as UTF-8 explicitly: without `encoding`, open() uses the locale's
# preferred encoding, which is platform-dependent and can fail on non-ASCII text.
with open(crawl_output_path, encoding='utf-8') as json_file:
    web_resource_tree = json.load(json_file)

print('----')
print('WEB RESOURCE TREE:', 'title:', web_resource_tree['title'], '  len(children) =', len(web_resource_tree['children']))
print('      description:', web_resource_tree['description'][0:60]+'..')
for category in web_resource_tree['children']:
    # Only the first 60 characters of each description are shown.
    print('   - Category   title:', category['title'])
    print('           desciption:', category['description'][0:60]+'..')

    for resource in category['children']:
        # Nodes without a 'kind' key fall back to their 'type' value; the value
        # is written back onto the node, so the loaded tree is updated in place.
        if 'kind' not in resource:
            resource['kind'] = resource['type']
        print('      - Resource (%s):' % resource['kind'], resource['title'])
        for child in resource['children']:
            print('         - Child (%s):' % child['kind'], child['title'])
            for subchild in child['children']:
                print('            - Subchild (%s):' % subchild['kind'], subchild['title'])

print('\n\n')


----
WEB RESOURCE TREE: title: TESSA (EN)   len(children) = 4
      description: TESSA materials have been created and developed by experts ..
   - Category   title: Curriculum framework
           desciption: An outline of all the TESSA modules can be found in the fol..
      - Resource (oucontent): Primary Curriculum framework
      - Resource (oucontent): Secondary Science Curriculum framework
   - Category   title: Subject resources
           desciption:  The subject resources are divided into six subject areas wh..
      - Resource (TessaSubpage): Life Skills (primary)
         - Child (TessaModule): Module 1: Personal development – how self-esteem impacts on learning
         - Child (TessaModule): Module 2: Exploring social development
         - Child (TessaModule): Module 3: Community issues and citizenship
      - Resource (TessaSubpage): Literacy (primary)
         - Child (TessaModule): Module 1: Reading and writing for a range of purposes
         - Child (TessaModule

In [25]:
from tessa_chef import get_parsed_html_from_url

# Fetch the course page (the same URL the 'en' crawl output reports starting
# from) and collect the activity list items inside its course-content container.
page = get_parsed_html_from_url('http://www.open.edu/openlearncreate/course/view.php?id=2042')
course_content = page.find(class_="course-content")
pre_activity_links = course_content.find_all("li", class_="activity")
activity_links = list(pre_activity_links)

# Pick a single activity by position for closer inspection in the next cell.
activity = activity_links[12]

In [27]:
from tessa_chef import get_text
# Show the text extracted for the selected activity.
# NOTE(review): the recorded result is 'Additional\r resources' -- it contains a
# carriage return, so it is not equal to the 'Additional resources' entry in
# ADDITIONAL_RESOURCES_TITLES; confirm how titles are compared in tessa_chef.
get_text(activity)


'Additional\r resources'

In [10]:
# NOTE(review): `s` is not assigned in any visible cell, so on a fresh kernel
# this would raise NameError. The recorded result suggests it held a file-name
# template string -- confirm, then define it here or remove this cell.
s

'alksj_{}_bas.json'

In [8]:
# Base URL taken from the CONTENTWORKSHOP_URL environment variable, falling back
# to the default below; any trailing slashes are removed.
DOMAIN = os.getenv('CONTENTWORKSHOP_URL', "https://contentworkshop.learningequality.org/").rstrip('/')
DOMAIN

'https://contentworkshop.learningequality.org'

'https://contentworkshop.learningequality.org'