In [100]:
from bs4 import BeautifulSoup
from html2text import html2text
import requests
import urllib
import logging

# from le_utils.constants import content_kinds
from le_utils.constants import licenses
from ricecooker.classes import nodes, files
from ricecooker.classes.nodes import ChannelNode, HTML5AppNode, TopicNode, VideoNode, DocumentNode, ExerciseNode
from ricecooker.commands import uploadchannel
from ricecooker.exceptions import UnknownContentKindError, UnknownFileTypeError, raise_for_invalid_channel
from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter
from ricecooker.config import LOGGER

In [101]:
LOGGER.setLevel(logging.INFO)

sess = requests.Session()
cache = FileCache('.webcache')
basic_adapter = CacheControlAdapter(cache=cache)
forever_adapter= CacheControlAdapter(heuristic=CacheForeverHeuristic(), cache=cache)

sess.mount('http://', basic_adapter)
sess.mount('https://', basic_adapter)
sess.mount('http://www.prathamopenschool.org', forever_adapter)
sess.mount('https://www.prathamopenschool.org', forever_adapter)


In [102]:
# Get list of main topics
BASE_URL = 'http://www.prathamopenschool.org'
LANGUAGE = 'hn'

def get_absolute_path(path):
    return urllib.parse.urljoin(BASE_URL, path)

def get_page(path):
    url = get_absolute_path(path)
    resp = sess.get(url)
    return BeautifulSoup(resp.content, 'html.parser')

def get_source_id(path):
    return path.strip("/").split("/")[-1]


In [103]:
def get_topics(channel, language):
    doc = get_page(language)
    menu_row = doc.find('div', {'id': 'menu-row'})
    for topic in menu_row.find_all('a'):
        if topic['href'] == '#':
            continue
        title = get_source_id(topic['href'])
        source_id = topic.get_text()
        node = TopicNode(title=title, source_id=source_id)
        channel.add_child(node)

In [106]:
def construct_channel(*args, **kwargs):
    channel = ChannelNode(
        title="Pratham Open School",
        source_domain=BASE_URL,
        source_id="pratham-open-school",
    )
    get_topics(channel, LANGUAGE)
    return channel
LOGGER.addHandler(logging.StreamHandler())
construct_channel().print_tree()

      Pratham Open School (ChannelNode): 14 descendants
         Mathematics (TopicNode): 0 descendants
         English (TopicNode): 0 descendants
         Health (TopicNode): 0 descendants
         Science (TopicNode): 0 descendants
         Hospitality (TopicNode): 0 descendants
         Construction (TopicNode): 0 descendants
         Automobile (TopicNode): 0 descendants
         Electric (TopicNode): 0 descendants
         Beauty (TopicNode): 0 descendants
         Healthcare (TopicNode): 0 descendants
         Std8 (TopicNode): 0 descendants
         Fun (TopicNode): 0 descendants
         Story (TopicNode): 0 descendants
         CRS104 (TopicNode): 0 descendants


In [41]:
print(topics)

['/hn/Course/Mathematics', '/hn/Course/English', '/hn/Course/Health', '/hn/Course/Science', '/hn/Course/Hospitality', '/hn/Course/Construction', '/hn/Course/Automobile', '/hn/Course/Electric', '/hn/Course/Beauty', '/hn/Course/Healthcare', '/hn/Course/Std8', '/hn/Fun', '/hn/Story', '/hn/gamelist/CRS104']


In [48]:
def get_subtopics(topic):
    doc = get_page(topic)
    menu_row = doc.find('div', {'id' : 'body-row'}).find('div', {'class' : 'col-md-2'})
    subtopics = []
    for subtopic in menu_row.find_all('a'):
        subtopics.append(subtopic['href'])
    return subtopics
subtopics = get_subtopics(topics[0])

In [56]:
def get_lessons(subtopic):
    doc = get_page(subtopic)
    menu_row = doc.find('div', {'id' : 'body-row'}).find('div', {'class' : 'col-md-9'})
    lessons = []
    for lesson in menu_row.find_all('a'):
        lessons.append(lesson['href'])
    return lessons
lessons = get_lessons(subtopics[0])

In [76]:
def get_contents(lesson):
    doc = get_page(lesson)
    menu_row = doc.find('div', {'id' : 'row-exu'})
    contents = []
    for content in menu_row.find_all('div', {'class':'col-md-3'}):
        title = content.find('div', {'class' : 'txtline'}).get_text()
        link = content.find('a', {'title' : 'Download'})['href']
        contents.append((title, link))
    return contents
print (get_contents(lessons[0]))

[('त्रिभुजों के प्रकार ', '../../CourseContent/Mathematics/Videos/h_triangle_types.mp4'), ('त्रिभुज के मज़ेदार खेल ', '../../CourseContent/Mathematics/Videos/h_triangle_fun.mp4'), ('त्रिभुज और उसके प्रकार ', '../../CourseContent/Mathematics/Read/Hindi/Reading Cards 7 - H.pdf'), ('त्रिभुज 1 ', '../../CourseContent/Mathematics/Read/Hindi/Reading Cards 6 - H.pdf'), ('त्रिभुज 2 ', '../../CourseContent/Mathematics/Read/Hindi/Reading Cards 8 - H.pdf'), ('त्रिभुज बनायें ', '../../CourseContent/Games/Mathematics.zip'), ('तुक्काबाजी क्षेत्रफल - त्रिभुज ', '../../CourseContent/Games/Mathematics.zip')]
