# STEP 1: Crawler

Crawls the website and extracts the high-level topic and topic-cluster structure.

In [1]:
from itertools import groupby
import json
import os

import requests
from bs4 import BeautifulSoup

from ricecooker.utils.caching import CacheForeverHeuristic, FileCache, CacheControlAdapter


In [2]:
from mitblossoms_chef import (get_lang_paths,
                              get_all_lessons_info,
                              group_lesson_by_topic,
                              build_preliminary_tree,
                              retrieve_topic_clusters,
                              get_or_create_cluster,
                              add_topic_cluster_membership,
                              crawling_step)


In [3]:
# Root of the MIT BLOSSOMS site; the relative listing paths returned by
# get_lang_paths() are appended to it to form full URLs.
BASE_URL = 'https://blossoms.mit.edu'
# NOTE(review): not referenced by any visible cell -- presumably the index page
# that get_lang_paths() scrapes; confirm it is still needed here.
VIDEOS_BY_LANGUAGE_PATH = '/videos/by_language'

In [5]:
# globals
# NOTE(review): no visible cell reads or appends to this list -- confirm it is still needed.
all_lessons_seen = []    # store a list of all lessons (/videos/lessons/) seen during crawl

In [4]:

# One (language name, relative listing path) tuple per language on the site;
# each path is relative to BASE_URL.
lang_paths = get_lang_paths()
lang_paths

[('Arabic',
  '/videos?field_topic_value_many_to_one=All&term_node_tid_depth=62&term_node_tid_depth_1=All'),
 ('English',
  '/videos?field_topic_value_many_to_one=All&term_node_tid_depth=63&term_node_tid_depth_1=All'),
 ('Farsi',
  '/videos?field_topic_value_many_to_one=All&term_node_tid_depth=85&term_node_tid_depth_1=All'),
 ('Hindi',
  '/videos?field_topic_value_many_to_one=All&term_node_tid_depth=64&term_node_tid_depth_1=All'),
 ('Japanese',
  '/videos?field_topic_value_many_to_one=All&term_node_tid_depth=65&term_node_tid_depth_1=All'),
 ('Kannada',
  '/videos?field_topic_value_many_to_one=All&term_node_tid_depth=66&term_node_tid_depth_1=All'),
 ('Korean',
  '/videos?field_topic_value_many_to_one=All&term_node_tid_depth=67&term_node_tid_depth_1=All'),
 ('Malay',
  '/videos?field_topic_value_many_to_one=All&term_node_tid_depth=68&term_node_tid_depth_1=All'),
 ('Mandarin',
  '/videos?field_topic_value_many_to_one=All&term_node_tid_depth=69&term_node_tid_depth_1=All'),
 ('Portuguese',


In [7]:
len(lang_paths)

12

In [8]:
# Show only the relative listing path of each (language, path) pair.
for _lang_name, rel_path in lang_paths:
    print(rel_path)

/videos?field_topic_value_many_to_one=All&term_node_tid_depth=62&term_node_tid_depth_1=All
/videos?field_topic_value_many_to_one=All&term_node_tid_depth=63&term_node_tid_depth_1=All
/videos?field_topic_value_many_to_one=All&term_node_tid_depth=85&term_node_tid_depth_1=All
/videos?field_topic_value_many_to_one=All&term_node_tid_depth=64&term_node_tid_depth_1=All
/videos?field_topic_value_many_to_one=All&term_node_tid_depth=65&term_node_tid_depth_1=All
/videos?field_topic_value_many_to_one=All&term_node_tid_depth=66&term_node_tid_depth_1=All
/videos?field_topic_value_many_to_one=All&term_node_tid_depth=67&term_node_tid_depth_1=All
/videos?field_topic_value_many_to_one=All&term_node_tid_depth=68&term_node_tid_depth_1=All
/videos?field_topic_value_many_to_one=All&term_node_tid_depth=69&term_node_tid_depth_1=All
/videos?field_topic_value_many_to_one=All&term_node_tid_depth=70&term_node_tid_depth_1=All
/videos?field_topic_value_many_to_one=All&term_node_tid_depth=71&term_node_tid_depth_1=All

In [5]:


# Use Spanish for testing. The path is looked up by language name rather than
# by list position, so the cell keeps selecting Spanish even if the site adds
# or reorders languages.
test_lang_url = BASE_URL + dict(lang_paths)['Spanish']
video_lessons = get_all_lessons_info(test_lang_url)


In [9]:
# Count the lessons available in each language.
# A loop-local name holds each language's lessons so that `video_lessons`
# (the Spanish test set loaded in an earlier cell) is not overwritten by
# whichever language happens to be crawled last.
video_counts_by_lang = {}
for lang, lang_url in lang_paths:
    lang_lessons = get_all_lessons_info(BASE_URL + lang_url)
    video_counts_by_lang[lang] = len(lang_lessons)

video_counts_by_lang

{'Arabic': 48,
 'English': 123,
 'Farsi': 1,
 'Hindi': 1,
 'Japanese': 7,
 'Kannada': 5,
 'Korean': 1,
 'Malay': 18,
 'Mandarin': 31,
 'Portuguese': 44,
 'Spanish': 21,
 'Urdu': 10}

In [11]:
video_lessons[0]

{'title': 'Choosing a College Roommate: How Multi-Criteria Decision Modeling Can Help',
 'topic': 'Mathematics',
 'url': 'https://blossoms.mit.edu/videos/lessons/choosing_college_roommate_how_multi_criteria_decision_modeling_can_help'}

In [12]:

# presumably buckets the lesson dicts by their 'topic' field -- confirm in mitblossoms_chef
topics_list = group_lesson_by_topic(video_lessons)
# topics_list

In [13]:

# BUILD STAGE 1 INTERMEDIARY OUTPUT (by Topic)
# Builds the tree for Arabic and English (per the keyword argument). The result
# is a dict whose 'children' are per-language nodes carrying a 'lang' key, which
# the cells below rely on.
# NOTE(review): the keyword is spelled `selected_laungages`; presumably this
# matches the parameter name in mitblossoms_chef -- confirm before correcting it.
web_resource_tree = build_preliminary_tree(selected_laungages=['Arabic', 'English'])

# web_resource_tree

In [64]:
# Pick out the English and Arabic subtrees (IndexError if either is absent).
lang_nodes = web_resource_tree['children']
en_lang = [node for node in lang_nodes if node['lang'] == 'English'][0]
ar_lang = [node for node in lang_nodes if node['lang'] == 'Arabic'][0]

# Flatten topic -> lesson into a single list of lessons per language.
en_lessons = []
for topic in en_lang['children']:
    en_lessons.extend(topic['children'])
ar_lessons = []
for topic in ar_lang['children']:
    ar_lessons.extend(topic['children'])

print('Total videos in EN', len(en_lessons))
print('Total videos in AR', len(ar_lessons))

# Lesson URLs identify a lesson across languages, so compare the URL sets.
all_en_urls = {lesson['url'] for lesson in en_lessons}
all_ar_urls = {lesson['url'] for lesson in ar_lessons}

print('Number of lessons avail. in English but not in Arabic', len(all_en_urls.difference(all_ar_urls)))
print('Number of lessons avail. in Arabic but not in English', len(all_ar_urls.difference(all_en_urls)))




Total videos in EN 123
Total videos in AR 48
Number of lessons avail. in English but not in Arabic 82
Number of lessons avail. in Arabic but not in English 7


In [65]:
# BUILD STAGE 1 FINAL OUTPUT (by Topic, by TopicCluster)
# NOTE(review): the result is bound back to the same name, so re-running this
# cell feeds an already-processed tree to the helper -- confirm that is safe,
# or re-run the build_preliminary_tree cell first.
web_resource_tree = add_topic_cluster_membership(web_resource_tree)

# web_resource_tree

Processing topic Biology
Processing lesson Classifying Animals by Appearance Versus DNA Sequence
Processing lesson Discovering Medicines, Using Robots and Computers
Processing lesson How Does Biotechnology Help Clean Up The Environment?
Processing lesson Methods for Protein Purification
Processing lesson Plants and Environmental Resources
Processing lesson Roots, Shoots, and Wood
Processing lesson The Construction of Proteins
Processing lesson The Disease of Our Time: Diabetes
Processing lesson The Olive Propagation Problem
Processing lesson The Respiratory System of Birds
Processing lesson Wind and Sand
Processing lesson Yeasts, Molds and Mushrooms
Processing topic Chemistry
Processing lesson Catalytic Converter
Processing lesson How Much Small Is Small?
Processing lesson Is There a Connection between Water Desalination and Making Pickles?
Processing lesson Plastics and Covalent Chemical Bonds
Processing lesson Sustainable Energy: Can Water be the Future Fuel?
Processing lesson Why Ar

Processing lesson Selfish Drivers: The Braess Paradox and Traffic Planning
Processing lesson Sorting Algorithms
Processing lesson Taking Walks, Delivering Mail: An Introduction to Graph Theory
Processing lesson The Art of Approximation in Science and Engineering: How to Whip Out Answers Quickly
Processing lesson The Broken Stick Experiment: Triangles, Random Numbers and Probability
Processing lesson The Friendship Paradox: Why We Can’t All Be Popular
Processing lesson The Geometry of Parabolic Sand Dunes
Processing lesson The Mailman and the Five Packages: Data Packets and Data Transfer Speed
Processing lesson The Mathematics of Cryptography
Processing lesson The Mathematics of Voting
Processing lesson The Monty Hall Problem or How to Outsmart a Game Show and Win a Car
Processing lesson The Power of Exponentials, Big and Small
Processing lesson The Pythagorean Theorem: Geometry’s Most Elegant Theorem
Processing lesson The Quadratic Equation: It’s Hip to Be Squared
Processing lesson The

Total videos in EN 123
Total videos in AR 48
Number of lessons avail. in English but not in Arabic 82
Number of lessons avail. in Arabic but not in English 7


Number of lessons avail. in English but not in Arabic 82
Number of lessons avail. in Arabic but not in English 7


In [14]:
def print_tree(web_resource_tree):
    """Print an indented outline of the web resource tree.

    The tree is walked language -> topic -> (topic cluster | lesson). Lessons
    that sit inside a cluster are printed one level deeper, under the cluster.

    Args:
        web_resource_tree: dict with a 'children' list of language nodes. Each
            language node has 'lang' and 'children' (topic nodes); each topic
            node has 'title' and 'children'; each child of a topic has
            '__class__' and 'title', and cluster nodes also have 'children'.

    Raises:
        ValueError: if a child of a topic is neither a
            'MitBlossomsTopicCluster' nor a 'MitBlossomsVideoLessonResource'.
    """
    for lang_node in web_resource_tree['children']:
        print("Content in", lang_node['lang'])

        for topic_node in lang_node['children']:
            print(" "*4, topic_node['title'])

            for node in topic_node['children']:
                node_class = node['__class__']
                if node_class == 'MitBlossomsTopicCluster':
                    print(" "*8, 'Cluster:', node['title'])
                    for lesson_node in node['children']:
                        print(" "*12, lesson_node['title'])
                elif node_class == 'MitBlossomsVideoLessonResource':
                    print(" "*8, node['title'])
                else:
                    # Name the offending class so a malformed tree is easy to debug.
                    raise ValueError('Unknown node type encountered: %r' % (node_class,))
            # Blank line after each topic, and another after each language.
            print()
        print()

print_tree(web_resource_tree)

Content in Arabic
     Biology
         Cluster: DNA
             Classifying Animals by Appearance Versus DNA Sequence
         Cluster: Evolution
             Classifying Animals by Appearance Versus DNA Sequence
         Cluster: Health
             Discovering Medicines, Using Robots and Computers
             The Disease of Our Time: Diabetes
         Cluster: Environmental Sustainability
             How Does Biotechnology Help Clean Up The Environment?
             Plants and Environmental Resources
         Cluster: Botany
             Plants and Environmental Resources
             Roots, Shoots, and Wood
             Yeasts, Molds and Mushrooms
         Methods for Protein Purification
         The Construction of Proteins
         The Olive Propagation Problem
         The Respiratory System of Birds
         Wind and Sand

     Chemistry
         Cluster: Environmental Sustainability
             Sustainable Energy: Can Water be the Future Fuel?
         Catalytic Converter

In [15]:
def write_json_tree(web_resource_tree, datadir='chefdata'):
    """Serialize the web resource tree to ``<datadir>/web_resource_tree.json``.

    Args:
        web_resource_tree: JSON-serializable tree (nested dicts and lists).
        datadir: directory to write into; created if it does not exist yet.
            An empty string means the current working directory.
    """
    # A fresh checkout may not have the data directory yet; create it instead
    # of failing with FileNotFoundError. os.makedirs('') would raise, so skip
    # it when datadir is empty.
    if datadir:
        os.makedirs(datadir, exist_ok=True)
    json_path = os.path.join(datadir, 'web_resource_tree.json')
    # json.dump escapes non-ASCII characters by default, so the bytes written
    # are plain ASCII; pinning the encoding keeps the file independent of the
    # platform's locale default.
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(web_resource_tree, json_file, indent=2)


write_json_tree(web_resource_tree, datadir='../chefdata')