In [1]:
import requests
from bs4 import BeautifulSoup

from bs4.element import Comment
import pandas as pd

In [2]:
# MIT OpenCourseWare course landing pages to scrape. Each URL ends with a
# trailing slash because page routes are appended to it by plain concatenation.
links = [
    'https://ocw.mit.edu/courses/' + course_slug + '/'
    for course_slug in (
        '1-00-introduction-to-computers-and-engineering-problem-solving-spring-2012',
        '1-010-uncertainty-in-engineering-fall-2008',
        '1-011-project-evaluation-spring-2011',
        '1-012-introduction-to-civil-engineering-design-spring-2002',
        '1-017-computing-and-data-analysis-for-environmental-applications-fall-2003',
    )
]

In [3]:
# Get Text from HTML
# https://stackoverflow.com/questions/1936466/how-to-scrape-only-visible-webpage-text-with-beautifulsoup
def tag_visible(element):
    """Decide whether a parsed text node counts as visible page text.

    Args:
        element: A text node from a BeautifulSoup tree; it must expose
            ``parent.name``.

    Returns:
        False when the node's parent tag is one of style, script, head,
        title, meta or the document root, or when the node itself is an HTML
        comment; True otherwise.
    """
    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    # The parent check comes first, so the comment check only runs for nodes
    # whose parent tag is not in the hidden list.
    return not (element.parent.name in hidden_parents or isinstance(element, Comment))


def text_from_html(body):
    """Extract the visible text of an HTML document as a single string.

    Args:
        body: HTML markup to parse with the built-in ``html.parser``.

    Returns:
        Every text node accepted by ``tag_visible``, stripped of surrounding
        whitespace and joined with single spaces. Whitespace-only nodes
        contribute empty strings, so the result may contain runs of spaces;
        callers that need normalized text should collapse them.
    """
    soup = BeautifulSoup(body, 'html.parser')
    # find_all(string=True) is the current spelling of the deprecated
    # findAll(text=True): it returns every text node, comments included,
    # which is why tag_visible still filters comments out below.
    texts = soup.find_all(string=True)
    return u" ".join(t.strip() for t in texts if tag_visible(t))

In [4]:
# Scrape every course in `links`: collect the visible text of each of its
# "/pages/..." sub-pages plus per-page and average word counts.
data = []              # one dict per course, in the same order as `links`

link_to_index = {}     # course URL -> position of its record in `data`
index_to_link = {}     # position in `data` -> course URL

count = 0

# Site-wide routes linked from every course page that carry no course content.
excluded_routes = {'pages/privacy-and-terms-of-use/'}
# Seconds to wait on each HTTP request; without a timeout a stalled
# connection would hang the cell indefinitely.
request_timeout = 30

for link in links:

    data_per_course = {}

    course_response = requests.get(link, timeout=request_timeout)
    # Fail loudly rather than silently parsing an HTTP error page.
    course_response.raise_for_status()
    soup = BeautifulSoup(course_response.text, 'html.parser')

    # Get all routes: keep only the part of each href from "/pages/" onwards,
    # expressed relative to the course URL.
    found_routes = []
    for anchor in soup.find_all('a', href=True):
        if '/pages/' in anchor['href']:
            found_routes.append('pages/' + anchor['href'].split('/pages/')[1])
    # The same page can be linked more than once; de-duplicate (keeping first-seen
    # order) so each page is downloaded and counted exactly once, and drop the
    # excluded routes without failing when a course page does not link them.
    matched_routes = [
        route for route in dict.fromkeys(found_routes)
        if route not in excluded_routes
    ]

    # Get all text from each route
    text_len = 0
    text_count = 0
    for route in matched_routes:
        url = link + route  # relies on `link` ending with '/'
        page_response = requests.get(url, timeout=request_timeout)
        page_response.raise_for_status()
        text = text_from_html(page_response.content.decode('utf-8'))

        # Column name is the first path segment after "pages/".
        # NOTE(review): nested routes such as pages/<section>/<sub-page>/ would
        # share a column name and overwrite each other - confirm none exist.
        col_name = route.split('/')[1]
        # Split on any whitespace so an empty page yields 0 words, not 1.
        words = text.split()
        raw_text = ' '.join(words)

        data_per_course[col_name] = raw_text
        data_per_course[col_name + '_length'] = len(words)

        text_len += len(words)
        text_count += 1

    # A course with no scraped pages has no meaningful average; store None
    # (JSON null / NaN in pandas) instead of dividing by zero.
    data_per_course['average_length'] = text_len / text_count if text_count else None

    data.append(data_per_course)

    link_to_index[link] = count
    index_to_link[count] = link
    count += 1


In [5]:
import json

# Persist the scraped per-course records to data.json in the working directory.
with open('data.json', 'w') as out_file:
    json.dump(obj=data, fp=out_file)

In [6]:
# Build one row per course from the scraped dicts. Columns are the union of
# every course's keys, so a page that a course lacks shows up as a missing value.
df = pd.DataFrame(data)
df

Unnamed: 0,syllabus,syllabus_length,instructor-insights,instructor-insights_length,readings,readings_length,lecture-notes,lecture-notes_length,recitations,recitations_length,...,calendar,calendar_length,application-examples,application-examples_length,textbook-resources,textbook-resources_length,projects,projects_length,related-resources,related-resources_length
0,Browse Course Material Syllabus Instructor Ins...,1854,Browse Course Material Syllabus Instructor Ins...,1290.0,Browse Course Material Syllabus Instructor Ins...,682.0,Browse Course Material Syllabus Instructor Ins...,917.0,Browse Course Material Syllabus Instructor Ins...,311.0,...,,,,,,,,,,
1,Browse Course Material Syllabus Calendar Lectu...,469,,,,,Browse Course Material Syllabus Calendar Lectu...,271.0,,,...,Browse Course Material Syllabus Calendar Lectu...,680.0,Browse Course Material Syllabus Calendar Lectu...,498.0,,,,,,
2,Browse Course Material Syllabus Subject Topics...,324,,,Browse Course Material Syllabus Subject Topics...,815.0,Browse Course Material Syllabus Subject Topics...,429.0,,,...,Browse Course Material Syllabus Subject Topics...,513.0,,,Browse Course Material Syllabus Subject Topics...,389.0,Browse Course Material Syllabus Subject Topics...,329.0,,
3,Browse Course Material Syllabus Calendar Readi...,348,,,Browse Course Material Syllabus Calendar Readi...,378.0,,,,,...,Browse Course Material Syllabus Calendar Readi...,365.0,,,,,Browse Course Material Syllabus Calendar Readi...,317.0,,
4,Browse Course Material Syllabus Calendar Lectu...,413,,,Browse Course Material Syllabus Calendar Lectu...,725.0,Browse Course Material Syllabus Calendar Lectu...,401.0,Browse Course Material Syllabus Calendar Lectu...,253.0,...,Browse Course Material Syllabus Calendar Lectu...,703.0,,,,,,,Browse Course Material Syllabus Calendar Lectu...,874.0


In [7]:
# Course URL -> position of that course's record in `data`.
link_to_index

{'https://ocw.mit.edu/courses/1-00-introduction-to-computers-and-engineering-problem-solving-spring-2012/': 0,
 'https://ocw.mit.edu/courses/1-010-uncertainty-in-engineering-fall-2008/': 1,
 'https://ocw.mit.edu/courses/1-011-project-evaluation-spring-2011/': 2,
 'https://ocw.mit.edu/courses/1-012-introduction-to-civil-engineering-design-spring-2002/': 3,
 'https://ocw.mit.edu/courses/1-017-computing-and-data-analysis-for-environmental-applications-fall-2003/': 4}

In [8]:
# Position of a record in `data` -> the course URL it was scraped from.
index_to_link

{0: 'https://ocw.mit.edu/courses/1-00-introduction-to-computers-and-engineering-problem-solving-spring-2012/',
 1: 'https://ocw.mit.edu/courses/1-010-uncertainty-in-engineering-fall-2008/',
 2: 'https://ocw.mit.edu/courses/1-011-project-evaluation-spring-2011/',
 3: 'https://ocw.mit.edu/courses/1-012-introduction-to-civil-engineering-design-spring-2002/',
 4: 'https://ocw.mit.edu/courses/1-017-computing-and-data-analysis-for-environmental-applications-fall-2003/'}