In [33]:
from bs4 import BeautifulSoup			  #For parsing the HTML
from collections import OrderedDict		#For storing chapter and lesson info
from subprocess import Popen, PIPE, STDOUT #For accessing pandoc
from urllib.request import urlopen, urlretrieve		 #For fetching the webpage
import json								#For dealing with website data that comes in JSON
import pandas as pd						#For create Zettelkasten-style filenames
import re								  #For regex parsing of `sct`
import subprocess						  #For accessing pandoc
import pprint

In [66]:
def get_whole_course(link):
	'''Download a whole course as Markdown notes, one text file per chapter.

	Steps: get the ordered dict of chapters/lessons, create one unique z_number
	per chapter plus one extra for the table of contents, write a notes file for
	each chapter, then get the course name and write the table of contents.

	link: URL of the course landing page.

	Chapters that have no lessons are skipped (no notes file is written for them)
	because there is no lesson link to fetch the chapter content from.

	Feeds to: get_course_outline(), make_z_number(), make_chapter_notes(), 
	get_course_title(), create_table_of_contents()'''
	course_dictionary = get_course_outline(link)
	# The extra z_number (last in the list) is used by create_table_of_contents().
	z_list = make_z_number(len(course_dictionary) + 1)
	for z_index, (chapter, lessons) in enumerate(course_dictionary.items()):
		if not lessons:
			print('Skipping chapter with no lessons:', chapter)
			continue
		filename = z_list[z_index] + ' ' + chapter + '.txt'
		# Each entry is (lesson_name, lesson_link). The second lesson is used when
		# it exists (the historical behaviour); a one-lesson chapter falls back to
		# its only lesson instead of raising IndexError.
		chapter_link = lessons[min(1, len(lessons) - 1)][1]
		make_chapter_notes(filename, chapter_link)
	course_name = get_course_title(link)
	create_table_of_contents(course_dictionary, course_name, z_list)

def get_course_outline(link):
	'''Receives link to course landing page from get_whole_course(). 
	Returns ordered dict of chapters with lessons:
	{chapter_title: [(lesson_name, lesson_link), ...], ...}

	The page's <h4> and <li> elements are scanned in document order. An element
	whose first CSS class is 'chapter__title' starts a new chapter; an element
	whose first CSS class is 'chapter__exercise' is added as a lesson of the most
	recent chapter. Elements without a class are ignored, as are lesson items
	that appear before any chapter title or that lack an <h5> name or a link
	with an href.'''
	# Close the HTTP response as soon as the page has been parsed.
	with urlopen(link) as html:
		soup = BeautifulSoup(html, 'lxml')
	chapters = OrderedDict()
	current_chapter = None
	for item in soup.find_all(['h4', 'li']):
		css_classes = item.attrs.get('class')
		if not css_classes:
			continue
		class_type = css_classes[0]
		if class_type == 'chapter__title':
			current_chapter = item.text.strip()
			chapters[current_chapter] = []
		elif class_type == 'chapter__exercise':
			if current_chapter is None:
				# A lesson listed before any chapter title has nowhere to go.
				continue
			heading = item.find('h5')
			anchor = item.find('a')
			if heading is None or anchor is None:
				continue
			lesson_link = anchor.attrs.get('href')
			if lesson_link is None:
				continue
			chapters[current_chapter].append((heading.text, lesson_link))
	return chapters

def make_z_number(num):
	'''Build `num` unique 14-digit strings: the current time formatted as
	YYYYMMDDHHMM followed by a zero-padded two-digit counter (00, 01, ...).
	The two-digit counter is why `num` must stay below 100.'''
	assert num < 100, 'Enter an int that is less than 100. Max list size is 99.'
	z_numbers = []
	for counter in range(num):
		# The timestamp is re-read on every pass, exactly as before.
		minute_stamp = pd.to_datetime('now').strftime('%Y%m%d%H%M')
		z_numbers.append(minute_stamp + str(counter).rjust(2, '0'))
	return z_numbers

def make_chapter_notes(filename, link):
	'''Receives filename and lesson link from get_whole_course().
	(Note that a link from any lesson in a chapter will work. 
	That is, any lesson link has all the information for the chapter.)
	Cycles through all exercises in the chapter, writing each supported exercise
	type to the text file as Markdown, then downloads the chapter slides.

	filename: path of the chapter notes file (expected to end in '.txt').
	link: URL of any lesson in the chapter.

	Exercises of type 'VideoExercise', and any other type without a printer,
	are skipped.

	Feeds to: get_lesson_json(), NormalExercise_print(), BulletExercise_print(), 
	MultipleChoiceExercise_print(), download_chapter_slides()'''
	lesson_json = get_lesson_json(link)
	printers = {
		'NormalExercise': NormalExercise_print,
		'BulletExercise': BulletExercise_print,
		'MultipleChoiceExercise': MultipleChoiceExercise_print,
	}
	for item in lesson_json['exercises']['all']:
		printer = printers.get(item['type'])
		if printer is not None:
			printer(item, filename)
	# Remove the '.txt' suffix only. str.strip('.txt') removes any run of the
	# characters '.', 't', 'x' from BOTH ends and so mangled chapter titles
	# ending in those letters.
	suffix = '.txt'
	slides_name = filename[:-len(suffix)] if filename.endswith(suffix) else filename
	download_chapter_slides(lesson_json, slides_name)

def get_course_title(link):
	'''Receives link to course landing page from get_whole_course().
	Returns the text of the page's <title> element, unmodified.'''
	# Close the HTTP response as soon as the page has been parsed.
	with urlopen(link) as html:
		soup = BeautifulSoup(html, 'lxml')
	return soup.title.text

def create_table_of_contents(dictionary, course_name, z_list):
	'''Receives course dictionary, course name, and list of unique z_numbers from get_whole_course().
	Appends a Markdown table of contents to the file '<last z_number> <course_name>.txt':
	one heading per chapter carrying a wiki-style [[z_number]] link, followed by a
	bullet for each lesson name.

	dictionary: ordered mapping {chapter: [(lesson_name, lesson_link), ...]}.
	course_name: used in the output filename.
	z_list: z_numbers in chapter order; the last entry names the contents file.'''
	filename = z_list[-1] + ' ' + course_name + '.txt'
	with open(filename, 'a') as f:
		# enumerate() gives each chapter's position directly instead of searching
		# the key list again on every iteration.
		for z_index, (chapter, lessons) in enumerate(dictionary.items()):
			print('\n# ', '[[' + z_list[z_index] + ']]', chapter, '\n', file=f)
			for lesson_name, _lesson_link in lessons:
				print("   *", lesson_name, file=f)

def get_lesson_json(link):
	'''Receives lesson link from make_chapter_notes() and returns 
	the dictionary that holds all the information for the lesson's parent chapter.

	The data is read from the first <script> element whose text contains an
	assignment to window.PRELOADED_STATE; the JSON value that follows the '='
	is decoded and returned. Anything after the JSON value (such as a trailing
	';') is ignored.

	Raises ValueError if no script on the page assigns window.PRELOADED_STATE.'''
	# Close the HTTP response as soon as the page has been parsed.
	with urlopen(link) as html:
		soup = BeautifulSoup(html, 'lxml')
	# Match the assignment itself. str.strip('window.PRELOADED_STATE=') would
	# remove a SET of characters from both ends, not this prefix.
	assignment = re.compile(r'window\.PRELOADED_STATE\s*=\s*')
	for script in soup.find_all('script'):
		text = script.string
		if not text:
			continue  # external or empty script: nothing to search
		found = assignment.search(text)
		if found is None:
			continue
		# raw_decode() parses one JSON document from the start of the string
		# and tolerates trailing text after it.
		lesson_json, _ = json.JSONDecoder().raw_decode(text[found.end():])
		return lesson_json
	raise ValueError('No window.PRELOADED_STATE script found at ' + str(link))
                
def NormalExercise_print(json, f):
	'''Works with make_chapter_notes. Appends one NormalExercise lesson to the
	file at path `f` as Markdown: title, exercise text, instructions, the
	sample code and solution in fenced blocks, and the success message.

	json: the exercise dict (keys read: 'title', 'assignment', 'instructions',
	'sample_code', 'solution', 'sct').
	Feeds to: convert_2_md(), get_success_msg().'''
	with open(f, 'a') as notes:
		def emit(*parts):
			print(*parts, file=notes)

		emit('#', json['title'], '\n')
		emit('## Exercise\n')
		emit(convert_2_md(json['assignment']))
		emit('## Instructions\n')
		# The last two characters of the instructions are dropped before conversion.
		trimmed_instructions = json['instructions'][:-2]
		emit(convert_2_md(trimmed_instructions))
		emit('## Code\n')
		for key in ('sample_code', 'solution'):
			# Backslashes are removed from the converted code.
			code_text = convert_2_md(json[key]).replace('\\', '')
			emit('```\n' + code_text, '\n```\n')
		emit(get_success_msg(json['sct']) + '\n')

def BulletExercise_print(json, f):
	'''Works with make_chapter_notes. Appends one BulletExercise lesson to the
	file at path `f` as Markdown: title, exercise text, then for every
	sub-exercise its instructions, sample code, solution and success message.

	json: the exercise dict (keys read: 'title', 'assignment', 'subexercises';
	each sub-exercise provides 'instructions', 'sample_code', 'solution', 'sct').
	Feeds to: convert_2_md(), get_success_msg().'''
	with open(f, 'a') as notes:
		heading = '# ' + json['title']
		print(heading, '\n', file=notes)
		print('## Exercise\n', file=notes)
		assignment_md = convert_2_md(json['assignment'])
		print(assignment_md, file=notes)
		print('## Instructions & Code \n', file=notes)  
		for sub in json['subexercises']:
			print(convert_2_md(sub['instructions']), file=notes)
			# Code is written as-is; the solution's closing fence has no extra newline.
			for key, closing_fence in (('sample_code', '\n```\n'), ('solution', '\n```')):
				print('```\n' + sub[key] + closing_fence, file=notes)
			print(get_success_msg(sub['sct']) + '\n', file=notes)

def MultipleChoiceExercise_print(json, f):
	'''Works with make_chapter_notes. Appends one MultipleChoiceExercise lesson
	to the file at path `f` as Markdown: title, exercise text, one bullet per
	choice, the correct answer in bold, and the success message.

	json: the exercise dict (keys read: 'title', 'assignment', 'instructions',
	'sct'); 'instructions' is iterated as the list of choices.
	Feeds to: convert_2_md(), get_correct_mc(), get_success_msg().'''
	with open(f, 'a') as notes:
		heading = '# ' + json['title']
		print(heading, '\n', file=notes)
		print('## Exercise\n', file=notes)
		assignment_md = convert_2_md(json['assignment'])
		print(assignment_md, file=notes)
		print("## Choices\n", file=notes)
		for option in json['instructions']:
			bullet = '* ' + option
			print(bullet, file=notes)
		answer_line = '\n**Correct answer: ' + get_correct_mc(json['sct']) + '**\n'
		print(answer_line, file=notes)
		closing_line = get_success_msg(json['sct']) + '\n'
		print(closing_line, file=notes)

def convert_2_md(string):
	'''Receives a string of HTML and uses Pandoc to return it as a Markdown string.
	The HTML is sent to the pandoc process as UTF-8 on stdin; pandoc's stdout
	(with stderr merged into it) is decoded as UTF-8 and returned.'''
	pandoc_cmd = ['pandoc', '-f', 'html', '-t', 'markdown', '--wrap=preserve']
	process = Popen(pandoc_cmd, stdout=PIPE, stdin=PIPE, stderr=STDOUT)
	raw_output, _ = process.communicate(input=string.encode('utf-8'))
	return raw_output.decode('utf-8')

def get_success_msg(string):
	'''Returns the text inside the first success_msg("...") call found in a
	DataCamp `sct` string, or an empty string when there is none.'''
	found = re.search(r'success_msg\("(.*?)"\)', string)
	if found is None:
		return ''
	return found.group(1)

def get_correct_mc(string):
	'''Parses a DataCamp `sct` string and returns the correct answer number for
	MultipleChoice type lessons, as a string of digits. Returns '' when no
	test_mc(correct = N ...) call is found.
	Whitespace around `correct`, `=` and the number is optional, and the number
	may have more than one digit.
	Works with MultipleChoiceExercise_print'''
	match = re.search(r'test_mc\(\s*correct\s*=\s*(\d+)\s*[,)]', string)
	if match is None:
		return ''
	return match.group(1)

def download_chapter_slides(dictionary, filename):
	'''Receives lesson_json and filename from make_chapter_notes(). 
	Downloads the slides PDF linked from each entry of
	dictionary['course']['chapters'] and saves it as '<filename>.pdf'.
	Entries with no 'slides_link' (missing or empty) are skipped.

	dictionary: the lesson JSON dict.
	filename: output path without the '.pdf' extension.'''
	# NOTE(review): every chapter entry is saved to the same path, so the file
	# ends up holding the slides of the last entry that has a link. Selecting only
	# the current chapter needs a key this function does not receive - confirm
	# the intended behaviour.
	for item in dictionary['course']['chapters']:
		url = item.get('slides_link')
		if not url:
			continue
		urlretrieve(url, filename + '.pdf')

In [67]:
# Landing page of the course to download.
link = 'https://www.datacamp.com/courses/importing-data-in-r-part-1'
# Runs the whole pipeline for that course. This fetches pages over the network,
# invokes the external `pandoc` program (see convert_2_md) and writes the chapter
# and table-of-contents .txt files to relative paths.
get_whole_course(link)