In [6]:
import json
from pathlib import Path
import sqlite3

import requests
from bs4 import BeautifulSoup

In [2]:
# Download the page at elearning.hse.ru/mooc and parse it with the lxml parser.
# A timeout keeps the cell from hanging forever on an unresponsive server.
r = requests.get('https://elearning.hse.ru/mooc', timeout=30)
# Fail here on an HTTP error status instead of silently parsing an error page.
r.raise_for_status()
soup = BeautifulSoup(r.text, 'lxml')

In [3]:
# Collect every <h3> element first, then every <h2 class="with-indent3"> element;
# the combined list keeps that order (all h3 matches before the h2 matches).
h3_headers = soup.find_all('h3')
h2_headers = soup.find_all('h2', 'with-indent3')
headers = h3_headers + h2_headers
headers

[<h3>Coursera. Специализация "<a class="link external" href="https://www.coursera.org/specializations/finansovyye-instrumenty" rel="nofollow" target="_blank">Финансовые инструменты для частного инвестора<ins class="i"></ins></a>"</h3>,
 <h3>Coursera. Специализация "<a class="link external" href="https://www.coursera.org/specializations/korporativnyye-finansy" rel="nofollow" target="_blank">Корпоративные финансы и стоимость компании<ins class="i"></ins></a>"</h3>,
 <h3>Coursera. Специализация "<a class="link external" href="https://www.coursera.org/specializations/data-structures-algorithms" rel="nofollow" target="_blank">Data Structures and Algorithms<ins class="i"></ins></a>"</h3>,
 <h3>Coursera. Специализация "<a class="link external" href="https://www.coursera.org/specializations/discrete-mathematics" rel="nofollow" target="_blank">Introduction to Discrete Mathematics for Computer Science<ins class="i"></ins></a>"<br/><br/></h3>,
 <h3>Coursera. Специализация "<a class="link external

In [4]:
# Build one record per header: its text, its optional link, and the courses
# listed in the table that follows it.
specializations = []
for m in headers:
    specialization = {
        'name': m.text,
        # Some headers carry no <a>; store None for the link in that case.
        'link': m.a['href'] if m.a else None
    }
    courses = []
    # The course table is taken from the first <div> sibling after the header;
    # every <tr> in its <tbody> is treated as one course.
    for course in m.find_next_sibling('div').table.tbody.find_all('tr'):
        # A row must have exactly eight cells; the unpacking raises ValueError otherwise.
        name, teachers, freq, form, lang, sub, free, certificate = course.find_all('td')
        courses.append({
            'name': name.text.strip(),
            # Guarded the same way as the specialization link above, so a course
            # without a link yields None instead of raising TypeError.
            'link': name.a['href'] if name.a else None,
            # .get() yields None for an <a> without href instead of raising KeyError.
            'teachers': [
                {'name': teacher.text, 'link': teacher.get('href')}
                for teacher in teachers.find_all('a')
            ],
            'freq': freq.text.strip(),
            'form': form.text.strip(),
            # The last CSS class of each <ins> is stored as the language code.
            'languages': [flag['class'][-1] for flag in lang.find_all('ins')],
            'subtitles': [flag['class'][-1] for flag in sub.find_all('ins')],
            # Any non-whitespace text in the cell counts as True.
            'free': bool(free.text.strip()),
            'certificate': bool(certificate.text.strip())
        })
    specialization['courses'] = courses
    specializations.append(specialization)

In [12]:
# Save the scraped structure as indented JSON; ensure_ascii=False keeps
# non-ASCII text readable instead of \u-escaping it.
with Path('mooc.json').open('w', encoding='utf8') as out_file:
    out_file.write(json.dumps(specializations, ensure_ascii=False, indent=2))

In [7]:
def dict_factory(cursor, row):
    """Row factory that turns a result row into a ``{column_name: value}`` dict.

    Args:
        cursor: Cursor whose ``description`` lists the result columns; the
            first item of each entry is used as the key.
        row: Sequence of values for one result row, indexed by column position.

    Returns:
        dict mapping each column name to the value at the same position in
        ``row``. If a name occurs more than once, the last occurrence wins.
    """
    column_names = (column[0] for column in cursor.description)
    return {name: row[position] for position, name in enumerate(column_names)}

# Open (or create) the SQLite file. isolation_level=None disables the sqlite3
# module's implicit transaction handling, so statements run in autocommit mode.
conn = sqlite3.connect('mooc_hse.db', isolation_level=None)
# Rows fetched through this connection come back as dicts keyed by column name.
conn.row_factory = dict_factory
cursor = conn.cursor()

In [24]:
# Display the parsed structure to check it before loading it into SQLite.
specializations

[{'courses': [{'certificate': True,
    'form': 'сессии',
    'free': False,
    'freq': 'раз в 2 недели',
    'languages': ['ru'],
    'link': 'https://www.coursera.org/learn/upravlenie-lichnymi-finansami/',
    'name': 'Управление личными финансами',
    'subtitles': ['ru'],
    'teachers': [{'link': 'http://www.hse.ru/org/persons/65829',
      'name': 'Берзон Николай Иосифович'}]},
   {'certificate': True,
    'form': 'сессии',
    'free': False,
    'freq': 'раз в 4 недели',
    'languages': ['ru'],
    'link': 'https://www.coursera.org/learn/investicii-obligacii',
    'name': 'Инвестиции в облигации',
    'subtitles': ['ru'],
    'teachers': [{'link': 'https://www.hse.ru/org/persons/65863',
      'name': 'Столяров Андрей Иванович'}]},
   {'certificate': True,
    'form': 'сессии',
    'free': False,
    'freq': 'раз в 4 недели',
    'languages': ['ru'],
    'link': 'https://www.coursera.org/learn/investicii-akcii',
    'name': 'Инвестиции в акции',
    'subtitles': ['ru'],
    'te

In [25]:
# Create the tables. Existing ones are dropped first so the cell can be re-run.
# Drop order is child-first (teachers -> mooc_hse -> courses_group) so the drops
# also succeed if foreign-key enforcement is ever switched on for this connection.
cursor.execute('DROP TABLE IF EXISTS teachers')
cursor.execute('DROP TABLE IF EXISTS mooc_hse')
cursor.execute('DROP TABLE IF EXISTS courses_group')

# One row per specialization / course group.
cursor.execute('''CREATE TABLE courses_group (
    id integer PRIMARY KEY autoincrement,
    name text,
    link text
)''')

# One row per course; languages and subtitles are stored as comma-separated text.
cursor.execute('''CREATE TABLE mooc_hse (
    id integer PRIMARY KEY autoincrement,
    group_id int,
    name text,
    certificate int,
    form text,
    free int,
    freq  text,
    languages text,
    subtitles text,
    link text,

    FOREIGN KEY(group_id) REFERENCES courses_group(id)
)''')

# One row per (course, teacher) pair. teacher_link now has an explicit text
# type, matching the other link columns.
cursor.execute('''CREATE TABLE teachers (
    course_id integer,
    teacher_name text,
    teacher_link text,

    FOREIGN KEY(course_id) REFERENCES mooc_hse(id)
)''')

# Parameterized INSERT statements for the three tables.
group_sql = "INSERT INTO courses_group (name, link) VALUES(?, ?)"
sql = """INSERT INTO mooc_hse (name, certificate, form, free, freq, languages, subtitles, link, group_id)
VALUES  (?, ?, ?, ?, ?, ?, ?, ?, ?)"""
teachers_sql = "INSERT INTO teachers (course_id, teacher_name, teacher_link) VALUES (?, ?, ?)"

# The connection was opened with isolation_level=None (autocommit), so without an
# explicit transaction every INSERT would be committed on its own and a failure
# midway would leave a partially loaded database. Load everything atomically.
cursor.execute('BEGIN')
try:
    for group in specializations:
        cursor.execute(group_sql, [group['name'], group['link']])
        # Id of the group row just inserted; used as the foreign key for its courses.
        group_id = cursor.lastrowid
        for course in group['courses']:
            cursor.execute(sql, [course['name'], course['certificate'], course['form'], course['free'], course['freq'],
                                 ', '.join(course['languages']), ', '.join(course['subtitles']),
                                 course['link'], group_id
                                ])
            # Capture the course id before inserting teachers.
            course_id = cursor.lastrowid
            cursor.executemany(
                teachers_sql,
                [(course_id, teacher['name'], teacher['link']) for teacher in course['teachers']],
            )
except BaseException:
    # Also covers KeyboardInterrupt, so an interrupted cell does not leave an
    # open transaction behind on the shared connection.
    cursor.execute('ROLLBACK')
    raise
else:
    cursor.execute('COMMIT')