In [75]:
import json
import re

import mistune
import requests
from bs4 import BeautifulSoup

class MyRenderer(mistune.Renderer):
    """Renderer whose hooks hand back Markdown-formatted text.

    Every hook returns a plain string built from its arguments.
    """

    def text(self, text):
        """Return ``text`` untouched."""
        return text

    def paragraph(self, text):
        """Return ``text`` followed by a blank line."""
        return ''.join([text, '\n\n'])

    def block_code(self, code, lang=None):
        """Wrap ``code`` in a triple-backtick fence; ``lang`` is not used."""
        return f'\n\n```\n{code}\n```\n\n'

    def block_quote(self, text):
        """Prefix ``text`` with ``> `` and end with a blank line."""
        return f'> {text}\n\n'

    def header(self, text, level, raw=None):
        """Return ``level`` hash marks, a space and ``text``; ``raw`` is not used."""
        hashes = '#' * level
        return f'{hashes} {text}\n\n'

    def list(self, body, ordered=True):
        """Return ``body`` followed by a blank line; ``ordered`` is not used."""
        return f'{body}\n\n'

    def list_item(self, text):
        """Return ``text`` as a ``- `` bullet line."""
        return f'- {text}\n'

    def double_emphasis(self, text):
        """Surround ``text`` with ``**``."""
        return f'**{text}**'

    def emphasis(self, text):
        """Surround ``text`` with ``*``."""
        return f'*{text}*'

    def strikethrough(self, text):
        """Surround ``text`` with ``~~``."""
        return f'~~{text}~~'

    def linebreak(self):
        """Return a single newline."""
        return '\n'

def clean_text(text):
    """Normalise line breaks and spaces in ``text``.

    The three substitutions run in order, each on the result of the previous
    one:

    1. every run of carriage-return / line-feed characters (blank lines
       included) becomes a single newline;
    2. every run of spaces becomes a single space;
    3. spaces remaining at the start of a line are replaced by a backtick.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # Each step must consume the previous step's output; feeding the original
    # `text` into every call would discard all but the last substitution.
    cleaned_text = re.sub(r'[\r\n]+', '\n', text)
    cleaned_text = re.sub(r' +', ' ', cleaned_text)
    # NOTE(review): the replacement is a literal backtick, kept as authored;
    # confirm it is intended rather than an empty string.
    cleaned_text = re.sub(r'^( +)', '`', cleaned_text, flags=re.MULTILINE)
    return cleaned_text

def parse_table_to_json(table):
    """Turn an HTML table element into a list of row dictionaries.

    Column names are the stripped text of every ``th`` found anywhere in
    ``table``. Each ``tr`` that contains at least one ``td`` yields one dict
    mapping column name to stripped cell text, paired by position. Cells beyond
    the number of column names are left out, so a table without any ``th``
    yields empty dicts. Rows without ``td`` cells are skipped.

    Args:
        table: An element exposing ``find_all(name)`` whose results expose
            ``get_text(strip=True)``.

    Returns:
        A list with one dict per data row, in document order.
    """
    column_names = [th.get_text(strip=True) for th in table.find_all('th')]
    records = []
    for tr in table.find_all('tr'):
        data_cells = tr.find_all('td')
        if not data_cells:
            continue
        # zip() stops at the shorter sequence, which drops surplus cells.
        records.append({
            name: td.get_text(strip=True)
            for name, td in zip(column_names, data_cells)
        })
    return records

In [78]:
def parse_page(url):
    """Download one wiki page and save its tables and text to local files.

    Steps, in order:

    * add ``url`` to the module-level ``visited`` set;
    * fetch the page and remove every ``nav`` and ``footer`` element;
    * write each ``table`` element to ``table_{i}_{slug}.json`` (rows produced
      by ``parse_table_to_json``) and remove it from the document;
    * turn the remaining ``h1``-``h6`` and ``p`` elements into Markdown text,
      pass it through ``clean_text`` and the mistune renderer, and write the
      result to ``eduwiki_{slug}.md``.

    ``slug`` is ``url`` with the ``https://eduwiki.innopolis.university``
    prefix removed and every ``/`` replaced by ``_``. Files are written
    relative to the current working directory.

    Args:
        url: URL of the page to fetch.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    visited.add(url)
    print(f"Parsing page: {url}")

    # A timeout keeps a stalled server from hanging the notebook; checking the
    # status keeps an error page from being saved as if it were real content.
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')

    for nav in soup.find_all(['nav', 'footer']):
        nav.decompose()

    # Shared by every output filename for this page.
    page_slug = url.replace("https://eduwiki.innopolis.university", "").replace("/", "_")

    # Process and save tables in JSON format
    for i, table in enumerate(soup.find_all('table')):
        table_data = parse_table_to_json(table)
        table_filename = f'table_{i}_{page_slug}.json'
        with open(table_filename, 'w', encoding='utf-8') as file:
            json.dump(table_data, file, indent=4)
        print(f"Table content saved to: {table_filename}")

        # Remove the table from the soup to avoid processing its text again
        table.decompose()

    # Extract headings and paragraphs; each piece is followed by a blank line.
    pieces = []
    for element in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p']):
        if element.name.startswith('h'):
            # The digit in the tag name (h1..h6) is the heading level.
            level = int(element.name[1])
            pieces.append('#' * level + ' ' + element.get_text())
        else:
            pieces.append(element.get_text())
    content = ''.join(piece + '\n\n' for piece in pieces)

    cleaned_content = clean_text(content)

    # Convert to Markdown
    markdown = mistune.Markdown(renderer=MyRenderer())
    markdown_content = markdown(cleaned_content)

    # Save remaining text content to a file
    filename = 'eduwiki_' + page_slug + '.md'
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(markdown_content)
    print(f"Markdown content saved to: {filename}")

In [79]:
# Crawl state. parse_page() adds every URL it handles to `visited`.
visited = set()
queue = ['https://eduwiki.innopolis.university/index.php/BSc:_Introduction_To_Programming']
# NOTE(review): depth_dict and max_depth are not read by any code visible in
# this notebook, and nothing visible appends to `queue`, so the loop below
# handles only the seed URL.
depth_dict = {'https://eduwiki.innopolis.university/index.php/Main_Page': 0}
max_depth = 3

# Drain the queue front to back, skipping URLs that were already parsed.
while queue:
    current_url = queue.pop(0)
    if current_url in visited:
        continue
    parse_page(current_url)


Parsing page: https://eduwiki.innopolis.university/index.php/BSc:_Introduction_To_Programming
Table content saved to: table_0__index.php_BSc:_Introduction_To_Programming.json
Table content saved to: table_1__index.php_BSc:_Introduction_To_Programming.json
Table content saved to: table_2__index.php_BSc:_Introduction_To_Programming.json
Table content saved to: table_3__index.php_BSc:_Introduction_To_Programming.json
Table content saved to: table_4__index.php_BSc:_Introduction_To_Programming.json
Table content saved to: table_5__index.php_BSc:_Introduction_To_Programming.json
Table content saved to: table_6__index.php_BSc:_Introduction_To_Programming.json
Table content saved to: table_7__index.php_BSc:_Introduction_To_Programming.json
Table content saved to: table_8__index.php_BSc:_Introduction_To_Programming.json
Markdown content saved to: eduwiki__index.php_BSc:_Introduction_To_Programming.md


In [80]:
!cat /content/eduwiki__index.php_BSc:_Introduction_To_Programming.md

# BSc: Introduction To Programming

## Contents

# Introduction to Programming

## Short Description

This course covers the following concepts: Basic concept - algorithm, program, data; Computer architecture basics; Structured programming; Object-oriented programming; Generic programming; Exception handling; Programming by contract (c); Functional programming; Concurrent programming.

## Prerequisites

### Prerequisite subjects

### Prerequisite topics

## Course Topics

## Intended Learning Outcomes (ILOs)

### What is the main purpose of this course?

The Introduction to Programming course teaches the fundamental concepts and skills necessary to perform programming at a professional level. Students will learn how to master the fundamental control structures, data structures, reasoning patterns and programming language mechanisms characterizing modern programming, as well as the fundamental rules of producing high-quality software. They will acquire the necessary programming backgrou