## 1. Parsing using mistune

In [None]:
import os
import re
import time
from collections import deque

import mistune
import requests
from bs4 import BeautifulSoup


# Custom renderer to treat HTML tags as plain text
class MyRenderer(mistune.Renderer):
    def text(self, text):
        return text

    def paragraph(self, text):
        return text + '\n\n'

    def block_code(self, code, lang=None):
        return '\n\n```\n{}\n```\n\n'.format(code)

    def block_quote(self, text):
        return '> {}\n\n'.format(text)

    def header(self, text, level, raw=None):
        return '{} {}\n\n'.format('#' * level, text)

    def list(self, body, ordered=True):
        return '{}\n\n'.format(body)

    def list_item(self, text):
        return '- {}\n'.format(text)

    def double_emphasis(self, text):
        return '**{}**'.format(text)

    def emphasis(self, text):
        return '*{}*'.format(text)

    def strikethrough(self, text):
        return '~~{}~~'.format(text)

    def linebreak(self):
        return '\n'

def clean_text(text):
    # Remove excessive line breaks and carriage returns
    cleaned_text = re.sub(r'[\r\n]+', '\n', text)
    cleaned_text = re.sub(r' +', ' ', text)
    cleaned_text = re.sub(r'^( +)', '', text, flags=re.MULTILINE)
    return cleaned_text

def parse_page(url):
    visited.add(url)
    print(f"Parsing page: {url}")

    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract text content and clean it
    text_content = soup.get_text()
    cleaned_content = clean_text(text_content)

    # Configure mistune with custom renderer
    markdown = mistune.Markdown(renderer=MyRenderer())

    # Convert cleaned text content to Markdown
    markdown_content = markdown(cleaned_content)

    # Save Markdown content to a file
    filename = url.replace('https://', '').replace('/', '_') + '.md'
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(markdown_content)
        print(f"Markdown content saved to: {filename}")

    # Find links on the page and process them
    links = set()
    for link in soup.find_all('a', href=True):
        link_href = link.get('href')
        links.add(link_href)

        if (link_href.startswith('http') 
                and link_href not in visited
                and depth_dict.get(link_href, 0) < max_depth):
            queue.append(link_href)
            depth_dict[link_href] = depth_dict.get(url, 0) + 1

visited = set()
queue = ['https://innopolis.university/']
depth_dict = {'https://innopolis.university/': 0}
max_depth = 3

while queue:
    current_url = queue.pop(0)
    if current_url not in visited:
        parse_page(current_url)

In [None]:
!cat innopolis.university_.md

## 2. Parsing using readability

### 2.1 Parsing

In [None]:
!pip install readability-lxml

In [None]:
from bs4 import BeautifulSoup

def extract_links_exclude_domains(html_content, base_url, excluded_domains):
    """
    Extract links from a webpage excluding specified domains.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    links = [link.get('href') for link in soup.find_all('a', href=True)]
    
    is_included = lambda x: not any(domain in x for domain in excluded_domains)
    is_innopolis = lambda x: 'innopolis' in x
    
    absolute_links = [urljoin(base_url, link) for link in links if is_included(link) and is_innopolis(link)]
    return absolute_links

In [None]:
import re
from readability import Document

def extract_summary(html_content):
    response = requests.get(html_content)
    doc = Document(response.content)
    soup = BeautifulSoup(doc.summary())
    summary = "\n".join([paragraph.strip() for paragraph in re.split("\n", soup.text) if len(paragraph)])
    return summary

In [None]:
import os
import requests
from urllib.parse import urljoin

def crawl_webpage_exclude_domains(url, depth=0, max_depth=3, visited=None, excluded_domains=None):
    if visited is None:
        visited = set()
    if excluded_domains is None:
        excluded_domains = ['t.me', 'telegram.org', 'vk.com', 'youtube.com']

    try:
        if url in visited:
            return
        
        visited.add(url)
        print(f"Processing depth {depth}: {url}")
        
        response = requests.get(url)

        # Save content to a text file
        filename = url.split('//')[1]
        filename = filename[:-1] if filename.endswith('/') else filename
        filename = filename.replace('/', '_').replace(':', '_') + ".txt"
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(extract_summary(url))

        # Extract links from the webpage excluding specified domains
        if depth < max_depth:
            links = extract_links_exclude_domains(response.content, url, excluded_domains)
            for link in links:
                crawl_webpage_exclude_domains(link, depth + 1, max_depth, visited, excluded_domains)
    except Exception as e:
        print(f"Error occurred: {e}")

In [None]:
starting_url = 'https://innopolis.university'

if not os.path.exists('webpage_content'):
    os.makedirs('webpage_content')

os.chdir('webpage_content')

try:
    crawl_webpage_exclude_domains(starting_url, max_depth=1)
finally:
    os.chdir('..')

### 2.2 Examples

In [None]:
print(extract_summary('https://apply.innopolis.university/faq/'))

In [None]:
print(extract_summary('https://apply.innopolis.university/grant/'))

In [None]:
print(extract_summary('https://media.innopolis.university/news/World-Science-Day/'))

In [13]:
!tar -czvf innopolis.university.tar.gz webpage_content

webpage_content/
webpage_content/itbm.innopolis.university_?utm_source=innopolis_glavnaya#rec469478443.txt
webpage_content/innopolis.university_portfel-proektov.txt
webpage_content/media.innopolis.university_news_iu-ai-journey-science-2023.txt
webpage_content/my.university.innopolis.ru.txt
webpage_content/apply.innopolis.university_master.txt
webpage_content/stc.innopolis.university.txt
webpage_content/innopolis.university_center-cybersecurity.txt
webpage_content/stc.innopolis.university_codeschool.txt
webpage_content/dovuz.innopolis.university_training.txt
webpage_content/media.innopolis.university_news_All-Russian-AI-Olympiad-Results.txt
webpage_content/dovuz.innopolis.university_login.txt
webpage_content/it.university.innopolis.ru_otrs_customer.pl.txt
webpage_content/media.innopolis.university.txt
webpage_content/innopolis.university_search.txt
webpage_content/innopolis.university_sveden.txt
webpage_content/apply.innopolis.university_bachelor.txt
webpage_content/apply.innopolis.univ