In [14]:
import requests
import shutil
import re
import hashlib
import html

from parsel import Selector
from urllib.parse import urljoin, unquote
from os import path, makedirs

In [25]:
# Seed article: every other page saved by this notebook is reached from here.
start_url = "https://pt.wikipedia.org/wiki/Clube_Atlético_Mineiro"

response = requests.get(start_url)
# Fail loudly on an HTTP error instead of parsing an error page and
# extracting "related" links from it.
response.raise_for_status()
start_sel = Selector(response.text)

In [26]:
# Anchors inside the page body whose href starts with /wiki/ and contains no
# colon (presumably this skips namespaced pages such as files and categories
# - confirm against the target wiki).
ARTICLE_LINK_XPATH = '//div[@id="bodyContent"]/descendant::a[re:test(@href, "^/wiki/[^:]*$")]/@href'

# The same href can be extracted more than once; keep only the first
# occurrence of each one, in document order, so that later cells do not
# process the same link repeatedly.
related_urls = list(dict.fromkeys(start_sel.xpath(ARTICLE_LINK_XPATH).extract()))

In [17]:
# Peek at the first five extracted hrefs exactly as they appear in the HTML.
related_urls[:5]

['/wiki/Galo',
 '/wiki/25_de_mar%C3%A7o',
 '/wiki/1908',
 '/wiki/Est%C3%A1dio_Raimundo_Sampaio',
 '/wiki/Mineir%C3%A3o']

In [18]:
# Decode percent-escapes (e.g. "%C3%A7" -> "ç") in every extracted href.
# This rebinds `related_urls`, so the cells below see the decoded form.
related_urls = list(map(unquote, related_urls))
related_urls[:5]

['/wiki/Galo',
 '/wiki/25_de_março',
 '/wiki/1908',
 '/wiki/Estádio_Raimundo_Sampaio',
 '/wiki/Mineirão']

In [41]:
# Root that relative page and asset links are resolved against with urljoin.
BASE_URL = "https://pt.wikipedia.org/"
# Captures the text up to the first dot plus the run of word characters that
# follows it (e.g. "logo.png" out of "logo.png?v=2").
FILE_NAME_RE = re.compile(r'([^\.]+\.\w+)')

def download_page(url):
    """Download one page with its CSS, JS and images and save it under ``wikipedia/``.

    ``url`` may be absolute or relative to ``BASE_URL``. References to the
    downloaded assets are rewritten to the local copies, and anchors that
    point at an article listed in the global ``related_urls`` are rewritten to
    ``<last href segment>.html`` so that saved pages link to each other.

    The page is written to ``wikipedia/<name>.html`` where ``<name>`` is the
    percent-decoded last segment of the final response URL.
    """
    response = requests.get(urljoin(BASE_URL, url))
    response_text = response.text
    sel = Selector(response_text)
    response_text = download_css(response_text, sel)
    response_text = download_js(response_text, sel)
    response_text = download_images(response_text, sel)

    # `related_urls` may hold percent-decoded hrefs while the HTML holds the
    # encoded form, so compare decoded values and rewrite the href exactly as
    # it is written in the page.
    related_decoded = {unquote(related_url) for related_url in related_urls}
    for href in dict.fromkeys(get_links(sel, 'a::attr(href)')):
        if unquote(href) not in related_decoded:
            continue
        # Keeping the encoded segment yields a valid relative URL that
        # (presumably, in a browser) decodes to the unquoted file name used
        # when that page is saved.
        local_name = href.split('/')[-1] + '.html'
        # Replace the whole double-quoted attribute value: a bare substring
        # replace of "/wiki/Galo" would also corrupt any longer href that
        # starts with it. Single-quoted attributes are left untouched.
        response_text = response_text.replace(
            f'href="{html.escape(href)}"',
            f'href="{html.escape(local_name)}"',
        )

    # The asset helpers only create sub-folders when the page has assets.
    makedirs('wikipedia', exist_ok=True)
    page_name = unquote(response.url.split('/')[-1]) + '.html'
    with open(f'wikipedia/{page_name}', 'w', encoding='utf8') as f:
        f.write(response_text)

def download_links(response_text, sel, links_css_selector, download_folder):
    """Download every link matched by a CSS selector and point the HTML at the copies.

    Each link returned by ``get_links`` is passed to ``download_file`` and
    then substituted in the page text with the returned local path.

    Returns the rewritten page text.
    """
    rewritten_text = response_text
    for asset_link in get_links(sel, links_css_selector):
        local_path = download_file(asset_link, download_folder)
        rewritten_text = replace_link(rewritten_text, asset_link, local_path)
    return rewritten_text

def download_file(link, download_folder):
    """Download ``link`` into ``wikipedia/<download_folder>/`` unless it is already there.

    ``link`` is resolved against ``BASE_URL``. The file is only written when
    the server answers with status 200.

    Returns the local path computed by ``get_file_path``; the path is returned
    even when nothing was written.
    """
    # exist_ok avoids the check-then-create race of a separate path.exists().
    makedirs(f'wikipedia/{download_folder}', exist_ok=True)

    file_path = get_file_path(link, download_folder)
    if not path.isfile(file_path):
        # With stream=True the connection stays checked out until the body is
        # consumed or the response is closed; the context manager closes it
        # on every path, including non-200 answers.
        with requests.get(urljoin(BASE_URL, link), stream=True) as r:
            if r.status_code == 200:
                # Let urllib3 undo gzip/deflate so the decoded bytes are stored.
                r.raw.decode_content = True
                with open(file_path, 'wb') as f:
                    shutil.copyfileobj(r.raw, f)
    return file_path

def get_file_path(link, download_folder):
    """Return the local path under ``wikipedia/<download_folder>/`` for ``link``.

    For the ``js`` and ``css`` folders the name is the first 8 hex digits of
    the SHA-1 of the link, with the folder name as extension. For any other
    folder the name is the last path segment of the link with the query
    string and fragment removed; if that segment is empty the 8-digit SHA-1
    prefix is used instead.
    """
    if download_folder in ['js', 'css']:
        file_name = hashlib.sha1(link.encode()).hexdigest()[:8] + f'.{download_folder}'
    else:
        # Strip "?query" and "#fragment" first so a "/" inside them cannot be
        # mistaken for a path separator.
        url_path = link.split('#', 1)[0].split('?', 1)[0]
        # Keep the complete base name: cutting at the first extension would
        # turn "22px-Flag.svg.png" into "22px-Flag.svg".
        file_name = url_path.split('/')[-1]
        if not file_name:
            # A link ending in "/" has no usable name; fall back to a hash.
            file_name = hashlib.sha1(link.encode()).hexdigest()[:8]
    return f'wikipedia/{download_folder}/{file_name}'
    
def download_css(response_text, sel):
    """Download the page's linked stylesheets into ``css`` and return the rewritten text."""
    return download_links(
        response_text,
        sel,
        links_css_selector='link[rel="stylesheet"]::attr(href)',
        download_folder='css',
    )

def download_js(response_text, sel):
    """Download the page's external scripts into ``js`` and return the rewritten text."""
    return download_links(
        response_text,
        sel,
        links_css_selector='script::attr(src)',
        download_folder='js',
    )

def download_images(response_text, sel):
    """Download the page's ``<img>`` sources into ``images`` and return the rewritten text."""
    return download_links(
        response_text,
        sel,
        links_css_selector='img::attr(src)',
        download_folder='images',
    )

def replace_link(response_text, link, file_path):
    """Replace every occurrence of ``link`` in the page text with its local path.

    ``link`` is HTML-escaped before searching, so a value read from a parsed
    attribute (``a=1&b=2``) matches its form in the raw markup
    (``a=1&amp;b=2``). The leading ``wikipedia/`` of ``file_path`` is dropped
    in the replacement text.

    Returns the page text unchanged when ``link`` is empty.
    """
    if not link:
        # str.replace with an empty search string inserts the replacement
        # between every character of the page.
        return response_text
    output_prefix = 'wikipedia/'
    if file_path.startswith(output_prefix):
        relative_path = file_path[len(output_prefix):]
    else:
        relative_path = file_path
    return response_text.replace(html.escape(link), relative_path)
    
def get_links(sel, links_css_selector):
    """Return the values extracted from ``sel`` by the given CSS selector."""
    matches = sel.css(links_css_selector)
    return matches.extract()

In [42]:
# Save the seed article first.
download_page(start_url)

# Then save the first five related articles (the slice is presumably kept
# small to limit the number of requests - widen it to mirror more pages).
for url in related_urls[:5]:
    download_page(url)