In [1]:
import urllib3

# The requests below are sent with verify=False; silence the
# InsecureRequestWarning that urllib3 would otherwise emit for them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Get the number of listing pages


In [2]:
import requests
from bs4 import BeautifulSoup


def get_num_pages(timeout: float = 30.0) -> int:
    """Return how many listing pages the GSO monthly-report archive has.

    Downloads the first archive page and takes the largest numeric label among
    the pagination links inside ``div.archive-container``.

    Args:
        timeout: Seconds to wait for the server before ``requests`` gives up.
            Without a timeout a stalled connection would block indefinitely.

    Returns:
        The highest page number found in the pagination bar, or ``1`` when the
        page shows no numeric pagination links.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        RuntimeError: If ``div.archive-container`` is not present in the page.
    """
    # NOTE(review): verify=False turns off TLS certificate validation. It is
    # kept because the original code relied on it; confirm it is still needed.
    response = requests.get(
        "https://www.gso.gov.vn/bao-cao-tinh-hinh-kinh-te-xa-hoi-hang-thang/",
        verify=False,
        timeout=timeout,
    )
    # Fail loudly on an error page instead of parsing it as if it were the archive.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    container = soup.select_one("div.archive-container")
    if container is None:
        raise RuntimeError("Can't find div.archive-container on the archive page")

    pagination = container.select_one("div.pagination-wrap")
    if pagination is None:
        # No pagination bar at all: treat the archive as a single page.
        return 1

    labels = (a.text.strip() for a in pagination.select("a.page-numbers"))
    # isdecimal() only accepts strings int() can parse; default=1 covers a bar
    # that contains nothing but non-numeric links (e.g. "next").
    return max((int(label) for label in labels if label.isdecimal()), default=1)


# Example run: ask the live site how many listing pages exist.
get_num_pages()

30

# Crawl one listing page


In [3]:
import requests
from bs4 import BeautifulSoup


def crawl_list_pages(page: int = 1, timeout: float = 30.0) -> list:
    """Collect ``(report_url, issue_date)`` pairs from one archive listing page.

    The direct children of ``div.archive-container`` are walked in document
    order: a ``<p>`` that contains a link supplies the report URL, and the next
    ``<section>`` with an issue-date span supplies the date for that URL.

    Args:
        page: Listing page number, sent as the ``paged`` query parameter.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        A list of ``(url, date)`` tuples in page order. ``date`` is the last 10
        characters of the (stripped) issue-date text with its ``/``-separated
        parts reversed, e.g. ``28/02/2005`` becomes ``2005/02/28``.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status.
        RuntimeError: If ``div.archive-container`` is not present in the page.
    """
    # NOTE(review): verify=False turns off TLS certificate validation. It is
    # kept because the original code relied on it; confirm it is still needed.
    response = requests.get(
        f"https://www.gso.gov.vn/bao-cao-tinh-hinh-kinh-te-xa-hoi-hang-thang/?paged={page}",
        verify=False,
        timeout=timeout,
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    container = soup.select_one("div.archive-container")
    if container is None:
        raise RuntimeError(f"Can't find div.archive-container on listing page {page}")

    pages = []
    link = None  # URL still waiting for its date; None means nothing is pending
    for node in container.find_all(recursive=False):
        if node.name == "p":
            anchor = node.select_one("a")
            href = anchor.get("href") if anchor is not None else None
            if href:
                link = href
        elif node.name == "section":
            date_tag = node.select_one("span.archive-issue-date")
            if link is None or date_tag is None:
                # A section with no pending link (or no date) cannot form a
                # pair; skipping it avoids using an unbound or stale link.
                continue

            day_month_year = date_tag.text.strip()[-10:]
            pages.append((link, "/".join(reversed(day_month_year.split("/")))))
            # Each link is paired once so it is never attached to a later date.
            link = None
    return pages


# Example run: list the (report URL, date) pairs found on listing page 24.
crawl_list_pages(24)

[('https://www.gso.gov.vn/du-lieu-va-so-lieu-thong-ke/2019/04/tinh-hinh-kinh-te-xa-hoi-thang-02-va-2-thang-dau-nam-2005/',
  '2005/02/28'),
 ('https://www.gso.gov.vn/du-lieu-va-so-lieu-thong-ke/2019/04/tinh-hinh-kinh-te-xa-hoi-thang-01-nam-2005/',
  '2005/01/28'),
 ('https://www.gso.gov.vn/du-lieu-va-so-lieu-thong-ke/2019/05/tinh-hinh-kinh-te-xa-hoi-nam-2004/',
  '2004/12/25'),
 ('https://www.gso.gov.vn/du-lieu-va-so-lieu-thong-ke/2019/05/tinh-hinh-kinh-te-xa-hoi-thang-11-va-11-thang-nam-2004/',
  '2004/11/29'),
 ('https://www.gso.gov.vn/du-lieu-va-so-lieu-thong-ke/2019/05/tinh-hinh-kinh-te-xa-hoi-thang-10-va-10-thang-nam-2004/',
  '2004/10/29'),
 ('https://www.gso.gov.vn/du-lieu-va-so-lieu-thong-ke/2019/05/tinh-hinh-kinh-te-xa-hoithang-9-va-9-thang-nam-2004/',
  '2004/09/29'),
 ('https://www.gso.gov.vn/du-lieu-va-so-lieu-thong-ke/2020/11/bao-cao-tinh-hinh-kinh-te-xa-hoi-thang-8-va-thang-nam-2004/',
  '2004/08/30'),
 ('https://www.gso.gov.vn/du-lieu-va-so-lieu-thong-ke/2019/05/tinh-hin

# Get the Excel download link from one report page


In [4]:
import requests
from bs4 import BeautifulSoup


def get_download_link(url: str, timeout: float = 30.0) -> str:
    """Return the first Excel attachment link found on a report page.

    Looks inside the ``<section id="file_attachments_widget-3">`` element and
    returns the ``href`` of the first anchor whose target ends in ``.xlsx`` or
    ``.xls`` (compared case-insensitively).

    Args:
        url: Address of the report page to inspect.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The ``href`` value of the matching anchor, exactly as written in the page.

    Raises:
        RuntimeError: If the attachment section is missing or contains no
            Excel link.
    """
    # NOTE(review): verify=False turns off TLS certificate validation. It is
    # kept because the original code relied on it; confirm it is still needed.
    response = requests.get(url, verify=False, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    section = soup.find("section", {"id": "file_attachments_widget-3"})
    if section is None:
        raise RuntimeError(f"Can't get link file for url: {url}")

    for anchor in section.select("a"):
        href = anchor.get("href")
        # Anchors without an href are skipped rather than aborting the search.
        if href and href.lower().endswith((".xlsx", ".xls")):
            return href

    raise RuntimeError(f"Can't get link file for url: {url}")


# Example run: look up the spreadsheet link on a single report page.
get_download_link(
    "https://www.gso.gov.vn/du-lieu-va-so-lieu-thong-ke/2020/11/bc-tinh-hinh-kinh-te-xa-hoi-quy-i-nam-2016/"
)

'https://www.gso.gov.vn/wp-content/uploads/2020/11/Bieu-3-2016.xlsx'

# Download one file


In [5]:
import os
import shutil
import requests
import pyexcel as p


def download_and_save(url: str, path: str, timeout: float = 60.0):
    """Download ``url`` and write the response body to ``path``.

    The file is stored exactly as received; ``.xls`` files are not converted.

    Args:
        url: Address of the file to download (redirects are followed).
        path: Destination file path. Missing parent directories are created.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Raises:
        requests.HTTPError: If the server responds with a 4xx/5xx status. In
            that case nothing is written, so an error page is never saved
            under a spreadsheet name.
    """
    # NOTE(review): verify=False turns off TLS certificate validation. It is
    # kept because the original code relied on it; confirm it is still needed.
    response = requests.get(url, verify=False, allow_redirects=True, timeout=timeout)
    response.raise_for_status()

    parent = os.path.dirname(path)
    if parent:
        # open() does not create directories, so make sure the target exists.
        os.makedirs(parent, exist_ok=True)

    with open(path, "wb") as f:
        f.write(response.content)


# Example run: save one workbook from the site under ../datas/.
download_and_save(
    "https://www.gso.gov.vn/wp-content/uploads/2023/08/02.-Bieu-T8.2023-1.xlsx",
    "../datas/T8.2023-1.xlsx",
)

# Download all files


In [6]:
def crawl_links_one_page(num_page):
    """Resolve the Excel download links for every report on one listing page.

    Each ``(report_url, date)`` pair returned by ``crawl_list_pages`` is passed
    to ``get_download_link``. Reports whose link cannot be resolved are skipped
    on purpose (best effort); the reason is logged at INFO level, which is
    silent under the default logging configuration.

    Args:
        num_page: Listing page number forwarded to ``crawl_list_pages``.

    Returns:
        A list of ``(download_url, date)`` tuples, in listing order, for the
        reports whose link was found.
    """
    # Imported locally so this function does not depend on another cell's imports.
    import logging

    logger = logging.getLogger(__name__)

    links = []
    for page_url, date in crawl_list_pages(num_page):
        try:
            links.append((get_download_link(page_url), date))
        except Exception as exc:
            # Deliberate best effort: one bad report must not abort the page,
            # but the cause is recorded instead of being silently discarded.
            logger.info("Skipping %s (%s): %s", page_url, date, exc)

    return links


# Example run: resolve the download links for listing page 19.
crawl_links_one_page(19)

[('https://www.gso.gov.vn/wp-content/uploads/2019/04/Bieu04.09-1.xls',
  '2009/04/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2019/04/Bieu0309-1.xls',
  '2009/03/30'),
 ('https://www.gso.gov.vn/wp-content/uploads/2019/04/Solieu0209-1.xls',
  '2009/02/28'),
 ('https://www.gso.gov.vn/wp-content/uploads/2019/04/Bieu-0109-1.xls',
  '2009/01/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2019/04/Bieu1208-1.xls',
  '2008/12/30'),
 ('https://www.gso.gov.vn/wp-content/uploads/2019/04/Bieu112008-1.xls',
  '2008/11/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2019/04/Bieu1008-1.xls',
  '2008/10/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2019/04/Bieu0908-1.xls',
  '2008/09/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2019/04/Bieu0808-1.xls',
  '2008/08/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2019/04/Bieu0708-1.xls',
  '2008/07/29')]

In [7]:
import urllib3
import itertools
from concurrent.futures import ThreadPoolExecutor

from tqdm.auto import tqdm

# Silence the InsecureRequestWarning raised for requests sent with verify=False.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# Number of listing pages to visit (pages are numbered from 1).
num_pages = get_num_pages()

# Crawl every listing page on 10 worker threads. executor.map yields the
# per-page lists in page order; they are flattened into one list of
# (download_url, date) tuples while tqdm reports progress.
with ThreadPoolExecutor(max_workers=10) as executor:
    links = list(
        itertools.chain.from_iterable(
            tqdm(
                executor.map(crawl_links_one_page, range(1, num_pages + 1)),
                total=num_pages,
            )
        )
    )

links

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 30/30 [00:23<00:00,  1.28it/s]


[('https://www.gso.gov.vn/wp-content/uploads/2024/03/02-Bieu-3.2024-ngay-28.3.xlsx',
  '2024/03/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2024/02/02-Bieu-T2.2024-1.xlsx',
  '2024/02/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2024/01/02-Bieu-T1-2024-1.xlsx',
  '2024/01/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2023/12/02.Bieu-T12.2023.xlsx',
  '2023/12/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2023/11/02.-Bieu-T11.2023-2.xlsx',
  '2023/11/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2023/10/02.-Bieu-so-lieu-T10.2023.xlsx',
  '2023/10/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2023/09/02-Bieu-so-lieu-9-thang-2023-2.xlsx',
  '2023/09/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2023/08/02.-Bieu-T8.2023-2.xlsx',
  '2023/08/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2023/07/02.-Bieu-T7.2023.xlsx',
  '2023/07/29'),
 ('https://www.gso.gov.vn/wp-content/uploads/2023/06/02.-Bieu-T6.2023.xlsx',
  '2023/06/29'),
 ('https://www.gso.gov

In [8]:
# !rm -rf datas
# !mkdir datas

In [None]:
import os
import urllib3
import itertools
from concurrent.futures import ThreadPoolExecutor

from tqdm.auto import tqdm


def download_save_one_link(link_date, min_year: int = 2023, out_dir: str = "../datas"):
    """Download one ``(link, date)`` entry unless it is older than ``min_year``.

    The file is saved as ``<out_dir>/<date with '/' replaced by '-'>-<last URL
    segment>``, e.g. ``../datas/2024-03-29-report.xlsx``.

    Args:
        link_date: ``(download_url, date)`` tuple; ``date`` must start with the
            year followed by ``/`` (the rest is only used for the file name).
        min_year: Entries whose year is below this value are skipped. The
            default of 2023 keeps the original "skip 2022 and earlier" filter.
        out_dir: Directory that receives the file. The default matches the
            original hard-coded ``../datas`` location.

    Returns:
        None. Skipped entries return early without downloading anything.
    """
    link, date = link_date

    year = int(date.split("/")[0])
    if year < min_year:
        return

    # Prefix the remote file name with the date so names stay unique and sortable.
    dated_name = "{}-{}".format(date.replace("/", "-"), link.split("/")[-1])
    file_name = os.path.join(out_dir, dated_name)

    download_and_save(link, file_name)


# Run the downloads on 10 worker threads. The map iterator is wrapped in tqdm
# and drained with list() so the progress bar advances as results are
# consumed; the results themselves are discarded.
with ThreadPoolExecutor(max_workers=10) as executor:
    pending = executor.map(download_save_one_link, links)
    list(tqdm(pending, total=len(links)))