In [None]:
pip install requests beautifulsoup4

In [3]:
import os
import requests
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse

# ============ CONFIGURATION ============
# BASE_URL is what crawl targets are resolved against via urljoin.
# START_PAGE is not referenced by the code visible in this notebook.
BASE_URL = "https://www.vnu.edu.vn"
START_PAGE = "https://vnu.edu.vn/home/?C1916"

# Output folders: raw HTML snapshots and their cleaned plain-text versions.
SAVE_DIR = "data_raw/vnu_site"
TEXT_DIR = "data_clean/vnu_text"

# Create both output folders up front; folders that already exist are kept.
for output_dir in (SAVE_DIR, TEXT_DIR):
    os.makedirs(output_dir, exist_ok=True)


def get_clean_filename(url):
    """Build a filesystem-safe base name (without extension) from a URL.

    Only the URL's path and query string contribute to the name; the scheme
    and host are ignored. The name is not truncated.

    Args:
        url: URL string to derive the name from.

    Returns:
        A string in which every character other than word characters, '-'
        and '.' has been replaced by '_', with runs of '_' collapsed to one.
        Returns "index" when the URL has neither a path nor a query, so the
        caller never ends up with an extension-only file name.
    """
    parsed = urlparse(url)
    # Join path and query with an explicit separator: plain concatenation
    # mapped "/page?id=1" and "/pageid?=1" to the same file name.
    parts = [parsed.path.strip('/'), parsed.query]
    raw_name = '_'.join(part for part in parts if part)
    # Replace characters that are not valid in the name with '_'.
    clean_name = re.sub(r'[^\w\-\.]', '_', raw_name)
    # Collapse consecutive '_' characters into a single one.
    clean_name = re.sub(r'_+', '_', clean_name)
    return clean_name or "index"

def clean_html_to_text(html_content):
    """Convert an HTML document into plain text with one non-empty line per row.

    Args:
        html_content: The HTML markup, either as already-decoded ``str`` or
            as raw ``bytes`` (which are then decoded as UTF-8).

    Returns:
        The document text with ``script``, ``style``, ``nav``, ``footer`` and
        ``header`` elements removed, each line stripped of surrounding
        whitespace, and blank lines dropped; lines are joined with ``\\n``.
    """
    # from_encoding only applies to byte input; passing it together with
    # already-decoded text makes BeautifulSoup ignore it and emit a warning.
    if isinstance(html_content, bytes):
        soup = BeautifulSoup(html_content, "html.parser", from_encoding='utf-8')
    else:
        soup = BeautifulSoup(html_content, "html.parser")

    # Drop non-content elements: scripts, styles and page chrome.
    for tag in soup(["script", "style", "nav", "footer", "header"]):
        tag.decompose()

    text = soup.get_text(separator='\n')
    # Strip each line once and keep only the lines that still have content.
    stripped_lines = (line.strip() for line in text.splitlines())
    return "\n".join(line for line in stripped_lines if line)

# ============ CRAWL VÀ CLEAN ============
def crawl_and_clean(url_path, save_dir=None, text_dir=None):
    """Download one page, save its raw HTML, and save a cleaned text version.

    Failures (network errors, HTTP error statuses, file errors, ...) are
    reported with a printed message instead of being raised, so a loop over
    many pages keeps going.

    Args:
        url_path: Absolute URL, or a path that is resolved against BASE_URL.
        save_dir: Folder for the raw ``.html`` file. When omitted, the
            module-level SAVE_DIR is read at call time.
        text_dir: Folder for the cleaned ``.txt`` file. When omitted, the
            module-level TEXT_DIR is read at call time.

    Returns:
        None. Results are written to disk and progress is printed.
    """
    full_url = urljoin(BASE_URL, url_path)
    try:
        print(f"Crawling {full_url}")
        response = requests.get(full_url, timeout=10)
        response.raise_for_status()

        # Force UTF-8 decoding instead of relying on the detected encoding.
        response.encoding = 'utf-8'
        html = response.text

        # Explicit arguments win; otherwise fall back to the globals so
        # existing callers that set SAVE_DIR / TEXT_DIR keep working.
        html_dir = SAVE_DIR if save_dir is None else save_dir
        txt_dir = TEXT_DIR if text_dir is None else text_dir
        os.makedirs(html_dir, exist_ok=True)
        os.makedirs(txt_dir, exist_ok=True)

        # Both output files share one base name derived from the URL.
        base_name = get_clean_filename(full_url)

        # Keep the original HTML alongside the cleaned text.
        html_file = os.path.join(html_dir, base_name + ".html")
        with open(html_file, "w", encoding="utf-8") as f:
            f.write(html)

        # Clean the markup and store the resulting plain text.
        cleaned_text = clean_html_to_text(html)
        txt_file = os.path.join(txt_dir, base_name + ".txt")
        with open(txt_file, "w", encoding="utf-8") as f:
            f.write(cleaned_text)

        print(f"✔ Saved: {txt_file}")
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # continue with the next page.
        print(f"❌ Failed to crawl {full_url}: {e}")

# Pages to crawl, listed as absolute URLs. Most entries use the bare host;
# the C1706 and C1703 entries use the www. host.
pages = [
    "https://vnu.edu.vn/home/?C1916",
    "https://vnu.edu.vn/home/?C1917",
    "https://vnu.edu.vn/home/?C1918",
    "https://vnu.edu.vn/home/?C1919",
    "https://vnu.edu.vn/home/?C2451",
    "https://vnu.edu.vn/home/?C1700",
    "https://vnu.edu.vn/home/?C1701",
    "https://vnu.edu.vn/home/?C1702",
    "https://vnu.edu.vn/home/?C1705",
    "https://www.vnu.edu.vn/home/?C1706",
    "https://www.vnu.edu.vn/home/?C1703",
    "https://vnu.edu.vn/home/?C1704",
    "https://vnu.edu.vn/home/?C1707",
    "https://vnu.edu.vn/home/?C1708",
    "https://vnu.edu.vn/home/?C1709",
    "https://vnu.edu.vn/home/?C1710",
    "https://vnu.edu.vn/home/?C1711",
    "https://vnu.edu.vn/home/?C2758",
    "https://vnu.edu.vn/home/?C2015",
    "https://vnu.edu.vn/home/?C2731",
    "https://vnu.edu.vn/home/?C2038",
    "https://vnu.edu.vn/home/?C2039",
    "https://vnu.edu.vn/home/?C2040",
    "https://vnu.edu.vn/home/?C2575",
    "https://vnu.edu.vn/home/?C1963",
    "https://vnu.edu.vn/home/?C2019",
    "https://vnu.edu.vn/home/?C1885",
    "https://vnu.edu.vn/home/?C2455",
    "https://vnu.edu.vn/home/?C2456",
    "https://vnu.edu.vn/home/?C2452",
    "https://vnu.edu.vn/home/?C2020",
    "https://vnu.edu.vn/home/?C1965",
    "https://vnu.edu.vn/home/?C2042",
]


In [None]:
# Crawl every URL in `pages`; crawl_and_clean prints failures itself instead
# of raising, so one bad page does not stop the loop.
for page in pages:
    crawl_and_clean(page)


In [5]:
pip install wikipedia

Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: wikipedia
  Building wheel for wikipedia (pyproject.toml): started
  Building wheel for wikipedia (pyproject.toml): finished with status 'done'
  Created wheel for wikipedia: filename=wikipedia-1.4.0-py3-none-any.whl size=11785 sha256=4966c872683de2e2ae67cee5196645c74ae7277fd896296b7205201a3df6eac9
  Stored in directory: c:\users\tanphat\appdata\local\pip\cache\wheels\5e\b6\c5\93f3dec388ae76edc830cb42901bb0232504dfc0df02fc50de
Successfully built wikipedia
Installing collected packages: wikipedia
Successfully installed wikipedia-1.4.0
Note: you may need to re


[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [6]:
import os

import wikipedia

# Query the Vietnamese-language Wikipedia.
wikipedia.set_lang("vi")
# NOTE(review): the title is English while the language is set to "vi";
# confirm that this resolves to the intended article.
content = wikipedia.page("Vietnam National University, Hanoi").content

# Create the target folder here so this cell does not depend on an earlier
# cell having created "data_clean/" as a side effect.
wiki_path = "data_clean/wiki_text.txt"
os.makedirs(os.path.dirname(wiki_path), exist_ok=True)
with open(wiki_path, "w", encoding="utf-8") as f:
    f.write(content)


In [1]:
pip install pdfplumber

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
import os

# Point the crawler's module-level output folders at the admission data set;
# crawl_and_clean reads SAVE_DIR and TEXT_DIR when it is called.
SAVE_DIR, TEXT_DIR = "data_raw/admission", "data_clean/admission_text"

# Create both folders; folders that already exist are kept.
for output_dir in (SAVE_DIR, TEXT_DIR):
    os.makedirs(output_dir, exist_ok=True)

# Admission pages to download (rebinds the notebook-wide `pages` name).
pages = [
    "https://tuyensinh.vnu.edu.vn/index.php/Home/viewnewsVNU/1185",
    "https://tuyensinh.vnu.edu.vn/index.php/Home/viewpage/57",
    "https://tuyensinh.vnu.edu.vn/index.php/Home/viewpage/143",
]

for page in pages:
    crawl_and_clean(page)

Crawling https://tuyensinh.vnu.edu.vn/index.php/Home/viewnewsVNU/1185




✔ Saved: data_clean/admission_text\index.php_Home_viewnewsVNU_1185.txt
Crawling https://tuyensinh.vnu.edu.vn/index.php/Home/viewpage/57
✔ Saved: data_clean/admission_text\index.php_Home_viewpage_57.txt
Crawling https://tuyensinh.vnu.edu.vn/index.php/Home/viewpage/143
✔ Saved: data_clean/admission_text\index.php_Home_viewpage_143.txt


In [4]:
import pdfplumber

def clean_pdf_file(pdf_path, txt_path):
    """Extract the text of a PDF and write it to a UTF-8 text file.

    Pages for which no text is extracted (``None`` or an empty string) are
    skipped; the remaining page texts are joined with a newline.

    Failures are reported with a printed message instead of being raised, so
    a batch loop over many files keeps going.

    Args:
        pdf_path: Path of the PDF file to read.
        txt_path: Path of the text file to write (overwritten if present).

    Returns:
        None.
    """
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # Extract each page exactly once; the previous version ran the
            # extraction twice per page (once to filter, once for the value).
            extracted = (page.extract_text() for page in pdf.pages)
            texts = [text for text in extracted if text]
        full_text = "\n".join(texts)
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write(full_text)
        print(f"Processed: {os.path.basename(pdf_path)}")
    except Exception as e:
        # Deliberately best-effort: report and move on to the next file.
        print(f"Failed: {pdf_path} — {e}")

In [None]:
import os
import pdfplumber

# Folder paths
INPUT_DIR = "data_raw/admission_pdf"
OUTPUT_DIR = "data_clean/admission_text"

# Create the output folder if it does not exist yet
os.makedirs(OUTPUT_DIR, exist_ok=True)


for filename in os.listdir(INPUT_DIR):
    # Case-insensitive match, so "*.PDF" files are picked up as well.
    if filename.lower().endswith(".pdf"):
        input_path = os.path.join(INPUT_DIR, filename)
        # Swap only the real extension. str.replace(".pdf", ".txt") is
        # case-sensitive (so "X.PDF" kept its name) and would also rewrite a
        # ".pdf" occurring in the middle of the name.
        output_filename = os.path.splitext(filename)[0] + ".txt"
        output_path = os.path.join(OUTPUT_DIR, output_filename)
        clean_pdf_file(input_path, output_path)


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Processed: De an TS2022 HUS.pdf


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Processed: XHNV_De an tuyen sinh trinh do dai hoc nam 2022 (kem CV1688).pdf


In [11]:
import os

# Folder paths
INPUT_DIR = "data_raw/regulations_pdf"
OUTPUT_DIR = "data_clean/regulations_text"

# Create the output folder if it does not exist yet
os.makedirs(OUTPUT_DIR, exist_ok=True)


for filename in os.listdir(INPUT_DIR):
    # Case-insensitive match, so "*.PDF" files are picked up as well.
    if filename.lower().endswith(".pdf"):
        input_path = os.path.join(INPUT_DIR, filename)
        # Swap only the real extension. str.replace(".pdf", ".txt") is
        # case-sensitive (so "X.PDF" kept its name) and would also rewrite a
        # ".pdf" occurring in the middle of the name.
        output_filename = os.path.splitext(filename)[0] + ".txt"
        output_path = os.path.join(OUTPUT_DIR, output_filename)
        clean_pdf_file(input_path, output_path)


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Processed: 3626_21.10.2022quy-che-dao-tao-dai-hoc-tai-dhqghn-ap-dung-tu-khoa-qh2022.pdf
Processed: Quy-chế-ĐTĐH-3626.pdf


In [12]:
import os

# Point the crawler's module-level output folders at the programs data set;
# crawl_and_clean reads SAVE_DIR and TEXT_DIR when it is called.
SAVE_DIR, TEXT_DIR = "data_raw/programs", "data_clean/programs_text"

# Create both folders; folders that already exist are kept.
for output_dir in (SAVE_DIR, TEXT_DIR):
    os.makedirs(output_dir, exist_ok=True)

# These page IDs also appear in the first crawl list of this notebook, so the
# same pages are saved again under the programs folders.
pages = [
    "https://vnu.edu.vn/home/?C1963",
    "https://vnu.edu.vn/home/?C2019",
    "https://www.vnu.edu.vn/home/?C1885",
    "https://www.vnu.edu.vn/home/?C2455",
    "https://www.vnu.edu.vn/home/?C2456",
    "https://www.vnu.edu.vn/home/?C2452",
    "https://www.vnu.edu.vn/home/?C2020",
    "https://www.vnu.edu.vn/home/?C1965",
]

for page in pages:
    crawl_and_clean(page)

Crawling https://vnu.edu.vn/home/?C1963




✔ Saved: data_clean/programs_text\home_C1963.txt
Crawling https://vnu.edu.vn/home/?C2019
✔ Saved: data_clean/programs_text\home_C2019.txt
Crawling https://www.vnu.edu.vn/home/?C1885
✔ Saved: data_clean/programs_text\home_C1885.txt
Crawling https://www.vnu.edu.vn/home/?C2455
✔ Saved: data_clean/programs_text\home_C2455.txt
Crawling https://www.vnu.edu.vn/home/?C2456
✔ Saved: data_clean/programs_text\home_C2456.txt
Crawling https://www.vnu.edu.vn/home/?C2452
✔ Saved: data_clean/programs_text\home_C2452.txt
Crawling https://www.vnu.edu.vn/home/?C2020
✔ Saved: data_clean/programs_text\home_C2020.txt
Crawling https://www.vnu.edu.vn/home/?C1965
✔ Saved: data_clean/programs_text\home_C1965.txt
