In [None]:
import requests
from bs4 import BeautifulSoup
import time
import os
from urllib.parse import urljoin, urlparse
import urllib3
import json
import threading 
import queue       
import logging     


In [None]:

# --- Logging -----------------------------------------------------------------
# Every record carries the thread name so output from the worker threads can be
# told apart.
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(threadName)s: %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S')

# The crawler issues requests with verify=False; silence the warning urllib3
# would otherwise emit for each of them.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

# --- Crawl configuration -----------------------------------------------------
# Seed URL used when no saved state is available.
BASE_URL_UET = "https://www.vnu.edu.vn/home/?C1885"

# Hosts the crawler is allowed to follow links to.
ALLOWED_DOMAINS = [
    "uet.vnu.edu.vn",
    "vnu.edu.vn",
    "ueb.edu.vn",
    "hus.vnu.edu.vn",
    "ussh.vnu.edu.vn",
    "ulis.vnu.edu.vn",
    "education.vnu.edu.vn",
    "vju.ac.vn",
    "is.vnu.edu.vn",
    "hsb.edu.vn",
    "law.vnu.edu.vn",
]

# Directory receiving one text file per saved page.
DATA_DIR = "uet_vnu_data"
# JSON file holding the pending queue and the visited set between runs.
STATE_FILE = "crawl_state_2.json"
# Number of concurrent crawler threads.
NUM_WORKER_THREADS = 8

# exist_ok=True makes this safe to re-run and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(DATA_DIR, exist_ok=True)


In [None]:

# Shared crawler state. These live at module level so that every worker thread
# and the main cell operate on the same objects.

# Frontier: URLs waiting to be fetched.
work_queue = queue.Queue()

# URLs recorded as visited, and the lock used around accesses to that set.
visited_urls = set()
lock_visited_urls = threading.Lock()

# Running count of processed URLs, and the lock guarding its increments.
urls_processed_count = 0
lock_processed_count = threading.Lock()

# Set to ask the worker threads to leave their loop.
shutdown_event = threading.Event()



In [None]:

def save_state(q_to_save, visited_to_save, filepath):
    """Persist the pending queue and the visited set as JSON.

    The queue is drained to take a snapshot and every drained item is put
    back afterwards (in the same order), whether or not saving succeeded.
    The JSON is written to ``<filepath>.tmp`` and then moved over
    ``filepath`` with ``os.replace``, so a failed or interrupted save leaves
    the previous state file untouched instead of truncated.

    Args:
        q_to_save: Queue of pending URLs (needs ``get_nowait`` and ``put``).
        visited_to_save: Iterable of visited URLs.
        filepath: Destination path of the JSON state file.

    Errors are logged, never raised.
    """
    logging.info(f"Đang lưu trạng thái vào {filepath}...")
    queue_items = []
    tmp_path = f"{filepath}.tmp"
    try:
        # Snapshot the queue by draining it; restored in ``finally``.
        while True:
            try:
                queue_items.append(q_to_save.get_nowait())
            except queue.Empty:
                break

        # Built inside the try block: copying the visited collection can fail
        # (e.g. if another thread mutates it meanwhile) and the drained items
        # must still be restored in that case.
        state = {
            "queue": queue_items,
            "visited_urls": list(visited_to_save)
        }
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(state, f, ensure_ascii=False, indent=4)
        # Only replace the real state file once the dump is complete.
        os.replace(tmp_path, filepath)
        logging.info(f"Đã lưu trạng thái: Queue={len(queue_items)}, Visited={len(visited_to_save)}")
    except Exception as e:
        logging.error(f"Lỗi khi lưu trạng thái: {e}")
        # Best-effort cleanup of a partially written temp file.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
    finally:
        for item in queue_items:
            q_to_save.put(item)


def load_state(filepath, q_to_load):
    """Restore crawler state from a JSON state file.

    The file is read and validated completely before anything is modified,
    so a malformed file leaves ``q_to_load`` exactly as it was. On success
    the queue's previous content is discarded and replaced by the saved
    pending URLs, skipping duplicates and URLs already listed as visited
    (those would only be fetched again to be thrown away).

    Args:
        filepath: Path of the JSON state file.
        q_to_load: Queue that receives the pending URLs.

    Returns:
        True if state was loaded, False if the file is missing or unusable.
        In both False cases the global ``visited_urls`` is reset to an empty
        set.
    """
    global visited_urls
    if not os.path.exists(filepath):
        logging.info("Không tìm thấy file trạng thái. Bắt đầu với trạng thái mới.")
        visited_urls = set()
        return False

    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            state = json.load(f)

        loaded_visited = set(state.get("visited_urls", []))

        # Keep first occurrences only, preserving the saved order.
        pending_items = []
        already_pending = set()
        for item in state.get("queue", []):
            if item in loaded_visited or item in already_pending:
                continue
            already_pending.add(item)
            pending_items.append(item)
    except Exception as e:
        logging.error(f"Lỗi khi tải trạng thái từ {filepath}: {e}. Bắt đầu với trạng thái mới.")
        visited_urls = set()
        return False

    # The file is valid: now it is safe to replace the queue content.
    while True:
        try:
            q_to_load.get_nowait()
        except queue.Empty:
            break
    for item in pending_items:
        q_to_load.put(item)

    visited_urls = loaded_visited
    logging.info(f"Đã tải trạng thái từ {filepath}: Queue={q_to_load.qsize()}, Visited={len(visited_urls)}")
    return True

def is_valid_url(url, allowed_domains=None):
    """Return True if ``url`` is an http(s) URL on an allowed host.

    The comparison uses the parsed hostname rather than the raw network
    location, so an explicit port, user info, upper-case letters or a
    trailing dot do not cause a permitted host to be rejected. A leading
    ``www.`` is also ignored, which lets ``www.vnu.edu.vn`` (the host of the
    seed URL) match the ``vnu.edu.vn`` entry. Other subdomains are NOT
    matched implicitly; they must be listed.

    Args:
        url: Absolute URL to check.
        allowed_domains: Optional collection of lower-case host names;
            defaults to the module-level ``ALLOWED_DOMAINS``.

    Returns:
        bool. Malformed URLs yield False instead of raising.
    """
    if allowed_domains is None:
        allowed_domains = ALLOWED_DOMAINS

    try:
        parsed_url = urlparse(url)
        hostname = parsed_url.hostname  # lower-cased, without port/userinfo
    except ValueError:
        # urlparse rejects e.g. unbalanced IPv6 brackets.
        return False

    if parsed_url.scheme not in ('http', 'https') or not hostname:
        return False

    hostname = hostname.rstrip('.')
    if hostname in allowed_domains:
        return True
    if hostname.startswith('www.'):
        return hostname[len('www.'):] in allowed_domains
    return False

def sanitize_filename(url_str):
    """Derive a file name from a URL.

    Every ``http://`` and ``https://`` occurrence is removed, each character
    that is not alphanumeric, ``_`` or ``.`` becomes ``_``, the result is cut
    to 150 characters and ``.txt`` is appended.
    """
    without_scheme = url_str.replace("http://", "").replace("https://", "")
    # The per-character filter already maps '/', '?', '=', '&' and '%' to '_',
    # so no separate replacements are needed for them.
    safe_chars = [
        ch if (ch.isalnum() or ch in "_.") else "_"
        for ch in without_scheme
    ]
    return "".join(safe_chars)[:150] + ".txt"

def save_content(url, content):
    """Write ``content`` as UTF-8 text into DATA_DIR, in a file named after ``url``.

    The file name comes from ``sanitize_filename``. Failures are logged and
    swallowed so one bad page cannot stop the calling thread.
    """
    try:
        target_path = os.path.join(DATA_DIR, sanitize_filename(url))
        with open(target_path, "w", encoding="utf-8") as out_file:
            out_file.write(content)
        logging.info(f"Đã lưu: {target_path}")
    except Exception as exc:
        logging.error(f"Lỗi khi lưu file cho {url}: {exc}")


In [None]:

# URLs this process has already put on the queue, so a link that appears on
# many pages is enqueued only once per run. Guarded by lock_visited_urls.
_enqueued_urls = set()

# Link targets ending in one of these are never enqueued.
_EXCLUDED_EXTENSIONS = (
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx',
    '.jpg', '.jpeg', '.png', '.gif', '.bmp', '.svg',
    '.zip', '.rar', '.tar.gz', '.mp3', '.mp4', '.avi',
)

# Tried in order; the first selector that matches anything supplies the text.
_CONTENT_SELECTORS = [
    'article .entry-content', 'article .td-post-content', 'article .content',
    '.noidung', '.content', '.entry-content', '.td-post-content', '.page-content',
    'article', 'main', '#content',
]

# Elements removed before text is taken from a matched content element / <body>.
_UNWANTED_IN_CONTENT = 'nav, footer, script, style, .menu, .sidebar, .related-posts, .comments-area, .header, .site-header'
_UNWANTED_IN_BODY = 'nav, footer, script, style, .menu, .sidebar, .header, .site-header'


def _extract_page_text(soup):
    """Return the main text of a parsed page (empty string if none is found).

    Note: unwanted elements inside the matched content are removed from
    ``soup`` itself, so the tree is modified by this call.
    """
    page_text_parts = []
    found_main_content = False
    for selector in _CONTENT_SELECTORS:
        elements = soup.select(selector)
        if elements:
            for el in elements:
                for unwanted_tag in el.select(_UNWANTED_IN_CONTENT):
                    unwanted_tag.decompose()
                page_text_parts.append(el.get_text(separator="\n", strip=True))
            found_main_content = True
            break
    if not found_main_content and soup.body:
        # Fall back to the whole body, cleaned on a copy.
        body_clone = BeautifulSoup(str(soup.body), "html.parser")
        for unwanted_tag in body_clone.select(_UNWANTED_IN_BODY):
            unwanted_tag.decompose()
        page_text_parts.append(body_clone.get_text(separator="\n", strip=True))
    return "\n\n".join(filter(None, page_text_parts))


def _extract_links(soup, base_url):
    """Return the distinct crawlable URLs linked from a parsed page, in page order."""
    links = []
    seen_on_page = set()
    for link_tag in soup.find_all('a', href=True):
        href = link_tag['href'].strip()
        if not href or href.startswith(('#', 'javascript:', 'mailto:')):
            continue

        try:
            next_url_parsed = urlparse(urljoin(base_url, href))
            # NOTE(review): dropping the query string merges URLs that differ
            # only by query (the seed URL itself carries one) - confirm that
            # this is intended.
            next_url_clean = next_url_parsed._replace(fragment="", query="").geturl()
        except ValueError:
            # A single malformed href must not abort the whole page.
            continue

        if next_url_clean in seen_on_page:
            continue
        seen_on_page.add(next_url_clean)

        if next_url_clean.lower().endswith(_EXCLUDED_EXTENSIONS):
            continue
        if is_valid_url(next_url_clean):
            links.append(next_url_clean)
    return links


def worker():
    """Crawler thread body: take URLs from ``work_queue`` until shutdown is requested.

    For each URL not yet in ``visited_urls`` the page is downloaded; for HTML
    responses the main text is extracted, newly discovered in-scope links are
    enqueued, the URL is recorded as visited and non-empty text is saved via
    ``save_content``. Request and processing errors are logged and the loop
    continues. ``task_done()`` is called exactly once per dequeued URL.
    """
    global urls_processed_count
    while not shutdown_event.is_set():
        try:
            current_url = work_queue.get(timeout=1)
        except queue.Empty:
            continue

        try:
            # Skip URLs that were already handled instead of downloading them
            # again only to discard the result.
            with lock_visited_urls:
                already_visited = current_url in visited_urls
            if already_visited:
                logging.debug(f"Bỏ qua URL đã crawl: {current_url}")
                continue

            logging.info(f"Đang crawl: {current_url} (Queue: {work_queue.qsize()}, Visited: {len(visited_urls)})")

            headers = {'User-Agent': f'My Threaded UET Crawler ({threading.current_thread().name})'}
            # NOTE(security): verify=False disables TLS certificate checks, so
            # responses cannot be authenticated. Kept as-is; re-enable if the
            # target hosts serve valid certificates.
            response = requests.get(current_url, headers=headers, timeout=15, verify=False)
            response.raise_for_status()

            with lock_processed_count:
                urls_processed_count += 1

            content_type = response.headers.get('Content-Type', '').lower()
            if 'text/html' in content_type:
                soup = BeautifulSoup(response.content, "html.parser")
                # Text extraction runs first: it strips nav/menu elements from
                # the matched content, which also removes links inside them.
                page_text = _extract_page_text(soup)
                candidate_links = _extract_links(soup, current_url)

                with lock_visited_urls:
                    for next_url_clean in candidate_links:
                        if next_url_clean in visited_urls or next_url_clean in _enqueued_urls:
                            continue
                        _enqueued_urls.add(next_url_clean)
                        logging.debug(f"Thêm link mới vào queue: {next_url_clean}")
                        work_queue.put(next_url_clean)

                    # Check-and-mark under the same lock so that only one
                    # worker saves a URL fetched concurrently by several.
                    first_visit = current_url not in visited_urls
                    if first_visit:
                        visited_urls.add(current_url)

                if not first_visit:
                    continue

                if page_text.strip():
                    save_content(current_url, page_text)
                    print(f"Đã lưu nội dung từ {current_url} vào file.")
            elif 'application/pdf' in content_type:
                logging.info(f"Tìm thấy file PDF: {current_url}. (Logic tải PDF chưa được triển khai)")
            else:
                logging.info(f"Bỏ qua nội dung không phải HTML/PDF: {content_type} tại {current_url}")

        except requests.exceptions.RequestException as e_req:
            logging.warning(f"Lỗi request với {current_url}: {e_req}")
        except Exception as e_proc:
            logging.error(f"Lỗi xử lý {current_url}: {e_proc}")
        finally:
            # The only task_done() for this item; it also runs for the
            # `continue` statements above.
            work_queue.task_done()

    logging.info(f"{threading.current_thread().name} is stopping.")


In [None]:
# How long the queue must stay empty, with no further page processed, before
# the crawl is considered finished. Chosen larger than the 15 s request
# timeout used by the workers so that a worker still downloading a page (and
# about to enqueue its links) does not cause a premature stop. This is a
# heuristic: exact detection would need in-flight accounting by the workers.
IDLE_SHUTDOWN_SECONDS = 30

# A previous run in the same kernel leaves the event set; without clearing it
# the new workers would exit immediately and the loop below would never end.
shutdown_event.clear()

# Resume from the saved state, or seed the queue with the start URL.
if not load_state(STATE_FILE, work_queue):
    if work_queue.empty() and BASE_URL_UET not in visited_urls:
        work_queue.put(BASE_URL_UET)

threads = []
logging.info(f"Bắt đầu {NUM_WORKER_THREADS} worker threads...")
for i in range(NUM_WORKER_THREADS):
    thread = threading.Thread(target=worker, name=f"Worker-{i+1}")
    thread.daemon = True  # do not keep the process alive on exit
    threads.append(thread)
    thread.start()

try:
    idle_since = None
    last_processed = urls_processed_count
    while True:
        # Nothing can make progress once every worker has exited.
        if not any(t.is_alive() for t in threads):
            logging.error("All worker threads have exited unexpectedly. Stopping.")
            shutdown_event.set()
            break

        processed_now = urls_processed_count
        if not work_queue.empty() or processed_now != last_processed:
            # Pending work or recent progress: restart the idle timer.
            idle_since = None
            last_processed = processed_now
        else:
            now = time.monotonic()
            if idle_since is None:
                idle_since = now
                logging.info("Queue is empty. Checking if all workers are idle...")
            elif now - idle_since >= IDLE_SHUTDOWN_SECONDS:
                logging.info("Queue is still empty. Signaling workers to stop.")
                shutdown_event.set()
                break
        time.sleep(1)

except KeyboardInterrupt:
    logging.info("\nĐã nhận tín hiệu dừng (Ctrl+C). Báo hiệu cho các worker dừng...")
    shutdown_event.set()
except Exception as e_main:
    logging.error(f"Lỗi nghiêm trọng trong main thread: {e_main}")
    shutdown_event.set()
finally:
    logging.info("Đợi các worker threads hoàn thành...")
    for thread in threads:
        # Bounded wait: a worker may still be inside a request.
        thread.join(timeout=10)
        if thread.is_alive():
            logging.warning(f"Thread {thread.name} vẫn còn chạy sau khi join timeout.")

    logging.info("Tất cả worker threads đã dừng (hoặc timeout).")
    # Persist the frontier and the visited set so the next run can resume.
    save_state(work_queue, visited_urls, STATE_FILE)
    logging.info(f"Tổng số URL đã xử lý (ước tính): {urls_processed_count}")
    logging.info("Chương trình kết thúc.")

2025-05-12 12:10:56 MainThread: Đã tải trạng thái từ crawl_state_2.json: Queue=64899, Visited=9722
2025-05-12 12:10:56 MainThread: Bắt đầu 8 worker threads...
2025-05-12 12:10:56 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/67 (Queue: 64898, Visited: 9722)
2025-05-12 12:10:56 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/68 (Queue: 64897, Visited: 9722)
2025-05-12 12:10:56 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/listnews/154/0 (Queue: 64896, Visited: 9722)
2025-05-12 12:10:56 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/index.php/WebControl/viewpage/190 (Queue: 64895, Visited: 9722)
2025-05-12 12:10:56 Worker-5: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/95 (Queue: 64894, Visited: 9722)
2025-05-12 12:10:56 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/listnews/185/0 (Queue: 64893, Visited: 9722)


Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/WebControl/viewnews/1832 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/WebControl/viewnews/1830 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/WebControl/viewnews/1831 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/WebControl/viewnews/1829 vào file.


2025-05-12 12:12:22 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewnews/1824 (Queue: 67860, Visited: 9726)
2025-05-12 12:12:22 Worker-4: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_WebControl_viewnews_1824.txt
2025-05-12 12:12:22 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewnews/3427 (Queue: 67862, Visited: 9727)
2025-05-12 12:12:22 Worker-5: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewnews/1827 (Queue: 67862, Visited: 9727)
2025-05-12 12:12:22 Worker-5: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_WebControl_viewnews_1827.txt
2025-05-12 12:12:22 Worker-5: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewnews/3440 (Queue: 67863, Visited: 9728)


Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/WebControl/viewnews/1824 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/WebControl/viewnews/1827 vào file.


2025-05-12 12:12:23 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewnews/1826 (Queue: 67863, Visited: 9728)
2025-05-12 12:12:23 Worker-7: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewnews/3442 (Queue: 67865, Visited: 9729)
2025-05-12 12:12:23 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/listnews/185/0 (Queue: 67865, Visited: 9729)
2025-05-12 12:12:23 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewnews/1825 (Queue: 67865, Visited: 9729)
2025-05-12 12:12:23 Worker-7: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewnews/3419 (Queue: 67866, Visited: 9729)
2025-05-12 12:12:23 Worker-3: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_WebControl_viewnews_1826.txt
2025-05-12 12:12:23 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewnews/3410 (Queue: 67869, Visited: 9730)
2025-05-12 12:12:23 Worker-4: Đang crawl: https://education.vnu.edu.vn/i

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/WebControl/viewnews/1826 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/WebControl/viewnews/1825 vào file.


2025-05-12 12:12:23 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewnews/3410 (Queue: 67871, Visited: 9730)
2025-05-12 12:12:24 Worker-7: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewnews/3419 (Queue: 67871, Visited: 9730)
2025-05-12 12:12:24 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/24 (Queue: 67872, Visited: 9730)
2025-05-12 12:12:24 Worker-7: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/25 (Queue: 67873, Visited: 9730)
2025-05-12 12:12:24 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/listnews/16/0 (Queue: 67873, Visited: 9730)
2025-05-12 12:12:24 Worker-5: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/22 (Queue: 67873, Visited: 9730)
2025-05-12 12:12:24 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/26 (Queue: 67891, Visited: 9730)
2025-05-12 12:12:24 Worker-4: Đang crawl: https://educat

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/WebControl/listnews/185/index.php/WebControl/viewpage/190 vào file.


2025-05-12 12:12:28 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/74 (Queue: 67925, Visited: 9731)
2025-05-12 12:12:28 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/Site/site/KCNGD/ (Queue: 67925, Visited: 9731)
2025-05-12 12:12:28 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/Site/site/KCKHGD/ (Queue: 67925, Visited: 9731)
2025-05-12 12:12:28 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/Site/site/KGDSTH/ (Queue: 67951, Visited: 9731)
2025-05-12 12:12:28 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/73 (Queue: 67951, Visited: 9731)
2025-05-12 12:12:28 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/Site/site/KSP/ (Queue: 67951, Visited: 9731)
2025-05-12 12:12:28 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/138 (Queue: 67951, Visited: 9731)
2025-05-12 12:12:28 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/WebC

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/27 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/28 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/29 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/30 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/31 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/site/KCKHGD/34 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/32 vào file.


2025-05-12 12:15:10 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/33 (Queue: 73179, Visited: 9738)
2025-05-12 12:15:10 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/43 (Queue: 73179, Visited: 9738)
2025-05-12 12:15:10 Worker-5: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/63 (Queue: 73179, Visited: 9738)
2025-05-12 12:15:10 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/63 (Queue: 73179, Visited: 9738)
2025-05-12 12:15:10 Worker-2: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewstaticsite_33.txt
2025-05-12 12:15:10 Worker-3: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewstaticsite_43.txt
2025-05-12 12:15:10 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/55 (Queue: 73181, Visited: 9741)
2025-05-12 12:15:10 Worker-5: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewnews_63.txt
2025-05-12 12:15:10 Worker-

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/33 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/43 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/63 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/62 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/60 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/55 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/54 vào file.


2025-05-12 12:15:10 Worker-5: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/54 (Queue: 73171, Visited: 9745)
2025-05-12 12:15:10 Worker-5: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/23 (Queue: 73170, Visited: 9745)
2025-05-12 12:15:10 Worker-7: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/30 (Queue: 73170, Visited: 9745)
2025-05-12 12:15:10 Worker-8: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/30 (Queue: 73170, Visited: 9745)
2025-05-12 12:15:10 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/28 (Queue: 73170, Visited: 9745)
2025-05-12 12:15:10 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/28 (Queue: 73170, Visited: 9745)
2025-05-12 12:15:10 Worker-7: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewnews_30.txt
2025-05-12 12:15:10 Worker-8: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/19 (Queue: 73171, Visited: 9

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/30 vào file.Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/28 vào file.

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/25 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/23 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/19 vào file.


2025-05-12 12:15:10 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/21 (Queue: 73172, Visited: 9750)
2025-05-12 12:15:10 Worker-7: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/22 (Queue: 73180, Visited: 9751)
2025-05-12 12:15:10 Worker-4: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewstaticsite_21.txt
2025-05-12 12:15:10 Worker-7: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewstaticsite_22.txt
2025-05-12 12:15:10 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/20 (Queue: 73187, Visited: 9752)
2025-05-12 12:15:10 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/43 (Queue: 73186, Visited: 9752)
2025-05-12 12:15:10 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/23 (Queue: 73186, Visited: 9752)
2025-05-12 12:15:10 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/25 (Queue: 73186,

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/21 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/22 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/25 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/23 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/24 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/20 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/45 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/site/KQLGD/26 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/42 vào file.


2025-05-12 12:15:10 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/16 (Queue: 73203, Visited: 9760)
2025-05-12 12:15:10 Worker-7: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/43 (Queue: 73203, Visited: 9760)
2025-05-12 12:15:10 Worker-7: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/17 (Queue: 73202, Visited: 9760)
2025-05-12 12:15:10 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/41 (Queue: 73202, Visited: 9760)
2025-05-12 12:15:10 Worker-2: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewnews_41.txt
2025-05-12 12:15:10 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/41 (Queue: 73202, Visited: 9761)
2025-05-12 12:15:10 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/site/site/KSP/18 (Queue: 73201, Visited: 9761)
2025-05-12 12:15:10 Worker-5: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/13 (Queue: 73201

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/43 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/41 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/11 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/13 vào file.


2025-05-12 12:15:11 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/16 (Queue: 73219, Visited: 9764)
2025-05-12 12:15:11 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/59 (Queue: 73218, Visited: 9764)
2025-05-12 12:15:11 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/14 (Queue: 73218, Visited: 9764)
2025-05-12 12:15:11 Worker-8: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/61 (Queue: 73223, Visited: 9765)
2025-05-12 12:15:11 Worker-3: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewstaticsite_14.txt
2025-05-12 12:15:11 Worker-4: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewstaticsite_16.txt
2025-05-12 12:15:11 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/47 (Queue: 73227, Visited: 9766)
2025-05-12 12:15:11 Worker-5: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/61 (Queue: 73227, Visited: 97

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/15 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/14 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/16 vào file.


2025-05-12 12:15:11 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/site/site/KSP/18 (Queue: 73227, Visited: 9766)
2025-05-12 12:15:11 Worker-8: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewnews_61.txt
2025-05-12 12:15:11 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/59 (Queue: 73229, Visited: 9767)
2025-05-12 12:15:11 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/53 (Queue: 73228, Visited: 9767)
2025-05-12 12:15:11 Worker-6: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewstaticsite_47.txt
2025-05-12 12:15:11 Worker-5: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/53 (Queue: 73232, Visited: 9768)
2025-05-12 12:15:11 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/59 (Queue: 73232, Visited: 9768)
2025-05-12 12:15:11 Worker-2: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_site_KSP_18.txt
2025-05-12 12:15:11 Worker-2: Đang crawl: htt

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/61 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/47 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/site/KSP/18 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/59 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/17 vào file.


2025-05-12 12:15:11 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/50 (Queue: 73256, Visited: 9771)
2025-05-12 12:15:11 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/52 (Queue: 73257, Visited: 9771)
2025-05-12 12:15:11 Worker-5: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/53 (Queue: 73257, Visited: 9771)
2025-05-12 12:15:11 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/51 (Queue: 73256, Visited: 9771)
2025-05-12 12:15:11 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/51 (Queue: 73256, Visited: 9771)
2025-05-12 12:15:11 Worker-7: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/50 (Queue: 73256, Visited: 9771)
2025-05-12 12:15:11 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/53 (Queue: 73256, Visited: 9771)
2025-05-12 12:15:11 Worker-6: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewnews_52.txt
2

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/52 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/53 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/51 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/50 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/48 vào file.


2025-05-12 12:15:11 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/48 (Queue: 73249, Visited: 9776)
2025-05-12 12:15:11 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/47 (Queue: 73249, Visited: 9776)
2025-05-12 12:15:11 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/44 (Queue: 73248, Visited: 9776)
2025-05-12 12:15:11 Worker-5: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/47 (Queue: 73248, Visited: 9777)
2025-05-12 12:15:11 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/46 (Queue: 73248, Visited: 9777)
2025-05-12 12:15:11 Worker-7: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/46 (Queue: 73248, Visited: 9777)
2025-05-12 12:15:11 Worker-8: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/45 (Queue: 73248, Visited: 9777)
2025-05-12 12:15:11 Worker-6: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewnews_47.txt
2

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/47 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/46 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/45 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/44 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/29 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/22 vào file.


2025-05-12 12:15:12 Worker-4: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewnews_24.txt
2025-05-12 12:15:12 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/26 (Queue: 73238, Visited: 9783)
2025-05-12 12:15:12 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/38 (Queue: 73237, Visited: 9783)
2025-05-12 12:15:12 Worker-7: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/29 (Queue: 73237, Visited: 9784)
2025-05-12 12:15:12 Worker-8: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/26 (Queue: 73237, Visited: 9784)
2025-05-12 12:15:12 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/24 (Queue: 73237, Visited: 9784)
2025-05-12 12:15:12 Worker-2: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewnews_26.txt
2025-05-12 12:15:12 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/site/site/KCNGD/42 (Queue: 73233, Visited: 9784)
2025-05-12 12:15:1

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/24 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/26 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/35 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/37 vào file.


2025-05-12 12:15:12 Worker-8: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/40 (Queue: 73245, Visited: 9786)
2025-05-12 12:15:12 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/38 (Queue: 73245, Visited: 9786)
2025-05-12 12:15:12 Worker-8: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewstaticsite_40.txt
2025-05-12 12:15:12 Worker-4: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewstaticsite_38.txt
2025-05-12 12:15:12 Worker-8: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/39 (Queue: 73255, Visited: 9788)
2025-05-12 12:15:12 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/39 (Queue: 73254, Visited: 9788)
2025-05-12 12:15:12 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/41 (Queue: 73254, Visited: 9788)
2025-05-12 12:15:12 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewstaticsite/44 (Queue: 73254, Visit

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/40 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/38 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/41 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/44 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/site/KCNGD/42 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewstaticsite/39 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/40 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/39 vào file.


2025-05-12 12:15:12 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/32 (Queue: 73271, Visited: 9794)
2025-05-12 12:15:12 Worker-8: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/39 (Queue: 73271, Visited: 9794)
2025-05-12 12:15:12 Worker-8: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/32 (Queue: 73270, Visited: 9794)
2025-05-12 12:15:12 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/38 (Queue: 73270, Visited: 9794)
2025-05-12 12:15:12 Worker-3: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewnews_38.txt
2025-05-12 12:15:12 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/31 (Queue: 73269, Visited: 9795)
2025-05-12 12:15:12 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/38 (Queue: 73269, Visited: 9795)
2025-05-12 12:15:12 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/31 (Queue: 73268, Visited: 9795)
2

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/38 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/36 vào file.


2025-05-12 12:15:12 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/31 (Queue: 73276, Visited: 9797)
2025-05-12 12:15:12 Worker-8: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/34 (Queue: 73275, Visited: 9797)
2025-05-12 12:15:12 Worker-7: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewnews_37.txt
2025-05-12 12:15:12 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/34 (Queue: 73274, Visited: 9798)
2025-05-12 12:15:12 Worker-3: Đã lưu: uet_vnu_data/education.vnu.edu.vn_index.php_site_viewnews_31.txt
2025-05-12 12:15:12 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/329 (Queue: 73272, Visited: 9799)
2025-05-12 12:15:12 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/35 (Queue: 73274, Visited: 9799)
2025-05-12 12:15:12 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/37 (Queue: 73274, Visited: 9799)
2025-05-12 12:15:12

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/32 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/37 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/31 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/35 vào file.
Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/site/viewnews/34 vào file.


2025-05-12 12:15:13 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/Site/site/KGDSTH/ (Queue: 73267, Visited: 9801)
2025-05-12 12:15:13 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/208 (Queue: 73277, Visited: 9801)
2025-05-12 12:15:13 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/site/viewnews/34 (Queue: 73277, Visited: 9801)
2025-05-12 12:15:13 Worker-4: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/246 (Queue: 73276, Visited: 9801)
2025-05-12 12:15:13 Worker-8: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/83 (Queue: 73276, Visited: 9801)
2025-05-12 12:15:13 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/listnews/76/0 (Queue: 73276, Visited: 9801)
2025-05-12 12:15:13 Worker-1: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/153 (Queue: 73276, Visited: 9801)
2025-05-12 12:15:13 Worker-8: Đang crawl: https://education.vnu.edu.

Đã lưu nội dung từ https://education.vnu.edu.vn/index.php/WebControl/listnews/76/index.php/WebControl/viewpage/190 vào file.


2025-05-12 12:15:52 Worker-3: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/listnews/221/0 (Queue: 74428, Visited: 9802)
2025-05-12 12:15:52 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/273 (Queue: 74428, Visited: 9802)
2025-05-12 12:15:52 Worker-5: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/listnews/170/0 (Queue: 74439, Visited: 9802)
2025-05-12 12:15:52 Worker-6: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/198 (Queue: 74439, Visited: 9802)
2025-05-12 12:15:52 Worker-8: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/278 (Queue: 74439, Visited: 9802)
2025-05-12 12:15:52 Worker-8: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/viewpage/190 (Queue: 74439, Visited: 9802)
2025-05-12 12:15:52 Worker-2: Đang crawl: https://education.vnu.edu.vn/index.php/WebControl/listnews/148/0 (Queue: 74439, Visited: 9802)
2025-05-12 12:15:52 Worker-4: Đang crawl: https:/