<a href="https://colab.research.google.com/github/hoanglambinh/ISODS-Entrance/blob/main/ISODS_Test_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import os
import re
import requests
from bs4 import BeautifulSoup
from transformers import AutoTokenizer, AutoModel
import torch
import glob

def ensure_directories(base_path):
    """Create each output sub-folder under ``base_path`` if it is missing.

    Folders that already exist are left as they are.
    """
    for folder_name in ("vbpl", "property", "history", "related", "pdf"):
        target = os.path.join(base_path, folder_name)
        os.makedirs(target, exist_ok=True)

def sanitize_filename(filename):
    """Return ``filename`` with each unsafe character replaced by an underscore.

    The characters replaced are: backslash, slash, ``*``, ``?``, double
    quote, ``<``, ``>``, ``|`` and ``#``.
    """
    unsafe_chars = re.compile(r'[\\/*?"<>|#]')
    return unsafe_chars.sub('_', filename)

def fetch_and_save(url, save_path):
    """Download ``url`` and store the response body at ``save_path`` as UTF-8 text.

    The request uses a 10 second timeout. Any ``requests.RequestException``
    (including the HTTP error raised by ``raise_for_status``) is reported on
    stdout instead of being propagated.
    """
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
        with open(save_path, "w", encoding="utf-8") as out:
            out.write(reply.text)
        print(f"Saved: {save_path}")
    except requests.RequestException as err:
        print(f"Failed to fetch {url}: {err}")

def save_pdf(url, save_path):
    """Download ``url`` and store the raw response bytes at ``save_path``.

    The request uses a 10 second timeout. Any ``requests.RequestException``
    (including the HTTP error raised by ``raise_for_status``) is reported on
    stdout instead of being propagated.
    """
    try:
        reply = requests.get(url, timeout=10)
        reply.raise_for_status()
        with open(save_path, "wb") as out:
            out.write(reply.content)
        print(f"Saved PDF: {save_path}")
    except requests.RequestException as err:
        print(f"Failed to fetch {url}: {err}")

def scrape_documents(base_path):
    """Download every resource for each document referenced by the index files.

    Reads all ``*.html`` files in ``<base_path>/demuc``, extracts the
    ``ItemID=<digits>`` value from each anchor ``href``, and for every
    distinct ID downloads five resources from vbpl.vn into the matching
    sub-folder of ``base_path`` (``vbpl``, ``property``, ``history``,
    ``related``, ``pdf``).

    Each item ID is processed at most once per call. Index pages commonly
    link the same document several times, and without de-duplication the
    same five URLs would be re-downloaded for every occurrence.

    Args:
        base_path: Root folder holding the ``demuc`` index pages; the output
            sub-folders are created beneath it.
    """
    ensure_directories(base_path)

    # Compile once: the pattern is applied to every anchor of every index file.
    item_id_pattern = re.compile(r"ItemID=(\d+)")
    # IDs already handled in this call, shared across all index files.
    seen_item_ids = set()

    for index_file in glob.glob(os.path.join(base_path, "demuc", "*.html")):
        with open(index_file, "r", encoding="utf-8") as file:
            soup = BeautifulSoup(file, "html.parser")

        for link in soup.find_all("a", href=True):
            match = item_id_pattern.search(link["href"])
            if match is None:
                continue

            item_id = match.group(1)
            if item_id in seen_item_ids:
                continue
            seen_item_ids.add(item_id)

            # Maps output sub-folder -> (source URL, file name to save as).
            urls = {
                "vbpl": (f"https://vbpl.vn/TW/Pages/vbpq-toanvan.aspx?ItemID={item_id}", f"full_{item_id}.html"),
                "property": (f"https://vbpl.vn/tw/Pages/vbpq-thuoctinh.aspx?dvid=13&ItemID={item_id}", f"p_{item_id}.html"),
                "history": (f"https://vbpl.vn/tw/Pages/vbpq-lichsu.aspx?dvid=13&ItemID={item_id}", f"h_{item_id}.html"),
                "related": (f"https://vbpl.vn/TW/Pages/vbpq-vanbanlienquan.aspx?ItemID={item_id}", f"r_{item_id}.html"),
                # NOTE(review): this is an .aspx page URL saved with a .pdf
                # extension; confirm the server really returns PDF bytes here.
                "pdf": (f"https://vbpl.vn/tw/Pages/vbpq-van-ban-goc.aspx?ItemID={item_id}", f"pdf_{item_id}.pdf")
            }

            for key, (url, filename) in urls.items():
                save_path = os.path.join(base_path, key, sanitize_filename(filename))
                if key == "pdf":
                    save_pdf(url, save_path)
                else:
                    fetch_and_save(url, save_path)

def load_embedding_model():
    """Load the tokenizer and encoder of the Vietnamese bi-encoder checkpoint.

    Returns:
        A ``(tokenizer, model)`` tuple, both obtained with ``from_pretrained``
        for ``bkai-foundation-models/vietnamese-bi-encoder``.
    """
    checkpoint = "bkai-foundation-models/vietnamese-bi-encoder"
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        AutoModel.from_pretrained(checkpoint),
    )

def chunk_text(text, tokenizer, max_tokens=2000, overlap=20):
    """Split ``text`` into overlapping chunks of at most ``max_tokens`` tokens.

    The text is tokenized without special tokens, cut into windows of
    ``max_tokens`` token IDs that each start ``max_tokens - overlap`` tokens
    after the previous one, and every window is decoded back to a string.

    Chunking stops as soon as a window reaches the end of the token list, so
    no trailing chunk is emitted that contains only tokens already present in
    the previous chunk.

    Args:
        text: The text to split.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            and ``decode(token_ids)``.
        max_tokens: Maximum number of tokens per chunk; must be positive.
            NOTE(review): the default of 2000 may exceed the sequence length
            the embedding model accepts - confirm against the model card.
        overlap: Number of tokens shared by consecutive chunks; must satisfy
            ``0 <= overlap < max_tokens``.

    Returns:
        A list of decoded chunk strings; empty when ``text`` yields no tokens.

    Raises:
        ValueError: If ``max_tokens`` or ``overlap`` is out of range. A
            non-positive stride would otherwise either fail inside ``range``
            or silently produce no chunks at all.
    """
    if max_tokens <= 0:
        raise ValueError("max_tokens must be a positive integer")
    if not 0 <= overlap < max_tokens:
        raise ValueError("overlap must satisfy 0 <= overlap < max_tokens")

    tokens = tokenizer.encode(text, add_special_tokens=False)
    stride = max_tokens - overlap
    chunks = []
    for start in range(0, len(tokens), stride):
        chunks.append(tokenizer.decode(tokens[start:start + max_tokens]))
        # This window already covers the tail; a further window would hold
        # nothing but overlap tokens.
        if start + max_tokens >= len(tokens):
            break
    return chunks

def generate_embeddings(base_path, tokenizer, model):
    """Embed every scraped full-text page and save the vectors next to it.

    For each ``*.html`` file in ``<base_path>/vbpl`` the visible text is
    extracted, split with ``chunk_text``, and each chunk is encoded by
    ``model``. A chunk's embedding is the mean of the last hidden state over
    the sequence dimension. The list of per-chunk NumPy arrays is written
    with ``torch.save`` to a file with the same name and a ``.pt`` extension.

    Args:
        base_path: Root folder containing the ``vbpl`` sub-folder.
        tokenizer: Tokenizer used both for chunking and for model inputs.
        model: Encoder whose output exposes ``last_hidden_state``.
    """
    vbpl_files = glob.glob(os.path.join(base_path, "vbpl", "*.html"))
    for file_path in vbpl_files:
        with open(file_path, "r", encoding="utf-8") as file:
            text = BeautifulSoup(file, "html.parser").get_text()

        embeddings = []
        for chunk in chunk_text(text, tokenizer):
            # NOTE(review): max_length=2000 may exceed the model's supported
            # sequence length - confirm against the model card.
            inputs = tokenizer(chunk, return_tensors="pt", truncation=True, padding=True, max_length=2000)
            with torch.no_grad():
                embedding = model(**inputs).last_hidden_state.mean(dim=1).squeeze().numpy()
            embeddings.append(embedding)

        # Swap only the final extension. str.replace would also rewrite any
        # ".html" inside directory names, and would leave the path unchanged
        # for a differently-cased extension, overwriting the source page.
        output_path = os.path.splitext(file_path)[0] + ".pt"
        torch.save(embeddings, output_path)
        print(f"Embeddings saved for {file_path}")

# === Run the Scraper ===
# Root folder for the whole pipeline: scrape_documents reads the index pages
# from its "demuc" sub-folder and writes the downloads into sibling sub-folders.
# NOTE(review): the path looks like a Google Drive mount inside Colab - confirm
# that Drive is mounted before this cell runs.
BASE_PATH = "/content/drive/MyDrive/BoPhapDienDienTu"
scrape_documents(BASE_PATH)

# === Load Model and Generate Embeddings ===
# Runs after scraping because generate_embeddings reads the pages saved under
# BASE_PATH/vbpl.
tokenizer, model = load_embedding_model()
generate_embeddings(BASE_PATH, tokenizer, model)

print("Scraping and embedding complete!")


Saved: /content/drive/MyDrive/BoPhapDienDienTu/vbpl/full_25819.html
Saved: /content/drive/MyDrive/BoPhapDienDienTu/property/p_25819.html
Saved: /content/drive/MyDrive/BoPhapDienDienTu/history/h_25819.html
Saved: /content/drive/MyDrive/BoPhapDienDienTu/related/r_25819.html
Saved PDF: /content/drive/MyDrive/BoPhapDienDienTu/pdf/pdf_25819.pdf
Saved: /content/drive/MyDrive/BoPhapDienDienTu/vbpl/full_26535.html
Saved: /content/drive/MyDrive/BoPhapDienDienTu/property/p_26535.html
Saved: /content/drive/MyDrive/BoPhapDienDienTu/history/h_26535.html
Saved: /content/drive/MyDrive/BoPhapDienDienTu/related/r_26535.html
Saved PDF: /content/drive/MyDrive/BoPhapDienDienTu/pdf/pdf_26535.pdf
Saved: /content/drive/MyDrive/BoPhapDienDienTu/vbpl/full_25819.html
Saved: /content/drive/MyDrive/BoPhapDienDienTu/property/p_25819.html
Saved: /content/drive/MyDrive/BoPhapDienDienTu/history/h_25819.html
Saved: /content/drive/MyDrive/BoPhapDienDienTu/related/r_25819.html
Saved PDF: /content/drive/MyDrive/BoPhapDie