In [1]:
# Import required libraries for web scraping and API calls
import requests
from bs4 import BeautifulSoup
from openai import OpenAI
import json
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

In [2]:
# Base URL for vneconomy website
BASE_URL = "https://vneconomy.vn/"

# Mapping of VN30 stock codes to their related keywords
KEYWORDS_MAP = {
    "ACB": ["ACB", "Ngân hàng ACB", "Ngân hàng TMCP Á Châu"],
    "BCM": ["BCM", "Becamex", "KCN Bình Dương", "khu công nghiệp Bình Dương", "VSIP", "Becamex IDC"],
    "BID": ["BIDV", "Ngân hàng Đầu tư và Phát triển Việt Nam"],
    "CTG": ["CTG", "VietinBank", "Ngân hàng Công Thương Việt Nam"],
    "DGC": ["DGC", "Hóa chất Đức Giang"],
    "FPT": ["FPT"],
    "GAS": ["PV GAS", "PV Gas", "Tổng Công ty Khí Việt Nam"],
    "GVR": ["GVR", "Tập đoàn Cao su", "Tập đoàn Công nghiệp Cao su Việt Nam"],
    "HDB": ["HDB", "HDBank", "Ngân hàng TMCP Phát triển Thành phố Hồ Chí Minh"],
    "HPG": ["HPG", "Hòa Phát"],
    "LPB": ["LPB", "LPBank", "LienVietPostBank", "Ngân hàng Bưu điện Liên Việt"],
    "MBB": ["MBB", "MBBank", "Ngân hàng Quân đội", "MB", "Ngân hàng TMCP Quân đội"],
    "MSN": ["MSN", "Masan", "WinCommerce"],
    "MWG": ["MWG", "Thế Giới Di Động", "Mobile World", "Bách Hóa Xanh", "BHX", "Điện Máy Xanh", "ĐMX", "TGDĐ"],
    "PLX": ["PLX", "Petrolimex", "Tập đoàn Xăng dầu Việt Nam"],
    "SAB": ["SAB", "Sabeco", "Tổng Công ty CP Bia - Rượu - Nước giải khát Sài Gòn"],
    "SHB": ["SHB", "Ngân hàng Thương mại Cổ phần Sài Gòn – Hà Nội", "Ngân hàng TMCP Sài Gòn Hà Nội"],
    "SSB": ["SSB", "Ngân hàng Thương mại Cổ phần Đông Nam Á", "Ngân hàng TMCP Đông Nam Á", "SeABank"],
    "SSI": ["SSI", "Chứng khoán SSI"],
    "STB": ["STB", "Sài Gòn Thương Tín", "Sacombank"],
    "TCB": ["TCB", "Techcombank", "Ngân hàng TMCP Kỹ Thương Việt Nam"],
    "TPB": ["TPB", "TPBank", "Ngân hàng Tiên Phong", "Ngân hàng TMCP Tiên Phong"],
    "VCB": ["VCB", "Vietcombank", "Ngân hàng TMCP Ngoại Thương Việt Nam", "Ngân hàng Ngoại thương"],
    "VHM": ["VHM", "Vinhomes"],
    "VIB": ["VIB", "Ngân hàng TMCP Quốc Tế Việt Nam", "Ngân hàng Quốc Tế"],
    "VIC": ["VIC", "Vingroup", "Công ty Cổ phần Tập đoàn Vingroup"],
    "VJC": ["VJC", "Vietjet Air", "Công ty Cổ phần Hàng không Vietjet", "máy bay Vietjet"],
    "VNM": ["VNM", "Vinamilk", "Công ty Cổ phần Sữa Việt Nam"],
    "VPB": ["VPB", "VPBank", "Ngân hàng TMCP Việt Nam Thịnh Vượng"],
    "VRE": ["VRE", "Vincom Retail", "Công ty Cổ phần Vincom Retail"]
}

# Limit category option
LIMIT_CATEGORY = False
NUM_CATEGORY = 5

In [3]:
def get_categories():
    # Fetch the homepage HTML
    html = requests.get(BASE_URL, timeout=10).text
    soup = BeautifulSoup(html, "html.parser")

    categories = []

    # Find all anchor tags with title attribute
    for a in soup.find_all("a", title=True):
        href = a.get("href", "")
        
        # Only keep links ending with .htm
        if href.endswith(".htm"):
            title = a.get("title").strip()

            # Normalize the URL to absolute path
            if href.startswith("/"):
                full_url = BASE_URL.rstrip("/") + href
            else:
                full_url = href

            categories.append({
                "title": title,
                "url": full_url
            })

    return categories


def filter_category_pages(categories):
    # Initialize OpenAI client with API key from environment variable
    client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

    # Format categories as text for GPT
    urls_text = "\n".join([f"- {c['title']}: {c['url']}" for c in categories])

    # Call GPT to filter out article pages and keep only category pages
    response = client.chat.completions.create(
        model="gpt-4.1-mini",
        messages=[
            {
                "role": "system",
                "content": "You are an assistant that filters a list of URLs. "
                           "Keep only main category pages and remove specific news articles. "
                           "Return only a JSON array of objects with 'title' and 'url'."
            },
            {
                "role": "user",
                "content": f"Here is the list:\n{urls_text}\n\nFilter them and return JSON array."
            }
        ],
        temperature=0
    )

    # Parse the JSON response
    filtered_json = response.choices[0].message.content
    try:
        filtered_list = json.loads(filtered_json)
    except json.JSONDecodeError:
        print("Failed to parse JSON. Here's raw output:")
        print(filtered_json)
        filtered_list = []

    return filtered_list


# Get all categories from the homepage
cats = get_categories()
# Filter to keep only main category pages
filtered_cats = filter_category_pages(cats)

# Display filtered categories
print("Filtered Categories:")
for c in filtered_cats:
    print(f"- {c['title']}: {c['url']}")

Filtered Categories:
- Chứng khoán: https://vneconomy.vn/chung-khoan.htm
- Tiêu & Dùng: https://vneconomy.vn/tieu-dung.htm
- VnE TV: https://vneconomy.vn/video.htm
- eMagazine: https://vneconomy.vn/emagazine.htm
- Infographics: https://vneconomy.vn/infographics.htm
- Kinh tế xanh: https://vneconomy.vn/kinh-te-xanh.htm
- Chuyển động xanh: https://vneconomy.vn/chuyen-dong-xanh.htm
- Pháp lý: https://vneconomy.vn/phap-ly-kinh-te-xanh.htm
- Thương hiệu xanh: https://vneconomy.vn/thuong-hieu-xanh.htm
- Diễn đàn: https://vneconomy.vn/dien-dan-kinh-te-xanh.htm
- Tiêu điểm: https://vneconomy.vn/tieu-diem.htm
- Tài chính: https://vneconomy.vn/tai-chinh.htm
- Ngân hàng: https://vneconomy.vn/tai-chinh-ngan-hang.htm
- Thị trường vốn: https://vneconomy.vn/thi-truong-von-tai-chinh.htm
- Thuế: https://vneconomy.vn/thue-tai-chhinh.htm
- Bảo hiểm: https://vneconomy.vn/bao-hiem-tai-chinh.htm
- Doanh nghiệp niêm yết: https://vneconomy.vn/doanh-nghiep-niem-yet.htm
- Thị trường: https://vneconomy.vn/thi-tr

In [4]:
import requests
from bs4 import BeautifulSoup
import time
import json

def fetch_article_summary(article_url):
    # Fetch the meta description from article page as summary
    try:
        html = requests.get(article_url, timeout=10).text
        soup = BeautifulSoup(html, "html.parser")
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc and meta_desc.get("content"):
            return meta_desc["content"].strip()
    except Exception as e:
        print(f"Failed to fetch summary for {article_url}: {e}")
    return ""


def crawl_articles(category_url, max_pages=20, delay=1):
    # Crawl all articles from a category with pagination
    articles = []

    for page in range(1, max_pages + 1):
        # Build pagination URL
        if "?" in category_url:
            url = f"{category_url}&page={page}"
        else:
            url = f"{category_url}?page={page}"

        print(f"Crawling {url} ...")
        try:
            html = requests.get(url, timeout=10).text
        except Exception as e:
            print(f"Failed to fetch {url}: {e}")
            break

        soup = BeautifulSoup(html, "html.parser")
        # Find all article links with responsive-image-link class
        article_tags = soup.find_all("a", class_="responsive-image-link", title=True, href=True)

        # Stop if no more articles found
        if not article_tags:
            break

        for a in article_tags:
            title = a.get("title").strip()
            href = a.get("href").strip()
            full_url = href if href.startswith("http") else f"https://vneconomy.vn{href}"

            # Fetch article summary
            summary = fetch_article_summary(full_url)

            articles.append({
                "title": title,
                "url": full_url,
                "summary": summary
            })

        # Delay between requests to avoid overwhelming the server
        time.sleep(delay)

    return articles


# Crawl articles from each category
all_articles = {}
cnt = 0
for cat in filtered_cats:
    print(f"Processing category: {cat['title']}")
    cat_articles = crawl_articles(cat['url'])
    all_articles[cat['title']] = cat_articles
    cnt += 1
    if LIMIT_CATEGORY:
        # Limit to 5 categories
        if cnt >= NUM_CATEGORY:
            break

# Save all articles to JSON file
with open("vneconomy_articles.json", "w", encoding="utf-8") as f:
    json.dump(all_articles, f, ensure_ascii=False, indent=2)

print("Done. Total categories:", len(all_articles))

Processing category: Chứng khoán
Crawling https://vneconomy.vn/chung-khoan.htm?page=1 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=2 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=3 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=4 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=5 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=6 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=7 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=8 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=9 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=10 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=11 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=12 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=13 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=14 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=15 ...
Crawling https://vneconomy.vn/chung-khoan.htm?page=16 ...
Crawling https://vneconomy.vn/chung-khoan.htm?pa

In [5]:
def filter_articles_by_keywords(articles, keywords_map=KEYWORDS_MAP):
    # Filter articles that contain VN30 stock-related keywords
    filtered = []
    for art in articles:
        # Combine title and summary for keyword matching
        text = (art["title"] + " " + art.get("summary", "")).lower()
        matched_codes = []
        # Check each stock code's keywords
        for code, kws in keywords_map.items():
            for kw in kws:
                if kw.lower() in text:
                    matched_codes.append(code)
                    break
        # Only keep articles that matched at least one stock code
        if matched_codes:
            art_copy = art.copy()
            art_copy["codes"] = matched_codes
            filtered.append(art_copy)
    return filtered

In [6]:
# Flatten all articles from all categories into a single list
all_articles_list = []
for cat_articles in all_articles.values():
    all_articles_list.extend(cat_articles)

# Filter articles by VN30 stock keywords
filtered_articles = filter_articles_by_keywords(all_articles_list, keywords_map=KEYWORDS_MAP)
print(f"Articles matching keywords: {len(filtered_articles)}")

# Save filtered articles to JSON file
with open("filtered_articles.json", "w", encoding="utf-8") as f:
    json.dump(filtered_articles, f, ensure_ascii=False, indent=2)

Articles matching keywords: 154
