In [None]:
!pip install beautifulsoup4 requests pandas numpy openpyxl openai python-dotenv selenium webdriver-manager httpx pydantic tqdm

In [47]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import json
import os
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

In [48]:
LIMIT_CATEGORY = False
NUM_CATEGORIES = 5

In [49]:
# Base URL for cafef website
BASE_URL = "https://cafef.vn"

# Check if API key is loaded
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    print("WARNING: OPENAI_API_KEY not found in environment variables!")
    print("Please create a .env file with: OPENAI_API_KEY=your-api-key")
    print("Or set it manually below:")
    # Uncomment and add your key here if .env doesn't work
    # api_key = "your-api-key-here"
else:
    print("API key loaded successfully")

def get_categories():
    # Fetch the homepage HTML
    html = requests.get(BASE_URL, timeout=10).text
    soup = BeautifulSoup(html, "html.parser")

    categories = []

    # Find the menu category div
    menu_div = soup.find("div", class_="menucategory")
    if menu_div:
        # Find all anchor tags within the menu
        for a in menu_div.find_all("a", href=True):
            href = a.get("href", "")
            title = a.get("title", a.text).strip()
            
            # Skip empty titles and home page
            if not title or href == "/":
                continue
            
            # Only keep links ending with .chn
            if href.endswith(".chn"):
                # Normalize the URL to absolute path
                if href.startswith("/"):
                    full_url = BASE_URL.rstrip("/") + href
                else:
                    full_url = href

                categories.append({
                    "title": title,
                    "url": full_url
                })

    return categories


def filter_category_pages(categories):
    # Initialize OpenAI client with API key from environment variable
    if not api_key:
        print("Skipping GPT filtering - using all categories")
        return categories
    
    client = OpenAI(api_key=api_key)

    # Format categories as text for GPT
    urls_text = "\n".join([f"- {c['title']}: {c['url']}" for c in categories])

    # Call GPT to filter out article pages and keep only main category pages
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are an assistant that filters a list of URLs. "
                           "Keep only main category pages and remove specific news articles or subcategories. "
                           "Return only a JSON array of objects with 'title' and 'url'."
            },
            {
                "role": "user",
                "content": f"Here is the list:\n{urls_text}\n\nFilter them and return JSON array."
            }
        ],
        temperature=0
    )

    # Parse the JSON response
    filtered_json = response.choices[0].message.content
    try:
        filtered_list = json.loads(filtered_json)
    except json.JSONDecodeError:
        print("Failed to parse JSON. Here's raw output:")
        print(filtered_json)
        filtered_list = []

    return filtered_list


# Get all categories from the homepage
cats = get_categories()

# Check if categories were found
if not cats:
    print("ERROR: No categories found!")
else:
    print(f"Found {len(cats)} categories")

# Filter to keep only main category pages
filtered_cats = filter_category_pages(cats)

# Check if filtering returned results
if not filtered_cats:
    print("WARNING: No filtered categories returned. Using all categories instead.")
    filtered_cats = cats

# Display filtered categories
print(f"\nFiltered Categories ({len(filtered_cats)}):")
for c in filtered_cats:
    print(f"- {c['title']}: {c['url']}")

API key loaded successfully
Found 26 categories
Failed to parse JSON. Here's raw output:
```json
[
    {
        "title": "XÃ HỘI",
        "url": "https://cafef.vn/xa-hoi.chn"
    },
    {
        "title": "CHỨNG KHOÁN",
        "url": "https://cafef.vn/thi-truong-chung-khoan.chn"
    },
    {
        "title": "BẤT ĐỘNG SẢN",
        "url": "https://cafef.vn/bat-dong-san.chn"
    },
    {
        "title": "DOANH NGHIỆP",
        "url": "https://cafef.vn/doanh-nghiep.chn"
    },
    {
        "title": "NGÂN HÀNG",
        "url": "https://cafef.vn/tai-chinh-ngan-hang.chn"
    },
    {
        "title": "Smart Money",
        "url": "https://cafef.vn/smart-money.chn"
    },
    {
        "title": "TÀI CHÍNH QUỐC TẾ",
        "url": "https://cafef.vn/tai-chinh-quoc-te.chn"
    },
    {
        "title": "VĨ MÔ",
        "url": "https://cafef.vn/vi-mo-dau-tu.chn"
    },
    {
        "title": "KINH TẾ SỐ",
        "url": "https://cafef.vn/kinh-te-so.chn"
    },
    {
        "title": "THỊ TR

In [50]:
# Hardcoded zone_id for all categories
DEFAULT_ZONE_ID = 18832

# Mapping of VN30 stock codes to their related keywords
KEYWORDS_MAP = {
    "ACB": ["ACB", "Ngân hàng ACB", "Ngân hàng TMCP Á Châu"],
    "BCM": ["BCM", "Becamex", "KCN Bình Dương", "khu công nghiệp Bình Dương", "VSIP", "Becamex IDC"],
    "BID": ["BIDV", "Ngân hàng Đầu tư và Phát triển Việt Nam"],
    "CTG": ["CTG", "VietinBank", "Ngân hàng Công Thương Việt Nam"],
    "DGC": ["DGC", "Hóa chất Đức Giang"],
    "FPT": ["FPT"],
    "GAS": ["PV GAS", "PV Gas", "Tổng Công ty Khí Việt Nam"],
    "GVR": ["GVR", "Tập đoàn Cao su", "Tập đoàn Công nghiệp Cao su Việt Nam"],
    "HDB": ["HDB", "HDBank", "Ngân hàng TMCP Phát triển Thành phố Hồ Chí Minh"],
    "HPG": ["HPG", "Hòa Phát"],
    "LPB": ["LPB", "LPBank", "LienVietPostBank", "Ngân hàng Bưu điện Liên Việt"],
    "MBB": ["MBB", "MBBank", "Ngân hàng Quân đội", "MB", "Ngân hàng TMCP Quân đội"],
    "MSN": ["MSN", "Masan", "WinCommerce"],
    "MWG": ["MWG", "Thế Giới Di Động", "Mobile World", "Bách Hóa Xanh", "BHX", "Điện Máy Xanh", "ĐMX", "TGDĐ"],
    "PLX": ["PLX", "Petrolimex", "Tập đoàn Xăng dầu Việt Nam"],
    "SAB": ["SAB", "Sabeco", "Tổng Công ty CP Bia - Rượu - Nước giải khát Sài Gòn"],
    "SHB": ["SHB", "Ngân hàng Thương mại Cổ phần Sài Gòn – Hà Nội", "Ngân hàng TMCP Sài Gòn Hà Nội"],
    "SSB": ["SSB", "Ngân hàng Thương mại Cổ phần Đông Nam Á", "Ngân hàng TMCP Đông Nam Á", "SeABank"],
    "SSI": ["SSI", "Chứng khoán SSI"],
    "STB": ["STB", "Sài Gòn Thương Tín", "Sacombank"],
    "TCB": ["TCB", "Techcombank", "Ngân hàng TMCP Kỹ Thương Việt Nam"],
    "TPB": ["TPB", "TPBank", "Ngân hàng Tiên Phong", "Ngân hàng TMCP Tiên Phong"],
    "VCB": ["VCB", "Vietcombank", "Ngân hàng TMCP Ngoại Thương Việt Nam", "Ngân hàng Ngoại thương"],
    "VHM": ["VHM", "Vinhomes"],
    "VIB": ["VIB", "Ngân hàng TMCP Quốc Tế Việt Nam", "Ngân hàng Quốc Tế"],
    "VIC": ["VIC", "Vingroup", "Công ty Cổ phần Tập đoàn Vingroup"],
    "VJC": ["VJC", "Vietjet Air", "Công ty Cổ phần Hàng không Vietjet", "máy bay Vietjet"],
    "VNM": ["VNM", "Vinamilk", "Công ty Cổ phần Sữa Việt Nam"],
    "VPB": ["VPB", "VPBank", "Ngân hàng TMCP Việt Nam Thịnh Vượng"],
    "VRE": ["VRE", "Vincom Retail", "Công ty Cổ phần Vincom Retail"]
}

In [51]:
class CafefScraper:
    BASE_URL = "https://cafef.vn"
    
    def __init__(self, zone_id, category_url, category_name=""):
        self.zone_id = zone_id
        self.category_url = category_url
        self.category_name = category_name
        self.headers = {
            'Accept': '*/*',
            'Referer': category_url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.articles = []
    
    def parse_html(self, html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        articles = []
        
        article_elements = soup.select('.firstitem, .cate-hl-row2 .big, .tlitem.box-category-item')
        
        for article in article_elements:
            try:
                title_element = article.find(['h2', 'h3']).find('a')
                if not title_element:
                    continue
                
                title = title_element.get('title', title_element.text).strip()
                relative_link = title_element['href']
                link = self.BASE_URL + relative_link if relative_link.startswith('/') else relative_link
                
                summary_element = article.find('p', class_='sapo')
                summary = summary_element.text.strip() if summary_element else "N/A"
                
                articles.append({
                    'Title': title,
                    'Link': link,
                    'Summary': summary,
                })
            except Exception:
                continue
        
        return articles
    
    def scrape_initial_page(self):
        print(f"Loading initial page: {self.category_url}")
        try:
            response = requests.get(self.category_url, headers=self.headers, timeout=15)
            response.raise_for_status()
            initial_articles = self.parse_html(response.text)
            self.articles.extend(initial_articles)
            print(f"Found {len(initial_articles)} articles on initial page")
            return True
        except requests.RequestException as e:
            print(f"Error loading initial page: {e}")
            return False
    
    def scrape_api_pages(self, max_pages):
        for page_num in range(1, max_pages + 1):
            api_url = f"{self.BASE_URL}/timelinelist/{self.zone_id}/{page_num}.chn"
            print(f"Loading API page {page_num}...")
            
            try:
                response = requests.get(api_url, headers=self.headers, timeout=10)
                response.raise_for_status()
                
                if not response.text.strip():
                    print("No more articles available")
                    break
                
                page_articles = self.parse_html(response.text)
                
                if not page_articles:
                    print("Last page reached")
                    break
                
                self.articles.extend(page_articles)
                print(f"Loaded {len(page_articles)} articles from page {page_num}")
                
                time.sleep(1)
                
            except requests.RequestException as e:
                print(f"Error loading API page {page_num}: {e}")
                break
    
    def scrape(self, max_pages=5):
        if not self.scrape_initial_page():
            return []
        
        self.scrape_api_pages(max_pages)
        return self.articles
    
    def to_dataframe(self):
        df = pd.DataFrame(self.articles)
        df.drop_duplicates(subset=['Link'], inplace=True, keep='first')
        return df

In [52]:
# Configuration
MAX_PAGES = 20

# Check if we have categories to scrape
if not filtered_cats:
    print("ERROR: No categories available to scrape!")
else:
    # Store all articles from all categories
    all_category_articles = {}
    
    total_categories = len(filtered_cats)
    print(f"\n{'='*60}")
    print(f"Starting to scrape {total_categories} categories")
    print(f"{'='*60}\n")

    # Scrape each category
    cnt = 0
    for idx, cat in enumerate(filtered_cats, 1):
        print(f"\n{'='*60}")
        print(f"[{idx}/{total_categories}] Processing category: {cat['title']}")
        print(f"URL: {cat['url']}")
        print(f"{'='*60}")
        
        # Use hardcoded zone_id
        zone_id = DEFAULT_ZONE_ID
        print(f"Zone ID: {zone_id}")
        
        # Create scraper instance
        scraper = CafefScraper(zone_id, cat['url'], cat['title'])
        scraper.scrape(max_pages=MAX_PAGES)
        
        # Convert to dataframe
        df = scraper.to_dataframe()
        
        print(f"Total unique articles found: {len(df)}")
        
        # Store articles
        all_category_articles[cat['title']] = df
        
        # Save individual category CSV
        csv_filename = f"cafef_{cat['title'].lower().replace(' ', '_')}_articles.csv"
        try:
            df.to_csv(csv_filename, index=False, encoding='utf-8-sig')
            print(f"✓ Data saved to '{csv_filename}'")
        except Exception as e:
            print(f"✗ Error saving CSV file: {e}")
        
        print(f"✓ Category '{cat['title']}' completed ({idx}/{total_categories})")
    
        # Delay between categories
        time.sleep(2)
        
        if LIMIT_CATEGORY:
            cnt += 1
            if cnt >= NUM_CATEGORIES:
                break

    print(f"\n{'='*60}")
    print(f"Scraping completed for all categories")
    print(f"Total categories processed: {len(all_category_articles)}")

    # Combine all articles and save
    all_articles_combined = pd.concat(all_category_articles.values(), ignore_index=True)
    all_articles_combined.drop_duplicates(subset=['Link'], inplace=True)

    combined_filename = "cafef_all_categories_articles.csv"
    all_articles_combined.to_csv(combined_filename, index=False, encoding='utf-8-sig')
    print(f"Combined data saved to '{combined_filename}'")
    print(f"Total unique articles across all categories: {len(all_articles_combined)}")


Starting to scrape 26 categories


[1/26] Processing category: XÃ HỘI
URL: https://cafef.vn/xa-hoi.chn
Zone ID: 18832
Loading initial page: https://cafef.vn/xa-hoi.chn
Found 23 articles on initial page
Loading API page 1...
Found 23 articles on initial page
Loading API page 1...
Loaded 15 articles from page 1
Loaded 15 articles from page 1
Loading API page 2...
Loaded 15 articles from page 2
Loading API page 2...
Loaded 15 articles from page 2
Loading API page 3...
Loaded 15 articles from page 3
Loading API page 3...
Loaded 15 articles from page 3
Loading API page 4...
Loaded 15 articles from page 4
Loading API page 4...
Loaded 15 articles from page 4
Loading API page 5...
Loaded 15 articles from page 5
Loading API page 5...
Loaded 15 articles from page 5
Loading API page 6...
Loaded 15 articles from page 6
Loading API page 6...
Loaded 15 articles from page 6
Loading API page 7...
Loaded 15 articles from page 7
Loading API page 7...
Loaded 15 articles from page 7
Loading API page 8...

In [53]:
def filter_articles_by_keywords(df, keywords_map=KEYWORDS_MAP):
    """
    Filter articles that contain VN30 stock-related keywords in title or summary.
    Returns a new dataframe with an additional 'codes' column containing matched stock codes.
    """
    filtered_rows = []
    
    for idx, row in df.iterrows():
        # Combine title and summary for keyword matching
        text = (str(row['Title']) + " " + str(row.get('Summary', ''))).lower()
        matched_codes = []
        
        # Check each stock code's keywords
        for code, kws in keywords_map.items():
            for kw in kws:
                if kw.lower() in text:
                    matched_codes.append(code)
                    break
        
        # Only keep articles that matched at least one stock code
        if matched_codes:
            row_copy = row.copy()
            row_copy['Codes'] = ', '.join(matched_codes)
            filtered_rows.append(row_copy)
    
    if filtered_rows:
        return pd.DataFrame(filtered_rows)
    else:
        return pd.DataFrame(columns=list(df.columns) + ['Codes'])


# Filter all collected articles by VN30 keywords
print(f"\n{'='*60}")
print("Filtering articles by VN30 keywords...")
print(f"{'='*60}\n")

# Combine all articles from all categories
if all_category_articles:
    all_articles_combined = pd.concat(all_category_articles.values(), ignore_index=True)
    all_articles_combined.drop_duplicates(subset=['Link'], inplace=True)
    
    print(f"Total articles before filtering: {len(all_articles_combined)}")
    
    # Filter by keywords
    filtered_articles = filter_articles_by_keywords(all_articles_combined, keywords_map=KEYWORDS_MAP)
    
    print(f"Articles matching VN30 keywords: {len(filtered_articles)}")
    
    # Save filtered articles
    filtered_filename = "cafef_vn30_filtered_articles.csv"
    filtered_articles.to_csv(filtered_filename, index=False, encoding='utf-8-sig')
    print(f"✓ Filtered articles saved to '{filtered_filename}'")
    
    # Also save as JSON for compatibility
    filtered_json = filtered_articles.to_dict('records')
    with open("cafef_vn30_filtered_articles.json", "w", encoding="utf-8") as f:
        json.dump(filtered_json, f, ensure_ascii=False, indent=2)
    print(f"✓ Filtered articles saved to 'cafef_vn30_filtered_articles.json'")
    
    # Display sample of filtered articles
    if len(filtered_articles) > 0:
        print(f"\nSample of filtered articles (showing first 5):")
        for idx, row in filtered_articles.head(5).iterrows():
            print(f"\n- Title: {row['Title']}")
            print(f"  Codes: {row['Codes']}")
            print(f"  Link: {row['Link']}")
else:
    print("No articles collected to filter!")


Filtering articles by VN30 keywords...

Total articles before filtering: 553
Articles matching VN30 keywords: 29
✓ Filtered articles saved to 'cafef_vn30_filtered_articles.csv'
✓ Filtered articles saved to 'cafef_vn30_filtered_articles.json'

Sample of filtered articles (showing first 5):

- Title: Thủ đoạn của 'trùm buôn người' đội lốt thị trưởng ở Philippines
  Codes: MBB
  Link: https://cafef.vn/thu-doan-cua-trum-buon-nguoi-doi-lot-thi-truong-o-philippines-18825112204384971.chn

- Title: Nga phát triển thành công thuốc điều trị ung thư Pembroria như thế nào?
  Codes: MBB
  Link: https://cafef.vn/nga-phat-trien-thanh-cong-thuoc-dieu-tri-ung-thu-pembroria-nhu-the-nao-188251117101819073.chn

- Title: Ukraine đánh 'cảng dầu số 2 của Nga', khói trùm Novorossiysk — Moscow đáp trả bằng 430 UAV và 18 tên lửa
  Codes: SSI
  Link: https://cafef.vn/ukraine-danh-cang-dau-so-2-cua-nga-khoi-trum-novorossiysk-moscow-dap-tra-bang-430-uav-va-18-ten-lua-188251115181826018.chn

- Title: UBCKNN “bật đ