In [56]:
!pwd

/Volumes/MTS800/tmp/hust/study_at_hust/semesters/2025_1/DS/hello_ngoc_linh/cafef


In [57]:
!pip install beautifulsoup4 requests pandas numpy openpyxl openai python-dotenv selenium webdriver-manager httpx pydantic tqdm



In [58]:
# Import required libraries
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import json
import os
import re
from datetime import datetime
from openai import OpenAI
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

True

In [59]:
LIMIT_CATEGORY = True
NUM_CATEGORIES = 5
MAX_PAGES_PER_CATEGORY = 5

# Date filtering option
FILTER_BY_DATE = False  # Set to False to disable date filtering

# Date range filter (inclusive)
START_DATE = datetime(2020, 1, 1)
END_DATE = datetime(2024, 12, 31)

# Base URL for cafef website
BASE_URL = "https://cafef.vn"

In [60]:
# Check if API key is loaded
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    print("WARNING: OPENAI_API_KEY not found in environment variables!")
    print("Please create a .env file with: OPENAI_API_KEY=your-api-key")
    print("Or set it manually below:")
    # Uncomment and add your key here if .env doesn't work
    # api_key = "your-api-key-here"
else:
    print("API key loaded successfully")

def get_categories():
    # Fetch the homepage HTML
    html = requests.get(BASE_URL, timeout=10).text
    soup = BeautifulSoup(html, "html.parser")

    categories = []

    # Find the menu category div
    menu_div = soup.find("div", class_="menucategory")
    if menu_div:
        # Find all anchor tags within the menu
        for a in menu_div.find_all("a", href=True):
            href = a.get("href", "")
            title = a.get("title", a.text).strip()
            
            # Skip empty titles and home page
            if not title or href == "/":
                continue
            
            # Only keep links ending with .chn
            if href.endswith(".chn"):
                # Normalize the URL to absolute path
                if href.startswith("/"):
                    full_url = BASE_URL.rstrip("/") + href
                else:
                    full_url = href

                categories.append({
                    "title": title,
                    "url": full_url
                })

    return categories


def filter_category_pages(categories):
    # Initialize OpenAI client with API key from environment variable
    if not api_key:
        print("Skipping GPT filtering - using all categories")
        return categories
    
    client = OpenAI(api_key=api_key)

    # Format categories as text for GPT
    urls_text = "\n".join([f"- {c['title']}: {c['url']}" for c in categories])

    # Call GPT to filter out article pages and keep only main category pages
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                "content": "You are an assistant that filters a list of URLs. "
                           "Keep only main category pages and remove specific news articles or subcategories. "
                           "Return only a JSON array of objects with 'title' and 'url'."
            },
            {
                "role": "user",
                "content": f"Here is the list:\n{urls_text}\n\nFilter them and return JSON array."
            }
        ],
        temperature=0
    )

    # Parse the JSON response
    filtered_json = response.choices[0].message.content
    try:
        filtered_list = json.loads(filtered_json)
    except json.JSONDecodeError:
        print("Failed to parse JSON. Here's raw output:")
        print(filtered_json)
        filtered_list = []

    return filtered_list


# Get all categories from the homepage
cats = get_categories()

# Check if categories were found
if not cats:
    print("ERROR: No categories found!")
else:
    print(f"Found {len(cats)} categories")

# Filter to keep only main category pages
filtered_cats = filter_category_pages(cats)

# Check if filtering returned results
if not filtered_cats:
    print("WARNING: No filtered categories returned. Using all categories instead.")
    filtered_cats = cats

# Display filtered categories
print(f"\nFiltered Categories ({len(filtered_cats)}):")
for c in filtered_cats:
    print(f"- {c['title']}: {c['url']}")

API key loaded successfully
Found 26 categories
Found 26 categories
Failed to parse JSON. Here's raw output:
```json
[
    {
        "title": "XÃ HỘI",
        "url": "https://cafef.vn/xa-hoi.chn"
    },
    {
        "title": "CHỨNG KHOÁN",
        "url": "https://cafef.vn/thi-truong-chung-khoan.chn"
    },
    {
        "title": "BẤT ĐỘNG SẢN",
        "url": "https://cafef.vn/bat-dong-san.chn"
    },
    {
        "title": "DOANH NGHIỆP",
        "url": "https://cafef.vn/doanh-nghiep.chn"
    },
    {
        "title": "NGÂN HÀNG",
        "url": "https://cafef.vn/tai-chinh-ngan-hang.chn"
    },
    {
        "title": "Smart Money",
        "url": "https://cafef.vn/smart-money.chn"
    },
    {
        "title": "TÀI CHÍNH QUỐC TẾ",
        "url": "https://cafef.vn/tai-chinh-quoc-te.chn"
    },
    {
        "title": "VĨ MÔ",
        "url": "https://cafef.vn/vi-mo-dau-tu.chn"
    },
    {
        "title": "KINH TẾ SỐ",
        "url": "https://cafef.vn/kinh-te-so.chn"
    },
    {
    

In [61]:
# Hardcoded zone_id for all categories
DEFAULT_ZONE_ID = 18832

# Mapping of VN30 stock codes to their related keywords
KEYWORDS_MAP = {
    "ACB": ["ACB", "Ngân hàng ACB", "Ngân hàng TMCP Á Châu"],
    "BCM": ["BCM", "Becamex", "KCN Bình Dương", "khu công nghiệp Bình Dương", "VSIP", "Becamex IDC"],
    "BID": ["BIDV", "Ngân hàng Đầu tư và Phát triển Việt Nam"],
    "CTG": ["CTG", "VietinBank", "Ngân hàng Công Thương Việt Nam"],
    "DGC": ["DGC", "Hóa chất Đức Giang"],
    "FPT": ["FPT"],
    "GAS": ["PV GAS", "PV Gas", "Tổng Công ty Khí Việt Nam"],
    "GVR": ["GVR", "Tập đoàn Cao su", "Tập đoàn Công nghiệp Cao su Việt Nam"],
    "HDB": ["HDB", "HDBank", "Ngân hàng TMCP Phát triển Thành phố Hồ Chí Minh"],
    "HPG": ["HPG", "Hòa Phát"],
    "LPB": ["LPB", "LPBank", "LienVietPostBank", "Ngân hàng Bưu điện Liên Việt"],
    "MBB": ["MBB", "MBBank", "Ngân hàng Quân đội", "MB", "Ngân hàng TMCP Quân đội"],
    "MSN": ["MSN", "Masan", "WinCommerce"],
    "MWG": ["MWG", "Thế Giới Di Động", "Mobile World", "Bách Hóa Xanh", "BHX", "Điện Máy Xanh", "ĐMX", "TGDĐ"],
    "PLX": ["PLX", "Petrolimex", "Tập đoàn Xăng dầu Việt Nam"],
    "SAB": ["SAB", "Sabeco", "Tổng Công ty CP Bia - Rượu - Nước giải khát Sài Gòn"],
    "SHB": ["SHB", "Ngân hàng Thương mại Cổ phần Sài Gòn – Hà Nội", "Ngân hàng TMCP Sài Gòn Hà Nội"],
    "SSB": ["SSB", "Ngân hàng Thương mại Cổ phần Đông Nam Á", "Ngân hàng TMCP Đông Nam Á", "SeABank"],
    "SSI": ["SSI", "Chứng khoán SSI"],
    "STB": ["STB", "Sài Gòn Thương Tín", "Sacombank"],
    "TCB": ["TCB", "Techcombank", "Ngân hàng TMCP Kỹ Thương Việt Nam"],
    "TPB": ["TPB", "TPBank", "Ngân hàng Tiên Phong", "Ngân hàng TMCP Tiên Phong"],
    "VCB": ["VCB", "Vietcombank", "Ngân hàng TMCP Ngoại Thương Việt Nam", "Ngân hàng Ngoại thương"],
    "VHM": ["VHM", "Vinhomes"],
    "VIB": ["VIB", "Ngân hàng TMCP Quốc Tế Việt Nam", "Ngân hàng Quốc Tế"],
    "VIC": ["VIC", "Vingroup", "Công ty Cổ phần Tập đoàn Vingroup"],
    "VJC": ["VJC", "Vietjet Air", "Công ty Cổ phần Hàng không Vietjet", "máy bay Vietjet"],
    "VNM": ["VNM", "Vinamilk", "Công ty Cổ phần Sữa Việt Nam"],
    "VPB": ["VPB", "VPBank", "Ngân hàng TMCP Việt Nam Thịnh Vượng"],
    "VRE": ["VRE", "Vincom Retail", "Công ty Cổ phần Vincom Retail"]
}

In [62]:
class CafefScraper:
    BASE_URL = "https://cafef.vn"
    
    def __init__(self, zone_id, category_url, category_name=""):
        self.zone_id = zone_id
        self.category_url = category_url
        self.category_name = category_name
        self.headers = {
            'Accept': '*/*',
            'Referer': category_url,
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'X-Requested-With': 'XMLHttpRequest',
        }
        self.articles = []
    
    def parse_html(self, html_content):
        soup = BeautifulSoup(html_content, 'html.parser')
        articles = []
        
        article_elements = soup.select('.firstitem, .cate-hl-row2 .big, .tlitem.box-category-item')
        
        for article in article_elements:
            try:
                title_element = article.find(['h2', 'h3']).find('a')
                if not title_element:
                    continue
                
                title = title_element.get('title', title_element.text).strip()
                relative_link = title_element['href']
                link = self.BASE_URL + relative_link if relative_link.startswith('/') else relative_link
                
                summary_element = article.find('p', class_='sapo')
                summary = summary_element.text.strip() if summary_element else "N/A"
                
                articles.append({
                    'Title': title,
                    'Link': link,
                    'Summary': summary,
                })
            except Exception:
                continue
        
        return articles
    
    def scrape_initial_page(self):
        print(f"Loading initial page: {self.category_url}")
        try:
            response = requests.get(self.category_url, headers=self.headers, timeout=15)
            response.raise_for_status()
            initial_articles = self.parse_html(response.text)
            self.articles.extend(initial_articles)
            print(f"Found {len(initial_articles)} articles on initial page")
            return True
        except requests.RequestException as e:
            print(f"Error loading initial page: {e}")
            return False
    
    def scrape_api_pages(self, MAX_PAGES_PER_CATEGORY):
        for page_num in range(1, MAX_PAGES_PER_CATEGORY + 1):
            api_url = f"{self.BASE_URL}/timelinelist/{self.zone_id}/{page_num}.chn"
            print(f"Loading API page {page_num}...")
            
            try:
                response = requests.get(api_url, headers=self.headers, timeout=10)
                response.raise_for_status()
                
                if not response.text.strip():
                    print("  ⚠ No more articles available. Reached end of pages.")
                    break
                
                page_articles = self.parse_html(response.text)
                
                if not page_articles:
                    print("  ⚠ Last page reached (no articles found).")
                    break
                
                self.articles.extend(page_articles)
                print(f"Loaded {len(page_articles)} articles from page {page_num}")
                
                time.sleep(1)
                
            except requests.RequestException as e:
                print(f"Error loading API page {page_num}: {e}")
                break
    
    def scrape(self, max_pages=5):
        if not self.scrape_initial_page():
            return []
        
        self.scrape_api_pages(MAX_PAGES_PER_CATEGORY)
        return self.articles
    
    def to_dataframe(self):
        df = pd.DataFrame(self.articles)
        df.drop_duplicates(subset=['Link'], inplace=True, keep='first')
        return df

In [63]:
# Check if we have categories to scrape
if not filtered_cats:
    print("ERROR: No categories available to scrape!")
else:
    # Store all articles from all categories
    all_articles_list = []
    
    total_categories = len(filtered_cats)
    print(f"\n{'='*60}")
    print(f"Starting to scrape {total_categories} categories")
    print(f"{'='*60}\n")

    # Scrape each category
    cnt = 0
    for idx, cat in enumerate(filtered_cats, 1):
        print(f"\n{'='*60}")
        print(f"[{idx}/{total_categories}] Processing category: {cat['title']}")
        print(f"URL: {cat['url']}")
        print(f"{'='*60}")
        
        # Use hardcoded zone_id
        zone_id = DEFAULT_ZONE_ID
        print(f"Zone ID: {zone_id}")
        
        # Create scraper instance
        scraper = CafefScraper(zone_id, cat['url'], cat['title'])
        scraper.scrape(max_pages=MAX_PAGES_PER_CATEGORY)
        
        # Convert to dataframe
        df = scraper.to_dataframe()
        
        print(f"Total unique articles found: {len(df)}")
        
        # Add articles to list
        all_articles_list.extend(df.to_dict('records'))
        
        print(f"✓ Category '{cat['title']}' completed ({idx}/{total_categories})")
    
        # Delay between categories
        time.sleep(2)
        
        if LIMIT_CATEGORY:
            cnt += 1
            if cnt >= NUM_CATEGORIES:
                break

    print(f"\n{'='*60}")
    print(f"Scraping completed for all categories")
    print(f"Total articles collected: {len(all_articles_list)}")


Starting to scrape 26 categories


[1/26] Processing category: XÃ HỘI
URL: https://cafef.vn/xa-hoi.chn
Zone ID: 18832
Loading initial page: https://cafef.vn/xa-hoi.chn
Found 24 articles on initial page
Loading API page 1...
Loaded 15 articles from page 1
Found 24 articles on initial page
Loading API page 1...
Loaded 15 articles from page 1
Loading API page 2...
Loaded 15 articles from page 2
Loading API page 2...
Loaded 15 articles from page 2
Loading API page 3...
Loaded 15 articles from page 3
Loading API page 3...
Loaded 15 articles from page 3
Loading API page 4...
Loaded 15 articles from page 4
Loading API page 4...
Loaded 15 articles from page 4
Loading API page 5...
Loaded 15 articles from page 5
Loading API page 5...
Loaded 15 articles from page 5
Total unique articles found: 84
✓ Category 'XÃ HỘI' completed (1/26)
Total unique articles found: 84
✓ Category 'XÃ HỘI' completed (1/26)

[2/26] Processing category: CHỨNG KHOÁN
URL: https://cafef.vn/thi-truong-chung-khoan.chn
Zone 

In [64]:
def extract_publish_date(article_url):
    """
    Extract publication date from article page's meta tag.
    Returns date in format YYYY-MM-DD or None if not found.
    """
    try:
        # Use proper browser headers to avoid 403 errors
        date_headers = {
            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Accept-Encoding': 'gzip, deflate, br',
            'Referer': 'https://cafef.vn/',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1'
        }
        
        response = requests.get(article_url, headers=date_headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Find meta tag with article:published_time
        meta_tag = soup.find('meta', property='article:published_time')
        if meta_tag and meta_tag.get('content'):
            # Extract date part (YYYY-MM-DD) from datetime string
            datetime_str = meta_tag['content']
            date_part = datetime_str.split('T')[0]
            return date_part
        
        return None
    except Exception as e:
        print(f"  ✗ Error extracting date: {e}")
        return None


def is_date_in_range(date_str, start_date=START_DATE, end_date=END_DATE):
    """
    Check if date string is within the specified range.
    """
    if not date_str or date_str == 'N/A':
        return False
    try:
        article_date = datetime.strptime(date_str, '%Y-%m-%d')
        return start_date <= article_date <= end_date
    except:
        return False


def filter_and_group_by_codes(articles_list, keywords_map=KEYWORDS_MAP):
    """
    Filter articles by VN30 keywords and group by stock codes.
    Uses word boundaries to match full keywords only.
    Returns a list of dictionaries with stock code and associated articles.
    """
    # Dictionary to store articles for each code (using link as key to avoid duplicates)
    code_articles = {code: {} for code in keywords_map.keys()}
    
    for article in articles_list:
        # Combine title and summary for keyword matching
        text = (str(article.get('Title', '')) + " " + str(article.get('Summary', ''))).lower()
        matched_codes = []
        
        # Check each stock code's keywords
        for code, kws in keywords_map.items():
            for kw in kws:
                # Use word boundary regex to match full words only
                # \b ensures we match complete words, not substrings
                pattern = r'\b' + re.escape(kw.lower()) + r'\b'
                if re.search(pattern, text):
                    matched_codes.append(code)
                    break
        
        # Add article to each matched code's dictionary (using link as key to prevent duplicates)
        article_link = article.get('Link', '')
        for code in matched_codes:
            if article_link and article_link not in code_articles[code]:
                code_articles[code][article_link] = {
                    'Stock_Code': code,
                    'Title': article.get('Title', ''),
                    'Link': article_link,
                    'Summary': article.get('Summary', 'N/A')
                }
    
    # Flatten into single list, sorted by stock code
    result = []
    for code in sorted(code_articles.keys()):
        result.extend(code_articles[code].values())
    
    return result, {code: list(articles.values()) for code, articles in code_articles.items()}


# Filter and group articles by VN30 keywords
print(f"\n{'='*60}")
print("Filtering and grouping articles by VN30 stock codes...")
print(f"{'='*60}\n")

if all_articles_list:
    print(f"Total articles before filtering: {len(all_articles_list)}")
    
    # Filter and group by codes
    grouped_articles, code_dict = filter_and_group_by_codes(all_articles_list, keywords_map=KEYWORDS_MAP)
    
    print(f"Articles matching VN30 keywords: {len(grouped_articles)}")
    
    # Extract publication dates for filtered articles
    if grouped_articles:
        print(f"\n{'='*60}")
        print("Extracting publication dates from article pages...")
        print(f"{'='*60}\n")
        
        for idx, article in enumerate(grouped_articles, 1):
            print(f"[{idx}/{len(grouped_articles)}] Extracting date for: {article['Title'][:50]}...")
            pub_date = extract_publish_date(article['Link'])
            article['Date'] = pub_date if pub_date else 'N/A'
            time.sleep(0.5)  # Delay between requests
        
        print("\n✓ Date extraction completed")
        
        # Filter by date range
        if FILTER_BY_DATE:
            print(f"\n{'='*60}")
            print(f"Filtering articles by date range: {START_DATE.strftime('%Y-%m-%d')} to {END_DATE.strftime('%Y-%m-%d')}")
            print(f"{'='*60}\n")
            
            articles_before_date_filter = len(grouped_articles)
            grouped_articles = [art for art in grouped_articles if is_date_in_range(art['Date'])]
            
            print(f"Articles before date filter: {articles_before_date_filter}")
            print(f"Articles after date filter: {len(grouped_articles)}")
        else:
            print(f"\n⚠ Date filtering is disabled (FILTER_BY_DATE=False)")
    
    # Save to single CSV file
    output_filename = "titles_vn30_cafef.csv"
    if grouped_articles:
        df_output = pd.DataFrame(grouped_articles)
        # Reorder columns to have Date after Stock_Code
        cols = ['Stock_Code', 'Date', 'Title', 'Link', 'Summary']
        df_output = df_output[cols]
        df_output.to_csv(output_filename, index=False, encoding='utf-8-sig')
        print(f"✓ Saved to '{output_filename}'")
        
        # Recalculate statistics from date-filtered articles
        code_stats_filtered = {}
        for article in grouped_articles:
            code = article['Stock_Code']
            code_stats_filtered[code] = code_stats_filtered.get(code, 0) + 1
        
        # Print statistics by stock code
        print(f"\n{'='*60}")
        print("Statistics by stock code (after date filtering):")
        print(f"{'='*60}")
        for code in sorted(code_stats_filtered.keys()):
            count = code_stats_filtered[code]
            if count > 0:
                print(f"{code}: {count} articles")
        
        # Display sample
        print(f"\n{'='*60}")
        print("Sample articles (first 5):")
        print(f"{'='*60}")
        for idx, row in df_output.head(5).iterrows():
            print(f"\n[{row['Stock_Code']}] {row['Date']} - {row['Title']}")
            print(f"Link: {row['Link']}")
    else:
        print("No articles matched VN30 keywords or date range!")
else:
    print("No articles collected to filter!")


Filtering and grouping articles by VN30 stock codes...

Total articles before filtering: 423
Articles matching VN30 keywords: 19

Extracting publication dates from article pages...

[1/19] Extracting date for: Lãi suất tiết kiệm Agribank, BIDV, VietinBank, Vie...
Articles matching VN30 keywords: 19

Extracting publication dates from article pages...

[1/19] Extracting date for: Lãi suất tiết kiệm Agribank, BIDV, VietinBank, Vie...
[2/19] Extracting date for: Động thái lạ của các “ông lớn” ngân hàng giữa làn ...
[2/19] Extracting date for: Động thái lạ của các “ông lớn” ngân hàng giữa làn ...
[3/19] Extracting date for: Lãi suất tiết kiệm Agribank, BIDV, VietinBank, Vie...
[3/19] Extracting date for: Lãi suất tiết kiệm Agribank, BIDV, VietinBank, Vie...
[4/19] Extracting date for: Tỷ phú Trần Đình Long và dàn lãnh đạo Hòa Phát đem...
[4/19] Extracting date for: Tỷ phú Trần Đình Long và dàn lãnh đạo Hòa Phát đem...
[5/19] Extracting date for: UBCKNN “bật đèn xanh”, SSI sắp tăng vốn điều