In [218]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import re
from tqdm import tqdm

In [None]:
def get_featured_url(category, max_page):
    featured_urls = []
    for page in range(1, max_page+1):
        vnecon = f"https://vneconomy.vn/{category}.htm?trang={page}"
        html = BeautifulSoup(
            requests.get(vnecon).content, 'html.parser'
        )
        featured = html.find("div", {'class':'col-12 col-lg-9 column-border'})
        heading3 = featured.find_all("h3", {'class':"story__title"})
        featured_urls.extend(['https://vneconomy.vn' + a.find('a').get('href') for a in heading3])
        return featured_urls

In [220]:
def get_article(url):
    html = BeautifulSoup(requests.get(url).content, 'html.parser')
    date = datetime.strptime(
        html.find('div', {'class':'detail__meta'}).get_text(), 
        '%H:%M %d/%m/%Y'
    ).timestamp()
    title = html.find('h1', {'class':'detail__title'}).get_text().strip()
    desc = html.find('h2', {'detail__summary'}).get_text().strip()
    categ = html.find('h1', {'class':'category-main'}).find('a').get_text().strip()
    body = html.find("div", {"class":'detail__content'})
    body_text = [text.get_text().strip() for text in body.find_all("p")]
    body_text = " ".join(body_text)
    return {
        'date': date,
        'title': title,
        'desc': desc,
        'body': body_text,
        'categ': categ
    }

In [221]:
# Get highlight section
vnecon = "https://vneconomy.vn/dau-tu.htm?trang=1"
html = BeautifulSoup(
    requests.get(vnecon).content, 'html.parser'
)
highlight = html.find("section", {'class':'zone zone--highlight'})
story_highlight = highlight.find('div', {'class':'row'}).find_all('h3', {'class':'story__title'})
story_highlight_url = ['https://vneconomy.vn' + a.find('a').get('href') for a in story_highlight]

In [222]:
vne = pd.DataFrame(
    {
        "muc": ["dau-tu", 'tai-chinh', 'kinh-te-the-gioi', 'thi-truong', 'nhip-cau-doanh-nghiep', 'dia-oc'],
        'max_page': [10, 10, 9, 10, 12, 13]
    }
)
vne = vne.set_index("muc")

In [223]:
featured_urls = []
for category in list(vne.index):
    featured_urls.extend(get_featured_url(category, vne.loc[category, "max_page"]))

In [224]:
all_urls = [*story_highlight_url, *featured_urls]

In [225]:
len(all_urls)

41

In [None]:
dates = []
urls = []
titles = []
descs = []
categs = []
bodies = []
for url in tqdm(all_urls):
    try:
        info = get_article(url)
        if info['date'] < < datetime.strptime('30/10/2024 00:00', '%d/%m/%Y %H:%M').timestamp():
            next
        dates.append(info["date"])
        urls.append(url)
        titles.append(info["title"])
        descs.append(info["desc"])
        bodies.append(info["body"])
        categs.append(info["categ"])
    except Exception as error:
        print("Error at:", url, ": ", error)

dict = {
    'date': dates,
    # 'source' : 'cafef',
    'url' : urls,
    'title' : titles,
    'description' : descs,
    'category' : categs,
    'body' : bodies
}
data = pd.DataFrame(dict)
data["source"] = "VNEconomy"

 78%|███████▊  | 32/41 [00:06<00:01,  5.11it/s]

Error at: https://vneconomy.vn/nganh-cong-nghiep-cua-ha-tinh-gap-kho.htm :  'NoneType' object has no attribute 'find'


100%|██████████| 41/41 [00:08<00:00,  4.94it/s]


In [228]:
data.to_csv("vneconomy_data.csv")