In [1]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
from datetime import datetime
import re
from tqdm import tqdm

In [2]:
cff = pd.DataFrame(
    {
        "muc": ["chung_khoan", "bds", 'doanh_nghiep', 'ngan_hang', 'vi_mo', "tcqt", 'thi_truong'],
        'id': [18831, 18832, 18833, 18834, 18835, 18836, 18839],
        'max_page': [10, 16, 13, 9, 10, 16, 15]
    }
)
cff = cff.set_index("muc")

In [3]:
def get_article(url):
    url_test = url
    html_test = BeautifulSoup(requests.get(url_test).content, 'html.parser')

    dateandcat = html_test.find("p", {"class":"dateandcat"})
    title = html_test.find("h1", {"data-role":"title"}).get_text().strip()
    date = datetime.strptime(
        dateandcat.find("span", {"class":"pdate"}).get_text()[:-4],
        '%d-%m-%Y - %H:%M'
    ).timestamp()
    description = html_test.find("h2", {"class": "sapo"}).get_text().strip()
    categ = dateandcat.find("a", {"data-role":"cate-name"}).get_text()

    body = [text.get_text().strip() for text in html_test.find("div", {"id":"mainContent"}).find_all("p")]
    body = ' '.join(body).replace("\n", "")
    return {"date": date, "title":title, "des": description, "categ": categ, "body": body}

In [None]:
def crawl_cafef(categ_id, max_page):
    article_urls = []
    for page in range(1, max_page + 1):
        url = f'https://cafef.vn/timelinelist/{categ_id}/{page}.chn'
        req = requests.get(url).content
        html = BeautifulSoup(req, "html.parser")
        article_urls.extend(['https://cafef.vn' + h3.find("a").get("href") for h3 in html.find_all("h3")])
    urls = []
    dates = []
    titles = []
    descs = []
    categs = []
    bodies = []
    # i = 0
    for url in tqdm(article_urls):
        # print(len(article_urls) - i , "articles remaining")
        try:
            data = get_article(url)
            if data["date"] < datetime.strptime('30/10/2024 00:00', '%d/%m/%Y %H:%M').timestamp():
                next
            else:
                urls.append(url)
                dates.append(data["date"])
                titles.append(data["title"])
                descs.append(data["des"])
                categs.append(data["categ"])
                bodies.append(data["body"])
                # i += 1
        except Exception as error:
            print(f'error at: ', url, error)

    dict = {
        'date': dates,
        # 'source' : 'cafef',
        'url' : urls,
        'title' : titles,
        'description' : descs,
        'category' : categs,
        'body' : bodies
    }
    data = pd.DataFrame(dict)
    data["source"] = "CafeF"
    return data

In [5]:
chung_khoan = crawl_cafef(
    cff.loc["chung_khoan", "id"],
    cff.loc["chung_khoan", "max_page"]
)
chung_khoan.to_csv("cafef_chung_khoan.csv", index= False)

bds = crawl_cafef(
    cff.loc["bds", "id"],
    cff.loc["bds", "max_page"]
)
bds.to_csv("cafef_bds.csv", index= False)

doanh_nghiep = crawl_cafef(
    cff.loc["doanh_nghiep", "id"],
    cff.loc["doanh_nghiep", "max_page"]
)
doanh_nghiep.to_csv('cafef_dn.csv', index= False)

ngan_hang = crawl_cafef(
    cff.loc["ngan_hang", "id"],
    cff.loc["ngan_hang", "max_page"]
)
ngan_hang.to_csv("cafef_ngan_hang.csv", index= False)

vi_mo = crawl_cafef(
    cff.loc["vi_mo", "id"],
    cff.loc["vi_mo", "max_page"]
)
vi_mo.to_csv("cafef_vi_mo.csv", index= False)

tcqt = crawl_cafef(
    cff.loc["tcqt", "id"],
    cff.loc["tcqt", "max_page"]
)
tcqt.to_csv("cafef_tcqt.csv", index= False)

thi_truong = crawl_cafef(
    cff.loc["thi_truong", "id"],
    cff.loc["thi_truong", "max_page"]
)
thi_truong.to_csv("cafef_thi_truong.csv", index= False)


100%|██████████| 150/150 [00:37<00:00,  4.02it/s]
 22%|██▏       | 53/240 [00:11<00:46,  4.04it/s]

error at:  https://cafef.vn/big-story/ket-qua-cuoc-dua-trump-harris-23-3-ong-trump-gianh-duoc-3-bang-ba-harris-thang-o-vermont-188241106074440362.chn 'NoneType' object has no attribute 'find'


 31%|███       | 74/240 [00:17<00:45,  3.65it/s]

error at:  https://cafef.vn/big-story/my-chinh-thuc-buoc-vao-ngay-bau-cu-da-co-ket-qua-cua-diem-bo-phieu-dau-tien-188241105144036214.chn 'NoneType' object has no attribute 'find'


 32%|███▎      | 78/240 [00:18<00:42,  3.84it/s]

error at:  https://cafef.vn/nhung-ti-phu-cong-nghe-am-tham-ung-ho-hai-ung-vien-tong-thong-my-18824110511563972.chn 'NoneType' object has no attribute 'get_text'


 39%|███▉      | 94/240 [00:22<00:37,  3.92it/s]

error at:  https://cafef.vn/tan-tong-thong-my-va-nhung-kich-ban-sau-bau-cu-188241105073319444.chn 'NoneType' object has no attribute 'get_text'


100%|██████████| 240/240 [01:00<00:00,  3.96it/s]
 86%|████████▌ | 168/195 [00:43<00:06,  3.86it/s]

Min date reached!



  8%|▊         | 11/135 [00:02<00:26,  4.70it/s]

error at:  https://cafef.vn/cu-chuyen-minh-giup-bidv-metlife-tang-cuong-niem-tin-cua-khach-hang-tren-con-duong-dong-hanh-xay-dung-mot-doi-dang-song-188241105190815865.chn 'NoneType' object has no attribute 'get_text'


 17%|█▋        | 23/135 [00:04<00:23,  4.79it/s]

error at:  https://cafef.vn/cu-chuyen-minh-giup-bidv-metlife-tang-cuong-niem-tin-cua-khach-hang-tren-con-duong-dong-hanh-xay-dung-mot-doi-dang-song-188241105190815865.chn 'NoneType' object has no attribute 'get_text'


 97%|█████████▋| 131/135 [00:43<00:01,  2.98it/s]

error at:  https://cafef.vn/vpbank-hanoi-international-marathon-2024-diem-cham-cam-xuc-va-sang-tao-188241029232559217.chn 'NoneType' object has no attribute 'get_text'
Min date reached!



100%|██████████| 150/150 [00:36<00:00,  4.10it/s]
  3%|▎         | 7/240 [00:01<00:47,  4.87it/s]

error at:  https://cafef.vn/thuong-hieu-o-to-100-nam-tuoi-va-cau-chuyen-tro-thanh-xe-cua-moi-nha-tai-viet-nam-188241106120610324.chn 'NoneType' object has no attribute 'get_text'


 10%|▉         | 23/240 [00:04<00:37,  5.77it/s]

error at:  https://cafef.vn/thuong-hieu-o-to-100-nam-tuoi-va-cau-chuyen-tro-thanh-xe-cua-moi-nha-tai-viet-nam-188241106120610324.chn 'NoneType' object has no attribute 'get_text'


 15%|█▍        | 35/240 [00:06<00:37,  5.43it/s]

error at:  https://cafef.vn/roi-nghe-giang-vien-vi-me-ngam-bang-dien-ceo-manulife-investment-management-viet-nam-tiet-lo-bi-kip-giup-dau-tu-vui-ve-va-tre-trung-188241106001814072.chn 'NoneType' object has no attribute 'get_text'


 62%|██████▏   | 148/240 [00:43<00:26,  3.43it/s]

error at:  https://cafef.vn/xay-dung-duong-cao-toc-cong-nghe-nganh-duoc-nhin-tu-su-quyet-liet-cua-gonsa-188241031124511682.chn 'NoneType' object has no attribute 'get_text'


 65%|██████▍   | 155/240 [00:45<00:18,  4.54it/s]

error at:  https://cafef.vn/viet-nam-co-nhieu-tiem-nang-de-thuc-day-tang-truong-xuat-khau-truc-tuyen-thong-qua-thuong-mai-dien-tu-188241031094543644.chn 'NoneType' object has no attribute 'get_text'


 79%|███████▉  | 189/240 [00:53<00:10,  4.79it/s]

error at:  https://cafef.vn/ong-tran-le-nguyen-chia-se-bi-quyet-ma-cua-kido-va-loi-khuyen-dac-biet-danh-cho-nha-dau-tu-ca-nhan-188241030002531918.chn 'NoneType' object has no attribute 'get_text'


 80%|███████▉  | 191/240 [00:55<00:14,  3.42it/s]

Min date reached!



 92%|█████████▏| 207/225 [01:32<00:08,  2.23it/s]

Min date reached!





In [13]:
cafef_news_data = pd.concat([chung_khoan, bds, doanh_nghiep, ngan_hang, vi_mo, tcqt, thi_truong])
cafef_news_data.to_csv("CafeF_data.csv")