## Data Collection
Our team uses its own dataset obtained through web crawling.

In this notebook, we retrieve roughly the 1,000 most recent news headlines and their article contents from China News (chinanews.com.cn). We also perform data cleaning and management.

https://www.chinanews.com.cn/scroll-news/news1.html


In [1]:
import csv
import os
import random
import re
import sys
import time
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager

In [2]:

# Root of the news site: listing-page URLs are built from it and scraped
# article hrefs are resolved against it.
BASE_URL = "https://www.chinanews.com.cn"
def crawl_with_selenium(title_url, title_href_dict):
    """Collect article links from one listing page into ``title_href_dict``.

    Opens ``title_url`` in a headless Chrome session, parses the rendered
    HTML, and for every ``<li>`` inside ``div.content_list`` stores
    ``title text -> absolute article URL`` in ``title_href_dict``.

    Args:
        title_url: URL of the listing page to load.
        title_href_dict: Dict updated in place. Entries are keyed by the
            stripped link text, so a later entry with the same title
            replaces an earlier one.

    Returns:
        None. If the page has no ``div.content_list`` the dict is left
        untouched. The browser is always closed before returning.
    """
    browser_options = Options()
    for flag in ('--headless', '--disable-gpu'):
        browser_options.add_argument(flag)
    browser = webdriver.Chrome(options=browser_options)
    try:
        browser.get(title_url)
        # Randomised 2-4 s pause before reading the page source
        # (presumably to let the page finish rendering -- confirm).
        time.sleep(random.uniform(2, 4))
        page = BeautifulSoup(browser.page_source, 'html.parser')
        listing = page.find('div', class_='content_list')
        if not listing:
            return
        for entry in listing.find_all('li'):
            entry_classes = entry.get('class', [])
            if isinstance(entry_classes, str):
                entry_classes = [entry_classes]
            # <li class="nocontent"> entries are skipped outright.
            if 'nocontent' in entry_classes:
                continue
            heading = entry.find('div', class_='dd_bt')
            link = heading.find('a') if heading else None
            target = link.get('href') if link else None
            # Skip entries lacking a title block, an anchor, or an href.
            if not target:
                continue
            title_href_dict[link.get_text().strip()] = urljoin(BASE_URL, target)
    finally:
        browser.quit()
def crawl_content(url):
    """Download one article page and return its body text.

    The text is taken from ``div.content_maincontent_content`` (narrowed to
    its ``div.left_zw`` child when present); if that container is missing,
    the whole document is used instead.

    Within the container, ``<p>``/``<div>`` blocks carrying one of the
    classes ``adInContent``, ``adEditor`` or ``channel`` are removed together
    with everything nested inside them. Of the remaining ``<p>``/``<div>``
    blocks only the outermost ones are collected, so text inside nested
    blocks is emitted exactly once instead of once per enclosing block.

    Args:
        url: Absolute URL of the article page.

    Returns:
        The non-empty block texts joined with ``"\\n"``, or ``""`` when the
        request fails (network error, timeout, or non-2xx status).
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
    except requests.RequestException:
        # Best effort: a page that cannot be fetched yields empty content
        # rather than aborting the whole crawl.
        return ""
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    container = soup.find('div', class_='content_maincontent_content')
    if container:
        main_content = container.find('div', class_='left_zw')
        if main_content:
            container = main_content
    else:
        container = soup

    excluded_classes = {'adInContent', 'adEditor', 'channel'}

    def block_classes(tag):
        """Return the tag's class attribute as a list (possibly empty)."""
        classes = tag.get('class') or []
        return [classes] if isinstance(classes, str) else classes

    # Detach excluded blocks as whole subtrees. Skipping only the node itself
    # would still let its nested <p>/<div> children (and any enclosing
    # wrapper's get_text) leak the unwanted text. find_all returns a list, so
    # modifying the tree while looping over it is safe.
    for block in container.find_all(['p', 'div']):
        if not excluded_classes.isdisjoint(block_classes(block)):
            block.extract()

    text_blocks = []
    # Track collected blocks by identity: bs4 tags compare equal when their
    # markup is equal, which would conflate distinct but identical paragraphs.
    collected_ids = set()
    for block in container.find_all(['p', 'div']):
        # get_text() on an outer block already includes all nested text, so a
        # block living inside a collected one would only repeat it.
        if any(id(ancestor) in collected_ids for ancestor in block.parents):
            continue
        collected_ids.add(id(block))
        text = block.get_text(strip=True)
        if text:
            text_blocks.append(text)
    return "\n".join(text_blocks)
if __name__ == '__main__':
    # Step 1: gather title -> article URL from listing pages 1..10.
    title_href_dict = {}
    for i in range(1, 11):
        title_url = f"{BASE_URL}/scroll-news/news{i}.html"
        print(f"Crawling page {i}...")
        crawl_with_selenium(title_url, title_href_dict)

    # Step 2: download each article and clean its text.
    data_rows = []
    for title, url in title_href_dict.items():
        content = crawl_content(url)
        # Flatten the article to a single line.
        cleaned_content = content.replace('\n', '')
        # Remove editor notes at the end
        cleaned_content = re.sub(r'【编辑:.*?】$', '', cleaned_content).strip()
        data_rows.append({'title': title, 'content': cleaned_content})

    # Step 3: persist the result. `df` keeps the English column names; only
    # the CSV header row uses the Chinese labels.
    df = pd.DataFrame(data_rows, columns=['title', 'content'])
    output_path = './data/chinanews.csv'
    # to_csv does not create missing directories; without this the write
    # fails with OSError on a fresh checkout, after the whole crawl has run.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    df.to_csv(output_path, index=False, header=['文本标题', '文本内容'])

Crawling page 1...
Crawling page 2...
Crawling page 3...
Crawling page 4...
Crawling page 5...
Crawling page 6...
Crawling page 7...
Crawling page 8...
Crawling page 9...
Crawling page 10...


In [3]:
# Print a summary of the scraped DataFrame (entry count, columns,
# non-null counts, dtypes) as a quick sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 965 entries, 0 to 964
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    965 non-null    object
 1   content  965 non-null    object
dtypes: object(2)
memory usage: 15.2+ KB
