## Data Collection,Data Engineering & Cleaning 
Our team uses its own dataset obtained through web crawling.

In this ipynb file, the main task we have accomplished is to retrieve the latest about 1000 news headlines and their contents from Sina News.And we also perform data cleaning and management.

https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page={i}

In [1]:
import sys
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from bs4 import BeautifulSoup
import requests
import time
import re
import random
import csv
import pandas as pd

In [2]:
def crawl_with_selenium(url,title_href_dict,k):
    chrome_options = Options()
    chrome_options.add_argument("--headless")  
    chrome_options.add_argument("--no-sandbox")
    chrome_options.add_argument("--disable-dev-shm-usage")
    chrome_options.add_argument("user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0"
)

    driver = webdriver.Chrome(options=chrome_options)
    try:
        driver.get(url)
        time.sleep(3)  
        print(f"Crawling page {k}...")
        html = driver.page_source
        soup = BeautifulSoup(html, 'html.parser')
        for x in soup.find_all('a',{'target':['_blank']},href=True):
            for y in x:
                title_href_dict[y] = x['href'].strip()
        del title_href_dict['意见反馈留言板']
        return title_href_dict

    except Exception as e:
        print(f"Error with Selenium {url}: {e}")
        return ""
    finally:
        driver.quit()
        
def crawl_sina_article(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/141.0.0.0 Safari/537.36 Edg/141.0.0.0'
    }
    try:
        response = requests.get(url, headers=headers, timeout=random.uniform(3, 5))
        response.encoding = 'utf-8'
        soup = BeautifulSoup(response.text, 'html.parser')
        
        content_div = soup.find('div', id='artibody') or \
                      soup.find('div', class_='article') or \
                      soup.find('div', class_='content')
        
        if not content_div:
            return ""
        
        for tag in content_div(['script', 'style', 'a', 'img', 'iframe']):
            tag.decompose()
        
        text = content_div.get_text(strip=True)
        cleaned = re.sub(r'[^\u4e00-\u9fa5，。！？；：、‘’“”（）【】\n\s]', '', text)
        return cleaned.strip()
    
    except Exception as e:

        print(f"Failed to crawl {url}: {e}")
        return ""
  

In [3]:
if __name__=='__main__':
    title_href_dict = dict()
    for i in range(1,21):
        title_url = f"https://news.sina.com.cn/roll/#pageid=153&lid=2509&k=&num=50&page={i}"
        crawl_with_selenium(title_url,title_href_dict,i)
    data_dict = dict()
    for title,url in title_href_dict.items():
        data_dict[title] = crawl_sina_article(url)


Crawling page 1...
Crawling page 2...
Crawling page 3...
Crawling page 4...
Crawling page 5...
Crawling page 6...
Crawling page 7...
Crawling page 8...
Crawling page 9...
Crawling page 10...
Crawling page 11...
Crawling page 12...
Crawling page 13...
Crawling page 14...
Crawling page 15...
Crawling page 16...
Crawling page 17...
Crawling page 18...
Crawling page 19...
Crawling page 20...


In [4]:
df = pd.DataFrame(columns=['文本标题','文本内容'])
#remove the part after "海量资讯、精准解读"
for title,content in data_dict.items():
    cleaned_content = re.split(r'海量资讯、精准解读', content)[0].strip()
    df.loc[len(df.index)] = [title, cleaned_content]
    
df.info()
df.to_csv('./data/xinlangnews.csv', index=False)

<class 'pandas.core.frame.DataFrame'>
Index: 975 entries, 0 to 974
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   文本标题    975 non-null    object
 1   文本内容    975 non-null    object
dtypes: object(2)
memory usage: 22.9+ KB
