In [1]:
import pandas as pd
import requests
import time

In [2]:
def transform_date_format(date):
    """Format a Unix timestamp as a 'YYYY/MM/DD HH:MM:SS' string in UTC.

    Args:
        date: Seconds since the epoch, as accepted by ``time.gmtime``.

    Returns:
        The timestamp rendered with the pattern ``%Y/%m/%d %H:%M:%S``.
    """
    utc_struct = time.gmtime(date)
    readable = time.strftime('%Y/%m/%d %H:%M:%S', utc_struct)
    return readable

In [3]:
# Download every page of the tw_stock news feed (JSON API) and collect one
# row per article into `news_collection`.
initial_url = "https://news.cnyes.com/api/v3/news/category/tw_stock"
target_url = initial_url

news_columns = ['新聞標題', '發布時點', '相關個股代碼', '相關個股名稱', '新聞網址']
news_records = []  # plain rows; the DataFrame is built once after the loop
while True:
    # A timeout keeps the cell from hanging forever on a stalled connection.
    response = requests.get(target_url, timeout=10)
    response_dict = response.json()

    # Check the API status before touching the payload, so a failed request
    # reports its status code instead of an unrelated KeyError.
    response_status = response_dict['statusCode']
    assert response_status==200, f"Error code: {response_status}" # 200 means the server answered successfully

    page_items = response_dict['items']
    current_page = page_items['current_page']

    for current_news in page_items['data']:
        title = current_news['title']
        date = transform_date_format(current_news['publishAt']) # epoch seconds -> readable string

        # `market` lists the stocks the article refers to; it may be empty.
        related_stock = pd.DataFrame(current_news['market'])
        if len(related_stock) > 0:
            related_stock_code = ', '.join(related_stock['code'].values)
            related_stock_name = ', '.join(related_stock['name'].values)
        else:
            related_stock_code = None
            related_stock_name = None

        news_url = 'https://news.cnyes.com/news/id/{}?exp=a'.format(current_news['newsId'])

        news_records.append(
            [title, date, related_stock_code, related_stock_name, news_url]
        )

    print(f'Page {current_page} is finished...')

    # Only stop AFTER the current page has been collected; otherwise the last
    # page (whose next_page_url is null) would be silently dropped.
    next_page_url = page_items['next_page_url']
    if next_page_url is None:
        print('finish !')
        break
    target_url = 'https://news.cnyes.com/' + next_page_url

    time.sleep(0.5) # be polite to the server between requests

news_collection = pd.DataFrame(news_records, columns=news_columns)

Page 1 is finished...
Page 2 is finished...
Page 3 is finished...
Page 4 is finished...
Page 5 is finished...
Page 6 is finished...
Page 7 is finished...
Page 8 is finished...
Page 9 is finished...
Page 10 is finished...
Page 11 is finished...
Page 12 is finished...
Page 13 is finished...
Page 14 is finished...
Page 15 is finished...
Page 16 is finished...
Page 17 is finished...
Page 18 is finished...
Page 19 is finished...
Page 20 is finished...
Page 21 is finished...
Page 22 is finished...
Page 23 is finished...
Page 24 is finished...
Page 25 is finished...
Page 26 is finished...
Page 27 is finished...
Page 28 is finished...
Page 29 is finished...
Page 30 is finished...
Page 31 is finished...
Page 32 is finished...
Page 33 is finished...
Page 34 is finished...
Page 35 is finished...
Page 36 is finished...
Page 37 is finished...
Page 38 is finished...
Page 39 is finished...
Page 40 is finished...
Page 41 is finished...
Page 42 is finished...
Page 43 is finished...
Page 44 is finished.

# 更新用
* 將第一版的 xlsx 檔存在資料夾中，之後每次重新執行此 notebook，即可把最新爬到的新聞合併進該檔案

In [4]:
# Merge the freshly scraped news into the workbook saved by earlier runs.
news_columns = ['新聞標題', '發布時點', '相關個股代碼', '相關個股名稱', '新聞網址']

frames = [news_collection]
try:
    previous_news_collection = pd.read_excel('news_collection.xlsx')
except FileNotFoundError:
    # First run: there is no saved workbook yet, so only the new rows are kept.
    previous_news_collection = None
else:
    frames.insert(0, previous_news_collection)

news_collection = pd.concat(frames, ignore_index=True)

# The saved workbook may carry columns added after scraping (the sentiment
# score, or an 'Unnamed: 0' index column). Keep only the scraped columns so
# old and new rows have the same shape and no spurious NaN values.
news_collection = news_collection[news_columns]

# The article URL contains the news id, so it identifies an article uniquely;
# comparing whole rows would miss duplicates whenever any other column differs.
news_collection = news_collection.drop_duplicates(subset='新聞網址')
news_collection.to_excel('news_collection.xlsx', index=False)

# 輿情分析-篩選負面字詞的新聞標的並給予正負分數

In [5]:
import jieba

# Score every headline: +1 for each token found in the positive word list,
# -1 for each token found in the negative word list.
# NOTE(review): this absolute path only exists on the author's machine; point
# DICTIONARY_DIR at your local copy of the dictionaries.
DICTIONARY_DIR = '/Users/yvette/GitHub/Sentiment-Analysis/dictionary'

jieba.set_dictionary(f'{DICTIONARY_DIR}/dict.txt.big')

with open(f'{DICTIONARY_DIR}/NTUSD_positive_unicode.txt') as f:
    positive_words = [line.strip() for line in f]

with open(f'{DICTIONARY_DIR}/NTUSD_negative_unicode.txt') as f:
    negative_words = [line.strip() for line in f]

# Sets give constant-time membership tests; the lists are kept for any later use.
positive_lookup = set(positive_words)
negative_lookup = set(negative_words)

# Drop rows with a missing scraped field (e.g. news with no related stock).
# Restricting to the scraped columns keeps rows whose only missing value is a
# not-yet-computed score carried over from a previously saved workbook.
news_columns = ['新聞標題', '發布時點', '相關個股代碼', '相關個股名稱', '新聞網址']
news_collection = news_collection.dropna(subset=news_columns)

scorelist = []
for text in news_collection['新聞標題']:
    score = 0
    for word in jieba.cut(text, cut_all=False, HMM=True):
        if word in positive_lookup:
            score += 1
            print(f'詞彙:{word}, 總分:{score}')
        elif word in negative_lookup:
            score -= 1
            print(f'詞彙:{word}, 總分:{score}')
    scorelist.append(score)

news_collection['輿情分數'] = scorelist
# index=False matches the earlier save and avoids an 'Unnamed: 0' column
# appearing when the workbook is read back on the next run.
news_collection.to_excel('news_collection.xlsx', index=False)

Building prefix dict from /Users/yvette/GitHub/Sentiment-Analysis/dictionary/dict.txt.big ...
Loading model from cache /var/folders/r_/rz7m20_x5pq002gc649c_2sr0000gn/T/jieba.u9cd194e7fec48e6c769b0d0284157686.cache
Loading model cost 0.759 seconds.
Prefix dict has been built successfully.


詞彙:收盤, 總分:-1
詞彙:獲利, 總分:1
詞彙:不到, 總分:-1
詞彙:影響, 總分:0
詞彙:小, 總分:-1
詞彙:超過, 總分:1
詞彙:小, 總分:-1
詞彙:為, 總分:-2
詞彙:正常, 總分:1
詞彙:生產, 總分:2
詞彙:影響, 總分:3
詞彙:企業, 總分:-1
詞彙:資訊, 總分:0
詞彙:智慧, 總分:1
詞彙:取得, 總分:2
詞彙:逼近, 總分:-1
詞彙:了解, 總分:1
詞彙:推薦, 總分:2
詞彙:將, 總分:-1
詞彙:取得, 總分:0
詞彙:有, 總分:-1
詞彙:差距, 總分:-2
詞彙:公開, 總分:1
詞彙:說, 總分:-1
詞彙:小, 總分:-1
詞彙:通過, 總分:1
詞彙:健康, 總分:1
詞彙:預防, 總分:0
詞彙:取得, 總分:1
詞彙:說, 總分:-1
詞彙:獲利, 總分:1
詞彙:提升, 總分:1
詞彙:競爭力, 總分:0
詞彙:受惠, 總分:1
詞彙:冠軍, 總分:1
詞彙:說, 總分:-1
詞彙:獨家, 總分:1
詞彙:授權, 總分:2
詞彙:說, 總分:-1
詞彙:可望, 總分:-2
詞彙:鎖定, 總分:-1
詞彙:正式, 總分:0
詞彙:大, 總分:1
詞彙:更好, 總分:1
詞彙:表現, 總分:-1
詞彙:開發, 總分:1
詞彙:驚奇, 總分:2
詞彙:大, 總分:1
詞彙:發展, 總分:1
詞彙:積極, 總分:1
詞彙:低迷, 總分:-1
詞彙:健康, 總分:1
詞彙:業績, 總分:-1
詞彙:報喜, 總分:0
詞彙:歡慶, 總分:1
詞彙:業績, 總分:-1
詞彙:成長, 總分:0
詞彙:進行, 總分:1
詞彙:智慧, 總分:1
詞彙:真正, 總分:1
詞彙:可望, 總分:-1
詞彙:信心, 總分:0
詞彙:留意, 總分:1
詞彙:影響, 總分:1
詞彙:鎖定, 總分:0
詞彙:停滯, 總分:-1
詞彙:將, 總分:-2
詞彙:關閉, 總分:-3
詞彙:安泰, 總分:1
詞彙:法律, 總分:2
詞彙:風險, 總分:1
詞彙:衝擊, 總分:-1
詞彙:保守, 總分:-2
詞彙:最大, 總分:1
詞彙:捐贈, 總分:1
詞彙:創新, 總分:1
詞彙:可望, 總分:-1
詞彙:正式, 總分:1
詞彙:終止, 總分:-1
詞彙:長期, 總分:-2
詞彙:發展, 總分:-1
詞彙:大, 總分: