In [1]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import re
import numpy as np

In [2]:
# Start the wall-clock timer; the elapsed time is printed after the scraped data is saved.
st = datetime.now()
# Load the training table; later cells read its 'news_ID', 'hyperlink', 'content' and 'name' columns.
# NOTE(review): absolute, machine-specific path — the notebook cannot run elsewhere without editing it.
data = pd.read_csv('F:\\Github\\esun_competition\\模型訓練資料\\tbrain_train_final_0610.csv')

In [50]:
def scraper(news_id: str, url: str, timeout: float = 30) -> dict:
    """Download one news page and return its text together with a status flag.

    Parameters
    ----------
    news_id : identifier of the news row. It is copied into the result so that
        rows sharing the same URL can still be told apart.
    url : address of the page. Surrounding whitespace is stripped for the
        request only; the result keeps the value exactly as passed in.
    timeout : seconds handed to ``requests.get``. Without it a single server
        that never answers blocks the whole scraping loop.

    Returns
    -------
    dict with the keys 'news_id', 'url', 'raw_content' and 'status'.
    'status' is '404NotFound' (with an empty 'raw_content') when the HTTP
    status is not 200 or the page contains '/e404?', 'warning' when the
    extracted text is longer than 7000 characters, and 'ok' otherwise.

    Raises
    ------
    Any exception raised by ``requests.get`` (connection errors, timeouts, ...)
    propagates to the caller.
    """
    # Texts longer than this are flagged; they are not necessarily garbage.
    warning_length = 7000

    def build_result(raw_content, status):
        # raw_content keeps the unprocessed text so that one can later tell a
        # cleaning mistake apart from a page that was never scraped properly.
        return {'news_id': news_id, 'url': url,
                'raw_content': raw_content, 'status': status}

    def non_empty(tags, attr):
        # Collect the truthy `attr` ('text' or 'string') values of the tags.
        return [value for value in (getattr(tag, attr) for tag in tags) if value]

    headers = {"user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36"}
    r = requests.get(url.strip(), headers=headers, timeout=timeout)

    # Encoding: pages that mention MS950 are decoded as MS950, all others as UTF-8.
    r.encoding = 'MS950' if 'MS950' in r.text else 'utf8'

    # Is the page still available?
    if r.status_code != 200 or '/e404?' in r.text:
        return build_result('', '404NotFound')

    # Default: the text of every <p> tag.
    soup = BeautifulSoup(r.text, 'html.parser')
    pieces = non_empty(soup.find_all('p'), 'text')

    # Special cases:
    # 1. No <p> text at all -> use <pre>, e.g.
    #    http://domestic.judicial.gov.tw/abbs/wkw/WHD9HN03.jsp?crtid=CYD&filenm=68ADCBFE767075384653495F3FA64520CA9752844054E3A0
    if not pieces:
        pieces = non_empty(soup.find_all('pre'), 'string')
    # 2. Still nothing -> use <div>, e.g.
    #    https://www.fsc.gov.tw/ch/home.jsp?id=131&parentpath=0,2&mcustomize=multimessages_view.jsp&dataserno=201908150001&aplistdn=ou=data,ou=penalty,ou=multisite,ou=chinese,ou=ap_root,o=fsc,c=tw&dtable=Penalty
    if not pieces:
        pieces = non_empty(soup.find_all('div'), 'string')
    # 3. nownews articles always use the 'newsMsg' div, e.g.
    #    https://www.nownews.com/news/20190802/3537141/
    if 'https://www.nownews.com/news/' in url:
        pieces = non_empty(soup.find_all('div', class_='newsMsg'), 'text')
    # 4. Only newlines were collected -> use <pre>, e.g.
    #    http://domestic.judicial.gov.tw/abbs/wkw/WHD9HN03.jsp?crtid=PCD&filenm=ECE5DAECAF563DA1EDA2C73542845EC436C311AF4D5042BF
    if ''.join(pieces).replace('\n', '') == '':
        pieces = non_empty(soup.find_all('pre'), 'string')

    # Join the pieces into a single string.
    raw_content = ''.join(pieces)
    status = 'warning' if len(raw_content) > warning_length else 'ok'
    return build_result(raw_content, status)

# check 名字有沒有都爬到
def check_name(names: pd.Series, contents: pd.Series) -> list:
    """Report, for each news item, whether every labelled name occurs in its text.

    Parameters
    ----------
    names : one entry per news item, the string form of a list of names such
        as "['A', 'B']"; "[]" means the item has no names.
    contents : the text of each news item, in the same order as ``names``.

    Returns
    -------
    list with one element per news item: the string 'no names.' when the item
    has no names or no text, otherwise True if all names are found in the text
    and False if at least one is not.
    """
    check_list = []  # 1 element = 1 news item
    for name, content in zip(names, contents):
        # Missing values arrive as float NaN. Testing for str (instead of
        # `is np.nan`) also covers NaN objects that are not the np.nan
        # singleton, and a missing `name`, both of which used to raise.
        if not isinstance(name, str) or name == '[]' or not isinstance(content, str):
            check_list.append('no names.')
            continue
        # Turn "['A', 'B']" into ['A', 'B'].
        wanted = name.replace('[', '').replace(']', '').replace('\'', '').split(', ')
        # True only if every single name appears in the text.
        check_list.append(all(nn in content for nn in wanted))
    return check_list

In [51]:
# Scrape every training URL; a failure on one page must not stop the whole run.
full_content_list = []
str_list = []  # not used by any cell visible here; kept in case other cells rely on it
for news_id, url in zip(data['news_ID'].values, data['hyperlink'].values):
    try:
        full_content_list.append(scraper(news_id, url))
    except Exception as e:
        # Store the error as text rather than as the exception object: the
        # status column then stays a plain string column, survives the CSV
        # round trip, and string-based checks on it (such as a length test)
        # actually see the failed rows.
        full_content_list.append({'news_id': news_id,
                                  'url': url,
                                  'raw_content': '',
                                  'status': f'error: {e!r}'
                                 })
    # Progress marker every 100 ids.
    if news_id % 100 == 0:
        print(news_id)
print('Finish!')

KeyboardInterrupt: 

In [6]:
# Turn the list of scrape results into a DataFrame.
content_df = pd.DataFrame(full_content_list)
print(content_df.shape)
print(content_df.columns)

# Join the scrape results onto the training data, keep the six columns of
# interest and give them their final names.
content_df = (
    data.merge(content_df, how='inner', left_on='news_ID', right_on='news_id')
        .drop(columns='hyperlink')
        .loc[:, ['news_ID', 'url', 'content', 'name', 'raw_content', 'status']]
        .rename(columns={'news_ID': 'news_id', 'content': 'context'})
)
print(content_df.shape)
print(content_df.columns)


(5023, 4)
Index(['news_id', 'raw_content', 'status', 'url'], dtype='object')
(5023, 6)
Index(['news_id', 'url', 'context', 'name', 'raw_content', 'status'], dtype='object')


In [34]:
# Try to scrape the failed rows once more.
# A row counts as failed when its status is none of the values scraper() itself
# returns; this does not depend on how long the stored error text happens to be.
known_statuses = ['ok', 'warning', '404NotFound']
fail_index = content_df.index[~content_df['status'].isin(known_statuses)]
print(fail_index)
for i in fail_index:
    # fail_index holds index labels, so rows are addressed with .loc and column
    # names rather than with positional .iloc.
    retry_id = content_df.loc[i, 'news_id']
    try:
        temp = scraper(retry_id, content_df.loc[i, 'url'])
    except Exception as e:
        # Keep going, but leave a trace instead of silently skipping the row.
        print(f'retry failed for news_id {retry_id}: {e!r}')
        continue
    content_df.loc[i, 'raw_content'] = temp['raw_content']
    content_df.loc[i, 'status'] = temp['status']
    print(retry_id)


Int64Index([], dtype='int64')


In [35]:
# Save the merged table (without the index) so the checks below can reload it without re-scraping.
content_df.to_csv('content_df_0612.csv', index=False)
# Print the time elapsed since `st` was taken at the top of the notebook.
ed = datetime.now()
print(ed - st)

1:22:58.293786


# 檢查資料

In [36]:
# Reload the saved scrape results from disk for the checks that follow.
content_df = pd.read_csv('content_df_0612.csv')

In [49]:
# As time passes more pages should turn into 404s, so the idea is to backfill
# pages that an earlier run still managed to scrape.
# So far, however, the number of 404s does not seem to have grown.
content_df1 = pd.read_csv('content_df_4.csv')
content_df2 = pd.read_csv('content_df_0611.csv')


def same_404_rows(earlier, latest):
    """Return True when both runs flag exactly the same row labels as '404NotFound'.

    Each frame is filtered by its OWN status column. Filtering both with the
    mask of the latest run compares a set of labels with itself and is always
    True. Index.equals also copes with a differing number of 404 rows, where an
    element-wise == would raise.
    """
    # Assumes the earlier CSVs also carry a 'status' column — TODO confirm.
    earlier_404 = earlier.index[earlier['status'] == '404NotFound']
    latest_404 = latest.index[latest['status'] == '404NotFound']
    return earlier_404.equals(latest_404)


# First version vs. 0612: are the 404 rows the same?
print(same_404_rows(content_df1, content_df))
# 0611 vs. 0612
print(same_404_rows(content_df2, content_df))

True
True


In [37]:
# Status breakdown, followed by the ids of the articles that could not be
# found (404, 500 and the like).
status_counts = content_df['status'].value_counts()
print(status_counts)
not_found = content_df['status'] == '404NotFound'
content_df.loc[not_found, 'news_id'].to_list()

ok             4855
404NotFound     121
Name: status, dtype: int64


[4,
 15,
 150,
 151,
 177,
 269,
 291,
 332,
 353,
 407,
 423,
 424,
 430,
 462,
 484,
 492,
 500,
 597,
 637,
 714,
 720,
 864,
 899,
 913,
 967,
 1038,
 1056,
 1063,
 1085,
 1142,
 1151,
 1160,
 1173,
 1175,
 1220,
 1303,
 1331,
 1333,
 1367,
 1478,
 1521,
 1523,
 1545,
 1666,
 1715,
 1796,
 1868,
 1900,
 1934,
 1984,
 2031,
 2114,
 2171,
 2313,
 2319,
 2397,
 2431,
 2442,
 2486,
 2494,
 2510,
 2543,
 2552,
 2725,
 2734,
 2791,
 2835,
 2851,
 2883,
 2891,
 2942,
 2985,
 3040,
 3143,
 3160,
 3194,
 3239,
 3276,
 3364,
 3392,
 3401,
 3463,
 3470,
 3524,
 3567,
 3628,
 3632,
 3647,
 3805,
 3905,
 4000,
 4004,
 4066,
 4074,
 4140,
 4149,
 4229,
 4245,
 4334,
 4400,
 4403,
 4425,
 4441,
 4447,
 4554,
 4555,
 4578,
 4593,
 4627,
 4691,
 4752,
 4795,
 4837,
 4890,
 4892,
 4898,
 4907,
 4939,
 4944,
 4946,
 5012]

In [38]:
# URLs of articles that could not be found although they have target names.
not_found = content_df['status'] == '404NotFound'
has_names = content_df['name'] != '[]'
content_df.loc[not_found & has_names, 'url']

14                https://udn.com/news/story/7321/3845624
149     https://udn.com/news/story/7321/3833161?from=u...
150               https://udn.com/news/story/7321/3745835
176     https://udn.com/news/story/6656/3788408?from=u...
268               https://udn.com/news/story/7315/3710851
290     http://news.ltn.com.tw/news/society/breakingne...
352               https://udn.com/news/story/7321/3723499
422               https://udn.com/news/story/7321/3664040
719     https://news.ltn.com.tw/news/politics/paper/13...
898               https://udn.com/news/story/7332/3777321
912               https://udn.com/news/story/7321/3744162
1150    https://udn.com/news/story/7321/3751082?from=u...
1172              https://udn.com/news/story/7321/3773733
1477             https://udn.com/news/story/11322/3661563
1867       https://money.udn.com/money/story/5621/3790042
1983              https://udn.com/news/story/7315/3657017
2113              https://udn.com/news/story/7321/3775400
2170          

In [39]:
# Check that raw_content was scraped everywhere: count, then list, the rows
# whose text is empty once newlines are removed although the page was found.
blank_text = content_df['raw_content'].str.replace('\n', '') == ''
reachable = content_df['status'] != '404NotFound'
print((blank_text & reachable).sum())
content_df.loc[blank_text & reachable]

0


Unnamed: 0,news_id,url,context,name,raw_content,status


In [40]:
# Check that raw_content was scraped everywhere: count, then list, the rows
# whose text is missing (NaN) although the page was found.
missing_text = content_df['raw_content'].isna()
reachable = content_df['status'] != '404NotFound'
print((missing_text & reachable).sum())
content_df.loc[missing_text & reachable]

0


Unnamed: 0,news_id,url,context,name,raw_content,status


In [41]:
# Check that every single name can be found in the scraped text.
check_list = check_name(content_df['name'], content_df['raw_content'])
name_missing = pd.Series(check_list) == False
reachable = content_df['status'] != '404NotFound'
print(sum(name_missing & reachable))
content_df.loc[name_missing & reachable]
# For news_id = 121, 2143 and 4597 the actual article does not contain the names given in data.
# 2253 can be scraped on its own; it is unclear why it fails when scraped in the batch run.

4


Unnamed: 0,news_id,url,context,name,raw_content,status
120,121,https://news.ltn.com.tw/news/society/breakingn...,新竹市中正市場得標廠商力揚展業負責人、竹市攤販協會理事長林良琪 ### 省略內文 ### 再...,"['楊展業', '林良琪', '戴吟曲']",為達最佳瀏覽效果，建議使用 Chrome、Firefox 或 Microsoft Edge ...,ok
2142,2143,https://www.chinatimes.com/realtimenews/201910...,前富味鄉董事陳瑞禮因假純芝麻油案遭境管 ### 省略內文 ### 最高法院駁回確定。,"['林秀蓉', '陳瑞禮']",前富味鄉董事陳瑞禮因假純芝麻油案遭境管，他以到大陸廣西參與富味鄉工廠建置技術改善等理由聲請暫...,ok
2252,2253,https://www.chinatimes.com/realtimenews/201903...,知名牙材設備代理商「珖億企業」負責人李建邦 ### 省略內文 ### 今依法起訴4人。,"['李建邦', '裘振儀', '張景祥', '古少禾', '何宗英']",可能是因為：,ok
4596,4597,https://www.chinatimes.com/realtimenews/201904...,「伯利恆綠能土地開發」公司負責人宋姓女子、王姓男子2人 ### 省略內文 ### 近日將對2...,"['宋芷妍', '王安石']",「伯利恆綠能土地開發」公司負責人宋姓女子、王姓男子2人，打著與農委會農改場合作名號，謊稱在南...,ok


In [86]:
# The original training data contains two rows with the same URL.
duplicated_url = 'https://www.chinatimes.com/realtimenews/20191014001482-260402?chdtv'
data.loc[data['hyperlink'] == duplicated_url]

Unnamed: 0,news_ID,hyperlink,content,name
2142,2143,https://www.chinatimes.com/realtimenews/201910...,前富味鄉董事陳瑞禮因假純芝麻油案遭境管 ### 省略內文 ### 最高法院駁回確定。,"['林秀蓉', '陳瑞禮']"
2425,2426,https://www.chinatimes.com/realtimenews/201910...,前富味鄉董事陳瑞禮因假純芝麻油案遭境管 ### 省略內文 ### 最高法院駁回確定。,"['陳瑞禮', '陳文南']"


In [76]:
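# Do not run.
# The leading and trailing context must not be stripped: testing showed that it can contain names.

# Strip the leading/trailing context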
def extract_content(context: pd.Series, raw_content: pd.Series) -> pd.Series:
    """Recover the part of each article that ``context`` replaces with ' ### 省略內文 ### '.

    The last 5 characters before the separator and the first 5 characters
    after it are used as literal anchors; whatever lies between them in
    ``raw_content`` is returned.

    Parameters
    ----------
    context : abridged articles of the form '<head> ### 省略內文 ### <tail>'.
    raw_content : the scraped full texts, in the same order as ``context``.

    Returns
    -------
    pd.Series (default integer index) with one string per input pair: the text
    between the anchors, or '' when either value is not a string, the separator
    is absent, or the anchors are not found in ``raw_content``.
    """
    separator = ' ### 省略內文 ### '
    extracted = []
    for ct, rc in zip(context, raw_content):
        # Missing values (NaN) cannot be searched; report "nothing extracted".
        if not isinstance(ct, str) or not isinstance(rc, str):
            extracted.append('')
            continue
        # Remove these symbols and '<BR>' before picking the anchors, then
        # split into the text before and after the omitted part.
        parts = re.sub(r'[*.?(){}\[\]\\]|<BR>', '', ct).split(separator)
        if len(parts) < 2:
            # No separator, hence no tail anchor to match against.
            extracted.append('')
            continue
        # Escape the anchors so that remaining metacharacters (+ | ^ $) are
        # matched literally; DOTALL lets '.' span newlines.
        head, tail = re.escape(parts[0][-5:]), re.escape(parts[1][:5])
        match = re.search(f'{head}(.*){tail}', rc, re.DOTALL)
        # Keep only the omitted part.
        extracted.append(match.group(1) if match else '')

    return pd.Series(extracted)

# # content另外清
# content_df['content'] = extract_content(content_df['context'], content_df['raw_content'])