In [19]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import numpy as np

In [20]:
# Extract per-article information from the web pages of one media outlet.
def extract_news_info(df, media_outlet, tag_info, timeout=10, delay=3):
    """Fetch every article URL of one media outlet and collect the text of one element.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns '언론사' (media outlet name) and 'URL'.
    media_outlet : str
        Only rows whose '언론사' value equals this name are processed.
    tag_info : dict
        Selector with the keys 'tag_name' and 'tag_attrs'; both are forwarded
        to ``BeautifulSoup.find(tag_name, attrs=tag_attrs)``.
    timeout : float or tuple, optional
        Timeout in seconds handed to ``requests.get`` so that an unresponsive
        server cannot block the whole run. Defaults to 10.
    delay : float, optional
        Seconds to pause after every request attempt. Defaults to 3.

    Returns
    -------
    pandas.DataFrame
        Columns 'URL' and '시간' (the matched element's text), one row per page
        on which the element was found. Pages that could not be fetched, or on
        which nothing matched, are reported on stdout and left out.
    """
    outlet_df = df[df['언론사'] == media_outlet]
    headers = {"User-Agent": "Mozilla/5.0"}  # identical for every request, so built once
    news_urls = []   # URLs whose page yielded a match
    news_times = []  # matched element text, parallel to news_urls

    for i, row in outlet_df.iterrows():
        url = row['URL']
        print(i + 1, ": ", url)
        try:
            response = requests.get(url, headers=headers, timeout=timeout)
            response.raise_for_status()  # turn HTTP error statuses into exceptions
            # EUC-KR is passed as the encoding hint for every outlet.
            news = BeautifulSoup(response.content, 'html.parser', from_encoding='EUC-KR')
            news_text = news.find(tag_info['tag_name'], attrs=tag_info['tag_attrs'])
            if news_text is not None:
                news_urls.append(url)
                news_times.append(news_text.text)
            else:
                print("뉴스 텍스트를 가져올 수 없습니다.")
        except requests.exceptions.RequestException as e:
            # Timeouts, connection errors and HTTP errors all end up here;
            # report the failure and carry on with the next URL.
            print("웹 페이지를 가져오는 중 오류가 발생했습니다:", str(e))

        # Pause after every attempt, including failed ones, so that a run of
        # failing URLs is not requested back-to-back.
        time.sleep(delay)

    return pd.DataFrame({'URL': news_urls, '시간': news_times})


# Per-outlet selector table.
# Key: media outlet name, compared against the '언론사' column by extract_news_info.
# Value: {"tag_name": ..., "tag_attrs": ...}, forwarded to
#        BeautifulSoup.find(tag_name, attrs=tag_attrs); the text of the first
#        matching element is what ends up in the '시간' (time) column.
# NOTE(review): "infomation" is presumably the class name exactly as spelled in
# the sites' markup — do not "correct" it without checking the live pages.
# NOTE(review): class values containing spaces are presumably matched against the
# full class attribute string, so class order/spacing matters — confirm per site.
tag_info_dict = {
    "KBS": {"tag_name": "em", "tag_attrs": {"class": "date"}},
    "MBC": {"tag_name": "span", "tag_attrs": {"class": "input"}},
    "OBS": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "SBS": {"tag_name": "div", "tag_attrs": {"class": "date_area"}},
    "YTN": {"tag_name": "span", "tag_attrs": {"class": "time"}},
    "강원도민일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "강원일보": {"tag_name": "span", "tag_attrs": {"class": "date"}},
    "경기일보": {"tag_name": "div", "tag_attrs": {"class": "article_date"}},
    "경남도민일보": {"tag_name": "ul", "tag_attrs": {"class": "no-bullet auto-marbtm-0 line-height-6"}},
    "경상일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "경인일보": {"tag_name": "span", "tag_attrs": {"class": "news-date"}},
    "경향신문": {"tag_name": "div", "tag_attrs": {"class": "byline"}},
    "광주일보": {"tag_name": "div", "tag_attrs": {"class": "read_time"}},
    "광주매일신문": {"tag_name": "ul", "tag_attrs": {"id": "byline"}},
    "국민일보": {"tag_name": "span", "tag_attrs": {"class": "t11"}},
    "국제신문": {"tag_name": "span", "tag_attrs": {"class": "f_news_date"}},
    "내일신문": {"tag_name": "div", "tag_attrs": {"class": "date"}},
    "대전일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "대구일보": {"tag_name": "span", "tag_attrs": {"class": "date"}},
    "동아일보": {"tag_name": "span", "tag_attrs": {"class": "date01"}},
    "디지털타임스": {"tag_name": "span", "tag_attrs": {"class": "url_txt"}},
    "매일경제": {"tag_name": "dl", "tag_attrs": {"class": "registration"}},
    "매일신문": {"tag_name": "div", "tag_attrs": {"class": "date"}},
    "머니투데이": {"tag_name": "ul", "tag_attrs": {"class": "info2"}},
    "무등일보": {"tag_name": "span", "tag_attrs": {"class": "txt_info"}},
    "문화일보": {"tag_name": "dl", "tag_attrs": {"class": "date"}},
    "부산일보": {"tag_name": "div", "tag_attrs": {"class": "byline"}},
    "서울경제": {"tag_name": "span", "tag_attrs": {"class": "url_txt"}},
    "서울신문": {"tag_name": "span", "tag_attrs": {"itemprop": "datePublished"}},
    "세계일보": {"tag_name": "p", "tag_attrs": {"class": "viewInfo"}},
    "아시아경제": {"tag_name": "div", "tag_attrs": {"class": "date_box"}},
    "아주경제": {"tag_name": "dd", "tag_attrs": {"class": "date"}},
    "영남일보": {"tag_name": "li", "tag_attrs": {"class": "article-bottom-input"}},
    "울산매일": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "전남일보": {"tag_name": "ul", "tag_attrs": {"id": "byline"}},
    # NOTE(review): the next two entries select icon-style <i> elements; presumably
    # the timestamp text sits next to rather than inside them — verify that the
    # extracted text is not empty.
    "전북도민일보": {"tag_name": "i", "tag_attrs": {"class": "fa fa-clock-o fa-fw"}},
    "전북일보": {"tag_name": "i", "tag_attrs": {"class": "far fa-clock pr5"}},
    "전자신문": {"tag_name": "time", "tag_attrs": {"class": "date"}},
    # NOTE(review): this class string ends with a trailing space, which looks like
    # it may prevent any match — confirm against the live page.
    "조선일보": {"tag_name": "span", "tag_attrs": {"class": "upDate | flex flex--align-items-end "}},
    "중도일보": {"tag_name": "ul", "tag_attrs": {"class": "view-term"}},
    "중부매일": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "중부일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    # NOTE(review): empty attribute name and value — presumably meant as "any <time>
    # tag"; verify it matches as intended (an empty dict would say that directly).
    "중앙일보": {"tag_name": "time", "tag_attrs": {"": ""}},
    "충청일보": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "충북일보": {"tag_name": "div", "tag_attrs": {"class": "art_sum"}},
    "충청투데이": {"tag_name": "ul", "tag_attrs": {"class": "infomation"}},
    "파이낸셜뉴스": {"tag_name": "div", "tag_attrs": {"class": "byline"}},
    "한겨레": {"tag_name": "p", "tag_attrs": {"class": "date-time"}},
    "한국경제": {"tag_name": "span", "tag_attrs": {"class": "txt-date"}},
    "한국일보": {"tag_name": "dl", "tag_attrs": {"class": "wrt-text"}},
    "한라일보": {"tag_name": "ul", "tag_attrs": {"class": "byline"}},
    "헤럴드경제": {"tag_name": "li", "tag_attrs": {"class": "article_date"}},
}



In [21]:
import math

# Load the article list; extract_news_info reads its '언론사' and 'URL' columns.
# NOTE(review): this path ends with '/' and so looks like a directory rather than
# a workbook file — confirm the intended filename.
raw_df = pd.read_excel('./4500~/')
batch_size = 100  # rows handled per outer iteration
total_batches = math.ceil(len(raw_df) / batch_size)
news_data_list = []

# Walk the frame in consecutive slices of batch_size rows and, within each
# slice, scrape every outlet that has a selector in tag_info_dict.
for batch_num in range(total_batches):
    start_index = batch_num * batch_size
    end_index = start_index + batch_size
    batch_df = raw_df.iloc[start_index:end_index]

    for media_outlet, tag_info in tag_info_dict.items():
        news_data = extract_news_info(batch_df, media_outlet, tag_info)
        # Most outlet/batch combinations yield no rows; keeping those empty
        # frames would add nothing to the concatenated result.
        if not news_data.empty:
            news_data_list.append(news_data)

# pd.concat raises ValueError on an empty list (e.g. the workbook has no rows or
# nothing could be scraped), so fall back to an empty frame with the same columns.
if news_data_list:
    combined_news_data = pd.concat(news_data_list, ignore_index=True)
else:
    combined_news_data = pd.DataFrame({'URL': [], '시간': []})

16 :  https://news.kbs.co.kr/news/view.do?ncd=5363886&amp;ref=DA
20 :  https://imnews.imbc.com/replay/2022/nwtoday/article/6330379_35752.html
27 :  https://imnews.imbc.com/replay/2022/nwtoday/article/6330717_35752.html
42 :  https://imnews.imbc.com/replay/2022/nwtoday/article/6332098_35752.html
61 :  https://imnews.imbc.com/replay/2022/nwtoday/article/6332427_35752.html
53 :  http://www.obsnews.co.kr/news/articleView.html?idxno=1340353
3 :  http://www.ytn.co.kr/_ln/0102_202201022216442398
8 :  http://www.ytn.co.kr/_ln/0102_202201020514276848
33 :  http://www.ytn.co.kr/_ln/0115_202201100001158170
60 :  http://www.ytn.co.kr/_ln/0102_202201130808231310
81 :  http://www.ytn.co.kr/_ln/0104_202201171706136474
82 :  http://www.ytn.co.kr/_ln/0115_202201171622023015
88 :  http://www.ytn.co.kr/_ln/0104_202201182234438265
9 :  http://www.kado.net/news/articleView.html?idxno=1106894
14 :  http://www.ksilbo.co.kr/news/articleView.html?idxno=923011
58 :  http://www.khan.co.kr/economy/economy-general

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


144 :  http://www.mk.co.kr/economy/view/2022/59064
157 :  http://www.mk.co.kr/economy/view/2022/65502
172 :  http://news.mk.co.kr/newsRead.php?no=68907&year=2022
뉴스 텍스트를 가져올 수 없습니다.
170 :  https://news.imaeil.com/page/view/2022012208064660231
186 :  https://news.imaeil.com/page/view/2022012414254098379
114 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022011909290797121&type=2
152 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022011916014569001&type=2
191 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022012412137092117&type=2
200 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022012507127058842&type=2
106 :  http://www.munhwa.com/news/view.html?no=2022011901071203018001
141 :  http://www.munhwa.com/news/view.html?no=2022012001070103006001
142 :  http://www.munhwa.com/news/view.html?no=2022012001070803006002
197 :  http://www.munhwa.com/news/view.html?no=2022012501032421087002
123 :  http://www.busan.com/view/busan/view.php?code=2022011906380890144
177 :  http://w

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


441 :  http://www.mk.co.kr/economy/view/2022/119602
458 :  http://news.mk.co.kr/newsRead.php?no=126808&year=2022
뉴스 텍스트를 가져올 수 없습니다.
443 :  https://news.imaeil.com/page/view/2022020913290283337
433 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022020910584484508&type=2
436 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022020916162336167&type=2
438 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022020914551078032&type=2
450 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022020908271181868&type=2
461 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022021010250773617&type=2
466 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022021007365861937&type=2
485 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022021207185424645&type=2
440 :  http://www.mdilbo.com/detail/DDAuq0/662608


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


444 :  http://www.munhwa.com/news/view.html?no=2022020901072103015001
495 :  http://www.munhwa.com/news/view.html?no=20220213MW172541643196
416 :  http://www.busan.com/view/busan/view.php?code=2022020816550307694
483 :  http://www.busan.com/view/busan/view.php?code=2022021210111476199
490 :  http://www.busan.com/view/busan/view.php?code=2022021308493509847
498 :  http://www.busan.com/view/busan/view.php?code=2022021315482715164
437 :  http://www.sedaily.com/NewsView/2622PN2ELE
474 :  http://www.sedaily.com/NewsView/2623L59YLV
413 :  http://www.segye.com/content/html/2022/02/08/20220208515944.html
442 :  http://www.segye.com/content/html/2022/02/09/20220209509883.html
462 :  http://www.segye.com/content/html/2022/02/10/20220210506318.html
491 :  http://www.segye.com/content/html/2022/02/13/20220213508816.html
422 :  http://www.asiae.co.kr/article/2022020811462127094
454 :  http://www.asiae.co.kr/article/2022020907160953635
480 :  http://www.asiae.co.kr/article/2022021215431772397
486 : 

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


612 :  http://www.mk.co.kr/economy/view/2022/157431
616 :  http://news.mk.co.kr/newsRead.php?no=158512&year=2022
뉴스 텍스트를 가져올 수 없습니다.
617 :  http://news.mk.co.kr/newsRead.php?no=158479&year=2022
뉴스 텍스트를 가져올 수 없습니다.
618 :  http://news.mk.co.kr/newsRead.php?no=158320&year=2022
뉴스 텍스트를 가져올 수 없습니다.
648 :  http://news.mk.co.kr/newsRead.php?no=160065&year=2022
뉴스 텍스트를 가져올 수 없습니다.
663 :  http://news.mk.co.kr/newsRead.php?no=162360&year=2022
뉴스 텍스트를 가져올 수 없습니다.
685 :  http://news.mk.co.kr/newsRead.php?no=163624&year=2022
뉴스 텍스트를 가져올 수 없습니다.
699 :  http://news.mk.co.kr/newsRead.php?no=170072&year=2022
뉴스 텍스트를 가져올 수 없습니다.
641 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022021716094741645&type=2
671 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022022009101145173&type=2
614 :  http://www.busan.com/view/busan/view.php?code=2022021820081259152
631 :  http://www.busan.com/view/busan/view.php?code=2022021809215437144
635 :  http://www.busan.com/view/busan/view.php?code=202202180828039979

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1140 :  http://www.mk.co.kr/economy/view/2022/209587
1167 :  http://news.mk.co.kr/newsRead.php?no=214506&year=2022
뉴스 텍스트를 가져올 수 없습니다.
1168 :  http://news.mk.co.kr/newsRead.php?no=214466&year=2022
뉴스 텍스트를 가져올 수 없습니다.
1169 :  http://news.mk.co.kr/newsRead.php?no=214453&year=2022
뉴스 텍스트를 가져올 수 없습니다.
1170 :  http://news.mk.co.kr/newsRead.php?no=214451&year=2022
뉴스 텍스트를 가져올 수 없습니다.
1172 :  http://news.mk.co.kr/newsRead.php?no=214338&year=2022
뉴스 텍스트를 가져올 수 없습니다.
1174 :  https://news.imaeil.com/page/view/2022030716202925962
1136 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022030611534175284&type=2
1138 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022030613281198401&type=2
1148 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022030416180220289&type=2
1176 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022030716167072278&type=2
1180 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022030716167089883&type=2
1184 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1311 :  http://www.mk.co.kr/economy/view/2022/215433
1318 :  http://premium.mk.co.kr/view.php?no=31611
뉴스 텍스트를 가져올 수 없습니다.
1320 :  http://news.mk.co.kr/newsRead.php?no=215121&year=2022
뉴스 텍스트를 가져올 수 없습니다.
1349 :  http://news.mk.co.kr/newsRead.php?no=222051&year=2022
뉴스 텍스트를 가져올 수 없습니다.
1351 :  http://news.mk.co.kr/newsRead.php?no=221872&year=2022
뉴스 텍스트를 가져올 수 없습니다.
1352 :  http://news.mk.co.kr/newsRead.php?no=221771&year=2022
뉴스 텍스트를 가져올 수 없습니다.
1365 :  http://news.mk.co.kr/newsRead.php?no=220403&year=2022
뉴스 텍스트를 가져올 수 없습니다.
1366 :  http://www.mk.co.kr/economy/view/2022/220388
1315 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022030807332058551&type=2
1319 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022030806551844263&type=2
1360 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022030913534167561&type=2
1371 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022030814335723087&type=2
1375 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022030907343685555&type=2

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


1469 :  http://www.mk.co.kr/economy/view/2022/229638
1499 :  http://news.mk.co.kr/newsRead.php?no=226672&year=2022
뉴스 텍스트를 가져올 수 없습니다.
1455 :  https://news.imaeil.com/page/view/2022031005595817569
1416 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022031011173676732&type=2
1419 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022031010430777828&type=2
1420 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022031010533952315&type=2
1431 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022031009090773352&type=2
1443 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022031007393691413&type=2
1448 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022031006513457790&type=2
1478 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022031116134525399&type=2
1493 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022031110265794889&type=2
1496 :  http://news.moneytoday.co.kr/view/mtview.php?no=2022031107463979169&type=2
1487 :  http://www.munhwa.com/news/view.html?no=20220311

In [22]:
# Write the combined scrape results to an Excel workbook.
# NOTE(review): the output folder name ('0~1500') differs from the input path used
# when loading the article list — confirm this is the intended destination.
combined_news_data.to_excel(excel_writer='./0~1500/유가데이터(시간) 0~1500.xlsx')