In [2]:
pip install xlsxwriter

Note: you may need to restart the kernel to use updated packages.


In [3]:
pip install openpyxl

Note: you may need to restart the kernel to use updated packages.


In [4]:
pip install readability-lxml

Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install trafilatura

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install newspaper3k

Note: you may need to restart the kernel to use updated packages.


In [14]:
import re
import pandas as pd
import requests
from bs4 import BeautifulSoup
from readability import Document
import trafilatura
from newspaper import Article
from time import sleep
from tqdm import tqdm
from datetime import datetime

# 🔹 Input workbook location
INPUT_PATH = r"C:\Users\KEARNEY\Desktop\url 크롤링\250520_라벨링 필요 기사 리스트.xlsx"
SHEET_NAME = 'Sheet1'   # worksheet that holds the article list
ROW_LIMIT = 50          # only the first ROW_LIMIT rows are crawled

# 🔹 Load the URL list and add an empty 'content' column for the crawl to fill in.
# The read and the head() stay in one chained expression so that the truncated
# frame is the only object kept alive.
df = pd.read_excel(INPUT_PATH, sheet_name=SHEET_NAME).head(n=ROW_LIMIT)
df['content'] = ''

# 🔹 Shared HTTP session (browser-like headers, including User-Agent)
# Every request in the crawl goes through this session, so all of them carry
# the headers below.
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/124.0.0.0 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Referer': 'https://www.google.com/',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    # The Referer above is google.com, which is a different site from the
    # article hosts being fetched, so the fetch-metadata value has to be
    # 'cross-site'. 'same-origin' contradicted the Referer.
    'Sec-Fetch-Site': 'cross-site',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document'
})

# 🔹 Sentence filter patterns
# A sentence is dropped when any `filter_contains` pattern is found in it
# (checked with .search), or when the whole sentence matches a `filter_full`
# pattern (checked with .fullmatch).
_CONTAINS_SOURCES = (
    r'(?i)\bimage credit\b',
    r'(?i)^advertisement\s*[:\-]?\s*',
)
_FULL_SOURCES = (
    r'^[A-Z][a-z]+(?: [A-Z][a-z]+)*\.?$',
)

filter_contains = [re.compile(source) for source in _CONTAINS_SOURCES]

filter_full = [re.compile(source) for source in _FULL_SOURCES]

# Capture the moment the crawl begins; the closing summary subtracts this
# value from the end time to report the elapsed duration.
start_time = datetime.now()
start_banner = f"Start Time: {start_time:%Y-%m-%d %H:%M:%S}"
print(start_banner)

# 🔹 Crawl every URL and extract the article body
MIN_TEXT_LENGTH = 200       # an extraction shorter than this triggers the next fallback
MIN_PARAGRAPH_LENGTH = 50   # <p> elements shorter than this are ignored by the fallbacks
MAX_FETCH_ATTEMPTS = 3      # total GET requests per URL while the server answers 410


def _fetch_html(url):
    """Download ``url`` through the shared ``session`` and return the decoded HTML.

    A 410 response is retried after a 2-second pause, up to
    ``MAX_FETCH_ATTEMPTS`` requests in total. ``raise_for_status()`` is applied
    to the last response received, so an error status raises instead of having
    its error page parsed as an article. That includes a 410 that survived
    every retry.
    """
    for attempt in range(1, MAX_FETCH_ATTEMPTS + 1):
        resp = session.get(url, timeout=15)
        if resp.status_code != 410 or attempt == MAX_FETCH_ATTEMPTS:
            break
        sleep(2)
    resp.raise_for_status()
    # Decode using the encoding detected from the response body.
    resp.encoding = resp.apparent_encoding
    return resp.text


def _long_paragraphs(node):
    """Return the stripped text of each <p> under ``node`` with at least MIN_PARAGRAPH_LENGTH characters."""
    stripped = (p.get_text(strip=True) for p in node.find_all('p'))
    return [para for para in stripped if len(para) >= MIN_PARAGRAPH_LENGTH]


def _extract_with_newspaper(url, html):
    """Parse ``html`` with newspaper3k and return the article text on a single line.

    Table and list elements are removed from the article HTML before the text
    is read, and every run of whitespace is collapsed to one space.
    """
    article = Article(url, keep_article_html=True)
    article.set_html(html)  # reuse the HTML that was already downloaded
    article.parse()
    soup = BeautifulSoup(article.article_html, 'html.parser')

    # Drop tables and lists
    for tag in soup(['table', 'ul', 'ol', 'li', 'td', 'th', 'tr']):
        tag.decompose()

    text = soup.get_text(separator=' ').strip()
    return re.sub(r'\s+', ' ', text).strip()


def _extract_with_trafilatura(url, html):
    """Return trafilatura's extraction (comments and tables excluded), or '' if it yields nothing.

    trafilatura fetches the page itself first; the HTML already downloaded is
    used when that fetch returns nothing.
    """
    downloaded = trafilatura.fetch_url(url) or html
    return trafilatura.extract(
        downloaded,
        include_comments=False,
        include_tables=False
    ) or ''


def _extract_with_readability(html):
    """Return the long paragraphs of readability's summary, joined by blank lines."""
    summary_soup = BeautifulSoup(Document(html).summary(), 'html.parser')
    return '\n\n'.join(_long_paragraphs(summary_soup))


def _largest_block_text(html):
    """Return text from the <article>/<main>/<div>/<section> holding the most text.

    <script> and <style> are removed first. The long paragraphs of the chosen
    element are preferred; if it has none, all of its text is returned. When
    the page has no candidate element, <body> is used, and when there is no
    <body> either, the whole document is used.
    """
    soup = BeautifulSoup(html, 'html.parser')
    for tag in soup(['script', 'style']):
        tag.decompose()
    candidates = soup.find_all(['article', 'main', 'div', 'section'])
    best = max(candidates, key=lambda t: len(t.get_text()), default=None)
    if best is None:
        best = soup.body if soup.body is not None else soup
    return '\n\n'.join(_long_paragraphs(best)) or best.get_text(strip=True)


def _extract_article_text(url, html, idx):
    """Run the extractors in order until one yields at least MIN_TEXT_LENGTH characters.

    Order: newspaper3k, trafilatura, readability, largest block. A fallback
    only replaces the current text when its own result is longer, so a short
    but non-empty extraction is never thrown away for a shorter or empty one.
    ``idx`` is used only to label the newspaper3k failure message.
    """
    # 2) newspaper3k (fed the already-downloaded HTML via set_html)
    try:
        text = _extract_with_newspaper(url, html)
    except Exception as e:
        # Best effort: report the failure and let the fallbacks take over.
        print(f"[{idx}] newspaper3k 실패: {e}")
        text = ''

    # 3) trafilatura as a fallback
    if len(text) < MIN_TEXT_LENGTH:
        extracted = _extract_with_trafilatura(url, html)
        if len(extracted) > len(text):
            text = extracted.strip()

    # 4) readability summary + <p> extraction
    if len(text) < MIN_TEXT_LENGTH:
        candidate = _extract_with_readability(html)
        if len(candidate) > len(text):
            text = candidate

    # 5) element with the most text
    if len(text) < MIN_TEXT_LENGTH:
        candidate = _largest_block_text(html)
        if len(candidate) > len(text):
            text = candidate

    return text


def _filter_sentences(text):
    """Split ``text`` after '.', '?' or '!' and drop sentences matched by the filter patterns.

    A sentence is dropped when any ``filter_contains`` pattern is found in it
    or when it fully matches a ``filter_full`` pattern. The remaining
    sentences are joined with single spaces.
    """
    kept = []
    for sentence in re.split(r'(?<=[\.\?\!])\s+', text):
        sentence = sentence.strip()
        if any(p.search(sentence) for p in filter_contains):
            continue
        if any(p.fullmatch(sentence) for p in filter_full):
            continue
        kept.append(sentence)
    return ' '.join(kept).strip()


# Pair each URL with its real index label so the result is written to the row
# it came from, even if the frame's index is not 0..n-1.
for idx, url in tqdm(zip(df.index, df['url']), total=len(df), desc="Progress"):
    try:
        # 1) Request the HTML (retrying on 410)
        html = _fetch_html(url)
        text = _filter_sentences(_extract_article_text(url, html, idx))
    except Exception as e:
        # Keep the crawl going; the failure reason is stored in place of the content.
        text = f'Error: {e}'

    # 🔹 Store the result
    df.at[idx, 'content'] = text

# 🔹 Report when the crawl finished and how long it ran
# (elapsed time is measured from the `start_time` captured before the loop)
end_time = datetime.now()
print(
    f"End Time: {end_time:%Y-%m-%d %H:%M:%S}",
    f"Time Consumed: {end_time - start_time}",
    sep='\n',
)


Start Time: 2025-06-25 10:27:26


Progress: 100%|██████████| 50/50 [01:17<00:00,  1.55s/it]

End Time: 2025-06-25 10:28:44
Time Consumed: 0:01:17.530413





In [15]:
# Spot-check: show the text stored in 'content' for the row labelled 2.
df.at[2, "content"]

"Summary So, how was Samsung's Galaxy Unpacked 2025 event for you? If you missed the event, the tech giant revealed its newest set of Samsung Galaxy S25 devices, with something for everyone. The best bit is, if you want to snag one ASAP, the company has opened up pre-orders starting today, with general availability landing on February 7th. Samsung reveals the Galaxy S25, S25+, and S25 Ultra As announced during the Samsung Galaxy Unpacked event, the Galaxy S25, S25+, and S25 Ultra are now official. As Samsung fans have come to expect, each entry in the range corresponds to different needs and budgets, with the S25 being the more affordable choice and the Ultra being the luxury pick. So, what's on these phones? Regardless of which model you pick, you're getting a slick Snapdragon 8 Elite CPU and an AMOLED display. The more expensive models have bigger screens, going from the S25's 6.2'' screen to the Ultra's 6.9''. Each phone has a 12MP Wide AF selfie camera that can record 4K video at 6

In [16]:
# Spot-check: show the text stored in 'content' for the row labelled 0.
df.at[0, "content"]

'Well, the Galaxy S25 Ultra screen takes the anti-reflective coating of its predecessor and runs with it, debuting the newest Gorilla Armor 2 glass by Corning . Samsung\'s investment in Corning is being paid off by getting exclusive access to the best reinforced glass technology out there, and the S25 Ultra is a shining example. This offer is not available in your area. Get the mighty Samsung Galaxy S25 Ultra for up to $630 off with the official Samsung Store. To get the discount, you must provide an eligible trade-in in good condition. Not only is the Corning Gorilla Armor 2 glass over the S25 Ultra\'s high-res OLED display more than two times stronger than before, but it offers the first scratch-resistant, anti-reflective glass ceramic cover glass available on a phone. David Velasquez, Vice President and General Manager, Corning Gorilla Glass, January \'25 Receive the latest mobile news By subscribing you agree to our terms and conditions and privacy policy The new Gorilla Armor 2 li

In [19]:
# Spot-check: show the text stored in 'content' for the row labelled 4.
df.at[4, "content"]

'It\'s hard to see value in the plethora of AI features tech companies have dumped on us in recent years. Do I really want AI to write my texts and emails? How often do I actually need to summarize a PDF ? Is a custom emoji of a dog in a Santa hat riding a skateboard (while cute) really worth dropping several hundred dollars on the latest AI-supercharged phone? Perhaps companies are picking up on our AI fatigue , or perhaps the technology is simply getting smarter and more intuitive. But during its Unpacked event on Wednesday, Samsung unveiled a slew of AI-powered features that the company says can pick up on our habits, offer helpful suggestions and handle mundane tasks via the S25 series . The focus wasn\'t just on flexing AI capabilities, but also demonstrating how they might fit into more practical applications like streamlining everyday activities and navigating your phone\'s functions -- all without necessarily drawing your attention to the AI running in the background. It\'s a s

In [21]:
# 🔹 Save the results to Excel
# Scraped text can contain ASCII control characters, which openpyxl rejects
# when writing a cell (IllegalCharacterError). One such character would abort
# the export after the whole crawl, so they are stripped first. Tab, newline
# and carriage return are allowed by openpyxl and are kept.
_EXCEL_ILLEGAL_CHARS = re.compile(r'[\x00-\x08\x0b\x0c\x0e-\x1f]')


def _excel_safe(value):
    """Return ``value`` with Excel-illegal control characters removed; non-str values pass through unchanged."""
    return _EXCEL_ILLEGAL_CHARS.sub('', value) if isinstance(value, str) else value


# Clean a copy so the in-memory `df` keeps the text exactly as it was crawled.
export_df = df.copy()
export_df['content'] = export_df['content'].map(_excel_safe)
export_df.to_excel(r"C:\Users\KEARNEY\Desktop\url 크롤링\결과.xlsx", sheet_name='결과', index=False, engine='openpyxl')

