# Crawling BBC and Translate
- BBC > Home > Most Watch top 10 가져와보기

In [1]:
# !pip install selenium
# !pip install webdriver_manager
!pip install openai==0.28



In [2]:
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException
import openai
import requests
from bs4 import BeautifulSoup #bs4 모듈안에 있는 BeautifulSoup를 불러옴.

# 크롬드라이버 셋팅(내 user-agent 검색해서 집어넣기)
def set_chrome_driver(headless=True):
    options = webdriver.ChromeOptions()
    if headless:
        options.add_argument('headless')
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    return driver

In [3]:
# OPENAI API키 설정
openai.api_key = "개인 API"

In [4]:
# article = soup.find_all("p", {"class":"ssrcss-1q0x1qg-Paragraph e1jhz7w10"})
# texts = []
# for i in article :
#      texts.append(i.text)

# paragraph = ' '.join(texts)

## 함수들 정의

In [5]:
# 뉴스 페이지 크롤링> 뷰티풀숩 사용
def crawl_page(url):
    try:
        resp = requests.get(url)
        soup = BeautifulSoup(resp.text,"html.parser")
        # BBC 기사의 텍스트 포함된 class 입력
        article = soup.find_all("p", {"class":"ssrcss-1q0x1qg-Paragraph e1jhz7w10"})
        texts = []
        for i in article :
            texts.append(i.text)
        paragraph = ' '.join(texts)
    except NoSuchElementException:
        paragraph = ""
    return paragraph

# ChatGPT 요약
def summarize(text):
    # 모델 엔진 선택
    model = "text-davinci-003"
    # 맥스 토큰
    max_tokens = 2500
    # 프롬프트 (요약해줘!)
    prompt = f'''Summarize the paragraph below and interpret whether it is a positive or negative sentiment.
    {text}
    '''

    # 요약 요청
    completion = openai.Completion.create(
        engine=model,
        prompt=prompt,
        max_tokens=max_tokens,
        temperature=0.5,      # creativity

    )
    return completion.choices[0].text

# 파파고 번역
def papago_translate(text):
    try:
        papago = set_chrome_driver(False)
        papago.get('https://papago.naver.com/')
        time.sleep(1)
        papago.find_element(By.ID, 'txtSource').send_keys(text)
        papago.find_element(By.ID, 'btnTranslate').click()
        time.sleep(2)
        papago_translated = papago.find_element(By.ID, 'targetEditArea')
        result = papago_translated.text
    except NoSuchElementException: # 예외처리 (요소를 찾지 못하는 경우)
        result = '번역 오류ㅠㅠ'
    finally:
        papago.close()
    return result

# 최종 wrapper
def summarize_news(url):
    page = crawl_page(url) # 위의 크롤링 def
    summarized = summarize(page)
    print('<원문 요약>')
    print(summarized)
    korean_translated = papago_translate(summarized)
    print('\n<한글 요약본>')
    print(korean_translated)
    return korean_translated

In [6]:
# def crawl_page(url):
#     try:
#         resp = requests.get(url)
#         soup = BeautifulSoup(resp.text,"html.parser")
#         # BBC 기사의 텍스트 포함된 class 입력
#         article = soup.find_all("p", {"class":"ssrcss-1q0x1qg-Paragraph e1jhz7w10"})
#         texts = []
#         for i in article :
#             texts.append(i.text)
#         text = ' '.join(texts)
#     except NoSuchElementException:
#         text = ""
#     return text

# text = crawl_page('https://www.bbc.com/news/world-us-canada-67899435')
# text

## BBC top10 뉴스에 대해서 전체 요약

In [7]:
# driver 설정
driver = set_chrome_driver(False)
# URL 요청
url = 'https://www.bbc.com/news'
driver.get(url)

# 뉴스 페이지 크롤링
top10_links = []

# # most read의 ID 리스트 만들기 > 쓸모없음... 빈
# ID = []
# for i in range(1,11):
#     ID.append(f'most-popular-read-{i}')

# 진짜 뒤질뻔했네... 홈페이지 왜이래
for link in driver.find_elements(By.CLASS_NAME, 'gs-o-media__body'):
    top10_links.append(link.find_element(By.CSS_SELECTOR, 'a').get_attribute('href'))
    
top10 = top10_links[7:17]

In [8]:
top10

['https://www.bbc.com/news/world-us-canada-67899435',
 'https://www.bbc.com/news/world-us-canada-67878504',
 'https://www.bbc.com/news/world-europe-67659275',
 'https://www.bbc.com/news/world-asia-india-67871797',
 'https://www.bbc.com/news/business-67869653',
 'https://www.bbc.com/news/world-us-canada-67898569',
 'https://www.bbc.com/news/uk-67884785',
 'https://www.bbc.com/news/world-europe-67899408',
 'https://www.bbc.com/news/world-us-canada-67899564',
 'https://www.bbc.com/news/world-us-canada-67898784']

## openai 사용하려면 paid user 여야하고, 카드 등록해야함....

In [9]:
# 크롤링+요약+번역
top10_summarize = []

for link in top10:
    output = summarize_news(link)
    top10_summarize.append(output)
    print()

<원문 요약>

This paragraph summarizes the US Supreme Court's decision to hear a case that will determine whether Donald Trump can run for president in 2024. It outlines the legal challenges that have been brought against Mr Trump, and the arguments being made by his legal team. The court's decision to expedite the case suggests that they will issue a ruling before the Super Tuesday primary election in March. The overall sentiment of the paragraph is neutral, as it simply outlines the facts of the case.

<한글 요약본>
이 단락은 도널드 트럼프가 2024년 대선에 출마할 수 있는지를 결정할 사건을 심리하라는 미국 연방대법원의 결정을 요약한 것이다. 트럼프에게 제기된 법적 도전과 그의 법률팀이 주장하는 내용이 요약되어 있다. 법원의 신속한 사건 처리 결정은 그들이 3월 슈퍼 화요일 예비선거 전에 판결을 내릴 것임을 시사한다. 이 단락의 전체적인 감정은 사건의 사실관계를 간단히 요약하는 것으로 중립적이다.

<원문 요약>

This is a positive sentiment about Glynn Simmons, who was wrongfully convicted and spent nearly half a century in prison for a murder he did not commit. After being released and declared innocent in December 2023, Simmons spoke to the BBC about his newfound

InvalidRequestError: This model's maximum context length is 4097 tokens, however you requested 4241 tokens (1741 in your prompt; 2500 for the completion). Please reduce your prompt; or completion length.