# 0. 패키지 인스톨

In [1]:
!pip install -r requirements.txt

Collecting selenium (from -r requirements.txt (line 14))
  Using cached selenium-4.20.0-py3-none-any.whl.metadata (6.9 kB)
Collecting trafilatura (from -r requirements.txt (line 15))
  Using cached trafilatura-1.8.1-py3-none-any.whl.metadata (14 kB)
Collecting trio~=0.17 (from selenium->-r requirements.txt (line 14))
  Using cached trio-0.25.0-py3-none-any.whl.metadata (8.7 kB)
Collecting trio-websocket~=0.9 (from selenium->-r requirements.txt (line 14))
  Using cached trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting courlan>=1.0.0 (from trafilatura->-r requirements.txt (line 15))
  Downloading courlan-1.1.0-py3-none-any.whl.metadata (18 kB)
Collecting htmldate>=1.8.0 (from trafilatura->-r requirements.txt (line 15))
  Using cached htmldate-1.8.1-py3-none-any.whl.metadata (10 kB)
Collecting justext>=3.0.0 (from trafilatura->-r requirements.txt (line 15))
  Using cached jusText-3.0.0-py2.py3-none-any.whl.metadata (6.8 kB)
Collecting lxml>=4.8 (from pikepdf==8.12.0->-r


## 2. Selenium을 활용한 크롤링 샘플 코드
> Chrome 및 Chrom driver의 stable 버전은 [이 URL](https://googlechromelabs.github.io/chrome-for-testing/#stable) 에서 다운로드 받았습니다.

#### 2-1. 리눅스용 Chrome Driver (123.0.6312.105 버전) 다운로드

In [2]:
## Chrome 드라이버 다운로드 및 설치는 필수 입니다.

!wget https://storage.googleapis.com/chrome-for-testing-public/123.0.6312.105/linux64/chromedriver-linux64.zip

!unzip -o chromedriver-linux64.zip


--2024-04-30 13:23:58--  https://storage.googleapis.com/chrome-for-testing-public/123.0.6312.105/linux64/chromedriver-linux64.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 172.217.14.219, 172.217.14.251, 142.250.69.219, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|172.217.14.219|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 8636471 (8.2M) [application/zip]
Saving to: ‘chromedriver-linux64.zip’


2024-04-30 13:23:58 (117 MB/s) - ‘chromedriver-linux64.zip’ saved [8636471/8636471]

Archive:  chromedriver-linux64.zip
  inflating: chromedriver-linux64/LICENSE.chromedriver  
  inflating: chromedriver-linux64/chromedriver  


In [3]:
# Chrome 드라이버 버전 확인

!./chromedriver-linux64/chromedriver --version

ChromeDriver 123.0.6312.105 (399174dbe6eff0f59de9a6096129c0c827002b3a-refs/branch-heads/6312@{#761})


#### 2-2. Google-chome (123.0.6312.105 버전) 설치

In [4]:
## Chrome 다운로드 및 설치는 필수 입니다.

!wget https://storage.googleapis.com/chrome-for-testing-public/123.0.6312.105/linux64/chrome-linux64.zip

!unzip -o chrome-linux64.zip


--2024-04-30 13:24:03--  https://storage.googleapis.com/chrome-for-testing-public/123.0.6312.105/linux64/chrome-linux64.zip
Resolving storage.googleapis.com (storage.googleapis.com)... 142.251.215.251, 142.250.217.123, 142.250.217.91, ...
Connecting to storage.googleapis.com (storage.googleapis.com)|142.251.215.251|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 150006507 (143M) [application/zip]
Saving to: ‘chrome-linux64.zip’


2024-04-30 13:24:05 (76.7 MB/s) - ‘chrome-linux64.zip’ saved [150006507/150006507]

Archive:  chrome-linux64.zip
  inflating: chrome-linux64/ABOUT    
  inflating: chrome-linux64/MEIPreload/manifest.json  
  inflating: chrome-linux64/MEIPreload/preloaded_data.pb  
  inflating: chrome-linux64/chrome   
  inflating: chrome-linux64/chrome-wrapper  
  inflating: chrome-linux64/chrome_100_percent.pak  
  inflating: chrome-linux64/chrome_200_percent.pak  
  inflating: chrome-linux64/chrome_crashpad_handler  
  inflating: chrome-linux64/chro

In [5]:
# Chrome 버전 확인

!./chrome-linux64/chrome --version

Google Chrome for Testing 123.0.6312.105 


In [6]:
!pip install selenium



#### 2-3. Chrome Driver 설정

In [7]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options

# Selenium 드라이버 옵션 설정
chrome_options = Options()
chrome_options.add_argument('--headless')
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--disable-dev-shm-usage')
chrome_options.binary_location = './chrome-linux64/chrome'  # 크롬 브라우저 경로 지정

# 크롬 드라이버 경로 설정
driver_path = "./chromedriver-linux64/chromedriver"

In [8]:
# 크롬 드라이버 서비스 객체 생성
service = Service(driver_path)

# 크롬 드라이버 실행
driver = webdriver.Chrome(service=service, options=chrome_options)

#### 2-3. 크롤링 테스트

In [9]:
from datetime import date

# 오늘 날짜 가져오기 (YYYY-MM-DD)
today = date.today().strftime('%Y-%m-%d')


In [10]:
# 크롤링 결과 저장 디렉토리 지정 (ex: ./crawling/2024-04-30/)

import os

# 현재 작업 디렉토리 경로 가져오기
current_dir = os.getcwd()

# 생성할 디렉토리 경로
temp_dir = os.path.join(current_dir, f"crawling/{today}")

# 디렉토리가 없으면 생성
if not os.path.exists(temp_dir):
    os.makedirs(temp_dir)
    print(f"{temp_dir} 디렉토리가 생성되었습니다.")
else:
    print(f"{temp_dir} 디렉토리가 이미 존재합니다.")

/home/ec2-user/SageMaker/2024-ebp/crawling/2024-04-30 디렉토리가 생성되었습니다.


In [11]:
import os
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import random
import pandas as pd
import warnings
import trafilatura   #본문만 갖고 오는 함수
warnings.filterwarnings('ignore')

words = ['BESS project'] # 'solar PV project' #,'AUSTRALIA SOLAR FARM','AUSTRALIA BESS','SAUDI PV','SAUDI SOLAR FARM','SAUDI BESS','nuclear power plant']

word_list = []
title_list = []
content_list = []
link_list = []
date_list = []

url = 'https://www.google.com/search?q={}&newwindow=1&sca_esv=600662400&tbs=cdr:1,cd_min:1/1/2024,cd_max:4/16/2024&tbm=nws&ei=vGmvZaObBLPd1e8PirSVkAY&start={}&sa=N&ved=2ahUKEwij6Ye5_fKDAxWzbvUHHQpaBWI4KBDy0wN6BAgEEAQ&biw=1536&bih=735&dpr=1.25'
#날짜 설정

for word in words:
    page = 1
    for i in range(0, 60, 10):     #페이지 수   
        # url {}에 순서대로 word,i 넣기
        new_url = url.format(word, i)

        # 크롬 드라이버에 url 주소 넣고 실행
        driver.get(new_url)

        # 페이지가 완전히 로딩되도록 3초동안 기다림
        time.sleep(3)

        print("*" * 10 + str(page) + "*" * 10)
        page = page + 1
        
        
        titles = driver.find_elements(By.CLASS_NAME, 'n0jPhd.ynAwRc')
        for title in titles:
            word_list.append(word)
            title_list.append(title.text.replace("," , ""))
             # print(title.text) # 기사 제목 확인

        contents = driver.find_elements(By.CLASS_NAME, 'GI74Re.nDgy9d')
        for content in contents:
            content_list.append(content.text.replace("," , ""))

        dates = driver.find_elements(By.CLASS_NAME, 'OSrXXb.rbYSKb.LfVVr')
        for date in dates:
            date_list.append(date.text.replace("," , ""))

        links = driver.find_elements(By.CLASS_NAME, 'WlydOe')
        for link in links:
            link_list.append(link.get_attribute('href'))

        if page == 32:  # 페이지 수정
            break


# 링크내 기사 본문 갖고 오기

def extract_content(link_list):  
    content_full = []
    for link in link_list:
        try:
            downloaded = trafilatura.fetch_url(link)
            extracted = trafilatura.extract(downloaded)
            content_full.append(extracted)
        except:
            content_full.append(None)
    return content_full

# 사용 예시
# link_list = ['https://www.example.com/article1', 'https://www.example.com/article2', 'https://www.example.com/article3']
content_full = extract_content(link_list)

# 결과 확인
len(content_full)            
            
data = {'WORD': word_list, 'TITLE': title_list, 'CONTENT': content_list, 'CONTENT_FULL': content_full,'LINK': link_list, 'DATE': date_list}
df = pd.DataFrame(data)
df.to_csv(f"{temp_dir}/{today}_origin.csv", encoding="utf-8-sig")       # 일별 파일명설정
df.to_excel(f"{temp_dir}/{today}_origin.xlsx")

driver.quit()

**********1**********
**********2**********
**********3**********
**********4**********
**********5**********
**********6**********


In [12]:
 len(link_list)

60

## 2. LangChain을 활용한 Bedrock Claude 3 호출

#### 2-1. LangChain의 ChatBedrock을 활용한 가장 간단한 코드 예시

In [13]:
# len(link_list)

## 3. LLM 활용 

### 3.1 기사내용 요약, 국가, 도시 찾기 후 파싱하여 list로 넣기

In [14]:
import os
#from langchain_community.chat_models import BedrockChat
from langchain_aws import ChatBedrock
from botocore.config import Config
from bs4 import BeautifulSoup

config = Config(read_timeout=1000)

def parsing_data(content):
    summary_list = []
    summary_kor_list = []
    country_list =[]
    city_list =[]
    soup = BeautifulSoup(content, 'html.parser')
    
    for i in range(0, len(content_full)):
        summary = soup.find(f'summary_{i}')
        # summary_kor = soup.find(f'summary_kor_{i}')
        city = soup.find(f'city_{i}')
        country = soup.find(f'country_{i}')
        if summary:
            summary_list.append(summary.text.strip())

            country_list.append(country.text.strip() if country is not None else 'NaN')
            city_list.append(city.text.strip() if city is not None else 'NaN')

            # summary_kor_list.append(summary.text.s trip())        
            # country_list.append(country.text.strip())
            # city_list.append(city.text.strip())
    # print(summary_list)
    print(city_list)
    return summary_list, country_list, city_list

def get_text_response(input_content):
    llm = ChatBedrock(
        credentials_profile_name=os.environ.get("BWB_PROFILE_NAME"),
        region_name=os.environ.get("BWB_REGION_NAME"),
        endpoint_url=os.environ.get("BWB_ENDPOINT_URL"),
        model_id="anthropic.claude-3-haiku-20240307-v1:0",
        model_kwargs={
            "max_tokens": 4096,   # maximum : 4096
            "temperature": 0,
            "top_p": 0.01,
            "top_k": 0,
        } , 
        config = config
    )
    return llm.predict(input_content)


if __name__ == "__main__":

    response_content = []
    total_summary_list =[]
    total_country_list = []
    total_city_list = [] 

    for start in range(0, 101, 10):
        end = start + 10
        content_prompt = ""
        for doc_i in range(start, end):
            if doc_i < len(content_full):
                content_prompt += f"""<content_{doc_i}>{content_full[doc_i]}</content_{doc_i}>"""

        prompt = f"""

        다음 <content></content> 태그 안에 있는 본문의 내용을 <instructions></instructions> 지침에 따라 자세한 요약 메모를 작성하세요:

        <content> 
        {content_prompt}
        </content> 

        <instructions>
          각 <content> 기사의 주요 내용을 3~5문장으로 영어로 요약하고 요약 내용을 list에 삽입해줘.  
          요약시 상세한 숫자를 기입해서 작성해줘.
          각 <content> 에서 해당 국가정보와 도시정보가 있으면 국가정보는 country에, 도시정보는 city에 각각 영어로 한개의 값만 list에 삽입해주고 해당 정보가 없으면 'null'으로 처리해줘.

          summary list 는 <summary_NUMBER> 태그 안에 넣어서 출력해줘
          country list 는 <country_NUMBER> 태그 안에 넣어서 출력해줘
          city list 는 <city_NUMBER> 태그 안에 넣어서 출력해줘
        </instructions>


        """

        response = get_text_response(input_content=prompt)
        response_content.append(response)
        
    for i, content in enumerate(response_content):
        # print(f"Response {i}:")
        # print(content)
        summary_list, country_list, city_list = parsing_data(content)
        total_summary_list.extend(summary_list)
        total_country_list.extend(country_list)
        total_city_list.extend(city_list)

print(len(total_summary_list))


  warn_deprecated(


['California', 'Texas', 'Arizona', 'Texas, Arizona', 'Utah', 'Arizona', 'Arizona', 'Michigan', 'Indiana', 'null']
['Arizona', 'Kern County', 'Fatehgarh, Uttar Pradesh', 'Uusikaupunki', 'Grand Terrace', 'null', 'Menifee', 'Nurmijärvi', 'Qianjiang', 'Kern County']
['Hawaii', 'South Australia', 'null', 'New York City', 'Vlissingen', 'Austin', 'Groningen', 'null', 'Galveston County', 'null']
['Arizona', 'Texas', 'Rajnandgaon', 'Antofagasta', 'null', 'null', 'Flevoland', 'Lappeenranta', 'South Australia', 'Texas']
['Illinois', 'Antofagasta', 'Melbourne', 'New Mexico', 'Arizona', 'Morro Bay', 'Warrington', 'California', 'null', 'Västernorrland']
['Noordoostpolder', 'null', 'Kilmarnock', 'null', 'Lolland', 'Antofagasta', 'Vilvoorde', 'Arizona', 'California', 'null']
['null']
[]
[]
['null']
[]
62


In [15]:
len(link_list)

60

## 3.2 LLM으로 날짜 수정하기 (잘안됨)

### 기존 결과값이랑 합치기

In [16]:
# # CITY 열의 공백 값을 np.nan으로 대체
# df['CITY'] = df['CITY'].replace('', np.nan)
# df['COUNTRY'] = df['COUNTRY'].replace('',np.nan)
# df['REAL_DATE'] = df['REAL_DATE'].replace('',np.nan)

In [17]:
# # TITLE과 SUMMARY 열 합치기
# df['CONTENTS'] = df['TITLE'] + '\n' + df['SUMMARY']

In [18]:
# df=pd.read_csv("240418_content_full.csv")

In [19]:
data2 = {'SUMMARY': total_summary_list, 'COUNTRY': total_country_list, 'CITY': total_city_list}
df2 = pd.DataFrame(data2)

In [20]:
df0= pd.concat([df,df2], axis=1)
df0.to_csv(f"{temp_dir}/{today}_news_result.csv", encoding="utf-8-sig")

In [21]:
df0.to_excel(f"{temp_dir}/{today}_news_result.xlsx")

### S3에 업로드

In [23]:
import boto3
import os
from datetime import date

s3 = boto3.client('s3')

# 버킷 이름
bucket_name = 'jesamkim-temp-20240409'

# 오늘 날짜 가져오기 (YYYY-MM-DD)
today = date.today().strftime('%Y-%m-%d')

# 오늘 날짜 디렉토리 경로
prefix = today + '/'

# S3 버킷에 디렉토리 생성
try:
    s3.put_object(Bucket=bucket_name, Key=prefix)
    print(f"디렉토리 {prefix} 생성 완료")
except Exception as e:
    print(e)

# 로컬 ./crawlong/yyyy-mm-dd 폴더의 모든 xlsx 파일 업로드
for filename in os.listdir(f'{temp_dir}/'):
    if filename.endswith('.xlsx'):
        local_path = os.path.join(f'{temp_dir}/', filename)
        s3_path = os.path.join(prefix, filename)
        try:
            s3.upload_file(local_path, bucket_name, s3_path)
            print(f"파일 {filename} 업로드 완료")
        except Exception as e:
            print(e)


디렉토리 2024-04-30/ 생성 완료
파일 2024-04-30_origin.xlsx 업로드 완료
파일 2024-04-30_news_result.xlsx 업로드 완료
