<a href="https://colab.research.google.com/github/kimdonggyu2008/2024_2_Capstone/blob/main/Crawling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
from transformers import BartForConditionalGeneration, BartTokenizer, AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import pandas as pd

# Select the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the BART summarization model (moved to `device`) and its tokenizer
summarizer_model = BartForConditionalGeneration.from_pretrained("facebook/bart-large-cnn").to(device)
summarizer_tokenizer = BartTokenizer.from_pretrained("facebook/bart-large-cnn")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [3]:

# Initialize the NLLB model
def load_translation_models():
    """
    Load the NLLB checkpoint once and build a lookup table of language pairs.

    Returns:
        dict: For every intermediate language in ('fra', 'deu', 'rus', 'jpn')
        there are two entries, keyed "eng_<lang>" and "<lang>_kor". Each entry
        is a dict with the keys "model", "tokenizer" and "target_lang"; all
        entries share the same model and tokenizer objects.

    NOTE(review): "target_lang" holds short codes ('fra', 'kor'), while later
    cells of this notebook hand codes such as 'fra_Latn' / 'kor_Hang' to the
    tokenizer -- confirm which form the consumers of this table expect.
    """
    checkpoint = "facebook/nllb-200-3.3B"  # large NLLB checkpoint (3.3B version)

    # A single tokenizer/model pair serves every language direction
    shared_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    shared_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)

    def _entry(target):
        # Fresh dict per pair; only the model/tokenizer objects are shared
        return {"model": shared_model, "tokenizer": shared_tokenizer, "target_lang": target}

    pair_table = {}
    for code in ('fra', 'deu', 'rus', 'jpn'):  # French, German, Russian, Japanese
        pair_table[f"eng_{code}"] = _entry(code)
        pair_table[f"{code}_kor"] = _entry("kor")
    return pair_table

# Build the language-pair lookup table at module level (loads the NLLB checkpoint).
# NOTE(review): no visible cell reads `translation_models`, and a later cell
# redefines `load_translation_models` with a different return value -- confirm
# whether this cell is still needed.
translation_models = load_translation_models()


Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00001-of-00003.bin:  47%|####7     | 3.26G/6.93G [00:00<?, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/8.55G [00:00<?, ?B/s]

pytorch_model-00003-of-00003.bin:   0%|          | 0.00/2.10G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

In [5]:

def detect_site(url):
    """
    Identify which supported news site a URL belongs to.

    The check is done on the parsed host name rather than the raw network
    location, so it is case-insensitive and ignores any userinfo or port
    (e.g. "https://bbc@example.com/" is NOT treated as BBC).

    Args:
        url (str): Absolute URL of the article.

    Returns:
        str | None: "bbc" or "cnn" when that token occurs in the host name
        (BBC is checked first), otherwise None.
    """
    # `hostname` is already lower-cased by urllib and is None when the URL
    # carries no network location at all.
    host = urlparse(url).hostname or ""

    for site in ("bbc", "cnn"):
        if site in host:
            return site
    return None

def get_filtered_main_content(url, timeout=10):
    """
    Crawl the article body text from a supported news URL.

    Args:
        url (str): Absolute URL of the article page.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        str | None: The article paragraphs joined by blank lines (possibly an
        empty string when no paragraph matched the site's selector), or None
        when the site is unsupported or the page could not be fetched.
        Fetch failures are reported on stdout instead of being returned as
        text, so callers never mistake an error message for article content.
    """
    # CSS selector of the article paragraphs for each supported site
    selectors = {
        "bbc": 'div[data-component="text-block"]',
        "cnn": "p.paragraph.inline-placeholder.vossi-paragraph",
    }

    # Decide support before touching the network: unsupported sites cost nothing
    selector = selectors.get(detect_site(url))
    if selector is None:
        return None  # unsupported site

    # Fetch the HTML; a timeout keeps one dead host from hanging the whole run
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Failed to fetch the webpage! URL: {url} | Error: {exc}")
        return None
    if response.status_code != 200:
        print(f"Failed to fetch the webpage! URL: {url} (HTTP {response.status_code})")
        return None

    # The raw bytes are parsed, so BeautifulSoup performs its own encoding
    # detection; setting `response.encoding` would have no effect here.
    soup = BeautifulSoup(response.content, 'html.parser')

    blocks = [node.get_text(strip=True) for node in soup.select(selector)]
    return "\n\n".join(blocks).strip()

def split_text_with_last_sentence_overlap(text, target_chunk_length=1024):
    """
    Split long text into sentence-aligned chunks that overlap by one sentence.

    Sentences are accumulated until adding the next one would exceed
    `target_chunk_length` whitespace-separated words. Each following chunk
    starts with the last sentence of the previous chunk so that context is
    carried across the boundary; the overlap is dropped when it would not fit
    together with the next sentence.

    Args:
        text (str): Text to split. Empty or whitespace-only text yields [].
        target_chunk_length (int): Word budget per chunk. A single sentence
            longer than the budget is kept whole as its own chunk.

    Returns:
        list[str]: Non-empty chunks in document order.
    """
    if not text or not text.strip():
        return []

    # Split after sentence-ending punctuation followed by spaces
    sentences = [s for s in re.split(r'(?<=[.!?]) +', text.strip()) if s.strip()]

    chunks = []
    current = []        # sentences of the chunk being built
    current_words = 0   # running word count of `current`

    for sentence in sentences:
        n_words = len(sentence.split())

        # An empty chunk always accepts the sentence, so an oversized first
        # sentence never produces an empty chunk.
        if not current or current_words + n_words <= target_chunk_length:
            current.append(sentence)
            current_words += n_words
            continue

        chunks.append(" ".join(current).strip())

        # Seed the next chunk with the previous chunk's last sentence
        overlap = current[-1]
        overlap_words = len(overlap.split())
        if overlap_words + n_words <= target_chunk_length:
            current = [overlap, sentence]
            current_words = overlap_words + n_words
        else:
            current = [sentence]
            current_words = n_words

    if current:
        chunks.append(" ".join(current).strip())

    return chunks

def summarize_long_text(article_text, target_chunk_length=1024):
    """
    Summarize long text chunk by chunk and join the partial summaries.

    Args:
        article_text (str): Full article text.
        target_chunk_length (int): Word budget handed to the splitter. It is
            also used as the token cap for the tokenizer, clamped to the
            tokenizer's `model_max_length`; a chunk whose tokenization exceeds
            the cap is truncated.

    Returns:
        str: The chunk summaries joined by single spaces. Empty chunks and
        chunks whose summarization raised are skipped (the error is printed),
        so no placeholder text ends up in the summary. The result is an empty
        string when nothing could be summarized.
    """
    chunks = split_text_with_last_sentence_overlap(article_text, target_chunk_length)

    # Never ask the tokenizer for more tokens than the model accepts
    max_input_tokens = min(
        target_chunk_length,
        getattr(summarizer_tokenizer, "model_max_length", target_chunk_length),
    )

    summaries = []
    for chunk in chunks:
        if not chunk or not chunk.strip():
            continue  # nothing to summarize

        try:
            inputs = summarizer_tokenizer(
                chunk,
                max_length=max_input_tokens,
                return_tensors="pt",
                truncation=True,
            ).to(device)
            summary_ids = summarizer_model.generate(
                inputs["input_ids"],
                attention_mask=inputs["attention_mask"],
                max_length=100,
                min_length=50,
                length_penalty=2.0,
                num_beams=2,
                early_stopping=True
            )
            summaries.append(
                summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            )
        except Exception as e:
            # Best effort: report the failure and continue with the next chunk
            print(f"Error processing chunk: {chunk[:100]}... | Error: {e}")

    return " ".join(summaries)

def create_summary_dataset(input_file, output_file):
    """
    Crawl every URL listed in a file, summarize it and save the results as CSV.

    Args:
        input_file (str): Path of a text file with URLs separated by commas
            and/or whitespace (including newlines).
        output_file (str): Path of the CSV to write. It always has the columns
            "url", "article" and "highlight", even when no URL succeeded.

    A URL that cannot be crawled or summarized is reported and skipped, so a
    single failure does not discard the results collected so far.
    """
    # utf-8-sig reads UTF-8 with or without a byte-order mark
    with open(input_file, 'r', encoding='utf-8-sig') as file:
        urls = [u for u in re.split(r'[,\s]+', file.read()) if u]

    results = []
    for url in urls:
        print(f"Processing URL: {url}")
        try:
            article = get_filtered_main_content(url)
            # The crawler may report a fetch failure as a message string;
            # that message must not be summarized as if it were an article.
            if not article or article.startswith("Failed to fetch the webpage!"):
                print(f"Skipping URL (no article found): {url}")
                continue

            highlight = summarize_long_text(article)
        except Exception as exc:
            print(f"Skipping URL (error: {exc}): {url}")
            continue

        results.append({"url": url, "article": article, "highlight": highlight})

    # Explicit columns keep the CSV header (and make it readable) when empty
    summary_df = pd.DataFrame(results, columns=["url", "article", "highlight"])
    summary_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"Summary dataset saved to {output_file}")

# Example run: build the summary dataset
if __name__ == "__main__":
    summary_input_file = "/content/drive/MyDrive/summarizer/data/test_summary.txt"  # path of the URL list file
    summary_output_file = "/content/drive/MyDrive/summarizer/data/crawled_summary.csv"  # path of the summary dataset CSV
    create_summary_dataset(summary_input_file, summary_output_file)

Processing URL: https://www.bbc.com/news/articles/cd6vpy8e6jvo
Processing URL: https://www.bbc.com/news/articles/cly20zz51j7o
Processing URL: https://www.bbc.com/future/article/20241115-how-robotaxis-are-trying-to-win-passengers-trust
Processing URL: https://www.bbc.com/news/articles/c9wrqg4vd2qo
Processing URL: https://www.bbc.com/news/articles/c5yxv41q235o
Processing URL: https://www.bbc.com/news/articles/c30p16gn3pvo
Processing URL: https://www.bbc.com/news/articles/cdenplz5j89o
Processing URL: https://www.bbc.com/future/article/20230306-just-how-loud-is-a-rocket-launch
Processing URL: https://www.bbc.com/future/article/20241111-stressed-writing-down-a-to-do-list-might-help
Processing URL: https://www.bbc.com/future/article/20201028-the-benefits-of-coffee-is-coffee-good-for-health
Processing URL: https://edition.cnn.com/2024/11/19/science/starship-test-flight-6-launch-spacex/index.html
Processing URL: https://edition.cnn.com/2024/11/18/science/mummified-sabre-toothed-cat-cub/index.h

In [9]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# GPU setup: use CUDA when available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize the translation model and tokenizer
def load_translation_models():
    """
    Load the NLLB translation checkpoint and its tokenizer.

    Returns:
        tuple: (model, tokenizer). The model is moved to `device` and loaded
        with forced_bos_token_id=256098; the tokenizer is created with
        src_lang="eng_Latn" and tgt_lang="kor_Hang".
    """
    checkpoint = "facebook/nllb-200-3.3B"

    # 256098 is presumably the token id of "kor_Hang" -- TODO confirm
    nllb_model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint, forced_bos_token_id=256098)
    nllb_model = nllb_model.to(device)

    nllb_tokenizer = AutoTokenizer.from_pretrained(
        checkpoint,
        src_lang="eng_Latn",
        tgt_lang="kor_Hang",
    )

    return nllb_model, nllb_tokenizer

# Module-level translation model and tokenizer, read as globals by translate_text
model, tokenizer = load_translation_models()


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# Scratch cell: a bare string literal (the NLLB checkpoint name); it assigns nothing
"facebook/nllb-200-3.3B"

In [11]:

def translate_text(text, src_lang, tgt_lang):
    """
    Translate text from `src_lang` into `tgt_lang` with the module-level model.

    The output language is selected by forcing the first generated token to be
    the target language code. Setting `tokenizer.tgt_lang` alone does not do
    this, and the model was loaded with a fixed `forced_bos_token_id`, so the
    id is passed explicitly on every call.

    Args:
        text (str): Text to translate. Input longer than 512 tokens is truncated.
        src_lang (str): Source language code (e.g. 'eng_Latn').
        tgt_lang (str): Target language code (e.g. 'kor_Hang').

    Returns:
        str: The translated text.

    Raises:
        ValueError: If `tgt_lang` is not a token known to the tokenizer.
    """
    # Configure the source and target languages on the tokenizer
    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    # Resolve the target language code to its token id
    tgt_lang_id = tokenizer.convert_tokens_to_ids(tgt_lang)
    if tgt_lang_id is None or tgt_lang_id == tokenizer.unk_token_id:
        raise ValueError(f"Unknown target language code: {tgt_lang}")

    # Convert the input text to tensors on the model's device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)

    # Translate; forced_bos_token_id overrides the default fixed at load time
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        forced_bos_token_id=tgt_lang_id,
        max_length=512,
        num_beams=4,
        early_stopping=True
    )
    translated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
    return translated_text

def create_translation_dataset(summary_df, output_file):
    """
    Build an English/Korean dataset by translating summaries through pivot languages.

    Each English summary is translated into every pivot language (French,
    German, Russian, Japanese) and from there into Korean, producing one
    output row per summary and pivot language.

    Args:
        summary_df (pd.DataFrame): Frame with a "highlight" column holding the
            English summaries.
        output_file (str): Path of the CSV to write. It always has the columns
            "ko" (Korean text obtained via the pivot language) and "en"
            (original English summary).

    Rows with a missing or empty highlight are skipped, and a failed
    translation is reported and skipped, so no placeholder text is written
    into the dataset.
    """
    # Pivot languages: language code -> display name used in messages
    target_languages = {
        "fra_Latn": "French",
        "deu_Latn": "German",
        "rus_Cyrl": "Russian",
        "jpn_Jpan": "Japanese",
    }

    results = []
    for original_summary in summary_df["highlight"]:
        # Empty CSV cells are read back as NaN (a float), not as a string
        if not isinstance(original_summary, str) or not original_summary.strip():
            print("Skipping row with empty or missing highlight")
            continue

        for lang_code, lang_name in target_languages.items():
            try:
                # English -> pivot language
                intermediate_translation = translate_text(original_summary, "eng_Latn", lang_code)
                # Pivot language -> Korean
                korean_translation = translate_text(intermediate_translation, lang_code, "kor_Hang")
            except Exception as e:
                print(f"Translation failed for {lang_name}: {e}")
                continue

            results.append({
                "ko": korean_translation,  # Korean obtained via the pivot language
                "en": original_summary     # original English summary
            })

    # Explicit columns keep the CSV header when no row was produced
    translation_df = pd.DataFrame(results, columns=["ko", "en"])
    translation_df.to_csv(output_file, index=False, encoding="utf-8-sig")
    print(f"Translated dataset saved to {output_file}")

# Example run
if __name__ == "__main__":
    # Paths of the summary data file (input) and the translated dataset (output)
    summary_data_file = "/content/drive/MyDrive/summarizer/data/crawled_summary.csv"
    translation_output_file = "/content/drive/MyDrive/summarizer/data/translated_summary.csv"

    # Read the summary data
    summary_df = pd.read_csv(summary_data_file)

    # Build the translation dataset
    create_translation_dataset(summary_df, translation_output_file)



Translated dataset saved to /content/drive/MyDrive/summarizer/data/translated_summary.csv
