<a href="https://colab.research.google.com/github/kimdonggyu2008/combined/blob/main/Crawling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import time
from transformers import T5Tokenizer, T5ForConditionalGeneration
from urllib.parse import urlparse
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

# T5 모델 및 토크나이저 불러오기
summarizer_model = T5ForConditionalGeneration.from_pretrained("t5-large").to(device)
summarizer_tokenizer = T5Tokenizer.from_pretrained("t5-large")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [3]:

def detect_site(url):
    parsed_url = urlparse(url)
    domain = parsed_url.netloc

    if "bbc" in domain:
        return "bbc"
    elif "cnn" in domain:
        return "cnn"
    else:
        return None

def get_filtered_main_content(url):
    content = ""

    # HTML 소스 가져오기
    response = requests.get(url)
    if response.status_code != 200:
        return f"Failed to fetch the webpage! URL: {url}"

    # 인코딩 설정
    response.encoding = response.apparent_encoding

    soup = BeautifulSoup(response.content, 'html.parser')

    # 사이트 감지
    site = detect_site(url)
    if site == "bbc":
        text_blocks = soup.select('div[data-component="text-block"]')
        for block in text_blocks:
            content += block.get_text(strip=True) + "\n\n"
    elif site == "cnn":
        paragraphs = soup.select("p.paragraph.inline-placeholder.vossi-paragraph")
        for paragraph in paragraphs:
            content += paragraph.get_text(strip=True) + "\n\n"
    else:
        return None  #지원되지 않는 사이트

    return content.strip()

def split_text_with_last_sentence_overlap(text, target_chunk_length=1024):
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""

    for sentence in sentences:
        if len(current_chunk.split()) + len(sentence.split()) <= target_chunk_length:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
def summarize_long_text(article_text, target_chunk_length=1024):
    chunks = split_text_with_last_sentence_overlap(article_text, target_chunk_length)
    summaries = []
    total_time = 0  # Summarization 소요 시간 기록

    for chunk in chunks:
        try:
            start_time = time.time()
            inputs = summarizer_tokenizer(chunk, max_length=target_chunk_length, return_tensors="pt", truncation=True).to(device)
            summary_ids = summarizer_model.generate(
                inputs["input_ids"],
                max_length=100,
                min_length=50,
                length_penalty=2.0,
                num_beams=2,
                early_stopping=True
            )
            summary = summarizer_tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            summaries.append(summary)
            end_time = time.time()
            total_time += end_time - start_time
        except Exception as e:
            print(f"Error processing chunk: {chunk[:100]}... | Error: {e}")
            summaries.append("Summarization failed for this chunk.")

    print(f"Total summarization time: {total_time:.2f} seconds")
    return " ".join(summaries)


def create_summary_dataset(input_file, output_file):
    results = []

    with open(input_file, 'r') as file:
        urls = file.read().split(',')

    for url in urls:
        url = url.strip()
        if not url:
            continue

        print(f"Processing URL: {url}")
        article_start_time = time.time()  # Fetching and summarizing 시작 시간 기록
        article = get_filtered_main_content(url)
        if not article:
            print(f"Skipping URL (no article found): {url}")
            continue

        highlight = summarize_long_text(article)
        article_end_time = time.time()  # Fetching and summarizing 끝 시간 기록
        elapsed_time = article_end_time - article_start_time

        print(f"URL processed in {elapsed_time:.2f} seconds")
        results.append({"url": url, "article": article, "highlight": highlight, "processing_time": elapsed_time})

    summary_df = pd.DataFrame(results)
    summary_df.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"Summary dataset saved to {output_file}")

if __name__ == "__main__":
    summary_input_file = "/content/drive/MyDrive/summarizer/data/test_summary.txt"  # URL 파일 경로
    summary_output_file = "/content/drive/MyDrive/summarizer/data/crawled_summary.csv"  # 요약 데이터셋 경로
    create_summary_dataset(summary_input_file, summary_output_file)

Processing URL: https://www.bbc.com/news/articles/cd6vpy8e6jvo
Total summarization time: 4.99 seconds
URL processed in 5.63 seconds
Processing URL: https://www.bbc.com/news/articles/cly20zz51j7o
Total summarization time: 2.10 seconds
URL processed in 3.87 seconds
Processing URL: https://www.bbc.com/future/article/20241115-how-robotaxis-are-trying-to-win-passengers-trust
Total summarization time: 12.48 seconds
URL processed in 13.10 seconds
Processing URL: https://www.bbc.com/news/articles/c9wrqg4vd2qo
Total summarization time: 4.09 seconds
URL processed in 4.48 seconds
Processing URL: https://www.bbc.com/news/articles/c5yxv41q235o
Total summarization time: 3.98 seconds
URL processed in 6.62 seconds
Processing URL: https://www.bbc.com/news/articles/c30p16gn3pvo
Total summarization time: 4.25 seconds
URL processed in 4.64 seconds
Processing URL: https://www.bbc.com/news/articles/cdenplz5j89o
Total summarization time: 7.97 seconds
URL processed in 9.31 seconds
Processing URL: https://www.

In [9]:

import pandas as pd
from IPython.display import display  # Jupyter Notebook에서 표로 보기 위한 모듈

# CSV 파일 경로 입력
csv_file_path = "/content/drive/MyDrive/summarizer/data/crawled_summary.csv"  # 여기에 실제 파일 경로를 입력하세요.

# CSV 파일 읽기
data = pd.read_csv(csv_file_path)

# 표로 표시
display(data)  # Jupyter Notebook에서 보기 좋게 표시

Unnamed: 0,url,article,highlight,processing_time
0,https://www.bbc.com/news/articles/cd6vpy8e6jvo,Instagram is testing a new feature which allow...,in December.The regulator will also finalise i...,5.625508
1,https://www.bbc.com/news/articles/cly20zz51j7o,Men are getting the chance to talk about lonel...,"Follow Norfolk news onBBC Sounds on Facebook,I...",3.870579
2,https://www.bbc.com/future/article/20241115-ho...,Autonomous vehicles are already clocking up mi...,. It is a place where human operators can give...,13.097822
3,https://www.bbc.com/news/articles/c9wrqg4vd2qo,Roblox has announced it will block under-13s f...,to comply with new rules under the Online Safe...,4.476845
4,https://www.bbc.com/news/articles/c5yxv41q235o,Conditions in a Manx marshland wallabies escap...,on Twitter and Instagram.... You can also foll...,6.617889
5,https://www.bbc.com/news/articles/c30p16gn3pvo,A robotic dog named “Spot” made by Boston Dyna...,armed.“They are trying to weaponise these dogs...,4.64171
6,https://www.bbc.com/news/articles/cdenplz5j89o,Described as a virtual singer powered by artif...,.This means that AI is influenced by data from...,9.305097
7,https://www.bbc.com/future/article/20230306-ju...,SpaceX's enormous Starship rocket is the large...,that set off car alarms in towns up to 10 mile...,8.215674
8,https://www.bbc.com/future/article/20241111-st...,"If you find it hard to get to sleep, then a so...",. You might find that one of the things that k...,6.37494
9,https://www.bbc.com/future/article/20201028-th...,"In the past, coffee was associated with increa...","had a 50% increase in blood sugar, compared to...",10.795196


In [None]:
import time
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_translation_models():
    translator_model_name = "facebook/nllb-200-3.3B"

    model = AutoModelForSeq2SeqLM.from_pretrained(translator_model_name, forced_bos_token_id=256098).to(device)
    tokenizer = AutoTokenizer.from_pretrained(translator_model_name, src_lang="eng_Latn", tgt_lang="kor_Hang")

    return model, tokenizer

model, tokenizer = load_translation_models()


config.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

pytorch_model.bin.index.json:   0%|          | 0.00/90.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model-00001-of-00003.bin:   0%|          | 0.00/6.93G [00:00<?, ?B/s]

pytorch_model-00002-of-00003.bin:   0%|          | 0.00/8.55G [00:00<?, ?B/s]

In [None]:

def translate_text(text, src_lang, tgt_lang):
    """
    Args:
        text (str): 번역할 텍스트
        src_lang (str): 원본 언어 코드 (예: 'eng_Latn')
        tgt_lang (str): 대상 언어 코드 (예: 'kor_Hang')
    Returns:
        str: 번역된 텍스트
    """
    start_time = time.time()

    tokenizer.src_lang = src_lang
    tokenizer.tgt_lang = tgt_lang

    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512).to(device)

    outputs = model.generate(
        inputs["input_ids"],
        max_length=512,
        num_beams=4,
        early_stopping=True
    )
    translated_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    end_time = time.time()
    print(f"Translation from {src_lang} to {tgt_lang} completed in {end_time - start_time:.2f} seconds.")
    return translated_text

def create_translation_dataset(summary_df, output_file):
    """
    Args:
        summary_df (pd.DataFrame): 요약 데이터가 포함된 데이터프레임 (columns: "highlight").
        output_file (str): 결과 CSV 파일 경로.
    """
    results = []
    total_start_time = time.time()

    for _, row in summary_df.iterrows():
        original_summary = row["highlight"]
        translations = {"en": original_summary}

        target_languages = {
            "fra_Latn": "French",
            "deu_Latn": "German",
            "rus_Cyrl": "Russian",
            "jpn_Jpan": "Japanese",
        }

        for lang_code, lang_name in target_languages.items():
            try:
                # 영어 -> 중간 언어 번역
                intermediate_translation = translate_text(original_summary, "eng_Latn", lang_code)
                # 중간 언어 -> 한국어 번역
                back_translation = translate_text(intermediate_translation, lang_code, "kor_Hang")
                translations["ko"] = back_translation
            except Exception as e:
                print(f"Translation failed for {lang_name}: {e}")
                translations["ko"] = "Translation failed"

            # 결과 추가
            results.append({
                "ko": translations["ko"],  # 역번역된 한국어
                "en": translations["en"]  # 영어 원문
            })

    total_end_time = time.time()
    print(f"Total translation time: {total_end_time - total_start_time:.2f} seconds.")

    translation_df = pd.DataFrame(results)
    translation_df.to_csv(output_file, index=False, encoding="utf-8-sig")
    print(f"Translated dataset saved to {output_file}")

In [None]:
summary_data_file = "/content/drive/MyDrive/summarizer/data/crawled_summary.csv"
translation_output_file = "/content/drive/MyDrive/summarizer/data/translated_summary.csv"
summary_df = pd.read_csv(summary_data_file)
create_translation_dataset(summary_df, translation_output_file)

In [None]:

import pandas as pd
from IPython.display import display  # Jupyter Notebook에서 표로 보기 위한 모듈

# CSV 파일 경로 입력
csv_file_path = "/content/drive/MyDrive/summarizer/data/translated_summary.csv"  # 여기에 실제 파일 경로를 입력하세요.

# CSV 파일 읽기
data = pd.read_csv(csv_file_path)

# 표로 표시
display(data)  # Jupyter Notebook에서 보기 좋게 표시