In [129]:
import numpy as np
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer

In [130]:
import requests
from bs4 import BeautifulSoup
import re


def preprocessing(d):  # 한국어 기사 본문 전처리 함수
    d = d.lower()
    d = re.sub(r'[a-z0-9\-_.]{3,}@[a-z0-9\-_.]{3,}(?:[.]?[a-z]{2})+', ' ', d)
    d = re.sub(r'‘’ⓒ\'\"“”…=□*◆:/_]', ' ', d)
    d = re.sub(r'\s+', ' ', d)
    d = re.sub(r'^\s|\s$', '', d)
    d = re.sub(r'[<*>_="/■□▷▶]', '', d)
    return d


def fetch_article_data(article_url):  # 기사 본문, 기자 정보 수집 함수
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }

    resp = requests.get(article_url, headers=headers)
    if resp.status_code != 200:
        return "Failed to retrieve the article"

    article_dom = BeautifulSoup(resp.content, 'html.parser')

    # 특정 선택자를 사용하여 기사 본문 추출
    content_tag = article_dom.select_one(
        'article#dic_area.go_trans._article_content')

    content = preprocessing(content_tag.get_text(
        strip=True)) if content_tag else ''

    # 기자 정보 추출
    reporter_tag = article_dom.select_one('div.byline span') or \
        article_dom.select_one('p.byline') or \
        article_dom.select_one('span.byline')

    reporter = reporter_tag.get_text(strip=True) if reporter_tag else ''

    article_data = {
        "link": article_url,  # 기사 링크
        "article": content,  # 기사 본문
        "reporter": reporter  # 기자
    }

    return article_data


In [131]:
# 피어슨 상관계수 구하기
def pearson_similarity(a, b):
    return np.dot((a-np.mean(a)), (b-np.mean(b)))/((np.linalg.norm(a-np.mean(a)))*(np.linalg.norm(b-np.mean(b))))

In [132]:
import numpy as np
import faiss
import pandas as pd
from sentence_transformers import SentenceTransformer
import pymysql

# 피어슨 상관계수 구하기
def pearson_similarity(a, b):
    return np.dot((a-np.mean(a)), (b-np.mean(b)))/((np.linalg.norm(a-np.mean(a)))*(np.linalg.norm(b-np.mean(b))))

# 현재 읽고 있는 기사와 유사한 기사 찾기


def find_similar_news(target_summary, model):

    # 저장된 임베딩 데이터 불러오기
    #query = 'SELECT summary_embedding FROM db_summary_embeddings'
    #db_summary_embeddings = get_embedding_dataset(query)

    db_summary_embeddings=pd.read_json('dataset/summary_embedding.json')
    db_summary_embeddings=db_summary_embeddings.sort_index()
    db_embeddings=[]
    for row in db_summary_embeddings['summary_embedding']:
        db_embeddings.append(np.fromstring(row[1:-1], dtype=np.float32, sep=' '))

    # 현재 읽고 있는 기사 요약문 임베딩
    target_summary_embedding = model.encode(target_summary,
                                            normalize_embeddings=True)

    # 피어슨 상관계수 기반으로 계산
    threshold = 0.55  # 최소 유사도 threshold
    similar_list = []
    for i in range(len(db_embeddings)):
        similarity = pearson_similarity(
            target_summary_embedding, db_embeddings[i])

        if similarity > threshold:
            # threshold 이상이면 유사한 기사 리스트에 추가
            similar_list.append((similarity, i))

    # 유사도 기준 내림차순 정렬
    sorted_similar_list = sorted(
        similar_list, key=lambda x: x[0], reverse=True)

    # 100개만 추려서 반환
    if len(similar_list) > 100:
        return [item[1] for item in sorted_similar_list[:100]]

    # 100개 이하면 모두 반환
    else:
        return [item[1] for item in sorted_similar_list]

In [133]:
import kss
import torch
import math
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
from torch.nn.init import xavier_uniform_
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

# 요약 KPF-BERTSUMM
# https://github.com/KPFBERT/kpfbertsum
MAX_TOKEN_COUNT = 512
N_EPOCHS = 10
BATCH_SIZE = 4

BERT_MODEL_NAME = 'jinmang2/kpfbert'
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL_NAME)


class PositionalEncoding(nn.Module):

    def __init__(self, dropout, dim, max_len=5000):
        pe = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp((torch.arange(0, dim, 2, dtype=torch.float) *
                              -(math.log(10000.0) / dim)))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        pe = pe.unsqueeze(0)
        super(PositionalEncoding, self).__init__()
        self.register_buffer('pe', pe)
        self.dropout = nn.Dropout(p=dropout)
        self.dim = dim

    def forward(self, emb, step=None):
        emb = emb * math.sqrt(self.dim)
        if (step):
            emb = emb + self.pe[:, step][:, None, :]

        else:
            emb = emb + self.pe[:, :emb.size(1)]
        emb = self.dropout(emb)
        return emb

    def get_emb(self, emb):
        return self.pe[:, :emb.size(1)]


class TransformerEncoderLayer(nn.Module):
    def __init__(self, d_model, heads, d_ff, dropout):
        super(TransformerEncoderLayer, self).__init__()

        self.self_attn = MultiHeadedAttention(
            heads, d_model, dropout=dropout)
        self.feed_forward = PositionwiseFeedForward(d_model, d_ff, dropout)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout = nn.Dropout(dropout)

    def forward(self, iter, query, inputs, mask):
        if (iter != 0):
            input_norm = self.layer_norm(inputs)
        else:
            input_norm = inputs

        mask = mask.unsqueeze(1)
        context = self.self_attn(input_norm, input_norm, input_norm,
                                 mask=mask)
        out = self.dropout(context) + inputs
        return self.feed_forward(out)


class ExtTransformerEncoder(nn.Module):
    def __init__(self, hidden_size=768, d_ff=2048, heads=8, dropout=0.2, num_inter_layers=2):
        super(ExtTransformerEncoder, self).__init__()
        self.hidden_size = hidden_size
        self.num_inter_layers = num_inter_layers
        self.pos_emb = PositionalEncoding(dropout, hidden_size)
        self.transformer_inter = nn.ModuleList(
            [TransformerEncoderLayer(hidden_size, heads, d_ff, dropout)
             for _ in range(num_inter_layers)])
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(hidden_size, eps=1e-6)
        self.wo = nn.Linear(hidden_size, 1, bias=True)
        self.sigmoid = nn.Sigmoid()

    def forward(self, top_vecs, mask):
        """ See :obj:`EncoderBase.forward()`"""

        batch_size, n_sents = top_vecs.size(0), top_vecs.size(1)
        pos_emb = self.pos_emb.pe[:, :n_sents]
        x = top_vecs * mask[:, :, None].float()
        x = x + pos_emb

        for i in range(self.num_inter_layers):
            x = self.transformer_inter[i](i, x, x, ~mask)

        x = self.layer_norm(x)
        sent_scores = self.sigmoid(self.wo(x))
        sent_scores = sent_scores.squeeze(-1) * mask.float()

        return sent_scores


class PositionwiseFeedForward(nn.Module):
    """ A two-layer Feed-Forward-Network with residual layer norm.

    Args:
        d_model (int): the size of input for the first-layer of the FFN.
        d_ff (int): the hidden layer size of the second-layer
            of the FNN.
        dropout (float): dropout probability in :math:`[0, 1)`.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super(PositionwiseFeedForward, self).__init__()
        self.w_1 = nn.Linear(d_model, d_ff)
        self.w_2 = nn.Linear(d_ff, d_model)
        self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
        self.dropout_1 = nn.Dropout(dropout)
        self.dropout_2 = nn.Dropout(dropout)

    def gelu(self, x):
        return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))

    def forward(self, x):
        inter = self.dropout_1(self.gelu(self.w_1(self.layer_norm(x))))
        output = self.dropout_2(self.w_2(inter))
        return output + x


class MultiHeadedAttention(nn.Module):
    """
    Multi-Head Attention module from
    "Attention is All You Need"
    :cite:`DBLP:journals/corr/VaswaniSPUJGKP17`.

    Similar to standard `dot` attention but uses
    multiple attention distributions simulataneously
    to select relevant items.

    .. mermaid::

       graph BT
          A[key]
          B[value]
          C[query]
          O[output]
          subgraph Attn
            D[Attn 1]
            E[Attn 2]
            F[Attn N]
          end
          A --> D
          C --> D
          A --> E
          C --> E
          A --> F
          C --> F
          D --> O
          E --> O
          F --> O
          B --> O

    Also includes several additional tricks.

    Args:
       head_count (int): number of parallel heads
       model_dim (int): the dimension of keys/values/queries,
           must be divisible by head_count
       dropout (float): dropout parameter
    """

    def __init__(self, head_count, model_dim, dropout=0.1, use_final_linear=True):
        assert model_dim % head_count == 0
        self.dim_per_head = model_dim // head_count
        self.model_dim = model_dim

        super(MultiHeadedAttention, self).__init__()
        self.head_count = head_count

        self.linear_keys = nn.Linear(model_dim,
                                     head_count * self.dim_per_head)
        self.linear_values = nn.Linear(model_dim,
                                       head_count * self.dim_per_head)
        self.linear_query = nn.Linear(model_dim,
                                      head_count * self.dim_per_head)
        self.softmax = nn.Softmax(dim=-1)
        self.dropout = nn.Dropout(dropout)
        self.use_final_linear = use_final_linear
        if (self.use_final_linear):
            self.final_linear = nn.Linear(model_dim, model_dim)

    def forward(self, key, value, query, mask=None,
                layer_cache=None, type=None, predefined_graph_1=None):
        """
        Compute the context vector and the attention vectors.

        Args:
           key (`FloatTensor`): set of `key_len`
                key vectors `[batch, key_len, dim]`
           value (`FloatTensor`): set of `key_len`
                value vectors `[batch, key_len, dim]`
           query (`FloatTensor`): set of `query_len`
                 query vectors  `[batch, query_len, dim]`
           mask: binary mask indicating which keys have
                 non-zero attention `[batch, query_len, key_len]`
        Returns:
           (`FloatTensor`, `FloatTensor`) :

           * output context vectors `[batch, query_len, dim]`
           * one of the attention vectors `[batch, query_len, key_len]`
        """

        batch_size = key.size(0)
        dim_per_head = self.dim_per_head
        head_count = self.head_count
        key_len = key.size(1)
        query_len = query.size(1)

        def shape(x):
            """  projection """
            return x.view(batch_size, -1, head_count, dim_per_head) \
                .transpose(1, 2)

        def unshape(x):
            """  compute context """
            return x.transpose(1, 2).contiguous() \
                .view(batch_size, -1, head_count * dim_per_head)

        # 1) Project key, value, and query.
        if layer_cache is not None:
            if type == "self":
                query, key, value = self.linear_query(query), \
                    self.linear_keys(query), \
                    self.linear_values(query)

                key = shape(key)
                value = shape(value)

                if layer_cache is not None:
                    device = key.device
                    if layer_cache["self_keys"] is not None:
                        key = torch.cat(
                            (layer_cache["self_keys"].to(device), key),
                            dim=2)
                    if layer_cache["self_values"] is not None:
                        value = torch.cat(
                            (layer_cache["self_values"].to(device), value),
                            dim=2)
                    layer_cache["self_keys"] = key
                    layer_cache["self_values"] = value
            elif type == "context":
                query = self.linear_query(query)
                if layer_cache is not None:
                    if layer_cache["memory_keys"] is None:
                        key, value = self.linear_keys(key), \
                            self.linear_values(value)
                        key = shape(key)
                        value = shape(value)
                    else:
                        key, value = layer_cache["memory_keys"], \
                            layer_cache["memory_values"]
                    layer_cache["memory_keys"] = key
                    layer_cache["memory_values"] = value
                else:
                    key, value = self.linear_keys(key), \
                        self.linear_values(value)
                    key = shape(key)
                    value = shape(value)
        else:
            key = self.linear_keys(key)
            value = self.linear_values(value)
            query = self.linear_query(query)
            key = shape(key)
            value = shape(value)

        query = shape(query)

        key_len = key.size(2)
        query_len = query.size(2)

        # 2) Calculate and scale scores.
        query = query / math.sqrt(dim_per_head)
        scores = torch.matmul(query, key.transpose(2, 3))

        if mask is not None:
            mask = mask.unsqueeze(1).expand_as(scores)
            # how can i fix it to use fp16...
            scores = scores.masked_fill(mask, -1e18)

        # 3) Apply attention dropout and compute context vectors.

        attn = self.softmax(scores)

        if (not predefined_graph_1 is None):
            attn_masked = attn[:, -1] * predefined_graph_1
            attn_masked = attn_masked / \
                (torch.sum(attn_masked, 2).unsqueeze(2) + 1e-9)

            attn = torch.cat([attn[:, :-1], attn_masked.unsqueeze(1)], 1)

        drop_attn = self.dropout(attn)
        if (self.use_final_linear):
            context = unshape(torch.matmul(drop_attn, value))
            output = self.final_linear(context)
            return output
        else:
            context = torch.matmul(drop_attn, value)
            return context


class Summarizer(pl.LightningModule):

    def __init__(self, n_training_steps=None, n_warmup_steps=None):
        super().__init__()
        self.max_pos = 512
        self.bert = BertModel.from_pretrained(
            BERT_MODEL_NAME)  # , return_dict=True)
        self.ext_layer = ExtTransformerEncoder()
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.loss = nn.BCELoss(reduction='none')

        for p in self.ext_layer.parameters():
            if p.dim() > 1:
                xavier_uniform_(p)

    # , input_ids, attention_mask, labels=None):
    def forward(self, src, segs, clss, labels=None):

        mask_src = ~(src == 0)  # 1 - (src == 0)
        mask_cls = ~(clss == -1)  # 1 - (clss == -1)

        top_vec = self.bert(src, token_type_ids=segs, attention_mask=mask_src)
        top_vec = top_vec.last_hidden_state

        sents_vec = top_vec[torch.arange(top_vec.size(0)).unsqueeze(1), clss]
        sents_vec = sents_vec * mask_cls[:, :, None].float()

        sent_scores = self.ext_layer(sents_vec, mask_cls).squeeze(-1)

        loss = 0
        if labels is not None:
            loss = self.loss(sent_scores, labels)

            loss = (loss * mask_cls.float()).sum() / len(labels)

        return loss, sent_scores

    def step(self, batch):

        src = batch['src']
        if len(batch['labels']) > 0:
            labels = batch['labels']
        else:
            labels = None
        segs = batch['segs']
        clss = batch['clss']

        loss, sent_scores = self(src, segs, clss, labels)

        return loss, sent_scores, labels

    def training_step(self, batch, batch_idx):

        loss, sent_scores, labels = self.step(batch)
        self.log("train_loss", loss, prog_bar=True, logger=True)

        return {"loss": loss, "predictions": sent_scores, "labels": labels}

    def validation_step(self, batch, batch_idx):

        loss, sent_scores, labels = self.step(batch)
        self.log("val_loss", loss, prog_bar=True, logger=True)

        return {"loss": loss, "predictions": sent_scores, "labels": labels}

    def test_step(self, batch, batch_idx):

        loss, sent_scores, labels = self.step(batch)
        self.log("test_loss", loss, prog_bar=True, logger=True)

        return {"loss": loss, "predictions": sent_scores, "labels": labels}

    def acc_loss(self, outputs):
        total_loss = 0
        hit_cnt = 0
        for outp in outputs:
            labels = outp['labels'].cpu()
            predictions, idxs = outp['predictions'].cpu().sort()
            loss = outp['loss'].cpu()
            for label, idx in zip(labels, idxs):
                for i in range(1, 3):
                    if label[idx[-i-1]] == 1:
                        hit_cnt += 1

            total_loss += loss

        avg_loss = total_loss / len(outputs)
        acc = hit_cnt / (3*len(outputs)*len(labels))

        return acc, avg_loss

    def training_epoch_end(self, outputs):

        acc, avg_loss = self.acc_loss(outputs)

        print('acc:', acc, 'avg_loss:', avg_loss)

        self.log('avg_train_loss', avg_loss, prog_bar=True, logger=True)

    def validation_epoch_end(self, outputs):

        acc, avg_loss = self.acc_loss(outputs)

        print('val_acc:', acc, 'avg_val_loss:', avg_loss)

        self.log('avg_val_loss', avg_loss, prog_bar=True, logger=True)

    def test_epoch_end(self, outputs):

        acc, avg_loss = self.acc_loss(outputs)

        print('test_acc:', acc, 'avg_test_loss:', avg_loss)

        self.log('avg_test_loss', avg_loss, prog_bar=True, logger=True)

        return

    def configure_optimizers(self):

        optimizer = AdamW(self.parameters(), lr=2e-5)

        steps_per_epoch = 10 // BATCH_SIZE
        total_training_steps = steps_per_epoch * N_EPOCHS

        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=steps_per_epoch,
            num_training_steps=total_training_steps
        )

        return dict(
            optimizer=optimizer,
            lr_scheduler=dict(
                scheduler=scheduler,
                interval='step'
            )
        )


checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints",
    filename="best-checkpoint",
    save_top_k=1,
    verbose=True,
    monitor="avg_val_loss",
    mode="min"
)
logger = TensorBoardLogger("lightning_logs", name="kpfBERT_Summary")
early_stopping_callback = EarlyStopping(monitor='avg_val_loss', patience=3)
trainer = pl.Trainer(
    checkpoint_callback=checkpoint_callback,
    callbacks=[early_stopping_callback],
    max_epochs=N_EPOCHS,
    gpus=0,
    #   precision=16, #소스 수정 또는 패키지 재설치 필요... 런타임 에러.
    progress_bar_refresh_rate=30
)


trained_model = Summarizer.load_from_checkpoint(
    'best-checkpoint.ckpt',
    strict=False
)
trained_model.eval()
trained_model.freeze()


def data_process(text):
    # 문장 분리 하고,
    sents = kss.split_sentences(text)

    # 데이터 가공하고,
    tokenlist = []
    for sent in sents:
        tokenlist.append(tokenizer(
            text=sent,
            add_special_tokens=True))  # , # Add '[CLS]' and '[SEP]'

    src = []  # 토크나이징 된 전체 문단
    labels = []  # 요약문에 해당하면 1, 아니면 0으로 문장수 만큼 생성
    segs = []  # 각 토큰에 대해 홀수번째 문장이면 0, 짝수번째 문장이면 1을 매핑
    clss = []  # [CLS]토큰의 포지션값을 지정

    odd = 0

    for tkns in tokenlist:

        if odd > 1:
            odd = 0
        clss = clss + [len(src)]
        src = src + tkns['input_ids']
        segs = segs + [odd] * len(tkns['input_ids'])
        odd += 1

        # truncation
        if len(src) == MAX_TOKEN_COUNT:
            break
        elif len(src) > MAX_TOKEN_COUNT:
            src = src[:MAX_TOKEN_COUNT - 1] + [src[-1]]
            segs = segs[:MAX_TOKEN_COUNT]
            break

    # padding
    if len(src) < MAX_TOKEN_COUNT:
        src = src + [0]*(MAX_TOKEN_COUNT - len(src))
        segs = segs + [0]*(MAX_TOKEN_COUNT - len(segs))

    if len(clss) < MAX_TOKEN_COUNT:
        clss = clss + [-1]*(MAX_TOKEN_COUNT - len(clss))

    return dict(
        sents=sents,  # 정답 출력을 위해...
        src=torch.tensor(src),
        segs=torch.tensor(segs),
        clss=torch.tensor(clss),
    )


def summarize_test(text):
    data = data_process(text.replace('\n', ''))

    # trained_model에 넣어 결과값 반환
    _, rtn = trained_model(data['src'].unsqueeze(
        0), data['segs'].unsqueeze(0), data['clss'].unsqueeze(0))
    rtn = rtn.squeeze()

    # 예측 결과값을 받기 위한 프로세스
    rtn_sort, idx = rtn.sort(descending=True)

    rtn_sort = rtn_sort.tolist()
    idx = idx.tolist()

    end_idx = rtn_sort.index(0)

    rtn_sort = rtn_sort[:end_idx]
    idx = idx[:end_idx]

    if len(idx) > 3:
        rslt = idx[:3]
    else:
        rslt = idx

    summ = []
    # print(' *** 입력한 문단의 요약문은 ...')
    for i, r in enumerate(rslt):
        summ.append(data['sents'][r])
        # print('[', i+1, ']', summ[i])

    return summ


def summarize_article(target_article):
    target_summary = summarize_test(target_article)
    return target_summary


[Kss]: GPU available: False, used: False
[Kss]: TPU available: False, using: 0 TPU cores
Some weights of BertModel were not initialized from the model checkpoint at jinmang2/kpfbert and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [134]:
import kss

# 3줄씩 문장을 잘라 단락 생성


def split_into_paragraphs(article, sentences_per_paragraph=3):
    sentences = kss.split_sentences(article)
    paragraphs = []
    paragraph = []

    for sentence in sentences:
        if len(sentence) > 20:
            # 보통 한 줄에 20자 정도 넘어가야 유의미한 정보가 포함된 문장
            paragraph.append(sentence)
        if len(paragraph) == sentences_per_paragraph:  # 3줄 이상이면
            paragraphs.append(" ".join(paragraph))  # 3줄을 하나로 합치기
            paragraph = []

        # 남아있는 문장들 중 20자가 넘어가면 단락으로 추가
    if paragraph and len(paragraph) > 20:
        paragraphs.append(" ".join(paragraph))

    return paragraphs  # 단락 데이터 반환

In [135]:
import pandas as pd
from bertopic import BERTopic
import faiss
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer


# 클러스터
def clustering(target_article, similar_news):
    db_paragraph_data = pd.read_json('dataset/paragraph_data.json')
    db_paragraph_embeddings1 = pd.read_json('dataset/paragraph_embedding.json') # DB에서 단락임베딩 데이터 가져오기
    db_paragraph_embeddings=[]
    for row in db_paragraph_embeddings1['paragraph_embedding']:
        db_paragraph_embeddings.append(np.fromstring(row[1:-1], dtype=np.float32, sep=' '))

    db_paragraph_embeddings = np.array(db_embeddings)
    
    db_paragraph_embeddings = db_paragraph_embeddings[db_paragraph_data['index'].isin(
        similar_news)]  # 유사한 기사 데이터만 가져오기
    db_paragraph_data = db_paragraph_data[db_paragraph_data['index'].isin(
        similar_news)]  # 유사한 기사 데이터만 가져오기

    # 현재 읽고 있는 기사의 단락 데이터
    target_paragraphs = split_into_paragraphs(target_article)
    target_paragraph_data = []
    for data in target_paragraphs:
        target_paragraph_data.append([-1]+[data])

    target_paragraph_data = pd.DataFrame(
        data=target_paragraph_data,
        columns=['index', 'paragraph'])

    model = SentenceTransformer('bongsoo/kpf-sbert-128d-v1')

    target_embeddings = model.encode(
        target_paragraph_data['paragraph'].tolist())  # 현재 읽고 있는 기사 단락 임베딩

    # 현재 읽고 있는 기사 데이터와 유사한 기사 데이터를 합쳐서 훈련 데이터로 들어감
    train_paragraph_embeddings = np.vstack(
        (target_embeddings, db_paragraph_embeddings))
    train_paragraph_data = pd.concat(
        [target_paragraph_data, db_paragraph_data], axis=0)

    # BERTopic을 이용한 클러스터링
    model = BERTopic(embedding_model='bongsoo/kpf-sbert-128d-v1',
                     min_topic_size=5)

    topics, probs = model.fit_transform(
        documents=train_paragraph_data['paragraph'], embeddings=train_paragraph_embeddings)  # 클러스터링 만들기
    train_paragraph_data['topic'] = topics  # 토픽 저장

    # 현재 읽고 있는 기사의 토픽 모델링
    target_paragraph_data = pd.merge(target_paragraph_data, train_paragraph_data[[
                                     'paragraph', 'topic']], on='paragraph', how='inner')
    # 토픽이 -1, 0은 제외
    target_paragraph_data = target_paragraph_data[target_paragraph_data['topic'] > 0]

    if len(target_paragraph_data) == 0:  # 만약 현재 읽고 있는 기사의 토픽이 없으면
        print('No Topic')
        return similar_news  # 아무 기사3개 랜덤으로

    # 유사한 기사들의 토픽 모델링 결과 저장
    db_paragraph_data = pd.merge(db_paragraph_data, train_paragraph_data[[
        'paragraph', 'topic']], on='paragraph', how='inner')
    # 토픽 -1, 0 제외
    db_paragraph_data = db_paragraph_data[db_paragraph_data['topic'] > 0]

    # 토픽 간 거리 구하기
    topic_embeddings = model.topic_embeddings_
    topic_embeddings = topic_embeddings[1:]

    target_topic = target_paragraph_data['topic'].value_counts().idxmax()
    target_topic_embedding = topic_embeddings[target_topic]

    # 현재 토픽 개수 0~n
    num_topics = len(model.get_topic_freq()) - 1

    # faiss를 이용해서 토픽 간 코사인 유사도 계산
    index = faiss.IndexFlatIP(128)
    faiss.normalize_L2(topic_embeddings)
    index.add(topic_embeddings)
    distances, indices = index.search(np.expand_dims(
        target_topic_embedding, axis=0), num_topics)

    # 가장 유사도가 낮은 토픽 순으로 단락 정렬
    indices = indices[0][::-1]
    indices = np.delete(indices, np.where(indices == 0)[0][0])
    db_paragraph_data['topic'] = pd.Categorical(
        db_paragraph_data['topic'], categories=indices, ordered=True)
    db_paragraph_data = db_paragraph_data.sort_values('topic')

    # 토픽이 3개 이상이면
    if num_topics - 2 > 3:
        index_counts = db_paragraph_data.groupby(
            'topic')['index'].value_counts().rename('count').reset_index()
        most_common_index_per_topic = index_counts.loc[index_counts.groupby('topic')[
            'count'].idxmax()]
        most_common_index_per_topic = most_common_index_per_topic.drop_duplicates(
            subset='index')  # 중복 제거

        return most_common_index_per_topic['index'].tolist()

    else:  # 토픽이 3개 이하이면 나온 것 모두 반환
        db_paragraph_data = db_paragraph_data.drop_duplicates(
            subset='index')  # 중복 제거
        return db_paragraph_data['index'].tolist()


In [136]:
import json

with open('inference_dataset/url.json', 'r') as f:
    json_data = json.load(f)
    
url=json_data['url']
target_data = fetch_article_data(url)  # 현재 읽고 있는 뉴스 크롤링
target_article = target_data['article']
target_summary1 = summarize_article(target_article)
target_summary = " ".join(target_summary1)
model = SentenceTransformer('bongsoo/kpf-sbert-128d-v1')  # 임베딩 모델
similar_news_index = find_similar_news(
        target_summary, model)  # 유사한 기사 찾기 (인덱스 반환)


various_news_index = clustering(target_article, similar_news_index)  # 클러스터링
various_news = pd.read_json('dataset/news.json')
if url in various_news['link']:
        same_news_index=various_news[various_news['link']==url].index
        various_news_index.remove(same_news_index)


various_news = various_news.loc[various_news_index][:3]

various_news

Unnamed: 0,index,title,link,article
5732,5732,[기획] `전국민 25만원` 제동 건 기재부,https://n.news.naver.com/mnews/article/029/000...,"민주당, 15조 추경 편성 제안尹·李 회담 최대 의제로 꼽혀기재부, '부정적' 입장..."
5664,5664,대통령실 “민생지원금은 포퓰리즘” 인식… 협치위해 ‘선별지원’ 가능성,https://n.news.naver.com/mnews/article/021/000...,대상·금액 조정해 합의할수도민주 “부자감세 줄여 재원마련”이재명 더불어민주당 대표가...
5594,5594,입법권으로 민생지원금 주자?… 삼권분립 훼손하는 巨野 [심층기획-'처분적 법률' 위...,https://n.news.naver.com/mnews/article/022/000...,“법률 개념 자체가 위헌적 뉘앙스”행정·재판 없이 국민에 직접 자동집행력5·18민주...
