In [12]:
import pandas as pd
import json
import re
import os
import sys
import numpy as np
import random
pd.options.display.float_format = '{:,}'.format

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import numpy as np
import itertools

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [13]:
def max_sum_sim(doc_embedding, candidate_embeddings, words, top_n, nr_candidates):
    """Pick ``top_n`` mutually dissimilar keywords from the best candidates.

    The ``nr_candidates`` candidates most similar to the document are kept
    as a pool; every ``top_n``-sized combination of that pool is scored by
    the sum of its pairwise cosine similarities, and the combination with
    the smallest sum is returned.

    Parameters
    ----------
    doc_embedding : array-like
        Document embedding; only the first row of the resulting similarity
        matrix is used (assumed to be a single-row 2-D array).
    candidate_embeddings : array-like
        One embedding row per entry of ``words``.
    words : sequence of str
        Candidate phrases, aligned with ``candidate_embeddings``.
    top_n : int
        Number of keywords to return.
    nr_candidates : int
        Size of the pool of most document-similar candidates to search.

    Returns
    -------
    list
        The selected entries of ``words``.

    Raises
    ------
    ValueError
        If ``top_n`` is larger than the number of candidates in the pool.
    """
    # Similarity between the document and each candidate keyword.
    distances = cosine_similarity(doc_embedding, candidate_embeddings)

    # Pairwise similarity between the candidate keywords.
    distances_candidates = cosine_similarity(candidate_embeddings,
                                             candidate_embeddings)

    # Keep the nr_candidates keywords most similar to the document.
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    if top_n > len(words_idx):
        raise ValueError(
            "top_n ({}) exceeds the number of available candidates ({})".format(
                top_n, len(words_idx)
            )
        )

    # Look the phrases up in the `words` argument (not in a notebook global).
    words_vals = [words[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # Exhaustively search for the combination whose members are least
    # similar to one another.
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum(distances_candidates[i][j]
                  for i in combination for j in combination if i != j)
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [14]:
def mmr(doc_embedding, candidate_embeddings, words, top_n, diversity):
    """Select keywords with Maximal Marginal Relevance (MMR).

    Starts with the candidate most similar to the document, then repeatedly
    adds the remaining candidate that maximises
    ``(1 - diversity) * sim(candidate, doc) - diversity * max sim(candidate, selected)``.

    Parameters
    ----------
    doc_embedding : array-like
        Document embedding (assumed to be a single-row 2-D array).
    candidate_embeddings : array-like
        One embedding row per entry of ``words``.
    words : sequence of str
        Candidate phrases, aligned with ``candidate_embeddings``.
    top_n : int
        Maximum number of keywords to return.
    diversity : float
        Weight of the redundancy penalty; 0 ranks purely by document
        similarity, larger values favour keywords unlike those already chosen.

    Returns
    -------
    list
        Up to ``top_n`` entries of ``words`` in selection order. Empty when
        ``words`` is empty or ``top_n`` is not positive.
    """
    # Nothing to choose from, or nothing requested.
    if len(words) == 0 or top_n <= 0:
        return []

    # Similarity between each candidate keyword and the document.
    word_doc_similarity = cosine_similarity(candidate_embeddings, doc_embedding)

    # Pairwise similarity between the candidate keywords.
    word_similarity = cosine_similarity(candidate_embeddings)

    # Index of the keyword most similar to the document,
    # e.g. keywords_idx = [2] if keyword #2 scores highest.
    keywords_idx = [int(np.argmax(word_doc_similarity))]

    # Indices of all remaining keywords,
    # e.g. candidates_idx = [0, 1, 3, 4, ...] if keyword #2 was picked.
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # The best keyword is already chosen, so at most top_n - 1 more rounds are
    # needed; never run more rounds than there are candidates left.
    for _ in range(min(top_n - 1, len(candidates_idx))):
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        # For each remaining candidate, its highest similarity to any keyword
        # selected so far.
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # MMR score: relevance to the document minus redundancy penalty.
        mmr_scores = (1 - diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[int(np.argmax(mmr_scores))]

        # Move the winner from the candidate pool to the selected keywords.
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [15]:
# Read the whole input document; the context manager closes the file handle
# as soon as the text has been read.
with open('key_1_text.txt', 'r', encoding='UTF-8') as f:
    doc = f.read()

In [16]:
# Extract candidate phrases made of exactly three words (trigrams),
# with English stop words removed.
stop_words = "english"
n_gram_range = (3, 3)

count = CountVectorizer(stop_words=stop_words, ngram_range=n_gram_range)
count.fit([doc])
candidates = count.get_feature_names_out()

print('trigram 개수 :', len(candidates))
print('trigram 다섯개만 출력 :', candidates[:5])

trigram 개수 : 18699
trigram 다섯개만 출력 : ['000 help reach' '000 inside runs' '000 miles away' '000 touting male'
 '000 troops rescued']


In [17]:
# Load the sentence-embedding model used for both the document and the candidates.
model = SentenceTransformer('distilbert-base-nli-mean-tokens')

# Embed every candidate phrase, then the document itself (wrapped in a list
# so it is encoded as a single-item batch).
candidate_embeddings = model.encode(candidates)
doc_embedding = model.encode([doc])

In [18]:
# Baseline: the top_n candidates ranked purely by cosine similarity to the document.
top_n = 10
distances = cosine_similarity(doc_embedding, candidate_embeddings)
# argsort is ascending, so the last top_n indices are the most similar ones
# (the best match ends up last in the list).
keywords = [candidates[index] for index in np.argsort(distances)[0][-top_n:]]
print(keywords)

['upcoming pageant security', 'german army stealing', 'disturbance meeting sheriff', 'father building comet', 'brothers meet cauliflower', 'train fled murder', 'heroin demands president', 'boy kills agents', 'quits school decathlon', 'brother train finally']


In [19]:
# Max Sum Similarity: choose the 5 least mutually similar phrases among the 10 best matches.
max_sum_sim(
    doc_embedding=doc_embedding,
    candidate_embeddings=candidate_embeddings,
    words=candidates,
    top_n=5,
    nr_candidates=10,
)

['upcoming pageant security',
 'brothers meet cauliflower',
 'train fled murder',
 'heroin demands president',
 'quits school decathlon']

In [20]:
# Max Sum Similarity with a wider pool: 10 phrases chosen among the 20 best matches.
max_sum_sim(
    doc_embedding=doc_embedding,
    candidate_embeddings=candidate_embeddings,
    words=candidates,
    top_n=10,
    nr_candidates=20,
)

['calling brother train',
 'monster hunting cashier',
 'man killed wolverine',
 'kingsmen whiskey accidentally',
 'prisoners escaping gang',
 'decathlon competition washington',
 'upcoming pageant security',
 'brothers meet cauliflower',
 'heroin demands president',
 'quits school decathlon']

In [21]:
# MMR with a low diversity weight: results stay close to the pure similarity ranking.
mmr(
    doc_embedding=doc_embedding,
    candidate_embeddings=candidate_embeddings,
    words=candidates,
    top_n=10,
    diversity=0.2,
)

['brother train finally',
 'heroin demands president',
 'quits school decathlon',
 'father building comet',
 'train fled murder',
 'woman dying cancer',
 'upcoming pageant security',
 'prisoners escaping gang',
 'brothers meet cauliflower',
 'kingsmen whiskey accidentally']

In [22]:
# MMR with a high diversity weight: redundancy is penalised heavily, so phrases vary more.
mmr(
    doc_embedding=doc_embedding,
    candidate_embeddings=candidate_embeddings,
    words=candidates,
    top_n=10,
    diversity=0.7,
)

['brother train finally',
 'precision bombs repel',
 'pool hall drunk',
 'woman dying cancer',
 'safe president obama',
 'going forest trespass',
 'starts make sandwich',
 'shop prom dresses',
 'play cricket nearby',
 'seal tom cruise']