In [1]:
import pandas as pd
import json
import re
import os
import sys
import numpy as np
import random
pd.options.display.float_format = '{:,}'.format

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

import numpy as np
import itertools

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\User\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [4]:
def max_sum_sim(doc_embedding, candidate_embeddings, words, top_n, nr_candidates):
    # 문서와 각 키워드들 간의 유사도
    distances = cosine_similarity(doc_embedding, candidate_embeddings)

    # 각 키워드들 간의 유사도
    distances_candidates = cosine_similarity(candidate_embeddings, 
                                            candidate_embeddings)

    # 코사인 유사도에 기반하여 키워드들 중 상위 top_n개의 단어를 pick.
    words_idx = list(distances.argsort()[0][-nr_candidates:])
    words_vals = [candidates[index] for index in words_idx]
    distances_candidates = distances_candidates[np.ix_(words_idx, words_idx)]

    # 각 키워드들 중에서 가장 덜 유사한 키워드들간의 조합을 계산
    min_sim = np.inf
    candidate = None
    for combination in itertools.combinations(range(len(words_idx)), top_n):
        sim = sum([distances_candidates[i][j] for i in combination for j in combination if i != j])
        if sim < min_sim:
            candidate = combination
            min_sim = sim

    return [words_vals[idx] for idx in candidate]

In [5]:
def mmr(doc_embedding, candidate_embeddings, words, top_n, diversity):

    # 문서와 각 키워드들 간의 유사도가 적혀있는 리스트
    word_doc_similarity = cosine_similarity(candidate_embeddings, doc_embedding)

    # 각 키워드들 간의 유사도
    word_similarity = cosine_similarity(candidate_embeddings)

    # 문서와 가장 높은 유사도를 가진 키워드의 인덱스를 추출.
    # 만약, 2번 문서가 가장 유사도가 높았다면
    # keywords_idx = [2]
    keywords_idx = [np.argmax(word_doc_similarity)]

    # 가장 높은 유사도를 가진 키워드의 인덱스를 제외한 문서의 인덱스들
    # 만약, 2번 문서가 가장 유사도가 높았다면
    # ==> candidates_idx = [0, 1, 3, 4, 5, 6, 7, 8, 9, 10 ... 중략 ...]
    candidates_idx = [i for i in range(len(words)) if i != keywords_idx[0]]

    # 최고의 키워드는 이미 추출했으므로 top_n-1번만큼 아래를 반복.
    # ex) top_n = 5라면, 아래의 loop는 4번 반복됨.
    for _ in range(top_n - 1):
        candidate_similarities = word_doc_similarity[candidates_idx, :]
        target_similarities = np.max(word_similarity[candidates_idx][:, keywords_idx], axis=1)

        # MMR을 계산
        mmr = (1-diversity) * candidate_similarities - diversity * target_similarities.reshape(-1, 1)
        mmr_idx = candidates_idx[np.argmax(mmr)]

        # keywords & candidates를 업데이트
        keywords_idx.append(mmr_idx)
        candidates_idx.remove(mmr_idx)

    return [words[idx] for idx in keywords_idx]

In [8]:
f = open('key_text.txt', 'r', encoding='UTF-8')
doc = f.read()

In [10]:
# 3개의 단어 묶음인 단어구 추출
n_gram_range = (3, 3)
stop_words = "english"

count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
candidates = count.get_feature_names_out()

print('trigram 개수 :',len(candidates))
print('trigram 다섯개만 출력 :',candidates[:5])

trigram 개수 : 567902
trigram 다섯개만 출력 : ['00 10 00' '00 agent requirements' '00 agents relative'
 '00 appointment tony' '00 day life']


In [11]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/550 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [12]:
top_n = 5
distances = cosine_similarity(doc_embedding, candidate_embeddings)
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
print(keywords)

['priest killing rampage', 'alcohol killing sheriff', 'marines kill flamethrower', 'terrorist died yacht', 'loader killing travis']


In [18]:
top_n = 100
distances = cosine_similarity(doc_embedding, candidate_embeddings)
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
print(keywords)

['killed men cancer', 'mobsters knocked unconscious', 'brutally executes sheriff', 'cars explode killing', 'resulting deaths zombies', 'rafters doing suicide', 'explodes killing mccaffrey', 'bridge killing jason', 'killed fleeing battle', 'aliens kill policemen', 'car killing detectives', 'parents murder steve', 'grenade kill zombies', 'torpedoes destroying sickbay', 'explode killing men', 'crash murdering occupants', 'overwhelmed killed zombies', 'alaska killing wolves', 'boat attack smuggler', 'wife killing teddy', 'machete priest killing', 'glider crashed killing', 'chainsaw kills abomination', 'miller kills terrorist', 'suv killing men', 'killing fellow marine', 'gun stole zombie', 'murder ceo major', 'soldiers disintegrated wolverine', 'tumbles shooting terrorists', 'killing dread pirate', 'explosion destroys helicopter', 'guards breaking harley', 'donatellis hostage critically', 'gang killed noodles', 'kicking sharks away', 'rooftops killing police', 'robotics plot killed', 'hosp

In [21]:
import pickle
with open("keybert_word", "wb") as fp:
    pickle.dump(keywords, fp)

### after 2000

In [23]:
f = open('key_20_text.txt', 'r', encoding='UTF-8')
doc = f.read()

In [24]:
# 3개의 단어 묶음인 단어구 추출
n_gram_range = (3, 3)
stop_words = "english"

count = CountVectorizer(ngram_range=n_gram_range, stop_words=stop_words).fit([doc])
candidates = count.get_feature_names_out()

print('trigram 개수 :',len(candidates))
print('trigram 다섯개만 출력 :',candidates[:5])

trigram 개수 : 368646
trigram 다섯개만 출력 : ['00 10 00' '00 agent requirements' '00 agents relative'
 '00 appointment tony' '00 day life']


In [25]:
model = SentenceTransformer('distilbert-base-nli-mean-tokens')
doc_embedding = model.encode([doc])
candidate_embeddings = model.encode(candidates)

In [33]:
top_n = 200
distances = cosine_similarity(doc_embedding, candidate_embeddings)
keywords = [candidates[index] for index in distances.argsort()[0][-top_n:]]
print(keywords)

['wife priest yells', 'champagne poured creepy', 'partying casino goes', 'huge party suite', 'coveted twinkies fight', 'steward transforms cinderella', 'girl screams tyler', 'brad dance invites', 'downstairs ballroom steals', 'party guests gossip', 'dancers outraged tracy', 'having sex chloe', 'pool flips rage', 'younger brothers fish', 'party night patients', 'sex toilet father', 'hookers trying crash', 'daddy daughter dance', 'party slaughtered bride', 'party stunned announces', 'father wedding underway', 'win basketball scholarship', 'senses pregnant hellboy', 'crew partying rotunda', 'gay friends quickly', 'apartments throwing party', 'party going boys', 'club dancer addicted', 'lacrosse team gay', 'buddies bachelor party', 'starts whacking porn', 'bride party slaughtered', 'party jay angrily', 'billy planning wedding', 'suddenly vampire babies', 'crashed wedding self', 'bathroom forever screaming', 'party gets drunk', 'parties wonder shower', 'frog horror parties', 'jamie graduati

In [34]:
with open("keybert_20_word", "wb") as fp:
    pickle.dump(keywords, fp)

In [27]:
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=10)

MemoryError: Unable to allocate 506. GiB for an array with shape (368646, 368646) and data type float32

In [None]:
max_sum_sim(doc_embedding, candidate_embeddings, candidates, top_n=5, nr_candidates=20)

In [31]:
mmr(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.2)

MemoryError: Unable to allocate 506. GiB for an array with shape (368646, 368646) and data type float32

In [None]:
mmr(doc_embedding, candidate_embeddings, candidates, top_n=5, diversity=0.7)