# Keyword extraction using the self-attention method

In [None]:
import pandas as pd
# Load the training data from a local CSV. Later cells read its
# 'id', 'text' and 'source' columns.
train2 = pd.read_csv('train_data.csv')
# Preview the first rows.
train2.head()

In [None]:
# NLTK
!pip install nltk
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('wordnet')

from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import re
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer

def is_noun(word):
    """Return True when the POS tagger labels *word* as a noun.

    The word is tagged on its own (a one-element list is handed to
    ``pos_tag``), so the tagger sees no surrounding context. Any tag that
    begins with ``NN`` counts as a noun.
    """
    _token, tag = pos_tag([word])[0]
    return tag.startswith('NN')

# Tokenizer and encoder are loaded from the same "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# output_attentions=True makes the forward pass return attention weights.
model = AutoModel.from_pretrained("bert-base-uncased", output_attentions=True)
# Put the model in evaluation mode before running it.
model.eval()

In [None]:
import numpy as np

# attention
def get_token_attention(text):
    """Return the tokens of *text* and the attention the first token pays to each.

    The text is tokenized (truncated to at most 512 tokens) and passed
    through the module-level ``model`` with gradient tracking disabled.
    From the last entry of ``outputs.attentions`` the first batch item is
    taken, averaged over its leading axis, and row 0 is kept.
    NOTE(review): presumably the leading axis is the attention heads and
    row 0 is the [CLS] token (the original comment says "CLS") — confirm
    against the model's output layout.

    Returns:
        tuple: ``(tokens, attn)`` where ``tokens`` is the list of token
        strings for the encoded input and ``attn`` is a NumPy array of
        attention weights.
    """
    encoded = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        result = model(**encoded, output_attentions=True)

    last_layer = result.attentions[-1]
    averaged = last_layer[0].mean(0)
    first_row = averaged[0]
    attn = first_row.cpu().numpy()

    tokens = tokenizer.convert_ids_to_tokens(encoded['input_ids'][0])
    return tokens, attn

# subword
def merge_subwords(tokens, scores):
    """Glue ``##``-prefixed sub-word tokens back into words.

    Tokens equal to ``[CLS]`` or ``[SEP]`` are dropped. A token starting
    with ``##`` is appended (without the prefix) to the word in progress;
    any other token starts a new word. Each merged word receives the mean
    of the scores of the pieces it was built from.

    Args:
        tokens: iterable of token strings.
        scores: iterable of per-token scores, aligned with ``tokens``.

    Returns:
        tuple: ``(words, word_scores)`` as two parallel lists.
    """
    merged_words = []
    merged_scores = []
    current_word = ""
    current_scores = []

    for piece, weight in zip(tokens, scores):
        if piece == '[CLS]' or piece == '[SEP]':
            continue

        if piece.startswith('##'):
            # Continuation piece: extend the word being assembled.
            current_word = current_word + piece[2:]
            current_scores.append(weight)
            continue

        # A new word begins, so emit the one that was pending (if any).
        if current_word:
            merged_words.append(current_word)
            merged_scores.append(np.mean(current_scores))
        current_word, current_scores = piece, [weight]

    # Emit the last pending word.
    if current_word:
        merged_words.append(current_word)
        merged_scores.append(np.mean(current_scores))

    return merged_words, merged_scores

def average_duplicate_keywords(word_score_list):
    """Collapse repeated words into one entry carrying their mean score.

    Args:
        word_score_list: iterable of ``(word, score)`` pairs; a word may
            appear several times.

    Returns:
        list: ``(word, mean_score)`` tuples with ``mean_score`` as a plain
        ``float``, ordered from highest to lowest score. Words with equal
        scores keep their first-seen order.
    """
    grouped = {}
    for term, value in word_score_list:
        grouped.setdefault(term, []).append(value)

    means = []
    for term, values in grouped.items():
        means.append((term, float(np.mean(values))))

    # list.sort is stable, so ties stay in first-seen order.
    means.sort(key=lambda pair: pair[1], reverse=True)
    return means

# all_word_scores_arr: array of all word-level attention scores.
# NOTE(review): all_word_scores_arr is not defined in the visible code; it
# has to be created by another cell before this one runs.
scores = all_word_scores_arr

# Cutoffs at the 5th, 10th, 15th and 20th percentiles of the scores,
# computed in a single call.
thr_5, thr_10, thr_15, thr_20 = np.percentile(scores, [5, 10, 15, 20])

print("5% threshold :", thr_5)
print("10% threshold:", thr_10)
print("15% threshold:", thr_15)
print("20% threshold:", thr_20)

def extract_keywords_for_threshold(words, scores, threshold):
    """Return the noun keywords whose score is at least *threshold*.

    Args:
        words: sequence of words.
        scores: sequence of scores aligned with ``words``.
        threshold: minimum score (inclusive) a word must reach to be kept.

    Returns:
        list: the surviving words with duplicates merged, in the order
        produced by ``average_duplicate_keywords`` (highest mean score first).
    """
    # Test the score first: it is a cheap numeric comparison, whereas
    # is_noun() runs the POS tagger. `and` short-circuits, so words below
    # the threshold are never tagged. The selected pairs are unchanged.
    pairs = [(w, s) for w, s in zip(words, scores)
             if s >= threshold and is_noun(w)]

    averaged = average_duplicate_keywords(pairs)

    return [w for w, _ in averaged]

# One output row per document: its abstract plus the keyword string
# obtained at each of the four thresholds.
rows = []

# Process the word/score records of each document together.
# NOTE(review): attn_detail_df is not defined in the visible code; it is
# used here as having 'id', 'word' and 'score' columns — confirm against
# the cell that builds it.
for doc_id, group in attn_detail_df.groupby('id'):
    words  = group['word'].tolist()
    # Rebinds the module-level name `scores` that the percentile cell used.
    scores = group['score'].tolist()

    # Document text: first train2 row whose id matches.
    abstract = train2.loc[train2['id'] == doc_id, 'text'].iloc[0]

    # Keyword lists at the 5/10/15/20 percentile cutoffs.
    kw_5  = extract_keywords_for_threshold(words, scores, thr_5)
    kw_10 = extract_keywords_for_threshold(words, scores, thr_10)
    kw_15 = extract_keywords_for_threshold(words, scores, thr_15)
    kw_20 = extract_keywords_for_threshold(words, scores, thr_20)

    # Keywords are stored as comma-separated strings.
    rows.append({
        "id": doc_id,
        "abstract": abstract,
        "kw_5":  ", ".join(kw_5),
        "kw_10": ", ".join(kw_10),
        "kw_15": ", ".join(kw_15),
        "kw_20": ", ".join(kw_20)
    })

# Collect the rows into a table and write it out without the row index.
result_df = pd.DataFrame(rows)
result_df.to_csv('attn_5_10_15_20.csv', index=False)

#5% threshold : 0.0009138039429672062
#10% threshold: 0.001269497605971992
#15% threshold: 0.0015857178019359707
#20% threshold: 0.0018903562799096106

In [None]:
# Minimum word-level attention score for a keyword. The literal matches the
# "10% threshold" value recorded in the commented output of the threshold cell.
ATTN_SCORE_THRESHOLD = 0.001269497605971992

attn_results = []
for _, row in tqdm(train2.iterrows(), total=len(train2)):
    doc_id = row['id']
    text = row['text']

    # Token-level attention, then merge sub-word pieces into whole words.
    tokens, attn_scores = get_token_attention(text)
    merged_words, merged_scores = merge_subwords(tokens, attn_scores)

    scored = list(zip(merged_words, merged_scores))

    # Keep nouns only. The cheap score comparison runs first so the POS
    # tagger behind is_noun() is skipped for words below the threshold.
    noun_filtered = [(w, s) for w, s in scored
                     if s >= ATTN_SCORE_THRESHOLD and is_noun(w)]

    # Average the scores of repeated words. The result is already sorted
    # from highest to lowest score, so no second sort is needed.
    averaged = average_duplicate_keywords(noun_filtered)

    attn_results.append({
        "id": doc_id,
        "keywords": [w for w, _ in averaged]
    })
attn_df = pd.DataFrame(attn_results)
attn_df.head()
attn_df.to_csv('attn_df.csv', index=False)

# Extract salient keywords

In [None]:
# Reload the per-document keywords from disk.
attn_df = pd.read_csv('attn_df.csv')

# Attach each document's 'source' label from train2 (left join on 'id').
attn_df_merged = pd.merge(attn_df, train2[['id', 'source']], on='id', how='left')

In [None]:
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag

# ----------------------------
# NLTK
# ----------------------------
# Fetch the NLTK resources used by this cell.
for _resource in (
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'stopwords',
    'wordnet',
    'omw-1.4',
):
    nltk.download(_resource)

# ----------------------------
# base setting
# ----------------------------
# English stopword set and the lemmatizer shared by the functions below.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# ----------------------------
# POS mapping
# ----------------------------
def get_wordnet_pos(tag):
    """Map a POS tag string to the matching WordNet POS constant.

    Only the tag's first letter is inspected: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Anything else,
    including an empty tag, falls back to noun.
    """
    by_initial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return by_initial.get(tag[:1], wordnet.NOUN)

# ----------------------------
# keyword preprocessing
# ----------------------------
def clean_keywords_pos(keywords_list):
    """Normalise raw keywords: strip non-letters, drop verbs, lemmatise.

    For each truthy entry the surrounding whitespace is trimmed, every
    character other than ASCII letters is removed and the result is
    lower-cased. Entries that end up with two characters or fewer are
    skipped. The remaining word is POS-tagged on its own; verbs are
    discarded and everything else is lemmatised with its mapped WordNet POS.

    Args:
        keywords_list: iterable of keyword strings (falsy items are ignored).

    Returns:
        list: the lemmatised keywords, in input order.
    """
    kept = []
    for raw in keywords_list:
        if not raw:
            continue

        letters_only = re.sub(r'[^a-zA-Z]', '', raw.strip()).lower()
        if len(letters_only) < 3:
            continue

        _token, tag = pos_tag([letters_only])[0]
        wordnet_pos = get_wordnet_pos(tag)
        if wordnet_pos != wordnet.VERB:
            kept.append(lemmatizer.lemmatize(letters_only, pos=wordnet_pos))
    return kept

# ----------------------------
# list processing
# ----------------------------
def preprocess_keywords(row):
    """Turn one 'keywords' cell into a cleaned keyword list.

    A list is used as-is; a string is split on commas and each part is
    stripped of surrounding whitespace. Any other type yields an empty
    list. The candidates are then passed to ``clean_keywords_pos``.
    """
    if isinstance(row, list):
        candidates = row
    elif isinstance(row, str):
        candidates = [part.strip() for part in row.split(',')]
    else:
        return []
    return clean_keywords_pos(candidates)


# Clean every document's keyword cell into a list of lemmatised keywords.
attn_df_merged['cleaned_keywords'] = attn_df_merged['keywords'].apply(preprocess_keywords)

# Split the documents by their 'source' label.
patent_df = attn_df_merged[attn_df_merged['source'] == 'patent']
startup_df = attn_df_merged[attn_df_merged['source'] == 'startup']

# Flatten the per-document keyword lists into one list per source.
patent_keywords_flat = [kw for kws in patent_df['cleaned_keywords'] for kw in kws]
startup_keywords_flat = [kw for kws in startup_df['cleaned_keywords'] for kw in kws]

# Occurrence count of each cleaned keyword within each source.
patent_counts = Counter(patent_keywords_flat)
startup_counts = Counter(startup_keywords_flat)

# top keywords filtering
def top_percent_keywords(counter_obj, percent=0.10):
    """Return the most frequent share of keywords as a set.

    Args:
        counter_obj: mapping of keyword -> count (anything with ``.items()``).
        percent: fraction of distinct keywords to keep, e.g. ``0.10``.

    Returns:
        set: the top ``int(len * percent)`` keywords by count, with a
        minimum of one slot (so an empty mapping still yields an empty set).
    """
    ranked = sorted(counter_obj.items(), key=lambda item: item[1], reverse=True)
    keep = int(len(ranked) * percent)
    if keep < 1:
        keep = 1
    return {term for term, _count in ranked[:keep]}

# Most frequent 10% of distinct keywords within each source.
patent_top_keywords = top_percent_keywords(patent_counts, percent=0.10)
startup_top_keywords = top_percent_keywords(startup_counts, percent=0.10)

# Working vocabulary: keywords that are frequent in either source.
combined_top_keywords = patent_top_keywords | startup_top_keywords

# ----------------------------
# final keyword
# ----------------------------
def filter_to_top_keywords(keywords, top_keywords_set):
    """Keep only the keywords that belong to *top_keywords_set*.

    Order and repetitions of the input are preserved.
    """
    retained = []
    for term in keywords:
        if term in top_keywords_set:
            retained.append(term)
    return retained

# Restrict each document's cleaned keywords to the combined top-keyword vocabulary.
attn_df_merged['final_keywords'] = attn_df_merged['cleaned_keywords'].apply(
    lambda kws: filter_to_top_keywords(kws, combined_top_keywords)
)

# ----------------------------
# doc-term matrix
# ----------------------------
# Binary matrix initialised to 0: one row per document id, one column per
# vocabulary keyword (columns in sorted order).
doc_term_matrix = pd.DataFrame(0, index=attn_df_merged['id'], columns=sorted(combined_top_keywords))

# Mark a 1 wherever a document contains the keyword.
# NOTE(review): .loc assigns to every row carrying that id, so duplicate ids
# in attn_df_merged would all be marked together — confirm ids are unique.
for idx, keywords in zip(attn_df_merged['id'], attn_df_merged['final_keywords']):
    for kw in keywords:
        doc_term_matrix.loc[idx, kw] = 1

# TF-IDF vectorization

Three-step stopword processing:
- 1st level: verbs and generic stopwords
- 2nd level: general business and concept terms
- 3rd level: general tech, attribute, and business terms

In [None]:
# Top keywords with NLTK's English stopwords (and empty strings) removed.
# NOTE(review): cleaned_keywords is not read again in the visible code.
cleaned_keywords = [word for word in combined_top_keywords if word.lower() not in stop_words and word]

# 1st level: verbs and other generic words.
# A few entries appear twice ('allow', 'contain'); harmless because the
# list is converted to a set by lemmatize_stopwords().
first_level_stopwords = [
    'use', 'allow', 'receive', 'contain', 'implement', 'combine', 'leverage', 'describe', 'learn', 'classify', 'simulate', 'self', 'mean', 'clean', 'aim',
    'quality', 'well', 'world', 'call', 'idea', 'post', 'form', 'back', 'subject', 'cover', 'allow', 'benefit', 'change', 'play', 'set', 'start',
    'com', 'future', 'claim', 'time', 'order', 'word', 'answer', 'multiple', 'entry', 'response', 'function', 'contact', 'instance', 'contain', 'cost'
]

# 2nd level: general business and concept terms.
revised_second_level_stopwords = [
    # general business term
    'channel', 'line', 'message', 'query', 'brand', 'creator', 'sector', 'account', 'client', 'growth', 'advisory',
    'industry', 'employee', 'insight', 'market', 'sale', 'marketing', 'business', 'manager', 'enterprise', 'partner', 'partnership',
    'community', 'management', 'policy', 'organisation', 'workforce', 'analytics', 'performance', 'confidence', 'scale', 'score', 'observation',
    'determination', 'cluster', 'frequency', 'deployment', 'procedure', 'production', 'google', 'workflow',

    # general concept
    'technique', 'development', 'behavior', 'operation', 'implementation', 'objective', 'state', 'base', 'ground', 'profile', 'anchor',
    'formation', 'arrangement', 'shape', 'measure', 'section', 'aspect', 'factor', 'creation', 'approach', 'characteristic', 'life',
]

# 3rd level: general tech, attribute and business terms. Some words repeat
# entries from the earlier levels.
final_stopwords = [
    # general tech term
    'system', 'component', 'implementation', 'technology', 'framework', 'structure', 'base', 'architecture',
    'feature', 'configuration', 'arrangement', 'mechanism', 'invention',

    # general attribute
    'pressure', 'heat', 'hollow', 'transparent', 'environment', 'fabric', 'color', 'position', 'distance', 'focus', 'quality', 'characteristic',
    'factor', 'aspect', 'attribute', 'part', 'portion', 'section', 'fluid', 'start',

    # business term
    'retailer', 'agency', 'government', 'participant', 'lift', 'food', 'operator', 'expert', 'building', 'turf', 'crop', 'pair', 'weather',
    'tree', 'branch', 'artist', 'fish', 'fixture', 'kit', 'gesture', 'shop', 'fund', 'investment', 'culture', 'leather', 'article', 'startup',
    'shell', 'insurance', 'decision', 'faster', 'market', 'business', 'industry', 'client', 'customer', 'campaign', 'advisory', 'employee', 'workforce', 'organisation', 'community', 'policy', 'benefit',
]

# ----------------------------

def lemmatize_stopwords(stopword_list):
    """Return the set of noun lemmas of the lower-cased stopwords.

    Each entry is lower-cased and lemmatised as a noun with the
    module-level ``lemmatizer``; duplicates collapse in the returned set.
    """
    lemmas = set()
    for entry in stopword_list:
        lemmas.add(lemmatizer.lemmatize(entry.lower(), wordnet.NOUN))
    return lemmas

# Lemmatised set version of each stopword level, used for membership tests
# in clean_keywords_final().
first_level_stopwords_lemma = lemmatize_stopwords(first_level_stopwords)
revised_second_level_stopwords_lemma = lemmatize_stopwords(revised_second_level_stopwords)
final_stopwords_lemma = lemmatize_stopwords(final_stopwords)
# NOTE(review): custom_stopwords is not defined in the visible code; it must
# be provided by another cell before this line runs.
custom_stopwords_lemma = lemmatize_stopwords(list(custom_stopwords))


def clean_keywords_final(keywords_list):
    """Lower-case the keywords and drop those found in any stopword level.

    Falsy entries are skipped. A keyword survives only if it is absent
    from the first-level, second-level, final and custom lemmatised
    stopword sets. Input order is preserved.
    """
    survivors = []
    for raw in keywords_list:
        if not raw:
            continue
        term = raw.lower()
        # Levels are checked in the same order as the staged filtering.
        if (term in first_level_stopwords_lemma
                or term in revised_second_level_stopwords_lemma
                or term in final_stopwords_lemma
                or term in custom_stopwords_lemma):
            continue
        survivors.append(term)
    return survivors

# ----------------------------

# Apply the three stopword levels (plus the custom set) to the top-keyword vocabulary.
final_cleaned_keywords = clean_keywords_final(list(combined_top_keywords))

# Additional hand-picked keywords to remove from the final vocabulary.
keywords_to_exclude = ['event', 'analysis', 'pipeline', 'startup', 'material', 'water', 'surface', 'composition', 'weight']
final_combined_keywords = [kw for kw in final_cleaned_keywords if kw not in keywords_to_exclude]

# ----------------------------

# Keyword -> frequency lookups for each source.
# NOTE(review): sorted_patent_keywords and sorted_startup_keywords are not
# defined in the visible code; presumably they are (keyword, count) pairs
# derived from patent_counts / startup_counts — confirm.
patent_keyword_freq = dict(sorted_patent_keywords)
startup_keyword_freq = dict(sorted_startup_keywords)

# Total frequency of each final keyword across both sources; a keyword
# missing from one source contributes 0 for it.
final_combined_freq = {}
for kw in final_combined_keywords:
    patent_freq = patent_keyword_freq.get(kw, 0)
    startup_freq = startup_keyword_freq.get(kw, 0)
    final_combined_freq[kw] = patent_freq + startup_freq

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def map_to_final_keywords(keywords, final_vocab_set):
    """Build a space-separated string of the keywords in the final vocabulary.

    Each keyword is lower-cased and lemmatised as a noun; only lemmas
    present in *final_vocab_set* are kept. Order and repetitions are
    preserved.
    """
    kept = []
    for raw in keywords:
        lemma = lemmatizer.lemmatize(raw.lower(), wordnet.NOUN)
        if lemma in final_vocab_set:
            kept.append(lemma)
    return ' '.join(kept)

# Set form of the final vocabulary for fast membership tests.
final_vocab_set = set(final_combined_keywords)

# One space-separated "document" per row, containing only final-vocabulary keywords.
attn_df_merged['tfidf_text'] = attn_df_merged['cleaned_keywords'].apply(
    lambda kws: map_to_final_keywords(kws, final_vocab_set)
)

# TF-IDF vectorizer restricted to the final keyword vocabulary.
vectorizer = TfidfVectorizer(vocabulary=final_combined_keywords)

# Fit on the keyword strings and compute the TF-IDF matrix.
tfidf_matrix = vectorizer.fit_transform(attn_df_merged['tfidf_text'])

# Convert to a dense DataFrame: rows indexed by document id, one column per keyword.
tfidf_df = pd.DataFrame(tfidf_matrix.toarray(),
                        columns=vectorizer.get_feature_names_out(),
                        index=attn_df_merged['id'])
tfidf_df

In [None]:
# Save the TF-IDF table.
# NOTE(review): index=False drops the index, which holds the document ids,
# so the saved rows cannot be matched back to documents — confirm this is intended.
tfidf_df.to_csv('tfidf_df.csv', index=False)

# Compute the mean TF-IDF value of each keyword across all documents.
mean_tfidf_per_keyword = tfidf_df.mean(axis=0).sort_values(ascending=False)

# Inspect the result (top 20 keywords).
print(mean_tfidf_per_keyword.head(20))

# Recorded output (truncated):
#device         0.103101
#processing     0.067296
#sensor         0.057862
#processor      0.056383
#image          0.044627 ...