In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm

# Build a Hugging Face `transformers` pipeline from the
# "yanekyuk/bert-keyword-extractor" checkpoint; it is called once per row below.
extractor = pipeline(model="yanekyuk/bert-keyword-extractor")

# NOTE(review): the CSV path is a blank placeholder ("   ") — set it before running.
df = pd.read_csv("   ")

# Replace missing bodies with the literal string '0' and force every value to str
# so the extractor always receives text.
# NOTE(review): the '0' placeholder rows are still sent through the extractor —
# confirm that is intended rather than skipping empty bodies.
df['body'] = df['body'].fillna('0').astype(str)

In [None]:
def extract_keywords(text):
    """Run the module-level ``extractor`` on ``text``, tolerating failures.

    Args:
        text: The string handed to ``extractor``.

    Returns:
        Whatever ``extractor(text)`` returns, or an empty list when the call
        raises any ``Exception`` (the failure is reported on stdout).
    """
    result = []
    try:
        result = extractor(text)
    except Exception as e:
        # Best effort: report the failing input and fall through to the empty result.
        print(f"Error extracting keywords for text: {text} with error {e}")
    return result

In [None]:
# Register tqdm's `progress_apply` on pandas objects so the row-by-row
# extraction below reports progress.
tqdm.pandas() 
# One extractor call per row; each cell holds the extractor output, or [] when
# extract_keywords caught an exception for that row.
df['keywords'] = df['body'].progress_apply(extract_keywords)

print("Keywords extraction completed.")

In [None]:
def format_keywords(hf_output):
    """Reduce extractor output entries to ``(word, score)`` tuples.

    Args:
        hf_output: Iterable of mappings, each providing the keys ``'word'``
            and ``'score'``.

    Returns:
        list[tuple]: One ``(word, score)`` pair per entry, in input order.

    Raises:
        KeyError: If an entry lacks ``'word'`` or ``'score'``.
    """
    pairs = []
    for item in hf_output:
        pairs.append((item['word'], item['score']))
    return pairs

In [None]:
# Reduce each row's raw extractor output to a list of (word, score) pairs.
df['formatted_keywords'] = df['keywords'].apply(format_keywords)

In [None]:
# Work around the problem introduced by subword tokenization: rejoin '##' word pieces into whole words.
def merge_subwords(keywords):
    """Rejoin ``##``-prefixed word pieces with the keyword that precedes them.

    Args:
        keywords: Iterable of ``(word, score)`` pairs in sequence order.

    Returns:
        list[tuple]: ``(word, score)`` pairs in which every ``##`` piece has
        been appended (with ``##`` removed) to the preceding keyword. The
        merged keyword's score is the maximum of its pieces' scores. A ``##``
        piece that appears before any regular word is dropped.

    NOTE(review): pieces are joined purely by list order; this assumes a
    ``##`` piece always continues the entry immediately before it — confirm
    against the extractor's output.
    """
    merged_keywords = []

    for token, confidence in keywords:
        if not token.startswith('##'):
            # A regular word opens a new keyword; it stays last in the list
            # so later continuation pieces can extend it.
            merged_keywords.append((token, confidence))
            continue

        if not merged_keywords:
            # Continuation piece with nothing to attach to: skip it.
            continue

        head, head_score = merged_keywords[-1]
        merged_keywords[-1] = (
            head + token.replace('##', ''),
            max(head_score, confidence),
        )

    return merged_keywords

In [None]:
# Rejoin '##' word pieces within each row's (word, score) list.
df['merged_keywords'] = df['formatted_keywords'].apply(merge_subwords)

In [None]:
# Plain Python list with one entry per row: that row's merged keyword list.
all_keywords = df['merged_keywords'].tolist()

In [None]:
from itertools import chain

# Concatenate the per-row keyword lists into one flat list of (word, score) pairs.
flat_list = list(chain.from_iterable(all_keywords))

In [None]:
import re
from collections import Counter

# Function words (plus the stray fragments 'w' and 't') excluded from the
# frequency table. Every entry must be lower case: is_valid_word looks up
# word.lower(), so an upper-case entry could never match.
stop_words = {
    'a', 'an', 'and', 'the', 'of', 'in', 'to', 'is', 'it', 'that', 'on', 'for',
    'with', 'as', 'by', 'at', 'from', 'this', 'be', 'or', 'which', 'but', 'are',
    'was', 'were', 'not', 'have', 'has', 'had', 'will', 'would', 'can', 'could',
    'should', 'shall', 'may', 'might', 'must', 'do', 'does', 'did', 'w', 't',
}


def is_valid_word(word):
    """Tell whether ``word`` should be counted as a keyword.

    Args:
        word: Candidate keyword string.

    Returns:
        bool: ``True`` when ``word`` is non-empty, consists only of ASCII
        letters and digits, and its lower-cased form is not in
        ``stop_words``; ``False`` otherwise.
    """
    # fullmatch instead of match(r'^...$'): `$` also matches just before a
    # trailing newline, which would let a newline-terminated word through.
    if re.fullmatch(r'[A-Za-z0-9]+', word) is None:
        return False
    return word.lower() not in stop_words

def count_frequency(keywords):
    """Count how often each valid keyword occurs.

    Args:
        keywords: Iterable of ``(word, score)`` pairs; the score is ignored.

    Returns:
        list[tuple]: ``(word, count)`` pairs for the words accepted by
        ``is_valid_word``, sorted by count in descending order. Words with
        equal counts keep the order in which they were first seen.
    """
    tally = Counter()
    for word, _score in keywords:
        if is_valid_word(word):
            tally[word] += 1
    # most_common() with no argument returns every item, highest count first,
    # with ties in first-seen order.
    return tally.most_common()

# (word, count) pairs over every keyword collected from all rows, most frequent first.
unique_keywords = count_frequency(flat_list)
# Keep the 50 most frequent entries (fewer if the table is shorter).
top_50_keywords = unique_keywords[:50]


In [None]:
import os

# NOTE(review): blank placeholder — set the destination CSV path before running.
output_path = '   '
# Write only when the file does not exist yet; an existing file is left
# untouched and nothing is reported, so a re-run will not refresh it.
if not os.path.exists(output_path):
    # NOTE(review): the second column is labelled 'Score' but holds the
    # occurrence count produced by count_frequency, not a model score.
    df_keywords = pd.DataFrame(top_50_keywords, columns=['Keyword', 'Score'])
    df_keywords.to_csv(output_path, index=False)