In [None]:
from transformers import pipeline

In [None]:
# Create the keyword-extraction pipeline from its Hugging Face model id;
# no explicit task argument is passed.
extractor = pipeline(
    model="yanekyuk/bert-keyword-extractor",
)

In [None]:
# Load the input CSV into a DataFrame. The path is relative, so the file is
# looked up from the notebook's current working directory.
import pandas as pd

df = pd.read_csv("p&c_Healthcare.csv")

In [None]:
# Clean the text column: missing 'body' values are replaced with the literal
# string '0', then every value is cast to str so later steps always receive
# a string.
# NOTE(review): the '0' placeholder is itself sent through keyword extraction
# for rows that had no body — confirm that is intended.
df['body'] = df['body'].fillna('0').astype(str)

In [None]:
# Define a keyword extraction function
def extract_keywords(text):
    """Run the keyword-extraction pipeline on one piece of text.

    Parameters
    ----------
    text : str
        The text handed to the module-level ``extractor``.

    Returns
    -------
    list
        Whatever ``extractor(text)`` returns on success. If the call raises,
        the error is printed and an empty list is returned, so the result
        can always be iterated by later processing steps.
    """
    try:
        return extractor(text)
    except Exception as e:
        # Best-effort: report the failing text and carry on with other rows.
        print(f"Error extracting keywords for text: {text} with error {e}")
        # This path used to fall through and implicitly return None, which
        # made any code iterating over the result raise a TypeError.
        return []

In [None]:
# Call extract_keywords once per row on the 'body' text and store whatever it
# returns in a new 'keywords' column.
df['keywords'] = df['body'].apply(extract_keywords)

In [None]:
# Define a formatting function
def format_keywords(hf_output):
    """Reduce raw extractor output to a list of ``(word, score)`` tuples.

    Parameters
    ----------
    hf_output : iterable of dict, or None
        Entries that each carry a ``'word'`` and a ``'score'`` key. ``None``
        (or a float such as NaN) is accepted and treated as "no keywords".

    Returns
    -------
    list of tuple
        One ``(word, score)`` pair per entry, in the original order; an
        empty list when there is nothing to format.
    """
    # A row whose extraction failed holds None (possibly surfaced as NaN once
    # stored in a DataFrame column); iterating over it would raise TypeError
    # and abort the whole column-wise apply.
    if hf_output is None or isinstance(hf_output, float):
        return []
    return [(entry['word'], entry['score']) for entry in hf_output]

In [None]:
# Convert each row's raw extractor output into a list of (word, score) pairs.
df['formatted_keywords'] = df['keywords'].apply(format_keywords)

In [None]:
# To solve the problem introduced by subword tokenization
def merge_subwords(keywords):
    """Join '##'-prefixed continuation pieces onto the word before them.

    Parameters
    ----------
    keywords : iterable of (str, score)
        Word/score pairs in their original order.

    Returns
    -------
    list of tuple
        ``(word, score)`` pairs where each continuation piece has been
        appended (with its '##' removed) to the preceding word and the
        merged score is the maximum of the scores involved. A continuation
        piece with no preceding word is discarded.
    """
    merged = []

    for piece, score in keywords:
        if not piece.startswith('##'):
            # A fresh word opens a new [text, score] accumulator.
            merged.append([piece, score])
        elif merged:
            # Continuation: extend the most recent word and keep the best score.
            head = merged[-1]
            head[0] = head[0] + piece.replace('##', '')
            head[1] = max(head[1], score)
        # else: continuation with nothing to attach to — skipped.

    return [(text, best) for text, best in merged]

In [None]:
# Re-join '##'-prefixed subword pieces within each row's keyword list.
df['merged_keywords'] = df['formatted_keywords'].apply(merge_subwords)

In [None]:
# Pull the per-row keyword lists out of the DataFrame as a plain Python list
# (one inner list per row).
all_keywords = df['merged_keywords'].tolist()

In [None]:
from itertools import chain

# Flatten the per-row keyword lists into a single list of (word, score) pairs.
flat_list = list(chain.from_iterable(all_keywords))

In [None]:
def remove_duplicates(keywords):
    """Keep the highest score per keyword and rank the survivors.

    Parameters
    ----------
    keywords : iterable of (word, score)
        Pairs that may contain the same word more than once.

    Returns
    -------
    list of tuple
        One ``(word, score)`` pair per distinct word, carrying that word's
        highest score, sorted by score from highest to lowest. Words with
        equal scores keep their first-seen order.
    """
    best_score = {}
    for token, score in keywords:
        # Skip a repeat unless it strictly improves on the stored score.
        if token in best_score and not best_score[token] < score:
            continue
        best_score[token] = score

    ranked = list(best_score.items())
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked

# Collapse repeated keywords across all rows into a score-ranked list, then
# keep at most the first 100 entries of that ranking.
unique_keywords = remove_duplicates(flat_list)
top_100_keywords = unique_keywords[:100]


In [None]:
# Show the selected keywords as this cell's output.
top_100_keywords

In [None]:
import os

# Save the selected keywords as a two-column CSV ('Keyword', 'Score') without
# the index. The existence check means a file already at this path is left
# untouched, so re-running the notebook does not overwrite earlier results.
# NOTE(review): remove the old file first if refreshed output is wanted.
output_path = 'kw_Healthcare.csv'
if not os.path.exists(output_path):
    df_keywords = pd.DataFrame(top_100_keywords, columns=['Keyword', 'Score'])
    df_keywords.to_csv(output_path, index=False)