In [116]:
from transformers import pipeline

In [117]:
# Build the transformers pipeline from the model id alone (no explicit task
# argument is passed).
extractor = pipeline(
    model="yanekyuk/bert-keyword-extractor",
)

In [168]:
# Load the source CSV into a DataFrame
import pandas as pd

df = pd.read_csv(filepath_or_buffer="p&c_Healthcare.csv")

In [169]:
# Normalise the text column: missing bodies become the placeholder '0',
# then every value is coerced to str
df['body'] = df['body'].fillna(value='0').astype(dtype=str)

In [170]:
# Define a keyword extraction function
def extract_keywords(text):
    """Run the keyword-extraction pipeline on one piece of text.

    Args:
        text: The value handed to the module-level ``extractor``.

    Returns:
        Whatever ``extractor(text)`` returns on success. If the call raises,
        the error is printed and an empty list is returned.
    """
    try:
        return extractor(text)
    except Exception as e:
        # Best-effort: report the failure and carry on with the other rows.
        print(f"Error extracting keywords for text: {text} with error {e}")
        # Return an empty list rather than an implicit None so callers
        # always receive something they can iterate over.
        return []

In [171]:
# Run keyword extraction over every row's body text
df['keywords'] = df['body'].apply(func=extract_keywords)

In [172]:
# Define a formatting function
def format_keywords(hf_output):
    """Reduce pipeline output entries to ``(word, score)`` tuples.

    Args:
        hf_output: Iterable of mappings that each have ``'word'`` and
            ``'score'`` keys, or ``None``.

    Returns:
        A list of ``(word, score)`` tuples in input order; an empty list when
        ``hf_output`` is ``None`` or empty.
    """
    # None means no result is available for this row; map it to an empty
    # list rather than raising TypeError on iteration.
    if hf_output is None:
        return []
    return [(entry['word'], entry['score']) for entry in hf_output]

In [173]:
# Convert each row's raw pipeline output to (word, score) tuples
df['formatted_keywords'] = df['keywords'].apply(func=format_keywords)

In [174]:
# Solve the problem caused by subword tokenization
def merge_subwords(keywords):
    """Glue '##'-prefixed fragments onto the word that precedes them.

    Args:
        keywords: Iterable of ``(word, score)`` pairs.

    Returns:
        A list of ``(word, score)`` tuples. Each word that does not start
        with '##' opens a new entry; each following '##' fragment is appended
        to that entry's text (with every '##' removed) and the entry keeps
        the larger of the two scores. A '##' fragment that appears before
        any regular word is discarded.
    """
    result = []

    for token, confidence in keywords:
        is_fragment = token.startswith('##')

        if not is_fragment:
            # A regular word always starts a fresh entry.
            result.append((token, confidence))
        elif result:
            # Extend the most recent entry with this fragment.
            head, head_confidence = result[-1]
            result[-1] = (
                head + token.replace('##', ''),
                max(head_confidence, confidence),
            )
        # else: fragment with nothing before it -- nothing to attach to.

    return result

In [175]:
# Rejoin subword fragments in each row's keyword list
df['merged_keywords'] = df['formatted_keywords'].apply(func=merge_subwords)

In [176]:
# Pull the per-row keyword lists out of the DataFrame as a plain Python list
all_keywords = df['merged_keywords'].to_list()

In [177]:
from itertools import chain

# Flatten the per-row keyword lists into one list
flat_list = list(chain.from_iterable(all_keywords))

In [178]:
def remove_duplicates(keywords, case_sensitive=True):
    """Collapse repeated keywords, keeping the highest score for each.

    Args:
        keywords: Iterable of ``(word, score)`` pairs.
        case_sensitive: When True (the default), words are compared exactly,
            so 'Apnea' and 'apnea' are separate entries. When False, words
            that differ only by case are treated as one keyword and the
            spelling of the highest-scoring occurrence is kept.

    Returns:
        A list of ``(word, score)`` tuples sorted by score, highest first.
        Entries with equal scores keep the order in which their keyword was
        first seen.
    """
    best = {}
    for word, score in keywords:
        key = word if case_sensitive else word.casefold()
        # Strict '<' means the first occurrence wins when scores tie.
        if key not in best or best[key][1] < score:
            best[key] = (word, score)
    return sorted(best.values(), key=lambda item: item[1], reverse=True)

# Deduplicate the flattened keywords, then keep the first 100 of the sorted result
unique_keywords = remove_duplicates(keywords=flat_list)
top_100_keywords = unique_keywords[0:100]


In [179]:
# Show the selected keywords as the cell output
top_100_keywords

[('AdventHealth', 0.9999807),
 ('apnea', 0.9999771),
 ('echocardiogram', 0.99997365),
 ('Telemynd', 0.9999734),
 ('Armodafinil', 0.99997306),
 ('GlaxoSmithKlinelovent', 0.99997306),
 ('Apnea', 0.99996984),
 ('Reddit', 0.99996924),
 ('cryptocurrency', 0.9999689),
 ('CareFirst', 0.9999685),
 ('chiropractor', 0.99996805),
 ('Sleep', 0.9999678),
 ('amyloidosis', 0.9999677),
 ('syndrome', 0.9999676),
 ('QuickBlox', 0.9999676),
 ('Google', 0.99996734),
 ('telehealth', 0.99996686),
 ('electrocardiogram', 0.9999664),
 ('methotrexate', 0.99996626),
 ('allergy', 0.99996614),
 ('Aetna', 0.9999658),
 ('colitis', 0.9999654),
 ('Teladoc', 0.9999647),
 ('phlebotomy', 0.9999645),
 ('ulcerative', 0.99996436),
 ('Glow', 0.9999641),
 ('google', 0.99996376),
 ('obese', 0.99996316),
 ('HIPAA', 0.99996305),
 ('zocdoc', 0.99996305),
 ('bipolar', 0.9999629),
 ('ELECTROcardiogram', 0.9999627),
 ('Power', 0.99996257),
 ('paxil', 0.999962),
 ('telemedicine', 0.99996185),
 ('Parenthood', 0.99996173),
 ('WellCare'

In [180]:
import os

output_path = 'kw_Healthcare.csv'
# Write the keyword table only when nothing exists at output_path yet;
# an existing file is left as it is.
if not os.path.exists(output_path):
    df_keywords = pd.DataFrame(
        data=top_100_keywords,
        columns=['Keyword', 'Score'],
    )
    df_keywords.to_csv(path_or_buf=output_path, index=False)