In [2]:
import json
import pandas as pd
import nltk
from nltk.stem.snowball import SnowballStemmer

In [4]:
# Ensure the tokenizer data used by nltk.word_tokenize is installed.
# 'punkt' covers older NLTK releases; recent releases (3.9+) load the
# 'punkt_tab' resource instead and raise LookupError when it is missing,
# so both are fetched to keep this notebook runnable on a fresh environment.
nltk.download('punkt')
nltk.download('punkt_tab')

# Keyword phrases grouped by single-letter category.
# Later cells export column 'a' as pos_keywords, 'b' as neg_keywords and
# 'c' as focal_keywords; 'm' is stemmed and saved with the rest but is not
# split out separately in the visible cells.
regexes = {
    # 'm': subarachnoid hemorrhage terms
    "m": [r"SAH", r"subarachnoid hemorrhage", r"subarachnoid"],
    # 'a': exported later as pos_keywords
    "a": [
        r"coil", r"clip", r"nimodipine", r"stroke", r"nontraumatic",
        r"aneurysm", r"hemorrhage", r"neck", r"xanthochromia", r"csf",
        r"brain bleed", r"bleed", r"rupture",
        r"hunt hess", r"hunt", r"hess",
        r"modified fisher", r"fisher",
        r"avm", r"headache", r"mca", r"vasospasm", r"keppra", r"h&h",
        r"worst headache of life", r"caa", r"rcvs",
    ],
    # 'b': exported later as neg_keywords
    "b": [
        r"trauma", r"no hemorrhage", r"fell", r"fall", r"hit", r"tbi",
        r"without hemorrhage", r"w/o hemorrhage", r"head strike",
        r"No evidence of intracranial hemorrhage",
        r"no evidence of intracranial mass, hemorrhage or acute infarction",
        r"No intracranial aneurysm",
        r"skull", r"fracture", r"LOC", r"vehicle", r"motorcycle", r"stair",
        r"car", r"accident", r"struck", r"auto", r"IPH", r"SDH",
        r"mechanical", r"scattered",
    ],
    # 'c': exported later as focal_keywords
    "c": [
        r"admission diagnosis", r"principal problem", r"surgeries",
        r"etiology", r"discharge instructions", r"hospital course",
        r"indication:", r"found to have", r"presented with",
        r"presents with", r"secondary to", r"ct head", r"cta",
    ],
}

# English Snowball stemmer, built once and reused by stem_phrases
stemmer = SnowballStemmer("english")


def stem_phrases(phrases):
    """Return a new list holding the stemmed form of every phrase.

    Each phrase is split into tokens with ``nltk.word_tokenize``, every
    token is passed through the module-level ``stemmer``, and the stemmed
    tokens are glued back together with single spaces. The input order is
    preserved and the input list is not modified.

    Args:
        phrases: iterable of phrase strings.

    Returns:
        list of str, one stemmed phrase per input phrase.
    """
    # NOTE(review): the tokenizer presumably emits punctuation as separate
    # tokens, so a phrase such as "indication:" or "h&h" would come back
    # with spaces around the punctuation after the join -- confirm that the
    # text these keywords are matched against is normalised the same way.
    return [
        ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(text))
        for text in phrases
    ]

# Run each category's phrase list through stem_phrases, keeping the category keys
stemmed_data = dict(zip(regexes, map(stem_phrases, regexes.values())))

# One column per category: building with orient='index' and then flipping
# the frame lets the categories have different lengths, with the shorter
# columns padded by missing values
stemmed_data_df = pd.DataFrame.from_dict(stemmed_data, orient='index').T

# Persist the full table without the row index
stemmed_data_df.to_csv('stemmed_phrases.csv', index=False)

# Preview the first rows
stemmed_data_df.head()



[nltk_data] Downloading package punkt to /home/jsearle/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Unnamed: 0,m,a,b,c
0,sah,coil,trauma,admiss diagnosi
1,subarachnoid hemorrhag,clip,no hemorrhag,princip problem
2,subarachnoid,nimodipin,fell,surgeri
3,,stroke,fall,etiolog
4,,nontraumat,hit,discharg instruct


In [7]:
# Columns 'a', 'b' and 'c' hold the positive, negative and focal keywords;
# dropna() strips the padding added when the columns were aligned
pos_keywords, neg_keywords, focal_keywords = (
    stemmed_data_df[column].dropna().tolist() for column in ('a', 'b', 'c')
)

# Wrap each keyword list in its own single-column frame
pos_keywords_df = pd.DataFrame(data=pos_keywords, columns=['pos_keywords'])
neg_keywords_df = pd.DataFrame(data=neg_keywords, columns=['neg_keywords'])
focal_keywords_df = pd.DataFrame(data=focal_keywords, columns=['focal_keywords'])

# Preview the positive keywords
pos_keywords_df.head()

Unnamed: 0,pos_keywords
0,coil
1,clip
2,nimodipin
3,stroke
4,nontraumat


In [8]:
# Write each keyword frame to its own CSV file, omitting the row index
for _frame, _csv_path in (
    (pos_keywords_df, 'pos_keywords.csv'),
    (neg_keywords_df, 'neg_keywords.csv'),
    (focal_keywords_df, 'focal_keywords.csv'),
):
    _frame.to_csv(_csv_path, index=False)