In [1]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import re
import nltk
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")

## Add keywords to the feature Matrix

In [56]:
# stem note text 
startMatrix = pd.read_csv('1_matrix_ICD_feature_training.csv')

def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

startMatrix['text'] = startMatrix['text'].apply(preprocess_text)

In [57]:
# Stem words in the regex template
before_phrase_words = [r"\bno\b", r"\bnot\b", "n't", "absent", "h/o", "pmh", "negative", "history of", "hx of", "unlikely",
                        "without", "lack", "deferred", "recent", "prior", "concern", "c/f", "possible", "denies"]
after_phrase_words = ["deferred", "absent", "unlikely", "negative"] #add more here
stemmed_before_phrase_words = [stemmer.stem(word) for word in before_phrase_words]
stemmed_after_phrase_words = [stemmer.stem(word) for word in after_phrase_words]

# Create regex template with stemmed words
template = r"(?:(?:" + '|'.join(stemmed_before_phrase_words) + r")[^(.|,)]{{0,40}}{phrase})|(?:{phrase}\s*[^.]{{0,50}}(?:" + '|'.join(stemmed_after_phrase_words) + r"))"

# create the positive regexes and the negative regexes based on the csv of key words
def create_neg_regex_dict(phrases, special_negations=None):
    if special_negations is None:
        special_negations = {}

    # Function to process each phrase into a regex-compatible format
    def process_phrase(phrase):
        # Split the phrase into words and create a regex-friendly version
        words = phrase.split()
        processed_words = [f"{word}[a-z]*" for word in words]
        return r"\s*".join(processed_words)
    
    # Dictionary to store the regex patterns
    regex_dict = {}
    
    for phrase in phrases:
        stemmed_phrase = ' '.join([stemmer.stem(word) for word in phrase.split()])
        combined_before_negations = stemmed_before_phrase_words.copy()
        combined_after_negations = stemmed_after_phrase_words.copy()
        if stemmed_phrase in special_negations:
            special_negations_stemmed = [stemmer.stem(word.lstrip(r'^')) if word.startswith(r'^') else stemmer.stem(word) for word in special_negations[stemmed_phrase]]
            combined_before_negations += special_negations_stemmed
            combined_after_negations += special_negations_stemmed
            pattern = (
                r"(?:(?:" + '|'.join(combined_before_negations) + r")[^(.|,)]{{0,40}}{phrase})|"
                r"(?:{phrase}\s*[^.]{{0,50}}(?:" + '|'.join(combined_after_negations) + r"))"
            ).format(phrase=process_phrase(phrase))
        else:
            pattern = template.format(phrase=process_phrase(phrase))
        regex_dict[phrase] = pattern
        # processed_phrase = process_phrase(phrase)
        # pattern = template.format(phrase=processed_phrase)
        # regex_dict[phrase] = pattern
    
    return regex_dict

def create_pos_regex_dict(phrases):    
    # Function to process each phrase into a regex-compatible format
    def process_phrase(phrase):
        # Split the phrase into words and create a regex-friendly version
        words = phrase.split()
        processed_words = [f"{word}" for word in words]
        return r"\s*".join(processed_words)
    
    regex_dict = {}
    
    for phrase in phrases:
        processed_phrase = process_phrase(phrase)
        regex_dict[phrase] = processed_phrase
    
    return regex_dict


In [58]:
# Preprocess and stem keywords
def preprocess_keywords(keywords):
    stemmed_keywords = []
    for phrase in keywords:
        tokens = nltk.word_tokenize(phrase)
        stemmed_tokens = [stemmer.stem(token) if token != r'\b' else token for token in tokens]
        stemmed_keywords.append(' '.join(stemmed_tokens))
    return stemmed_keywords

keywords_df = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/featureMatrix/keywords.csv')
keywords_list = keywords_df['keywords'].tolist()
stemmed_keywords_list = preprocess_keywords(keywords_list)

# Define special negations for specific keywords
special_negations = {
    'sah': [r'^traumatic', 'trauma'],
    'subarachnoid hemorrhage': [r'^traumatic', 'trauma']
}

# Stem the special negations
special_negations_stemmed = {stemmer.stem(k): [r'^' + stemmer.stem(word.lstrip(r'^')) if word.startswith(r'^') else stemmer.stem(word) for word in v]
    for k, v in special_negations.items()}

neg_regex_dict = create_neg_regex_dict(stemmed_keywords_list, special_negations=special_negations_stemmed)
pos_regex_dict = create_pos_regex_dict(stemmed_keywords_list)


In [59]:
print(neg_regex_dict['diplopia'])
print(pos_regex_dict)

(?:(?:\bno\b|\bnot\b|n't|absent|h/o|pmh|negat|history of|hx of|unlik|without|lack|defer|recent|prior|concern|c/f|possibl|deni)[^(.|,)]{0,40}diplopia[a-z]*)|(?:diplopia[a-z]*\s*[^.]{0,50}(?:defer|absent|unlik|negat))
{'acut onset headach': 'acut\\s*onset\\s*headach', 'thunderclap headach': 'thunderclap\\s*headach', 'headach': 'headach', 'doubl vision': 'doubl\\s*vision', 'diplopia': 'diplopia', 'sudden': 'sudden', 'neck pain': 'neck\\s*pain', 'stiff neck': 'stiff\\s*neck', 'nausea': 'nausea', 'vomit': 'vomit', 'dizzi': 'dizzi', 'photophobia': 'photophobia', 'sever': 'sever', 'loss of conscious': 'loss\\s*of\\s*conscious', 'seizur': 'seizur', 'facial droop': 'facial\\s*droop', 'aphasia': 'aphasia', 'paraphas error': 'paraphas\\s*error', 'worst headach of life': 'worst\\s*headach\\s*of\\s*life', 'visual chang': 'visual\\s*chang', 'fascicul': 'fascicul', 'pronat drift': 'pronat\\s*drift', 'asymmetri': 'asymmetri', 'nystagmus': 'nystagmus', 'facial strength': 'facial\\s*strength', 'basilar 

In [60]:
# Any occurrence in note
def check_patterns(text, neg_regex_dict, pos_regex_dict):
    pos_matches = {key: 0 for key in pos_regex_dict}
    neg_matches = {key: 0 for key in neg_regex_dict}
    
    # Find negated matches
    neg_positions = []
    for key in neg_matches:
        for match in re.finditer(neg_regex_dict[key], text):
            neg_matches[key] = 1
            neg_positions.append(match.span())
    
    # Find positive matches outside negation context
    for key in pos_matches:
        for match in re.finditer(pos_regex_dict[key], text):
            start, end = match.span()
            if not any(start <= neg_end and end >= neg_start for neg_start, neg_end in neg_positions):
                pos_matches[key] = 1
    
    return pos_matches, neg_matches

# # Latest occurrence in note (Whether it's positive or negative)
# def check_patterns(text, neg_regex_dict, pos_regex_dict):
#     pos_matches = {key: -1 for key in pos_regex_dict}
#     neg_matches = {key: -1 for key in neg_regex_dict}
    
#     for key in pos_matches:
#         pos_match = [m.end() for m in re.finditer(pos_regex_dict[key], text)]
#         pos_matches[key] = max(pos_match, default=-1)
    
#     for key in neg_matches:
#         neg_match = [m.end() for m in re.finditer(neg_regex_dict[key], text)]
#         neg_matches[key] = max(neg_match, default=-1)
    
#     final_matches = {}
#     for key in pos_regex_dict:
#         pos_last = pos_matches[key]
#         neg_last = neg_matches.get(key, -1)
        
#         if pos_last > neg_last:
#             final_matches[key] = (1, 0)  # Positive match
#         elif neg_last > pos_last:
#             final_matches[key] = (0, 1)  # Negative match
#         else:
#             final_matches[key] = (0, 0)  # No match
    
#     return final_matches


pattern_features = startMatrix['text'].apply(lambda note: check_patterns(note, neg_regex_dict, pos_regex_dict))

## for any occurrence version
# Separate the results into positive and negative pattern columns
pos_pattern_columns = pd.DataFrame([x[0] for x in pattern_features])
neg_pattern_columns = pd.DataFrame([x[1] for x in pattern_features])

## for latest occurrence version
# pos_pattern_columns = pd.DataFrame([{key: value[0] for key, value in patterns.items()} for patterns in pattern_features])
# neg_pattern_columns = pd.DataFrame([{key: value[1] for key, value in patterns.items()} for patterns in pattern_features])

neg_pattern_columns = neg_pattern_columns.rename(columns=lambda x: 'neg_' + x)

# Concatenate the original DataFrame with the positive and negative pattern columns
matrix = pd.concat([startMatrix, pos_pattern_columns, neg_pattern_columns], axis=1)

# Display the result
print(len(matrix))
matrix.head()

1548


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital,annot,ICD,acut onset headach,thunderclap headach,headach,...,neg_\btraumat sah,neg_parenchym hemorrhag,neg_intraventricular hemorrhag,neg_subdur,neg_tbi,neg_traumat brain injuri,neg_multicompartment,neg_neuro icu,neg_confus,neg_syncop
0,117914592,2019-11-27,Notes_13462332123_3276403308_20191127.txt,physician * * * * * * * * * * admit date : * *...,MGB,0,1,0,0,1,...,0,0,0,0,0,0,0,0,0,0
1,115191777,2021-12-17,Notes_13583675062_7277203706_20211217.txt,physician * * * * * * * * * * admit date : * *...,MGB,1,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,150181780,2010-05-16,Notes_1130040255_206508829_20100516.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,150032978,2019-12-20,Notes_1129891556_2836307028_20191220.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,1,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0
4,122398778,2024-07-06,Notes_13786647359_10264430345_20240706.txt,physician * * * * * * * * * * admit date : * *...,MGB,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [61]:
infoMatrix = matrix.iloc[:, :5]
infoMatrix.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital
0,117914592,2019-11-27,Notes_13462332123_3276403308_20191127.txt,physician * * * * * * * * * * admit date : * *...,MGB
1,115191777,2021-12-17,Notes_13583675062_7277203706_20211217.txt,physician * * * * * * * * * * admit date : * *...,MGB
2,150181780,2010-05-16,Notes_1130040255_206508829_20100516.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC
3,150032978,2019-12-20,Notes_1129891556_2836307028_20191220.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC
4,122398778,2024-07-06,Notes_13786647359_10264430345_20240706.txt,physician * * * * * * * * * * admit date : * *...,MGB


In [62]:

totals = matrix.iloc[:, 5:].sum()
print(totals[65:85])

reble                        5
ischem stroke               55
vasospasm                  189
hydrocephalus              192
cerebr edema                63
hypertens                  501
htn                        544
subarachnoid hemorrhag     456
intracrani hemorrhag        82
brain bleed                 43
sah                        539
diffus sah                  68
cerebrospin fluid           54
csf                        157
neuro                     1360
seizur prophylaxi          135
labetalol                  155
perimesencephal             22
subarachnoid               490
bifurc                      56
dtype: int64


In [63]:
# Define the threshold
threshold = 10

# Filter out features that do not meet the threshold
features_to_keep = totals[totals > threshold].index

# Update the result DataFrame to include only the filtered features
filtered_result = matrix[features_to_keep]

print(len(filtered_result))
final_results = pd.concat([infoMatrix, filtered_result], axis=1)
final_results.head()

1548


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital,annot,ICD,headach,doubl vision,diplopia,...,neg_trauma,neg_alter mental status,neg_am,neg_ivh,neg_parenchym hemorrhag,neg_intraventricular hemorrhag,neg_subdur,neg_tbi,neg_confus,neg_syncop
0,117914592,2019-11-27,Notes_13462332123_3276403308_20191127.txt,physician * * * * * * * * * * admit date : * *...,MGB,0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,0
1,115191777,2021-12-17,Notes_13583675062_7277203706_20211217.txt,physician * * * * * * * * * * admit date : * *...,MGB,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,150181780,2010-05-16,Notes_1130040255_206508829_20100516.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,150032978,2019-12-20,Notes_1129891556_2836307028_20191220.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,1,1,1,0,0,...,0,0,1,0,0,1,0,0,0,0
4,122398778,2024-07-06,Notes_13786647359_10264430345_20240706.txt,physician * * * * * * * * * * admit date : * *...,MGB,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [65]:
final_results.to_csv('2_full_matrix.csv')

Now I'll take out the ICD column to make a matrix with just keyword features

In [5]:
full_matrix = pd.read_csv('2_full_matrix.csv', index_col=0)
full_matrix.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital,annot,ICD,headach,doubl vision,diplopia,...,neg_trauma,neg_alter mental status,neg_am,neg_ivh,neg_parenchym hemorrhag,neg_intraventricular hemorrhag,neg_subdur,neg_tbi,neg_confus,neg_syncop
0,117914592,2019-11-27,Notes_13462332123_3276403308_20191127.txt,physician * * * * * * * * * * admit date : * *...,MGB,0,1,1,0,0,...,0,0,1,0,0,0,0,0,0,0
1,115191777,2021-12-17,Notes_13583675062_7277203706_20211217.txt,physician * * * * * * * * * * admit date : * *...,MGB,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,150181780,2010-05-16,Notes_1130040255_206508829_20100516.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,150032978,2019-12-20,Notes_1129891556_2836307028_20191220.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,1,1,1,0,0,...,0,0,1,0,0,1,0,0,0,0
4,122398778,2024-07-06,Notes_13786647359_10264430345_20240706.txt,physician * * * * * * * * * * admit date : * *...,MGB,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [6]:
keyword_matrix = full_matrix.drop(columns=['ICD'])
keyword_matrix.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital,annot,headach,doubl vision,diplopia,sudden,...,neg_trauma,neg_alter mental status,neg_am,neg_ivh,neg_parenchym hemorrhag,neg_intraventricular hemorrhag,neg_subdur,neg_tbi,neg_confus,neg_syncop
0,117914592,2019-11-27,Notes_13462332123_3276403308_20191127.txt,physician * * * * * * * * * * admit date : * *...,MGB,0,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,115191777,2021-12-17,Notes_13583675062_7277203706_20211217.txt,physician * * * * * * * * * * admit date : * *...,MGB,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
2,150181780,2010-05-16,Notes_1130040255_206508829_20100516.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,150032978,2019-12-20,Notes_1129891556_2836307028_20191220.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,1,1,0,0,0,...,0,0,1,0,0,1,0,0,0,0
4,122398778,2024-07-06,Notes_13786647359_10264430345_20240706.txt,physician * * * * * * * * * * admit date : * *...,MGB,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [7]:
keyword_matrix.to_csv('2_keywords_only_matrix.csv')

Now I want to create a list of features that will remain for the testing set.

In [8]:
keyword_matrix = keyword_matrix.drop(columns=['BDSPPatientID', 'NoteDate', 'NoteTitle', 'text', 'hospital', 'annot'])
keyword_matrix.head()

Unnamed: 0,headach,doubl vision,diplopia,sudden,neck pain,nausea,vomit,dizzi,photophobia,sever,...,neg_trauma,neg_alter mental status,neg_am,neg_ivh,neg_parenchym hemorrhag,neg_intraventricular hemorrhag,neg_subdur,neg_tbi,neg_confus,neg_syncop
0,1,0,0,0,0,1,1,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,1,0,0,0,0,1,0,0
2,0,0,0,1,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,1,1,0,1,0,1,...,0,0,1,0,0,1,0,0,0,0
4,0,0,0,0,0,1,1,0,0,1,...,0,0,1,0,0,0,0,0,0,0


In [11]:
features = keyword_matrix.columns.to_list()

keptFeatures = pd.DataFrame(features, columns=['features'])

print(len(keptFeatures))
keptFeatures.head()

201


Unnamed: 0,features
0,headach
1,doubl vision
2,diplopia
3,sudden
4,neck pain


In [10]:
keptFeatures.to_csv('features_list.csv')