In [8]:
import pandas as pd
import re
import nltk
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")

## Add keywords to the feature Matrix

In [9]:
# stem note text 
startMatrix = pd.read_csv('1_matrix_ICD_feature_training.csv')

def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

startMatrix['text'] = startMatrix['text'].apply(preprocess_text)

In [10]:
# Stem words in the regex template
before_phrase_words = [r"\bno\b", r"\bnot\b", "n't", "absent", "h/o", "pmh", "negative", "history of", "hx of", "unlikely",
                        "without", "lack", "deferred", "recent", "prior", "concern for", "c/f", "possible", "denies"]
after_phrase_words = ["deferred", "absent", "unlikely", "negative"] #add more here
stemmed_before_phrase_words = [stemmer.stem(word) for word in before_phrase_words]
stemmed_after_phrase_words = [stemmer.stem(word) for word in after_phrase_words]

# Create regex template with stemmed words
template = r"(?:(?:" + '|'.join(stemmed_before_phrase_words) + r")[^(.|,)]{{0,40}}{phrase})|(?:{phrase}\s*[^.]{{0,50}}(?:" + '|'.join(stemmed_after_phrase_words) + r"))"

# create the positive regexes and the negative regexes based on the csv of key words
def create_neg_regex_dict(phrases, special_negations=None):
    if special_negations is None:
        special_negations = {}

    # Function to process each phrase into a regex-compatible format
    def process_phrase(phrase):
        # Split the phrase into words and create a regex-friendly version
        words = phrase.split()
        processed_words = [f"{word}[a-z]*" for word in words]
        return r"\s*".join(processed_words)
    
    # Dictionary to store the regex patterns
    regex_dict = {}
    
    for phrase in phrases:
        stemmed_phrase = ' '.join([stemmer.stem(word) for word in phrase.split()])
        combined_before_negations = stemmed_before_phrase_words.copy()
        combined_after_negations = stemmed_after_phrase_words.copy()
        if stemmed_phrase in special_negations:
            special_negations_stemmed = [stemmer.stem(word.lstrip(r'^')) if word.startswith(r'^') else stemmer.stem(word) for word in special_negations[stemmed_phrase]]
            combined_before_negations += special_negations_stemmed
            combined_after_negations += special_negations_stemmed
            pattern = (
                r"(?:(?:" + '|'.join(combined_before_negations) + r")[^(.|,)]{{0,40}}{phrase})|"
                r"(?:{phrase}\s*[^.]{{0,50}}(?:" + '|'.join(combined_after_negations) + r"))"
            ).format(phrase=process_phrase(phrase))
        else:
            pattern = template.format(phrase=process_phrase(phrase))
        regex_dict[phrase] = pattern
        # processed_phrase = process_phrase(phrase)
        # pattern = template.format(phrase=processed_phrase)
        # regex_dict[phrase] = pattern
    
    return regex_dict

def create_pos_regex_dict(phrases):    
    # Function to process each phrase into a regex-compatible format
    def process_phrase(phrase):
        # Split the phrase into words and create a regex-friendly version
        words = phrase.split()
        processed_words = [f"{word}" for word in words]
        return r"\s*".join(processed_words)
    
    regex_dict = {}
    
    for phrase in phrases:
        processed_phrase = process_phrase(phrase)
        regex_dict[phrase] = processed_phrase
    
    return regex_dict


In [12]:
# Preprocess and stem keywords
def preprocess_keywords(keywords):
    stemmed_keywords = []
    for phrase in keywords:
        tokens = nltk.word_tokenize(phrase)
        stemmed_tokens = [stemmer.stem(token) if token != r'\b' else token for token in tokens]
        stemmed_keywords.append(' '.join(stemmed_tokens))
    return stemmed_keywords

keywords_df = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/featureMatrix/keptKeywords.csv')
keywords_list = keywords_df['keptKeywords'].tolist()
stemmed_keywords_list = preprocess_keywords(keywords_list)

# Define special negations for specific keywords
special_negations = {
    'sah': [r'^traumatic', 'trauma'],
    'subarachnoid hemorrhage': [r'^traumatic', 'trauma']
}

# Stem the special negations
special_negations_stemmed = {stemmer.stem(k): [r'^' + stemmer.stem(word.lstrip(r'^')) if word.startswith(r'^') else stemmer.stem(word) for word in v]
    for k, v in special_negations.items()}

neg_regex_dict = create_neg_regex_dict(stemmed_keywords_list, special_negations=special_negations_stemmed)
pos_regex_dict = create_pos_regex_dict(stemmed_keywords_list)


In [14]:
print(neg_regex_dict['diplopia'])
print(pos_regex_dict)
print(len(pos_regex_dict))

(?:(?:\bno\b|\bnot\b|n't|absent|h/o|pmh|negat|history of|hx of|unlik|without|lack|defer|recent|prior|concern for|c/f|possibl|deni)[^(.|,)]{0,40}diplopia[a-z]*)|(?:diplopia[a-z]*\s*[^.]{0,50}(?:defer|absent|unlik|negat))
{'headach': 'headach', 'doubl vision': 'doubl\\s*vision', 'diplopia': 'diplopia', 'sudden': 'sudden', 'neck pain': 'neck\\s*pain', 'nausea': 'nausea', 'vomit': 'vomit', 'dizzi': 'dizzi', 'photophobia': 'photophobia', 'sever': 'sever', 'loss of conscious': 'loss\\s*of\\s*conscious', 'seizur': 'seizur', 'facial droop': 'facial\\s*droop', 'aphasia': 'aphasia', 'parapha error': 'parapha\\s*error', 'worst headach of life': 'worst\\s*headach\\s*of\\s*life', 'visual chang': 'visual\\s*chang', 'fascicul': 'fascicul', 'pronat drift': 'pronat\\s*drift', 'asymmetri': 'asymmetri', 'nystagmus': 'nystagmus', 'facial strength': 'facial\\s*strength', 'aneurysm': 'aneurysm', 'cerebr': 'cerebr', 'numb': 'numb', 'tingl': 'tingl', 'weak': 'weak', 'letharg': 'letharg', 'ptosi': 'ptosi', 'he

In [15]:
# Any occurrence in note
def check_patterns(text, neg_regex_dict, pos_regex_dict):
    pos_matches = {key: 0 for key in pos_regex_dict}
    neg_matches = {key: 0 for key in neg_regex_dict}
    
    # Find negated matches
    neg_positions = []
    for key in neg_matches:
        for match in re.finditer(neg_regex_dict[key], text):
            neg_matches[key] = 1
            neg_positions.append(match.span())
    
    # Find positive matches outside negation context
    for key in pos_matches:
        for match in re.finditer(pos_regex_dict[key], text):
            start, end = match.span()
            if not any(start <= neg_end and end >= neg_start for neg_start, neg_end in neg_positions):
                pos_matches[key] = 1
    
    return pos_matches, neg_matches

pattern_features = startMatrix['text'].apply(lambda note: check_patterns(note, neg_regex_dict, pos_regex_dict))

## for any occurrence version
# Separate the results into positive and negative pattern columns
pos_pattern_columns = pd.DataFrame([x[0] for x in pattern_features])
neg_pattern_columns = pd.DataFrame([x[1] for x in pattern_features])

neg_pattern_columns = neg_pattern_columns.rename(columns=lambda x: 'neg_' + x)

# Concatenate the original DataFrame with the positive and negative pattern columns
matrix = pd.concat([startMatrix, pos_pattern_columns, neg_pattern_columns], axis=1)

# Display the result
print(len(matrix))
matrix.head()

1548


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital,annot,ICD,headach,doubl vision,diplopia,...,neg_am,neg_ivh,neg_parenchym hemorrhag,neg_intraventricular hemorrhag,neg_subdur,neg_tbi,neg_traumat brain injuri,neg_neuro icu,neg_confus,neg_syncop
0,115883980,2018-04-14,Notes_13414009311_1956375582_20180414.txt,physician * * * * * * * * * * admit date : * *...,MGB,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,116483510,2022-05-12,Notes_13622415618_8165687294_20220512.txt,physician * * * * * * * * * * admit date : * *...,MGB,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,150009858,2013-08-29,Notes_1129868316_2608699887_20130829.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,150009896,2022-02-04,Notes_1129868625_2609280946_20220204.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,116023288,2022-01-03,Notes_13528991310_6365444126_20220103.txt,* * * * * * * * * * * * * * * medic psychiatri...,MGB,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [16]:
infoMatrix = matrix.iloc[:, :5]
infoMatrix.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital
0,115883980,2018-04-14,Notes_13414009311_1956375582_20180414.txt,physician * * * * * * * * * * admit date : * *...,MGB
1,116483510,2022-05-12,Notes_13622415618_8165687294_20220512.txt,physician * * * * * * * * * * admit date : * *...,MGB
2,150009858,2013-08-29,Notes_1129868316_2608699887_20130829.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC
3,150009896,2022-02-04,Notes_1129868625_2609280946_20220204.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC
4,116023288,2022-01-03,Notes_13528991310_6365444126_20220103.txt,* * * * * * * * * * * * * * * medic psychiatri...,MGB


In [17]:

totals = matrix.iloc[:, 5:].sum()
print(totals[65:85])

brain bleed            41
sah                   560
diffus sah             64
cerebrospin fluid      54
csf                   178
neuro                1368
seizur prophylaxi     128
labetalol             129
perimesenceph          12
subarachnoid          498
bifurc                 60
acom                   18
pcom                   27
intracrani            271
scatter               123
multi compart          23
hemorrhag             730
nimodipin             216
neck                  671
bleed                 503
dtype: int64


In [18]:
matrix.to_csv('2_full_matrix.csv')

Creating a matrix with just keyword features.  No ICD.

In [20]:
full_matrix = pd.read_csv('2_full_matrix.csv', index_col=0)
full_matrix.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital,annot,ICD,headach,doubl vision,diplopia,...,neg_am,neg_ivh,neg_parenchym hemorrhag,neg_intraventricular hemorrhag,neg_subdur,neg_tbi,neg_traumat brain injuri,neg_neuro icu,neg_confus,neg_syncop
0,115883980,2018-04-14,Notes_13414009311_1956375582_20180414.txt,physician * * * * * * * * * * admit date : * *...,MGB,1,1,1,0,0,...,1,0,0,0,0,0,0,0,0,0
1,116483510,2022-05-12,Notes_13622415618_8165687294_20220512.txt,physician * * * * * * * * * * admit date : * *...,MGB,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,150009858,2013-08-29,Notes_1129868316_2608699887_20130829.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,150009896,2022-02-04,Notes_1129868625_2609280946_20220204.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,116023288,2022-01-03,Notes_13528991310_6365444126_20220103.txt,* * * * * * * * * * * * * * * medic psychiatri...,MGB,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0


In [21]:
keyword_matrix = full_matrix.drop(columns=['ICD'])
keyword_matrix.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital,annot,headach,doubl vision,diplopia,sudden,...,neg_am,neg_ivh,neg_parenchym hemorrhag,neg_intraventricular hemorrhag,neg_subdur,neg_tbi,neg_traumat brain injuri,neg_neuro icu,neg_confus,neg_syncop
0,115883980,2018-04-14,Notes_13414009311_1956375582_20180414.txt,physician * * * * * * * * * * admit date : * *...,MGB,1,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,116483510,2022-05-12,Notes_13622415618_8165687294_20220512.txt,physician * * * * * * * * * * admit date : * *...,MGB,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,150009858,2013-08-29,Notes_1129868316_2608699887_20130829.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,150009896,2022-02-04,Notes_1129868625_2609280946_20220204.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,116023288,2022-01-03,Notes_13528991310_6365444126_20220103.txt,* * * * * * * * * * * * * * * medic psychiatri...,MGB,0,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [22]:
keyword_matrix.to_csv('2_keyword_only_matrix.csv')