In [1]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import re
import nltk
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")

## Add keywords to the feature Matrix

In [2]:
# stem note text 
startMatrix = pd.read_csv('1_matrix_ICD_feature_training.csv')

def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

startMatrix['text'] = startMatrix['text'].apply(preprocess_text)

In [3]:
# Stem words in the regex template
before_phrase_words = [r"\bno\b\s*", "not", "n't", "absent", "h/o", "pmh", "negative", "history of", "hx of", "unlikely", "without", "lack", "deferred"]
after_phrase_words = ["deferred", "absent", "unlikely"]
stemmed_before_phrase_words = [stemmer.stem(word) for word in before_phrase_words]
stemmed_after_phrase_words = [stemmer.stem(word) for word in after_phrase_words]

# Create regex template with stemmed words
template = r"(?:(?:" + '|'.join(stemmed_before_phrase_words) + r")[^(.|,)]{{0,30}}{phrase})|(?:{phrase}\s*[^.]{{0,40}}(?:" + '|'.join(stemmed_after_phrase_words) + r"))"

# create the positive regexes and the negative regexes based on the csv of key words
def create_neg_regex_dict(phrases):
    
    # Function to process each phrase into a regex-compatible format
    def process_phrase(phrase):
        # Split the phrase into words and create a regex-friendly version
        words = phrase.split()
        processed_words = [f"{word}[a-z]*" for word in words]
        return r"\s*".join(processed_words)
    
    # Dictionary to store the regex patterns
    regex_dict = {}
    
    for phrase in phrases:
        processed_phrase = process_phrase(phrase)
        pattern = template.format(phrase=processed_phrase)
        regex_dict[phrase] = pattern
    
    return regex_dict

def create_pos_regex_dict(phrases):    
    # Function to process each phrase into a regex-compatible format
    def process_phrase(phrase):
        # Split the phrase into words and create a regex-friendly version
        words = phrase.split()
        processed_words = [f"{word}" for word in words]
        return r"\s*".join(processed_words)
    
    regex_dict = {}
    
    for phrase in phrases:
        processed_phrase = process_phrase(phrase)
        regex_dict[phrase] = processed_phrase
    
    return regex_dict

In [4]:
# Preprocess and stem keywords
def preprocess_keywords(keywords):
    stemmed_keywords = []
    for phrase in keywords:
        tokens = nltk.word_tokenize(phrase)
        stemmed_tokens = [stemmer.stem(token) for token in tokens]
        stemmed_keywords.append(' '.join(stemmed_tokens))
    return stemmed_keywords

keywords_df = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/featureMatrix/keywords.csv')
keywords_list = keywords_df['keywords'].tolist()
stemmed_keywords_list = preprocess_keywords(keywords_list)

neg_regex_dict = create_neg_regex_dict(stemmed_keywords_list)
pos_regex_dict = create_pos_regex_dict(stemmed_keywords_list)


In [5]:
print(neg_regex_dict)
print(pos_regex_dict)

{'acut onset headach': "(?:(?:\\bno\\b\\s*|not|n't|absent|h/o|pmh|negat|history of|hx of|unlik|without|lack|defer)[^(.|,)]{0,30}acut[a-z]*\\s*onset[a-z]*\\s*headach[a-z]*)|(?:acut[a-z]*\\s*onset[a-z]*\\s*headach[a-z]*\\s*[^.]{0,40}(?:defer|absent|unlik))", 'thunderclap headach': "(?:(?:\\bno\\b\\s*|not|n't|absent|h/o|pmh|negat|history of|hx of|unlik|without|lack|defer)[^(.|,)]{0,30}thunderclap[a-z]*\\s*headach[a-z]*)|(?:thunderclap[a-z]*\\s*headach[a-z]*\\s*[^.]{0,40}(?:defer|absent|unlik))", 'doubl vision': "(?:(?:\\bno\\b\\s*|not|n't|absent|h/o|pmh|negat|history of|hx of|unlik|without|lack|defer)[^(.|,)]{0,30}doubl[a-z]*\\s*vision[a-z]*)|(?:doubl[a-z]*\\s*vision[a-z]*\\s*[^.]{0,40}(?:defer|absent|unlik))", 'diplopia': "(?:(?:\\bno\\b\\s*|not|n't|absent|h/o|pmh|negat|history of|hx of|unlik|without|lack|defer)[^(.|,)]{0,30}diplopia[a-z]*)|(?:diplopia[a-z]*\\s*[^.]{0,40}(?:defer|absent|unlik))", 'headach': "(?:(?:\\bno\\b\\s*|not|n't|absent|h/o|pmh|negat|history of|hx of|unlik|without|l

In [6]:
# Function to check for the presence of patterns and determine the final occurrence
# def check_patterns(text, neg_regex_dict, pos_regex_dict):
#     pos_matches = {key: 0 for key in pos_regex_dict}
#     neg_matches = {key: 0 for key in neg_regex_dict}
    
#     for key in pos_matches:
#         pos_match = [m.end() for m in re.finditer(pos_regex_dict[key], text)]
#         latest_pos_match = max(pos_match, default=-1)
#         pos_matches[key] = 1 if latest_pos_match != -1 else 0
    
#     for key in neg_matches:
#         neg_match = [m.end() for m in re.finditer(neg_regex_dict[key], text)]
#         latest_neg_match = max(neg_match, default=-1)
#         neg_matches[key] = 1 if latest_neg_match != -1 else 0
    
#     return pos_matches, neg_matches


def check_patterns(text, neg_regex_dict, pos_regex_dict):
    pos_matches = {key: -1 for key in pos_regex_dict}
    neg_matches = {key: -1 for key in neg_regex_dict}
    
    for key in pos_matches:
        pos_match = [m.end() for m in re.finditer(pos_regex_dict[key], text)]
        pos_matches[key] = max(pos_match, default=-1)
    
    for key in neg_matches:
        neg_match = [m.end() for m in re.finditer(neg_regex_dict[key], text)]
        neg_matches[key] = max(neg_match, default=-1)
    
    final_matches = {}
    for key in pos_regex_dict:
        pos_last = pos_matches[key]
        neg_last = neg_matches.get(key, -1)
        
        if pos_last > neg_last:
            final_matches[key] = (1, 0)  # Positive match
        elif neg_last > pos_last:
            final_matches[key] = (0, 1)  # Negative match
        else:
            final_matches[key] = (0, 0)  # No match
    
    return final_matches


pattern_features = startMatrix['text'].apply(lambda note: check_patterns(note, neg_regex_dict, pos_regex_dict))

# Separate the results into positive and negative pattern columns
# pos_pattern_columns = pd.DataFrame([x[0] for x in pattern_features])
# neg_pattern_columns = pd.DataFrame([x[1] for x in pattern_features])

pos_pattern_columns = pd.DataFrame([{key: value[0] for key, value in patterns.items()} for patterns in pattern_features])
neg_pattern_columns = pd.DataFrame([{key: value[1] for key, value in patterns.items()} for patterns in pattern_features])
neg_pattern_columns = neg_pattern_columns.rename(columns=lambda x: 'neg_' + x)

# Concatenate the original DataFrame with the positive and negative pattern columns
matrix = pd.concat([startMatrix, pos_pattern_columns, neg_pattern_columns], axis=1)

# Display the result
print(len(matrix))
matrix.head()

KeyboardInterrupt: 

In [7]:
infoMatrix = matrix.iloc[:, :5]
infoMatrix.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital
0,115883980,2018-04-14,Notes_13414009311_1956375582_20180414.txt,physician * * * * * * * * * * admit date : * *...,MGB
1,116483510,2022-05-12,Notes_13622415618_8165687294_20220512.txt,physician * * * * * * * * * * admit date : * *...,MGB
2,150009858,2013-08-29,Notes_1129868316_2608699887_20130829.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC
3,150009896,2022-02-04,Notes_1129868625_2609280946_20220204.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC
4,116023288,2022-01-03,Notes_13528991310_6365444126_20220103.txt,* * * * * * * * * * * * * * * medic psychiatri...,MGB


In [8]:

totals = matrix.iloc[:, 5:].sum()
print(totals[125:155])

strike                        90
fall                         308
fell                         287
trauma                       400
alter mental status          183
neg_acut onset headach         0
neg_thunderclap headach        0
neg_doubl vision               0
neg_diplopia                   0
neg_headach                    2
neg_sudden                     0
neg_neck pain                  0
neg_stiff neck                 0
neg_nausea                     0
neg_vomit                      4
neg_dizzi                      2
neg_photophobia                0
neg_sever                      1
neg_loss of conscious          0
neg_seizur                     5
neg_facial droop               0
neg_aphasia                    1
neg_paraphas error             0
neg_worst headach of life      0
neg_visual chang               0
neg_fascicul                   0
neg_pronat drift               0
neg_asymmetri                  0
neg_nystagmus                  0
neg_facial strength            0
dtype: int

In [41]:
# # Function to check for the presence of patterns and determine the final occurrence
# def check_patterns(text, neg_regex_dict, pos_regex_dict):
#     matches = {key: 0 for key in set(neg_regex_dict) | set(pos_regex_dict)}  # Initialize matches dictionary with 0
#     for key in matches:
#         neg_match = [m.end() for m in re.finditer(neg_regex_dict.get(key, ''), text)]
#         pos_match = [m.end() for m in re.finditer(pos_regex_dict.get(key, ''), text)]
#         # Determine the latest match between negated and non-negated patterns
#         latest_neg_match = max(neg_match, default=-1)
#         latest_pos_match = max(pos_match, default=-1)
#         # Final assignment logic
#         if latest_neg_match != -1 and (latest_pos_match == -1 or latest_neg_match >= latest_pos_match):
#             matches[key] = -1
#         elif latest_pos_match != -1:
#             matches[key] = 1
#     return matches

# pattern_features = startMatrix['text'].apply(lambda note: check_patterns(note, neg_regex_dict, pos_regex_dict))
# pattern_columns = pd.DataFrame(pattern_features.tolist())
# result = pd.concat([startMatrix, pattern_columns], axis=1)

# # Display the result
# print(len(result))
# result.head()

In [9]:
# Define the threshold
threshold = 10

# Filter out features that do not meet the threshold
features_to_keep = totals[totals > threshold].index

# Update the result DataFrame to include only the filtered features
filtered_result = matrix[features_to_keep]

print(len(filtered_result))
final_results = pd.concat([infoMatrix, filtered_result], axis=1)
final_results.head()

1548


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,hospital,annot,ICD,acut onset headach,doubl vision,diplopia,...,neg_numb,neg_am,neg_cta,neg_neuro,neg_av,neg_iph,neg_auto,neg_car,neg_loc,neg_trauma
0,115883980,2018-04-14,Notes_13414009311_1956375582_20180414.txt,physician * * * * * * * * * * admit date : * *...,MGB,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,116483510,2022-05-12,Notes_13622415618_8165687294_20220512.txt,physician * * * * * * * * * * admit date : * *...,MGB,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,150009858,2013-08-29,Notes_1129868316_2608699887_20130829.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,150009896,2022-02-04,Notes_1129868625_2609280946_20220204.txt,note date : * * * * * / * * * * * / * * * * * ...,BIDMC,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,116023288,2022-01-03,Notes_13528991310_6365444126_20220103.txt,* * * * * * * * * * * * * * * medic psychiatri...,MGB,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0


In [11]:
final_results.to_csv('2_full_matrix_latest.csv')