In [10]:
import pandas as pd
from thunderpack import ThunderReader
from tqdm import tqdm
import re
import nltk
import csv
from nltk.stem import SnowballStemmer
stemmer = SnowballStemmer("english")

## Add keywords to the feature Matrix

In [11]:
# stem note text 
startMatrix = pd.read_csv('matrix_ICD_feature_training.csv')

def preprocess_text(text):
    tokens = nltk.word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

startMatrix['text'] = startMatrix['text'].apply(preprocess_text)

In [12]:
# create the positive regexes and the negative regexes based on the csv of key words
def create_neg_regex_dict(phrases):
    # Define the template for regex patterns
    template = r"(?:(?:no|not|n't|absent|h/o|pmh|negative|history of|unlikely|without|lack|deferred)[^(.|,)]{{0,30}}{phrase})|(?:{phrase}\s*[^.]{{0,80}}(deferred|absent|unlikely))"
    
    # Function to process each phrase into a regex-compatible format
    def process_phrase(phrase):
        # Split the phrase into words and create a regex-friendly version
        words = phrase.split()
        processed_words = [f"{word}[a-z]*" for word in words]
        return r"\s*".join(processed_words)
    
    # Dictionary to store the regex patterns
    regex_dict = {}
    
    for phrase in phrases:
        processed_phrase = process_phrase(phrase)
        pattern = template.format(phrase=processed_phrase)
        regex_dict[phrase] = pattern
    
    return regex_dict

def create_pos_regex_dict(phrases):    
    # Function to process each phrase into a regex-compatible format
    def process_phrase(phrase):
        # Split the phrase into words and create a regex-friendly version
        words = phrase.split()
        processed_words = [f"{word}" for word in words]
        return r"\s*".join(processed_words)
    
    regex_dict = {}
    
    for phrase in phrases:
        processed_phrase = process_phrase(phrase)
        regex_dict[phrase] = processed_phrase
    
    return regex_dict

In [13]:
keywords_df = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/featureMatrix/keywords/keywords.csv')
keywords_list = keywords_df['keywords'].tolist()

neg_regex_dict = create_neg_regex_dict(keywords_list)
pos_regex_dict = create_pos_regex_dict(keywords_list)


In [14]:
print(neg_regex_dict)

{'acute onset headache': "(?:(?:no|not|n't|absent|h/o|pmh|negative|history of|unlikely|without|lack|deferred)[^(.|,)]{0,30}acute[a-z]*\\s*onset[a-z]*\\s*headache[a-z]*)|(?:acute[a-z]*\\s*onset[a-z]*\\s*headache[a-z]*\\s*[^.]{0,80}(deferred|absent|unlikely))", 'thunderclap headache': "(?:(?:no|not|n't|absent|h/o|pmh|negative|history of|unlikely|without|lack|deferred)[^(.|,)]{0,30}thunderclap[a-z]*\\s*headache[a-z]*)|(?:thunderclap[a-z]*\\s*headache[a-z]*\\s*[^.]{0,80}(deferred|absent|unlikely))", 'double vision': "(?:(?:no|not|n't|absent|h/o|pmh|negative|history of|unlikely|without|lack|deferred)[^(.|,)]{0,30}double[a-z]*\\s*vision[a-z]*)|(?:double[a-z]*\\s*vision[a-z]*\\s*[^.]{0,80}(deferred|absent|unlikely))", 'diplopia': "(?:(?:no|not|n't|absent|h/o|pmh|negative|history of|unlikely|without|lack|deferred)[^(.|,)]{0,30}diplopia[a-z]*)|(?:diplopia[a-z]*\\s*[^.]{0,80}(deferred|absent|unlikely))", 'headache': "(?:(?:no|not|n't|absent|h/o|pmh|negative|history of|unlikely|without|lack|defer

In [15]:
# Function to check for the presence of patterns and determine the final occurrence
def check_patterns(text, neg_regex_dict, pos_regex_dict):
    matches = {key: 0 for key in set(neg_regex_dict) | set(pos_regex_dict)}  # Initialize matches dictionary with 0
    for key in matches:
        neg_match = [m.end() for m in re.finditer(neg_regex_dict.get(key, ''), text)]
        pos_match = [m.end() for m in re.finditer(pos_regex_dict.get(key, ''), text)]
        # Determine the latest match between negated and non-negated patterns
        latest_neg_match = max(neg_match, default=-1)
        latest_pos_match = max(pos_match, default=-1)
        # Final assignment logic
        if latest_neg_match != -1 and (latest_pos_match == -1 or latest_neg_match >= latest_pos_match):
            matches[key] = -1
        elif latest_pos_match != -1:
            matches[key] = 1
    return matches

pattern_features = startMatrix['text'].apply(lambda note: check_patterns(note, neg_regex_dict, pos_regex_dict))
pattern_columns = pd.DataFrame(pattern_features.tolist())
result = pd.concat([startMatrix, pattern_columns], axis=1)

print(len(result))
result.head()

1548


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,text,annot,ICD,r pstosis,fasciculation,cta,ischemic stroke,...,lethargic,subarachnoid,paraphasic errors,xanthochromia,rebleed,neuro,hemorrhage,nontraumatic,artery,facial droop
0,117032881,2021-09-29,Notes_13689094716_7824448998_20210929.txt,physician * * * * * * * * * * admit date : * *...,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,120402560,2021-09-01,Notes_13598482458_7493034182_20210901.txt,physician * * * * * * * * * * admit date : * *...,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
2,111454037,2023-11-15,Notes_13666481048_10665439216_20231115.txt,discharg summari name : * * * * * * * * * * * ...,0,0,0,0,1,0,...,0,1,0,0,0,1,0,0,0,0
3,121582882,2018-10-10,Notes_13329742924_1945106714_20181010.txt,physician * * * * * * * * * * admit date : * *...,0,0,0,0,1,0,...,0,0,0,0,0,-1,0,0,0,0
4,111678728,2016-12-29,Notes_13278714866_1420969733_20161229.txt,physician * * * * * * * * * * admit date : * *...,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [16]:
result.to_csv('2_keyword_matrix.csv')