In [210]:
import pandas as pd
import re
import nltk
import csv
from nltk.stem import SnowballStemmer
# Shared English Snowball stemmer; preprocess_text (defined further down) uses it to
# stem every token of a note.
stemmer = SnowballStemmer("english")

In [236]:
# Build the base frame of the feature matrix: one row per annotated note.
# Cohort of notes that were used in annotation.
cohort = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/code/test&trainCohorts/combined_cohort_train.csv')

# Annotations exported from the annotation tool; only the patient key and the label are kept.
# NOTE(review): `annotations` is loaded here but never joined onto `matrix`, while the
# saved training matrix contains an 'annot' column - confirm where that join happens.
annotations = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/annotations/full_annotations_final.csv')
annotations = annotations[['empi', 'annot']]

# Patient id, note date and note text for every cohort note.
# rename() returns an independent frame, so later cells can add columns to `matrix`
# without SettingWithCopyWarning, and it exposes the date under 'Note Date', the
# column name every later cell reads.
matrix = cohort[['BDSPPatientID', 'NoteDate', 'text']].rename(columns={'NoteDate': 'Note Date'})


In [None]:
# MGB: load the ICD records of the cohort patients. A later cell compares each
# record's date with the note date (+- 6 months).
ICDs = pd.read_csv('/home/cdac-c-15/Desktop/NAXCA/Cohort Creation Files/allICDCodesMGB.csv')
# .copy() detaches the filtered rows so the column added below does not raise
# SettingWithCopyWarning.
ICDs = ICDs[ICDs['BDSPPatientID'].isin(set(matrix['BDSPPatientID']))].copy()
# Keep the calendar date only (time of day set to midnight).
# NOTE(review): assumes ShiftedContactDTS parses to timezone-naive timestamps - confirm.
ICDs['ICD Date'] = pd.to_datetime(ICDs['ShiftedContactDTS']).dt.normalize()
ICDs = ICDs[['BDSPPatientID', 'ICD Date']]
ICDs.head()

In [None]:
# BIDMC: load the MIMIC-III diagnosis codes, attach each admission's admit/discharge
# times, and keep only the rows whose code starts with 427.5 / 4275 or I46.
# A later cell compares the admission window with the note date (+- 6 months).
ICDs = pd.read_csv('/home/cdac-c-15/Desktop/mimic-iii-clinical-database-1.4/DIAGNOSES_ICD.csv.gz', compression='gzip')
dates = pd.read_csv('/home/cdac-c-15/Desktop/mimic-iii-clinical-database-1.4/ADMISSIONS.csv')
ICDs = ICDs.merge(dates[['HADM_ID', 'ADMITTIME', 'DISCHTIME']], on='HADM_ID', how='left')
ICDs = ICDs[['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE', 'ADMITTIME', 'DISCHTIME']]
ICDs = ICDs.rename(columns={'SUBJECT_ID': 'BDSPPatientID', 'DISCHTIME': 'Disch Date','ADMITTIME': 'Admit Date'})
# Non-capturing group: str.contains warns when a pattern has capture groups.
# .copy() detaches the filtered rows so later cells can convert the date columns in place.
ICDs = ICDs[ICDs['ICD9_CODE'].str.contains(r'^(?:427\.?5|I46)', regex=True, case=False, na=False)].copy()
print(len(ICDs))
ICDs.head()

In [None]:
# MGB: set matrix['ICD'] to 1 for every note that has an ICD record of the same
# patient dated within +- 6 months of that note's date, else 0.
# The flag is decided per note: a patient's other notes do not influence it.
matrix = matrix.rename(columns={'patient id': 'BDSPPatientID', 'note date': 'Note Date', 'NoteDate': 'Note Date'})
matrix['Note Date'] = pd.to_datetime(matrix['Note Date'])
ICDs['ICD Date'] = pd.to_datetime(ICDs['ICD Date'])

# Pair every note with every ICD record of the same patient. '_note_row' is the
# note's position in `matrix`, used to map hits back to the individual note.
notes = matrix[['BDSPPatientID', 'Note Date']].assign(_note_row=range(len(matrix)))
merged_df = pd.merge(notes, ICDs[['BDSPPatientID', 'ICD Date']], on='BDSPPatientID')

# Inclusive window check, vectorised over all note/record pairs (missing dates never match).
six_months = pd.DateOffset(months=6)
merged_df['ICD'] = merged_df['Note Date'].between(
    merged_df['ICD Date'] - six_months,
    merged_df['ICD Date'] + six_months)

# A note is flagged when at least one of its pairs falls inside the window.
flagged_rows = merged_df.loc[merged_df['ICD'], '_note_row']
matrix['ICD'] = pd.Series(range(len(matrix))).isin(flagged_rows).astype(int).to_numpy()

print(len(matrix))

In [None]:
# BIDMC: set matrix['ICD'] to 1 for every note whose date lies between 6 months
# before the admit date and 6 months after the discharge date of one of the same
# patient's rows in `ICDs`, else 0.
# The flag is decided per note: a patient's other notes do not influence it.
matrix = matrix.rename(columns={'NoteDate': 'Note Date'})
matrix['Note Date'] = pd.to_datetime(matrix['Note Date'])
ICDs['Admit Date'] = pd.to_datetime(ICDs['Admit Date'])
ICDs['Disch Date'] = pd.to_datetime(ICDs['Disch Date'])

# Pair every note with every ICD row of the same patient. '_note_row' is the
# note's position in `matrix`, used to map hits back to the individual note.
notes = matrix[['BDSPPatientID', 'Note Date']].assign(_note_row=range(len(matrix)))
merged_df = pd.merge(notes, ICDs[['BDSPPatientID', 'Admit Date', 'Disch Date']], on='BDSPPatientID')

# Inclusive window check, vectorised over all note/admission pairs (missing dates never match).
six_months = pd.DateOffset(months=6)
merged_df['ICD'] = (
    (merged_df['Note Date'] >= merged_df['Admit Date'] - six_months)
    & (merged_df['Note Date'] <= merged_df['Disch Date'] + six_months))

# A note is flagged when at least one of its pairs falls inside the window.
flagged_rows = merged_df.loc[merged_df['ICD'], '_note_row']
matrix['ICD'] = pd.Series(range(len(matrix))).isin(flagged_rows).astype(int).to_numpy()

print(len(matrix))

In [None]:
# MGB: load the CPT records of the cohort patients and set matrix['MRI'], ['CT'] and
# ['PET'] to 1 for every note that has a CPT record of the matching code group dated
# within +- 6 months of that note's date, else 0.
CPTs = pd.read_csv('/home/cdac-c-15/Desktop/NAXCA/Cohort Creation Files/allEEGCodesMGB.csv')
# .copy() detaches the filtered rows so the column added below does not raise
# SettingWithCopyWarning.
CPTs = CPTs[CPTs['BDSPPatientID'].isin(set(matrix['BDSPPatientID']))].copy()
# Unparseable timestamps become NaT and can never fall inside a window.
CPTs['CPT Date'] = pd.to_datetime(CPTs['StartDTS'], errors='coerce')
CPTs = CPTs[['BDSPPatientID', 'CPT Date', 'CPT']]

matrix = matrix.rename(columns={'NoteDate': 'Note Date'})
matrix['Note Date'] = pd.to_datetime(matrix['Note Date'])

# One frame of CPT records per code group.
# NOTE(review): the codes are read from the EEG codes file; confirm that the
# MRI / CT / PET names describe the intended code groups.
MRI = CPTs[CPTs['CPT'].isin(list([95812,95813,95816,95819,95822]))]
CT = CPTs[CPTs['CPT'].isin(set([95718,95719,95720,95721,95722,95723,95724]))]
PET = CPTs[CPTs['CPT'].isin(set([95950,95951,95953,95957]))]


def _flag_notes_near_cpt(notes, cpt_rows, months=6):
    """Return a 0/1 array with one entry per row of `notes`.

    An entry is 1 when `cpt_rows` holds a record of the same BDSPPatientID whose
    'CPT Date' lies within +- `months` months (inclusive) of that row's 'Note Date'.
    """
    # '_note_row' is the note's position in `notes`, used to map hits back to the note.
    keyed = notes[['BDSPPatientID', 'Note Date']].assign(_note_row=range(len(notes)))
    paired = keyed.merge(cpt_rows[['BDSPPatientID', 'CPT Date']], on='BDSPPatientID')
    window = pd.DateOffset(months=months)
    in_window = paired['Note Date'].between(paired['CPT Date'] - window, paired['CPT Date'] + window)
    flagged_rows = paired.loc[in_window, '_note_row']
    return pd.Series(range(len(notes))).isin(flagged_rows).astype(int).to_numpy()


# Each flag is derived from its own code group's date-window result.
matrix['MRI'] = _flag_notes_near_cpt(matrix, MRI)
matrix['CT'] = _flag_notes_near_cpt(matrix, CT)
matrix['PET'] = _flag_notes_near_cpt(matrix, PET)

print(len(matrix))
matrix.head()

In [None]:
# BIDMC: set MRI / CT / PET to 1 when the note text mentions the abbreviation
# (optionally prefixed with 'c', e.g. 'cMRI'), case-insensitively, else 0.
# \b on both sides restricts the match to whole tokens, so letters inside ordinary
# words (the 'ct' in 'fracture', the 'pet' in 'petechiae') do not set a flag.
# Longer variants such as 'CTA' or plurals are not matched; extend the patterns if
# they should count.
matrix['MRI'] = matrix['text'].str.contains(r'\bc?MRI\b', regex=True, case=False, na=False).astype(int)
matrix['CT'] = matrix['text'].str.contains(r'\bc?CT\b', regex=True, case=False, na=False).astype(int)
matrix['PET'] = matrix['text'].str.contains(r'\bc?PET\b', regex=True, case=False, na=False).astype(int)
print(len(matrix))
matrix.head()

In [240]:
# Save the BIDMC training matrix. index=False keeps the row index out of the file,
# so reading it back does not add an 'Unnamed: 0' column.
matrix.to_csv('/home/cdac-c-15/Desktop/NAXCA/Test and Train Groups/trainBIDMC.csv', index=False)

In [None]:
# [['BDSPPatientID', 'annot', 'Note Date', 'text', 'ICD', 'MRI', 'CT', 'PET']]

In [261]:
# Reload the combined training matrix from disk; its 'text' column is stemmed below.
# NOTE(review): a later cell writes its result back to this same path, so this cell
# reads different content after the notebook has been run once - confirm that is intended.
matrix = pd.read_csv('/home/cdac-c-15/Desktop/NAXCA/Test and Train Groups/allTrainingMatrix.csv')

def preprocess_text(text):
    """Tokenize `text` and return its stemmed tokens joined by single spaces.

    Tokens come from nltk.word_tokenize and are stemmed with the module-level
    Snowball `stemmer`.

    Non-string input (for example the NaN that pandas uses for an empty CSV cell)
    is treated as an empty note and yields '' instead of raising inside the tokenizer.
    """
    if not isinstance(text, str):
        return ''
    return ' '.join(stemmer.stem(token) for token in nltk.word_tokenize(text))

# Replace every note's text with its tokenized + stemmed form.
# NOTE(review): this overwrites the column in place, so re-running the cell without
# reloading `matrix` stems text that has already been stemmed.
matrix['text'] = matrix['text'].apply(preprocess_text)


In [260]:
## Use if have edited key words, will stem the text
# keyWords = pd.read_csv('/home/cdac-c-15/Desktop/NAXCA/sampleCohortGenerator/generator program files/posPattern.csv')
# keyWords['KeyWord'] = keyWords['KeyWord'].apply(lambda x: preprocess_text(x))
# keyWords.to_csv('/home/cdac-c-15/Desktop/NAXCA/sampleCohortGenerator/generator program files/posPattern.csv', index=None)

In [268]:
# Build the negated-mention regexes for the key phrases read from the key-word CSV.
def create_neg_regex_dict(phrases):
    """Map every key phrase to a regex that matches a negated mention of it.

    A mention counts as negated when either
      * a negation cue (no, not, n't, absent, h/o, pmh, negative, history of,
        unlikely, without, lack, deferred) is followed within 30 characters by the
        phrase, with none of the characters ( ) . | , in between, or
      * the phrase is followed, within 80 characters and before the next '.', by
        deferred / absent / unlikely.

    The cues before the phrase must be whole words (\\b on both sides), so the
    letters "no" inside "diagnosis" or "normal", or "not" inside "noted", do not
    count as a negation. "n't" is left unanchored so it is also found when it is
    attached to its verb ("don't").

    Each word of a phrase may carry a trailing lowercase suffix, and consecutive
    words may be separated by any amount of whitespace (including none). Phrase
    words are inserted as regex fragments, not escaped.

    NOTE(review): the notes these patterns are applied to are stemmed elsewhere in
    this notebook; cues such as "negative", "history of", "unlikely" and "deferred"
    may not survive stemming unchanged - TODO confirm and add the stemmed forms.

    Parameters
    ----------
    phrases : iterable of str
        Key phrases (regex fragments separated by whitespace).

    Returns
    -------
    dict
        {phrase: regex pattern string}; repeated phrases collapse to one entry.
    """
    # Doubled braces are literal regex quantifiers; {phrase} is filled in by format().
    template = r"(?:(?:\b(?:no|not|absent|h/o|pmh|negative|history of|unlikely|without|lack|deferred)\b|n't)[^(.|,)]{{0,30}}{phrase})|(?:{phrase}\s*[^.]{{0,80}}(deferred|absent|unlikely))"

    def process_phrase(phrase):
        # Let every word take a lowercase suffix and allow flexible whitespace between words.
        return r"\s*".join(f"{word}[a-z]*" for word in phrase.split())

    return {phrase: template.format(phrase=process_phrase(phrase)) for phrase in phrases}

def create_pos_regex_dict(phrases):
    """Map every key phrase to a regex that matches a plain mention of it.

    The words of a phrase are kept as they are and joined with ``\\s*``, so
    consecutive words may be separated by any amount of whitespace (including
    none). Repeated phrases collapse to a single entry.
    """
    return {phrase: r"\s*".join(phrase.split()) for phrase in phrases}

# Load the key phrases from the key-word CSV and build both regex dictionaries.
# newline='' is what the csv module expects for files it parses, so newlines inside
# quoted fields are handled correctly.
with open('/home/cdac-c-15/Desktop/NAXCA/sampleCohortGenerator/generator program files/posPattern.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    next(reader, None)  # Skip the header row; the default keeps an empty file from raising
    # The key word is read from the second column; rows without one are skipped.
    keywords_list = [row[1] for row in reader if len(row) > 1]

neg_regex_dict = create_neg_regex_dict(keywords_list)
pos_regex_dict = create_pos_regex_dict(keywords_list)

In [269]:
# Score one note against every key phrase, using the negated and the plain patterns.
def check_patterns(text, neg_regex_dict, pos_regex_dict):
    """Return {key: score} for every key found in either pattern dictionary.

    For each key, the end position of the last negated match and of the last plain
    match in `text` are compared:
      * -1 : a negated match exists and ends at or after the last plain match
             (the final mention is negated);
      *  1 : a plain match exists and ends after any negated match;
      *  0 : neither pattern matches.

    A key that is missing from one dictionary (or mapped to an empty pattern) is
    treated as "no match" for that side. An empty regex would otherwise match at
    every position of the text and decide the score on its own.

    Keys are returned in a stable order: those of `neg_regex_dict` first, then any
    additional keys of `pos_regex_dict`.

    Parameters
    ----------
    text : str
        Note text to search.
    neg_regex_dict, pos_regex_dict : dict
        {key: regex pattern string} for negated and plain mentions.
    """
    def last_match_end(pattern):
        # End offset of the last match of `pattern` in `text`, or -1 when there is none.
        if not pattern:
            return -1
        return max((m.end() for m in re.finditer(pattern, text)), default=-1)

    matches = {}
    # dict.fromkeys de-duplicates the keys while keeping their first-seen order.
    for key in dict.fromkeys([*neg_regex_dict, *pos_regex_dict]):
        latest_neg_match = last_match_end(neg_regex_dict.get(key))
        latest_pos_match = last_match_end(pos_regex_dict.get(key))
        # A found negated match (>= 0) always beats a missing plain match (-1).
        if latest_neg_match != -1 and latest_neg_match >= latest_pos_match:
            matches[key] = -1
        elif latest_pos_match != -1:
            matches[key] = 1
        else:
            matches[key] = 0
    return matches

# Score every note against every key phrase; each note yields a {phrase: -1/0/1} dict.
pattern_features = matrix['text'].apply(lambda note: check_patterns(note, neg_regex_dict, pos_regex_dict))
# One column per key phrase. Reusing matrix's index keeps the rows aligned in the
# concat below even when that index is not the default 0..n-1.
pattern_columns = pd.DataFrame(pattern_features.tolist(), index=matrix.index)
# Drop phrase columns that `matrix` already carries (e.g. when it was reloaded from a
# previously saved result) so they are replaced instead of duplicated.
# NOTE(review): a key phrase spelled exactly like a base column name would replace
# that column - confirm no such phrase exists.
result = pd.concat([matrix.drop(columns=pattern_columns.columns, errors='ignore'), pattern_columns], axis=1)

print(len(result))
result.head()

Unnamed: 0.1,Unnamed: 0,BDSPPatientID,annot,Note Date,text,ICD,MRI,CT,PET,\bbrainstem\b,...,\bpulseless\b,\bsuspicion for anox brain injuri\b,\bcpt\b,\bnoxious stimuli\b,\bhypox\b,\bpmi\b,\bparaphas error\b,\bbrainstem function\b,\brewarm\b,\bvasodilatori shock\b
0,0,106,1,2192-08-15,admiss date : [ * * 2192-8-9 * * ] discharg da...,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,43126,1,2124-09-26,admiss date : [ * * 2124-8-21 * * ] discharg d...,1,1,1,1,0,...,0,0,0,0,0,0,0,0,1,0
2,2,69917,0,2167-02-13,admiss date : [ * * 2167-2-6 * * ] discharg da...,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,13757,1,2134-04-01,admiss date : [ * * 2134-2-16 * * ] discharg d...,1,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
4,4,22285,1,2125-07-24,admiss date : [ * * 2125-7-17 * * ] discharg d...,1,1,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [271]:
# Save the matrix with the key-phrase feature columns, without the row index.
# NOTE(review): this is the same path the stemming cell reads its input from, so the
# raw input is overwritten by stemmed text plus feature columns - consider a separate
# output file.
result.to_csv('/home/cdac-c-15/Desktop/NAXCA/Test and Train Groups/allTrainingMatrix.csv', index=None)

In [262]:
# ideas:
# function to check gcs score whether it's positive for <8 and negative for >7
# only run code on part of note after cardiac, arrest, etc

In [None]:
# allBIDMC = pd.read_csv('/home/cdac-c-15/Desktop/NAXCA/Test and Train Groups/testBIDMC.csv')
# allMGB = pd.read_csv('/home/cdac-c-15/Desktop/NAXCA/Test and Train Groups/testMGB.csv')
# allBIDMC = allBIDMC[['SUBJECT_ID', 'TEXT', 'CHARTDATE']]
# allMGB = allMGB[['BDSPPatientID', 'text', 'Note Date']]
# allBIDMC = allBIDMC.rename(columns={'SUBJECT_ID': 'patient id', 'TEXT': 'text', 'CHARTDATE': 'note date'})
# allMGB = allMGB.rename(columns={'BDSPPatientID': 'patient id', 'Note Date': 'note date'})

# trainingData = pd.concat([allBIDMC, allMGB])

# trainingData.to_csv('/home/cdac-c-15/Desktop/NAXCA/Test and Train Groups/allTesting.csv')
# print(len(pd.concat([allBIDMC, allMGB])))
# trainingData.head()


In [80]:
# training and testing

# nested k-fold cross validation
#     for i in range(K)       #evaluate out of sample perf
#       for hp in hyperparameters:
#         for j in range(K)   #decide the best hyperparameter

    