In [77]:
import pandas as pd
from datetime import timedelta

In [80]:
bi_icd_pos_dates = pd.read_csv('all_bidmc_icd_pos_dates.csv')
bi_icd_pos_dates['DateICD'] = pd.to_datetime(bi_icd_pos_dates['DateICD'])
bi_icd_pos_dates.head()

Unnamed: 0,BDSPPatientID,DateICD,ICD
0,150000004,2012-11-05,430
1,150000137,2013-06-14,430
2,150000004,2012-11-05,430
3,150063730,2014-03-07,430
4,150000004,2013-02-22,430


In [81]:
bi_neg = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cleanCohorts/BIDMC_neg_cohort_final.csv')
bi_neg = bi_neg.drop(columns=['text'])
bi_neg.loc[:, 'NoteDate'] = pd.to_datetime(bi_neg['NoteDate'], format='mixed')
bi_neg.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle
0,151212513,2016-02-04 00:00:00,Notes_1131071605_435587215_20160204.txt
1,150639400,2023-06-12 00:00:00,Notes_1130498158_10141889500_20230612.txt
2,150057053,2020-12-17 00:00:00,Notes_1129915463_4302367261_20201217.txt
3,151005496,2010-06-29 00:00:00,Notes_1130863925_364816102_20100629.txt
4,150006448,2014-07-14 00:00:00,Notes_1129865503_1991829458_20140714.txt


In [82]:
# Regex pattern, simplified by removing redundant `(?I)`
code_regex = "^(I60|430).*"

def check_icd_in_period(id_, date):
    time_window = timedelta(days=30)

    # Filter the rows for the specific BDSPPatientID and within the 30-day window
    filtered_df = bi_icd_pos_dates[
        (bi_icd_pos_dates['BDSPPatientID'] == id_) &
        (abs(bi_icd_pos_dates['DateICD'] - date) <= time_window)
    ]
    
    # Check if any of the filtered rows match the ICD code pattern
    if filtered_df['ICD'].str.match(code_regex).any():
        return 1
    return 0

# Apply the check_icd_in_period function to each row in sampled_notes
bi_neg['ICDpos'] = bi_neg.apply(lambda row: check_icd_in_period(row['BDSPPatientID'], row['NoteDate']), axis=1)

In [83]:
print(len(bi_neg))
bi_neg.head()

505


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,ICDpos
0,151212513,2016-02-04 00:00:00,Notes_1131071605_435587215_20160204.txt,0
1,150639400,2023-06-12 00:00:00,Notes_1130498158_10141889500_20230612.txt,0
2,150057053,2020-12-17 00:00:00,Notes_1129915463_4302367261_20201217.txt,0
3,151005496,2010-06-29 00:00:00,Notes_1130863925_364816102_20100629.txt,0
4,150006448,2014-07-14 00:00:00,Notes_1129865503_1991829458_20140714.txt,0


In [118]:
# Count the number of 1s and 0s in the ICDpos column
icd_counts = bi_neg['ICDpos'].value_counts()

numPos = icd_counts.get(1, 0)
numNeg = icd_counts.get(0, 0)

prevalence = numPos/len(bi_neg)


print('BI')
print('ICD Positive Cases: ', numPos)
print('ICD Negative Cases: ', numNeg)

# Filter the DataFrame to get only the positive cases
positive_cases = bi_neg[bi_neg['ICDpos'] == 1]

# Get the list of BDSPPatientIDs that are positive cases
positive_patient_ids = positive_cases['BDSPPatientID'].tolist()

# Display the list of positive BDSPPatientIDs
print('\nList of BDSPPatientIDs that are positive cases:')
print(positive_patient_ids)

BI
ICD Positive Cases:  6
ICD Negative Cases:  499

List of BDSPPatientIDs that are positive cases:
[150012047, 151300881, 150676764, 151280384, 151243694, 150004757]


In [120]:
positive_cases = positive_cases.drop(columns='ICDpos')
positive_cases.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle
5,150012047,2015-03-07 00:00:00,Notes_1129871055_26686542156_20150307.txt
20,151300881,2019-01-13 00:00:00,Notes_1131159385_473672682_20190113.txt
432,150676764,2011-11-21 00:00:00,Notes_1130535379_11197516873_20111121.txt
436,151280384,2018-04-28 00:00:00,Notes_1131139027_3282098405_20180428.txt
441,151243694,2015-11-28 00:00:00,Notes_1131102166_448549695_20151128.txt


In [121]:
positive_cases.to_csv('bi_positives_in_neg_cohort.csv', index=False)

MGB

In [111]:
mgb_icd_pos_dates = pd.read_csv('all_mgb_icd_pos_dates.csv')
mgb_icd_pos_dates['DateICD'] = pd.to_datetime(mgb_icd_pos_dates['DateICD'])
mgb_icd_pos_dates.head()

Unnamed: 0,BDSPPatientID,DateICD,ICD
0,116398048.0,2018-03-09,430
1,119744866.0,2019-10-20,430
2,116790672.0,2022-05-13,430
3,122243491.0,2020-06-12,430
4,119133865.0,2021-03-07,430


In [114]:
mgb_neg = pd.read_csv('/home/jsearle/bigDrive/NAX/NLP-SAH_identification/cleanCohorts/MGB_neg_cohort_final.csv')
mgb_neg = mgb_neg.drop(columns=['text'])
mgb_neg.loc[:, 'NoteDate'] = pd.to_datetime(mgb_neg['NoteDate'], format='mixed')
mgb_neg.head()

Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle
0,118374115,2018-11-20 00:00:00,Notes_13333491432_2130196704_20181120.txt
1,116931824,2022-12-29 00:00:00,Notes_13563868206_7830407523_20221229.txt
2,118587891,2021-05-30 00:00:00,Notes_13532685798_4823397982_20210530.txt
3,114557355,2022-07-22 00:00:00,Notes_13620259495_8489742841_20220722.txt
4,114329029,2018-05-15 00:00:00,Notes_13356996149_1753656538_20180515.txt


In [115]:
# Regex pattern, simplified by removing redundant `(?I)`
code_regex = "^(I60|430).*"

def check_icd_in_period(id_, date):
    time_window = timedelta(days=30)

    # Filter the rows for the specific BDSPPatientID and within the 30-day window
    filtered_df = bi_icd_pos_dates[
        (mgb_icd_pos_dates['BDSPPatientID'] == id_) &
        (abs(mgb_icd_pos_dates['DateICD'] - date) <= time_window)
    ]
    
    # Check if any of the filtered rows match the ICD code pattern
    if filtered_df['ICD'].str.match(code_regex).any():
        return 1
    return 0

# Apply the check_icd_in_period function to each row in sampled_notes
mgb_neg['ICDpos'] = mgb_neg.apply(lambda row: check_icd_in_period(row['BDSPPatientID'], row['NoteDate']), axis=1)

  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df = bi_icd_pos_dates[
  filtered_df 

In [116]:
print(len(mgb_neg))
mgb_neg.head()

1029


Unnamed: 0,BDSPPatientID,NoteDate,NoteTitle,ICDpos
0,118374115,2018-11-20 00:00:00,Notes_13333491432_2130196704_20181120.txt,0
1,116931824,2022-12-29 00:00:00,Notes_13563868206_7830407523_20221229.txt,0
2,118587891,2021-05-30 00:00:00,Notes_13532685798_4823397982_20210530.txt,0
3,114557355,2022-07-22 00:00:00,Notes_13620259495_8489742841_20220722.txt,0
4,114329029,2018-05-15 00:00:00,Notes_13356996149_1753656538_20180515.txt,0


In [117]:
# Count the number of 1s and 0s in the ICDpos column
icd_counts = mgb_neg['ICDpos'].value_counts()

numPos = icd_counts.get(1, 0)
numNeg = icd_counts.get(0, 0)

prevalence = numPos/len(mgb_neg)


print('BI')
print('ICD Positive Cases: ', numPos)
print('ICD Negative Cases: ', numNeg)

# Filter the DataFrame to get only the positive cases
positive_cases = mgb_neg[mgb_neg['ICDpos'] == 1]

# Get the list of BDSPPatientIDs that are positive cases
positive_patient_ids = positive_cases['BDSPPatientID'].tolist()

# Display the list of positive BDSPPatientIDs
print('\nList of BDSPPatientIDs that are positive cases:')
print(positive_patient_ids)

BI
ICD Positive Cases:  0
ICD Negative Cases:  1029

List of BDSPPatientIDs that are positive cases:
[]
