In [1]:
import pandas as pd
import numpy as np
import math
import pickle

### Find the most frequently diagnosed conditions in the dataset.

In [4]:
# Count how many times each ICD-9 code appears in the MIMIC DIAGNOSES_ICD table.
# ICD-9 codes are identifiers, not numbers (e.g. '0880' has a leading zero), so the
# column is read explicitly as text instead of relying on dtype inference.
data = pd.read_csv('../MIMIC-III/DIAGNOSES_ICD.csv', dtype={'ICD9_CODE': str})

# groupby(..., sort=False) keeps the codes in order of first appearance, matching a
# row-by-row tally; rows with a missing ICD9_CODE are left out of the counts.
code_counts = data.groupby('ICD9_CODE', sort=False).size()
d = {code: int(count) for code, count in code_counts.items()}

# list of (code, count) pairs ordered from the rarest code to the most common one
sorted_d = sorted(d.items(), key=lambda x: x[1])
print(sorted_d)

[('2553', 1), ('9555', 1), ('71104', 1), ('9518', 1), ('80416', 1), ('2161', 1), ('0880', 1), ('9596', 1), ('79500', 1), ('74101', 1), ('7450', 1), ('37005', 1), ('53291', 1), ('92811', 1), ('3010', 1), ('9020', 1), ('05471', 1), ('79400', 1), ('79589', 1), ('1270', 1), ('1603', 1), ('75453', 1), ('9238', 1), ('3649', 1), ('80009', 1), ('0839', 1), ('7236', 1), ('V122', 1), ('71180', 1), ('4542', 1), ('76077', 1), ('V6149', 1), ('2984', 1), ('2149', 1), ('7132', 1), ('9942', 1), ('V111', 1), ('0490', 1), ('79009', 1), ('75450', 1), ('V123', 1), ('3578', 1), ('74405', 1), ('74402', 1), ('38832', 1), ('90229', 1), ('83401', 1), ('85212', 1), ('75557', 1), ('75010', 1), ('73600', 1), ('E9222', 1), ('9046', 1), ('3588', 1), ('75529', 1), ('36284', 1), ('E8156', 1), ('85144', 1), ('V4963', 1), ('1609', 1), ('37611', 1), ('37612', 1), ('V435', 1), ('92721', 1), ('1730', 1), ('64271', 1), ('64862', 1), ('E8193', 1), ('7537', 1), ('80300', 1), ('66582', 1), ('E8233', 1), ('3918', 1), ('63320',

In [9]:
# Build a lookup from ICD-9 code to its long-form diagnosis title and cache it on disk.
# Codes are read as text so that leading zeros (e.g. '01166') are preserved.
ICD9_map = pd.read_csv('../MIMIC-III/D_ICD_DIAGNOSES.csv', dtype={'ICD9_CODE': str})

# If a code were listed more than once, the last row wins.
diagnosis_dictionary = dict(zip(ICD9_map['ICD9_CODE'], ICD9_map['LONG_TITLE']))

# Use a context manager so the pickle file is flushed and closed deterministically
# instead of leaving the handle open until garbage collection.
with open('diagnosis_dictionary.p', 'wb') as pickle_file:
    pickle.dump(diagnosis_dictionary, pickle_file)

print(diagnosis_dictionary)

{'01166': 'Tuberculous pneumonia [any form], tubercle bacilli not found by bacteriological or histological examination, but tuberculosis confirmed by other methods [inoculation of animals]', '01170': 'Tuberculous pneumothorax, unspecified', '01171': 'Tuberculous pneumothorax, bacteriological or histological examination not done', '01172': 'Tuberculous pneumothorax, bacteriological or histological examination unknown (at present)', '01173': 'Tuberculous pneumothorax, tubercle bacilli found (in sputum) by microscopy', '01174': 'Tuberculous pneumothorax, tubercle bacilli not found (in sputum) by microscopy, but found by bacterial culture', '01175': 'Tuberculous pneumothorax, tubercle bacilli not found by bacteriological examination, but tuberculosis confirmed histologically', '01176': 'Tuberculous pneumothorax, tubercle bacilli not found by bacteriological or histological examination, but tuberculosis confirmed by other methods [inoculation of animals]', '01180': 'Other specified pulmonar

In [7]:
# Re-key the per-code counts in d by the human-readable diagnosis title.
# Codes that have no entry in diagnosis_dictionary are skipped.
# NOTE(review): if two codes ever share the same title, the later count overwrites
# the earlier one rather than being added to it.
labeled_d = {
    diagnosis_dictionary[code]: count
    for code, count in d.items()
    if diagnosis_dictionary.get(code) is not None
}

# Order the (title, count) pairs from most to least frequent.
sorted_labeled_d = sorted(labeled_d.items(), key=lambda pair: pair[1])[::-1]
print(sorted_labeled_d)

[('Unspecified essential hypertension', 20703), ('Congestive heart failure, unspecified', 13111), ('Atrial fibrillation', 12891), ('Coronary atherosclerosis of native coronary artery', 12429), ('Acute kidney failure, unspecified', 9119), ('Diabetes mellitus without mention of complication, type II or unspecified type, not stated as uncontrolled', 9058), ('Other and unspecified hyperlipidemia', 8690), ('Acute respiratory failure', 7497), ('Urinary tract infection, site not specified', 6555), ('Esophageal reflux', 6326), ('Pure hypercholesterolemia', 5930), ('Need for prophylactic vaccination and inoculation against viral hepatitis', 5779), ('Observation for suspected infectious condition', 5519), ('Anemia, unspecified', 5406), ('Unspecified acquired hypothyroidism', 4917), ('Pneumonia, organism unspecified', 4839), ('Acute posthemorrhagic anemia', 4552), ('Acidosis', 4528), ('Chronic airway obstruction, not elsewhere classified', 4431), ('Severe sepsis', 3912), ('Long-term (current) use

### Find the number of unique individuals in the dataset.

In [4]:
# Count the distinct patients (SUBJECT_ID values) present in the DIAGNOSES_ICD table.
unique_individuals = set(data['SUBJECT_ID'])

print(len(unique_individuals))

46520


### Find all the patients that received a diagnosis for atrial fibrillation.

In [5]:
# For every patient in DIAGNOSES_ICD, record whether any of their diagnosis rows
# carries the ICD-9 code used here for atrial fibrillation.
AFIB_ICD9_CODE = '42731'

# One boolean per diagnosis row, reduced to one boolean per patient.
# sort=False keeps patients in order of first appearance in the table, which is the
# order the downstream arrays (and the saved CSV) rely on.
is_afib_row = data['ICD9_CODE'].eq(AFIB_ICD9_CODE)
patient_has_afib = is_afib_row.groupby(data['SUBJECT_ID'], sort=False).any()

# patient id -> 1 if the patient has at least one atrial fibrillation diagnosis, else 0
atrial_fib_data = {
    int(patient_id): int(has_afib)
    for patient_id, has_afib in patient_has_afib.items()
}

# store the dictionary data as two parallel arrays (patient ids and their 0/1 labels)
patient_id_array = list(atrial_fib_data.keys())
afib_data_array = list(atrial_fib_data.values())

In [6]:
# Show the patient ids (in the insertion order of atrial_fib_data) and the mean of the
# 0/1 labels, i.e. the fraction of patients flagged for atrial fibrillation.
print(patient_id_array)
print(np.mean(afib_data_array))

[109, 112, 113, 114, 115, 116, 117, 118, 119, 120, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 41, 42, 43, 44, 45, 46, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 273, 274, 275, 245, 246, 247, 248, 249, 250, 251, 252, 253, 255, 256, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 197, 198, 199, 288, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 139, 140, 141, 142, 143, 144, 145, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 146, 14

In [7]:
# Assemble the per-patient labels into a two-column table: patient id and 0/1 afib flag.
afib_columns = {'patient_id': patient_id_array, 'afib': afib_data_array}
afib_dataframe = pd.DataFrame(afib_columns)
print(afib_dataframe)

       patient_id  afib
0             109     0
1             112     0
2             113     0
3             114     0
4             115     0
...           ...   ...
46515       97164     0
46516       97484     0
46517       97488     1
46518       97492     0
46519       97497     1

[46520 rows x 2 columns]


### Process clinician's notes.

In [8]:
# Load the NOTEEVENTS table; later cells read its SUBJECT_ID, HADM_ID, CATEGORY,
# DESCRIPTION and TEXT columns.
# NOTE(review): the saved output of this cell looks like a pandas warning echoing this
# line (possibly a mixed-dtype warning) — confirm, and consider passing explicit dtypes.
notes = pd.read_csv('NOTEEVENTS.csv')

  notes = pd.read_csv('NOTEEVENTS.csv')


In [9]:
# store the notes data in a dictionary where the subject id maps to a tuple of the form
# (note text, hospital admission id of the note)
# Only notes whose CATEGORY is not 'Discharge summary' and whose DESCRIPTION is 'Report'
# qualify, and only the first qualifying note (in file order) is kept for each patient.
is_report_note = (notes['CATEGORY'] != 'Discharge summary') & (notes['DESCRIPTION'] == 'Report')
first_report_notes = notes.loc[is_report_note].drop_duplicates(subset='SUBJECT_ID', keep='first')

# HADM_ID may be NaN when a note is not tied to a hospital admission; it is stored as-is
notes_data = {
    subject_id: (note_text, hadm_id)
    for subject_id, note_text, hadm_id in zip(
        first_report_notes['SUBJECT_ID'],
        first_report_notes['TEXT'],
        first_report_notes['HADM_ID'],
    )
}

In [16]:
# Number of patients with at least one stored note (CATEGORY other than 'Discharge summary'
# and DESCRIPTION equal to 'Report'); not every patient in the dataset has such a note.
print(len(notes_data))

44844
('Baseline artifact\nSinus rhythm\nGeneralized low QRS voltages\nRight bundle branch block\nDiffuse nonspecific T wave changes\nSince previous tracing same date: atrial flutter absent and low voltage seen\n\n', 175533.0)


In [24]:
# For every patient in patient_id_array, collect the stored note text and the hospital
# admission id attached to that note, as two lists aligned with patient_id_array.

notes_array = []
hospital_admission_id_array = []

for patient_id in patient_id_array:
    record = notes_data.get(patient_id)

    # patients without a stored note get an empty note and the -1 sentinel admission id
    if record is None:
        notes_array.append('')
        hospital_admission_id_array.append(-1)
        continue

    note_text, hadm_id = record

    # flatten the note onto a single line by turning newlines into spaces
    notes_array.append(note_text.replace('\n', ' '))

    # a note that has no admission id (NaN) also gets the -1 sentinel
    hospital_admission_id_array.append(-1 if math.isnan(hadm_id) else int(hadm_id))

### Find the age of each patient.

In [28]:
# Read the hospital admission table; the HADM_ID and ADMITTIME columns are used below
# to find the year in which each admission took place.
admissions = pd.read_csv('ADMISSIONS.csv')

In [33]:
# Map each hospital admission id (HADM_ID) to the year of that admission.
# The year is kept as text: the first four characters of the ADMITTIME string.
admissions_data = {
    hadm_id: admit_time[0:4]
    for hadm_id, admit_time in zip(admissions['HADM_ID'], admissions['ADMITTIME'])
}

In [35]:
# Find the year of admission that matches each hospital admission id.
# The -1 sentinel (no hospital admission id associated with the patient's note) is
# carried through as an admission year of -1.
admission_year_array = [
    -1 if hadm_id == -1 else admissions_data[hadm_id]
    for hadm_id in hospital_admission_id_array
]

In [38]:
# Read the patient table; the SUBJECT_ID and DOB columns are used below to find each
# patient's year of birth.
# NOTE(review): this reads 'PATIENTS.csv' from the working directory, while other cells
# read the MIMIC tables from '../MIMIC-III/' — confirm which location is intended.
patients = pd.read_csv('PATIENTS.csv')

In [40]:
# Map each patient id (SUBJECT_ID) to their year of birth.
# The year is kept as text: the first four characters of the DOB string.
year_of_birth_data = {
    subject_id: date_of_birth[0:4]
    for subject_id, date_of_birth in zip(patients['SUBJECT_ID'], patients['DOB'])
}

In [41]:
# Year of birth for every patient, aligned with patient_id_array.
# A patient id missing from year_of_birth_data raises KeyError.
year_of_birth_array = [year_of_birth_data[subject_id] for subject_id in patient_id_array]

In [46]:
# Approximate each patient's age as admission year minus birth year.
# Patients whose admission year is the -1 sentinel end up with a large negative value
# here; those rows are filtered out later by the age threshold.
age_array = [
    int(admission_year) - int(year_of_birth_array[position])
    for position, admission_year in enumerate(admission_year_array)
]

In [47]:
# Attach the note text and the computed age to the per-patient table; both lists were
# built by iterating patient_id_array, so they line up with the table's rows.
afib_dataframe['notes'] = notes_array
afib_dataframe['age'] = age_array


In [50]:
# Inspect the table after adding the notes and age columns.
print(afib_dataframe)

       patient_id  afib                                              notes  \
0             109     0  PATIENT/TEST INFORMATION: Indication: Code. As...   
1             112     0  PATIENT/TEST INFORMATION: Indication: Syncope....   
2             113     0  Sinus rhythm, rate 93. Non-specific ST-T wave ...   
3             114     0  Normal sinus rhythm, rate 96 Right bundle bran...   
4             115     0  PATIENT/TEST INFORMATION: Indication: Left ven...   
...           ...   ...                                                ...   
46515       97164     0  PATIENT/TEST INFORMATION: Indication: Aortic v...   
46516       97484     0  Sinus bradycardia with non-diagnostic repolari...   
46517       97488     1  PATIENT/TEST INFORMATION: Indication: Stroke  ...   
46518       97492     0  PATIENT/TEST INFORMATION: Indication: Cerebrov...   
46519       97497     1  PATIENT/TEST INFORMATION: Indication: Left ven...   

        age  
0        25  
1     -1895  
2        35  
3      

In [51]:
# Keep only rows whose age is strictly greater than 18. This drops the invalid (negative)
# ages produced by the -1 sentinel as well as every patient aged 18 or younger.
# NOTE(review): age is a difference of calendar years, so it can be off by one; confirm
# whether patients with age == 18 should be kept (>= 18) or excluded as they are now.
# NOTE(review): MIMIC-III reportedly shifts dates of birth for patients older than 89,
# which would yield ages around 300 here — confirm how such rows should be handled.

afib_dataframe = afib_dataframe[afib_dataframe['age'] > 18]

In [53]:
# Renumber the rows 0..n-1 after filtering. Because drop=True is not passed, the old row
# labels are kept as a new 'index' column, which is also written to the CSV later on.
afib_dataframe = afib_dataframe.reset_index()

In [55]:
# Inspect the filtered table and the fraction of remaining patients flagged for atrial
# fibrillation (the mean of the 0/1 'afib' column).
print(afib_dataframe)
print(np.mean(afib_dataframe['afib']))

       index  patient_id  afib  \
0          0         109     0   
1          2         113     0   
2          3         114     0   
3          4         115     0   
4          6         117     0   
...      ...         ...   ...   
29446  46515       97164     0   
29447  46516       97484     0   
29448  46517       97488     1   
29449  46518       97492     0   
29450  46519       97497     1   

                                                   notes  age  
0      PATIENT/TEST INFORMATION: Indication: Code. As...   25  
1      Sinus rhythm, rate 93. Non-specific ST-T wave ...   35  
2      Normal sinus rhythm, rate 96 Right bundle bran...   48  
3      PATIENT/TEST INFORMATION: Indication: Left ven...   75  
4      PATIENT/TEST INFORMATION: Indication: Murmur. ...   50  
...                                                  ...  ...  
29446  PATIENT/TEST INFORMATION: Indication: Aortic v...   83  
29447  Sinus bradycardia with non-diagnostic repolari...   79  
29448  PATIENT/

### Find the gender of each patient.

In [56]:
# Save the filtered table (patient id, afib label, note text, age, plus the 'index'
# column created by reset_index) to the working directory.
# NOTE(review): a later cell reads 'csv_files/processed_afib_data.csv', not this path —
# the file appears to have been moved by hand; confirm and align the two paths.
afib_dataframe.to_csv('processed_afib_data.csv', index=False)

In [4]:
patient_data = pd.read_csv('../MIMIC-III/PATIENTS.csv')

# create a dictionary that maps patient_id to gender
# 0 is male, 1 is female; patients whose GENDER is neither 'M' nor 'F' are left out
gender_codes = {'M': 0, 'F': 1}
gender_data = {
    subject_id: gender_codes[gender_label]
    for subject_id, gender_label in zip(patient_data['SUBJECT_ID'], patient_data['GENDER'])
    if gender_label in gender_codes
}

In [9]:
# Reload the processed per-patient table and look up a gender code for each row.
data = pd.read_csv('csv_files/processed_afib_data.csv')

# The 'index' column is a leftover of reset_index() from before the file was first
# written. errors='ignore' keeps this cell re-runnable after the file has been saved
# again without that column (otherwise drop() raises KeyError on the second run).
data = data.drop(columns=['index'], errors='ignore')

# gender code per row, aligned with data; a patient id that is missing from gender_data
# raises KeyError rather than being silently filled in
gender = [gender_data[patient_id] for patient_id in data['patient_id']]

In [None]:
# Add the gender codes as a new column and overwrite the processed CSV in place
# (the same file that was read to build `data`).
data['gender'] = gender

data.to_csv('csv_files/processed_afib_data.csv', index=False)

### Find baseline covariates (section is incomplete):
Each tuple is (CareVue ITEMID, MetaVision ITEMID): the first number is the item id used by the CareVue system and the second is the item id used by the MetaVision system.
1. Heart rate (211, 220045)
2. Heart rhythm (212, 220048)
3. Heart sounds (213, 224389)
4. Weight kg (3693, 226512)
5. Height in (1394, 226707)

In [3]:
# Load the full CHARTEVENTS table into memory (the saved output shows over 330 million rows).
# NOTE(review): the saved output of this cell looks like a pandas warning echoing this
# line — confirm; given the table's size, consider reading only the needed columns
# and ITEMIDs, or reading in chunks.
chart_events = pd.read_csv('../MIMIC-III/CHARTEVENTS.csv')

  chart_events = pd.read_csv('../MIMIC-III/CHARTEVENTS.csv')


<bound method NDFrame.head of               ROW_ID  SUBJECT_ID  HADM_ID  ICUSTAY_ID  ITEMID  \
0                788          36   165660    241249.0  223834   
1                789          36   165660    241249.0  223835   
2                790          36   165660    241249.0  224328   
3                791          36   165660    241249.0  224329   
4                792          36   165660    241249.0  224330   
...              ...         ...      ...         ...     ...   
330712478  330471885       99781   147562    200664.0  224847   
330712479  330471886       99781   147562    200664.0  224889   
330712480  330471887       99781   147562    200664.0  224903   
330712481  330471888       99781   147562    200664.0  224910   
330712482  330471889       99781   147562    200664.0  224930   

                     CHARTTIME            STORETIME     CGID            VALUE  \
0          2134-05-12 12:00:00  2134-05-12 13:56:00  17525.0             15.0   
1          2134-05-12 12:00

In [7]:
# Serialize the loaded CHARTEVENTS table to disk, presumably so it can be reloaded
# without parsing the CSV again.
# NOTE(review): the file name is 'char_events.p' (not 'chart_events.p'); it looks like a
# typo but is kept unchanged in case something else loads the file by this name.
# The context manager makes sure the file is flushed and closed once the dump finishes.
with open('char_events.p', 'wb') as pickle_file:
    pickle.dump(chart_events, pickle_file)
# notes_half_1_sentences = pickle.load(open('list_of_sentences.p', 'rb'))