In [2]:
import pandas as pd
import numpy as np
import math
import pickle

### Find the most frequently diagnosed conditions in the dataset.

In [3]:
# Count how often each ICD-9 code is diagnosed in the MIMIC-III DIAGNOSES_ICD table.
# ICD9_CODE is read explicitly as str so codes are never type-inferred as numbers
# (which would strip leading zeros and break string comparisons such as == '42731').
data = pd.read_csv('../MIMIC-III/DIAGNOSES_ICD.csv', dtype={'ICD9_CODE': str})

# Map each ICD-9 code to its number of occurrences.
# groupby(sort=False) keeps codes in order of first appearance, like the original
# row-by-row tally, but runs vectorized. Rows with a missing code are not counted;
# they have no entry in the ICD-9 lookup table anyway.
d = data.groupby('ICD9_CODE', sort=False).size().to_dict()

In [4]:
# Build a lookup from ICD-9 code to its long, human-readable diagnosis title

ICD9_map = pd.read_csv('../MIMIC-III/D_ICD_DIAGNOSES.csv')

# Pair the two columns directly; if a code appears more than once, the last title wins
diagnosis_dictionary = dict(zip(ICD9_map['ICD9_CODE'], ICD9_map['LONG_TITLE']))

In [5]:
# Relabel the per-code counts in `d` with the diagnosis text from `diagnosis_dictionary`

# Codes without an entry in the lookup table are skipped; if two codes share the
# same title, the later one overwrites the earlier one.
labeled_d = {
    diagnosis_dictionary[code]: count
    for code, count in d.items()
    if diagnosis_dictionary.get(code) is not None
}

# Sort ascending by count, then flip the list so the most frequent diagnosis comes first
sorted_labeled_d = sorted(labeled_d.items(), key=lambda pair: pair[1])[::-1]

### Find the number of unique individuals in the dataset.

In [None]:
# Count the distinct patients (SUBJECT_ID values) in the DIAGNOSES_ICD table

# A set keeps one copy of each id, however many diagnosis rows the patient has
unique_individuals = set(data['SUBJECT_ID'])

len(unique_individuals)

### Find all the patients that received a diagnosis for atrial fibrillation.

In [5]:
# Flag every patient with 1 if any of their diagnosis rows is ICD-9 '42731'
# (atrial fibrillation per the heading above), otherwise 0

atrial_fib_data = {}

for subject_id, icd9_code in zip(data['SUBJECT_ID'], data['ICD9_CODE']):
    subject_id = int(subject_id)
    if icd9_code == '42731':
        atrial_fib_data[subject_id] = 1
    else:
        # only initialise to 0; a 1 set by an earlier row must never be downgraded
        atrial_fib_data.setdefault(subject_id, 0)

# split the dictionary into two parallel lists (same order: first appearance of each patient)
patient_id_array = list(atrial_fib_data.keys())
afib_data_array = list(atrial_fib_data.values())

In [None]:
# Start the master table: one row per patient, with the patient id and the afib flag
master_columns = {
    'patient_id': patient_id_array,
    'afib': afib_data_array,
}
master_data = pd.DataFrame(master_columns)

### Process clinician's notes.

In [8]:
# Load the note events table into memory.
# NOTE(review): this path is relative to the working directory, while other cells
# read MIMIC-III tables from '../MIMIC-III/' -- confirm which location is intended.
notes = pd.read_csv('NOTEEVENTS.csv')

  notes = pd.read_csv('NOTEEVENTS.csv')


In [9]:
# Map each subject id to a (note text, hospital admission id) tuple taken from that
# subject's first qualifying note
is_candidate = (notes['CATEGORY'] != 'Discharge summary') & (notes['DESCRIPTION'] == 'Report')

# keep only the first qualifying row per subject, in file order
first_notes = notes[is_candidate].drop_duplicates(subset='SUBJECT_ID', keep='first')

notes_data = dict(
    zip(first_notes['SUBJECT_ID'], zip(first_notes['TEXT'], first_notes['HADM_ID']))
)

In [24]:
# Build two lists aligned with patient_id_array: the note text and the admission id of that note

notes_array = []
hospital_admission_id_array = []

for patient_id in patient_id_array:
    record = notes_data.get(patient_id)

    if record is not None:
        note_text, admission_id = record

        # flatten the note onto one line by turning newlines into spaces
        notes_array.append(note_text.replace('\n', ' '))

        # a NaN admission id means the note is not tied to an admission; use -1 as the placeholder
        hospital_admission_id_array.append(-1 if math.isnan(admission_id) else int(admission_id))
    else:
        # patient has no stored note: empty text and the -1 placeholder
        notes_array.append('')
        hospital_admission_id_array.append(-1)

### Find the age of each patient.

In [28]:
# Read the hospital admission table; its ADMITTIME column supplies the admission date.
# NOTE(review): this path is relative to the working directory, while other cells
# read MIMIC-III tables from '../MIMIC-III/' -- confirm which location is intended.
admissions = pd.read_csv('ADMISSIONS.csv')

In [33]:
# Map each hospital admission id to its admission year, kept as the first four
# characters of the ADMITTIME string
admissions_data = {
    hadm_id: admit_time[0:4]
    for hadm_id, admit_time in zip(admissions['HADM_ID'], admissions['ADMITTIME'])
}

In [35]:
# Look up the admission year for every hospital admission id.
# The -1 placeholder (no admission id associated with the note) maps to a year of -1;
# any other id must be present in admissions_data, otherwise a KeyError is raised.
admission_year_array = [
    -1 if hadm_id == -1 else admissions_data[hadm_id]
    for hadm_id in hospital_admission_id_array
]

In [38]:
# Read the patient table; its DOB column supplies each patient's date of birth.
# NOTE(review): a later cell reads '../MIMIC-III/PATIENTS.csv' instead -- confirm
# both paths point at the same file.
patients = pd.read_csv('PATIENTS.csv')

In [40]:
# Map each patient id to their birth year, kept as the first four characters of the DOB string
year_of_birth_data = {
    subject_id: dob[0:4]
    for subject_id, dob in zip(patients['SUBJECT_ID'], patients['DOB'])
}

In [41]:
# Birth year for every patient, in the same order as patient_id_array
year_of_birth_array = [year_of_birth_data[patient_id] for patient_id in patient_id_array]

In [46]:
# Age = admission year minus birth year, position by position.
# Years may be stored as strings, hence the int() conversions; a -1 admission-year
# placeholder yields a negative age.
age_array = [
    int(admission_year) - int(year_of_birth_array[position])
    for position, admission_year in enumerate(admission_year_array)
]

In [None]:
# Attach the note text and the computed age as new columns of the master table
master_data = master_data.assign(notes=notes_array, age=age_array)

In [51]:
# Keep only rows whose age is strictly greater than 18.
# This removes invalid (e.g. negative) ages and patients aged 18 or younger.

is_adult = master_data['age'] > 18
master_data = master_data.loc[is_adult]

In [53]:
# Renumber the rows after filtering. Because drop=True is not passed, the previous
# row labels are kept as a new 'index' column in the table.
master_data = master_data.reset_index()

### Find the gender of each patient.

In [56]:
# Save the assembled table to disk; index=False leaves the DataFrame's row index out of the file.
master_data.to_csv('master_data.csv', index=False)

In [4]:
patient_data = pd.read_csv('../MIMIC-III/PATIENTS.csv')

# Numeric encoding of the GENDER column: male -> 0, female -> 1
gender_codes = {'M': 0, 'F': 1}

# Map patient id to the encoded gender; rows with any other GENDER value are left out
gender_data = {
    subject_id: gender_codes[gender_label]
    for subject_id, gender_label in zip(patient_data['SUBJECT_ID'], patient_data['GENDER'])
    if gender_label in gender_codes
}

In [9]:
# Load the processed afib table and discard its 'index' column
data = pd.read_csv('csv_files/processed_afib_data.csv').drop(columns=['index'])

# Encoded gender for every row, looked up by patient id (an unknown id raises KeyError)
gender = [gender_data[patient_id] for patient_id in data['patient_id']]

In [None]:
# Add the encoded gender column and write the final table without the row index
data = data.assign(gender=gender)

data.to_csv('csv_files/master_data.csv', index=False)