# Working with [ICD9](https://www.cdc.gov/nchs/icd/icd9.htm) Data

In [1]:
%matplotlib inline

In [2]:
import pickle
import random
from collections import defaultdict

from nose.tools import assert_equals

In [3]:
# Read the pickled ICD9 records from the working directory into icd9_data.
# NOTE: pickle.load can execute arbitrary code while deserializing, so only
# open pickle files that come from a trusted source.
with open("icd9.pickle", "rb") as pickle_file:
    icd9_data = pickle.load(pickle_file)

## What does our data look like?

In [4]:
icd9_data[:5]
# Each record is a 3-tuple: (patient id, ICD9 code, description)

((56, '198.3', 'SECONDARY MALIGNANT NEOPLASM OF BRAIN AND SPINAL C'),
 (56, '162.8', 'MALIGNANT NEOPLASM OF OTHER PARTS OF BRONCHUS OR L'),
 (56, '531.40', 'CHRONIC OR UNSPECIFIED GASTRIC ULCER WITH HEMORRHA'),
 (56, '276.1', 'HYPOSMOLALITY AND/OR HYPONATREMIA'),
 (56, '428.0', 'CONGESTIVE HEART FAILURE UNSPECIFIED'))

In [5]:
# Print every record whose label mentions "HEP" (hepatitis, hepatic, ...).
# Some records may have no label; test for that explicitly instead of
# wrapping the lookup in a catch-all try/except, which would also hide
# unrelated errors.
for record in icd9_data:
    label = record[2] if len(record) > 2 else None
    if isinstance(label, str) and "HEP" in label:
        print(record)

(78, '070.54', 'CHRONIC HEPATITIS C WITHOUT HEPATIC COMA')
(106, 'V02.62', 'CARRIER OR SUSPECTED CARRIER OF HEPATITIS C')
(124, '070.70', 'UNSPECIFIED VIRAL HEPATI                \r')
(117, '070.54', 'CHRONIC HEPATITIS C WITHOUT HEPATIC COMA')
(117, '070.44', 'CHR HEPAT C W/ HEP COMA                 \r')
(117, '572.4', 'HEPATORENAL SYNDROME                    \r')
(390, '070.70', 'UNSPECIFIED VIRAL HEPATI                \r')
(433, '070.32', 'CHRON VIR HEP B W/O COMA                \r')
(433, '070.54', 'CHR HEPAT C W/O HEP COMA                \r')
(433, '070.30', 'VIR HEP B W/O COM, HEP D                \r')
(433, '070.70', 'UNSPECIFIED VIRAL HEPATI                \r')
(590, '571.1', 'ACUTE ALCOHOLIC HEPATITIS')
(634, '572.4', 'HEPATORENAL SYNDROME                    \r')
(940, '070.54', 'CHRONIC HEPATITIS C WITHOUT HEPATIC COMA')
(940, '155.1', 'MALIGNANT NEOPLASM OF INTRAHEPATIC BILE DUCTS')
(1106, '070.32', 'CHRON VIR HEP B W/O COMA                \r')
(1100, '070.54', 'CHRONIC HEPAT

### Each element of `icd9_data` is a tuple with three elements
1. A patient id
1. An ICD9 code
1. The label for that ICD9 code

#### Create a dictionary named `icd9_map` whose keys are ICD9 codes and whose values are the corresponding ICD9 labels

Strip extra whitespace from the labels.

In [6]:
# Build icd9_map: ICD9 code -> label with surrounding whitespace removed.
icd9_map = {}
for record in icd9_data:
    code, label = record[1], record[2]
    if isinstance(label, str):
        # Stripping removes the space padding and trailing "\r" that some
        # labels carry.
        icd9_map[code] = label.strip()
    else:
        # Missing label: store a placeholder, but never overwrite a real label
        # that an earlier record already supplied for the same code.
        icd9_map.setdefault(code, "NO LABEL PROVIDED")

In [7]:
# Display the ICD9 code -> label mapping.
icd9_map

{'008.45': 'INTESTIN INF CLOSTRIOIUM',
 '035': 'ERYSIPELAS',
 '038.0': 'STREPTOCOCCAL SEPTICEMIA',
 '038.10': 'STAPHYLOCOCCAL SEPTICEMIA UNSPECIFIED',
 '038.11': 'STAPHYLOCOCCUS AUREUS SEPTICEMIA',
 '038.19': 'OTHER STAPHYLOCOCCAL SEPTICEMIA',
 '038.3': 'SEPTICEMIA DUE TO ANAEROBES',
 '038.40': 'GRAM-NEG SEPTICEMIA NOS',
 '038.42': 'SEPTICEMIA DUE TO ESCHERICHIA COLI (E. COLI)',
 '038.49': 'GRAM-NEG SEPTICEMIA NEC',
 '038.8': 'SEPTICEMIA NEC',
 '038.9': 'UNSPECIFIED SEPTICEMIA',
 '041.00': 'STREPTOCOCCUS INFECTION IN CONDITIONS CLASSIFIED E',
 '041.04': 'STREPTOCOCCUS INFECTION',
 '041.11': 'STAPHYLOCOCCUS INFECTION IN CONDITIONS CLASSIFIED',
 '041.19': 'STAPHYLOCOCCUS INFECTION IN CONDITIONS CLASSIFIED',
 '041.3': 'KLEBSIELLA INFECT NOS',
 '041.4': 'E. COLI INFECT NOS',
 '041.6': 'PROTEUS INFECTION NOS',
 '041.7': 'PSEUDOMONAS INFECTION IN CONDITIONS CLASSIFIED ELS',
 '041.86': 'HELICOBACTER PYLORI INFE',
 '042': 'HUMAN IMMUNODEFICIENCY VIRUS (HIV) DISEASE',
 '047.8': 'OTHER SPECIFIED

In [None]:
icd9_patients = defaultdict(list) # key is ICD9 code, value is the list of patients with that code
patient_diagnoses = defaultdict(list) # key is patient, value is the list of all their codes

### How many patients are there for each diagnosis?

Create a list named `icd9_patients_list` of (code, patients) pairs, sorted by the number of patients per diagnosis (largest first).

In [21]:
# Group patient ids by ICD9 code. defaultdict(list) creates an empty list the
# first time a code is looked up, so append() needs neither a membership check
# nor a try/except guard.
icd9_patients = defaultdict(list)

for patient, code, _label in icd9_data:  # each record is (patient id, ICD9 code, label)
    icd9_patients[code].append(patient)

print(icd9_patients)

# (code, patients) pairs ordered from the most to the fewest patient entries.
# A patient id that appears more than once for a code is counted each time.
icd9_patients_list = sorted(
    icd9_patients.items(), key=lambda item: len(item[1]), reverse=True
)

defaultdict(<class 'list'>, {'V10.46': [21, 176, 415, 414, 971, 1684, 1940, 2121, 2556], '276.3': [61, 353, 1158, 1579, 1670, 1807, 2187], '174.8': [2297], '531.70': [2266], 'E818.7': [1245], '041.4': [286, 503, 945, 1207, 1266, 2135, 2284], '682.2': [1804, 2512], '197.0': [516, 505, 623, 705, 1182, 1594, 1594, 1594, 1882, 2099, 2297, 2501, 2558, 2558], '730.15': [1890], '441.3': [405], '934.9': [2291], '038.3': [1606, 2225], 'E878.6': [1656], '707.06': [450, 450], '569.85': [1207, 1931, 1931], '599.7': [176, 773, 921, 1151, 1100, 1371, 1470, 2265, 2364, 2493], '447.1': [1702], '682.7': [353, 732, 1979], '492.8': [1336, 1313, 1972, 2135, 2493], '775.9': [2532], 'E879.1': [353, 2512], '825.25': [1610], '593.2': [631, 2328], '428.31': [491, 1158, 2156], '707.01': [443, 450, 450], '451.84': [2262], '272.0': [56, 26, 21, 83, 124, 124, 235, 177, 301, 390, 415, 443, 496, 514, 609, 584, 705, 717, 690, 711, 752, 792, 936, 826, 937, 977, 1038, 1100, 1270, 1290, 1308, 1336, 1357, 1586, 1615, 165

# repeat with a regular dictionary

In [19]:
# Same grouping as with defaultdict, but using a plain dict: the list for a
# code has to be created by hand the first time that code is seen.
# All records are processed (not just a debugging slice) so that later cells
# that look up codes in icd9_patients see the complete data.
icd9_patients = {}

for patient, code, _label in icd9_data:  # each record is (patient id, ICD9 code, label)
    if code not in icd9_patients:
        icd9_patients[code] = []
    icd9_patients[code].append(patient)

In [None]:
# Spot-check a randomly chosen diagnosis: its code and number of patient entries.
code, patients = random.choice(icd9_patients_list)
print(code, len(patients), sep="\n")

# Patient ids recorded for one specific code.
print(icd9_patients["V12.59"])

# Codes whose label mentions "HX" (history of ...), with their patient counts.
# The match is made on the label: ICD9 codes themselves are made of digits,
# dots and a V/E prefix, so they never contain "HX".
for code, patients in icd9_patients.items():
    if "HX" in icd9_map.get(code, ""):
        print(code, len(patients))

In [None]:
# Expected number of patient entries for two known diagnosis codes.
# Plain assert statements avoid relying on nose's deprecated assert_equals alias.
assert len(icd9_patients["V12.59"]) == 5, len(icd9_patients["V12.59"])
assert len(icd9_patients["572.2"]) == 12, len(icd9_patients["572.2"])

#### Loop through `icd9_patients_list`
1. For each element in `icd9_patients_list`, print the ICD9 label corresponding to the code and the number of patients with that diagnosis.

In [None]:
# For each (code, patients) pair, print the label of the code and the number
# of patient entries, in the order of icd9_patients_list.
for code, patients in icd9_patients_list:
    # Fall back to a placeholder if a code has no entry in icd9_map.
    print(icd9_map.get(code, "NO LABEL PROVIDED"), len(patients))

### How many diagnoses does each patient have?

In [None]:
# Group ICD9 codes by patient: patient id -> list of every code recorded for
# that patient. Repeated codes are kept as they appear in the data
# (TODO confirm the expected counts include repeats).
patient_diagnoses = defaultdict(list)
for patient, code, _label in icd9_data:  # each record is (patient id, ICD9 code, label)
    patient_diagnoses[patient].append(code)

# (patient, codes) pairs ordered from the most to the fewest diagnoses.
patient_diagnoses_list = sorted(
    patient_diagnoses.items(), key=lambda item: len(item[1]), reverse=True
)

In [None]:
# For each patient, print the patient id followed by how many diagnoses they have.
for patient, diagnoses in patient_diagnoses_list:
    print(patient)
    print(len(diagnoses))  # the count, not the full list of codes
    print("\n")

In [None]:
# Expected diagnosis counts for two known patients, and for two positions in
# the list sorted by number of diagnoses.
# Plain assert statements avoid relying on nose's deprecated assert_equals alias.
assert len(patient_diagnoses[2512]) == 49, len(patient_diagnoses[2512])
assert len(patient_diagnoses[353]) == 56, len(patient_diagnoses[353])
assert len(patient_diagnoses_list[0][1]) == 125, len(patient_diagnoses_list[0][1])
assert len(patient_diagnoses_list[45][1]) == 23, len(patient_diagnoses_list[45][1])
