# Importing Libraries and Data

In [290]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords

In [291]:
df =pd.read_excel('Downloads/Data with Chief Complaints Set 1 8 26 2019.xlsx')

In [292]:
df.sample(5)

Unnamed: 0,Record #,Chief Complaint,Age,Sex,Acuity (initial),Area of Care,Disposition,Unnamed: 7,ICD-9 1,Unnamed: 9,In ED
6753,8891,Chest Pain,55yr,M,2.0,Resuscitation,Admit as inpatient,,427.31,,215.0
33207,43750,"Tooth Pain, Left",26yr,M,3.0,West,Home,,351.0,,173.0
27577,36491,Neck Pain,21yr,M,3.0,West,Home,,,,313.0
10422,13758,Medication Refill,51yr,F,3.0,East,Home,,V68.1,,67.0
11018,14570,'Danger To Others',28yr,M,3.0,Psych Lock-down,Admit as inpatient,,311,,719.0


In [293]:
df.shape

(40930, 11)

# Preprocessing Chief Complaint (text)

In [294]:
df = df[df['Chief Complaint'].notnull()] # remove nan

In [295]:
df['Chief Complaint'] = df['Chief Complaint'].astype(str)

In [296]:
stopWords = set(stopwords.words('english'))
print (stopWords)

{'myself', 'because', "should've", 'same', 'who', 'there', 'on', 'then', "haven't", 'aren', 'over', "weren't", 'himself', 'she', 'yourselves', "you'll", 'which', 'at', 'didn', 'here', 'am', 'as', 'will', 'again', 'itself', 'couldn', 'this', 'more', 'each', 'isn', 'before', 'from', 'll', 'through', 'than', 'ain', 'we', 'and', 'those', 'hasn', 'between', 'all', 'up', 'with', 'was', 'ourselves', 'themselves', 'herself', 'how', 'down', "it's", 'if', 'her', 'under', 'been', 'should', 'needn', 'other', "wasn't", 'nor', 'he', 'hadn', 'shouldn', 'above', 'theirs', 'its', 'after', 'did', 'be', 'do', 'a', 'mustn', 'him', 'not', 'o', 'their', 'no', 'just', 'had', 'my', 'mightn', 'when', "aren't", 'further', 'your', 'such', 'of', 'd', 'y', 'below', "mightn't", 'while', 'm', 'ours', 'to', "you're", 's', 'shan', 'our', 'both', 'having', 'don', 'very', 'it', "you've", "won't", 'against', 've', 'once', 't', 'does', "mustn't", 'me', "that'll", 'doing', 'them', 'an', 'ma', 'few', 'weren', 'have', 'his',

In [300]:
print ("Before preprocessing text sample:", df['Chief Complaint'][16608])
sentences = df['Chief Complaint']
sentences = sentences.apply(lambda x: re.findall(r"[\w']+", x)) #removing punctuations
sentences = [[x.lower() for x in sentence] for sentence in sentences] #converting all words to lower cases
sentences = [[x if x not in stopWords else '' for x in sentence] for sentence in sentences] #removing stopwords
sentences = [' '.join(sentence) for sentence in sentences]
df['Chief Complaint'] = sentences
df['Chief Complaint'] = df['Chief Complaint'].str.replace('\d+', '') #removing digits
print ("After preprocessing text sample:", df['Chief Complaint'][16608])

Before preprocessing text sample: referred  eval  r humeral head fx    
After preprocessing text sample: referred eval r humeral head fx


# Preprocessing ICD codes (label)

In [302]:
df['ICD-9 1'] = df['ICD-9 1'].astype(str)

In [303]:
df.dropna(subset=['ICD-9 1'],how = 'any', inplace = True)
df = df[df['ICD-9 1'] != 'nan']
#df = df[np.isfinite(df['ICD-9 1'])]

In [304]:
df.shape

(32088, 11)

## Deriving ICD Level 1  

In [305]:
df['ICD-level1'] = df['ICD-9 1'].apply(lambda x : x.split('.')[0])

In [306]:
df.sample(5)

Unnamed: 0,Record #,Chief Complaint,Age,Sex,Acuity (initial),Area of Care,Disposition,Unnamed: 7,ICD-9 1,Unnamed: 9,In ED,ICD-level1
18940,25176,body pain,53yr,M,2.0,Resuscitation,Home - Refer to Social Service,,724.2,,426.0,724
11515,15225,diarrhea,61yr,M,3.0,West,Home - Refer to Social Service,,787.91,,240.0,787
37224,48936,abdominal pain,48yr,F,3.0,East,Home,,789.02,,438.0,789
16066,21379,p allergic rxn,42yr,F,4.0,UADC,Left After Being Seen,,995.3,,61.0,995
4402,5749,bloody urine f c drainage bag x week,50yr,M,2.0,North,Home,,599.0,,208.0,599


In [307]:
df[df['ICD-level1'].str.get(0).isin(['E'])]['ICD-level1'].value_counts()

E819    152
E968     71
E814     60
E906     58
E888     54
E966     46
E922     46
E920     17
E813     10
E881     10
E812     10
E826      7
E885      6
E880      5
E953      5
E950      3
E958      3
E811      3
E860      2
E905      2
E828      2
E884      2
E882      2
E956      2
E914      1
E858      1
E980      1
E926      1
E925      1
E983      1
E800      1
E850      1
E886      1
E957      1
Name: ICD-level1, dtype: int64

In [308]:
df[df['ICD-level1'].str.get(0).isin(['V'])]['ICD-level1'].value_counts()

V68    286
V70    254
V67     80
V54     39
V72     25
V58     23
V22     23
V65     16
V55     13
V60     10
V01      9
V44      6
V23      3
V53      3
V74      3
V25      3
V82      3
V62      2
V40      2
V42      2
V45      1
V04      1
V11      1
V08      1
V30      1
V71      1
Name: ICD-level1, dtype: int64

In [309]:
mask = df['ICD-level1'].str.get(0).isin(['V'])
column_name = 'ICD-level1'
df.loc[mask, column_name] = 'V'
mask = df['ICD-level1'].str.get(0).isin(['E'])
df.loc[mask, column_name] = 'E'
#df[df['ICD-level1'].str.get(0).isin(['E'])]['ICD-level1'] = 'E'

In [310]:
df[df['ICD-level1'].str.get(0).isin(['V'])]['ICD-level1'].value_counts()

V    811
Name: ICD-level1, dtype: int64

In [311]:
def d_map(i):
    
    if i == 'E':
        return 'EXTERNAL CAUSES FOR INJURY'
    if i == 'V':
        return 'SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS'
    if i == '870-897?':
        return 'MISADVENTURES TO PATIENTS DURING SURGICAL AND MEDICAL CARE'

    if i.isdigit():
        i = int(i)
        if i >= 0 and i <= 139:
            return 'INFECTIOUS AND PARASITIC DISEASES'
        elif i <= 239:
            return 'NEOPLASMS'
        elif i <= 279:
            return 'ENDOCRINE, NUTRITIONAL AND METABOLIC DISEASES, AND IMMUNITY DISORDERS'
        elif i <= 319:
            return 'MENTAL DISORDERS'
        elif i <= 389:
            return 'DISEASES OF THE NERVOUS SYSTEM AND SENSE ORGANS'
        elif i <= 459:
            return 'DISEASES OF THE CIRCULATORY SYSTEM'
        elif i<= 519:
            return 'DISEASES OF THE RESPIRATORY SYSTEM'
        elif i <= 579:
            return 'DISEASES OF THE DIGESTIVE SYSTEM'
        elif i <= 629:
            return 'DISEASES OF THE GENITOURINARY SYSTEM'
        elif i <= 679:
            return 'COMPLICATIONS OF PREGNANCY, CHILDBIRTH, AND THE PUERPERIUM'
        elif i <= 709:
            return 'DISEASES OF THE SKIN AND SUBCUTANEOUS TISSUE'
        elif i <= 739:
            return 'DISEASES OF THE MUSCULOSKELETAL SYSTEM AND CONNECTIVE TISSUE'
        elif i >= 760 and i <= 779:
            return 'CERTAIN CONDITIONS ORIGINATING IN THE PERINATAL PERIOD'
        elif i <= 789:
            return 'SYMPTOMS'
        elif i <= 796:
            return 'NONSPECIFIC ABNORMAL FINDINGS'
        elif i == 797:
            return 'Senility without mention of psychosis'
        elif i == 798:
            return 'Sudden death, cause unknown'
        elif i == 799:
            return 'Other ill-defined and unknown causes of morbidity and mortality'
        elif i <= 999:
            return  'INJURY AND POISONING'
        else:
            print (i)
            return 'OTHERS'
    return 'OTHERS'

In [312]:
df.sample(5)

Unnamed: 0,Record #,Chief Complaint,Age,Sex,Acuity (initial),Area of Care,Disposition,Unnamed: 7,ICD-9 1,Unnamed: 9,In ED,ICD-level1
11543,15265,suicidal ideation,46yr,M,3.0,Psych Lock-down,Transfer to Outside Facility,,311.0,,2191.0,311
22561,29934,right mid quadrant pain,43yr,F,2.0,East,Home,,599.7,,205.0,599
9714,12785,cough fever,41yr,M,2.0,Observation,Home,,482.9,,2311.0,482
8727,11481,vaginal bleeding,38yr,F,3.0,West,Transfer to OB Triage/Gyn Procedure,,640.9,,234.0,640
12204,16131,asthma,47yr,F,4.0,North,Home,,493.9,,204.0,493


## Generating class 

In [315]:
df['class'] = df.apply(lambda row: d_map(row['ICD-level1']), axis = 1)

In [316]:
df.sample(5)

Unnamed: 0,Record #,Chief Complaint,Age,Sex,Acuity (initial),Area of Care,Disposition,Unnamed: 7,ICD-9 1,Unnamed: 9,In ED,ICD-level1,class
40230,52870,laceration lip,21yr,M,4.0,Jail,Jail,,873.53,,531.0,873,INJURY AND POISONING
33135,43649,abdominal pain,72yr,F,3.0,East,Admit as inpatient,,789.3,,836.0,789,SYMPTOMS
26921,35650,abdominal pain,49yr,M,4.0,West,Home,,608.89,,110.0,608,DISEASES OF THE GENITOURINARY SYSTEM
10050,13258,r arm pain back pain,49yr,M,4.0,West,Home,,724.5,,136.0,724,DISEASES OF THE MUSCULOSKELETAL SYSTEM AND CON...
37618,49463,vaginal bleeding,43yr,F,3.0,West,Home,,739.1,,94.0,739,DISEASES OF THE MUSCULOSKELETAL SYSTEM AND CON...


## Distribution of class

In [318]:
df['class'].value_counts(dropna = False)

SYMPTOMS                                                                 6521
INJURY AND POISONING                                                     4747
DISEASES OF THE MUSCULOSKELETAL SYSTEM AND CONNECTIVE TISSUE             2766
MENTAL DISORDERS                                                         2667
DISEASES OF THE DIGESTIVE SYSTEM                                         2649
DISEASES OF THE GENITOURINARY SYSTEM                                     2292
DISEASES OF THE RESPIRATORY SYSTEM                                       2121
DISEASES OF THE NERVOUS SYSTEM AND SENSE ORGANS                          1789
DISEASES OF THE CIRCULATORY SYSTEM                                       1487
DISEASES OF THE SKIN AND SUBCUTANEOUS TISSUE                             1424
ENDOCRINE, NUTRITIONAL AND METABOLIC DISEASES, AND IMMUNITY DISORDERS     871
SUPPLEMENTARY CLASSIFICATION OF FACTORS INFLUENCING HEALTH STATUS         811
INFECTIOUS AND PARASITIC DISEASES                               

# Droping junk features

In [319]:
df.drop(['Unnamed: 9', 'Unnamed: 7'], axis = 1, inplace = True)

In [320]:
df.sample()

Unnamed: 0,Record #,Chief Complaint,Age,Sex,Acuity (initial),Area of Care,Disposition,ICD-9 1,In ED,ICD-level1,class
36159,47562,chest pain,48yr,F,2.0,North,Home,786.59,724.0,786,SYMPTOMS


# Saving dataset

In [321]:
df.to_csv('ICD_preprocessed_dataset')