# Mount Drive and Load ICD9 Short Descriptions

---
**Descriptions sourced from:**  
[1] Robbins D. ICD9 [Internet]. GitHub; 2013. [updated 2013 Nov 11; cited 2024 Apr 14]. Available from: https://github.com/drobbins/ICD9/tree/master

**Categories sourced from:**  
[2] Venkataraman GR, Pineda AL, Bear Don't Walk Iv OJ, Zehnder AM, Ayyar S, Page RL, Bustamante CD, Rivas MA. FasTag: Automatic text classification of unstructured medical narratives. PLoS One. 2020 Jun 22;15(6):e0234647. doi: 10.1371/journal.pone.0234647. PMID: 32569327; PMCID: PMC7307763.  

[3] Gao C, Goswami M, Chen J, Dubrawski A. Classifying unstructured clinical notes via automatic weak supervision. arXiv. 2022 Jun [cited 2024 Mar 15]. In: arXiv:2206.12088 [cs.CL]. doi: 10.48550/arXiv.2206.12088.



In [1]:
from google.colab import drive
# Mount Google Drive inside the Colab runtime. force_remount=True asks Colab to
# mount again even when a mount is already present at this path.
drive.mount('/content/drive/', force_remount=True)
# Drive folder holding the ICD-9 input CSV; later cells also write their
# output CSVs into this folder.
directory_path = '/content/drive/MyDrive/CS598DLH/KeyClassReproducibility/icd9/'
# Name of the ICD-9 code/description CSV inside `directory_path`.
file = 'CMS32_DESC_LONG_SHORT_FORMATTED.csv'

Mounted at /content/drive/


In [2]:
import pandas as pd

def load_icd_desc(directory_path, file):
    """
    Load ICD codes and descriptions from a CSV file and normalize the text.

    Only the first two columns of the file are read. Descriptions have every
    character that is neither a word character (letters, digits, underscore)
    nor whitespace removed, and are then converted to lower case.

    Args:
        directory_path (str): The path to the directory containing the CSV
                              file, with or without a trailing separator.
        file (str): The name of the CSV file to be loaded.

    Returns:
        pandas.DataFrame: A DataFrame with two columns:
                          - 'code': containing the ICD codes as strings.
                          - 'desc': containing the processed descriptions,
                                    as lower-case strings.

    Example:
        >>> load_icd_desc('path/to/directory/', 'icd_codes.csv')
        DataFrame with ICD codes and descriptions.

    Note:
        The file should be in CSV format and encoded in 'latin-1'.
        The first row should contain headers (they are replaced by 'code'
        and 'desc'). It expects the ICD codes in the first column and
        descriptions in the second column of the CSV.
    """
    import os

    # os.path.join gives the same path as plain concatenation when the
    # directory already ends with a separator, and inserts one when it does not.
    csv_path = os.path.join(directory_path, file)

    icd = pd.read_csv(
        csv_path,
        header=0,
        usecols=[0, 1],
        names=['code', 'desc'],
        # Read codes as text so leading zeros are not lost to numeric parsing.
        dtype={'code': str, 'desc': str},
        encoding='latin-1',
    )

    # Raw string: \w and \s are regex character classes, not Python string
    # escapes, so a non-raw literal triggers invalid-escape warnings.
    icd['desc'] = icd['desc'].str.replace(r'[^\w\s]', '', regex=True).str.lower()

    return icd


# Load the per-code table (columns 'code' and 'desc') and preview the first rows.
icd = load_icd_desc(directory_path, file)
icd.head()

Unnamed: 0,code,desc
0,1.0,cholera due to vibrio cholerae
1,1.1,cholera due to vibrio cholerae el tor
2,1.9,cholera unspecified
3,2.0,typhoid fever
4,2.1,paratyphoid fever a


In [3]:
# Category label -> range of integer parts of numeric ICD-9 codes it covers.
# Built once at import time instead of on every categorize_code call.
# NOTE(review): standard ICD-9 titles 780-799 "Symptoms, Signs, and Ill-Defined
# Conditions"; the label below is kept as written because these categories are
# stated to follow the referenced papers rather than ICD-9 -- confirm.
_ICD9_CATEGORIES = {
    "Infections & Parasitic": range(1, 140),
    "Neoplasms": range(140, 240),
    "Endocrine, Nutritional and Metabolic": range(240, 280),
    "Blood and Blood Forming Organs": range(280, 290),
    "Mental Disorders": range(290, 320),
    "Nervous System": range(320, 360),
    "Sense Organs": range(360, 390),
    "Circulatory System": range(390, 460),
    "Respiratory System": range(460, 520),
    "Digestive System": range(520, 580),
    "Genitourinary System": range(580, 630),
    "Pregnancy, Childbirth and the Puerperium": range(630, 680),
    "Skin and Subcutaneous Tissue": range(680, 710),
    "Musculoskeletal System and Connective Tissue": range(710, 740),
    "Congenital Anomalies": range(740, 760),
    "Perinatal Period Conditions": range(760, 780),
    "External Causes of Injury": range(780, 800),
    "Injury and Poisoning": range(800, 1000),
}


def categorize_code(code):
    """
    Categorizes an ICD-9 code into various medical categories based on the
    code number.

    This function maps ICD-9 codes to their corresponding categories such as
    'Infections & Parasitic', 'Neoplasms', etc. If the code contains any
    alphabetic characters, it is categorized as 'Supplementary'.
    Otherwise the code is parsed as a number and only its integer part is
    used for the lookup (so '1.9' is looked up as 1). If the integer part
    does not fall into any predefined range (e.g. 0 or 1000 and above),
    it is labeled as 'Uncategorized'.

    Args:
        code (str): The ICD-9 code as a string.

    Returns:
        str: The category of the ICD-9 code.

    Raises:
        ValueError: If the input is not a string, or if it contains no
                    letters and still cannot be parsed as a number
                    (for example an empty string).

    Example:
        >>> categorize_code('123')
        'Infections & Parasitic'
        >>> categorize_code('V23')
        'Supplementary'

    Note:
        These high-level categories do not follow standard ICD-9 categories,
        but reference the FasTag & Classifying Unstructured Clinical Notes papers.
    """
    # Enforce the documented contract; without this check a non-string such as
    # an int or NaN fails with TypeError when it is iterated.
    if not isinstance(code, str):
        raise ValueError(
            f"ICD-9 code must be a string, got {type(code).__name__}"
        )

    # Any letter in the code marks it as Supplementary.
    if any(char.isalpha() for char in code):
        return "Supplementary"

    # Go through float first so codes with a decimal part parse, then truncate.
    major = int(float(code))
    for category, code_range in _ICD9_CATEGORIES.items():
        if major in code_range:
            return category
    return "Uncategorized"

# Label every code with its high-level category.
icd['category'] = icd['code'].apply(categorize_code)
# Sanity check: prints True if any code fell outside every category range.
print("Uncategorized" in icd['category'].values)
icd.head()

False


Unnamed: 0,code,desc,category
0,1.0,cholera due to vibrio cholerae,Infections & Parasitic
1,1.1,cholera due to vibrio cholerae el tor,Infections & Parasitic
2,1.9,cholera unspecified,Infections & Parasitic
3,2.0,typhoid fever,Infections & Parasitic
4,2.1,paratyphoid fever a,Infections & Parasitic


In [None]:
def concat_desc(df):
    """
    Concatenates descriptions within each category into a single string for
    each category.

    This function groups the DataFrame by the 'category' column and joins
    all descriptions within each category with single spaces, in the order
    they appear in the DataFrame. Missing descriptions (NaN/None) are skipped
    instead of raising; a category whose descriptions are all missing gets
    an empty string. The index is reset so that 'category' remains a column
    and not an index. The input DataFrame is not modified.

    Args:
        df (pandas.DataFrame): A DataFrame with at least two columns:
                               'category' and 'desc'.
                               The 'category' column contains categorical data,
                               and 'desc' contains text descriptions.

    Returns:
        pandas.DataFrame:      A DataFrame with two columns (any other input
                               column, such as 'code', is dropped), one row
                               per category, ordered by sorted category:
                                - 'category': The unique categories from the
                                              input DataFrame.
                                - 'desc': A single string containing all
                                          descriptions for each category,
                                          concatenated together.

    Example:
        >>> data = pd.DataFrame({
            'category': ['A', 'A', 'B', 'B'],
            'desc': ['desc1', 'desc2', 'desc3', 'desc4']
        })
        >>> concat_desc(data)
            category   desc
        0      A      'desc1 desc2'
        1      B      'desc3 desc4'
    """
    def _join_present(descriptions):
        # str.join raises TypeError on NaN/None, so drop missing entries first.
        return ' '.join(descriptions.dropna().astype(str))

    df = df.groupby(['category']).agg({'desc': _join_present}).reset_index()
    return df

# Collapse to one row per category. `icd` is rebound here, so from this point
# on it holds only 'category' and 'desc' (no per-code rows).
icd = concat_desc(icd)

In [None]:
# Imports for tokenization and text processing

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter
import re
import nltk


# Fetch the NLTK data used by the cells below: tokenizer models for
# word_tokenize and the English stopword list.
nltk.download('punkt')
# Recent NLTK releases (3.9+) make word_tokenize load the 'punkt_tab' resource
# instead of the pickled 'punkt' models; download both so the notebook runs on
# either side of that change.
nltk.download('punkt_tab')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
def extract_desc(description, top_n=30):
    """
    Process the given description to remove stopwords and return the
    `top_n` most common words as a comma-separated string.

    The text is tokenized with NLTK's word_tokenize; tokens that are English
    stopwords or are not purely alphabetic are discarded. The stopword
    comparison is case-sensitive, so the text is expected to be lower-cased
    already.

    Args:
    description (str): The text description to process.
    top_n (int): How many of the most frequent words to keep. Defaults to 30.

    Returns:
    str: The `top_n` most common words, most frequent first, joined by ', '.
         Words with equal counts keep the order in which they first appear.
         Fewer words are returned when the text has fewer distinct words.
    """

    stop_words = set(stopwords.words('english'))
    words = word_tokenize(description)

    # isalpha() drops numbers, punctuation and mixed tokens.
    filtered_words = [
        word for word in words
        if word.isalpha() and word not in stop_words
    ]

    most_common = Counter(filtered_words).most_common(top_n)

    return ', '.join(word for word, _ in most_common)

# Replace each category's concatenated text with its most frequent keywords
# (a ', '-separated string).
icd['desc'] = icd['desc'].apply(extract_desc)

In [None]:
# Preview the per-category keyword strings.
icd.head()

Unnamed: 0,category,desc
0,Blood and Blood Forming Organs,"anemia, unspecified, disease, deficiency, spec..."
1,Circulatory System,"unspecified, disease, heart, chronic, acute, i..."
2,Congenital Anomalies,"congenital, anomalies, unspecified, specified,..."
3,Digestive System,"unspecified, obstruction, without, mention, he..."
4,"Endocrine, Nutritional and Metabolic","unspecified, type, diabetes, uncontrolled, dis..."


In [None]:
def write_to_csv(df, file_path):
    """
    Save the 'category' and 'desc' columns of a DataFrame as a CSV file.

    The row index is not written; any other columns in `df` are left out.

    Args:
    df (pd.DataFrame): Source data; must contain 'category' and 'desc'.
    file_path (str): Destination path of the CSV file.
    """
    exported_columns = ['category', 'desc']
    selection = df[exported_columns]
    selection.to_csv(file_path, index=False)

In [None]:
# Show every category followed by its keywords, one keyword per line.
for category, keywords in zip(icd['category'], icd['desc']):
    print(f"Category: {category}")
    print('\n'.join(keywords.split(', ')))
    print("\n -------------------- \n")

Category: Blood and Blood Forming Organs
anemia
unspecified
disease
deficiency
specified
anemias
thalassemia
blood
crisis
hemolytic
sicklecell
congenital
neutropenia
iron
secondary
chronic
hereditary
due
without
cell
factor
disorder
hemorrhagic
purpura
thrombocytopenia
white
syndrome
acquired
aplastic
induced

 -------------------- 

Category: Circulatory System
unspecified
disease
heart
chronic
acute
infarction
artery
embolism
kidney
thrombosis
stage
failure
cerebrovascular
venous
without
episode
care
cerebral
late
effects
myocardial
hypertensive
syndrome
atherosclerosis
aneurysm
wall
specified
veins
diseases
rheumatic

 -------------------- 

Category: Congenital Anomalies
congenital
anomalies
unspecified
specified
anomaly
cleft
complete
deficiency
incomplete
longitudinal
without
limb
system
syndrome
partial
stenosis
lip
atresia
palate
ear
pulmonary
spina
bifida
hydrocephalus
region
deformities
upper
unilateral
bilateral
uterus

 -------------------- 

Category: Digestive System
unsp

In [None]:
# Save the per-category keyword strings next to the input data.
write_to_csv(icd, directory_path+'class_desc.csv')

## Possible Alteration to Class Descriptions

---
Removes any words found in more than one class to further reduce ambiguity.



In [None]:
from collections import Counter

# Gather the keywords of every category into one flat list.
all_words = []
for keyword_list in icd['desc'].str.split(', '):
    all_words.extend(keyword_list)

# Count how often each keyword occurs overall. Assuming a row never repeats a
# keyword (as produced by extract_desc), a count above 1 means the word is
# shared by more than one category.
word_counts = Counter(all_words)
common_words = set()
for word, count in word_counts.items():
    if count > 1:
        common_words.add(word)


def remove_common_words(description):
    """Return `description` without the keywords listed in `common_words`.

    `description` is a ', '-separated keyword string; the result uses the
    same separator and keeps the remaining keywords in their original order.
    """
    kept = (word for word in description.split(', ') if word not in common_words)
    return ', '.join(kept)


# Work on a copy so the unfiltered `icd` table stays available.
icd_v2 = icd.copy()
icd_v2['desc'] = icd_v2['desc'].apply(remove_common_words)

write_to_csv(icd_v2, directory_path+'class_desc_v2.csv')

# Show every category followed by its remaining keywords, one per line.
for category, keywords in zip(icd_v2['category'], icd_v2['desc']):
    print(f"Category: {category}")
    print('\n'.join(keywords.split(', ')))
    print("\n -------------------- \n")

Category: Blood and Blood Forming Organs
anemia
anemias
thalassemia
blood
hemolytic
sicklecell
neutropenia
iron
hereditary
factor
hemorrhagic
purpura
thrombocytopenia
white
aplastic

 -------------------- 

Category: Circulatory System
heart
infarction
artery
embolism
thrombosis
failure
cerebrovascular
venous
late
myocardial
hypertensive
atherosclerosis
aneurysm
wall
veins
rheumatic

 -------------------- 

Category: Congenital Anomalies
anomaly
cleft
complete
incomplete
longitudinal
partial
stenosis
lip
atresia
palate
spina
bifida
hydrocephalus
deformities
unilateral
bilateral
uterus

 -------------------- 

Category: Digestive System
perforation
hernia
gangrene
intestine
cholecystitis
gastric
teeth
recurrent
dental
gallbladder
jaw
calculus
duodenal
peptic
gastrojejunal
bile
duct

 -------------------- 

Category: Endocrine, Nutritional and Metabolic
diabetes
uncontrolled
metabolism
mellitus
ii
juvenile
goiter
vitamin
thyrotoxic
storm
toxic
gouty
hypothyroidism
thyroiditis
coma

 ----

In [None]:
# Save the filtered table (icd_v2) under the v2 name. Writing `icd` here
# replaces the v2 file with the unfiltered keywords.
write_to_csv(icd_v2, directory_path+'class_desc_v2.csv')