In [1]:
import pandas as pd
import json

In [2]:
# Read data/data.csv into `df`; the cells below rewrite its `dx` column in place.
df = pd.read_csv('data/data.csv')

In [3]:
# Normalize `dx`: trim surrounding whitespace and lowercase, so that values
# differing only in padding or letter case become the same string.
df['dx'] = df['dx'].str.strip().str.lower()

In [4]:
# Number of rows per distinct normalized `dx` value.
df['dx'].value_counts()

dx
early gastric cancer                        366
rectal cancer                               270
hepatocellular carcinoma                    247
colon cancer, sigmoid                       207
thyroid nodule                              202
                                           ... 
systemic lupus erythematosus                  1
anal fissure                                  1
malignant neoplasm of salivary gland          1
biliary acute pancreatitis                    1
colon submucosal tumor, unknown behavior      1
Name: count, Length: 842, dtype: int64

In [5]:
# Distinct `dx` values as a standalone Series, so the `.str` accessor and
# `.replace` can be used on the list of diagnosis names itself.
diagnoses = pd.Series(df['dx'].unique())

In [6]:
# Diagnoses that contain no space character, i.e. are written as one word.
single_worded_diagnoses = pd.Series(
    [name for name in diagnoses if ' ' not in name]
)
single_worded_diagnoses.size

67

In [7]:
# Build a {longer diagnosis -> single-word diagnosis} lookup: every diagnosis
# that contains a single-word diagnosis as a substring is folded into it.
mapping = {}
for short_name in single_worded_diagnoses:
    if not short_name:
        # An empty string is a substring of everything; matching on it would
        # collapse every diagnosis into ''.
        continue
    # regex=False: diagnosis names are literal text, not patterns. With the
    # default regex=True, characters such as '(', ')', '.', '+' in a name are
    # interpreted as regex syntax and can mis-match or raise.
    contains_short = diagnoses.str.contains(short_name, regex=False)
    # dict.update keeps "last writer wins": if several single-word names occur
    # in the same longer diagnosis, the one iterated last is kept.
    mapping.update(
        {
            long_name: short_name
            for long_name in diagnoses[contains_short]
            if long_name != short_name
        }
    )

# Dump the lookup so it can be inspected outside the notebook.
# NOTE(review): later cells write to the same 'mapping.json' path, so the file
# only ever reflects the most recently executed stage.
with open('mapping.json', 'w') as f:
    json.dump(mapping, f)

In [8]:
# Apply the current `mapping` both to the data column and to the working list
# of diagnosis names, then de-duplicate the list again.
df['dx'] = df['dx'].replace(mapping)
diagnoses = pd.Series(diagnoses.replace(mapping).unique())

In [9]:
# Remaining diagnoses made of exactly two whitespace-separated words.
two_worded_diagnoses = pd.Series(
    [name for name in diagnoses if len(name.split()) == 2]
)
two_worded_diagnoses.size

207

In [10]:
# Build a {longer diagnosis -> two-word diagnosis} lookup: every diagnosis
# that contains a two-word diagnosis as a substring is folded into it.
mapping = {}
for short_name in two_worded_diagnoses:
    # regex=False: diagnosis names are literal text, not patterns. With the
    # default regex=True, characters such as '(', ')', '.', '+' in a name are
    # interpreted as regex syntax and can mis-match or raise.
    contains_short = diagnoses.str.contains(short_name, regex=False)
    # dict.update keeps "last writer wins": if several two-word names occur in
    # the same longer diagnosis, the one iterated last is kept.
    # NOTE(review): this is plain substring matching, so it also pairs names
    # such as 'colorectal cancer' with 'rectal cancer'; unwanted pairs have to
    # be dropped from `mapping` before it is applied.
    mapping.update(
        {
            long_name: short_name
            for long_name in diagnoses[contains_short]
            if long_name != short_name
        }
    )

# Dump the lookup so it can be inspected outside the notebook (overwrites any
# existing 'mapping.json').
with open('mapping.json', 'w') as f:
    json.dump(mapping, f)

In [11]:
# Diagnoses that must NOT be rewritten even though they contain another
# two-word diagnosis as a substring; their entries are dropped from `mapping`.
# NOTE(review): only these exact strings are protected; longer names that
# contain them are still subject to the mapping.
exemptions = ["colorectal cancer", "gallbladder cancer"]
mapping = dict(
    (long_name, short_name)
    for long_name, short_name in mapping.items()
    if long_name not in exemptions
)

# Apply the filtered mapping to the data column and to the working list of
# diagnosis names, then de-duplicate the list again.
df['dx'] = df['dx'].replace(mapping)
diagnoses = pd.Series(diagnoses.replace(mapping).unique())

In [12]:
# Remaining diagnoses made of exactly three whitespace-separated words.
three_worded_diagnoses = pd.Series(
    [name for name in diagnoses if len(name.split()) == 3]
)
three_worded_diagnoses.size

107

In [13]:
# Build a {longer diagnosis -> three-word diagnosis} lookup: every diagnosis
# that contains a three-word diagnosis as a substring is folded into it.
mapping = {}
for short_name in three_worded_diagnoses:
    # regex=False: diagnosis names are literal text, not patterns. With the
    # default regex=True, characters such as '(', ')', '.', '+' in a name are
    # interpreted as regex syntax and can mis-match or raise.
    contains_short = diagnoses.str.contains(short_name, regex=False)
    # dict.update keeps "last writer wins": if several three-word names occur
    # in the same longer diagnosis, the one iterated last is kept.
    mapping.update(
        {
            long_name: short_name
            for long_name in diagnoses[contains_short]
            if long_name != short_name
        }
    )

# Dump the lookup so it can be inspected outside the notebook (overwrites any
# existing 'mapping.json').
with open('mapping.json', 'w') as f:
    json.dump(mapping, f)

In [14]:
# Coarser target labels. Each label maps to itself: any `mapping` key that
# contains the label as a substring is redirected to that label.
further_aggregated = {
    label: label
    for label in ('gastric cancer', 'inguinal hernia', 'renal cell carcinoma')
}

# Only entries already present in `mapping` are redirected; no keys are added
# or removed, so assigning while iterating over the dict is safe. If a key
# contains more than one label, the label listed last wins.
for long_name in mapping:
    for needle, target in further_aggregated.items():
        if needle in long_name:
            mapping[long_name] = target

# Dump the adjusted lookup for inspection (overwrites any existing
# 'mapping.json').
with open('mapping.json', 'w') as f:
    json.dump(mapping, f)

In [15]:
# Apply the current `mapping` both to the data column and to the working list
# of diagnosis names, then de-duplicate the list again.
df['dx'] = df['dx'].replace(mapping)
diagnoses = pd.Series(diagnoses.replace(mapping).unique())

In [16]:
# Relabel every diagnosis that occurs fewer than 2 times (i.e. exactly once)
# as the catch-all 'other'.
dx_counts = df['dx'].value_counts()
rare_dx = dx_counts[dx_counts < 2].index
df.loc[df['dx'].isin(rare_dx), 'dx'] = 'other'

In [17]:
# Capitalize each label for presentation, then show how many rows fall under
# each label, largest group first.
df['dx'] = df['dx'].str.capitalize()
dx_group_sizes = df.groupby('dx').size()
dx_group_sizes.sort_values(ascending=False)

dx
Colon cancer                        549
Early gastric cancer                366
Other                               320
Lung cancer                         315
Rectal cancer                       270
                                   ... 
Metastatic cancer to the abdomen      2
Metastasis of unknown origin          2
Mediastinal tumor                     2
Carcinoid tumor of lung left          2
Fournier's gangrene, male             2
Length: 287, dtype: int64

In [18]:
# Number of rows per distinct `dx` label after cleaning.
df['dx'].value_counts()

dx
Colon cancer                                  549
Early gastric cancer                          366
Other                                         320
Lung cancer                                   315
Rectal cancer                                 270
                                             ... 
Endometrial cancer, stage 1a                    2
Fournier's gangrene, male                       2
Liver function tests abnormality                2
Duodenal polyp                                  2
Intrahepatic duct stonewithout obstruction      2
Name: count, Length: 287, dtype: int64

In [19]:
# Write the frame with the rewritten `dx` column to a separate CSV file;
# index=False leaves the row index out of the file.
df.to_csv('data/data_cleaned.csv', index=False)