In [1]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
from scipy.stats import truncnorm
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Seed both Python's `random` module and NumPy's legacy global RNG so the
# synthetic data generated in the cells below is reproducible on re-run.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

#### Demographics

In [3]:
# Start from an empty frame; feature columns are added one cell at a time below.
lupus_df = pd.DataFrame()
# Disabled demographic feature. NOTE: `sample_num` is only defined in the next
# cell, so this line cannot simply be re-enabled here.
#lupus_df['gender']= np.random.choice([0, 1], sample_num, p=[0.5, 0.5])
lupus_df.head()

In [4]:
# Number of synthetic patient rows to generate.
sample_num = 70000

# Sampling probabilities for the three-valued criteria (missing / 1 / 0).
# They must sum to 1: 0.95 + 0.025 + 0.025.
nan_frac = 0.95
pos_frac = neg_frac = 0.025

# Sampling probabilities used for the renal biopsy class column.
renal_nan_frac = 0.90
renal_neg_frac = 0.025
renal_pos_frac = 0.0125

#### Entry criterion

In [5]:
# Entry criterion: every row gets an ANA result, 1 or 0 with equal probability.
lupus_df['ana'] = np.random.choice([1, 0], size=sample_num, p=[0.5, 0.5])
lupus_df.head()

Unnamed: 0,ana
0,1
1,0
2,0
3,0
4,1


In [6]:
# Sanity check: the two ANA outcomes should be roughly balanced.
lupus_df['ana'].value_counts()

0    35044
1    34956
Name: ana, dtype: int64

#### Additional criteria

In [7]:
# Points awarded for each individual criterion when it is present.
# Keys are grouped by the clinical domain they are scored under.
criteria_weights = {
    # constitutional
    'fever': 2,
    # hematologic
    'leukopenia': 3,
    'thrombocytopenia': 4,
    'auto_immune_hemolysis': 4,
    # neuropsychiatric
    'delirium': 2,
    'psychosis': 3,
    'seizure': 5,
    # mucocutaneous
    'non_scarring_alopecia': 2,
    'oral_ulcers': 2,
    'subacute_cutaneous': 4,
    'discoid_lupus': 4,
    'acute_cutaneous_lupus': 6,
    # serosal
    'pleural_effusion': 5,
    'pericardial_effusion': 5,
    'acute_pericarditis': 6,
    # musculoskeletal
    'joint_involvement': 6,
    # renal
    'proteinuria': 4,
    'renal_biopsy_2_5': 8,
    'renal_biopsy_3_4': 10,
    # antiphospholipid antibodies
    'anti_cardioliphin_antibodies': 2,
    'anti_b2gp4_antibodies': 2,
    'lupus_anti_coagulant': 2,
    # complement proteins
    'low_c3_or_low_c4': 3,
    'low_c3_and_low_c4': 4,
    # SLE-specific antibodies
    'anti_dsdna_antibody': 6,
    'anti_smith_antibody': 6,
}

In [8]:
def get_truncated_normal(mean=0, sd=1, low=0, upp=10, size=100):
    """Draw `size` samples from a normal(mean, sd) truncated to [low, upp]."""
    # truncnorm takes its clip points in units of `sd` measured from `mean`.
    lower_std = (low - mean) / sd
    upper_std = (upp - mean) / sd
    distribution = truncnorm(lower_std, upper_std, loc=mean, scale=sd)
    return distribution.rvs(size=size)

In [9]:
def create_proteinuria(size, max_val, skewness):
    """Generate skewed synthetic proteinuria values rescaled to [0, max_val].

    Samples are drawn from a skew-normal distribution and then min-max
    rescaled so the smallest sample maps to 0 and the largest to `max_val`.

    Args:
        size: number of samples to draw (passed to `skewnorm.rvs`).
        max_val: upper bound of the rescaled values.
        skewness: shape parameter `a` of the skew-normal distribution.

    Returns:
        NumPy array of rescaled samples. An empty draw is returned unchanged,
        and a draw whose values are all equal (e.g. `size=1`) is returned as
        zeros rather than NaN from a 0/0 division.
    """
    from scipy.stats import skewnorm

    samples = skewnorm.rvs(a=skewness, loc=max_val, size=size)
    if samples.size == 0:
        return samples

    # Shift so the minimum is zero. The array methods replace the Python
    # builtins min()/max(), which iterate the array element by element.
    shifted = samples - samples.min()
    spread = shifted.max()
    if spread == 0:
        # All samples identical: nothing to scale, avoid dividing by zero.
        return shifted

    # Normalise to [0, 1], then stretch to [0, max_val].
    return shifted / spread * max_val

In [10]:
# Three-valued criteria: each column is drawn independently as 0, 1 or NaN
# (missing) with probabilities neg_frac / pos_frac / nan_frac.
#
# NOTE: the columns are generated in exactly this order, with the renal
# columns in between the two lists. The order determines both the column
# layout of `lupus_df` and the sequence in which the seeded global RNG is
# consumed, so reordering the lists changes the generated dataset.
binary_feats_before_renal = [
    'fever', 'leukopenia', 'thrombocytopenia', 'auto_immune_hemolysis',
    'delirium', 'psychosis', 'seizure',
    'non_scarring_alopecia', 'oral_ulcers', 'subacute_cutaneous',
    'discoid_lupus', 'acute_cutaneous_lupus',
    'pleural_effusion', 'pericardial_effusion', 'acute_pericarditis',
    'joint_involvement',
]
binary_feats_after_renal = [
    'anti_cardioliphin_antibodies', 'anti_b2gp4_antibodies', 'lupus_anti_coagulant',
    'c3', 'c4',
    'anti_dsdna_antibody', 'anti_smith_antibody',
]

for feat_name in binary_feats_before_renal:
    lupus_df[feat_name] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])

# Continuous proteinuria values in [0, 3]; only a random 30% of the rows keep
# a value. Assigning the sampled Series back aligns on the index, so the
# remaining 70% of rows become NaN.
lupus_df['proteinuria'] = create_proteinuria(sample_num, 3, 100)
lupus_df['proteinuria'] = lupus_df['proteinuria'].sample(frac=0.3)

# Renal biopsy class 0-5 or NaN: classes 0 and 1 each use renal_neg_frac,
# classes 2-5 each use renal_pos_frac, and NaN uses renal_nan_frac.
lupus_df['renal_biopsy_class'] = np.random.choice(
    [0, 1, 2, 3, 4, 5, np.nan],
    sample_num,
    p=[renal_neg_frac, renal_neg_frac,
       renal_pos_frac, renal_pos_frac, renal_pos_frac, renal_pos_frac,
       renal_nan_frac],
)

for feat_name in binary_feats_after_renal:
    lupus_df[feat_name] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])

lupus_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,joint_involvement,proteinuria,renal_biopsy_class,anti_cardioliphin_antibodies,anti_b2gp4_antibodies,lupus_anti_coagulant,c3,c4,anti_dsdna_antibody,anti_smith_antibody
0,1,1.0,,,,,,,0.0,1.0,...,,,,,,,,,,
1,0,,,,,,0.0,,,,...,,0.218628,,1.0,,,,,,1.0
2,0,,,,,,,,,,...,,0.367514,,,,,,,,
3,0,,,,1.0,,,,,,...,,,,,,,,,,
4,1,,,,,,,1.0,,,...,,0.44202,,1.0,,,,,,


In [11]:
# Number of missing values per column (`isnull` is the pandas alias of `isna`).
lupus_df.isnull().sum()

ana                                 0
fever                           66539
leukopenia                      66570
thrombocytopenia                66610
auto_immune_hemolysis           66520
delirium                        66387
psychosis                       66552
seizure                         66559
non_scarring_alopecia           66549
oral_ulcers                     66529
subacute_cutaneous              66511
discoid_lupus                   66557
acute_cutaneous_lupus           66488
pleural_effusion                66524
pericardial_effusion            66495
acute_pericarditis              66549
joint_involvement               66592
proteinuria                     49000
renal_biopsy_class              62977
anti_cardioliphin_antibodies    66459
anti_b2gp4_antibodies           66560
lupus_anti_coagulant            66478
c3                              66481
c4                              66427
anti_dsdna_antibody             66552
anti_smith_antibody             66463
dtype: int64

In [12]:
# Which dataframe columns belong to each clinical domain. Insertion order is
# the order in which the domains are visited when a row is scored.
domains_feat_dict = dict(
    constitutional=['fever'],
    hematologic=['leukopenia', 'thrombocytopenia', 'auto_immune_hemolysis'],
    neuropsychiatric=['delirium', 'psychosis', 'seizure'],
    mucocutaneous=[
        'non_scarring_alopecia',
        'oral_ulcers',
        'subacute_cutaneous',
        'discoid_lupus',
        'acute_cutaneous_lupus',
    ],
    serosal=['pleural_effusion', 'pericardial_effusion', 'acute_pericarditis'],
    musculoskeletal=['joint_involvement'],
    renal=['proteinuria', 'renal_biopsy_class'],
    antiphospholipid_antibodies=[
        'anti_cardioliphin_antibodies',
        'anti_b2gp4_antibodies',
        'lupus_anti_coagulant',
    ],
    complement_proteins=['c3', 'c4'],
    sle_specific_antibodies=['anti_dsdna_antibody', 'anti_smith_antibody'],
)

# Upper bound on the points a single domain may contribute; used as a sanity
# check when domain scores are computed.
domains_max_scores_dict = {
    'constitutional': 2,
    'hematologic': 4,
    'neuropsychiatric': 5,
    'mucocutaneous': 6,
    'serosal': 6,
    'musculoskeletal': 6,
    'renal': 10,
    'antiphospholipid_antibodies': 2,
    'complement_proteins': 4,
    'sle_specific_antibodies': 6,
}


#### Labelling the dataset

In [13]:
def get_renal_biopsy_score(result_class):
    """Map a renal biopsy class to its score.

    Classes 3 and 4 score 10, classes 2 and 5 score 8, and anything else
    (including NaN) scores 0.
    """
    if result_class in (3, 4):
        return 10
    if result_class in (2, 5):
        return 8
    return 0

In [14]:
def get_proteinura_score(amount):
    """Return 4 points when `amount` is strictly above 0.5, otherwise 0.

    A NaN amount compares False and therefore scores 0.
    """
    return 4 if amount > 0.5 else 0

In [15]:
def get_feat_score(row, feat):
    """Return the points a single feature contributes for one row.

    Args:
        row: mapping-like record (e.g. a dataframe row) indexed by feature name.
        feat: name of the feature to score.

    Returns:
        The proteinuria or renal-biopsy score for those two features;
        otherwise `criteria_weights[feat]` when the value is strictly
        positive, and 0 when it is zero, negative or missing (NaN).
    """
    value = row[feat]
    if feat == 'proteinuria':
        return get_proteinura_score(value)
    if feat == 'renal_biopsy_class':
        return get_renal_biopsy_score(value)
    # `not (value > 0)` is True for zero, negatives AND NaN. The previous
    # `value <= 0` test let NaN fall through and earn the full weight.
    if not (value > 0):
        return 0
    return criteria_weights[feat]

In [16]:
def get_c3_c4_score(c3, c4): # 0 - low, 1 is not low
    """Score the complement domain from the two complement results.

    A value of 0 means "low". Both low earns the 'low_c3_and_low_c4' weight,
    exactly one low earns the 'low_c3_or_low_c4' weight, and anything else
    (including NaN on both) earns 0.
    """
    c3_is_low = c3 == 0
    c4_is_low = c4 == 0
    if c3_is_low and c4_is_low:
        return criteria_weights['low_c3_and_low_c4']
    if c3_is_low or c4_is_low:
        return criteria_weights['low_c3_or_low_c4']
    return 0

In [17]:
def get_domain_score(row, domain, skip_features=('renal_biopsy_class',)):
    """Return the score of one clinical domain for one row.

    The domain score is the highest score among its features, not the sum.
    The complement domain is special: c3 and c4 are scored jointly through
    `get_c3_c4_score` rather than feature by feature.

    Args:
        row: record (e.g. a dataframe row) holding the feature values.
        domain: key of `domains_feat_dict` / `domains_max_scores_dict`.
        skip_features: feature names to leave out of the scoring. The default
            keeps the existing behaviour of ignoring 'renal_biopsy_class';
            pass `()` to let the biopsy result count towards the renal domain.

    Returns:
        The domain score (0 when no feature of the domain is present).

    Raises:
        ValueError: if the computed score exceeds the domain's maximum.
    """
    if domain == 'complement_proteins':
        domain_score = get_c3_c4_score(row.c3, row.c4)
        # c3/c4 are already accounted for above; score only any other features.
        candidate_feats = [f for f in domains_feat_dict[domain] if f not in ('c3', 'c4')]
    else:
        domain_score = 0
        candidate_feats = domains_feat_dict[domain]

    for feat in candidate_feats:
        if feat in skip_features:
            continue
        # `>= 0` is False for NaN and for negative missing-value codes, so
        # unmeasured features never contribute.
        if row[feat] >= 0:
            domain_score = max(domain_score, get_feat_score(row, feat))

    if domain_score > domains_max_scores_dict[domain]:
        raise ValueError('The score is too large for this domain!')
    return domain_score

In [18]:
def create_label(row):
    """Return 1 when the row reaches the score threshold, else 0.

    A row with `ana == 0` is labelled 0 without scoring. Otherwise the domain
    scores are accumulated in `domains_feat_dict` order and the function
    returns 1 as soon as the running total reaches 10.
    """
    if row['ana'] == 0:
        return 0

    running_total = 0
    for domain_name in domains_feat_dict:
        running_total += get_domain_score(row, domain_name)
        if running_total >= 10:
            # Threshold reached: remaining domains need not be evaluated.
            return 1
    return 0

In [19]:
def compute_score(row):
    """Return the total score of a row: the sum of all its domain scores.

    A row with `ana == 0` (negative) scores 0 without evaluating any domain.
    """
    if row['ana'] == 0:  # negative - 0 positive - 1
        return 0
    return sum(get_domain_score(row, domain_name) for domain_name in domains_feat_dict)

In [20]:
# Sanity check of the generated frame: (rows, columns) before the label is added.
lupus_df.shape

(70000, 26)

In [21]:
# List the generated feature columns in their storage order.
lupus_df.columns

Index(['ana', 'fever', 'leukopenia', 'thrombocytopenia',
       'auto_immune_hemolysis', 'delirium', 'psychosis', 'seizure',
       'non_scarring_alopecia', 'oral_ulcers', 'subacute_cutaneous',
       'discoid_lupus', 'acute_cutaneous_lupus', 'pleural_effusion',
       'pericardial_effusion', 'acute_pericarditis', 'joint_involvement',
       'proteinuria', 'renal_biopsy_class', 'anti_cardioliphin_antibodies',
       'anti_b2gp4_antibodies', 'lupus_anti_coagulant', 'c3', 'c4',
       'anti_dsdna_antibody', 'anti_smith_antibody'],
      dtype='object')

In [132]:
# Label every row by applying the scoring rules row-wise.
# Optional: uncomment to also store the raw total score of each row.
#lupus_df['score'] = lupus_df.apply(compute_score, axis=1)
lupus_df['label'] = lupus_df.apply(create_label, axis=1)
lupus_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,renal_biopsy_class,anti_cardioliphin_antibodies,anti_b2gp4_antibodies,lupus_anti_coagulant,c3,c4,anti_dsdna_antibody,anti_smith_antibody,label
0,1,1.0,,,,,,,0.0,1.0,...,,,,,,,,,,0
1,0,,,,,,0.0,,,,...,0.218628,,1.0,,,,,,1.0,0
2,0,,,,,,,,,,...,0.367514,,,,,,,,,0
3,0,,,,1.0,,,,,,...,,,,,,,,,,0
4,1,,,,,,,1.0,,,...,0.44202,,1.0,,,,,,,0


In [133]:
# ANA-positive rows that were nevertheless labelled negative.
# i need at least 20000 here
a = lupus_df.loc[lupus_df['ana'].eq(1) & lupus_df['label'].eq(0)]
len(a)

33261

In [134]:
# Class balance of the generated labels.
lupus_df['label'].value_counts()

0    68305
1     1695
Name: label, dtype: int64

#### delete from here

#### End here

In [135]:
# Missing values per column after labelling (`isnull` is the alias of `isna`).
lupus_df.isnull().sum()

ana                                 0
fever                           66539
leukopenia                      66570
thrombocytopenia                66610
auto_immune_hemolysis           66520
delirium                        66387
psychosis                       66552
seizure                         66559
non_scarring_alopecia           66549
oral_ulcers                     66529
subacute_cutaneous              66511
discoid_lupus                   66557
acute_cutaneous_lupus           66488
pleural_effusion                66524
pericardial_effusion            66495
acute_pericarditis              66549
joint_involvement               66592
proteinuria                     49000
renal_biopsy_class              62977
anti_cardioliphin_antibodies    66459
anti_b2gp4_antibodies           66560
lupus_anti_coagulant            66478
c3                              66481
c4                              66427
anti_dsdna_antibody             66552
anti_smith_antibody             66463
label       

In [136]:
# ANA-positive rows that were nevertheless labelled negative.
# i need at least 20000 here
a = lupus_df.loc[lupus_df['ana'].eq(1) & lupus_df['label'].eq(0)]
len(a)

33261

In [137]:
# Inspect the first ANA-positive row that was labelled negative.
a.iloc[0]

ana                             1.0
fever                           1.0
leukopenia                      NaN
thrombocytopenia                NaN
auto_immune_hemolysis           NaN
delirium                        NaN
psychosis                       NaN
seizure                         NaN
non_scarring_alopecia           0.0
oral_ulcers                     1.0
subacute_cutaneous              NaN
discoid_lupus                   1.0
acute_cutaneous_lupus           0.0
pleural_effusion                NaN
pericardial_effusion            NaN
acute_pericarditis              NaN
joint_involvement               NaN
proteinuria                     NaN
renal_biopsy_class              NaN
anti_cardioliphin_antibodies    NaN
anti_b2gp4_antibodies           NaN
lupus_anti_coagulant            NaN
c3                              NaN
c4                              NaN
anti_dsdna_antibody             NaN
anti_smith_antibody             NaN
label                           0.0
Name: 0, dtype: float64

#### Testing dataset

In [138]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import label_binarize, LabelBinarizer
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import roc_curve, auc

In [139]:
# Encode missing values as -1 so the tree-based model can consume the frame.
lupus_df = lupus_df.fillna(-1)
# Select features and target by name instead of by position. With a positional
# slice, enabling the optional 'score' column would silently put a direct
# function of the label into X (target leakage).
X = lupus_df.drop(columns=['label', 'score'], errors='ignore')
y = lupus_df['label']
# Stratify on the label because the positive class is rare.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((56000, 26), (14000, 26), (56000,), (14000,))

In [140]:
from sklearn.tree import DecisionTreeClassifier, export_graphviz, export_text, plot_tree

# Fit a decision tree with default hyperparameters and predict the test split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)
#dot_data = export_graphviz(dt, out_file='../images/dt_dataset.dot', 
#                           feature_names = X_train.columns,  
#                           class_names = ['Negative', 'Positive'],
#                           filled=True)
# Displayed as (accuracy, precision, recall, f1) on the test split.
tuple(metric(y_test, y_pred_dt) for metric in (accuracy_score, precision_score, recall_score, f1_score))

(0.9963571428571428,
 0.9472049689440993,
 0.8997050147492626,
 0.9228441754916792)

In [141]:
# Per-class precision/recall/F1; print() is needed because the report is a
# pre-formatted multi-line string.
print(classification_report(y_test, y_pred_dt))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     13661
           1       0.95      0.90      0.92       339

    accuracy                           1.00     14000
   macro avg       0.97      0.95      0.96     14000
weighted avg       1.00      1.00      1.00     14000



In [None]:
# Render the fitted tree on a large canvas and save it as a PNG.
# NOTE(review): savefig writes to '../images/'; presumably that directory must
# already exist relative to the notebook's working directory — confirm.
fig = plt.figure(figsize=(25,20))
_ = plot_tree(dt, feature_names = X_train.columns,  class_names = ['Negative', 'Positive'], filled=True, fontsize=6)
plt.savefig('../images/dt_dataset.png', dpi=100)

#### Rules for classification

In [None]:
def has_fever(row):
    """Classify fever from `row.body_temp`.

    Returns True above 37.8, False for any other positive reading, and None
    when the reading is non-positive or NaN (treated as not measured).
    """
    temperature = row.body_temp
    if temperature > 37.8:
        return True
    return False if temperature > 0 else None

In [None]:
def has_leukopenia(row):
    """Classify leukopenia from `row.wbc`.

    Returns:
        True when the count is positive and below 4000, False when it is
        4000 or higher, and None when it is non-positive or NaN (treated as
        not measured, in line with `has_fever`).
    """
    wbc = row.wbc
    # Require a positive count before testing the threshold. Checking
    # `wbc < 4000` first classified non-positive placeholders (e.g. a -1
    # missing-value code) as leukopenia instead of "unknown".
    if 0 < wbc < 4000:
        return True
    if wbc >= 4000:
        return False
    return None

In [None]:
def has_thrombocytopenia(row):
    """Classify thrombocytopenia for a row; returns True, False or None.

    NOTE(review): the body is identical to `has_leukopenia` and reads
    `row.wbc` with the 4000 threshold. This looks like a copy-paste error:
    thrombocytopenia would be expected to use a platelet-count column and its
    own threshold. The platelet column name is not visible here — confirm and
    fix.
    """
    # NOTE(review): because `< 4000` is tested first, non-positive values
    # (e.g. a -1 missing-value code) return True rather than None.
    if row.wbc < 4000:
        return True
    elif row.wbc > 0:
        return False
    else:
        # Only reached when both comparisons are False, e.g. for NaN.
        return None