In [16]:
# importing packages
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import train_test_split
import sys

In [17]:
# Seed both Python's `random` module and NumPy's global RNG with the same value.
# The feature columns below are drawn with np.random.choice, which uses the NumPy global RNG.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [18]:
# dataset features
# One column is generated per name, in this order. 'cutaneous_lupus' and
# 'biopsy_proven_lupus_nephritis' are multi-class columns built by dedicated functions;
# every other feature is sampled as 0/1.
features = ['fever', 'leukopenia', 'thrombocytopenia', 'auto_immune_hemolysis', 'delirium', 'psychosis', 'seizure', 
            'non_scarring_alopecia', 'oral_ulcers', 'cutaneous_lupus', 'pleural_effusion', 'pericardial_effusion', 
            'acute_pericarditis', 'joint_involvement', 'proteinuria', 'biopsy_proven_lupus_nephritis', 
            'anti_cardioliphin_antibodies', 'anti_β2gp1_antibodies', 'lupus_anti_coagulant', 'low_c3', 'low_c4', 
            'anti_dsdna_antibody', 'anti_smith_antibody']

In [19]:
#feature prevalence in positive SLE determined from existing literature
# These values are divided by feat_factor before being used as sampling probabilities
# for the ANA-positive rows. The cutaneous lupus subtypes and the renal biopsy classes are
# keyed individually here, although each group ends up in a single multi-class column.
feature_fractions = {'fever':0.36, 'leukopenia':0.50, 'thrombocytopenia':0.20, 'auto_immune_hemolysis':0.03, 
                     'delirium':0.49, 'psychosis':0.12, 'seizure':0.11, 'non_scarring_alopecia':0.85, 
                     'oral_ulcers':0.08, 'subacute_cutaneous_lupus':0.10, 'discoid_lupus':0.15, 
                     'acute_cutaneous_lupus':0.03, 'pleural_effusion': 0.17, 'pericardial_effusion':0.50, 
                     'acute_pericarditis':0.01, 'joint_involvement':0.69, 'proteinuria':0.60, 'renal_biopsy_1':0.01,
                     'renal_biopsy_2':0.03, 'renal_biopsy_3':0.04, 'renal_biopsy_4':0.07, 'renal_biopsy_5':0.02, 
                     'anti_cardioliphin_antibodies':0.17, 'anti_β2gp1_antibodies':0.10, 'lupus_anti_coagulant':0.11,
                     'low_c3':0.45, 'low_c4':0.44,'anti_dsdna_antibody':0.70, 'anti_smith_antibody':0.09}

In [20]:
# feature criteria weights for the diagnosis of SLE
# Used in two places: the scoring functions return these weights for positive findings, and the
# 'negative' sampling branches use a feature's weight as the key into weight_prob_dict.
# 'low_c3_and_low_c4' / 'low_c3_or_low_c4' are the combined complement scores read by get_c3_c4_score.
criteria_weights = {'fever':2, 'leukopenia':3, 'thrombocytopenia':4, 'auto_immune_hemolysis':4, 'delirium':2, 
                    'psychosis':3, 'seizure':5, 'non_scarring_alopecia':2, 'oral_ulcers':2, 
                    'subacute_cutaneous_lupus':4, 'discoid_lupus':4, 'acute_cutaneous_lupus':6, 
                    'pleural_effusion': 5, 'pericardial_effusion':5, 'acute_pericarditis':6, 'joint_involvement':6, 
                    'proteinuria':4, 'renal_biopsy_1':0, 'renal_biopsy_2':8, 'renal_biopsy_3':10, 
                    'renal_biopsy_4':10, 'renal_biopsy_5':8,'anti_cardioliphin_antibodies':2, 
                    'anti_β2gp1_antibodies':2, 'lupus_anti_coagulant':2, 'low_c3':3, 'low_c4':3, 
                    'low_c3_and_low_c4':4, 'low_c3_or_low_c4':3, 'anti_dsdna_antibody':6, 'anti_smith_antibody':6}

In [33]:
# dataset features per clinical domain
# get_domain_score keeps the highest feature score within a domain, and create_label adds up
# the domain scores in the iteration order of this dict.
domains_feat_dict = {'constitutional': ['fever'],
                     'hematologic': ['leukopenia', 'thrombocytopenia', 'auto_immune_hemolysis'],
                     'neuropsychiatric': ['delirium', 'psychosis', 'seizure'],
                     'mucocutaneous': ['non_scarring_alopecia', 'oral_ulcers', 'cutaneous_lupus'],
                     'serosal': ['pleural_effusion', 'pericardial_effusion', 'acute_pericarditis'],
                     'musculoskeletal': ['joint_involvement'],
                     'renal': ['proteinuria', 'biopsy_proven_lupus_nephritis'],
                     'antiphospholipid_antibodies': ['anti_cardioliphin_antibodies', 'anti_β2gp1_antibodies', 'lupus_anti_coagulant'],
                     'complement_proteins': ['low_c3', 'low_c4'],
                     'sle_specific_antibodies':['anti_dsdna_antibody', 'anti_smith_antibody']}

In [34]:
# the maximum possible diagnosis criteria score for each clinical domain
# get_domain_score raises if a computed domain score exceeds the value listed here.
domains_max_scores_dict = {'constitutional': 2, 'hematologic': 4, 'neuropsychiatric': 5, 'mucocutaneous': 6,
                           'serosal': 6, 'musculoskeletal': 6, 'renal': 10, 'antiphospholipid_antibodies': 2, 
                           'complement_proteins': 4, 'sle_specific_antibodies':6}

In [22]:
# defining a scaling factor
# Every prevalence in feature_fractions is divided by this value to obtain the
# sampling probability used for the ANA-positive rows.
feat_factor = 2

#### Functions for feature value creation

In [23]:
def get_feat_probabilities(feature, feat_type='positive'):
    '''
    Looks up the sampling probabilities of a binary (0/1) feature.

    feat_type='positive': the prevalence in feature_fractions divided by feat_factor.
    feat_type='negative': the feature's criteria weight mapped through weight_prob_dict.

    The feature must be a key of feature_fractions (positive) or criteria_weights (negative);
    the multi-class cutaneous lupus and lupus nephritis columns have their own generators.

    Returns a (probability_of_0, probability_of_1) tuple. For any other feat_type a
    message is printed and None is returned.
    '''
    if feat_type not in ('positive', 'negative'):
        print('Unknown feature type. Should either be "positive" or "negative"')
        return None
    if feat_type == 'positive':
        prob_of_one = feature_fractions[feature] / feat_factor
    else:
        # the criteria weight is the key into the ANA-negative probability table
        prob_of_one = weight_prob_dict[criteria_weights[feature]]
    return 1 - prob_of_one, prob_of_one

In [24]:
def create_cutaneous_lupus(df, feat_type='positive'):
    '''
    Samples the multi-class cutaneous lupus column, one value per row of df.

    Codes: 0 = no cutaneous lupus, 1 = subacute, 2 = acute, 3 = discoid.
    With feat_type='positive' the subtype probabilities are the feature_fractions entries
    divided by feat_factor; with feat_type='negative' they come from weight_prob_dict,
    keyed by each subtype's criteria weight. Code 0 receives the remaining probability.

    Returns the array produced by np.random.choice (length len(df)). For any other
    feat_type a message is printed and None is returned.
    '''
    # listed in code order: 1, 2, 3
    subtype_keys = ('subacute_cutaneous_lupus', 'acute_cutaneous_lupus', 'discoid_lupus')
    if feat_type == 'positive':
        subtype_probs = [feature_fractions[key] / feat_factor for key in subtype_keys]
    elif feat_type == 'negative':
        subtype_probs = [weight_prob_dict[criteria_weights[key]] for key in subtype_keys]
    else:
        print('Unknown feature type. Should either be "positive" or "negative"')
        return None
    none_prob = 1 - subtype_probs[0] - subtype_probs[1] - subtype_probs[2]
    return np.random.choice([0, 1, 2, 3], len(df), p=[none_prob] + subtype_probs)

In [25]:
def create_lupus_nephritis(df, feat_type='positive'):
    '''
    Samples the multi-class biopsy-proven lupus nephritis column, one value per row of df.

    Codes: 0 = no finding, 1-5 = renal biopsy class 1-5.
    With feat_type='positive' the class probabilities are the feature_fractions entries
    divided by feat_factor; with feat_type='negative' they come from weight_prob_dict,
    keyed by each class's criteria weight. Code 0 receives the remaining probability.

    Returns the array produced by np.random.choice (length len(df)). For any other
    feat_type a message is printed and None is returned.
    '''
    # listed in code order: 1..5
    class_keys = ('renal_biopsy_1', 'renal_biopsy_2', 'renal_biopsy_3',
                  'renal_biopsy_4', 'renal_biopsy_5')
    if feat_type == 'positive':
        class_probs = [feature_fractions[key] / feat_factor for key in class_keys]
    elif feat_type == 'negative':
        class_probs = [weight_prob_dict[criteria_weights[key]] for key in class_keys]
    else:
        print('Unknown feature type. Should either be "positive" or "negative"')
        return None
    # subtract class by class, left to right
    none_prob = 1
    for class_prob in class_probs:
        none_prob = none_prob - class_prob
    return np.random.choice([0, 1, 2, 3, 4, 5], len(df), p=[none_prob] + class_probs)

#### Functions for labelling the dataset

In [27]:
def get_cutaneous_lupus_score(cutaneous_type):
    '''
    Maps a cutaneous lupus code to its diagnosis criteria score.

    0 -> 0 (no cutaneous lupus), 1 -> subacute, 2 -> acute, 3 -> discoid; the
    non-zero codes return the matching entry of criteria_weights.
    Any other value prints a message and yields None.
    '''
    if cutaneous_type == 0:
        return 0
    # position in the tuple (starting at 1) is the code of the subtype
    subtype_keys = ('subacute_cutaneous_lupus', 'acute_cutaneous_lupus', 'discoid_lupus')
    for code, weight_key in enumerate(subtype_keys, start=1):
        if cutaneous_type == code:
            return criteria_weights[weight_key]
    print('Unknown cutaneous type')

In [28]:
def get_bpln_score(lupus_nephritis_type):
    '''
    Maps a biopsy-proven lupus nephritis code to its diagnosis criteria score.

    0 -> 0 (no finding); codes 1-5 return the criteria_weights entry of the
    corresponding renal biopsy class. Any other value prints a message and yields None.
    '''
    if lupus_nephritis_type == 0:
        return 0
    # position in the tuple (starting at 1) is the biopsy class code
    class_keys = ('renal_biopsy_1', 'renal_biopsy_2', 'renal_biopsy_3',
                  'renal_biopsy_4', 'renal_biopsy_5')
    for code, weight_key in enumerate(class_keys, start=1):
        if lupus_nephritis_type == code:
            return criteria_weights[weight_key]
    print('Unknown lupus nephritis type')

In [29]:
def get_feat_score(row, feat):
    '''
    Returns the diagnosis criteria score of one feature of a row.

    The two multi-class features are delegated to their dedicated scoring functions.
    Any other feature scores 0 when its value is <= 0 and its criteria_weights
    entry otherwise.
    '''
    value = row[feat]
    if feat == 'cutaneous_lupus':
        return get_cutaneous_lupus_score(value)
    if feat == 'biopsy_proven_lupus_nephritis':
        return get_bpln_score(value)
    return 0 if value <= 0 else criteria_weights[feat]

In [30]:
def get_c3_c4_score(c3, c4): # 1 - low, 0 is not low
    '''
    Scores the complement domain from the low C3 / low C4 flags.

    Both flags equal to 1 -> criteria_weights['low_c3_and_low_c4'];
    exactly one equal to 1 -> criteria_weights['low_c3_or_low_c4'];
    neither -> 0.
    '''
    c3_is_low = c3 == 1
    c4_is_low = c4 == 1
    if c3_is_low and c4_is_low:
        return criteria_weights['low_c3_and_low_c4']
    if c3_is_low or c4_is_low:
        return criteria_weights['low_c3_or_low_c4']
    return 0

In [31]:
def get_domain_score(row, domain):
    '''
    Calculates the diagnosis criteria score for a given clinical domain based on an existing guideline.

    The domain score is the highest score among the domain's features, not their sum.
    The 'complement_proteins' domain is scored from the low_c3 / low_c4 combination through
    get_c3_c4_score. Features whose value does not satisfy `>= 0` are skipped.

    row: mapping-like record (e.g. a DataFrame row) indexed by feature name.
    domain: a key of domains_feat_dict / domains_max_scores_dict.

    Returns the domain score.
    Raises ValueError if the score exceeds the domain's entry in domains_max_scores_dict.
    '''
    if domain == 'complement_proteins':
        domain_score = get_c3_c4_score(row['low_c3'], row['low_c4'])
        # C3/C4 are already covered by the combined score above
        remaining_features = [feat for feat in domains_feat_dict[domain]
                              if feat not in ('low_c3', 'low_c4')]
    else:
        domain_score = 0
        remaining_features = domains_feat_dict[domain]
    for feat in remaining_features:
        if row[feat] >= 0:
            # get_feat_score dispatches the multi-class features itself
            feat_score = get_feat_score(row, feat)
            if feat_score > domain_score:
                domain_score = feat_score
    if domain_score > domains_max_scores_dict[domain]:
        # ValueError subclasses Exception, so existing `except Exception` handlers still apply
        raise ValueError('The score is too large for this domain!')
    return domain_score

In [32]:
def create_label(row):
    '''
    Derives the 0/1 label of a dataset row.

    A row with ana == 0 is always labelled 0. Otherwise the domain scores are
    accumulated domain by domain, and the row is labelled 1 as soon as the running
    total reaches 10; if it never does, the label is 0.
    '''
    if row['ana'] == 0:
        return 0
    running_total = 0
    for domain in domains_feat_dict:
        running_total += get_domain_score(row, domain)
        if running_total >= 10:
            return 1
    return 0

#### Creating dataframe with positive ANA feature

In [21]:
# 50000 rows that all carry a positive ANA flag; the feature columns are added afterwards
ana_df = pd.DataFrame({'ana': [1] * 50000})
ana_df.head()

Unnamed: 0,ana
0,1
1,1
2,1
3,1
4,1


In [26]:
# Sample one column per feature for the ANA-positive rows, in the order given by `features`
# (the order matters because every draw consumes the global NumPy RNG).
for feature in features:
    if feature == 'biopsy_proven_lupus_nephritis': #biopsy proven lupus nephritis
        sampled_values = create_lupus_nephritis(ana_df)
    elif feature == 'cutaneous_lupus':
        sampled_values = create_cutaneous_lupus(ana_df)
    else:
        neg_frac, pos_frac = get_feat_probabilities(feature)
        sampled_values = np.random.choice([0, 1], len(ana_df), p=[neg_frac, pos_frac])
    ana_df[feature] = sampled_values
ana_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,joint_involvement,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody
0,1,0,1,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,1,0,...,1,1,0,0,1,0,0,0,1,0
4,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,1,1,0


In [35]:
# label every ANA-positive row with create_label (applied row-wise)
ana_df['label'] = ana_df.apply(create_label, axis=1)
ana_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,1,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,1,0,0,0,0,0,0,0,1,0,...,1,0,0,1,0,0,0,1,0,1
4,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,1,1,0,1


#### Creating dataframe with negative ANA feature

In [36]:
# Probability of a positive finding in the ANA-negative rows, keyed by the feature's criteria weight.
# The 'negative' branches of the generator functions look it up as weight_prob_dict[criteria_weights[...]].
# In this table a higher criteria weight maps to a lower probability.
weight_prob_dict = {10:0.005, 8:0.01, 6:0.02, 5:0.025, 4:0.03, 3:0.035, 2:0.04, 0:0.05}

In [37]:
# 20000 rows that all carry a negative ANA flag; the feature columns are added afterwards
no_ana_df = pd.DataFrame({'ana': [0] * 20000})
no_ana_df.head()

Unnamed: 0,ana
0,0
1,0
2,0
3,0
4,0


In [38]:
# Sample one column per feature for the ANA-negative rows, in the order given by `features`,
# using the 'negative' probabilities (criteria weight -> weight_prob_dict).
for feature in features:
    if feature == 'biopsy_proven_lupus_nephritis':
        sampled_values = create_lupus_nephritis(no_ana_df, feat_type='negative')
    elif feature == 'cutaneous_lupus':
        sampled_values = create_cutaneous_lupus(no_ana_df, feat_type='negative')
    else:
        neg_frac, pos_frac = get_feat_probabilities(feature, feat_type='negative')
        sampled_values = np.random.choice([0, 1], len(no_ana_df), p=[neg_frac, pos_frac])
    no_ana_df[feature] = sampled_values
no_ana_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,joint_involvement,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [39]:
# label every ANA-negative row with create_label (it returns 0 whenever ana == 0)
no_ana_df['label'] = no_ana_df.apply(create_label, axis=1)
no_ana_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


#### Merging the two dataframes

In [41]:
# stack both cohorts, shuffle the rows reproducibly and renumber the index
lupus_df = (
    pd.concat([ana_df, no_ana_df])
    .sample(frac=1, random_state=SEED)
    .reset_index(drop=True)
)
lupus_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [42]:
# lupus_df.to_csv('../../data/full_lupus_dataset.csv', index=False)

In [None]:
#### Splitting the dataset