In [1]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
from scipy.stats import truncnorm
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../')
from modules import utils, constants

In [2]:
# Fixed seed so every random sampling step in this notebook is reproducible.
# NOTE(review): the commented-out line below suggests a shared seed may live in
# modules.constants -- confirm whether it should replace the literal 42.
# SEED = constants.SEED
SEED = 42
random.seed(SEED)  # seeds Python's stdlib RNG
np.random.seed(SEED)  # seeds NumPy's global RNG, which np.random.choice draws from below

#### Additional criteria

In [3]:
# Columns generated for every cohort, in dataset order (the 'ana' column is added separately).
# - The renal biopsy classes are not separate columns: 'biopsy_proven_lupus_nephritis' holds a
#   single class code (0 = none, 1-5 = renal biopsy class).
# - Subacute, acute and discoid cutaneous lupus are merged into one 'cutaneous_lupus' column
#   (0 = none, 1 = subacute, 2 = acute, 3 = discoid).
# - C3 and C4 are stored as 'low_c3' and 'low_c4' for easier dataset creation:
#   1 means low, 0 means not low.
features = ['fever', 'leukopenia', 'thrombocytopenia', 'auto_immune_hemolysis', 'delirium', 'psychosis', 'seizure', 
            'non_scarring_alopecia', 'oral_ulcers', 'cutaneous_lupus', 'pleural_effusion', 'pericardial_effusion', 
            'acute_pericarditis', 'joint_involvement', 'proteinuria', 'biopsy_proven_lupus_nephritis', 
            'anti_cardioliphin_antibodies', 'anti_β2gp1_antibodies', 'lupus_anti_coagulant', 'low_c3', 'low_c4', 
            'anti_dsdna_antibody', 'anti_smith_antibody']

In [4]:
# Base fraction for each individual criterion, used when sampling the ANA-positive cohort
# (feat_type='positive'). Every value is divided by feat_frac before it is used as a
# sampling probability. Cutaneous sub-types and renal biopsy classes are keyed individually
# here even though each group is emitted as a single multi-class column.
# NOTE(review): the source of these figures is not recorded in this notebook -- TODO cite.
feature_fractions = {'fever':0.36, 'leukopenia':0.50, 'thrombocytopenia':0.20, 'auto_immune_hemolysis':0.03, 
                     'delirium':0.49, 'psychosis':0.12, 'seizure':0.11, 'non_scarring_alopecia':0.85, 
                     'oral_ulcers':0.08, 'subacute_cutaneous_lupus':0.10, 'discoid_lupus':0.15, 
                     'acute_cutaneous_lupus':0.03, 'pleural_effusion': 0.17, 'pericardial_effusion':0.50, 
                     'acute_pericarditis':0.01, 'joint_involvement':0.69, 'proteinuria':0.60, 'renal_biopsy_1':0.01,
                     'renal_biopsy_2':0.03, 'renal_biopsy_3':0.04, 'renal_biopsy_4':0.07, 'renal_biopsy_5':0.02, 
                     'anti_cardioliphin_antibodies':0.17, 'anti_β2gp1_antibodies':0.10, 'lupus_anti_coagulant':0.11,
                     'low_c3':0.45, 'low_c4':0.44,'anti_dsdna_antibody':0.70, 'anti_smith_antibody':0.09}

In [5]:
# Points awarded for each criterion when a row is scored.
# - 'low_c3_and_low_c4' / 'low_c3_or_low_c4' are the combined complement scores returned by
#   get_c3_c4_score; the individual 'low_c3' / 'low_c4' weights are only used to look up
#   sampling probabilities for the ANA-negative cohort.
# - For the ANA-negative cohort, a criterion's weight is also the key into weight_prob_dict.
# NOTE(review): the values appear to follow the 2019 EULAR/ACR SLE classification criteria --
# confirm against the published table.
criteria_weights = {'fever':2, 'leukopenia':3, 'thrombocytopenia':4, 'auto_immune_hemolysis':4, 'delirium':2, 
                    'psychosis':3, 'seizure':5, 'non_scarring_alopecia':2, 'oral_ulcers':2, 
                    'subacute_cutaneous_lupus':4, 'discoid_lupus':4, 'acute_cutaneous_lupus':6, 
                    'pleural_effusion': 5, 'pericardial_effusion':5, 'acute_pericarditis':6, 'joint_involvement':6, 
                    'proteinuria':4, 'renal_biopsy_1':0, 'renal_biopsy_2':8, 'renal_biopsy_3':10, 
                    'renal_biopsy_4':10, 'renal_biopsy_5':8,'anti_cardioliphin_antibodies':2, 
                    'anti_β2gp1_antibodies':2, 'lupus_anti_coagulant':2, 'low_c3':3, 'low_c4':3, 
                    'low_c3_and_low_c4':4, 'low_c3_or_low_c4':3, 'anti_dsdna_antibody':6, 'anti_smith_antibody':6}

In [6]:
len(feature_fractions), len(criteria_weights)

(29, 31)

#### ANA positive dataset

In [7]:
# ANA-positive cohort: start from a frame whose only column is the constant flag ana = 1.
ana_df = pd.DataFrame({'ana': [1] * 50000})
ana_df.head()

Unnamed: 0,ana
0,1
1,1
2,1
3,1
4,1


In [8]:
len(ana_df)

50000

In [9]:
# Divisor applied to every feature_fractions value when sampling the ANA-positive cohort
# (with 2, each base fraction is halved).
# NOTE(review): the rationale for this scaling is not stated in the notebook -- TODO document.
feat_frac = 2

In [10]:
def get_feat_probabilities(feature, feat_type='positive'):
    '''
    Returns the (negative, positive) sampling probabilities for a binary feature.

    feature: key present in both feature_fractions and criteria_weights. This covers
        every binary column in `features`, including low_c3 and low_c4. The multi-class
        columns (cutaneous_lupus, biopsy_proven_lupus_nephritis) have no entry under
        those names and are handled by create_cutaneous_lupus / create_lupus_nephritis.
    feat_type: 'positive' -> feature_fractions[feature] / feat_frac
               'negative' -> weight_prob_dict[criteria_weights[feature]]

    weight_prob_dict is looked up at call time, so it must be defined before the
    first call with feat_type='negative'.

    Raises ValueError for any other feat_type. Previously this printed a message and
    returned None, which only failed later when the caller unpacked the result.
    '''
    if feat_type == 'positive':
        pos_prob = feature_fractions[feature] / feat_frac
    elif feat_type == 'negative':
        pos_prob = weight_prob_dict[criteria_weights[feature]]
    else:
        raise ValueError(
            'Unknown feature type. Should either be "positive" or "negative" '
            f'(got {feat_type!r})')
    return 1 - pos_prob, pos_prob

In [11]:
def create_cutaneous_lupus(df, feat_type='positive'):
    '''
    Samples the multi-class cutaneous_lupus column for every row of df.

    Returns an array of len(df) codes:
        0 = none, 1 = subacute, 2 = acute, 3 = discoid cutaneous lupus.

    feat_type: 'positive' -> probabilities are feature_fractions[...] / feat_frac
               'negative' -> probabilities are weight_prob_dict[criteria_weights[...]]

    Raises ValueError for any other feat_type. Previously this printed a message and
    returned None, which the caller would silently assign as the whole column.
    '''
    # Order matters: position i in this list is emitted as code i + 1.
    subtype_keys = ['subacute_cutaneous_lupus', 'acute_cutaneous_lupus', 'discoid_lupus']
    if feat_type == 'positive':
        subtype_probs = [feature_fractions[key] / feat_frac for key in subtype_keys]
    elif feat_type == 'negative':
        subtype_probs = [weight_prob_dict[criteria_weights[key]] for key in subtype_keys]
    else:
        raise ValueError(
            'Unknown feature type. Should either be "positive" or "negative" '
            f'(got {feat_type!r})')
    # Subtract one sub-type at a time so the "none" probability is computed in the
    # same floating-point order as before and seeded runs stay reproducible.
    neg_prob = 1
    for prob in subtype_probs:
        neg_prob -= prob
    return np.random.choice([0, 1, 2, 3], len(df), p=[neg_prob] + subtype_probs)

In [12]:
def create_lupus_nephritis(df, feat_type='positive'):
    '''
    Samples the multi-class biopsy_proven_lupus_nephritis column for every row of df.

    Returns an array of len(df) codes: 0 = none, 1-5 = renal biopsy class 1-5.

    feat_type: 'positive' -> probabilities are feature_fractions[...] / feat_frac
               'negative' -> probabilities are weight_prob_dict[criteria_weights[...]]

    Raises ValueError for any other feat_type. Previously this printed a message and
    returned None, which the caller would silently assign as the whole column.
    '''
    # Position i in this list is emitted as class code i + 1.
    class_keys = [f'renal_biopsy_{renal_class}' for renal_class in range(1, 6)]
    if feat_type == 'positive':
        class_probs = [feature_fractions[key] / feat_frac for key in class_keys]
    elif feat_type == 'negative':
        class_probs = [weight_prob_dict[criteria_weights[key]] for key in class_keys]
    else:
        raise ValueError(
            'Unknown feature type. Should either be "positive" or "negative" '
            f'(got {feat_type!r})')
    # Subtract one class at a time so the "none" probability is computed in the
    # same floating-point order as before and seeded runs stay reproducible.
    neg_prob = 1
    for prob in class_probs:
        neg_prob -= prob
    return np.random.choice([0, 1, 2, 3, 4, 5], len(df), p=[neg_prob] + class_probs)

In [13]:
# Fill one column per feature for the ANA-positive cohort. The two multi-class columns
# have dedicated samplers; every other column is an independent 0/1 draw.
multi_class_samplers = {
    'cutaneous_lupus': create_cutaneous_lupus,
    'biopsy_proven_lupus_nephritis': create_lupus_nephritis,
}
for feature in features:
    sampler = multi_class_samplers.get(feature)
    if sampler is not None:
        ana_df[feature] = sampler(ana_df)
        continue
    neg_frac, pos_frac = get_feat_probabilities(feature)
    ana_df[feature] = np.random.choice([0, 1], len(ana_df), p=[neg_frac, pos_frac])
ana_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,joint_involvement,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody
0,1,0,1,0,0,0,0,0,0,0,...,1,1,0,0,0,0,0,0,0,0
1,1,1,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
3,1,0,0,0,0,0,0,0,1,0,...,1,1,0,0,1,0,0,0,1,0
4,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,1,1,1,0


In [14]:
ana_df.cutaneous_lupus.value_counts()

0    42959
3     3778
1     2529
2      734
Name: cutaneous_lupus, dtype: int64

In [15]:
ana_df.biopsy_proven_lupus_nephritis.value_counts()

0    45736
4     1746
3      994
2      766
5      493
1      265
Name: biopsy_proven_lupus_nephritis, dtype: int64

#### Labelling the dataset

In [16]:
def get_bpln_score(lupus_nephritis_type):
    '''
    Returns the score for a biopsy-proven lupus nephritis class code.

    lupus_nephritis_type: 0 for no lupus nephritis (scores 0), or 1-5 for the renal
        biopsy class, scored as criteria_weights['renal_biopsy_<class>'].

    Raises ValueError for any other code. Previously this printed a message and
    implicitly returned None, which only failed later when the score was compared.
    '''
    if lupus_nephritis_type == 0:
        return 0
    if lupus_nephritis_type in (1, 2, 3, 4, 5):
        return criteria_weights[f'renal_biopsy_{int(lupus_nephritis_type)}']
    raise ValueError(f'Unknown lupus nephritis type: {lupus_nephritis_type!r}')

In [17]:
def get_cutaneous_lupus_score(cutaneous_type):
    '''
    Returns the score for a cutaneous lupus code.

    cutaneous_type: 0 = negative for any form of cutaneous lupus (scores 0),
        1 = subacute, 2 = acute, 3 = discoid; each scored with its criteria_weights entry.

    Raises ValueError for any other code. Previously this printed a message and
    implicitly returned None, which only failed later when the score was compared.
    '''
    if cutaneous_type == 0:
        return 0
    # Index i holds the criteria_weights key for code i + 1.
    subtype_keys = ('subacute_cutaneous_lupus', 'acute_cutaneous_lupus', 'discoid_lupus')
    if cutaneous_type in (1, 2, 3):
        return criteria_weights[subtype_keys[int(cutaneous_type) - 1]]
    raise ValueError(f'Unknown cutaneous type: {cutaneous_type!r}')

In [18]:
def get_feat_score(row, feat):
    '''
    Returns the score that a single feature contributes for one row.

    The multi-class columns are delegated to their own scorers. Any other feature
    scores 0 when its value is <= 0 and criteria_weights[feat] otherwise.
    '''
    value = row[feat]
    if feat == 'cutaneous_lupus':
        return get_cutaneous_lupus_score(value)
    if feat == 'biopsy_proven_lupus_nephritis':
        return get_bpln_score(value)
    return 0 if value <= 0 else criteria_weights[feat]

In [19]:
def get_c3_c4_score(c3, c4):
    '''
    Returns the complement-protein domain score from the low_c3 / low_c4 flags.

    c3, c4: 1 means low, 0 means not low. (The previous inline comment stated the
        opposite, which contradicted both this code and the dataset encoding.)

    Both low -> criteria_weights['low_c3_and_low_c4'];
    exactly one low -> criteria_weights['low_c3_or_low_c4'];
    neither low -> 0.
    '''
    c3_is_low = c3 == 1
    c4_is_low = c4 == 1
    # Plain boolean operators: the arguments are scalars, not arrays.
    if c3_is_low and c4_is_low:
        return criteria_weights['low_c3_and_low_c4']
    if c3_is_low or c4_is_low:
        return criteria_weights['low_c3_or_low_c4']
    return 0

In [20]:
# Groups the dataset columns into scoring domains. get_domain_score keeps only the single
# highest-scoring feature of each domain, and create_label / compute_score add the domain
# scores together. Iteration order of this dict is the order in which domains are scored.
domains_feat_dict = {'constitutional': ['fever'],
                     'hematologic': ['leukopenia', 'thrombocytopenia', 'auto_immune_hemolysis'],
                     'neuropsychiatric': ['delirium', 'psychosis', 'seizure'],
                     'mucocutaneous': ['non_scarring_alopecia', 'oral_ulcers', 'cutaneous_lupus'],
                     'serosal': ['pleural_effusion', 'pericardial_effusion', 'acute_pericarditis'],
                     'musculoskeletal': ['joint_involvement'],
                     'renal': ['proteinuria', 'biopsy_proven_lupus_nephritis'],
                     'antiphospholipid_antibodies': ['anti_cardioliphin_antibodies', 'anti_β2gp1_antibodies', 'lupus_anti_coagulant'],
                     'complement_proteins': ['low_c3', 'low_c4'],
                     'sle_specific_antibodies':['anti_dsdna_antibody', 'anti_smith_antibody']}

In [21]:
# Upper bound on each domain's score, equal to the largest criteria_weights value among the
# domain's criteria. get_domain_score raises if a computed score exceeds its bound, so this
# acts as a sanity check on the scoring tables.
domains_max_scores_dict = {'constitutional': 2, 'hematologic': 4, 'neuropsychiatric': 5, 'mucocutaneous': 6,
                           'serosal': 6, 'musculoskeletal': 6, 'renal': 10, 'antiphospholipid_antibodies': 2, 
                           'complement_proteins': 4, 'sle_specific_antibodies':6}

In [22]:
def get_domain_score(row, domain):
    '''
    Returns the score of one domain for one row: the highest score among the
    domain's features (features within a domain are not added together).

    row: a pandas Series (or similar) supporting row[feat] and, for the complement
        domain, attribute access to low_c3 / low_c4.
    domain: a key of domains_feat_dict.

    Features whose value is negative are skipped. The complement_proteins domain is
    scored jointly from low_c3 and low_c4 through get_c3_c4_score.

    Raises ValueError (a subclass of the Exception raised previously) when the score
    exceeds the domain's entry in domains_max_scores_dict.
    '''
    domain_features = domains_feat_dict[domain]
    domain_score = 0
    if domain == 'complement_proteins':
        domain_score = get_c3_c4_score(row.low_c3, row.low_c4)
        # low_c3 / low_c4 are already accounted for above; do not score them individually.
        domain_features = [feat for feat in domain_features if feat not in ('low_c3', 'low_c4')]
    for feat in domain_features:
        if row[feat] >= 0:
            # get_feat_score already dispatches the multi-class columns
            # (cutaneous_lupus, biopsy_proven_lupus_nephritis) to their own scorers.
            feat_score = get_feat_score(row, feat)
            if feat_score > domain_score:
                domain_score = feat_score
    max_score = domains_max_scores_dict[domain]
    if domain_score > max_score:
        raise ValueError(
            f'The score is too large for this domain! ({domain}: {domain_score} > {max_score})')
    return domain_score

In [23]:
def create_label(row, threshold=10):
    '''
    Returns the binary label for one row.

    A row with ana == 0 is always labelled 0. Otherwise the domain scores are added
    in domains_feat_dict order and the row is labelled 1 as soon as the running total
    reaches `threshold`; remaining domains are then not evaluated. If the total never
    reaches the threshold the label is 0.

    threshold: minimum total score for a positive label (default 10, the value that
        was previously hard-coded).
    '''
    if row['ana'] == 0:
        return 0
    total_row_score = 0
    for domain in domains_feat_dict:
        total_row_score += get_domain_score(row, domain)
        if total_row_score >= threshold:
            return 1
    # Plain fall-through replaces the former for/else, which had no break and so
    # always ran its else branch when the loop finished.
    return 0

In [24]:
def compute_score(row):
    '''
    Returns the total score of one row: 0 when ana == 0 (negative), otherwise the
    sum of the scores of every domain in domains_feat_dict.
    '''
    if row['ana'] == 0:
        return 0
    return sum(get_domain_score(row, domain) for domain in domains_feat_dict)

In [25]:
# Label every ANA-positive row. The total score is not stored; to inspect it, use
# ana_df.apply(compute_score, axis=1).
ana_df['label'] = ana_df.apply(create_label, axis=1)
ana_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,1,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
1,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
3,1,0,0,0,0,0,0,0,1,0,...,1,0,0,1,0,0,0,1,0,1
4,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,1,1,0,1


In [26]:
ana_df.label.value_counts()

1    34944
0    15056
Name: label, dtype: int64

In [27]:
ana_df[(ana_df.low_c3==1) & (ana_df.low_c4==0)].iloc[0]

ana                              1
fever                            0
leukopenia                       0
thrombocytopenia                 0
auto_immune_hemolysis            0
delirium                         0
psychosis                        0
seizure                          0
non_scarring_alopecia            0
oral_ulcers                      0
cutaneous_lupus                  1
pleural_effusion                 0
pericardial_effusion             0
acute_pericarditis               0
joint_involvement                0
proteinuria                      1
biopsy_proven_lupus_nephritis    0
anti_cardioliphin_antibodies     0
anti_β2gp1_antibodies            1
lupus_anti_coagulant             0
low_c3                           1
low_c4                           0
anti_dsdna_antibody              1
anti_smith_antibody              0
label                            1
Name: 14, dtype: int64

#### ANA negative dataset

In [28]:
# Maps a criterion's weight to the probability that the criterion is present in an
# ANA-negative row: the higher the weight, the lower the probability (10 -> 0.005, 0 -> 0.05).
# It is read at call time by get_feat_probabilities, create_cutaneous_lupus and
# create_lupus_nephritis when feat_type='negative', so this cell must run before the
# ANA-negative cohort is generated even though those functions are defined earlier.
weight_prob_dict = {10:0.005, 8:0.01, 6:0.02, 5:0.025, 4:0.03, 3:0.035, 2:0.04, 0:0.05}

In [29]:
# ANA-negative cohort: start from a frame whose only column is the constant flag ana = 0.
no_ana_df = pd.DataFrame({'ana': [0] * 20000})
no_ana_df.head()

Unnamed: 0,ana
0,0
1,0
2,0
3,0
4,0


In [30]:
# Fill one column per feature for the ANA-negative cohort, using the weight-based
# probabilities (feat_type='negative'). The two multi-class columns have dedicated
# samplers; every other column is an independent 0/1 draw.
negative_multi_class_samplers = {
    'cutaneous_lupus': create_cutaneous_lupus,
    'biopsy_proven_lupus_nephritis': create_lupus_nephritis,
}
for feature in features:
    sampler = negative_multi_class_samplers.get(feature)
    if sampler is not None:
        no_ana_df[feature] = sampler(no_ana_df, feat_type='negative')
        continue
    neg_frac, pos_frac = get_feat_probabilities(feature, feat_type='negative')
    no_ana_df[feature] = np.random.choice([0, 1], len(no_ana_df), p=[neg_frac, pos_frac])
no_ana_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,joint_involvement,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
# Label every ANA-negative row (create_label returns 0 whenever ana == 0). The total
# score is not stored; to inspect it, use no_ana_df.apply(compute_score, axis=1).
no_ana_df['label'] = no_ana_df.apply(create_label, axis=1)
no_ana_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [32]:
no_ana_df.cutaneous_lupus.value_counts()

0    18411
1      614
3      598
2      377
Name: cutaneous_lupus, dtype: int64

In [33]:
no_ana_df.biopsy_proven_lupus_nephritis.value_counts()

0    18412
1     1041
2      188
5      181
3       95
4       83
Name: biopsy_proven_lupus_nephritis, dtype: int64

In [34]:
no_ana_df.label.value_counts()

0    20000
Name: label, dtype: int64

#### Merge the two dataframes

In [35]:
lupus_df = pd.concat([ana_df, no_ana_df])
len(lupus_df)

70000

In [36]:
lupus_df = lupus_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
lupus_df.label.value_counts()

0    35056
1    34944
Name: label, dtype: int64

In [37]:
lupus_df.to_csv('../../new_lupus/new_data/full_lupus_dataset.csv', index=False)

In [38]:
lupus_df.columns

Index(['ana', 'fever', 'leukopenia', 'thrombocytopenia',
       'auto_immune_hemolysis', 'delirium', 'psychosis', 'seizure',
       'non_scarring_alopecia', 'oral_ulcers', 'cutaneous_lupus',
       'pleural_effusion', 'pericardial_effusion', 'acute_pericarditis',
       'joint_involvement', 'proteinuria', 'biopsy_proven_lupus_nephritis',
       'anti_cardioliphin_antibodies', 'anti_β2gp1_antibodies',
       'lupus_anti_coagulant', 'low_c3', 'low_c4', 'anti_dsdna_antibody',
       'anti_smith_antibody', 'label'],
      dtype='object')

In [39]:
lupus_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


#### Splitting the dataset

In [40]:
# The label is the last column; everything before it is a model input.
X = lupus_df.iloc[:, :-1]
y = lupus_df.iloc[:, -1]

# First hold out 20% of all rows as the test set, then take 10% of the remaining rows as
# the validation set. Both splits are stratified on the label and use the same seed.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=SEED)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.10, stratify=y_train_val, random_state=SEED)

In [41]:
train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
val_df = pd.concat([X_val, y_val], axis=1).reset_index(drop=True)
test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
train_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
1,1,0,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,1
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,1,0,0,1,0,...,1,3,0,0,0,0,0,0,0,1


In [42]:
len(train_df), len(val_df), len(test_df)

(50400, 5600, 14000)

In [43]:
# Persist the three splits as CSV files without the index column. The paths are relative
# to the notebook's working directory.
# NOTE(review): the train file is suffixed '_basic' while the validation and test files are
# suffixed '_constant' -- confirm the naming is intentional.
train_df.to_csv('../../new_lupus/new_data/train_set_basic.csv', index=False)
val_df.to_csv('../../new_lupus/new_data/val_set_constant.csv', index=False)
test_df.to_csv('../../new_lupus/new_data/test_set_constant.csv', index=False)

In [44]:
0.1

0.1