In [1]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
from scipy.stats import truncnorm
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../')
from modules import utils, constants

In [2]:
# Seed both Python's `random` module and NumPy's legacy global RNG with the
# project-wide seed, so the np.random.choice draws further down are
# reproducible when the notebook is run top to bottom.
SEED = constants.SEED
random.seed(SEED)
np.random.seed(SEED)

#### Initialization

In [3]:
# Empty frame that is filled column by column below; sample_num is the number
# of rows drawn for it.
lupus_df = pd.DataFrame()
sample_num = 70000

#### Entry criterion

In [4]:
# Draw the ANA entry criterion for sample_num rows:
# 1 with probability 0.60, 0 with probability 0.40.
lupus_df['ana'] = np.random.choice([1, 0], sample_num, p=[0.60, 0.40])
lupus_df.head()

Unnamed: 0,ana
0,1
1,0
2,0
3,1
4,1


In [5]:
# Sanity check: the 1/0 split should be close to the requested 60/40.
lupus_df.ana.value_counts()

1    41966
0    28034
Name: ana, dtype: int64

#### Additional criteria

In [6]:
# Columns generated for the dataset, in column order.
# Deviations from the full criteria list:
#   * the renal biopsy class is left out;
#   * subacute cutaneous, acute cutaneous and discoid lupus are merged into a
#     single feature, `cutaneous_lupus`;
#   * C3 and C4 are stored as `low_c3` / `low_c4` flags to simplify dataset
#     creation (1 means low, 0 means not low).
features = [
    # constitutional
    'fever',
    # hematologic
    'leukopenia',
    'thrombocytopenia',
    'auto_immune_hemolysis',
    # neuropsychiatric
    'delirium',
    'psychosis',
    'seizure',
    # mucocutaneous
    'non_scarring_alopecia',
    'oral_ulcers',
    'cutaneous_lupus',
    # serosal
    'pleural_effusion',
    'pericardial_effusion',
    'acute_pericarditis',
    # musculoskeletal
    'joint_involvement',
    # renal
    'proteinuria',
    # antiphospholipid antibodies
    'anti_cardioliphin_antibodies',
    'anti_β2gp1_antibodies',
    'lupus_anti_coagulant',
    # complement proteins
    'low_c3',
    'low_c4',
    # SLE-specific antibodies
    'anti_dsdna_antibody',
    'anti_smith_antibody',
]

In [7]:
# Probability that each feature is positive when it is sampled.
# The three cutaneous entries are the probabilities of the mutually exclusive
# categories of the merged `cutaneous_lupus` feature.
feature_fractions = {
    'fever': 0.61,
    'leukopenia': 0.55,
    'thrombocytopenia': 0.30,
    'auto_immune_hemolysis': 0.03,
    'delirium': 0.49,
    'psychosis': 0.12,
    'seizure': 0.11,
    'non_scarring_alopecia': 0.85,
    'oral_ulcers': 0.27,
    'subacute_cutaneous_lupus': 0.15,
    'discoid_lupus': 0.18,
    'acute_cutaneous_lupus': 0.10,
    'pleural_effusion': 0.39,
    'pericardial_effusion': 0.50,
    'acute_pericarditis': 0.01,
    'joint_involvement': 0.82,
    'proteinuria': 0.60,
    'anti_cardioliphin_antibodies': 0.29,
    'anti_β2gp1_antibodies': 0.23,
    'lupus_anti_coagulant': 0.21,
    'low_c3': 0.38,
    'low_c4': 0.39,
    'anti_dsdna_antibody': 0.84,
    'anti_smith_antibody': 0.29,
}

In [8]:
# Score contributed by each criterion when it is present.
# The complement domain is keyed by combination ("or" = exactly one of C3/C4
# low, "and" = both low) rather than by the individual low_c3 / low_c4 columns.
criteria_weights = {
    'fever': 2,
    'leukopenia': 3,
    'thrombocytopenia': 4,
    'auto_immune_hemolysis': 4,
    'delirium': 2,
    'psychosis': 3,
    'seizure': 5,
    'non_scarring_alopecia': 2,
    'oral_ulcers': 2,
    'subacute_cutaneous_lupus': 4,
    'discoid_lupus': 4,
    'acute_cutaneous_lupus': 6,
    'pleural_effusion': 5,
    'pericardial_effusion': 5,
    'acute_pericarditis': 6,
    'joint_involvement': 6,
    'proteinuria': 4,
    'anti_cardioliphin_antibodies': 2,
    'anti_β2gp1_antibodies': 2,
    'lupus_anti_coagulant': 2,
    'low_c3_or_low_c4': 3,
    'low_c3_and_low_c4': 4,
    'anti_dsdna_antibody': 6,
    'anti_smith_antibody': 6,
}

#### ANA positive dataset

In [9]:
# ANA-positive cohort: 40000 rows whose entry criterion is always 1.
ana_df = pd.DataFrame({'ana': [1] * 40000})
ana_df.head()

Unnamed: 0,ana
0,1
1,1
2,1
3,1
4,1


In [10]:
# Row count of the ANA-positive frame; the feature columns below are drawn with this length.
len(ana_df)

40000

In [11]:
def get_feat_probabilities(feature):
    """Look up the sampling probabilities of a binary feature.

    Parameters
    ----------
    feature : str
        Key into ``feature_fractions``. ``cutaneous_lupus`` has no entry of
        its own there; it is sampled by ``create_cutaneous_lupus`` instead.

    Returns
    -------
    tuple
        ``(negative_probability, positive_probability)``, where the negative
        probability is ``1 - positive_probability``.

    Raises
    ------
    KeyError
        If ``feature`` is not a key of ``feature_fractions``.
    """
    positive = feature_fractions[feature]
    negative = 1 - positive
    return negative, positive

In [12]:
def create_cutaneous_lupus():
    """Sample the merged cutaneous-lupus category for every row of ``ana_df``.

    Category codes: 0 = none, 1 = subacute cutaneous lupus,
    2 = acute cutaneous lupus, 3 = discoid lupus. The three positive
    probabilities come from ``feature_fractions``; the remainder is the
    probability of code 0.

    Returns
    -------
    numpy.ndarray
        One category code per row of ``ana_df``, drawn from NumPy's global RNG.
    """
    p_subacute, p_acute, p_discoid = (
        feature_fractions[key]
        for key in ('subacute_cutaneous_lupus', 'acute_cutaneous_lupus', 'discoid_lupus')
    )
    p_none = 1 - p_subacute - p_acute - p_discoid
    codes = [0, 1, 2, 3]
    weights = [p_none, p_subacute, p_acute, p_discoid]
    return np.random.choice(codes, len(ana_df), p=weights)

In [13]:
# Fill ana_df with one randomly drawn column per feature, in `features` order.
# Binary features are 0/1 draws using their positive fraction; the merged
# cutaneous feature gets its own categorical sampler.
for feature in features:
    if feature != 'cutaneous_lupus':
        neg_frac, pos_frac = get_feat_probabilities(feature)
        column_values = np.random.choice([0, 1], len(ana_df), p=[neg_frac, pos_frac])
    else:
        column_values = create_cutaneous_lupus()
    ana_df[feature] = column_values
ana_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,acute_pericarditis,joint_involvement,proteinuria,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody
0,1,0,0,0,0,0,0,0,1,1,...,0,1,0,1,0,0,0,0,1,0
1,1,1,0,0,0,0,0,0,1,0,...,0,1,1,0,0,0,0,0,1,0
2,1,0,0,1,0,1,0,1,1,1,...,0,1,0,0,0,0,1,1,1,0
3,1,0,1,1,0,1,0,0,1,0,...,0,1,0,1,1,0,0,1,1,1
4,1,0,0,1,0,1,0,1,1,0,...,0,1,1,0,0,0,0,1,1,0


In [14]:
# Distribution of the cutaneous categories (0 = none, 1 = subacute, 2 = acute, 3 = discoid).
ana_df.cutaneous_lupus.value_counts()

0    22758
3     7183
1     6086
2     3973
Name: cutaneous_lupus, dtype: int64

#### Labelling the dataset

In [15]:
def get_cutaneous_lupus_score(cutaneous_type):
    """Return the criterion weight for a cutaneous-lupus category code.

    Parameters
    ----------
    cutaneous_type : int
        0 = negative for any form of cutaneous lupus, 1 = subacute cutaneous
        lupus, 2 = acute cutaneous lupus, 3 = discoid lupus.

    Returns
    -------
    int
        0 for category 0, otherwise the matching entry of ``criteria_weights``.

    Raises
    ------
    ValueError
        If ``cutaneous_type`` is not one of the four known codes. The previous
        behaviour (print a message and implicitly return None) only deferred
        the failure to the caller's numeric comparison.
    """
    if cutaneous_type == 0:
        return 0
    if cutaneous_type == 1:
        weight_key = 'subacute_cutaneous_lupus'
    elif cutaneous_type == 2:
        weight_key = 'acute_cutaneous_lupus'
    elif cutaneous_type == 3:
        weight_key = 'discoid_lupus'
    else:
        raise ValueError(f'Unknown cutaneous type: {cutaneous_type!r}')
    return criteria_weights[weight_key]

In [16]:
def get_feat_score(row, feat):
    """Return the score a single feature contributes for one row.

    Parameters
    ----------
    row : mapping-like
        Indexed as ``row[feat]`` (e.g. a pandas Series for one patient).
    feat : str
        Feature/column name.

    Returns
    -------
    The cutaneous category score for ``cutaneous_lupus``; otherwise 0 when the
    feature value is non-positive, and ``criteria_weights[feat]`` when it is
    positive.
    """
    value = row[feat]
    # The merged cutaneous feature is categorical, so it has its own scorer.
    if feat == 'cutaneous_lupus':
        return get_cutaneous_lupus_score(value)
    if value <= 0:
        return 0
    return criteria_weights[feat]

In [17]:
def get_c3_c4_score(c3, c4):
    """Score the complement-proteins domain from the low-C3 / low-C4 flags.

    Parameters
    ----------
    c3, c4 : int
        Flags where 1 means the complement level is low; any other value is
        treated as not low.

    Returns
    -------
    int
        ``criteria_weights['low_c3_and_low_c4']`` when both flags are 1,
        ``criteria_weights['low_c3_or_low_c4']`` when exactly one is 1,
        otherwise 0.
    """
    c3_is_low = c3 == 1
    c4_is_low = c4 == 1
    if c3_is_low and c4_is_low:
        return criteria_weights['low_c3_and_low_c4']
    if c3_is_low or c4_is_low:
        return criteria_weights['low_c3_or_low_c4']
    return 0

In [18]:
# Features grouped by scoring domain. Within a domain only the highest-scoring
# feature counts (see get_domain_score).
domains_feat_dict = {
    'constitutional': ['fever'],
    'hematologic': [
        'leukopenia',
        'thrombocytopenia',
        'auto_immune_hemolysis',
    ],
    'neuropsychiatric': [
        'delirium',
        'psychosis',
        'seizure',
    ],
    'mucocutaneous': [
        'non_scarring_alopecia',
        'oral_ulcers',
        'cutaneous_lupus',
    ],
    'serosal': [
        'pleural_effusion',
        'pericardial_effusion',
        'acute_pericarditis',
    ],
    'musculoskeletal': ['joint_involvement'],
    'renal': ['proteinuria'],
    'antiphospholipid_antibodies': [
        'anti_cardioliphin_antibodies',
        'anti_β2gp1_antibodies',
        'lupus_anti_coagulant',
    ],
    'complement_proteins': ['low_c3', 'low_c4'],
    'sle_specific_antibodies': ['anti_dsdna_antibody', 'anti_smith_antibody'],
}

In [19]:
# Upper bound on the score each domain may contribute; get_domain_score raises
# if a computed domain score exceeds its bound.
domains_max_scores_dict = {
    'constitutional': 2,
    'hematologic': 4,
    'neuropsychiatric': 5,
    'mucocutaneous': 6,
    'serosal': 6,
    'musculoskeletal': 6,
    'renal': 4,
    'antiphospholipid_antibodies': 2,
    'complement_proteins': 4,
    'sle_specific_antibodies': 6,
}

In [20]:
def get_domain_score(row, domain):
    """Return the score of one domain for one row.

    A domain scores the maximum (not the sum) of its features' scores. The
    complement-proteins domain is scored from the low_c3 / low_c4 pair via
    ``get_c3_c4_score``; any other feature listed for that domain is scored
    like a regular feature.

    Parameters
    ----------
    row : mapping-like
        Indexed as ``row[column]`` (e.g. a pandas Series for one patient).
    domain : str
        Key of ``domains_feat_dict`` / ``domains_max_scores_dict``.

    Returns
    -------
    The highest feature score in the domain, or 0 if nothing scores.

    Raises
    ------
    KeyError
        If ``domain`` is unknown.
    ValueError
        If the computed score exceeds the domain's configured maximum, which
        indicates inconsistent weight/maximum tables. ValueError subclasses
        Exception, so handlers written for the previous bare Exception still
        catch it.
    """
    domain_features = domains_feat_dict[domain]
    domain_score = 0
    if domain == 'complement_proteins':
        domain_score = get_c3_c4_score(row['low_c3'], row['low_c4'])
        # The C3/C4 pair is already accounted for; score only what is left.
        domain_features = [f for f in domain_features if f not in ('low_c3', 'low_c4')]
    for feat in domain_features:
        # Negative values are skipped entirely (treated as "no information").
        if row[feat] >= 0:
            # get_feat_score already dispatches cutaneous_lupus to its own
            # scorer, so no special case is needed here.
            domain_score = max(domain_score, get_feat_score(row, feat))
    if domain_score > domains_max_scores_dict[domain]:
        raise ValueError('The score is too large for this domain!')
    return domain_score

In [21]:
def create_label(row, threshold=10):
    """Return the binary label for one row.

    Parameters
    ----------
    row : mapping-like
        Indexed as ``row['ana']`` and passed on to ``get_domain_score``.
    threshold : int, optional
        Minimum cumulative domain score required for a positive label.
        Defaults to 10, the value that used to be hard-coded.

    Returns
    -------
    int
        0 if the row is ANA-negative or its cumulative score stays below
        ``threshold``; 1 as soon as the running total reaches ``threshold``
        (remaining domains are then not evaluated).
    """
    if row['ana'] == 0:
        return 0
    running_total = 0
    for domain in domains_feat_dict:
        running_total += get_domain_score(row, domain)
        if running_total >= threshold:
            return 1
    # Plain fall-through: the former `for ... else` had no `break`, so its
    # `else` branch always ran once the loop finished.
    return 0

In [22]:
def compute_score(row):
    """Return the total score of one row.

    ANA-negative rows (``row['ana'] == 0``) score 0. Otherwise the result is
    the sum of ``get_domain_score`` over every domain in ``domains_feat_dict``.
    """
    ana_negative = row['ana'] == 0
    if ana_negative:
        return 0
    return sum(get_domain_score(row, domain) for domain in domains_feat_dict)

In [23]:
# Row-wise scoring and labelling of the ANA-positive cohort.
ana_df['score'] = ana_df.apply(compute_score, axis=1)
ana_df['label'] = ana_df.apply(create_label, axis=1)
ana_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,score,label
0,1,0,0,0,0,0,0,0,1,1,...,0,1,0,0,0,0,1,0,21,1
1,1,1,0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,1,0,25,1
2,1,0,0,1,0,1,0,1,1,1,...,0,0,0,0,1,1,1,0,29,1
3,1,0,1,1,0,1,0,0,1,0,...,0,1,1,0,0,1,1,1,30,1
4,1,0,0,1,0,1,0,1,1,0,...,1,0,0,0,0,1,1,0,35,1


In [25]:
# Class balance of the generated labels.
ana_df.label.value_counts()

1    39948
0       52
Name: label, dtype: int64

#### ANA negative dataset