In [78]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
from scipy.stats import truncnorm
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../')
from modules import utils, constants

In [79]:
# Seed Python's `random` module and NumPy's legacy global RNG with the SEED
# taken from modules.constants, so the np.random.* draws in the cells below
# are repeatable when the notebook is run top to bottom.
SEED = constants.SEED
random.seed(SEED)
np.random.seed(SEED)

#### Demographics

In [80]:
# Start from an empty frame; every column is filled in by the cells below.
lupus_df = pd.DataFrame()
# Number of rows drawn for each generated column.
sample_num = 70000

#### Entry criterion

In [81]:
# ANA is drawn as a fair coin (1 or 0, each with probability 0.5).
# create_label and compute_score return 0 outright for rows with ana == 0.
lupus_df['ana'] = np.random.choice([1, 0], sample_num, p=[0.5, 0.5])
lupus_df.head()

Unnamed: 0,ana
0,1
1,0
2,0
3,0
4,1


In [82]:
lupus_df.ana.value_counts()

0    35044
1    34956
Name: ana, dtype: int64

#### Additional criteria

In [83]:
# Columns produced by the generation loop further down. 'renal_biopsy_class',
# 'c3' and 'c4' get their own distributions there; every other name is drawn
# as a 0/1 column through create_feat.
features = ['fever', 'leukopenia', 'thrombocytopenia', 'auto_immune_hemolysis', 'delirium', 'psychosis', 'seizure', 
            'non_scarring_alopecia', 'oral_ulcers', 'subacute_cutaneous', 'discoid_lupus', 'acute_cutaneous_lupus', 
            'pleural_effusion', 'pericardial_effusion', 'acute_pericarditis', 'joint_involvement', 'proteinuria', 
            'renal_biopsy_class', 'anti_cardioliphin_antibodies', 'anti_b2gp4_antibodies', 'lupus_anti_coagulant', 
            'c3', 'c4', 'anti_dsdna_antibody', 'anti_smith_antibody']

In [84]:
# Points awarded when a criterion is positive. Most keys are column names from
# `features`. The 'renal_biopsy_<class>' and 'low_c3_*' keys are not columns:
# the low_c3_* entries are read by get_c3_c4_score, while
# get_renal_biopsy_score hard-codes 8/10 instead of reading the
# renal_biopsy_* entries.
criteria_weights = {'fever':2, 'leukopenia':3, 'thrombocytopenia':4, 'auto_immune_hemolysis':4, 'delirium':2, 
                    'psychosis':3, 'seizure':5, 'non_scarring_alopecia':2, 'oral_ulcers':2, 'subacute_cutaneous':4, 
                    'discoid_lupus':4, 'acute_cutaneous_lupus':6, 'pleural_effusion': 5, 'pericardial_effusion':5, 
                    'acute_pericarditis':6, 'joint_involvement':6, 'proteinuria':4, 'renal_biopsy_2':8, 
                    'renal_biopsy_5': 8, 'renal_biopsy_3': 10, 'renal_biopsy_4': 10, 
                    'anti_cardioliphin_antibodies':2, 'anti_b2gp4_antibodies':2, 'lupus_anti_coagulant':2, 
                    'low_c3_or_low_c4':3, 'low_c3_and_low_c4':4, 'anti_dsdna_antibody':6, 'anti_smith_antibody':6}

In [85]:
# Maps a criterion weight to the probability that the criterion is positive
# (used by get_feat_probabilities); the heavier the weight, the rarer the positive.
weight_prob_dict = {10:0.01, 8:0.02, 6:0.04, 5:0.06, 4:0.08, 3:0.10, 2:0.12}

In [86]:
def get_truncated_normal(mean=0, sd=1, low=0, upp=10, size=100):
    """Draw `size` samples from a normal(mean, sd) truncated to [low, upp].

    Returns a NumPy array of length `size`.
    """
    # truncnorm expects its clipping bounds in units of `scale`, relative to `loc`.
    lower_bound = (low - mean) / sd
    upper_bound = (upp - mean) / sd
    distribution = truncnorm(lower_bound, upper_bound, loc=mean, scale=sd)
    return distribution.rvs(size=size)

In [87]:
def create_proteinuria(size, max_val, skewness):
    """Draw skew-normal samples and rescale them linearly onto [0, max_val].

    Args:
        size: number of samples to draw.
        max_val: upper end of the output range (also passed to skewnorm as `loc`).
        skewness: skew-normal shape parameter `a`.

    Returns:
        NumPy array of length `size` whose smallest element is 0 and whose
        largest element is `max_val`. When all draws are identical (for
        example size == 1) there is no range to rescale, so zeros are
        returned instead of the NaNs a 0/0 division would produce.
    """
    from scipy.stats import skewnorm  # not imported at file level

    samples = skewnorm.rvs(a=skewness, loc=max_val, size=size)
    if samples.size == 0:
        return samples
    # ndarray.min()/max() are vectorised; the built-in min()/max() would walk
    # the array element by element in Python.
    shifted = samples - samples.min()      # smallest value becomes 0
    span = shifted.max()
    if span == 0:
        return np.zeros_like(shifted)
    return shifted / span * max_val        # scale to [0, 1], then to [0, max_val]

In [88]:
def get_feat_probabilities(feature):
    '''
    Look up (P(positive), P(negative)) for a binary feature.

    The feature's weight in `criteria_weights` selects its positive rate from
    `weight_prob_dict`. Not usable for renal class, C3 and C4, which have no
    entry of their own in `criteria_weights`.
    '''
    positive_rate = weight_prob_dict[criteria_weights[feature]]
    negative_rate = 1 - positive_rate
    return positive_rate, negative_rate

In [89]:
def create_feat(feat_name, size):
    """Sample `size` 0/1 values for `feat_name`.

    The positive rate comes from get_feat_probabilities; values are drawn from
    NumPy's global RNG.
    """
    p_positive, p_negative = get_feat_probabilities(feat_name)
    outcomes = [0, 1]
    outcome_probs = [p_negative, p_positive]
    return np.random.choice(outcomes, size, p=outcome_probs)

In [90]:
# Fill one column per feature. The draws consume NumPy's global RNG in list
# order, so reordering `features` changes the generated data.
for feature in features:
    if feature == 'renal_biopsy_class':
        # Classes 0-5; only classes 2-5 earn points (see get_renal_biopsy_score).
        lupus_df[feature] = np.random.choice([0, 1, 2, 3, 4, 5], sample_num, p=[0.44, 0.44, 0.05, 0.01, 0.01, 0.05])
    elif feature in ['c3', 'c4']:
        # 0 = low (p=0.4), 1 = not low (p=0.6), the encoding get_c3_c4_score expects.
        lupus_df[feature] = np.random.choice([0, 1], sample_num, p = [0.4, 0.6])
    else:
        # Plain 0/1 criterion whose positive rate depends on its weight.
        lupus_df[feature] = create_feat(feature, sample_num)
lupus_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,joint_involvement,proteinuria,renal_biopsy_class,anti_cardioliphin_antibodies,anti_b2gp4_antibodies,lupus_anti_coagulant,c3,c4,anti_dsdna_antibody,anti_smith_antibody
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
4,1,0,0,0,0,0,0,0,1,1,...,0,0,0,0,1,0,0,1,0,0


In [91]:
#lupus_df['fever'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['leukopenia'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['thrombocytopenia'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['auto_immune_hemolysis'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['delirium'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['psychosis'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['seizure'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['non_scarring_alopecia'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['oral_ulcers'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['subacute_cutaneous'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['discoid_lupus'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['acute_cutaneous_lupus'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['pleural_effusion'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['pericardial_effusion'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['acute_pericarditis'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['joint_involvement'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['proteinuria'] = get_truncated_normal(0.05, 1, 0, 3, sample_num)
#lupus_df['proteinuria'] = create_proteinuria(sample_num, 3, 100)
#lupus_df['proteinuria'] = lupus_df['proteinuria'].sample(frac=0.3)
#lupus_df['renal_biopsy_class'] = np.random.choice([0, 1, 2, 3, 4, 5, np.nan], sample_num, p=[renal_neg_frac, renal_neg_frac, renal_pos_frac, renal_pos_frac, renal_pos_frac, renal_pos_frac, renal_nan_frac])
#lupus_df['anti_cardioliphin_antibodies'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['anti_b2gp4_antibodies'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['lupus_anti_coagulant'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['c3'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['c4'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['anti_dsdna_antibody'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df['anti_smith_antibody'] = np.random.choice([0, 1, np.nan], sample_num, p=[neg_frac, pos_frac, nan_frac])
#lupus_df.head()

In [92]:
lupus_df.isna().sum()

ana                             0
fever                           0
leukopenia                      0
thrombocytopenia                0
auto_immune_hemolysis           0
delirium                        0
psychosis                       0
seizure                         0
non_scarring_alopecia           0
oral_ulcers                     0
subacute_cutaneous              0
discoid_lupus                   0
acute_cutaneous_lupus           0
pleural_effusion                0
pericardial_effusion            0
acute_pericarditis              0
joint_involvement               0
proteinuria                     0
renal_biopsy_class              0
anti_cardioliphin_antibodies    0
anti_b2gp4_antibodies           0
lupus_anti_coagulant            0
c3                              0
c4                              0
anti_dsdna_antibody             0
anti_smith_antibody             0
dtype: int64

In [93]:
# Domain -> feature columns belonging to it. get_domain_score keeps only the
# highest-scoring feature of a domain, so each domain contributes once.
domains_feat_dict = {'constitutional': ['fever'],
                     'hematologic': ['leukopenia', 'thrombocytopenia', 'auto_immune_hemolysis'],
                     'neuropsychiatric': ['delirium', 'psychosis', 'seizure'],
                     'mucocutaneous': ['non_scarring_alopecia', 'oral_ulcers', 'subacute_cutaneous', 'discoid_lupus', 'acute_cutaneous_lupus'],
                     'serosal': ['pleural_effusion', 'pericardial_effusion', 'acute_pericarditis'],
                     'musculoskeletal': ['joint_involvement'],
                     'renal': ['proteinuria', 'renal_biopsy_class'],
                     'antiphospholipid_antibodies': ['anti_cardioliphin_antibodies', 'anti_b2gp4_antibodies', 'lupus_anti_coagulant'],
                     'complement_proteins': ['c3', 'c4'],
                     'sle_specific_antibodies':['anti_dsdna_antibody', 'anti_smith_antibody']}

# Upper bound on each domain's score; get_domain_score raises if a computed
# score exceeds it.
domains_max_scores_dict = {'constitutional': 2, 'hematologic': 4, 'neuropsychiatric': 5, 'mucocutaneous': 6,
                           'serosal': 6, 'musculoskeletal': 6, 'renal': 10, 'antiphospholipid_antibodies': 2, 
                           'complement_proteins': 4, 'sle_specific_antibodies':6}


#### Labelling the dataset

In [94]:
def get_renal_biopsy_score(result_class):
    """Points for a renal biopsy class: 10 for class 3 or 4, 8 for class 2 or 5, otherwise 0."""
    if result_class in (3, 4):
        return 10
    if result_class in (2, 5):
        return 8
    return 0

In [95]:
#def get_proteinura_score(amount):
#    if amount > 0.5:
#        return 4
#    else:
#        return 0

In [96]:
def get_feat_score(row, feat):
    """Return the points a single feature contributes for `row`.

    Args:
        row: mapping / Series indexable by feature name.
        feat: feature (column) name.

    Returns:
        get_renal_biopsy_score(...) for 'renal_biopsy_class'; otherwise
        criteria_weights[feat] when the value is strictly positive and 0 when
        it is zero, negative (e.g. a -1 missing-value sentinel) or NaN.

    Raises:
        KeyError: if `feat` is positive but has no entry in `criteria_weights`.
    """
    if feat == 'renal_biopsy_class':
        return get_renal_biopsy_score(row[feat])
    # Test for "positive" rather than "not <= 0": NaN fails every comparison,
    # so the old `<= 0` check let a missing value fall through and earn the
    # full weight.
    if row[feat] > 0:
        return criteria_weights[feat]
    return 0

In [97]:
def get_c3_c4_score(c3, c4): # 0 - low, 1 is not low
    """Score the complement domain from the two indicators together.

    Both low gives the 'low_c3_and_low_c4' weight, exactly one low gives the
    'low_c3_or_low_c4' weight, and neither low gives 0.
    """
    c3_is_low = c3 == 0
    c4_is_low = c4 == 0
    if c3_is_low and c4_is_low:
        return criteria_weights['low_c3_and_low_c4']
    if c3_is_low or c4_is_low:
        return criteria_weights['low_c3_or_low_c4']
    return 0

In [110]:
def get_domain_score(row, domain):
    """Score one domain for `row`: only the highest-scoring feature counts.

    Args:
        row: Series-like object indexable by feature name (and exposing
            `.c3` / `.c4` for the complement domain).
        domain: key of `domains_feat_dict`.

    Returns:
        The largest feature score in the domain (0 if nothing is positive).

    Raises:
        Exception: if the score exceeds `domains_max_scores_dict[domain]`.
    """
    feature_names = domains_feat_dict[domain]
    best_score = 0
    if domain == 'complement_proteins':
        # c3 and c4 are scored jointly, so keep them out of the per-feature pass.
        best_score = get_c3_c4_score(row.c3, row.c4)
        feature_names = [name for name in feature_names if name not in ('c3', 'c4')]
    for name in feature_names:
        # Negative sentinels and NaN fail `>= 0` and are skipped.
        if not row[name] >= 0:
            continue
        # get_feat_score already dispatches renal_biopsy_class itself.
        best_score = max(best_score, get_feat_score(row, name))
    if best_score > domains_max_scores_dict[domain]:
        raise Exception('The score is too large for this domain!')
    return best_score

In [119]:
def create_label(row):
    """Return 1 when `row` is ANA-positive and its domain scores total at least 10, else 0."""
    if row['ana'] == 0:
        return 0
    running_total = 0
    for domain in domains_feat_dict:
        running_total += get_domain_score(row, domain)
        # Domain scores are never negative, so once the threshold is reached
        # the remaining domains cannot change the outcome.
        if running_total >= 10:
            return 1
    return 0

In [135]:
def compute_score(row):
    """Return the sum of all domain scores for `row`, or 0 when ANA is negative."""
    ana_negative = row['ana'] == 0  # negative - 0 positive - 1
    if ana_negative:
        return 0
    return sum(get_domain_score(row, domain) for domain in domains_feat_dict)

In [101]:
lupus_df.shape

(70000, 26)

In [102]:
lupus_df.columns

Index(['ana', 'fever', 'leukopenia', 'thrombocytopenia',
       'auto_immune_hemolysis', 'delirium', 'psychosis', 'seizure',
       'non_scarring_alopecia', 'oral_ulcers', 'subacute_cutaneous',
       'discoid_lupus', 'acute_cutaneous_lupus', 'pleural_effusion',
       'pericardial_effusion', 'acute_pericarditis', 'joint_involvement',
       'proteinuria', 'renal_biopsy_class', 'anti_cardioliphin_antibodies',
       'anti_b2gp4_antibodies', 'lupus_anti_coagulant', 'c3', 'c4',
       'anti_dsdna_antibody', 'anti_smith_antibody'],
      dtype='object')

In [129]:
# Row-wise passes over the frame: the total score first, then the 0/1 label.
# The functions are handed to apply directly; wrapping them in a lambda adds nothing.
lupus_df['score'] = lupus_df.apply(compute_score, axis=1)
lupus_df['label'] = lupus_df.apply(create_label, axis=1)
lupus_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,renal_biopsy_class,anti_cardioliphin_antibodies,anti_b2gp4_antibodies,lupus_anti_coagulant,c3,c4,anti_dsdna_antibody,anti_smith_antibody,score,label
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
2,0,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,1,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,0
4,1,0,0,0,0,0,0,0,1,1,...,0,0,1,0,0,1,0,0,10,1


In [130]:
lupus_df.label.value_counts()

0    56454
1    13546
Name: label, dtype: int64

In [131]:
# ANA-positive rows that still ended up with label 0.
a = lupus_df[(lupus_df.ana == 1) & (lupus_df.label == 0)] # i need at least 20000 here
len(a)

21410

In [132]:
lupus_df.label.value_counts()

0    56454
1    13546
Name: label, dtype: int64

In [133]:
# Persist the full frame, including the derived `score` and `label` columns.
lupus_df.to_csv('../data/lupus_dataset_full.csv', index=False)

#### Testing the labelling functions (scratch cells; delete from here)

In [134]:
lupus_df[(lupus_df.ana==1) &((lupus_df.renal_biopsy_class==2) & (lupus_df.proteinuria==0))].iloc[1]

ana                              1
fever                            0
leukopenia                       0
thrombocytopenia                 0
auto_immune_hemolysis            0
delirium                         0
psychosis                        1
seizure                          0
non_scarring_alopecia            0
oral_ulcers                      0
subacute_cutaneous               0
discoid_lupus                    0
acute_cutaneous_lupus            0
pleural_effusion                 0
pericardial_effusion             0
acute_pericarditis               0
joint_involvement                0
proteinuria                      0
renal_biopsy_class               2
anti_cardioliphin_antibodies     1
anti_b2gp4_antibodies            0
lupus_anti_coagulant             0
c3                               0
c4                               1
anti_dsdna_antibody              0
anti_smith_antibody              0
score                           11
label                            1
Name: 32, dtype: int

In [126]:
# Re-score one row by hand to cross-check the stored `score` / `label` columns.
# NOTE(review): the recorded output of this cell, (16, 1), disagrees with the
# score of 11 shown for row 32 in the preceding cell's output. This cell's
# execution count (126) is lower than that of the cell that filled `score`
# (129), so the output is presumably stale; re-run to confirm.
myrow = lupus_df.loc[32]
mylabel = create_label(myrow)
myscore = compute_score(myrow)
myscore, mylabel

(16, 1)

#### end here

#### Splitting the dataset

In [None]:
# Missing values become the -1 sentinel (the scoring helpers treat negative
# values as "not positive").
lupus_df = lupus_df.fillna(-1)
# Select features by name. The old positional slice `iloc[:, 0:-1]` dropped only
# the last column (`label`) and silently kept `score`, which fully determines
# the label (label is 1 exactly when score >= 10), i.e. target leakage.
X = lupus_df.drop(columns=['score', 'label'])
y = lupus_df['label']
# Stratify on the label so train and test keep the same class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED, stratify=y)

In [None]:
y_train.head()

In [None]:
# Re-attach the target to each feature split and renumber rows from 0.
train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
train_df.head()

In [None]:
#train_df.to_csv('../data/train_set_basic.csv', index=False)
#test_df.to_csv('../data/test_set_basic.csv', index=False)

#### End here

In [None]:
lupus_df.isna().sum()

In [None]:
a = lupus_df[(lupus_df.ana == 1) & (lupus_df.label == 0)] # i need at least 20000 here
len(a)

In [None]:
a.iloc[0]

#### Testing dataset

In [None]:
# Pie chart of an illustrative NaN / 1 / 0 split.
# NOTE(review): the counts are hard-coded (they sum to 100000) and are not
# computed from lupus_df; plotly is also imported here rather than in the
# import cell at the top of the notebook.
import plotly.express as px
labels = ['NaN', '1', '0']
values = [95000, 2500, 2500]
fig = px.pie(values=values, names=labels)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [None]:
# Same pie chart for a seven-way split (NaN plus classes 0-5); the counts are
# again hard-coded and sum to 100000. Relies on `px` imported in the previous cell.
labels = ['NaN', '0', '1', '2', '3', '4', '5']
values = [90000, 2500, 2500, 1250, 1250, 1250, 1250]
fig = px.pie(values=values, names=labels)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

In [None]:
# Bar chart of the `labels` / `values` defined in the previous cell.
fig = px.bar(x=labels, y=values, color=labels)
fig.show()

#### Rules for classification

In [None]:
def has_fever(row):
    """Classify fever from `row.body_temp`.

    Returns True above 37.8, False for any other positive reading, and None
    when the reading is non-positive or NaN.
    """
    # NOTE(review): the 2019 EULAR/ACR criteria define fever as a temperature
    # above 38.3 °C; confirm that 37.8 is the intended cut-off.
    temperature = row.body_temp
    if temperature > 37.8:
        return True
    return False if temperature > 0 else None

In [None]:
def has_leukopenia(row):
    """Classify leukopenia from `row.wbc`.

    Returns True for a positive count below 4000, False for a count of 4000 or
    more, and None when the reading is non-positive or NaN.
    """
    wbc = row.wbc
    # Require a positive reading before calling it low. The old `wbc < 4000`
    # test also matched 0 and negative sentinels such as -1, reporting them as
    # leukopenia instead of None the way has_fever treats such readings.
    if 0 < wbc < 4000:
        return True
    if wbc >= 4000:
        return False
    return None

In [None]:
def has_thrombocytopenia(row):
    """Classify thrombocytopenia from the row's platelet count.

    Returns True for a positive count below 100000, False for a count of
    100000 or more, and None when the reading is non-positive or NaN.
    """
    # The previous body was a copy of has_leukopenia and tested `row.wbc < 4000`.
    # NOTE(review): no platelet column is defined anywhere in this notebook, so
    # the attribute name `platelet_count` is an assumption; rename it to match
    # the real schema. The 100000 cut-off follows the 2019 EULAR/ACR definition
    # (platelets < 100,000/mm³), in the same per-mm³ units as the 4000 WBC cut-off.
    platelets = row.platelet_count
    if 0 < platelets < 100000:
        return True
    if platelets >= 100000:
        return False
    return None