In [1]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
from scipy.stats import truncnorm
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../')
from modules import utils, constants

In [2]:
# Seed both Python's `random` module and NumPy's global RNG with the shared
# project seed so the np.random.choice draws below repeat on a fresh kernel.
SEED = constants.SEED
random.seed(SEED)
np.random.seed(SEED)

#### Additional criteria

In [3]:
# Columns generated for every synthetic patient, in generation order.
# - The renal biopsy class is not part of this feature set.
# - Subacute, acute and discoid cutaneous lupus are merged into the single
#   'cutaneous_lupus' column.
# - C3 / C4 are stored as low_c3 / low_c4 flags: 1 means low, 0 means not low.
features = [
    # constitutional
    'fever',
    # hematologic
    'leukopenia', 'thrombocytopenia', 'auto_immune_hemolysis',
    # neuropsychiatric
    'delirium', 'psychosis', 'seizure',
    # mucocutaneous
    'non_scarring_alopecia', 'oral_ulcers', 'cutaneous_lupus',
    # serosal
    'pleural_effusion', 'pericardial_effusion', 'acute_pericarditis',
    # musculoskeletal
    'joint_involvement',
    # renal
    'proteinuria',
    # antiphospholipid antibodies
    'anti_cardioliphin_antibodies', 'anti_β2gp1_antibodies', 'lupus_anti_coagulant',
    # complement proteins
    'low_c3', 'low_c4',
    # SLE-specific antibodies
    'anti_dsdna_antibody', 'anti_smith_antibody',
]

In [None]:
# Per-criterion fraction used for the ANA-positive cohort; the sampling code
# divides each value by feat_frac to obtain the probability of a positive.
# The three cutaneous lupus subtypes keep their own entries even though they
# are emitted as one 'cutaneous_lupus' column.
feature_fractions = {
    'fever': 0.36,
    'leukopenia': 0.50,
    'thrombocytopenia': 0.20,
    'auto_immune_hemolysis': 0.03,
    'delirium': 0.49,
    'psychosis': 0.12,
    'seizure': 0.11,
    'non_scarring_alopecia': 0.85,
    'oral_ulcers': 0.08,
    'subacute_cutaneous_lupus': 0.10,
    'discoid_lupus': 0.15,
    'acute_cutaneous_lupus': 0.03,
    'pleural_effusion': 0.17,
    'pericardial_effusion': 0.50,
    'acute_pericarditis': 0.01,
    'joint_involvement': 0.69,
    'proteinuria': 0.60,
    'anti_cardioliphin_antibodies': 0.17,
    'anti_β2gp1_antibodies': 0.10,
    'lupus_anti_coagulant': 0.11,
    'low_c3': 0.45,
    'low_c4': 0.44,
    'anti_dsdna_antibody': 0.70,
    'anti_smith_antibody': 0.09,
}

In [None]:
# Points each criterion contributes when present. 'low_c3_and_low_c4' and
# 'low_c3_or_low_c4' are the combined complement entries looked up by
# get_c3_c4_score.
criteria_weights = {
    'fever': 2,
    'leukopenia': 3,
    'thrombocytopenia': 4,
    'auto_immune_hemolysis': 4,
    'delirium': 2,
    'psychosis': 3,
    'seizure': 5,
    'non_scarring_alopecia': 2,
    'oral_ulcers': 2,
    'subacute_cutaneous_lupus': 4,
    'discoid_lupus': 4,
    'acute_cutaneous_lupus': 6,
    'pleural_effusion': 5,
    'pericardial_effusion': 5,
    'acute_pericarditis': 6,
    'joint_involvement': 6,
    'proteinuria': 4,
    'anti_cardioliphin_antibodies': 2,
    'anti_β2gp1_antibodies': 2,
    'lupus_anti_coagulant': 2,
    'low_c3': 3,
    'low_c4': 3,
    'low_c3_and_low_c4': 4,
    'low_c3_or_low_c4': 3,
    'anti_dsdna_antibody': 6,
    'anti_smith_antibody': 6,
}

In [None]:
# Sanity check: number of entries in each lookup table.
len(feature_fractions), len(criteria_weights)

#### ANA positive dataset

In [None]:
# Start the ANA-positive cohort: 50,000 rows, all with ana == 1.
ana_df = pd.DataFrame({'ana': [1]*50000})
ana_df.head()

In [None]:
# Row count of the ANA-positive frame.
len(ana_df)

In [None]:
# Divisor applied to every feature_fractions value when sampling the
# ANA-positive cohort (probability = fraction / feat_frac).
feat_frac = 2

In [None]:
def get_feat_probabilities(feature, feat_type='positive'):
    '''
    Returns (probability of 0, probability of 1) for sampling one binary feature.

    feature: key into feature_fractions (positive) or criteria_weights (negative)
    feat_type: 'positive' -> feature_fractions[feature] / feat_frac
               'negative' -> weight_prob_dict entry for the feature's criteria weight
    Any other feat_type prints a message and returns None.
    '''
    if feat_type not in ('positive', 'negative'):
        print('Unknown feature type. Should either be "positive" or "negative"')
        return None
    if feat_type == 'positive':
        prob_present = feature_fractions[feature]/feat_frac
    else:
        # negative cohort: the probability is driven by the criterion's weight
        prob_present = weight_prob_dict[criteria_weights[feature]]
    return 1-prob_present, prob_present

In [None]:
def create_cutaneous_lupus(df, feat_type='positive'):
    '''
    Samples one cutaneous lupus code per row of df.

    Codes: 0 = none, 1 = subacute, 2 = acute, 3 = discoid.
    feat_type: 'positive' uses feature_fractions / feat_frac per subtype,
               'negative' uses weight_prob_dict keyed by each subtype's weight.
    Any other feat_type prints a message and returns None.
    Returns the array produced by np.random.choice (length len(df)).
    '''
    # Same order as the codes 1, 2, 3 above.
    subtypes = ('subacute_cutaneous_lupus', 'acute_cutaneous_lupus', 'discoid_lupus')
    if feat_type == 'positive':
        subtype_probs = [feature_fractions[name]/feat_frac for name in subtypes]
    elif feat_type == 'negative':
        subtype_probs = [weight_prob_dict[criteria_weights[name]] for name in subtypes]
    else:
        print('Unknown feature type. Should either be "positive" or "negative"')
        return None
    # Whatever probability mass is left goes to "no cutaneous lupus".
    neg_prob = 1 - subtype_probs[0] - subtype_probs[1] - subtype_probs[2]
    return np.random.choice([0, 1, 2, 3], len(df), p=[neg_prob] + subtype_probs)

In [None]:
# Sample every feature for the ANA-positive cohort. 'cutaneous_lupus' is a
# 4-way categorical column (0 = none, 1 = subacute, 2 = acute, 3 = discoid);
# every other feature is a 0/1 flag drawn with its own probability.
for feature in features:
    if feature == 'cutaneous_lupus':
        ana_df[feature] = create_cutaneous_lupus(ana_df)
    else:
        neg_frac, pos_frac = get_feat_probabilities(feature)
        ana_df[feature] = np.random.choice([0, 1], len(ana_df), p=[neg_frac, pos_frac])
ana_df.head()

In [None]:
# Distribution of the cutaneous lupus codes in the ANA-positive cohort.
ana_df.cutaneous_lupus.value_counts()

#### Labelling the dataset

In [None]:
def get_cutaneous_lupus_score(cutaneous_type):
    '''
    Maps a cutaneous lupus code to its criteria weight.

    cutaneous_type: 0 = none (scores 0), 1 = subacute, 2 = acute, 3 = discoid
    Any other code prints a message and returns None.
    '''
    if cutaneous_type == 0:
        return 0
    code_to_weight_key = (
        (1, 'subacute_cutaneous_lupus'),
        (2, 'acute_cutaneous_lupus'),
        (3, 'discoid_lupus'),
    )
    for code, weight_key in code_to_weight_key:
        if cutaneous_type == code:
            return criteria_weights[weight_key]
    print('Unknown cutaneous type')
    return None

In [None]:
def get_feat_score(row, feat):
    '''
    Returns the score one feature contributes for a patient row.

    'cutaneous_lupus' is scored from its subtype code via
    get_cutaneous_lupus_score; any other feature scores 0 when its value
    is <= 0 and criteria_weights[feat] otherwise.
    '''
    if feat == 'cutaneous_lupus':
        return get_cutaneous_lupus_score(row[feat])
    return 0 if row[feat] <= 0 else criteria_weights[feat]

In [None]:
def get_c3_c4_score(c3, c4):
    '''
    Returns the complement-proteins score from the low_c3 / low_c4 flags.

    Encoding: a value of 1 means the complement level is low; any other
    value (normally 0) means it is not low.

    c3, c4: scalar flags for low C3 and low C4
    Returns criteria_weights['low_c3_and_low_c4'] when both are low,
    criteria_weights['low_c3_or_low_c4'] when exactly one is low, else 0.
    '''
    c3_is_low = c3 == 1
    c4_is_low = c4 == 1
    if c3_is_low and c4_is_low:
        return criteria_weights['low_c3_and_low_c4']
    if c3_is_low or c4_is_low:
        return criteria_weights['low_c3_or_low_c4']
    return 0

In [None]:
# Scoring domain -> feature columns that belong to it. The iteration order of
# this dict is the order in which domains are scored.
domains_feat_dict = {
    'constitutional': ['fever'],
    'hematologic': ['leukopenia', 'thrombocytopenia', 'auto_immune_hemolysis'],
    'neuropsychiatric': ['delirium', 'psychosis', 'seizure'],
    'mucocutaneous': ['non_scarring_alopecia', 'oral_ulcers', 'cutaneous_lupus'],
    'serosal': ['pleural_effusion', 'pericardial_effusion', 'acute_pericarditis'],
    'musculoskeletal': ['joint_involvement'],
    'renal': ['proteinuria'],
    'antiphospholipid_antibodies': [
        'anti_cardioliphin_antibodies',
        'anti_β2gp1_antibodies',
        'lupus_anti_coagulant',
    ],
    'complement_proteins': ['low_c3', 'low_c4'],
    'sle_specific_antibodies': ['anti_dsdna_antibody', 'anti_smith_antibody'],
}

In [None]:
# Upper bound on the score each domain may contribute; get_domain_score
# raises if a computed domain score exceeds its entry here.
domains_max_scores_dict = {
    'constitutional': 2,
    'hematologic': 4,
    'neuropsychiatric': 5,
    'mucocutaneous': 6,
    'serosal': 6,
    'musculoskeletal': 6,
    'renal': 4,
    'antiphospholipid_antibodies': 2,
    'complement_proteins': 4,
    'sle_specific_antibodies': 6,
}

In [None]:
def get_domain_score(row, domain):
    '''
    Returns the score a single domain contributes for one patient row.

    Only the highest-scoring feature inside a domain counts. The
    'complement_proteins' domain is scored jointly from low_c3 and low_c4
    through get_c3_c4_score rather than feature by feature.

    row: patient record supporting row[feat] and row.low_c3 / row.low_c4
        (e.g. a DataFrame row)
    domain: key of domains_feat_dict and domains_max_scores_dict

    Raises ValueError when the computed score exceeds the domain's entry in
    domains_max_scores_dict.
    '''
    domain_features = domains_feat_dict[domain]
    domain_score = 0
    if domain == 'complement_proteins':
        domain_score = get_c3_c4_score(row.low_c3, row.low_c4)
        # low_c3 / low_c4 are already accounted for by the joint score above.
        domain_features = [feat for feat in domain_features if feat not in ('low_c3', 'low_c4')]
    for feat in domain_features:
        # Negative values are skipped; presumably they mark a missing
        # measurement - confirm against the data before relying on it.
        if row[feat] >= 0:
            # get_feat_score already dispatches 'cutaneous_lupus' to its own scorer.
            domain_score = max(domain_score, get_feat_score(row, feat))
    if domain_score > domains_max_scores_dict[domain]:
        raise ValueError('The score is too large for this domain!')
    return domain_score

In [None]:
def create_label(row):
    '''
    Returns the binary label for one patient row.

    A row with ana == 0 is labelled 0 outright. Otherwise domain scores are
    accumulated in domains_feat_dict order and the label is 1 as soon as the
    running total reaches 10 (remaining domains are not evaluated); if the
    total never reaches 10 the label is 0.
    '''
    if row['ana'] == 0:
        return 0
    running_total = 0
    for domain_name in domains_feat_dict:
        running_total += get_domain_score(row, domain_name)
        if running_total >= 10:
            return 1
    return 0

In [None]:
def compute_score(row):
    '''
    Returns the total criteria score for one patient row.

    A row with ana == 0 (ANA negative) scores 0; otherwise the result is the
    sum of get_domain_score over every domain in domains_feat_dict.
    '''
    if row['ana'] == 0:
        return 0
    return sum(get_domain_score(row, domain_name) for domain_name in domains_feat_dict)

In [None]:
# Optional: also keep the raw total score per row.
# ana_df['score'] = ana_df.apply(compute_score, axis=1)
# Label every ANA-positive row with create_label (row-wise apply).
ana_df['label'] = ana_df.apply(create_label, axis=1)
ana_df.head()

In [None]:
# Class balance of the ANA-positive cohort.
ana_df.label.value_counts()

In [None]:
# Spot check: first ANA-positive row with low_c3 == 1 and low_c4 == 0
# (low C3 only), to eyeball its features and label.
ana_df[(ana_df.low_c3==1) & (ana_df.low_c4==0)].iloc[0]

#### ANA negative dataset

In [None]:
# Criterion weight -> probability of a positive, used when sampling with
# feat_type='negative': the heavier the criterion, the rarer it is.
# Weight 8 has no matching entry in criteria_weights in this notebook.
weight_prob_dict = {8:0.01, 6:0.02, 5:0.025, 4:0.03, 3:0.035, 2:0.04}

In [None]:
# Start the ANA-negative cohort: 20,000 rows, all with ana == 0.
no_ana_df = pd.DataFrame({'ana': [0]*20000})
no_ana_df.head()

In [None]:
# Sample every feature for the ANA-negative cohort using the weight-based
# probabilities (feat_type='negative').
for feature in features:
    if feature != 'cutaneous_lupus':
        neg_frac, pos_frac = get_feat_probabilities(feature, feat_type='negative')
        sampled_values = np.random.choice([0, 1], len(no_ana_df), p=[neg_frac, pos_frac])
    else:
        # 4-way categorical column, drawn by its own helper
        sampled_values = create_cutaneous_lupus(no_ana_df, feat_type='negative')
    no_ana_df[feature] = sampled_values
no_ana_df.head()

In [None]:
# Optional: also keep the raw total score per row.
# no_ana_df['score'] = no_ana_df.apply(compute_score, axis=1)
# Label every ANA-negative row with create_label (row-wise apply).
no_ana_df['label'] = no_ana_df.apply(create_label, axis=1)
no_ana_df.head()

In [None]:
# Distribution of the cutaneous lupus codes in the ANA-negative cohort.
no_ana_df.cutaneous_lupus.value_counts()

In [None]:
# Class balance of the ANA-negative cohort.
no_ana_df.label.value_counts()

#### Merge the two dataframes

In [None]:
# Stack the ANA-positive and ANA-negative cohorts row-wise. Each frame keeps
# its own index values here (no ignore_index), so index labels repeat.
lupus_df = pd.concat([ana_df, no_ana_df])
len(lupus_df)

In [None]:
# Shuffle all rows reproducibly (frac=1 draws every row, seeded with SEED)
# and replace the index with a fresh 0..n-1 range; then show the class balance.
lupus_df = lupus_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
lupus_df.label.value_counts()

In [None]:
# Persist the full shuffled dataset (index column omitted).
lupus_df.to_csv('../data/orig/lupus_dataset.csv', index=False)

In [None]:
# Column order of the merged dataset.
lupus_df.columns

#### Splitting the dataset

In [None]:
# Separate predictors from the target by column name rather than position, so
# the split stays correct even if a column is ever appended after 'label'.
X = lupus_df.drop(columns='label')
y = lupus_df['label']
# 80/20 split, stratified on the label so both parts keep the class balance.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED, stratify=y)

In [None]:
# Peek at the training labels.
y_train.head()

In [None]:
# Re-attach the label column to each split (column-wise concat aligned on the
# shared index) and renumber rows from 0.
train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
train_df.head()

In [None]:
# Persist the train and test splits (index column omitted).
train_df.to_csv('../data/orig/train_set_basic.csv', index=False)
test_df.to_csv('../data/orig/test_set_basic.csv', index=False)