In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Fix the seed of both random number generators used in this notebook so that
# the randomly chosen rows that receive missing values are reproducible.
SEED = 42
random.seed(SEED)      # Python's `random` module (used by insert_nans via random.sample)
np.random.seed(SEED)   # NumPy's legacy global generator

In [3]:
#### to delete
# Dictionary with the weights of the different features.
# Each value is the number of points a positive finding contributes to the score.
# Notes:
#   * 'cutaneous_lupus' has no entry of its own: the column holds a type code and the
#     cutaneous helper functions pick one of the three '*_cutaneous_lupus'/'discoid_lupus' weights.
#   * The complement domain is scored through 'low_c3_and_low_c4' / 'low_c3_or_low_c4';
#     the individual 'low_c3' and 'low_c4' weights are not read by the scoring functions in this notebook.
criteria_weights = {'fever':2, 'leukopenia':3, 'thrombocytopenia':4, 'auto_immune_hemolysis':4, 'delirium':2, 'psychosis':3, 'seizure':5, 
                    'non_scarring_alopecia':2, 'oral_ulcers':2, 'subacute_cutaneous_lupus':4, 'discoid_lupus':4, 'acute_cutaneous_lupus':6, 
                    'pleural_effusion': 5, 'pericardial_effusion':5, 'acute_pericarditis':6, 'joint_involvement':6, 'proteinuria':4, 
                    'anti_cardioliphin_antibodies':2, 'anti_β2gp1_antibodies':2, 'lupus_anti_coagulant':2, 'low_c3':3, 'low_c4':3, 
                    'low_c3_and_low_c4':4,
                    'low_c3_or_low_c4':3, 'anti_dsdna_antibody':6, 'anti_smith_antibody':6}

# Dictionary mapping each domain to the dataframe columns (features) that belong to it.
# Only the highest-scoring feature of a domain counts towards the total score.
domains_feat_dict = {'constitutional': ['fever'],
                     'hematologic': ['leukopenia', 'thrombocytopenia', 'auto_immune_hemolysis'],
                     'neuropsychiatric': ['delirium', 'psychosis', 'seizure'],
                     'mucocutaneous': ['non_scarring_alopecia', 'oral_ulcers', 'cutaneous_lupus'],
                     'serosal': ['pleural_effusion', 'pericardial_effusion', 'acute_pericarditis'],
                     'musculoskeletal': ['joint_involvement'],
                     'renal': ['proteinuria'],
                     'antiphospholipid_antibodies': ['anti_cardioliphin_antibodies', 'anti_β2gp1_antibodies', 'lupus_anti_coagulant'],
                     'complement_proteins': ['low_c3', 'low_c4'],
                     'sle_specific_antibodies':['anti_dsdna_antibody', 'anti_smith_antibody']}

# Dictionary with the maximum possible score for each domain.
# The domain scorers raise an exception if a computed domain score exceeds this value.
domains_max_scores_dict = {'constitutional': 2, 'hematologic': 4, 'neuropsychiatric': 5, 'mucocutaneous': 6, 'serosal': 6, 'musculoskeletal': 6, 
                           'renal': 4, 'antiphospholipid_antibodies': 2, 'complement_proteins': 4, 'sle_specific_antibodies':6}

#### Old functions

In [4]:
def get_original_c3_c4_score(c3, c4):
    '''
    Computes the complement proteins domain score from the two complement flags.

    A flag equal to 1 means the corresponding complement level is low; any other
    value is treated as "not low".

    Returns the 'low_c3_and_low_c4' weight when both flags are 1, the
    'low_c3_or_low_c4' weight when exactly one of them is 1, and 0 otherwise.
    '''
    c3_is_low = c3 == 1
    c4_is_low = c4 == 1
    if c3_is_low and c4_is_low:
        return criteria_weights['low_c3_and_low_c4']
    if c3_is_low or c4_is_low:
        return criteria_weights['low_c3_or_low_c4']
    return 0
    
def get_original_feat_score(row, feat):
    '''
    Computes the score contributed by a single feature of a row.

    row: mapping/Series indexed by feature name.
    feat: name of the feature to score.

    Returns 0 when the feature value is zero or negative, otherwise the weight of
    the feature in criteria_weights. The 'cutaneous_lupus' feature holds a type
    code rather than a flag, so it is delegated to
    get_original_cutaneous_lupus_score.
    '''
    if feat == 'cutaneous_lupus':
        # Previously this called the non-existent `get_cutaneous_lupus_score`,
        # which raised NameError whenever this branch was reached.
        return get_original_cutaneous_lupus_score(row[feat])
    if row[feat] <= 0:
        return 0
    return criteria_weights[feat]

def get_original_cutaneous_lupus_score(cutaneous_type):
    '''
    Computes the score for a cutaneous lupus type code.

    cutaneous_type: 0 = negative for any form of cutaneous lupus,
                    1 = subacute cutaneous lupus,
                    2 = acute cutaneous lupus,
                    3 = discoid lupus.

    Returns the matching weight from criteria_weights (0 for a negative finding).
    Raises ValueError for any other code. The previous version printed a message
    and implicitly returned None, which only failed later as a TypeError when the
    caller compared the result with a number.
    '''
    if cutaneous_type == 0: # negative for any form of cutaneous lupus
        return 0
    if cutaneous_type == 1: # subacute cutaneous lupus
        return criteria_weights['subacute_cutaneous_lupus']
    if cutaneous_type == 2: # acute cutaneous lupus
        return criteria_weights['acute_cutaneous_lupus']
    if cutaneous_type == 3: # discoid lupus
        return criteria_weights['discoid_lupus']
    raise ValueError('Unknown cutaneous type')
        
def get_original_domain_score(row, domain):
    '''
    Computes the score of one domain for a row, i.e. the highest score among the
    domain's features.

    The complement proteins domain is scored from the low_c3/low_c4 pair as a whole.
    Features whose value is not >= 0 are skipped.

    Raises an Exception when the resulting score exceeds the maximum listed for
    the domain in domains_max_scores_dict.
    '''
    features_to_scan = domains_feat_dict[domain]
    best_score = 0
    if domain == 'complement_proteins':
        # C3/C4 are scored jointly, so they must not be scored again one by one.
        best_score = get_original_c3_c4_score(row.low_c3, row.low_c4)
        features_to_scan = [name for name in features_to_scan
                            if name not in ('low_c3', 'low_c4')]
    for name in features_to_scan:
        if row[name] >= 0:
            if name == 'cutaneous_lupus':
                candidate = get_original_cutaneous_lupus_score(row.cutaneous_lupus)
            else:
                candidate = get_original_feat_score(row, name)
            # keep the largest feature score seen so far
            best_score = max(best_score, candidate)
    if best_score > domains_max_scores_dict[domain]:
        raise Exception('The score is too large for this domain!')
    return best_score

def compute_original_score(row):
    '''
    Computes the total score of a row as the sum of its domain scores.

    A row whose 'ana' value is 0 (negative) always scores 0; the domains are not
    evaluated in that case.
    '''
    if row['ana'] == 0: # 0 = negative, 1 = positive
        return 0
    return sum(get_original_domain_score(row, domain_name)
               for domain_name in domains_feat_dict)

def create_original_label(row):
    '''
    Derives the binary label of a row from its domain scores.

    Returns 0 when 'ana' is 0. Otherwise the domain scores are accumulated and 1
    is returned as soon as the running total reaches 10; if the total never
    reaches 10 the label is 0.
    '''
    if row['ana'] == 0:
        return 0
    running_total = 0
    for domain_name in domains_feat_dict:
        running_total += get_original_domain_score(row, domain_name)
        if running_total >= 10:
            # threshold reached: remaining domains are not evaluated
            return 1
    return 0


#### New functions

In [5]:
def get_new_cutaneous_lupus_score(cutaneous_type):
    '''
    Computes the (score, missing score) pair for a cutaneous lupus type code.

    cutaneous_type: negative = missing value, 0 = negative finding,
                    1 = subacute, 2 = acute, 3 = discoid cutaneous lupus.

    Returns (weight, 0) for a known type, (0, 0) for a negative finding and
    (0, 6) for a missing value, 6 being the largest cutaneous lupus weight.
    Raises an Exception for any other code.
    '''
    weight_key_by_code = {1: 'subacute_cutaneous_lupus',
                          2: 'acute_cutaneous_lupus',
                          3: 'discoid_lupus'}
    if cutaneous_type < 0: # missing value
        return 0, 6
    if cutaneous_type == 0: # negative for any form of cutaneous lupus
        return 0, 0
    if cutaneous_type in weight_key_by_code:
        return criteria_weights[weight_key_by_code[cutaneous_type]], 0
    raise Exception('Unknown cutaneous lupus type!')

def get_new_c3_c4_score(c3, c4):
    '''
    Computes the (score, missing score) pair for the complement proteins domain.

    c3, c4: 1 = low, 0 = not low, negative = missing value.

    The missing score is the number of extra points the domain could still gain
    if the missing values turned out to be low. Raises an Exception when the
    pair of values matches none of the handled combinations.
    '''
    c3_low, c4_low = c3 == 1, c4 == 1
    c3_missing, c4_missing = c3 < 0, c4 < 0
    c3_normal, c4_normal = c3 == 0, c4 == 0

    if c3_low and c4_low:
        return criteria_weights['low_c3_and_low_c4'], 0
    if c3_missing and c4_missing:
        return 0, criteria_weights['low_c3_and_low_c4']
    if (c3_missing and c4_normal) or (c4_missing and c3_normal):
        # one value unknown, the other not low
        return 0, 3
    if (c3_missing and c4_low) or (c4_missing and c3_low):
        # one value low, the other unknown
        return criteria_weights['low_c3_or_low_c4'], 1
    if c3_low or c4_low:
        return criteria_weights['low_c3_or_low_c4'], 0
    if c3_normal and c4_normal:
        return 0, 0
    raise Exception('Unknown C3 and C4 values!')
        
def get_new_feat_score(row, feat):
    '''
    Computes the (score, missing score) pair of one feature in a row.

    A negative value is a missing value: it scores 0 and reports the feature
    weight as missing score. A value of 0 yields (0, 0); any other value yields
    (feature weight, 0). 'cutaneous_lupus' is delegated to its dedicated helper.
    '''
    if feat == 'cutaneous_lupus':
        return get_new_cutaneous_lupus_score(row[feat])
    value = row[feat]
    if value < 0: # missing value
        return 0, criteria_weights[feat]
    if value == 0:
        return 0, 0
    return criteria_weights[feat], 0


def get_new_domain_score(row, domain):
    '''
    Computes the (score, missing score) pair for a given domain in a specified row.

    score: the highest score among the domain's features that are present.
    missing score: the number of additional points the domain could gain if its
        missing features turned out to be positive, i.e. the largest weight among
        the missing features minus the score already obtained (never below 0).

    The complement proteins domain is handled as a whole by get_new_c3_c4_score.

    Raises an Exception when the score exceeds the domain maximum listed in
    domains_max_scores_dict.
    '''
    domain_features = domains_feat_dict[domain] # a list of features in that domain
    if domain == 'complement_proteins':
        domain_score, domain_missing_score = get_new_c3_c4_score(row.low_c3, row.low_c4)
    else:
        domain_score, largest_missing_weight = 0, 0
        for feat in domain_features:
            feat_score, feat_missing_score = get_new_feat_score(row, feat)
            domain_score = max(domain_score, feat_score)
            largest_missing_weight = max(largest_missing_weight, feat_missing_score)
        # The headroom is derived once, from the two maxima. The previous running
        # update subtracted the full new score from a value that was already a
        # difference, and let a smaller missing weight overwrite a larger one, so
        # the result depended on feature order and could be too low.
        domain_missing_score = max(0, largest_missing_weight - domain_score)

    if domain_score > domains_max_scores_dict[domain]:
        raise Exception('The score is too large for this domain!')
    return domain_score, domain_missing_score


def compute_new_score(row):
    '''
    Computes the total score and the total missing score of a row in a dataframe.

    Both totals are sums over all domains. A row whose 'ana' value is 0
    (negative) yields (0, 0) without evaluating any domain.
    '''
    if row['ana'] == 0: # 0=negative, 1=positive
        return 0, 0
    per_domain = [get_new_domain_score(row, domain_name)
                  for domain_name in domains_feat_dict]
    achieved_total = sum(score for score, _ in per_domain)
    missing_total = sum(missing for _, missing in per_domain)
    return achieved_total, missing_total

    
def create_new_label(row):
    '''
    Derives the three-way label of a row from its new_score and missing_score.

    Returns 1 (lupus) when the score alone reaches 10, 2 (inconclusive diagnosis)
    when it only reaches 10 once the missing score is added, 0 (no lupus) when
    even that sum stays below 10, and NaN when none of the comparisons holds.
    '''
    if row.new_score >= 10:
        return 1 # Lupus
    best_case_total = row.new_score + row.missing_score
    if best_case_total >= 10:
        return 2 # Inconclusive diagnosis
    if best_case_total < 10:
        return 0 # No lupus
    return np.nan

#### delete from here

In [6]:
# my_row = {'fever':0, 'leukopenia':0, 'thrombocytopenia':0, 'auto_immune_hemolysis':0, 'delirium':0, 'psychosis':0, 
#           'seizure':0, 'non_scarring_alopecia':0, 'oral_ulcers':0, 'cutaneous_lupus':0, 'pleural_effusion': 0, 
#           'pericardial_effusion':0, 'acute_pericarditis':0, 'joint_involvement':0, 'proteinuria':0, 
#           'anti_cardioliphin_antibodies':0, 'anti_β2gp1_antibodies':0, 'lupus_anti_coagulant':0, 'low_c3':0, 
#           'low_c4':0, 'anti_dsdna_antibody':0, 'anti_smith_antibody':0}
# sample_row = pd.Series(my_row)

In [7]:
# sample_row = new_df.iloc[9]
# sample_row

In [8]:
# a, b = compute_new_score(sample_row)
# a, b

#### end here

#### The data

In [9]:
# Load the original lupus dataset; the path is relative to the notebook's working directory.
original_df = pd.read_csv('../data/orig/lupus_dataset.csv')
# Preview the first rows.
original_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,joint_involvement,proteinuria,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,1,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,1,0,0,0,0,...,0,1,0,0,0,1,0,0,0,1
3,1,1,0,1,0,0,0,0,0,0,...,1,0,0,0,0,1,1,0,0,1
4,1,1,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


In [10]:
# Score every row with the original (no missing values) scoring scheme.
original_df['original_score'] = original_df.apply(compute_original_score, axis=1)
original_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label,original_score
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,5
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
2,1,0,0,1,0,1,0,0,0,0,...,1,0,0,0,1,0,0,0,1,13
3,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,1,16
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,8


#### Insert missing data

In [11]:
# Fraction of the non-null values of each eligible column that insert_nans replaces with NaN.
frac = 0.1

In [12]:
def insert_nans(df, col, frac):
    '''
    Replaces a random fraction of the non-null values of one column with NaN.

    df: dataframe to modify; it is changed in place and also returned.
    col: name of the column that receives the NaNs.
    frac: fraction of the currently non-null rows of `col` to blank out
          (the resulting row count is truncated to an integer).

    The rows are drawn with random.sample, so the outcome depends on the state
    of Python's `random` generator.
    '''
    candidate_rows = df[col].dropna().index.tolist() # rows that still hold a value
    n_to_blank = int(frac * len(candidate_rows))
    chosen_rows = random.sample(candidate_rows, n_to_blank)
    df.loc[chosen_rows, col] = np.nan
    return df

In [13]:
# Work on a deep copy so that original_df keeps the complete data.
new_df = original_df.copy(deep=True)
# Blank out a fraction of every feature column; 'ana' and the two derived
# columns ('label', 'original_score') are left intact.
for column in new_df.columns:
    if column not in ['ana', 'label', 'original_score']:
        new_df = insert_nans(new_df, column, frac)
# Encode missing values as -1 (the scoring functions treat negative values as missing).
# This is done once, after the loop: filling the whole frame on every iteration
# repeated the same work for each column without changing the result.
new_df = new_df.fillna(-1)
new_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label,original_score
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0,5
1,1,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0,9
2,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1,13
3,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1,16
4,1,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0,8


In [14]:
# new_df['new_score'], new_df['missing_score'] = new_df.apply(lambda row: compute_new_score(row), axis=1)
# new_df.head()

# Create the two result columns empty; they are filled row by row in the next cell.
new_df['new_score'] = None
new_df['missing_score'] = None
new_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label,original_score,new_score,missing_score
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0,5,,
1,1,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,9,,
2,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1,13,,
3,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1,16,,
4,1,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0,8,,


In [15]:
# Score each row with the missing-value-aware scheme and store both totals.
for row_label, record in new_df.iterrows():
    achieved, missing = compute_new_score(record)
    new_df.at[row_label, 'new_score'] = achieved
    new_df.at[row_label, 'missing_score'] = missing
new_df.head(8)

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label,original_score,new_score,missing_score
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0,5,5,6
1,1,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,9,6,5
2,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,1,13,13,0
3,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,1.0,0.0,0.0,1,16,16,0
4,1,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0,8,8,5
5,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,-1.0,1.0,-1.0,1,11,11,1
6,1,1.0,1.0,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,-1.0,0.0,1,14,14,14
7,1,0.0,1.0,-1.0,-1.0,0.0,0.0,-1.0,0.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,14,14,8


In [16]:
# Turn the two totals into the three-way label (0 = no lupus, 1 = lupus, 2 = inconclusive).
new_df['new_label'] = new_df.apply(create_new_label, axis=1)
new_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label,original_score,new_score,missing_score,new_label
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0,5,5,6,2
1,1,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,9,6,5,2
2,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1,13,13,0,1
3,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,1,16,16,0,1
4,1,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,8,8,5,2


In [17]:
# Distribution of the recomputed labels (0 = no lupus, 1 = lupus, 2 = inconclusive).
new_df.new_label.value_counts()

1    29889
0    24944
2    15167
Name: new_label, dtype: int64

In [18]:
# Distribution of the labels that came with the original dataset, for comparison.
new_df.label.value_counts()

0    36393
1    33607
Name: label, dtype: int64

In [19]:
# Keep only the features plus the recomputed label, exposed under the name 'label'.
# NOTE: this cell is not idempotent; re-running it fails because the helper columns are already gone.
new_df = (
    new_df
    .drop(columns=['label', 'original_score', 'new_score', 'missing_score'])
    .rename(columns={'new_label': 'label'})
)
new_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,joint_involvement,proteinuria,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2
1,1,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
3,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1
4,1,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,2


In [20]:
# Persist the dataset with injected missing values (-1) and three-way labels; the row index is not written.
new_df.to_csv('../data/inconclusive_diagnosis/lupus_dataset.csv', index=False)