In [1]:
import pandas as pd
import numpy as np
import random
import sys
sys.path.append('..')
from modules import utils, constants 
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline

The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.



  "stable-baselines is in maintenance mode, please use [Stable-Baselines3 (SB3)](https://github.com/DLR-RM/stable-baselines3) for an up-to-date version. You can find a [migration guide](https://stable-baselines3.readthedocs.io/en/master/guide/migration.html) in SB3 documentation."





In [2]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

In [3]:
criteria_weights = constants.CRITERIA_WEIGHTS
domains_feat_dict = constants.DOMAINS_FEAT_DICT
domains_max_scores_dict = constants.DOMAINS_MAX_SCORES_DICT

#### Old functions

In [4]:
def create_original_label(row):
    if row['ana'] == 0:
        return 0
    total_row_score = 0
    for domain in domains_feat_dict.keys():
        domain_score = get_original_domain_score(row, domain)
        total_row_score += domain_score
        if total_row_score >= 10:
            return 1
    else:
        return 0

In [5]:
#### with biopsy-proven lupus nephritis

def get_original_feat_score(row, feat):
    if feat == 'cutaneous_lupus':
        feat_score = get_original_cutaneous_lupus_score(row[feat])
    elif feat == 'biopsy_proven_lupus_nephritis':
        feat_score = get_original_bpln_score(row[feat])
    elif row[feat] <= 0:
        feat_score = 0
    else:
        feat_score = criteria_weights[feat]
    return feat_score

def get_original_bpln_score(lupus_nephritis_type):
    if lupus_nephritis_type == 0:
        return 0
    elif lupus_nephritis_type == 1:
        return criteria_weights['renal_biopsy_1']
    elif lupus_nephritis_type == 2:
        return criteria_weights['renal_biopsy_2']
    elif lupus_nephritis_type == 3:
        return criteria_weights['renal_biopsy_3']
    elif lupus_nephritis_type == 4:
        return criteria_weights['renal_biopsy_4']
    elif lupus_nephritis_type == 5:
        return criteria_weights['renal_biopsy_5']
    else:
        print('Unknown lupus nephritis type')

def get_original_cutaneous_lupus_score(cutaneous_type):
    if cutaneous_type == 0: #negative for any form of cutaneous lupus
        return 0
    elif cutaneous_type == 1: #subacute cutaneous lupus
        return criteria_weights['subacute_cutaneous_lupus']
    elif cutaneous_type == 2: #acute cutaneous lupus
        return criteria_weights['acute_cutaneous_lupus']
    elif cutaneous_type == 3: #discoid lupus
        return criteria_weights['discoid_lupus']
    else:
        print('Unknown cutaneous type')

def get_original_c3_c4_score(c3, c4): # 1 - low, 0 is not low
    if (c3 == 1) & (c4 == 1):
        return criteria_weights['low_c3_and_low_c4']
    elif (c3 == 1) | (c4 == 1):
        return criteria_weights['low_c3_or_low_c4']
    else:
        return 0

def get_original_domain_score(row, domain): # to compare
    domain_features = domains_feat_dict[domain]
    domain_score = 0
    if domain == 'complement_proteins':
        domain_score = get_original_c3_c4_score(row.low_c3, row.low_c4)
        domain_features = list(set(domain_features) - set(['low_c3', 'low_c4']))
    for feat in domain_features:
        if row[feat] >= 0:
            if feat == 'cutaneous_lupus': # to delete
                feat_score = get_original_cutaneous_lupus_score(row.cutaneous_lupus)
            elif feat == 'biopsy_proven_lupus_nephritis':
                feat_score = get_original_bpln_score(row.biopsy_proven_lupus_nephritis)
            else:
                feat_score = get_original_feat_score(row, feat)
            if feat_score > domain_score:
                domain_score = feat_score
    if domain_score > domains_max_scores_dict[domain]:
        raise Exception('The score is too large for this domain!')
    return domain_score  

def compute_original_score(row):
    if row['ana'] == 0: # negative - 0 positive - 1
        return 0
    total_row_score = 0
    for domain in domains_feat_dict.keys():
        domain_score = get_original_domain_score(row, domain)
        total_row_score += domain_score
    return total_row_score

In [6]:
def get_new_cutaneous_lupus_score(cutaneous_type): #fini
    '''
    Computes the score given a value for the cutaneous lupus feature
    '''
    if cutaneous_type == 0: #negative for any form of cutaneous lupus
        return 0, 0
    elif cutaneous_type == 1: #subacute cutaneous lupus
        return criteria_weights['subacute_cutaneous_lupus'], 0
    elif cutaneous_type == 2: #acute cutaneous lupus
        return criteria_weights['acute_cutaneous_lupus'], 0
    elif cutaneous_type == 3: #discoid lupus
        return criteria_weights['discoid_lupus'], 0
    elif cutaneous_type < 0: #if missing value
        return 0, 6 #the 6 is for the max possible value for cutaneous lupus
    else:
        raise Exception('Unknown cutaneous lupus type!')
        
        
def get_new_bpln_score(lupus_nephritis_type):
    if lupus_nephritis_type == 0:
        return 0, 0
    elif lupus_nephritis_type == 1:
        return criteria_weights['renal_biopsy_1'], 0
    elif lupus_nephritis_type == 2:
        return criteria_weights['renal_biopsy_2'], 0
    elif lupus_nephritis_type == 3:
        return criteria_weights['renal_biopsy_3'], 0
    elif lupus_nephritis_type == 4:
        return criteria_weights['renal_biopsy_4'], 0
    elif lupus_nephritis_type == 5:
        return criteria_weights['renal_biopsy_5'], 0
    elif lupus_nephritis_type < 0:
        return 0, 10
    else:
        raise Exception('Unknown lupus nephritis type')

def get_new_feat_score(row, feat):
    '''
    Computes the score for a given feature in a row
    '''
    if feat == 'cutaneous_lupus':
        feat_score, missing_score = get_new_cutaneous_lupus_score(row[feat])
    elif feat == 'biopsy_proven_lupus_nephritis':
        feat_score, missing_score = get_new_bpln_score(row[feat])
    elif row[feat] < 0:#if missing value
        feat_score, missing_score = 0, criteria_weights[feat]
    elif row[feat] == 0:
        feat_score, missing_score = 0, 0
    else:
        feat_score, missing_score = criteria_weights[feat], 0
    return feat_score, missing_score


def get_new_c3_c4_score(c3, c4):  #returns domain score and missing score
    '''
    Computes the score for the complements proteins domain
    '''
    if (c3 == 1) & (c4 == 1): # 1 is low, 0 is not low
        return criteria_weights['low_c3_and_low_c4'], 0
    elif (c3 < 0) & (c4 < 0): # if both of them are missing
        return 0, criteria_weights['low_c3_and_low_c4']
    elif ((c3 < 0)&(c4 == 0)) | ((c4<0)&(c3==0)): 
        return 0, 3
    elif ((c3 < 0)&(c4 == 1)) | ((c4 < 0)&(c3 == 1)):
        return criteria_weights['low_c3_or_low_c4'], 1
    elif (c3 == 1) | (c4 == 1):
        return criteria_weights['low_c3_or_low_c4'], 0
    elif (c3==0) & (c4==0): #c3 =0 and c4 =0
        return 0, 0
    else:
        raise Exception('Unknown C3 and C4 values!')


def get_new_domain_score(row, domain):
    '''
    Computes the score for a given domain in a specified row
    '''
    domain_features = domains_feat_dict[domain] # a list of features in that domain
    domain_score, domain_missing_score = 0, 0
    if domain == 'complement_proteins':
        domain_score, domain_missing_score = get_new_c3_c4_score(row.low_c3, row.low_c4)
    else:
        for feat in domain_features:
            feat_score, feat_missing_score = get_new_feat_score(row, feat)
            if feat_score > domain_score:
                domain_score = feat_score
                domain_missing_score = max(0, domain_missing_score - feat_score)
            elif feat_missing_score > domain_score:
                domain_missing_score = max(0, feat_missing_score - domain_score)      

    if domain_score > domains_max_scores_dict[domain]:
        raise Exception('The score is too large for this domain!')
    return domain_score, domain_missing_score


def compute_new_score(row):
    '''
    Computes the total score of a row in a dataframe
    '''
    if row['ana'] == 0: # 0=negative, 1=positive 
        return 0, 0
    total_row_score = 0
    total_missing_score = 0
    for domain in domains_feat_dict.keys():
        domain_score, domain_missing_score = get_new_domain_score(row, domain)
        total_row_score += domain_score
        total_missing_score += domain_missing_score
    return total_row_score, total_missing_score

def create_new_label(row):
    if row.new_score >=10:
        return 1 # Lupus
    elif (row.new_score + row.missing_score) >= 10:
        return 2 #Inclusive diagnosis
    elif (row.new_score + row.missing_score) < 10:
        return 0 # No lupus
    else:
        return np.nan


#### The original dataset

In [7]:
original_df = pd.read_csv('../new_data/full_lupus_dataset.csv')
original_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,1,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,1,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,1,0,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1
3,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,1
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [8]:
original_df.shape

(70000, 25)

In [9]:
original_df['original_score'] = original_df.apply(lambda row: compute_original_score(row), axis=1)
original_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label,original_score
0,1,0,0,0,0,0,0,0,0,0,...,0,1,0,0,1,0,0,0,0,5
1,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,9
2,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,1,10
3,1,1,0,1,0,0,0,0,0,0,...,0,0,0,1,1,0,0,0,1,17
4,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,14


#### Insert missing data

In [10]:
frac = 0.1

In [11]:
def insert_nans(df, col, frac):
    non_null_indices = df[df[col].notna()].index.tolist() #getting not null indices
    to_change_indices = random.sample(non_null_indices, int(frac*len(non_null_indices)))
    df.loc[to_change_indices, col] = np.nan
    return df    

In [12]:
new_df = original_df.copy(deep=True)
for column in new_df.columns:
    if column not in ['ana', 'label', 'original_score']:
        new_df = insert_nans(new_df, column, frac)
    new_df.fillna(-1, inplace=True)
new_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label,original_score
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0,5
1,1,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,9
2,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,10
3,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1,17
4,1,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,1,14


In [13]:
new_df['new_score'], new_df['missing_score'] = None, None
new_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label,original_score,new_score,missing_score
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0,5,,
1,1,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,9,,
2,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,10,,
3,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1,17,,
4,1,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,14,,


In [14]:
for i, row in new_df.iterrows():
    new_df.at[i, 'new_score'], new_df.at[i, 'missing_score'] = compute_new_score(row)
new_df.head(8)

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label,original_score,new_score,missing_score
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0,5,5,6
1,1,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,9,6,13
2,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,10,10,0
3,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1,17,17,0
4,1,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1,14,14,5
5,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,-1.0,1.0,-1.0,0.0,1,11,5,7
6,1,1.0,1.0,0.0,-1.0,0.0,0.0,-1.0,0.0,0.0,...,0.0,0.0,0.0,-1.0,0.0,0.0,1,14,14,15
7,1,0.0,1.0,-1.0,-1.0,0.0,0.0,-1.0,0.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1,14,14,8


In [15]:
new_df['new_label'] = new_df.apply(lambda row: create_new_label(row), axis=1)
new_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label,original_score,new_score,missing_score,new_label
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0,5,5,6,2
1,1,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0,9,6,13,2
2,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1,10,10,0,1
3,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1,17,17,0,1
4,1,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1,14,14,5,1


In [16]:
new_df.new_label.value_counts()

1    31276
0    24133
2    14591
Name: new_label, dtype: int64

In [17]:
new_df.label.value_counts()

0    35056
1    34944
Name: label, dtype: int64

In [18]:
new_df = new_df.drop(['label', 'original_score', 'new_score', 'missing_score'], axis=1)
new_df = new_df.rename({'new_label':'label'}, axis=1)
new_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,2
1,1,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
3,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1
4,1,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,1.0,1


In [19]:
# new_df.to_csv('../new_data/with_inconclusive_diagnosis/full_lupus_dataset.csv', index=False)

#### Splitting the data

In [20]:
X = new_df.iloc[:, 0:-1]
y = new_df.iloc[:, -1]
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.20, random_state=SEED, stratify=y)
X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.10, random_state=SEED, 
                                                  stratify=y_train_val)

In [21]:
train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
val_df = pd.concat([X_val, y_val], axis=1).reset_index(drop=True)
test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)
train_df.head()

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,proteinuria,biopsy_proven_lupus_nephritis,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,...,0.0,-1.0,-1.0,0.0,0.0,1.0,0.0,0.0,-1.0,1
1,1,0.0,0.0,0.0,1.0,-1.0,0.0,0.0,1.0,0.0,...,1.0,0.0,0.0,0.0,-1.0,0.0,1.0,0.0,1.0,1
2,1,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,-1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1
4,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-1.0,0.0,...,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [22]:
len(train_df), len(val_df), len(test_df)

(50400, 5600, 14000)

In [23]:
# train_df.to_csv('../new_data/with_inconclusive_diagnosis/train_set_basic.csv', index=False)
# val_df.to_csv('../new_data/with_inconclusive_diagnosis/val_set_constant.csv', index=False)
# test_df.to_csv('../new_data/with_inconclusive_diagnosis/test_set_constant.csv', index=False)