In [1]:
import pandas as pd
import numpy as np
import random
import seaborn as sns
from scipy.stats import truncnorm
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../')
from modules import utils, constants

In [2]:
# Seed both the stdlib and the NumPy global RNGs from the shared constant in
# modules.constants so the sampled dataset below is reproducible across runs.
SEED = constants.SEED
random.seed(SEED)
np.random.seed(SEED)

In [3]:
# Columns of the generated dataset, in generation order. 'ana' is the entry
# criterion; every other name is sampled as a 0/1 column.
#
# Disabled 8-feature variant:
# FEATURES = ['ana', 'anti_dsdna_antibody', 'joint_involvement', 'proteinuria', 'pericardial_effusion',
#             'non_scarring_alopecia', 'leukopenia', 'delirium']
FEATURES = [
    'ana',
    'anti_dsdna_antibody',
    'joint_involvement',
    'proteinuria',
    'pericardial_effusion',
]

In [4]:
# Per-feature base fraction of positives; get_feat_probabilities divides it by
# FEAT_FRACTION when sampling with feat_type='positive'.
#
# Disabled 8-feature variant:
# FEATURE_FRACTIONS = {'leukopenia':0.50, 'delirium':0.49, 'non_scarring_alopecia':0.85, 'pericardial_effusion':0.50,
#                      'joint_involvement':0.69, 'proteinuria':0.60, 'anti_dsdna_antibody':0.70}
FEATURE_FRACTIONS = {
    'pericardial_effusion': 0.50,
    'joint_involvement': 0.69,
    'proteinuria': 0.60,
    'anti_dsdna_antibody': 0.70,
}

In [5]:
# Score contributed by each criterion when it is present (see get_feat_score);
# also the key used to look up WEIGHT_PROB_DICT for feat_type='negative'.
#
# Disabled 8-feature variant:
# CRITERIA_WEIGHTS = {'leukopenia':3, 'delirium':2, 'non_scarring_alopecia':2, 'pericardial_effusion':5,
#                     'joint_involvement':6, 'proteinuria':4, 'anti_dsdna_antibody':6}
CRITERIA_WEIGHTS = {
    'pericardial_effusion': 5,
    'joint_involvement': 6,
    'proteinuria': 4,
    'anti_dsdna_antibody': 6,
}

In [6]:
# Maps a criterion weight (a CRITERIA_WEIGHTS value) to the probability of a
# positive that get_feat_probabilities returns for feat_type='negative'.
WEIGHT_PROB_DICT = {8:0.1, 6:0.2, 5:0.25, 4:0.3, 3:0.35, 2:0.4}

In [7]:
# Divisor applied to FEATURE_FRACTIONS values in get_feat_probabilities when
# feat_type='positive'.
FEAT_FRACTION = 1.2

In [8]:
def get_feat_probabilities(feature, feat_type='positive'):
    '''
    Look up the sampling probabilities of a binary feature.

    feature: key into FEATURE_FRACTIONS (feat_type='positive') or into
        CRITERIA_WEIGHTS (feat_type='negative').
    feat_type: 'positive' -> the positive probability is
        FEATURE_FRACTIONS[feature] / FEAT_FRACTION;
        'negative' -> the positive probability is
        WEIGHT_PROB_DICT[CRITERIA_WEIGHTS[feature]].

    Returns the tuple (negative_probability, positive_probability). For any
    other feat_type a message is printed and None is returned.
    Original author's note: not meant for renal class, C3 and C4.
    '''
    if feat_type == 'negative':
        positive_share = WEIGHT_PROB_DICT[CRITERIA_WEIGHTS[feature]]
    elif feat_type == 'positive':
        # The original author flagged this scaling with "change this one".
        positive_share = FEATURE_FRACTIONS[feature]/FEAT_FRACTION
    else:
        print('Unknown feature type. Should either be "positive" or "negative"')
        return None
    return 1-positive_share, positive_share

In [9]:
def get_feat_score(row, feat):
    '''
    Score a single criterion of a row.

    Returns 0 when row[feat] <= 0, otherwise the weight CRITERIA_WEIGHTS[feat].
    '''
    return 0 if row[feat] <= 0 else CRITERIA_WEIGHTS[feat]

In [10]:
def create_label_8_ft(row):
    '''
    Label a row of the 8-feature dataset as 'Lupus' or 'No lupus'.

    A row with row.ana == 0 is always 'No lupus'. Otherwise each criterion
    whose value is not equal to 0 adds its weight to a running score, and the
    row is 'Lupus' when the score reaches 10.

    Criteria are read from the row lazily and in a fixed order; evaluation
    stops as soon as the outcome is decided (threshold already reached, or
    no longer reachable with the criteria not yet read), so attributes that
    cannot change the result are never accessed.
    '''
    if row.ana == 0:
        return 'No lupus'

    # (attribute, weight) in the order the attributes are inspected.
    weighted_criteria = (
        ('anti_dsdna_antibody', 6),
        ('joint_involvement', 6),
        ('proteinuria', 4),
        ('pericardial_effusion', 5),
        ('leukopenia', 3),
        ('delirium', 2),
        ('non_scarring_alopecia', 2),
    )
    threshold = 10

    score = 0
    # Largest amount the criteria not yet read could still add.
    still_available = sum(weight for _, weight in weighted_criteria)

    for criterion, weight in weighted_criteria:
        if score >= threshold:
            return 'Lupus'
        if score + still_available < threshold:
            return 'No lupus'
        still_available -= weight
        if getattr(row, criterion) == 0:
            continue
        score += weight

    return 'Lupus' if score >= threshold else 'No lupus'

In [11]:
def create_label_5_ft(row):
    '''
    Label a row of the 5-feature dataset as 'Lupus' or 'No lupus'.

    Decision rules (a criterion is "present" when its value is not 0):
      * ana absent                         -> 'No lupus'
      * anti-dsDNA absent:
          - joint involvement absent       -> 'No lupus'
          - proteinuria present            -> 'Lupus'
          - otherwise decided by pericardial effusion
      * anti-dsDNA present:
          - joint involvement present      -> 'Lupus'
          - proteinuria present            -> 'Lupus'
          - otherwise decided by pericardial effusion

    Every path returns a label. Presence is tested the same way on every
    criterion, so a value other than 0/1 can neither fall through from the
    anti-dsDNA-absent branch into the anti-dsDNA-present checks nor make
    the function return None. Results for 0/1 inputs are unchanged.
    Attributes are read only when needed to reach a decision.
    '''
    if row.ana == 0:
        return 'No lupus'

    if row.anti_dsdna_antibody == 0:
        if row.joint_involvement == 0:
            return 'No lupus'
        if not (row.proteinuria == 0):
            return 'Lupus'
        return 'No lupus' if row.pericardial_effusion == 0 else 'Lupus'

    if not (row.joint_involvement == 0):
        return 'Lupus'
    if not (row.proteinuria == 0):
        return 'Lupus'
    return 'No lupus' if row.pericardial_effusion == 0 else 'Lupus'

#### ANA positive dataset

In [12]:
# ANA-positive cohort: 50000 rows, every one with ana == 1.
ana_df = pd.DataFrame({'ana': [1] * 50000})
ana_df.head()

Unnamed: 0,ana
0,1
1,1
2,1
3,1
4,1


In [13]:
# Draw every non-ANA criterion independently as a 0/1 column, using the
# feat_type='positive' probabilities.
for feature in FEATURES:
    if feature == 'ana':
        continue
    neg_frac, pos_frac = get_feat_probabilities(feature)
    ana_df[feature] = np.random.choice([0, 1], size=len(ana_df), p=[neg_frac, pos_frac])
ana_df.head()

Unnamed: 0,ana,anti_dsdna_antibody,joint_involvement,proteinuria,pericardial_effusion
0,1,0,1,1,0
1,1,1,1,1,1
2,1,1,0,0,1
3,1,1,1,0,1
4,1,0,0,0,1


In [14]:
def create_label(row):
    '''
    Label a row as 'Lupus' or 'No lupus' from its weighted criteria.

    A row with row['ana'] == 0 is 'No lupus'. Otherwise the get_feat_score
    values of every feature in FEATURES except 'ana' are accumulated in order,
    and 'Lupus' is returned as soon as the running total reaches 10; if it
    never does, the row is 'No lupus'.
    '''
    if row['ana'] == 0:
        return 'No lupus'

    running_total = 0
    for criterion in FEATURES:
        if criterion == 'ana':
            continue
        running_total += get_feat_score(row, criterion)
        if running_total >= 10:
            return 'Lupus'
    return 'No lupus'

In [15]:
# Label every ANA-positive row with the score-based rule.
ana_df['label'] = ana_df.apply(create_label, axis=1)
# Disabled alternative using the hand-written 5-feature decision rules:
# ana_df['dt_label'] = ana_df.apply(lambda row: create_label_5_ft(row), axis=1)
ana_df.head()

Unnamed: 0,ana,anti_dsdna_antibody,joint_involvement,proteinuria,pericardial_effusion,label
0,1,0,1,1,0,Lupus
1,1,1,1,1,1,Lupus
2,1,1,0,0,1,Lupus
3,1,1,1,0,1,Lupus
4,1,0,0,0,1,No lupus


In [16]:
# Class balance inside the ANA-positive cohort.
ana_df.label.value_counts()

Lupus       34161
No lupus    15839
Name: label, dtype: int64

In [17]:
# ana_df.dt_label.value_counts()

#### ANA negative dataset

In [18]:
# ANA-negative cohort: 20000 rows, every one with ana == 0.
no_ana_df = pd.DataFrame({'ana': [0] * 20000})
no_ana_df.head()

Unnamed: 0,ana
0,0
1,0
2,0
3,0
4,0


In [19]:
# Draw every non-ANA criterion independently as a 0/1 column, using the
# feat_type='negative' probabilities; all rows of this cohort are 'No lupus'.
for feature in FEATURES:
    if feature == 'ana':
        continue
    neg_frac, pos_frac = get_feat_probabilities(feature, feat_type='negative')
    no_ana_df[feature] = np.random.choice([0, 1], size=len(no_ana_df), p=[neg_frac, pos_frac])
no_ana_df['label'] = 'No lupus'
no_ana_df.head()

Unnamed: 0,ana,anti_dsdna_antibody,joint_involvement,proteinuria,pericardial_effusion,label
0,0,0,0,0,0,No lupus
1,0,0,0,0,0,No lupus
2,0,0,0,0,1,No lupus
3,0,0,0,0,0,No lupus
4,0,0,0,0,1,No lupus


#### Merging the 2 dataframes

In [20]:
# Stack the ANA-positive rows on top of the ANA-negative rows; each frame
# keeps its own index here (it is rebuilt after shuffling in a later cell).
lupus_df = pd.concat([ana_df, no_ana_df])
len(lupus_df)

70000

In [21]:
# Shuffle all rows (frac=1) with a fixed random_state, then rebuild a clean
# 0..n-1 index. NOTE: this rebinds lupus_df, so re-running this cell alone
# shuffles the already-shuffled frame again.
lupus_df = lupus_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
lupus_df.label.value_counts()

No lupus    35839
Lupus       34161
Name: label, dtype: int64

In [22]:
# Preview the first rows of the shuffled, merged dataset.
lupus_df.head()

Unnamed: 0,ana,anti_dsdna_antibody,joint_involvement,proteinuria,pericardial_effusion,label
0,1,1,1,1,1,Lupus
1,1,1,1,0,0,Lupus
2,1,0,1,1,1,Lupus
3,1,1,1,1,1,Lupus
4,1,1,0,1,0,Lupus


#### Saving the dataset

In [23]:
# Write the dataset as CSV without the index column. The target directory
# must already exist; to_csv does not create it.
lupus_df.to_csv('../data/very_simple_datasets/lupus_5_features.csv', index=False)