In [1]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split
import sys
sys.path.append('../')
from modules import utils, constants

In [2]:
# Seed both Python's `random` module and NumPy's legacy global RNG. All the
# synthetic columns in this notebook are drawn through `np.random`, so the
# generated datasets only repeat when the cells are run top to bottom.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

#### 2-feature anemia dataset

In [None]:
# Names of the two input columns of the anemia toy dataset.
two_features_anem = ['hemoglobin', 'gender']

In [None]:
def create_label_2_ft_anem(row):
    """Return the anemia label for one row of the 2-feature anemia dataset.

    Rules (gender is coded 1 = male, 0 = female):
      * hemoglobin above 13          -> 'No anemia' for either gender
      * male, hemoglobin 13 or less  -> 'Anemia'
      * female, hemoglobin below 12  -> 'Anemia'
      * female, hemoglobin above 12  -> 'No anemia'
      * female, hemoglobin exactly 12 (or NaN) -> NaN (left unlabelled)

    Any other gender code yields None.
    """
    hgb = row.hemoglobin
    if hgb > 13:
        return 'No anemia'

    if row.gender == 1:  # male: everything at or below 13 counts as anemia
        return 'Anemia'

    if row.gender == 0:  # female: threshold is 12, the exact boundary is undefined
        if hgb < 12:
            return 'Anemia'
        if hgb > 12:
            return 'No anemia'
        return np.nan

    return None
        

In [None]:
# 10,000 synthetic patients: hemoglobin drawn uniformly from [10, 15) and a
# random binary gender code; the label is then derived row by row.
feat_2_anem_df = pd.DataFrame({
    'hemoglobin': np.random.uniform(10, 15, 10000),
    'gender': np.random.choice([0, 1], 10000),
})
feat_2_anem_df['label'] = feat_2_anem_df.apply(create_label_2_ft_anem, axis=1)
feat_2_anem_df.head()

In [None]:
# Class balance of the generated anemia labels.
feat_2_anem_df.label.value_counts()

In [None]:
# feat_2_anem_df.to_csv('../data/very_simple_datasets/feats_2_anem.csv', index=False)

#### 2-feature lupus dataset

In [None]:
# Names of the two input columns of the 2-feature lupus toy dataset.
two_features = ['ana', 'anti_dsdna_antibody']

In [None]:
def create_label_2_ft(row):
    """Return the lupus label for one row of the 2-feature dataset.

    A row is 'Lupus' only when `ana` is non-zero and `anti_dsdna_antibody`
    equals 1. `ana == 0` or `anti_dsdna_antibody == 0` gives 'No lupus'.
    Any other antibody value yields None.
    """
    if row.ana == 0:
        return 'No lupus'

    antibody = row.anti_dsdna_antibody
    if antibody == 1:
        return 'Lupus'
    return 'No lupus' if antibody == 0 else None

In [None]:
# 70,000 synthetic patients with two separately drawn random binary criteria;
# the label is derived row by row.
feat_2_df = pd.DataFrame({
    'ana': np.random.choice([0, 1], 70000),
    'anti_dsdna_antibody': np.random.choice([0, 1], 70000),
})
feat_2_df['label'] = feat_2_df.apply(create_label_2_ft, axis=1)
feat_2_df.head()

In [None]:
# Class balance of the generated 2-feature lupus labels.
feat_2_df.label.value_counts()

In [None]:
# feat_2_df.to_csv('../data/very_simple_datasets/feats_2.csv', index=False)

#### 3-feature dataset

In [None]:
# Names of the three input columns of the 3-feature lupus toy dataset.
three_features = ['ana', 'anti_dsdna_antibody', 'joint_involvement']

In [None]:
def create_label_3_ft(row):
    """Return the lupus label for one row of the 3-feature dataset.

    Any of `ana`, `anti_dsdna_antibody` or `joint_involvement` being 0 gives
    'No lupus' (checked in that order). Otherwise the row is 'Lupus' when
    `joint_involvement` equals 1, and None for any other value.
    """
    if row.ana == 0 or row.anti_dsdna_antibody == 0 or row.joint_involvement == 0:
        return 'No lupus'
    return 'Lupus' if row.joint_involvement == 1 else None

In [None]:
# 10,000 synthetic patients, one random binary column per feature (drawn in
# the order given by `three_features`); the label is derived row by row.
feat_3_df = pd.DataFrame(
    {feat: np.random.choice([0, 1], 10000) for feat in three_features}
)
feat_3_df['label'] = feat_3_df.apply(create_label_3_ft, axis=1)
feat_3_df.head()

In [None]:
# Class balance of the generated 3-feature lupus labels.
feat_3_df.label.value_counts()

In [None]:
# Sanity check: rows where all three criteria are present should only carry
# the 'Lupus' label.
feat_3_df[((feat_3_df.ana==1) & (feat_3_df.anti_dsdna_antibody==1)) & (feat_3_df.joint_involvement==1)].label.value_counts()

In [None]:
# feat_3_df.to_csv('../data/very_simple_datasets/feats_3.csv', index=False)

#### 5-feature dataset

In [None]:
# Names of the five input columns of the 5-feature lupus toy dataset.
five_features = ['ana', 'anti_dsdna_antibody', 'joint_involvement', 'proteinuria', 'pericardial_effusion']

In [None]:
def create_label_5_ft(row):
    """Return the lupus label for one row of the 5-feature dataset.

    `ana == 0` always gives 'No lupus'. Otherwise the decision depends on
    `anti_dsdna_antibody`:

      * antibody absent (== 0): 'No lupus' without joint involvement; with
        it, 'Lupus' if proteinuria is non-zero, else decided by pericardial
        effusion (0 -> 'No lupus', 1 -> 'Lupus').
      * antibody present: 'Lupus' if joint involvement or proteinuria equals
        1, else decided by pericardial effusion in the same way.

    Returns None when pericardial effusion is the deciding value and is
    neither 0 nor 1.
    """
    positive, negative = 'Lupus', 'No lupus'

    def by_pericardial_effusion():
        # Final tie-breaker shared by both antibody branches.
        if row.pericardial_effusion == 0:
            return negative
        if row.pericardial_effusion == 1:
            return positive
        return None

    if row.ana == 0:
        return negative

    if row.anti_dsdna_antibody == 0:
        if row.joint_involvement == 0:
            return negative
        if row.proteinuria != 0:
            return positive
        verdict = by_pericardial_effusion()
        if verdict is not None:
            return verdict
        # Undecided here: fall through to the antibody-present checks.

    if row.joint_involvement == 1 or row.proteinuria == 1:
        return positive
    return by_pericardial_effusion()

In [None]:
# 70,000 synthetic patients, one random binary column per feature (drawn in
# the order given by `five_features`); the label is derived row by row.
feat_5_df = pd.DataFrame(
    {feat: np.random.choice([0, 1], 70000) for feat in five_features}
)
feat_5_df['label'] = feat_5_df.apply(create_label_5_ft, axis=1)
feat_5_df.head()

In [None]:
# Class balance of the generated 5-feature lupus labels.
feat_5_df.label.value_counts()

In [None]:
# Sanity check: with ana, anti-dsDNA and joint involvement all present the
# labelling function returns 'Lupus', so only that label should appear here.
feat_5_df[((feat_5_df.ana==1) & (feat_5_df.anti_dsdna_antibody==1)) & (feat_5_df.joint_involvement==1)].label.value_counts()

In [None]:
# Write the 5-feature dataset to CSV, omitting the DataFrame index.
feat_5_df.to_csv('../data/very_simple_datasets/feats_5.csv', index=False)

#### 8-feature dataset

In [None]:
# Names of the eight input columns of the 8-feature lupus toy dataset.
eight_features = [
    'ana',
    'anti_dsdna_antibody',
    'joint_involvement',
    'proteinuria',
    'pericardial_effusion',
    'non_scarring_alopecia',
    'leukopenia',
    'delirium',
]

In [None]:
# Points a criterion contributes when it is present (non-zero). These are the
# values the original hand-written decision tree tracked in its "#score N"
# comments. `ana` is an entry criterion and carries no points.
_EIGHT_FT_WEIGHTS = {
    'anti_dsdna_antibody': 6,
    'joint_involvement': 6,
    'proteinuria': 4,
    'pericardial_effusion': 5,
    'non_scarring_alopecia': 2,
    'leukopenia': 3,
    'delirium': 2,
}
# Minimum total score at which a row is labelled 'Lupus'.
_EIGHT_FT_THRESHOLD = 10


def create_label_8_ft(row):
    """Return the lupus label for one row of the 8-feature dataset.

    A row with `ana == 0` is always 'No lupus'. Otherwise every criterion in
    `_EIGHT_FT_WEIGHTS` whose value is non-zero contributes its points, and
    the row is 'Lupus' when the total reaches `_EIGHT_FT_THRESHOLD`.

    This is the same decision the previous fully unrolled if/else tree made
    (every branch there tested `== 0` versus "anything else"), expressed as
    an explicit weighted sum so the rule can be read and changed in one place.

    Parameters
    ----------
    row : object exposing the eight feature names as attributes
        (e.g. a row passed by `DataFrame.apply(..., axis=1)`).

    Returns
    -------
    str
        'Lupus' or 'No lupus'.
    """
    if row.ana == 0:
        return 'No lupus'

    score = sum(
        weight
        for feature, weight in _EIGHT_FT_WEIGHTS.items()
        if getattr(row, feature) != 0
    )
    return 'Lupus' if score >= _EIGHT_FT_THRESHOLD else 'No lupus'

In [None]:
# 70,000 synthetic patients, one random binary column per feature (drawn in
# the order given by `eight_features`); the label is derived row by row.
feat_8_df = pd.DataFrame(
    {feat: np.random.choice([0, 1], 70000) for feat in eight_features}
)
feat_8_df['label'] = feat_8_df.apply(create_label_8_ft, axis=1)
feat_8_df.head()

In [None]:
# Class balance of the generated 8-feature lupus labels.
feat_8_df.label.value_counts()

In [None]:
# Write the 8-feature dataset to CSV, omitting the DataFrame index.
feat_8_df.to_csv('../data/very_simple_datasets/feats_8.csv', index=False)

#### 11 features - 1 feature per domain

In [2]:
# Names of the eleven input columns of the 11-feature lupus toy dataset.
eleven_features = [
    'ana',
    'anti_dsdna_antibody',
    'joint_involvement',
    'proteinuria',
    'pericardial_effusion',
    'non_scarring_alopecia',
    'leukopenia',
    'delirium',
    'low_c3_and_c4',
    'fever',
    'anti_cardioliphin_antibodies',
]

In [3]:
# Points each criterion adds to a row's score when it is present; the scores
# are summed and compared against a threshold when labelling rows.
# NOTE(review): values look like the 2019 EULAR/ACR SLE criteria weights - confirm.
CRITERIA_WEIGHTS = {
    'anti_dsdna_antibody': 6,
    'joint_involvement': 6,
    'proteinuria': 4,
    'pericardial_effusion': 5,
    'non_scarring_alopecia': 2,
    'leukopenia': 3,
    'delirium': 2,
    'low_c3_and_c4': 4,
    'fever': 2,
    'anti_cardioliphin_antibodies': 2,
}

In [4]:
def get_feat_score(row, feat):
    """Return the points `feat` contributes for `row`.

    A value of zero or below contributes nothing; any other value contributes
    the feature's entry in `CRITERIA_WEIGHTS`.
    """
    return 0 if row[feat] <= 0 else CRITERIA_WEIGHTS[feat]

In [5]:
def create_label_11_ft(row):
    """Return the lupus label for one row of the 11-feature dataset.

    A row with `ana == 0` is always 'No lupus'. Otherwise the points of every
    criterion in `CRITERIA_WEIGHTS` (as computed by `get_feat_score`) are
    accumulated, and the row is 'Lupus' as soon as the running total
    reaches 10.

    Parameters
    ----------
    row : mapping-like row indexable by feature name
        (e.g. a row passed by `DataFrame.apply(..., axis=1)`).

    Returns
    -------
    str
        'Lupus' or 'No lupus'.
    """
    if row['ana'] == 0:
        return 'No lupus'

    # Iterate over the weighted criteria themselves rather than the global
    # `eleven_features` list: that name is rebound to a longer list later in
    # the notebook, whose extra entries have no weight and would raise
    # KeyError here on a re-run. `CRITERIA_WEIGHTS` holds exactly the ten
    # non-'ana' criteria, so the resulting labels are unchanged.
    total_row_score = 0
    for feature in CRITERIA_WEIGHTS:
        total_row_score += get_feat_score(row, feature)
        if total_row_score >= 10:
            return 'Lupus'
    return 'No lupus'

In [6]:
# 70,000 synthetic patients, one random binary column per feature (drawn in
# the order given by `eleven_features`); the label is derived row by row.
feat_11_df = pd.DataFrame(
    {feat: np.random.choice([0, 1], 70000) for feat in eleven_features}
)
feat_11_df['label'] = feat_11_df.apply(create_label_11_ft, axis=1)
feat_11_df.head()

Unnamed: 0,ana,anti_dsdna_antibody,joint_involvement,proteinuria,pericardial_effusion,non_scarring_alopecia,leukopenia,delirium,low_c3_and_c4,fever,anti_cardioliphin_antibodies,label
0,0,0,1,1,1,0,1,1,1,0,1,No lupus
1,1,1,0,1,1,0,1,0,1,0,0,Lupus
2,0,1,0,0,1,1,0,0,1,1,1,No lupus
3,0,0,1,0,0,1,1,1,1,1,1,No lupus
4,0,1,1,1,1,0,0,0,0,0,0,No lupus


In [7]:
# Class balance of the generated 11-feature lupus labels.
feat_11_df.label.value_counts()

No lupus    38047
Lupus       31953
Name: label, dtype: int64

In [8]:
# Write the 11-feature dataset to CSV, omitting the DataFrame index.
feat_11_df.to_csv('../data/very_simple_datasets/feats_11.csv', index=False)

#### 22-feature dataset

In [None]:
# Names of the 22 input columns of the 22-feature dataset.
# NOTE: this rebinds `eleven_features`, replacing the 11-feature list defined
# earlier in the notebook; the name is kept because other cells may use it.
# A comma was missing after 'thrombocytopenia', which made Python concatenate
# it with the next string literal and silently dropped one feature.
eleven_features = ['ana', 'non_scarring_alopecia', 'anti_dsdna_antibody', 'joint_involvement', 'proteinuria',
                   'pericardial_effusion', 'leukopenia', 'delirium', 'low_c3', 'low_c4', 'fever', 'thrombocytopenia',
                   'anti_cardioliphin_antibodies', 'pleural_effusion', 'psychosis', 'seizure',
                   'lupus_anti_coagulant', 'anti_β2gp1_antibodies', 'anti_smith_antibody', 'oral_ulcers',
                   'auto_immune_hemolysis', 'acute_pericarditis']