In [1]:
import os
import random
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

sys.path.append('../')
from modules import utils, constants

%matplotlib inline

In [2]:
# Use the project-wide seed for every random number generator this notebook relies on.
SEED = constants.SEED
np.random.seed(SEED)  # NumPy's legacy global RNG
random.seed(SEED)     # standard-library RNG

#### Missingness function

In [3]:
def insert_nans(df, col, frac):
    """Replace a random fraction of the non-null values in one column with NaN.

    Rows are drawn with the standard-library ``random`` module, so the
    selection is reproducible after calling ``random.seed``.

    Args:
        df: DataFrame to modify. It is mutated in place.
        col: Label of the column that receives the NaNs.
        frac: Fraction of the currently non-null entries of ``col`` to
            replace. The resulting count is truncated with ``int``.

    Returns:
        The same ``df`` object that was passed in.

    Raises:
        ValueError: Propagated from ``random.sample`` when the computed count
            is negative or larger than the number of non-null entries.
    """
    # Sample row *positions* instead of index labels: label-based assignment
    # touches every row that shares a sampled label, which overshoots `frac`
    # whenever the index contains duplicates. For a unique index the rows
    # selected are the same as with label-based sampling.
    non_null_positions = np.flatnonzero(df[col].notna().to_numpy()).tolist()
    n_to_blank = int(frac * len(non_null_positions))
    to_change_positions = random.sample(non_null_positions, n_to_blank)
    df.iloc[to_change_positions, df.columns.get_loc(col)] = np.nan
    return df

#### The data

In [4]:
# Load the dataset variant used by this notebook. To work with the original
# dataset instead, read '../data/orig/lupus_dataset.csv'.
df = pd.read_csv(filepath_or_buffer='../data/inconclusive_diagnosis/lupus_dataset.csv')
df.head(n=5)

Unnamed: 0,ana,fever,leukopenia,thrombocytopenia,auto_immune_hemolysis,delirium,psychosis,seizure,non_scarring_alopecia,oral_ulcers,...,joint_involvement,proteinuria,anti_cardioliphin_antibodies,anti_β2gp1_antibodies,lupus_anti_coagulant,low_c3,low_c4,anti_dsdna_antibody,anti_smith_antibody,label
0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,2
1,1,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,2
2,1,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1
3,1,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1
4,1,1.0,-1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,-1.0,0.0,0.0,0.0,0.0,0.0,2


In [5]:
# Map textual diagnosis labels to integer codes; values that are not keys of
# the mapping are left as they are.
class_dict = {'No lupus':0, 'Lupus':1, 'Inconclusive diagnosis':2}
df['label'] = df['label'].replace(class_dict)

# Features are every column except the last one; the target is the last column.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Hold out 20% for testing, then 10% of the remainder for validation,
# stratifying on the label both times.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=SEED
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.10, stratify=y_train_val, random_state=SEED
)

# Re-assemble each split as a single frame (features + label) with a fresh 0..n-1 index.
training_df, validation_df, testing_df = (
    pd.concat([features, target], axis=1).reset_index(drop=True)
    for features, target in ((X_train, y_train), (X_val, y_val), (X_test, y_test))
)

# Keep plain NumPy versions of every split as well.
X_train, X_val, X_test = (np.array(part) for part in (X_train, X_val, X_test))
y_train, y_val, y_test = (np.array(part) for part in (y_train, y_val, y_test))

tuple(part.shape for part in (X_train, X_val, X_test, y_train, y_val, y_test))

((50400, 23), (5600, 23), (14000, 23), (50400,), (5600,), (14000,))

In [None]:
# Fraction of the non-null values in each feature column that is replaced by NaN;
# it is also interpolated into the output path of the training set.
frac = 0.5

In [None]:
# Blank out a `frac` share of every feature column; 'ana' and 'label' stay complete.
# NOTE: insert_nans modifies the frame it receives, so running this cell again on an
# already-processed training_df removes a further `frac` of the remaining values.
# Re-run the split cell first to start from complete data.
for column in training_df.columns:
    if column not in ['ana', 'label']:
        training_df = insert_nans(training_df, column, frac)

# Write under the same dataset directory that the data was loaded from and that the
# validation/testing sets are written to, so that this export does not land in the
# directory tree of a different dataset variant.
training_dir = f'../data/inconclusive_diagnosis/missingness/{frac}'
# to_csv does not create missing parent directories; a new `frac` value needs its folder.
os.makedirs(training_dir, exist_ok=True)
training_df.to_csv(f'{training_dir}/training_set.csv', index=False)

In [7]:
# The validation and testing splits are saved as they are, without any NaN injection.
holdout_targets = {
    '../data/inconclusive_diagnosis/missingness/0/validation_set.csv': validation_df,
    '../data/inconclusive_diagnosis/missingness/0/testing_set.csv': testing_df,
}
for out_path, split_df in holdout_targets.items():
    split_df.to_csv(out_path, index=False)

In [None]:
# Number of NaN entries in each column of the training set.
training_df.isna().sum()