## Remove Nan value

In [1]:
import pandas as pd

df = pd.read_csv("ukb_sample.csv")
df.info()


  df = pd.read_csv("ukb_sample.csv")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 273353 entries, 0 to 273352
Columns: 209 entries, Unnamed: 0 to dm
dtypes: float64(193), int64(3), object(13)
memory usage: 435.9+ MB


In [2]:
df[['dateofattending', "endometri"]]

Unnamed: 0,dateofattending,endometri
0,2008-06-03,
1,2008-04-08,
2,2007-12-19,
3,2008-05-09,
4,2008-03-28,
...,...,...
273348,2009-07-22,
273349,2008-03-27,
273350,2010-06-17,
273351,2010-02-17,


In [3]:
df.dateofattending  = df.dateofattending.map(lambda x : x.replace("-", "")).astype(int)
df.endometri = df.endometri.map(lambda x: str(x).replace("-", "").replace("nan", "0")).astype(int)
df[['dateofattending', "endometri"]]

Unnamed: 0,dateofattending,endometri
0,20080603,0
1,20080408,0
2,20071219,0
3,20080509,0
4,20080328,0
...,...,...
273348,20090722,0
273349,20080327,0
273350,20100617,0
273351,20100217,0


In [4]:
label = df.dateofattending - df.endometri
label

0         20080603
1         20080408
2         20071219
3         20080509
4         20080328
            ...   
273348    20090722
273349    20080327
273350    20100617
273351    20100217
273352    20090514
Length: 273353, dtype: int64

In [5]:
label.loc[df.endometri == 0] = 0
label

0         0
1         0
2         0
3         0
4         0
         ..
273348    0
273349    0
273350    0
273351    0
273352    0
Length: 273353, dtype: int64

In [6]:
label.loc[label > 0] = 1
label.loc[label < 0] = 2
label

0         0
1         0
2         0
3         0
4         0
         ..
273348    0
273349    0
273350    0
273351    0
273352    0
Length: 273353, dtype: int64

In [7]:
n0 = label[label == 0].count()
n1 = label[label == 1].count()
n2 = label[label == 2].count()
print(f"health: {n0}, sick_after: {n1}, sick_before: {n2}")
print(n1 + n2 + n0)

health: 263414, sick_after: 8372, sick_before: 1567
273353


## Remove column with date info

In [8]:
for i, key in enumerate(df.keys()):
    df[key].fillna(-1, inplace=True)

In [9]:
from collections import defaultdict
data_type = set()
data_type_dict = defaultdict(int)

for key in df.keys():
    data_type.add(df[key].dtype)
    data_type_dict[str(df[key].dtype)] += 1
    if df[key].dtype == "object":
        df.drop(columns=[key], inplace=True)
        print(key)

print(data_type)
print(data_type_dict.items())

alldementia
alzhe1
alzhe2
vasdementia1
vasdementia2
frontdementia
allparkin
parkinson1
parkinson2
death
ectopicpreg
{dtype('int64'), dtype('O'), dtype('float64')}
dict_items([('int64', 5), ('float64', 193), ('object', 11)])


In [10]:
df.drop(columns=["endometri", "dateofattending"], inplace=True)
df.drop(columns=["Unnamed: 0", "eid"], inplace=True)

## Save numpy array for training / valid / test

In [11]:
df.insert(df.shape[1], "label", label)

In [12]:
data = df.to_numpy()

In [13]:
import numpy as np
np.save("ukb_array_3_classes.npy", data)

In [16]:
def split_data(array, save=False):
    row_total = array.shape[0]
    num_train_sample = int(row_total * 0.7)
    num_valid_sample = int(row_total * 0.1)
    num_test_sample = int(row_total * 0.2)
    print(
        f"num of training data : {num_train_sample}, " +
        f"valid data : {num_valid_sample}, " +
        f"test data : {num_test_sample}, " +
        f"total num of samples : {row_total}"
    )
    row_sequence = np.arange(row_total)
    np.random.shuffle(row_sequence)
    train_arr = array[row_sequence[0: num_train_sample], :]
    valid_arr = array[row_sequence[num_train_sample: num_train_sample + num_valid_sample], :]
    test_arr = array[row_sequence[-num_test_sample: ], :]
    if save:
        np.save("ukb_array_train.npy", train_arr)
        np.save("ukb_array_valid.npy", valid_arr)
        np.save("ukb_array_test.npy", test_arr)
    return train_arr, valid_arr, test_arr

train_arr, valid_arr, test_arr = split_data(data)

num of training data : 191347, valid data : 27335, test data : 54670, total num of samples : 273353


## Data split and distribute

In [22]:
def distribute(data_file):
    data = np.load(data_file)
    cla_0 = (data[:, -1] == 0).sum()
    cla_1 = (data[:, -1] == 1).sum()
    cla_2 = (data[:, -1] == 2).sum()
    print(f"health : {cla_0}, sick_after : {cla_1}, sick_before : {cla_2}, total : {len(data)}")

distribute("./ukb_array_valid.npy")


health : 26377, sick_after : 806, sick_before : 152, total : 27335


In [23]:
distribute("./ukb_array_test.npy")


health : 52639, sick_after : 1703, sick_before : 328, total : 54670


In [24]:
distribute("./ukb_array_train.npy")


health : 184397, sick_after : 5863, sick_before : 1087, total : 191347
