In [None]:
# Author: Helen Silva
# Version: 1.0
# Pre-processing of patients' clinical data

### Import the necessary libraries

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import Imputer

# enable printing of large arrays without truncation
# (recent NumPy releases reject NaN as a threshold; sys.maxsize is the documented
# value for "always print the full array")
import sys
np.set_printoptions(threshold=sys.maxsize)

### Get features and target data

In [2]:
# get the raw data (semicolon-separated file that uses a comma as decimal mark)
clinical_df = pd.read_csv('../data/article-clinical-data.csv', sep=';', decimal=',')

# target data: take an independent copy of the 'response' column so that encoding
# it later neither writes back into the raw dataframe nor triggers
# SettingWithCopy warnings
response = clinical_df['response'].copy()

# features data: every column except the target (drop returns a new dataframe,
# the raw dataframe is left untouched)
features = clinical_df.drop('response', axis=1)

# check dataframe information
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 241 entries, 0 to 240
Data columns (total 5 columns):
age_onset                240 non-null float64
age_onset_groups         240 non-null float64
febrile_seizures         236 non-null object
gender                   241 non-null object
hippocampal_sclerosis    241 non-null object
dtypes: float64(2), object(3)
memory usage: 9.5+ KB


### Check features dataframe

In [3]:
# preview the first rows of the features dataframe (categorical values are still text here)
features.head()

Unnamed: 0,age_onset,age_onset_groups,febrile_seizures,gender,hippocampal_sclerosis
0,14.0,3.0,no,male,no
1,4.0,1.0,no,female,no
2,6.0,2.0,no,female,no
3,13.0,3.0,no,female,yes
4,4.0,1.0,no,male,no


### Encode target data into numerical variables

In [4]:
# encode the target labels to an integer form: refractory -> 0, responsive -> 1
# (the name is re-bound instead of using inplace=True, so the column of the raw
# dataframe that `response` was selected from is not modified as a side effect)
response = response.replace({"refractory": 0, "responsive": 1})

# check encoded values
response.values

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

### Encode categorical features into numerical variables

In [5]:
# integer codes for every categorical label that occurs in the features:
# no / male -> 0, yes / female -> 1
label_codes = {"no": 0, "yes": 1, "male": 0, "female": 1}

# apply the codes to the whole features dataframe, modifying it in place
features.replace(to_replace=label_codes, inplace=True)

# check encoded values
features.head()

Unnamed: 0,age_onset,age_onset_groups,febrile_seizures,gender,hippocampal_sclerosis
0,14.0,3.0,0.0,0,0
1,4.0,1.0,0.0,1,0
2,6.0,2.0,0.0,1,0
3,13.0,3.0,0.0,1,1
4,4.0,1.0,0.0,0,0


### Check missing values on features dataframe

In [6]:
# count the missing (null) entries in each feature column
print(features.isnull().sum())

age_onset                1
age_onset_groups         1
febrile_seizures         5
gender                   0
hippocampal_sclerosis    0
dtype: int64


### Impute missing feature values using the most frequent value strategy

In [16]:
# Impute every column with its most frequent value using pandas only.
# (sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and removed in
# 0.22, so this cell no longer relies on it.)
# mode() returns the most frequent value(s) of each column, ignoring missing entries;
# its first row therefore holds one fill value per column (on ties this is expected
# to be the smallest candidate, matching the previous Imputer behaviour).
most_frequent_values = features.mode().iloc[0]

# create new dataframe with all missing values filled in; the column names are taken
# from `features` itself, and the float cast / fresh index reproduce the layout of
# the array-based result this cell produced before
filled_features = (
    features
    .fillna(most_frequent_values)
    .astype(float)
    .reset_index(drop=True)
)

# check filled features dataframe
filled_features.head()

Unnamed: 0,age_onset,age_onset_groups,febrile_seizures,gender,hippocampal_sclerosis
0,14.0,3.0,0.0,0.0,0.0
1,4.0,1.0,0.0,1.0,0.0
2,6.0,2.0,0.0,1.0,0.0
3,13.0,3.0,0.0,1.0,1.0
4,4.0,1.0,0.0,0.0,0.0


### Check missing values on features dataframe after handling with imputation

In [8]:
# count the missing (null) entries per column again, this time on the imputed dataframe
print(filled_features.isnull().sum())

age_onset                0
age_onset_groups         0
febrile_seizures         0
gender                   0
hippocampal_sclerosis    0
dtype: int64


### Fix data type of encoded categorical features

In [9]:
# cast each encoded categorical column of the imputed dataframe to an integer dtype
for encoded_column in ('febrile_seizures', 'gender', 'hippocampal_sclerosis'):
    filled_features[encoded_column] = filled_features[encoded_column].astype(int)

### Create dummy variables for each categorical feature using Pandas (get_dummies)

In [10]:
# get dummy variables based on dataframe column and prefix
# pandas >= 2.0 returns boolean dummy columns by default (True/False); casting to
# uint8 keeps the 0/1 indicators on every pandas version, so the displayed tables
# and the exported CSV stay numeric
febrile_seizures_dummies = pd.get_dummies(filled_features.febrile_seizures, prefix="febrile_seizures").astype('uint8')
gender_dummies = pd.get_dummies(filled_features.gender, prefix="gender").astype('uint8')
hippocampal_sclerosis_dummies = pd.get_dummies(filled_features.hippocampal_sclerosis, prefix="hippocampal_sclerosis").astype('uint8')

### Concatenate dummy variables with filled features dataframe using Pandas

In [11]:
# append the dummy columns to the right of the filled features (rows are matched on the index)
frames_to_join = [
    filled_features,
    febrile_seizures_dummies,
    gender_dummies,
    hippocampal_sclerosis_dummies,
]
final_features = pd.concat(frames_to_join, axis='columns')

# check final features dataframe
final_features.head()

Unnamed: 0,age_onset,age_onset_groups,febrile_seizures,gender,hippocampal_sclerosis,febrile_seizures_0,febrile_seizures_1,gender_0,gender_1,hippocampal_sclerosis_0,hippocampal_sclerosis_1
0,14.0,3.0,0,0,0,1,0,1,0,1,0
1,4.0,1.0,0,1,0,1,0,0,1,1,0
2,6.0,2.0,0,1,0,1,0,0,1,1,0
3,13.0,3.0,0,1,1,1,0,0,1,0,1
4,4.0,1.0,0,0,0,1,0,1,0,1,0


### Drop encoded categorical features from final dataframe

In [12]:
# remove the integer-encoded categorical columns in place; their dummy columns remain
encoded_columns = ['febrile_seizures', 'gender', 'hippocampal_sclerosis']
final_features.drop(labels=encoded_columns, axis='columns', inplace=True)

# check final features dataframe
final_features.head()

Unnamed: 0,age_onset,age_onset_groups,febrile_seizures_0,febrile_seizures_1,gender_0,gender_1,hippocampal_sclerosis_0,hippocampal_sclerosis_1
0,14.0,3.0,1,0,1,0,1,0
1,4.0,1.0,1,0,0,1,1,0
2,6.0,2.0,1,0,0,1,1,0
3,13.0,3.0,1,0,0,1,0,1
4,4.0,1.0,1,0,1,0,1,0


### Rename dummy columns to semantic names

In [13]:
# readable replacements for the numeric dummy suffixes
# (0 -> no / male, 1 -> yes / female, mirroring the integer encoding of the labels)
semantic_names = {
    'febrile_seizures_0': 'febrile_seizures_no',
    'febrile_seizures_1': 'febrile_seizures_yes',
    'gender_0': 'gender_male',
    'gender_1': 'gender_female',
    'hippocampal_sclerosis_0': 'hippocampal_sclerosis_no',
    'hippocampal_sclerosis_1': 'hippocampal_sclerosis_yes',
}

# rename the dummy columns in place
final_features.rename(columns=semantic_names, inplace=True)

# check final features dataframe
final_features.head()

Unnamed: 0,age_onset,age_onset_groups,febrile_seizures_no,febrile_seizures_yes,gender_male,gender_female,hippocampal_sclerosis_no,hippocampal_sclerosis_yes
0,14.0,3.0,1,0,1,0,1,0
1,4.0,1.0,1,0,0,1,1,0
2,6.0,2.0,1,0,0,1,1,0
3,13.0,3.0,1,0,0,1,0,1
4,4.0,1.0,1,0,1,0,1,0


### Export preprocessed data using Pickle

In [14]:
# export preprocessed data as pickle objects
# each file is opened in a with-block so it is flushed and closed as soon as the
# dump finishes, instead of leaving an open handle behind
with open("../data/preprocessed/article-clinical-data-features.p", "wb") as features_file:
    pickle.dump(final_features, features_file)

with open("../data/preprocessed/article-clinical-data-labels.p", "wb") as labels_file:
    pickle.dump(response, labels_file)

### Export preprocessed data as CSV file using Pandas

In [15]:
# build the complete dataset: target column first, then the feature columns
dataset_parts = [response, final_features]
preprocessed_dataset = pd.concat(dataset_parts, axis='columns')

# write the dataset to a CSV file, leaving the row index out of the file
output_csv_path = '../data/preprocessed/preprocessed-article-clinical-data.csv'
preprocessed_dataset.to_csv(output_csv_path, index=False)