In [1]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek


from typing import Tuple

from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier

from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

Data Preprocessing

In [2]:
#import dataset 
pbc_data = pd.read_csv(r'pbc.csv')
pbc_data 

Unnamed: 0,ID,N_Days,Status,Drug,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,1,400,D,D-penicillamine,21464,F,Y,Y,Y,Y,14.5,261.0,2.60,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,2,4500,C,D-penicillamine,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,3,1012,D,D-penicillamine,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.10,55.0,151.0,12.0,4.0
3,4,1925,D,D-penicillamine,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,5,1504,CL,Placebo,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,414,681,D,,24472,F,,,,N,1.2,,2.96,,,,,174.0,10.9,3.0
414,415,1103,C,,14245,F,,,,N,0.9,,3.83,,,,,180.0,11.2,4.0
415,416,1055,C,,20819,F,,,,N,1.6,,3.42,,,,,143.0,9.9,3.0
416,417,691,C,,21185,F,,,,N,0.8,,3.75,,,,,269.0,10.4,3.0


In [3]:
#remove features which are not relevant in determine the cirrhotic stage
pbc_data = pbc_data.drop(["ID"], axis=1) 
pbc_data = pbc_data.drop(["N_Days"], axis=1)
pbc_data = pbc_data.drop(["Drug"], axis=1) 
pbc_data = pbc_data.drop(["Status"], axis=1)
pbc_data 

Unnamed: 0,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,21464,F,Y,Y,Y,Y,14.5,261.0,2.60,156.0,1718.0,137.95,172.0,190.0,12.2,4.0
1,20617,F,N,Y,Y,N,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,3.0
2,25594,M,N,N,N,S,1.4,176.0,3.48,210.0,516.0,96.10,55.0,151.0,12.0,4.0
3,19994,F,N,Y,Y,S,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,4.0
4,13918,F,N,Y,Y,N,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,24472,F,,,,N,1.2,,2.96,,,,,174.0,10.9,3.0
414,14245,F,,,,N,0.9,,3.83,,,,,180.0,11.2,4.0
415,20819,F,,,,N,1.6,,3.42,,,,,143.0,9.9,3.0
416,21185,F,,,,N,0.8,,3.75,,,,,269.0,10.4,3.0


In [4]:
# datatypes
pbc_data.dtypes

Age                int64
Sex               object
Ascites           object
Hepatomegaly      object
Spiders           object
Edema             object
Bilirubin        float64
Cholesterol      float64
Albumin          float64
Copper           float64
Alk_Phos         float64
SGOT             float64
Tryglicerides    float64
Platelets        float64
Prothrombin      float64
Stage            float64
dtype: object

In [5]:
# combine into two classes
pbc_data["Stage"]=pbc_data["Stage"].replace([3.0, 2.0, 1.0], 0.0)
pbc_data["Stage"]=pbc_data["Stage"].replace([4.0], 1.0)

In [6]:
# separate categorical features and numerical features
categorical_features = []
numerical_features = []
for i in pbc_data.columns:
    if pbc_data[i].nunique() > 3:
        numerical_features.append(i)
    else:
        categorical_features.append(i)
print("Categorical features: ", categorical_features)
print("Numerical features: ", numerical_features)

Categorical features:  ['Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Edema', 'Stage']
Numerical features:  ['Age', 'Bilirubin', 'Cholesterol', 'Albumin', 'Copper', 'Alk_Phos', 'SGOT', 'Tryglicerides', 'Platelets', 'Prothrombin']


In [7]:
# lable encoding
pbc_data[categorical_features] = pbc_data [categorical_features].apply(
    lambda series: pd.Series(LabelEncoder().fit_transform(series[series.notnull()]), 
                             index= series[series.notnull()].index))
pbc_data

Unnamed: 0,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,21464,0,1.0,1.0,1.0,2,14.5,261.0,2.60,156.0,1718.0,137.95,172.0,190.0,12.2,1.0
1,20617,0,0.0,1.0,1.0,0,1.1,302.0,4.14,54.0,7394.8,113.52,88.0,221.0,10.6,0.0
2,25594,1,0.0,0.0,0.0,1,1.4,176.0,3.48,210.0,516.0,96.10,55.0,151.0,12.0,1.0
3,19994,0,0.0,1.0,1.0,1,1.8,244.0,2.54,64.0,6121.8,60.63,92.0,183.0,10.3,1.0
4,13918,0,0.0,1.0,1.0,0,3.4,279.0,3.53,143.0,671.0,113.15,72.0,136.0,10.9,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,24472,0,,,,0,1.2,,2.96,,,,,174.0,10.9,0.0
414,14245,0,,,,0,0.9,,3.83,,,,,180.0,11.2,1.0
415,20819,0,,,,0,1.6,,3.42,,,,,143.0,9.9,0.0
416,21185,0,,,,0,0.8,,3.75,,,,,269.0,10.4,0.0


In [8]:
# datatypes
pbc_data.dtypes

Age                int64
Sex                int32
Ascites          float64
Hepatomegaly     float64
Spiders          float64
Edema              int32
Bilirubin        float64
Cholesterol      float64
Albumin          float64
Copper           float64
Alk_Phos         float64
SGOT             float64
Tryglicerides    float64
Platelets        float64
Prothrombin      float64
Stage            float64
dtype: object

In [9]:
# normalization
target = pbc_data["Stage"]
features = pbc_data.drop(["Stage"], axis = 1)
scaler = MinMaxScaler()
pbc_data_normalized = scaler.fit_transform(features)
pbc_data_normalized = pd.DataFrame(data = pbc_data_normalized, columns = features.columns)
pbc_data_normalized = pbc_data_normalized.join(target)

pbc_data_normalized

Unnamed: 0,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,0.622822,0.0,1.0,1.0,1.0,1.0,0.512635,0.085196,0.238806,0.260274,0.105279,0.258993,0.246018,0.194234,0.355556,1.0
1,0.578364,0.0,0.0,1.0,1.0,0.0,0.028881,0.109970,0.813433,0.085616,0.523509,0.202298,0.097345,0.241275,0.177778,0.0
2,0.839597,1.0,0.0,0.0,0.0,0.5,0.039711,0.033837,0.567164,0.352740,0.016724,0.161871,0.038938,0.135053,0.333333,1.0
3,0.545664,0.0,0.0,1.0,1.0,0.5,0.054152,0.074924,0.216418,0.102740,0.429723,0.079554,0.104425,0.183612,0.144444,1.0
4,0.226748,0.0,0.0,1.0,1.0,0.0,0.111913,0.096073,0.585821,0.238014,0.028143,0.201439,0.069027,0.112291,0.211111,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.780705,0.0,,,,0.0,0.032491,,0.373134,,,,,0.169954,0.211111,0.0
414,0.243911,0.0,,,,0.0,0.021661,,0.697761,,,,,0.179059,0.244444,1.0
415,0.588967,0.0,,,,0.0,0.046931,,0.544776,,,,,0.122914,0.100000,0.0
416,0.608178,0.0,,,,0.0,0.018051,,0.667910,,,,,0.314112,0.155556,0.0


In [10]:
#check missing value
pbc_data_normalized.isnull().sum()

Age                0
Sex                0
Ascites          106
Hepatomegaly     106
Spiders          106
Edema              0
Bilirubin          0
Cholesterol      134
Albumin            0
Copper           108
Alk_Phos         106
SGOT             106
Tryglicerides    136
Platelets         11
Prothrombin        2
Stage              6
dtype: int64

In [11]:
# drop the whole row where the stage is empyty
pbc_data_normalized.dropna(subset=["Stage"], inplace = True)
pbc_data_normalized

Unnamed: 0,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,0.622822,0.0,1.0,1.0,1.0,1.0,0.512635,0.085196,0.238806,0.260274,0.105279,0.258993,0.246018,0.194234,0.355556,1.0
1,0.578364,0.0,0.0,1.0,1.0,0.0,0.028881,0.109970,0.813433,0.085616,0.523509,0.202298,0.097345,0.241275,0.177778,0.0
2,0.839597,1.0,0.0,0.0,0.0,0.5,0.039711,0.033837,0.567164,0.352740,0.016724,0.161871,0.038938,0.135053,0.333333,1.0
3,0.545664,0.0,0.0,1.0,1.0,0.5,0.054152,0.074924,0.216418,0.102740,0.429723,0.079554,0.104425,0.183612,0.144444,1.0
4,0.226748,0.0,0.0,1.0,1.0,0.0,0.111913,0.096073,0.585821,0.238014,0.028143,0.201439,0.069027,0.112291,0.211111,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
413,0.780705,0.0,,,,0.0,0.032491,,0.373134,,,,,0.169954,0.211111,0.0
414,0.243911,0.0,,,,0.0,0.021661,,0.697761,,,,,0.179059,0.244444,1.0
415,0.588967,0.0,,,,0.0,0.046931,,0.544776,,,,,0.122914,0.100000,0.0
416,0.608178,0.0,,,,0.0,0.018051,,0.667910,,,,,0.314112,0.155556,0.0


In [12]:
# imputation
imputer = KNNImputer(n_neighbors = 10)
pbc_data_impute = pd.DataFrame(imputer.fit_transform(pbc_data_normalized), columns = pbc_data_normalized.columns)
c1 = ['Sex', 'Ascites', 'Hepatomegaly', 'Spiders', 'Stage']
pbc_data_impute[c1] = pbc_data_impute[c1].values.round(0)
pbc_data_impute[c1] = pbc_data_impute[c1].astype(int)

pbc_data_impute

Unnamed: 0,Age,Sex,Ascites,Hepatomegaly,Spiders,Edema,Bilirubin,Cholesterol,Albumin,Copper,Alk_Phos,SGOT,Tryglicerides,Platelets,Prothrombin,Stage
0,0.622822,0,1,1,1,1.0,0.512635,0.085196,0.238806,0.260274,0.105279,0.258993,0.246018,0.194234,0.355556,1
1,0.578364,0,0,1,1,0.0,0.028881,0.109970,0.813433,0.085616,0.523509,0.202298,0.097345,0.241275,0.177778,0
2,0.839597,1,0,0,0,0.5,0.039711,0.033837,0.567164,0.352740,0.016724,0.161871,0.038938,0.135053,0.333333,1
3,0.545664,0,0,1,1,0.5,0.054152,0.074924,0.216418,0.102740,0.429723,0.079554,0.104425,0.183612,0.144444,1
4,0.226748,0,0,1,1,0.0,0.111913,0.096073,0.585821,0.238014,0.028143,0.201439,0.069027,0.112291,0.211111,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,0.780705,0,0,1,0,0.0,0.032491,0.157885,0.373134,0.097260,0.081122,0.220724,0.156637,0.169954,0.211111,0
408,0.243911,0,0,1,0,0.0,0.021661,0.125982,0.697761,0.107021,0.089491,0.219691,0.092035,0.179059,0.244444,1
409,0.588967,0,0,0,0,0.0,0.046931,0.125680,0.544776,0.071747,0.056345,0.184312,0.125841,0.122914,0.100000,0
410,0.608178,0,0,0,0,0.0,0.018051,0.083625,0.667910,0.053082,0.038656,0.125551,0.117522,0.314112,0.155556,0


In [13]:
#check missing value
pbc_data_impute.isnull().sum()

Age              0
Sex              0
Ascites          0
Hepatomegaly     0
Spiders          0
Edema            0
Bilirubin        0
Cholesterol      0
Albumin          0
Copper           0
Alk_Phos         0
SGOT             0
Tryglicerides    0
Platelets        0
Prothrombin      0
Stage            0
dtype: int64

In [14]:
y = pbc_data_impute["Stage"]
X = pbc_data_impute.drop(["Stage"], axis=1)

In [15]:
# train test split, 20% test, 80% train
y = pbc_data_impute["Stage"]
X = pbc_data_impute.drop(["Stage"], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state = 1)

In [16]:
print(y_train.tolist().count(0))
print(y_train.tolist().count(1))

243
127


In [17]:
resample = SMOTE(random_state = 1)
X_train, y_train = resample.fit_resample(X_train, y_train)
print(y_train.tolist().count(0))
print(y_train.tolist().count(1))

243
243


In [18]:
print(y_test.tolist().count(0))
print(y_test.tolist().count(1))

25
17


Classification without feature selection

In [19]:
# random forest
model = RandomForestClassifier(random_state = 1)
train_model = model.fit(X_train, y_train)
pred_test = train_model.predict(X_test)

print("Accuracy: {}".format(round(accuracy_score(pred_test, y_test)*100, 2)))
print("Precision: {}".format(round(precision_score(pred_test, y_test)*100, 2)))
print("Recall: {}".format(round(recall_score(pred_test, y_test)*100, 2)))
print("F1-score: {}".format(round(f1_score(pred_test, y_test)*100, 2)))
print("Confusion matrix: ", confusion_matrix(pred_test, y_test))

Accuracy: 85.71
Precision: 82.35
Recall: 82.35
F1-score: 82.35
Confusion matrix:  [[22  3]
 [ 3 14]]


In [20]:
# scm
model = SVC(random_state = 1)
train_model = model.fit(X_train, y_train)
pred_test = train_model.predict(X_test)

print("Accuracy: {}".format(round(accuracy_score(pred_test, y_test)*100, 2)))
print("Precision: {}".format(round(precision_score(pred_test, y_test)*100, 2)))
print("Recall: {}".format(round(recall_score(pred_test, y_test)*100, 2)))
print("F1-score: {}".format(round(f1_score(pred_test, y_test)*100, 2)))
print("Confusion matrix: ", confusion_matrix(pred_test, y_test))

Accuracy: 73.81
Precision: 88.24
Recall: 62.5
F1-score: 73.17
Confusion matrix:  [[16  2]
 [ 9 15]]


In [21]:
# xgboost
model = XGBClassifier(random_state = 1)
train_model = model.fit(X_train, y_train)
pred_test = train_model.predict(X_test)

print("Accuracy: {}".format(round(accuracy_score(pred_test, y_test)*100, 2)))
print("Precision: {}".format(round(precision_score(pred_test, y_test)*100, 2)))
print("Recall: {}".format(round(recall_score(pred_test, y_test)*100, 2)))
print("F1-score: {}".format(round(f1_score(pred_test, y_test)*100, 2)))
print("Confusion matrix: ", confusion_matrix(pred_test, y_test))

Accuracy: 83.33
Precision: 88.24
Recall: 75.0
F1-score: 81.08
Confusion matrix:  [[20  2]
 [ 5 15]]


Ensemble Feature Selection

In [22]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import mutual_info_classif
from skfeature.function.similarity_based import fisher_score
from skfeature.function.similarity_based import reliefF

In [23]:
#feature selection
c2 = SelectKBest(score_func=chi2, k=15).fit(X, y)
c2_feat = X.columns[c2.get_support()]
c2_scores = c2.scores_
print("Chi-Square\n")
for i, feature in enumerate(X):
    score = c2_scores[i]
    print(f"{feature}: C2 Score = {score:.4f}")


Chi-Square

Age: C2 Score = 1.6954
Sex: C2 Score = 0.2628
Ascites: C2 Score = 29.1494
Hepatomegaly: C2 Score = 57.6278
Spiders: C2 Score = 29.9393
Edema: C2 Score = 18.5815
Bilirubin: C2 Score = 3.9270
Cholesterol: C2 Score = 0.3948
Albumin: C2 Score = 2.0951
Copper: C2 Score = 3.0165
Alk_Phos: C2 Score = 0.0062
SGOT: C2 Score = 0.2377
Tryglicerides: C2 Score = 0.0056
Platelets: C2 Score = 1.7244
Prothrombin: C2 Score = 2.4334


In [69]:
mi = SelectKBest(score_func=mutual_info_classif, k=15).fit(X, y)
mi_feat = X.columns[mi.get_support()]
mi_scores = mi.scores_
mi_selected = X[mi_feat]
print("Information Gain\n")
for i, feature in enumerate(X):
    score = mi_scores[i]
    print(f"{feature}: IG Score = {score:.4f}")

Information Gain

Age: IG Score = 0.0000
Sex: IG Score = 0.0000
Ascites: IG Score = 0.0242
Hepatomegaly: IG Score = 0.2004
Spiders: IG Score = 0.0350
Edema: IG Score = 0.0000
Bilirubin: IG Score = 0.0542
Cholesterol: IG Score = 0.0059
Albumin: IG Score = 0.0391
Copper: IG Score = 0.0634
Alk_Phos: IG Score = 0.0725
SGOT: IG Score = 0.0010
Tryglicerides: IG Score = 0.0000
Platelets: IG Score = 0.0310
Prothrombin: IG Score = 0.0756


In [25]:
fs = fisher_score.fisher_score(np.array(X), np.array(y))
fs_idx = np.argsort(fs, 0)
fs_arr = fs_idx[::-1]

print("Fisher Score\n")
for i, feature in enumerate(X):
    print(f"{feature}: Fisher Score = {fs[i]: .4f}")

Fisher Score

Age: Fisher Score =  0.0502
Sex: Fisher Score =  0.0007
Ascites: Fisher Score =  0.0812
Hepatomegaly: Fisher Score =  0.3832
Spiders: Fisher Score =  0.1069
Edema: Fisher Score =  0.0763
Bilirubin: Fisher Score =  0.0411
Cholesterol: Fisher Score =  0.0094
Albumin: Fisher Score =  0.1326
Copper: Fisher Score =  0.0705
Alk_Phos: Fisher Score =  0.0001
SGOT: Fisher Score =  0.0093
Tryglicerides: Fisher Score =  0.0002
Platelets: Fisher Score =  0.0639
Prothrombin: Fisher Score =  0.0966


In [26]:
rf = reliefF.reliefF(np.array(X), np.array(y))
rf_idx = np.argsort(rf, 0)
rf_arr = rf_idx[::-1]

print("ReliefF\n")
for i, feature in enumerate(X):
    print(f"{feature}: ReliefF Score = {rf[i]: .4f}")


ReliefF

Age: ReliefF Score =  25.9590
Sex: ReliefF Score =  6.8000
Ascites: ReliefF Score =  10.2000
Hepatomegaly: ReliefF Score =  5.2000
Spiders: ReliefF Score =  1.6000
Edema: ReliefF Score =  9.6000
Bilirubin: ReliefF Score =  2.3964
Cholesterol: ReliefF Score =  10.1939
Albumin: ReliefF Score =  9.1746
Copper: ReliefF Score =  7.1039
Alk_Phos: ReliefF Score =  5.8712
SGOT: ReliefF Score =  5.2451
Tryglicerides: ReliefF Score =  3.3171
Platelets: ReliefF Score =  8.5003
Prothrombin: ReliefF Score =  5.0609


In [27]:
union_set = list(set().union(c2_selected.columns, mi_selected.columns, fs_selected.columns, rf_selected.columns))
num_of_union_feature = len(union_set)
print("Union Set: ", union_set)
print("Number of features: " + str(num_of_union_feature))

NameError: name 'c2_selected' is not defined

In [None]:
# union feature
ensemble_fs = X[union_set]
ensemble_fs

Unnamed: 0,Spiders,Ascites,Edema,Hepatomegaly,Copper,Platelets,Cholesterol,Albumin,Prothrombin,Alk_Phos,Sex,Age,Bilirubin
0,1,1,1.0,1,0.260274,0.194234,0.085196,0.238806,0.355556,0.105279,0,0.622822,0.512635
1,1,0,0.0,1,0.085616,0.241275,0.109970,0.813433,0.177778,0.523509,0,0.578364,0.028881
2,0,0,0.5,0,0.352740,0.135053,0.033837,0.567164,0.333333,0.016724,1,0.839597,0.039711
3,1,0,0.5,1,0.102740,0.183612,0.074924,0.216418,0.144444,0.429723,0,0.545664,0.054152
4,1,0,0.0,1,0.238014,0.112291,0.096073,0.585821,0.211111,0.028143,0,0.226748,0.111913
...,...,...,...,...,...,...,...,...,...,...,...,...,...
407,0,0,0.0,1,0.097260,0.169954,0.157885,0.373134,0.211111,0.081122,0,0.780705,0.032491
408,0,0,0.0,1,0.107021,0.179059,0.125982,0.697761,0.244444,0.089491,0,0.243911,0.021661
409,0,0,0.0,0,0.071747,0.122914,0.125680,0.544776,0.100000,0.056345,0,0.588967,0.046931
410,0,0,0.0,0,0.053082,0.314112,0.083625,0.667910,0.155556,0.038656,0,0.608178,0.018051
