In [1]:
# import necessary library
import pandas as pd
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
# import csv dataset
file = "tubes2_HeartDisease_train.csv"
df = pd.read_csv(file)

In [3]:
# split feature and label
feature = df.drop("Column14",inplace=False,axis=1)
label = df["Column14"]

In [4]:
# handle missing value

header = feature.columns.values.tolist()
feature_impute = feature.replace('?',np.nan)

imputer_mode = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
imputer_mean = SimpleImputer(missing_values=np.nan, strategy='mean')

discrete_value = ['Column1','Column2','Column3','Column6','Column7','Column9','Column13']
continues_value = ['Column4','Column5','Column8','Column10','Column11','Column12']

imputer_mode.fit(feature_impute[discrete_value])
feature_impute[discrete_value] = imputer_mode.transform(feature_impute[discrete_value])

imputer_mean.fit(feature_impute[continues_value])
feature_impute[continues_value] = imputer_mean.transform(feature_impute[continues_value])

feature_impute['Column13'] = pd.to_numeric(feature_impute['Column13'])
feature_impute.head()

Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,54,1,4,125.0,216.0,0,0,140.0,0,0.0,1.762089,0.686792,3
1,55,1,4,158.0,217.0,0,0,110.0,1,2.5,2.0,0.686792,3
2,54,0,3,135.0,304.0,1,0,170.0,0,0.0,1.0,0.0,3
3,48,0,3,120.0,195.0,0,0,125.0,0,0.0,1.762089,0.686792,3
4,50,1,4,120.0,0.0,0,1,156.0,1,0.0,1.0,0.686792,6


In [5]:
idx_to_drop = []

for item in continues_value:
    mean = feature_impute[item].mean()
    std = feature_impute[item].std()
    low_threshold = mean - 2*std
    high_threshold = mean + 2*std
    
#     print(item, low_threshold, high_threshold)
    
    for i in range(feature_impute[item].shape[0]):
        cur_value = feature_impute[item].iloc[i]
        if (cur_value < low_threshold or cur_value > high_threshold):
            idx_to_drop.append(i)

feature_impute.drop(feature_impute.index[idx_to_drop],inplace=True)
label.drop(label.index[idx_to_drop],inplace=True)
# feature_impute.describe()

In [6]:
# feature scale
feature_scale = pd.DataFrame(preprocessing.scale(feature_impute), columns=header)
feature_scale.head()

  


Unnamed: 0,Column1,Column2,Column3,Column4,Column5,Column6,Column7,Column8,Column9,Column10,Column11,Column12,Column13
0,0.176725,0.555458,0.852692,-0.380985,0.149048,-0.387298,-0.693683,-0.01527,-0.700908,-0.567213,0.253637,0.396182,-0.510509
1,0.285116,0.555458,0.852692,1.952471,0.158792,-0.387298,-0.693683,-1.299403,1.426721,0.036852,0.86704,0.396182,-0.510509
2,0.176725,-1.800315,-0.236859,0.326123,1.006587,2.581989,-0.693683,1.268864,-0.700908,-0.567213,-1.711246,-1.883308,-0.510509
3,-0.473623,-1.800315,-0.236859,-0.734539,-0.055593,-0.387298,-0.693683,-0.657336,-0.700908,-0.567213,0.253637,0.396182,-0.510509
4,-0.25684,0.555458,0.852692,-0.734539,-1.955823,-0.387298,0.619047,0.669601,1.426721,-0.567213,-1.711246,0.396182,1.417604


In [7]:
# display feature correlation with label
for feat in feature_scale:
    print(feat, feature_scale[feat].corr(label))

Column1 0.00950298929003149
Column2 0.002169923491672243
Column3 -0.0019815812945992647
Column4 -0.06693641003884787
Column5 -0.1177640309076227
Column6 -0.03524705294950225
Column7 -0.05383415418850949
Column8 0.029848809975586067
Column9 -0.05664339720406507
Column10 -0.08849686660664911
Column11 -0.0795584160413824
Column12 -0.02364959703871219
Column13 0.061955162333051754


In [33]:
import itertools
feature_combinations = []

# print(list(itertools.combinations(header,1)))
for i in range(1,14):
    feature_combinations.append(list(itertools.combinations(header,i)))

1053


Isu pertama dalam optimasi model MLP adalah menentukan parameter-parameter MLPClassifier. Untuk parameter solver, kami memilih lbfgs yang cocok digunakan untuk data berukuran kecil. Untuk parameter activation kami memilih logistic (sigmoid) karena itu yang sudah diajarkan di kelas. Dan random_state diisi 100 (bebas, asalkan bukan false) agar parameter-parameter initialization seperti weight, bias, dll sama. Sedangkan parameter alpha, hidden_layer_sizes, dan learning_rates kami coba mencari kombinasi yang terbaik dari beberapa nilai yang kami tawarkan. Hasilnya hidden_layer_sizes = (7, 11, 7), learning_rates = constant, dan alpha = 0.001.

Isu optimasi selanjutnya adalah pemilihan feature yang digunakan. Dengan pertimbangan jumlah feature yang tidak terlalu banyak, dalam pemilihan feature kami mencari kombinasi feature terbaik dari keseluruhan kombinasi yang ada. Kami juga menggunakan strategi K-Fold untuk mencari dataset yang menghasilkan akurasi paling baik.

Untuk MLP, jumlah feature yang menghasilkan akurasi model MLP terbaik adalah 5 yaitu ['Column4', 'Column5', 'Column6', 'Column11', 'Column12'] . Akurasi dari model ini adalah 80%

In [39]:
kf = KFold(n_splits=10)

best_model = None
worst_model = None
best_accuracy = 0.0
worst_accuracy = 1.0
sum_accuracy = 0.0

best_header = None

models = []

i = 4
s = 1053
e = 1054
for j in range(s,e):
    cur_header = []
    for k in range(len(feature_combinations[i][j])):
        cur_header.append(feature_combinations[i][j][k])
        
    cur_feature = feature_scale[cur_header]
    for train_idx,test_idx in kf.split(cur_feature):
        X_train = cur_feature.iloc[train_idx] 
        y_train = label.iloc[train_idx]      
        X_test = cur_feature.iloc[test_idx]
        y_test = label.iloc[test_idx] 

        list_hidden_layer_sizes = [(7,11,7), (100)]
        list_alpha = [0.00001, 0.0001, 0.001, 0.01, 0.1]
        list_learning_rate = ['constant','adaptive']

        for hidden_layer_sizes_ in list_hidden_layer_sizes:
            for alpha_ in list_alpha:
                for learning_rate_ in list_learning_rate:                
                    cur_model = MLPClassifier(solver='lbfgs', activation='logistic', random_state=100, alpha=alpha_,
                                     hidden_layer_sizes=hidden_layer_sizes_,  learning_rate=learning_rate_).fit(X_train,y_train)
                    y_predict = cur_model.predict(X_test)
                    cur_accuracy = accuracy_score(y_test, y_predict)

                    if (cur_accuracy > best_accuracy):
                        best_model = cur_model
                        best_accuracy = cur_accuracy
                        best_header = cur_header
                        best_parameter = {
                            'alpha' : alpha_,
                            'hidden_layer_sizes' : hidden_layer_sizes_,
                            'learning_rate' : learning_rate_
                        }

                    if (cur_accuracy < worst_accuracy):
                        worst_model = cur_model
                        worst_accuracy = cur_accuracy

                    models.append(cur_model)

print('Best Header: ', best_header)
print('Best Parameter: ', best_parameter)
print('Best Accuracy: ', best_accuracy)

['Column4', 'Column5', 'Column6', 'Column11', 'Column12']
{'alpha': 0.001, 'hidden_layer_sizes': (7, 11, 7), 'learning_rate': 'constant'}
0.8
