# Kasus Klasifikasi Penyakit Mata <font color = red>_Diabetic retinopathy_ </font>
## <font color = red>_Diabetic retinopathy_ </font> merupakan penyakit komplikasi dari diabetes yang mengakibatkan kebutaan
### Proses klasifikasi yang dilakukan adalah menentukan mata dengan penyakit <font color = red>_Diabetic retinopathy_ </font> dan mata <font color = red> normal </font>

In [None]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.io import arff
import functions as kp
import warnings 

warnings.filterwarnings('ignore')
%config IPCompleter.greedy=True

In [None]:
dr = pd.read_csv('study_case/dr_new.csv')

In [None]:
dr.head()

In [None]:
dr.shape

## Pada _dataset_ di atas, kolom dengan nama mengandung `MA` dan `exudate` merupakan gejala dari penyakit <font color = red> _Diabetic retinopathy_ </font> 
### MA merupakan singkatan dari _microaneurysm_, kondisi mata dengan munculnya bintik merah
### _Exudate_ merupakan cairan akibat kebocoran pada pembuluh darah mata, dapat berwarna putih atau berwarna kuning
![Gambar](study_case/images.png)

In [None]:
dr.describe().T

In [None]:
# Check the data
dr.info()

**Tidak ada kolom berisi _null_ data, sehingga tidak perlu menambahkan nilai apapun**

In [None]:
dr.dtypes

### Melakukan <font color=green> *Classification problems* </font>
**Menentukan <font color=green>*feature*</font> `input` dan <font color= #0073e6>*target variabel*</font> `output`**

In [None]:
# The 'diagnosis' column is the label; every remaining column is an input feature
target = dr['diagnosis']
features = dr.drop(columns=['diagnosis'])

In [None]:
target.head()

In [None]:
features.head()

# *Exploratory Data Analysis*

In [None]:
# Plot the diagnosis distribution and report the size of each class.
# value_counts() orders its result by frequency, so unpacking it directly ties
# the names N / DR to whichever class happens to be larger. Sorting by label and
# passing the same order to countplot keeps the bars, the tick labels and the
# printed numbers aligned with each other.
# NOTE(review): assumes the smaller label encodes the normal eye and the larger
# one the diabetic-retinopathy eye - confirm against the dataset description.
diagnosis_counts = target.value_counts().sort_index()
N, DR = diagnosis_counts

g = sns.countplot(x='diagnosis', data=dr, order=diagnosis_counts.index);
plt.title('Distribution of Diagnosis', fontsize=20)
g.set_xticklabels(['Normal Eye', 'Diabetic retinopathy Eye'])
# plt.savefig('study_case/diagnosis_sample.png')

print('Number of Normal Eye: ', N)
print('Number of DR Eye: ', DR)

**Distribusi dari hasil diagnosis terlihat tidak seimbang atau _imbalanced classification_** <br>

Karena perbandingan target klasifikasi tidak begitu besar, maka tidak perlu melakukan teknik penyamaan jumlah label klasifikasi

In [None]:
# One box per column of the raw dataset, drawn on a wide figure with slanted x labels
plt.figure(figsize=(20, 8))
raw_box_ax = sns.boxplot(data=dr, orient='v')
raw_box_ax.set_title("Data Summary of Debrecen Diabetic retinopathy", fontsize=15)
raw_box_ax.tick_params(axis='x', labelrotation=30)
# plt.savefig('study_case/boxplot.png')

In [None]:
# Keep only the numerical features by dropping the three categorical columns
num_features = features.drop(['qa', 'pre-screen', 'am/fm_class'], axis=1)

In [None]:
# Standardise each numerical column: subtract its mean, then divide by its standard deviation
column_means = num_features.mean()
column_stds = num_features.std()
num_features_norm = num_features.sub(column_means).div(column_stds)

In [None]:
# Box plot of the standardised numerical features (one box per column),
# with the x tick labels rotated by 30 degrees
plt.figure(figsize=(20,8))
plt.title("Data Summary of Numerical Debrecen Diabetic retinopathy", fontsize=15)
sns.boxplot(data=num_features_norm, orient='v');
plt.xticks(rotation=30);
#plt.savefig('study_case/boxplot_norm.png')

**Pada grafik di atas, beberapa fitur pada `exudate` memiliki _outliers_ yang cukup banyak, mungkin dapat mempengaruhi hasil prediksi**

In [None]:
# Plot the quality-assessment ('qa') distribution, split by diagnosis.
# The counts are sorted by label (not by frequency) and the same order is given
# to countplot, so the bars, tick labels and printed numbers stay consistent.
# NOTE(review): assumes the smaller 'qa' label means bad quality and the larger
# one sufficient quality - confirm against the dataset description.
qa_counts = features['qa'].value_counts().sort_index()
B, S = qa_counts

plt.figure(figsize=(10,5))
h = sns.countplot(x='qa', hue=target, data=dr, order=qa_counts.index);
plt.title('Distribution of Quality Assessment', fontsize=20)
h.set_xticklabels(['Bad', 'Sufficient'])
# plt.savefig('study_case/qa_sample.png')

print('Number of Bad Quality: ', B)
print('Number of Sufficient Quality: ', S)

**Dataset yang diambil memiliki kualitas gambar mata yang bagus, sehingga tidak perlu diperbaiki**

In [None]:
# Plot the pre-screening distribution, split by diagnosis.
# Counts are sorted by label rather than by frequency, and countplot receives
# the same order, so bars, tick labels and printed numbers cannot get swapped.
# NOTE(review): assumes the smaller 'pre-screen' label means lack of retinal
# abnormality and the larger one severe abnormality - confirm against the
# dataset description.
prescreen_counts = features['pre-screen'].value_counts().sort_index()
L, A = prescreen_counts

j = sns.countplot(x='pre-screen', hue=target, data=dr, order=prescreen_counts.index);
plt.title('Distribution of Pre-screening', fontsize=20)
j.set_xticklabels(['Retinal Lack Abnormality', 'Severe Retinal Abnormality'])
# plt.savefig('study_case/ps_sample.png')

print('Number of Severe Retinal Abnormality: ', A)
print('Number of Retinal Lack Abnormality: ', L)

In [None]:
# Plot the distribution of the AM/FM-based classification result.
# Counts are sorted by label rather than by frequency, and countplot receives
# the same order, so bars, tick labels and printed numbers cannot get swapped.
# NOTE(review): assumes the smaller 'am/fm_class' label means normal eye and
# the larger one diabetic retinopathy - confirm against the dataset description.
amfm_counts = features['am/fm_class'].value_counts().sort_index()
N, DR = amfm_counts

j = sns.countplot(x='am/fm_class', data=dr, order=amfm_counts.index);
plt.title('Distribution of AM/FM Based-Classification', fontsize=20)
j.set_xticklabels(['Normal Eye', 'Diabetic retinopathy Eye'])
# plt.savefig('study_case/ps_sample.png')

print('Number of Normal Eye: ', N)
print('Number of Diabetic retinopathy Eye: ', DR)

**Distribusi grafik di atas menjelaskan proses *screening* dan hasil diagnosis dengan menggunakan metode AM/FM**

In [None]:
# Histogram with a KDE curve of the 'eucli_of_mac_optic_disc' column, coloured by diagnosis
plt.title('Distribution of Euclidean between Macula and Optic Disc', fontsize=20)
sns.histplot(data=dr, x='eucli_of_mac_optic_disc', hue='diagnosis', kde=True);
# plt.savefig('study_case/hist_mac.png')

In [None]:
# Histogram with a KDE curve of the 'diam_optic_disc' column, coloured by diagnosis
plt.title('Distribution of Diameter of Optic Disc', fontsize=20)
sns.histplot(data=dr, x='diam_optic_disc', hue='diagnosis', kde=True);
# plt.savefig('study_case/hist_optic.png')

### Di bawah ini, akan mengecek hubungan antara gejala-gejala yang muncul pada mata _Diabetic retinopathy_

In [None]:
# Microaneurysm columns: names beginning with 'MA_'
MAs = [col for col in dr.columns if col.startswith('MA_')]

# Exudate columns: names beginning with 'exudate_'
exus = [col for col in dr.columns if col.startswith('exudate_')]

# Every symptom column (either prefix), kept in the DataFrame's column order
syms = [col for col in dr.columns if col.startswith(('MA_', 'exudate_'))]

for list_label, column_names in (('syms = ', syms), ('MAs = ', MAs), ('exus = ', exus)):
    print(list_label, column_names, '\n')

In [None]:
# New lists (the originals stay untouched) with 'diagnosis' placed at index 6,
# or at the end when a list has fewer than six entries
diag_syms = syms[:6] + ['diagnosis'] + syms[6:]
diag_MAs = MAs[:6] + ['diagnosis'] + MAs[6:]
diag_exus = exus[:6] + ['diagnosis'] + exus[6:]

In [None]:
# Pairwise scatter plots of all symptom columns, coloured by diagnosis.
# sns.pairplot is a figure-level function that builds its own figure, so no
# plt.figure(...) is opened beforehand: it would only leave an empty figure behind.
sns.pairplot(dr[diag_syms], hue='diagnosis');
# plt.savefig('study_case/pair.png')

In [None]:
# Pairwise correlation between the symptom columns, shown as an annotated
# heatmap with one decimal per cell
corr_syms = features[syms].corr()
plt.figure(figsize=(15,10))
sns.heatmap(corr_syms, annot=True, linewidths=.7, fmt=".1f");
# plt.savefig('study_case/heatm.png')

**Korelasi antara gejala MA dan _exudate_ lemah, tetapi antara gejala dengan nama yang sama ditemukan korelasi yang cukup kuat.** <br>
<br>
 Untuk kolom dengan nama `exudate` : <br>
 * <font color = #0073e6> exudate_0.5 </font> dengan <font color = #0073e6> *exudate_0.3* </font>, <font color = #0073e6> *exudate_0.4* </font>, dan <font color = #0073e6> *exudate_0.8* </font> 
 * <font color = #0073e6> exudate_0.6 </font> dengan <font color = #0073e6> *exudate_0.5* </font>, <font color = #0073e6> *exudate_0.7* </font>, <font color = #0073e6> *exudate_0.8* </font>, dan <font color = #0073e6> *exudate_0.9* </font> 
 * <font color = #0073e6> exudate_0.7 </font> dengan <font color = #0073e6> *exudate_0.6* </font>, <font color = #0073e6> *exudate_0.8* </font>, dan <font color = #0073e6> *exudate_0.9* </font> 
 * <font color = #0073e6> exudate_0.8 </font> dengan <font color = #0073e6> *exudate_0.6* </font>, <font color = #0073e6> *exudate_0.7* </font>, <font color = #0073e6> *exudate_0.9* </font>, dan <font color = #0073e6> *exudate_1* </font> 
 * <font color = #0073e6> exudate_0.9 </font> dengan <font color = #0073e6> *exudate_0.7* </font>, <font color = #0073e6> *exudate_0.8* </font>, dan <font color = #0073e6> *exudate_1* </font> 
 * <font color = #0073e6> exudate_1 </font> dengan <font color = #0073e6> *exudate_0.8* </font>, dan <font color = #0073e6> *exudate_0.9* </font> <br> <br>
 Untuk kolom dengan nama `MA` : semua fitur saling berkorelasi

### Fitur dengan kolom `exudate` dapat dipertahankan beberapa, sedangkan fitur dengan kolom `MA` dapat dipilih satu fitur saja

In [None]:
sns.pairplot(dr[diag_MAs], hue='diagnosis');
# plt.savefig('study_case/pairplot_MA.png')

In [None]:
sns.pairplot(dr[diag_exus], hue='diagnosis');
# plt.savefig('study_case/pairplot_exu.png')

# *Data Pre-processing*

Memilih data yang akan digunakan dalam pelatihan model _machine learning_

## _Feature Scaling_
Melakukan normalisasi data terhadap fitur yang numerikal 

In [None]:
# Keep only the numerical features by dropping the three categorical columns
num_features = features.drop(['qa', 'pre-screen', 'am/fm_class'], axis=1)

In [None]:
num_features.head()

In [None]:
norm_feature = kp.normalize(num_features)

In [None]:
norm_feature.head()

In [None]:
norm_feature.tail()

In [None]:
# Column names for the normalised numerical features: six MA levels,
# eight exudate levels and the two geometric measurements
header = (
    ['MA_' + level for level in ('0.5', '0.6', '0.7', '0.8', '0.9', '1')]
    + ['exudate_' + level for level in ('0.3', '0.4', '0.5', '0.6', '0.7', '0.8', '0.9', '1')]
    + ['eucli_of_mac_optic_disc', 'diam_optic_disc']
)

In [None]:
norm_feature.columns = header

norm_feature.head()

In [None]:
# Names of the categorical columns, which were left out of the normalisation
cat_feature = ['qa','pre-screen','am/fm_class']
# Combine all features: normalised numerical columns joined (on the index)
# with the categorical columns taken from the original DataFrame
feature_new = norm_feature.join(dr[cat_feature])

In [None]:
feature_new.head()

In [None]:
# make new normalize dataset
dr_norm = feature_new.join(target)

In [None]:
dr_norm.head()

In [None]:
# Save to .csv
dr_norm.to_csv('study_case/dr_norm.csv', index = False)

## *Feature Selection*

### *Filter Methods*

Karena dataset ini memiliki *input* berupa numerikal dan *output* berupa kategorikal, maka dilakukan proses *filter* dengan <font color = #0073e6>*__Mutual Information__*</font>

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from matplotlib import pyplot

In [None]:
feature_new, target, X, y = kp.get_dr_dataset('study_case/dr_norm.csv')

In [None]:
# Split into train and test sets: 33% of the rows go to the test set,
# with a fixed random_state so the split is reproducible
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.33, random_state=1)

# Show the resulting shapes
X_train.shape, X_test.shape

In [None]:
# kp.select_features_mutual hands back the transformed train/test sets plus the
# selector object; list the selector's score for every feature position
X_train_mi, X_test_mi, fs_mi = kp.select_features_mutual(X_train, y_train, X_test)
for i, score in enumerate(fs_mi.scores_):
    print(f'Feature {i}: {score:f}')

In [None]:
# Bar chart of the selector scores, one bar per feature position
feature_positions = list(range(len(fs_mi.scores_)))
pyplot.bar(feature_positions, fs_mi.scores_)
pyplot.show()

Skor tiap fitur berbeda-beda

**Melakukan penentuan pemilihan fitur jika skor fitur berbeda**

In [None]:
# import Logistic Regression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
# Candidate feature counts to evaluate: 1 up to the number of columns in X
num_features = list(range(1, X.shape[1] + 1))

num_features

In [None]:
# Baseline: logistic regression fitted on every feature of the training split
model_all = LogisticRegression(solver='liblinear').fit(X_train, y_train)
# Predict the held-out split and report the accuracy as a percentage
yhat_all = model_all.predict(X_test)
accuracy = accuracy_score(y_test, yhat_all)
print(f'Accuracy: {accuracy * 100:.2f}')

In [None]:
# Accuracy (in percent) obtained for each candidate number of selected features
mi_res = []

for num in num_features:
    # Reduce both splits to `num` features via kp.select_features_mutual
    X_train_mi, X_test_mi, fs_mi = kp.select_features_mutual(X_train, y_train, X_test, num)
    # Fit logistic regression on the reduced training split
    model_mi = LogisticRegression(solver='liblinear').fit(X_train_mi, y_train)
    # Score the predictions on the reduced test split
    yhat_mi = model_mi.predict(X_test_mi)
    accuracy = accuracy_score(y_test, yhat_mi)
    mi_res.append(accuracy * 100)
    print(f'Feature {num} with accuracy: {accuracy * 100:.2f}')

__Mencari relasi antara fitur-fitur yang terpilih dengan akurasi dari klasifikasi__

In [None]:
from sklearn.model_selection import cross_val_score, RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline

In [None]:
# For every candidate feature count, evaluate a mutual-information selector
# chained with logistic regression and keep the scores from kp.evaluate_model
results = []

for num_feature in num_features:
    # Selector first, classifier second
    fs = SelectKBest(score_func=mutual_info_classif, k=num_feature)
    model = LogisticRegression(solver='liblinear')
    pipeline = Pipeline(steps=[('mi', fs), ('lr', model)])

    scores = kp.evaluate_model(X, y, pipeline)
    results.append(scores)
    # Mean is printed scaled by 100, the standard deviation unscaled
    print(f'>{num_feature} {np.mean(scores) * 100:.3f} ({np.std(scores):.3f})')

In [None]:
# Enumerate each number of features with the ANOVA F-test as the scoring function.
# f_classif is imported here because the notebook's sklearn.feature_selection
# import only brings in SelectKBest and mutual_info_classif; without it this
# cell fails with a NameError.
from sklearn.feature_selection import f_classif

results = list()

for num_feature in num_features:
    # create pipeline: ANOVA-based selector followed by logistic regression
    model = LogisticRegression(solver='liblinear')
    fs = SelectKBest(score_func=f_classif, k=num_feature)
    pipeline = Pipeline(steps=[('anova', fs), ('lr', model)])

    # evaluate the model
    scores = kp.evaluate_model(X, y, pipeline)
    results.append(scores)
    # summarize the results
    # NOTE(review): the mean is printed scaled by 100 while the standard
    # deviation is not - kept as in the mutual-information cell.
    print('>%d %.3f (%.3f)' % (num_feature, np.mean(scores)*100, np.std(scores)))

In [None]:
# plot model performance for comparison: one box of scores per candidate
# feature count, with the mean marked
# NOTE(review): `results` holds whatever the most recently executed enumeration
# cell stored - confirm which scoring function the plot is meant to show
pyplot.boxplot(results, labels=num_features, showmeans=True)
pyplot.show()

Dipilih 15 fitur yang terbaik

In [None]:
# Fit an ANOVA F-test selector that keeps the 15 best features and print the
# column indices it selects.
# f_classif is imported here because the notebook's sklearn.feature_selection
# import only brings in SelectKBest and mutual_info_classif.
from sklearn.feature_selection import f_classif

fs_15 = SelectKBest(score_func=f_classif, k=15)
fs_15.fit(X_train, y_train)
print(fs_15.get_support(indices=True))

Indeks fitur yang tidak terpilih (akan dibuang): 7, 8, 14, 15

In [None]:
# Despite the name, this lists the four columns that are dropped so that 15
# features remain (see the drop() call that consumes it).
# NOTE(review): hard-coded from an earlier run - confirm it still matches the
# columns NOT reported by fs_15.get_support()
feature_15 = ['exudate_0.4', 'exudate_0.5', 'eucli_of_mac_optic_disc', 'diam_optic_disc']

In [None]:
feature_fs = feature_new.drop(feature_15, axis=1)

In [None]:
feature_fs.head()

__Membuang fitur yang disarankan oleh _filter method___

# *Train-Test Data Split*

Pada tahap ini akan dilakukan metode _train-test split_, lalu fitur-fitur yang terpilih akan diproses dengan metode **SMOTE Oversampling**

In [None]:
# Encode the diagnosis labels as integers 1 / 0.
# Assumes target holds the literal strings "b'1'" and "b'0'"; values missing
# from the dict become NaN under Series.map - TODO confirm against the output
# of kp.get_dr_dataset
target_map = target.map({"b'1'": 1, "b'0'": 0})

In [None]:
# Split data train 80% and test 20%.
# The results are bound to the *_all names that the shape checks and every
# model cell below refer to (X_test_all / y_test_all were never assigned before).
X_train_all, X_test_all, y_train_all, y_test_all = train_test_split(
    feature_new, target_map, test_size=0.2, random_state=2)

# Same split for the reduced feature set used by the "*_fs" model cells.
# Using the identical test_size and random_state on a frame with the same rows
# selects the same rows as the split above, so both feature sets are compared
# on the same samples.
X_train_fs, X_test_fs, y_train_fs, y_test_fs = train_test_split(
    feature_fs, target_map, test_size=0.2, random_state=2)

In [None]:
X_train_all.shape, X_test_all.shape

In [None]:
y_train_all.shape, y_test_all.shape

# Membandingkan Model *Machine Learning*
### Akan dilakukan proses perbandingan model-model ML dengan algoritma berikut:
* KNN _Classification_
* _Decision Tree Classification_
* _Random Forest Classification_
* _Naive Bayes Classification_
* _Logistic Regression Classification_
* _Support Vector Machine Classification_
* _Artificial Neural Network_

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

## *k-NN Classification*

In [None]:
# train knn model all features
knn_res_all, knn_score = kp.train_model_knn(X_train_all, y_train_all)

Skor akurasi validasi paling bagus ditemukan saat n_neighbors = 49 dengan akurasi 62,899%

In [None]:
# Line chart of the validation results in knn_res_all.
# NOTE(review): the x values are 0-based list positions - confirm they line up
# with the k values tried by kp.train_model_knn
k_range = list(range(len(knn_res_all)))
plt.plot(k_range, knn_res_all, marker='o')
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.title('Grafik Akurasi Validasi k-NN')
plt.show()

In [None]:
model_knn = KNeighborsClassifier(n_neighbors=49)
knn_all_acc, knn_all_rec, knn_all_prec, knn_all_f1, knn_all_cm = kp.test_model(model_knn, X_train_all, y_train_all, X_test_all, y_test_all)

In [None]:
# Print the four k-NN (all features) scores scaled to percent, then draw
# knn_all_cm as an annotated heatmap and save the figure
for metric_label, metric_value in (
    ('Accuracy score: ', knn_all_acc),
    ('Recall score: ', knn_all_rec),
    ('Precision score: ', knn_all_prec),
    ('F1 score: ', knn_all_f1),
):
    print(metric_label, metric_value * 100)
sns.heatmap(knn_all_cm, annot=True, fmt='d')
plt.savefig('study_case/knn_all_cm.png')

In [None]:
# train knn model with selected features
knn_res_fs, knn_score_fs = kp.train_model_knn(X_train_fs, y_train_fs)

Skor akurasi validasi paling bagus ditemukan saat n_neighbors = 36 dengan skor 66,159%

In [None]:
# Line chart (red) of the validation results in knn_res_fs.
# NOTE(review): the x values are 0-based list positions - confirm they line up
# with the k values tried by kp.train_model_knn
k_range = list(range(len(knn_res_fs)))
plt.plot(k_range, knn_res_fs, color='red', marker='o')
plt.xlabel('Value of K for KNN')
plt.ylabel('Cross-Validated Accuracy')
plt.title('Grafik Akurasi Validasi k-NN')
plt.show()

In [None]:
model = KNeighborsClassifier(n_neighbors=36)
knn_fs_acc, knn_fs_rec, knn_fs_prec, knn_fs_f1, knn_fs_cm = kp.test_model(model, X_train_fs, y_train_fs, X_test_fs, y_test_fs)

In [None]:
# Print the four k-NN (selected features) scores scaled to percent, then draw
# knn_fs_cm as an annotated heatmap and save the figure
for metric_label, metric_value in (
    ('Accuracy score: ', knn_fs_acc),
    ('Recall score: ', knn_fs_rec),
    ('Precision score: ', knn_fs_prec),
    ('F1 score: ', knn_fs_f1),
):
    print(metric_label, metric_value * 100)
sns.heatmap(knn_fs_cm, annot=True, fmt='d')
plt.savefig('study_case/knn_fs_cm.png')

## *Decision Tree Classification*

In [None]:
# train decision tree model all features
dt_res_all, dt_score = kp.train_model_dt(X_train_all, y_train_all, quality=1)

Skor akurasi validasi yang bagus ditemukan jika dicek berdasarkan _gini impurity_ saat max_depth = 4 dengan skor 63,188%

In [None]:
# Line chart (green) of the decision-tree validation results in dt_res_all.
# NOTE(review): the x values are 0-based list positions - confirm they line up
# with the depths tried by kp.train_model_dt
depth_range = list(range(len(dt_res_all)))
plt.plot(depth_range, dt_res_all, color='green', marker='o')
plt.xlabel('Value of depth for Decision Tree')
plt.ylabel('Cross-Validated Accuracy')
plt.title('Grafik Akurasi Validasi Decision Tree')
plt.show()

In [None]:
model_dt_all = DecisionTreeClassifier(random_state=1, max_depth=4)
dt_all_acc, dt_all_rec, dt_all_prec, dt_all_f1, dt_all_cm = kp.test_model(model_dt_all, X_train_all, y_train_all, X_test_all, y_test_all)

In [None]:
from sklearn.tree import plot_tree

In [None]:
model_dt_all = model_dt_all.fit(X_train_all, y_train_all)
plot_tree(model_dt_all);

In [None]:
# Print the four decision-tree (all features) scores scaled to percent, then
# draw dt_all_cm as an annotated heatmap and save the figure
for metric_label, metric_value in (
    ('Accuracy score: ', dt_all_acc),
    ('Recall score: ', dt_all_rec),
    ('Precision score: ', dt_all_prec),
    ('F1 score: ', dt_all_f1),
):
    print(metric_label, metric_value * 100)
sns.heatmap(dt_all_cm, annot=True, fmt='d')
plt.savefig('study_case/dt_all_cm.png')

In [None]:
# train decision tree model selected features
dt_res_fs, dt_score_fs = kp.train_model_dt(X_train_fs, y_train_fs, quality=1)

Skor akurasi validasi yang bagus ditemukan jika dicek berdasarkan _gini impurity_ saat max_depth = 2 dengan skor 64,71%

In [None]:
# Line chart of the decision-tree validation results in dt_res_fs.
# NOTE(review): the x values are 0-based list positions - confirm they line up
# with the depths tried by kp.train_model_dt
depth_range = list(range(len(dt_res_fs)))
plt.plot(depth_range, dt_res_fs, marker='o')
plt.xlabel('Value of depth for Decision Tree')
plt.ylabel('Cross-Validated Accuracy')
plt.title('Grafik Akurasi Validasi Decision Tree')
plt.show()

In [None]:
model_dt_fs = DecisionTreeClassifier(random_state=1, #criterion='entropy', 
                               max_depth=2)
dt_fs_acc, dt_fs_rec, dt_fs_prec, dt_fs_f1, dt_fs_cm = kp.test_model(model_dt_fs, X_train_fs, y_train_fs, X_test_fs, y_test_fs)

In [None]:
model_dt_fs = model_dt_fs.fit(X_train_fs, y_train_fs)
plot_tree(model_dt_fs);

In [None]:
# Print the four decision-tree (selected features) scores scaled to percent,
# then draw dt_fs_cm as an annotated heatmap and save the figure
for metric_label, metric_value in (
    ('Accuracy score: ', dt_fs_acc),
    ('Recall score: ', dt_fs_rec),
    ('Precision score: ', dt_fs_prec),
    ('F1 score: ', dt_fs_f1),
):
    print(metric_label, metric_value * 100)
sns.heatmap(dt_fs_cm, annot=True, fmt='d')
plt.savefig('study_case/dt_fs_cm.png')

## *Random Forest Classification*

In [None]:
# train random forest model all features
rf_res_all, rf_score = kp.train_model_rf(X_train_all, y_train_all)

Skor akurasi validasi yang bagus ditemukan jika dicek berdasarkan _gini impurity_ saat max_depth = 10 dengan skor 66,993%

In [None]:
# Line chart of the random-forest validation results in rf_res_all.
# NOTE(review): the x values are 0-based list positions - confirm they line up
# with the depths tried by kp.train_model_rf
depth_range = list(range(len(rf_res_all)))
plt.plot(depth_range, rf_res_all, marker='o')
plt.xlabel('Value of depth for Random Forest')
plt.ylabel('Cross-Validated Accuracy')
plt.title('Grafik Akurasi Validasi Random Forest')
plt.show()

In [None]:
model = RandomForestClassifier(random_state=1, max_depth=10)
rf_all_acc, rf_all_rec, rf_all_prec, rf_all_f1, rf_all_cm = kp.test_model(model, X_train_all, y_train_all, X_test_all, y_test_all)

In [None]:
# Print the four random-forest (all features) scores scaled to percent, then
# draw rf_all_cm as an annotated heatmap and save the figure
for metric_label, metric_value in (
    ('Accuracy score: ', rf_all_acc),
    ('Recall score: ', rf_all_rec),
    ('Precision score: ', rf_all_prec),
    ('F1 score: ', rf_all_f1),
):
    print(metric_label, metric_value * 100)
sns.heatmap(rf_all_cm, annot=True, fmt='d')
plt.savefig('study_case/rf_all_cm.png')

In [None]:
# Train the random forest model on the selected features.
# The second return value is stored under its own name so it no longer
# overwrites rf_score from the all-features run; this matches the naming of the
# sibling cells (knn_score_fs, dt_score_fs, svc_scores_fs).
rf_res_fs, rf_score_fs = kp.train_model_rf(X_train_fs, y_train_fs)

Skor akurasi validasi yang bagus ditemukan jika dicek berdasarkan _gini impurity_ saat max_depth = 10 dengan skor 68,370%

In [None]:
# Line chart (olive) of the random-forest validation results in rf_res_fs.
# NOTE(review): the x values are 0-based list positions - confirm they line up
# with the depths tried by kp.train_model_rf
depth_range = list(range(len(rf_res_fs)))
plt.plot(depth_range, rf_res_fs, color='olive', marker='o')
plt.xlabel('Value of depth for Random Forest')
plt.ylabel('Cross-Validated Accuracy')
plt.title('Grafik Akurasi Validasi Random Forest')
plt.show()

In [None]:
model = RandomForestClassifier(random_state=2, #criterion='entropy', 
                               max_depth=10)
rf_fs_acc, rf_fs_rec, rf_fs_prec, rf_fs_f1, rf_fs_cm = kp.test_model(model, X_train_fs, y_train_fs, X_test_fs, y_test_fs)

In [None]:
# Print the four random-forest (selected features) scores scaled to percent,
# then draw rf_fs_cm as an annotated heatmap and save the figure
for metric_label, metric_value in (
    ('Accuracy score: ', rf_fs_acc),
    ('Recall score: ', rf_fs_rec),
    ('Precision score: ', rf_fs_prec),
    ('F1 score: ', rf_fs_f1),
):
    print(metric_label, metric_value * 100)
sns.heatmap(rf_fs_cm, annot=True, fmt='d')
plt.savefig('study_case/rf_fs_cm.png')

## *Naive Bayes*

In [None]:
# train naive bayes model all features
nb_res_all = kp.train_model_nb(X_train_all, y_train_all)

Skor akurasi validasi _Naive Bayes_ dengan semua fitur: 59,203%

In [None]:
model = GaussianNB()
nb_all_acc, nb_all_rec, nb_all_prec, nb_all_f1, nb_all_cm = kp.test_model(model, X_train_all, y_train_all, X_test_all, y_test_all)

In [None]:
# Print the four naive-Bayes (all features) scores scaled to percent, then
# draw nb_all_cm as an annotated heatmap and save the figure
for metric_label, metric_value in (
    ('Accuracy score: ', nb_all_acc),
    ('Recall score: ', nb_all_rec),
    ('Precision score: ', nb_all_prec),
    ('F1 score: ', nb_all_f1),
):
    print(metric_label, metric_value * 100)
sns.heatmap(nb_all_cm, annot=True, fmt='d')
plt.savefig('study_case/nb_all_cm.png')

In [None]:
# train naive bayes model selected features
nb_res_fs = kp.train_model_nb(X_train_fs, y_train_fs)

In [None]:
model = GaussianNB()
nb_fs_acc, nb_fs_rec, nb_fs_prec, nb_fs_f1, nb_fs_cm = kp.test_model(model, X_train_fs, y_train_fs, X_test_fs, y_test_fs)

In [None]:
# Print the four naive-Bayes (selected features) scores scaled to percent,
# then draw nb_fs_cm as an annotated heatmap and save the figure
for metric_label, metric_value in (
    ('Accuracy score: ', nb_fs_acc),
    ('Recall score: ', nb_fs_rec),
    ('Precision score: ', nb_fs_prec),
    ('F1 score: ', nb_fs_f1),
):
    print(metric_label, metric_value * 100)
sns.heatmap(nb_fs_cm, annot=True, fmt='d')
plt.savefig('study_case/nb_fs_cm.png')

## *Logistic Regression Classification*

In [None]:
# train logistic regressions model all features
lr_res_all, lr_scores_all = kp.train_model_lr(X_train_all, y_train_all)

parameter terbaik newton-cg dengan akurasi 72,029%

In [None]:
model = LogisticRegression(solver='newton-cg')
lr_all_acc, lr_all_rec, lr_all_prec, lr_all_f1, lr_all_cm = kp.test_model(model, X_train_all, y_train_all, X_test_all, y_test_all)

In [None]:
# Print the four logistic-regression (all features) scores scaled to percent,
# then draw lr_all_cm as an annotated heatmap and save the figure
for metric_label, metric_value in (
    ('Accuracy score: ', lr_all_acc),
    ('Recall score: ', lr_all_rec),
    ('Precision score: ', lr_all_prec),
    ('F1 score: ', lr_all_f1),
):
    print(metric_label, metric_value * 100)
sns.heatmap(lr_all_cm, annot=True, fmt='d')
plt.savefig('study_case/lr_all_cm.png')

In [None]:
# Train the logistic regression model on the selected features.
# kp.train_model_lr is unpacked into two values in the all-features cell, so
# the result is unpacked the same way here instead of binding the whole return
# value to lr_res_fs.
lr_res_fs, lr_scores_fs = kp.train_model_lr(X_train_fs, y_train_fs)

Didapatkan skor validasi 74,167%

In [None]:
model = LogisticRegression(solver='liblinear', random_state=1)
lr_fs_acc, lr_fs_rec, lr_fs_prec, lr_fs_f1, lr_fs_cm = kp.test_model(model, X_train_fs, y_train_fs, X_test_fs, y_test_fs)

In [None]:
# Print the four logistic-regression (selected features) scores scaled to
# percent, then draw lr_fs_cm as an annotated heatmap and save the figure
for metric_label, metric_value in (
    ('Accuracy score: ', lr_fs_acc),
    ('Recall score: ', lr_fs_rec),
    ('Precision score: ', lr_fs_prec),
    ('F1 score: ', lr_fs_f1),
):
    print(metric_label, metric_value * 100)
sns.heatmap(lr_fs_cm, annot=True, fmt='d')
plt.savefig('study_case/lr_fs_cm.png')

## *Support Vector Machine Classification*

In [None]:
# train support vector machine model all features
svc_res_all, svc_scores_all = kp.train_model_svc(X_train_all, y_train_all)

Ditemukan skor validasi 71,775%

In [None]:
model = SVC(gamma='auto', kernel='linear', random_state=1)
svc_all_acc, svc_all_rec, svc_all_prec, svc_all_f1, svc_all_cm = kp.test_model(model, X_train_all, y_train_all, X_test_all, y_test_all)

In [None]:
# Print the four SVC (all features) scores scaled to percent, then draw
# svc_all_cm as an annotated heatmap and save the figure
for metric_label, metric_value in (
    ('Accuracy score: ', svc_all_acc),
    ('Recall score: ', svc_all_rec),
    ('Precision score: ', svc_all_prec),
    ('F1 score: ', svc_all_f1),
):
    print(metric_label, metric_value * 100)
sns.heatmap(svc_all_cm, annot=True, fmt='d')
plt.savefig('study_case/svc_all_cm.png')

In [None]:
# train support vector machine model selected features
svc_res_fs, svc_scores_fs = kp.train_model_svc(X_train_fs, y_train_fs)

Ditemukan skor validasi 70,145%

In [None]:
model = SVC(kernel='poly', degree=2, coef0=3, random_state=1)
svc_fs_acc, svc_fs_rec, svc_fs_prec, svc_fs_f1, svc_fs_cm = kp.test_model(model, X_train_fs, y_train_fs, X_test_fs, y_test_fs)

In [None]:
# Print the four SVC (selected features) scores scaled to percent, then draw
# svc_fs_cm as an annotated heatmap and save the figure
for metric_label, metric_value in (
    ('Accuracy score: ', svc_fs_acc),
    ('Recall score: ', svc_fs_rec),
    ('Precision score: ', svc_fs_prec),
    ('F1 score: ', svc_fs_f1),
):
    print(metric_label, metric_value * 100)
sns.heatmap(svc_fs_cm, annot=True, fmt='d')
plt.savefig('study_case/svc_fs_cm.png')

## *Artificial Neural Network*

In [None]:
# fix random seed for reproducibility
seed = 10
np.random.seed(seed)

In [None]:
# load diabetic retinopathy debrecen dataset
dr_data = np.loadtxt("study_case/dr_norm2.csv", delimiter=",")

In [None]:
dr_data

In [None]:
X = dr_data[:,:-1]
y = dr_data[:,-1]

In [None]:
from tensorflow import keras
from tensorflow.keras.utils import plot_model

In [None]:
# Hold out 20% of the data as the test set (stratified on the label) ...
X_dum, X_test, y_dum, y_test = train_test_split(X, y, test_size=0.2, stratify=np.array(y), random_state=1)
# ... then carve the validation set out of the remaining 80% only.
# Splitting the full X, y a second time would put test rows into the training
# and validation sets, so the test score would no longer be measured on unseen data.
X_train, X_val, y_train, y_val = train_test_split(X_dum, y_dum, test_size=0.2, stratify=np.array(y_dum), random_state=12)

### *Functional* API

In [None]:
# create a model 
func_model = kp.build_ann()

In [None]:
func_model.summary()

In [None]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

In [None]:
# Write the model in its current state to 'func_model.h5'
func_model.save('func_model.h5')

# Training callbacks: checkpoint to the same file keeping only the best model,
# and stop after 10 epochs without improvement, restoring the best weights
checkpoint = ModelCheckpoint('func_model.h5', save_best_only=True)
early_stopping = EarlyStopping(patience=10, restore_best_weights=True)

In [None]:
# Train for at most 500 epochs in batches of 8, validating on (X_val, y_val)
# after every epoch; the callbacks may end training earlier. The returned
# History object is kept for plotting the learning curves.
history = func_model.fit(x = X_train, y=y_train,
                        validation_data=(X_val, y_val),
                        batch_size=8, epochs=500, 
                        callbacks=[checkpoint, early_stopping],
                        verbose=1)

In [None]:
# Evaluate the model and print its second reported metric as a percentage.
# NOTE(review): this evaluates on the training split, not on held-out data
scores = func_model.evaluate(X_train, y_train)
metric_name = func_model.metrics_names[1]
print(f'{metric_name}: {scores[1] * 100:.2f}%')

In [None]:
# Learning curves (accuracy and loss) for the training and validation sets
y_train_acc = history.history['accuracy']
y_val_acc = history.history['val_accuracy']
y_train_loss = history.history['loss']
y_val_loss = history.history['val_loss']

# Early stopping makes the number of completed epochs vary between runs, so the
# x axis is derived from the recorded history instead of a hard-coded length
# (a mismatch would make the plot calls fail on unequal x / y sizes).
epoch_list = list(range(1, len(y_train_acc) + 1))
epoch_ticks = np.arange(0, len(epoch_list) + 1, 2)

f, (ax1, ax2) = plt.subplots(1, 2, figsize=(20,8))
t = f.suptitle('Artificial Neural Network', fontsize=12)

# Left panel: accuracy per epoch
ax1.plot(epoch_list, y_train_acc, label='Train Accuracy')
ax1.plot(epoch_list, y_val_acc, label='Validation Accuracy')
ax1.set_xticks(epoch_ticks)
#ax1.set_ylim(0.2,1)
ax1.set_ylabel('Accuracy Value')
ax1.set_xlabel('Epoch')
ax1.set_title('Accuracy')
l1 = ax1.legend(loc="best")

# Right panel: loss per epoch
ax2.plot(epoch_list, y_train_loss, label='Train Loss')
ax2.plot(epoch_list, y_val_loss, label='Validation Loss')
ax2.set_xticks(epoch_ticks)
#ax2.set_ylim(0,1)
ax2.set_ylabel('Cross Entropy')
ax2.set_xlabel('Epoch')
ax2.set_title('Loss')
l2 = ax2.legend(loc="best")

#plt.savefig('study_case/acc_loss2.png')

In [None]:
func_acc, func_rec, func_prec, func_f1, func_cm = kp.test_ann_model(func_model, X_train, y_train, X_test, y_test)

In [None]:
# Print the ANN scores as percentages and draw func_cm as an annotated heatmap.
# The Precision / Recall labels were swapped relative to the variables they
# printed (func_rec was shown as precision and func_prec as recall); each label
# now matches its variable.
# NOTE(review): relies on kp.test_ann_model returning (acc, rec, prec, f1, cm)
# in that order, as the unpacking in the previous cell assumes.
print('Accuracy: %.3f ' % (func_acc*100))
print('Precision: %.3f ' % (func_prec*100))
print('Recall: %.3f ' % (func_rec*100))
print('F1 Score: %.3f ' % (func_f1*100))
sns.heatmap(func_cm, annot=True, fmt='d')
#plt.savefig('study_case/func_ann_cm.png')

In [None]:
plot_model(func_model, to_file='funcAPI.png', show_shapes=True)