In [1]:
#library untuk manipulasi data
import pandas as pd
#library untuk naive bayes
from sklearn.naive_bayes import GaussianNB
#library untuk mengubah data kategorikal menjadi numeric
from sklearn.preprocessing import LabelEncoder

In [2]:
#import datasetnya
df = pd.read_csv('post-operative.csv')
df

Unnamed: 0,L-CORE,L-SURF,L-O2,L-BP,SURF-STBL,CORE-STBL,BP-STBL,COMFORT,ADM-DECS
0,mid,low,excellent,mid,stable,stable,stable,15.0,A
1,mid,high,excellent,high,stable,stable,stable,10.0,S
2,high,low,excellent,high,stable,stable,mod-stable,10.0,A
3,mid,low,good,high,stable,unstable,mod-stable,15.0,A
4,mid,mid,excellent,high,stable,stable,stable,10.0,A
...,...,...,...,...,...,...,...,...,...
85,mid,mid,excellent,mid,unstable,stable,stable,10.0,A
86,mid,mid,excellent,mid,unstable,stable,stable,15.0,S
87,mid,mid,good,mid,unstable,stable,stable,15.0,A
88,mid,mid,excellent,mid,unstable,stable,stable,10.0,A


In [3]:
#informasi terkait datasetnya
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90 entries, 0 to 89
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   L-CORE     90 non-null     object 
 1   L-SURF     90 non-null     object 
 2   L-O2       90 non-null     object 
 3   L-BP       90 non-null     object 
 4   SURF-STBL  90 non-null     object 
 5   CORE-STBL  90 non-null     object 
 6   BP-STBL    90 non-null     object 
 7   COMFORT    87 non-null     float64
 8   ADM-DECS   90 non-null     object 
dtypes: float64(1), object(8)
memory usage: 6.5+ KB


In [4]:
#menampilkan jumlah NA di tiap atribut
df.isna().sum()

L-CORE       0
L-SURF       0
L-O2         0
L-BP         0
SURF-STBL    0
CORE-STBL    0
BP-STBL      0
COMFORT      3
ADM-DECS     0
dtype: int64

Dari In[4] dapat diketahui bahwa pada kolom COMFORT terdapat 3 missing value, sehingga kolom tersebut akan di proses lebih lanjut pada tahap CLEANING

# Cleaning data

In [5]:
#cleaning data, mengisi NaN dengan metode don't care pada atribut comfort
df.COMFORT.fillna(10, inplace = True)
df

Unnamed: 0,L-CORE,L-SURF,L-O2,L-BP,SURF-STBL,CORE-STBL,BP-STBL,COMFORT,ADM-DECS
0,mid,low,excellent,mid,stable,stable,stable,15.0,A
1,mid,high,excellent,high,stable,stable,stable,10.0,S
2,high,low,excellent,high,stable,stable,mod-stable,10.0,A
3,mid,low,good,high,stable,unstable,mod-stable,15.0,A
4,mid,mid,excellent,high,stable,stable,stable,10.0,A
...,...,...,...,...,...,...,...,...,...
85,mid,mid,excellent,mid,unstable,stable,stable,10.0,A
86,mid,mid,excellent,mid,unstable,stable,stable,15.0,S
87,mid,mid,good,mid,unstable,stable,stable,15.0,A
88,mid,mid,excellent,mid,unstable,stable,stable,10.0,A


In [6]:
#pengecekan apakah masih ada NaN
df.isna().sum()

L-CORE       0
L-SURF       0
L-O2         0
L-BP         0
SURF-STBL    0
CORE-STBL    0
BP-STBL      0
COMFORT      0
ADM-DECS     0
dtype: int64

Setelah tahap CLEANING selesai, lanjut ke tahap TRANSFORMASI. Pada tahap ini data pada kolom COMFORT diubah menjadi kategorikal, berdasarkan kondisi tertentu 

# data Transformation

In [7]:
#mengubah data numerik menjadi kategorikal berdasarkan kondisi tertentu
for index, row in df.iterrows():
    if row['COMFORT'] >10:
        df.loc[index, 'COMFORT'] = 'excellent'
    elif row['COMFORT'] >5:
        df.loc[index, 'COMFORT'] = 'good'
    else:
        df.loc[index, 'COMFORT'] = 'fair'
    

df

Unnamed: 0,L-CORE,L-SURF,L-O2,L-BP,SURF-STBL,CORE-STBL,BP-STBL,COMFORT,ADM-DECS
0,mid,low,excellent,mid,stable,stable,stable,excellent,A
1,mid,high,excellent,high,stable,stable,stable,good,S
2,high,low,excellent,high,stable,stable,mod-stable,good,A
3,mid,low,good,high,stable,unstable,mod-stable,excellent,A
4,mid,mid,excellent,high,stable,stable,stable,good,A
...,...,...,...,...,...,...,...,...,...
85,mid,mid,excellent,mid,unstable,stable,stable,good,A
86,mid,mid,excellent,mid,unstable,stable,stable,excellent,S
87,mid,mid,good,mid,unstable,stable,stable,excellent,A
88,mid,mid,excellent,mid,unstable,stable,stable,good,A


Setelah proses preprocessing selesai, kemudian masuk ke tahap Klasifikasi. Disini menggunakan algoritma Naive Bayes. Hal pertama yang dilakukan yaitu mengubah data kategorikal menjadi numerik, di semua kolom.

# Klafisikasi dengan Naive Bayes

In [8]:
#mengubah kategorikal menjadi numerik (0,1,2,...)
number = LabelEncoder()
df['L-CORE'] = number.fit_transform(df['L-CORE'])
df['L-SURF'] = number.fit_transform(df['L-SURF'])
df['L-O2'] = number.fit_transform(df['L-O2'])
df['L-BP'] = number.fit_transform(df['L-BP'])
df['SURF-STBL'] = number.fit_transform(df['SURF-STBL'])
df['CORE-STBL'] = number.fit_transform(df['CORE-STBL'])
df['BP-STBL'] = number.fit_transform(df['BP-STBL'])
df['COMFORT'] = number.fit_transform(df['COMFORT'])
df['ADM-DECS'] = number.fit_transform(df['ADM-DECS'])

In [9]:
#hasil data setelah diubah menjadi numerik
df

Unnamed: 0,L-CORE,L-SURF,L-O2,L-BP,SURF-STBL,CORE-STBL,BP-STBL,COMFORT,ADM-DECS
0,2,1,0,2,0,1,1,0,0
1,2,0,0,0,0,1,1,2,2
2,0,1,0,0,0,1,0,2,0
3,2,1,1,0,0,2,0,0,0
4,2,2,0,0,0,1,1,2,0
...,...,...,...,...,...,...,...,...,...
85,2,2,0,2,1,1,1,2,0
86,2,2,0,2,1,1,1,0,2
87,2,2,1,2,1,1,1,0,0
88,2,2,0,2,1,1,1,2,0


In [10]:
#menentukan atribute dan target untuk model naive bayes
attributes = ['L-CORE','L-SURF','L-O2','L-BP','SURF-STBL','CORE-STBL','BP-STBL','COMFORT']
target = ["ADM-DECS"]

In [11]:
x = df[attributes]
y = df[target]

Kemudian setelah dilakukan proses penentuan kolom mana saja yang menjadi atribute dan target / kelas, maka dilanjutkan dengan pembagian data training dan data testing. Pembagian yang dilakukan disini memiliki ratio 60:40 untuk data training : data testing

In [12]:
#untuk membagi data training dan tes
from sklearn.model_selection import train_test_split

#membagi data menjadi set training dan test
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4)

In [13]:
print("ukuran data train", x_train.shape, y_train.shape)
print("ukuran data test", x_test.shape, y_test.shape)

ukuran data train (54, 8) (54, 1)
ukuran data test (36, 8) (36, 1)


In [14]:
#model klasifikasi dengan Naive Bayes, pada data training
model = GaussianNB()

In [15]:
model.fit(x_train, y_train)

  return f(*args, **kwargs)


GaussianNB()

In [16]:
#menampilkan skor akurasi dari prediksi pada data testing
model.score(x_test, y_test)

0.75

Setelah proses Naive Bayes selesai, kemudian dilanjutkan dengan proses pengujian Confusion Matrix.

# Pengujian dengan Confusion Matrix

In [17]:
#import library untuk melakukan pengujian confusion matrix
from sklearn.metrics import confusion_matrix, classification_report 

In [18]:
#melakukan pengujian confusion matrix untuk tiap kali testing
Prediksi = model.predict(x_test)
confusion_matrix(Prediksi, y_test)

array([[25,  2,  6],
       [ 0,  0,  0],
       [ 1,  0,  2]], dtype=int64)

In [19]:
#menampilkan hasil precision, reacl, f-1score, support, beserta akurasi score nya
print(classification_report(Prediksi, y_test))

              precision    recall  f1-score   support

           0       0.96      0.76      0.85        33
           1       0.00      0.00      0.00         0
           2       0.25      0.67      0.36         3

    accuracy                           0.75        36
   macro avg       0.40      0.47      0.40        36
weighted avg       0.90      0.75      0.81        36



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
#hasil prediksi kelas untuk data testing
Prediksi

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0])