In [1]:
#Import library numpy, pandas dan scikit-learn
import numpy as np
import pandas as pd
from sklearn import tree

In [2]:
#Membaca dan menampilkan Dataset dari File ke Pandas dataFrame
data = pd.read_csv('exams.csv', delimiter=';', header=0)
data.head()

Unnamed: 0,gender,ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group D,some college,standard,completed,59.0,70,78
1,male,group D,associate's degree,standard,none,96.0,93,87
2,female,group D,some college,free/reduced,none,57.0,76,77
3,male,group B,some college,free/reduced,none,70.0,70,63
4,,group D,associate's degree,standard,none,83.0,85,86


In [3]:
#Mengecek atribut manasaja yang memiliki missing value
data.isnull().sum()

gender                         23
ethnicity                       0
parental_level_of_education     0
lunch                           8
test_preparation_course         0
math_score                      4
reading_score                   0
writing_score                   0
dtype: int64

In [4]:
#Mengisi missing value pada atribut 'math_score' karena atribut itu yang nanti kita akan gunakan
from sklearn.impute import SimpleImputer
imputer = SimpleImputer(strategy='mean')
data['math_score'] = imputer.fit_transform(data[['math_score']])

In [5]:
#Mengecek atribut manasaja yang memiliki missing value
data.isnull().sum()

gender                         23
ethnicity                       0
parental_level_of_education     0
lunch                           8
test_preparation_course         0
math_score                      0
reading_score                   0
writing_score                   0
dtype: int64

In [6]:
#Mengubah kelas (kolom "Species") dari String ke Unique-Integer
data["test_preparation_course"] = pd.factorize(data.test_preparation_course)[0]

In [7]:
#Menghapus kolom "gender", "ethnicity", "parental_level_of_education", "lunch"
data = data.drop(labels="gender", axis=1)
data = data.drop(labels="ethnicity", axis=1)
data = data.drop(labels="parental_level_of_education", axis=1)
data = data.drop(labels="lunch", axis=1)
data.head()

Unnamed: 0,test_preparation_course,math_score,reading_score,writing_score
0,0,59.0,70,78
1,1,96.0,93,87
2,1,57.0,76,77
3,1,70.0,70,63
4,1,83.0,85,86


In [8]:
#Mengubah dataFrame ke array Numpy
data = data.to_numpy()

In [9]:
#Menampilkan data
print(data)

[[ 0.         59.         70.         78.        ]
 [ 1.         96.         93.         87.        ]
 [ 1.         57.         76.         77.        ]
 ...
 [ 0.         67.         86.         86.        ]
 [ 1.         80.         72.         62.        ]
 [ 1.         67.90160643 47.         45.        ]]


In [10]:
#Membagi Dataset => 800 baris data untuk training dan 200 baris data untuk testing
dataTraining = np.concatenate((data[0:400, :], data[500:900, :]), 
                              axis=0)
dataTesting = np.concatenate((data[400:500, :], data[900:1000, :]), 
                             axis=0)


In [11]:
#Memecah Dataset ke Input dan Label
inputTraining = dataTraining[:, 0:4]
inputTesting = dataTesting[:, 0:4]
labelTraining = dataTraining[:, 0]
labelTesting = dataTesting[:, 0]

In [12]:
#Mendefinisikan Decision Tree Classifier
model = tree.DecisionTreeClassifier()

In [13]:
#Mentraining Model
model = model.fit(inputTraining, labelTraining)

In [14]:
#Memprediksi Input Data Testing
hasilPrediksi = model.predict(inputTesting)
print("Label Sebenarnya : ", labelTesting)
print("Hasil Prediksi : ", hasilPrediksi)

Label Sebenarnya :  [1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1.
 0. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0.
 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1. 1. 1. 1. 1.
 1. 0. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1.
 1. 1. 0. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 0. 0. 0. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 0. 0. 0. 1. 1. 1. 1. 1. 1. 0. 1. 0. 1. 1. 1. 0. 1. 0. 1. 1.
 0. 1. 1. 1. 1. 0. 1. 1.]
Hasil Prediksi :  [1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 0. 1. 0. 0. 1.
 0. 1. 1. 0. 1. 1. 1. 1. 0. 1. 1. 0. 0. 1. 0. 1. 1. 1. 1. 0. 0. 0. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 0. 1. 1. 1. 0. 0. 1. 1. 0. 1. 0.
 0. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 0. 0. 0. 1. 0. 1. 1. 0. 1.
 1. 1. 1. 1. 1. 1. 0. 1. 1. 1. 1. 0. 1. 0. 0. 0. 0. 1. 0. 1.

In [15]:
#Menghitung Akurasi
prediksiBenar = (hasilPrediksi == labelTesting).sum()
prediksiSalah = (hasilPrediksi != labelTesting).sum()
print("Prediksi Benar :", prediksiBenar, "data")
print("Prediksi Salah :", prediksiSalah, "data")
print("Akurasi :", prediksiBenar/(prediksiBenar+prediksiSalah) * 100, "%")

Prediksi Benar : 200 data
Prediksi Salah : 0 data
Akurasi : 100.0 %
