***Oğuzhan Nejat Karabaş***



# Gerekli Ortamların Kurulması

In [145]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
from graphviz import Digraph


# Veri setinin yüklenmesi ve ön işleme işlemleri


In [146]:
# Load the training and test sets from their CSV files.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('trainSet.csv', 'testSet.csv')
)

In [147]:
# Let's take a look at our datasets (training set first).
train_data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,b,25.17,3.500,u,g,cc,v,0.625,t,t,7,f,g,0.0,7059,good
1,a,20.67,3.000,u,g,q,v,0.165,t,t,3,f,g,100.0,6,good
2,a,28.58,1.665,u,g,q,v,2.415,t,f,0,t,g,440.0,0,bad
3,a,24.33,2.500,y,p,i,bb,4.500,f,f,0,f,g,200.0,456,bad
4,b,36.67,4.415,y,p,k,v,0.250,t,t,10,t,g,320.0,0,good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
485,a,17.92,10.210,u,g,ff,ff,0.000,f,f,0,f,g,0.0,50,bad
486,a,32.00,6.000,u,g,d,v,1.250,f,f,0,f,g,272.0,0,bad
487,b,23.42,0.585,u,g,c,h,0.085,t,f,0,f,g,180.0,0,bad
488,b,22.67,10.500,u,g,q,h,1.335,t,f,0,f,g,100.0,0,good


In [148]:
# Display the test set.
test_data

Unnamed: 0,A1,A2,A3,A4,A5,A6,A7,A8,A9,A10,A11,A12,A13,A14,A15,class
0,a,57.080000,19.500,u,g,c,v,5.500,t,t,7,f,g,0.0,3000,good
1,b,34.420000,4.250,u,g,i,bb,3.250,t,t,2,f,g,274.0,610,good
2,b,23.920000,0.665,u,g,c,v,0.165,f,f,0,f,g,100.0,0,good
3,b,49.580000,19.000,u,g,ff,ff,0.000,t,t,1,f,g,94.0,0,bad
4,b,28.580000,3.625,u,g,aa,v,0.250,f,f,0,t,g,100.0,0,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,a,38.580000,5.000,u,g,cc,v,13.500,t,f,0,t,g,980.0,0,bad
196,b,31.568171,0.500,u,g,c,bb,0.835,t,f,0,t,s,320.0,0,bad
197,b,26.000000,1.000,u,g,q,v,1.750,t,f,0,t,g,280.0,0,good
198,b,31.830000,0.040,y,p,m,v,0.040,f,f,0,f,g,0.0,0,bad


# Kategorik Değişkenleri Sayısallaştırma
- Kategorik değişkenler veri kümemizde bulunan sayısal olmayan değerlerdir.
Modelimizin daha doğru sonuçlar vermesi için LabelEncoder() kullanarak bu değişkenleri sayısallaştıracağız.

In [149]:
# NOTE: cat_cols is not read by the encoding loop below; the columns to encode
# are detected from their dtype instead.
cat_cols = ['A1', 'A2', 'A3', 'A4',
            'A5', 'A6', 'A7', 'A8',
            'A9', 'A10', 'A11', 'A12', 
            'A13','A14','A15','class']
le = LabelEncoder()
# Only object-dtype columns are encoded. The list is computed up front, which is
# safe because encoding one column never changes the dtype of another.
object_columns = [name for name in train_data.columns
                  if train_data[name].dtype == 'object']
for col in object_columns:
    # Fit on the training column and encode it in one step ...
    train_data[col] = le.fit_transform(train_data[col].values)
    # ... then reuse the same mapping for the test column.
    test_data[col] = le.transform(test_data[col])

In [150]:
# The last column is the target; every column before it is used as a feature.
X_train, y_train = train_data.iloc[:, :-1].values, train_data.iloc[:, -1].values
X_test, y_test = test_data.iloc[:, :-1].values, test_data.iloc[:, -1].values

# Karar Ağacı Modelimizin Oluşturulması


In [151]:
class DecisionTreeClassifier:
    """Decision tree classifier that chooses splits by minimum weighted Gini impurity.

    After ``fit`` the tree is stored in ``self.tree``. An internal node is a dict
    with the keys ``'feature'`` (column index), ``'threshold'``, ``'left'`` and
    ``'right'``; samples with ``x[feature] < threshold`` go left, all others go
    right. A leaf is stored directly as the predicted class label.

    Class labels must be non-negative integers because the majority class is
    computed with ``np.bincount``.

    Parameters
    ----------
    max_depth : int or None
        Depth at which growth stops. ``None`` never triggers the depth criterion.
    min_samples_split : int
        A node with fewer samples than this becomes a leaf.
    min_samples_leaf : int
        A candidate split is rejected if either side would hold fewer samples
        than this.
    """

    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.tree = None
    
    def fit(self, X, y):
        """Build the tree from a 2-D feature array ``X`` and a 1-D label array ``y``."""
        self.tree = self._build_tree(X, y)
    
    def predict(self, X):
        """Return one predicted label per row of ``X`` as a float array."""
        y_pred = np.zeros(X.shape[0])
        for i, x in enumerate(X):
            y_pred[i] = self._traverse_tree(x, self.tree)
        return y_pred
    
    def _majority_label(self, y):
        """Return the most frequent label in ``y`` (ties go to the smallest label)."""
        return np.argmax(np.bincount(y))
    
    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``X``/``y`` at ``depth``."""
        num_samples, num_features = X.shape
        num_labels = len(np.unique(y))
        
        # Stopping criteria: depth limit reached, node is pure, or too few samples.
        if (depth == self.max_depth or 
            num_labels == 1 or 
            num_samples < self.min_samples_split):
            return self._majority_label(y)
        
        # Search every feature/threshold pair for the best split.
        best_feature, best_threshold = self._find_best_split(X, y, num_samples, num_features)
        
        # No admissible split exists (e.g. every candidate violates
        # min_samples_leaf, or all feature values are identical). Without this
        # guard the None feature index would be used to index X and crash.
        if best_feature is None:
            return self._majority_label(y)
        
        # Partition the samples and grow both children.
        left_indices = X[:, best_feature] < best_threshold
        right_indices = ~left_indices
        left_tree = self._build_tree(X[left_indices], y[left_indices], depth+1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], depth+1)
        
        # Internal node.
        return {'feature': best_feature, 'threshold': best_threshold, 
                'left': left_tree, 'right': right_tree}
    
    def _find_best_split(self, X, y, num_samples, num_features):
        """Return ``(feature, threshold)`` with the lowest weighted Gini impurity.

        Every unique value of every feature is tried as a threshold. Returns
        ``(None, None)`` when no candidate leaves at least ``min_samples_leaf``
        samples on both sides.
        """
        best_gini = float('inf')
        best_feature = None
        best_threshold = None
        
        for feature in range(num_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = X[:, feature] < threshold
                right_indices = ~left_indices
                # Reject splits that would create an undersized child.
                if (np.sum(left_indices) < self.min_samples_leaf or 
                    np.sum(right_indices) < self.min_samples_leaf):
                    continue
                left_labels = y[left_indices]
                right_labels = y[right_indices]
                # Impurity of each child weighted by its share of the samples.
                gini = (len(left_labels)/num_samples)*self._gini_impurity(left_labels) + \
                       (len(right_labels)/num_samples)*self._gini_impurity(right_labels)
                # Strict '<' keeps the first candidate found among equal scores.
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold
                    
        return best_feature, best_threshold
    
    def _gini_impurity(self, labels):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of ``labels``."""
        _, counts = np.unique(labels, return_counts=True)
        probabilities = counts / len(labels)
        return 1 - np.sum(probabilities**2)
    
    def _traverse_tree(self, x, tree):
        """Follow ``x`` down ``tree`` and return the label of the leaf it reaches."""
        # Anything that is not a dict is a leaf holding the predicted label.
        if type(tree) != dict:
            return tree
        feature_value = x[tree['feature']]
        if feature_value < tree['threshold']:
            return self._traverse_tree(x, tree['left'])
        else:
            return self._traverse_tree(x, tree['right'])


In [152]:
# Train the model. (A default-constructed instance used to be created here and
# immediately overwritten; only the configured one is needed.)
dtc = DecisionTreeClassifier(max_depth=3, min_samples_split=10, min_samples_leaf=5)
dtc.fit(X_train, y_train)

In [153]:
# Evaluate the model on the trainSet data.
y_train_pred = dtc.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
# For two classes the matrix is laid out as [[tn, fp], [fn, tp]].
(tn_train, fp_train), (fn_train, tp_train) = confusion_matrix(y_train, y_train_pred)
tnr_train = tn_train / (tn_train + fp_train)
tpr_train = tp_train / (tp_train + fn_train)

In [154]:
# Evaluate the model on the testSet data.
y_test_pred = dtc.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
# For two classes the matrix is laid out as [[tn, fp], [fn, tp]].
(tn_test, fp_test), (fn_test, tp_test) = confusion_matrix(y_test, y_test_pred)
tnr_test = tn_test / (tn_test + fp_test)
tpr_test = tp_test / (tp_test + fn_test)

# Sonuçlarımızın yazdırılması ve .txt dosyasına kaydedilmesi


In [155]:
# Build the report text once so the saved file and the notebook output cannot
# drift apart. Each section ends with a newline and sections are separated by a
# blank line.
report_sections = [
    ("Eğitim Sonuçları:", train_accuracy, tpr_train, tnr_train, tp_train, tn_train),
    ("Test Sonuçları:", test_accuracy, tpr_test, tnr_test, tp_test, tn_test),
]
report = "\n".join(
    f"{title}\n"
    f"Accuracy: {accuracy}\n"
    f"True Positive Rate: {tpr}\n"
    f"True Negative Rate: {tnr}\n"
    f"True Positive Adedi: {tp}\n"
    f"True Negative Adedi: {tn}\n"
    for title, accuracy, tpr, tnr, tp, tn in report_sections
)

# The report contains non-ASCII characters, so the encoding is fixed explicitly
# instead of depending on the platform's default.
with open("performans_olcumleri.txt", "w", encoding="utf-8") as f:
    f.write(report)

# Show the training and test results in the notebook as well (the file is
# already closed at this point).
print(report)

Eğitim Sonuçları:
Accuracy: 0.8673469387755102
True Positive Rate: 0.9567307692307693
True Negative Rate: 0.8014184397163121
True Positive Adedi: 199
True Negative Adedi: 226

Test Sonuçları:
Accuracy: 0.81
True Positive Rate: 0.9292929292929293
True Negative Rate: 0.693069306930693
True Positive Adedi: 92
True Negative Adedi: 70



# Karar Ağacı Modelimizin Çizdirilmesi ve .png dosyasına kaydedilmesi


In [156]:
def draw_tree(tree, feature_names, filename='decision_tree.gv', fmt='png'):
    """Build a graphviz ``Digraph`` for a fitted decision tree.

    Parameters
    ----------
    tree : dict or leaf label
        Nested tree as stored in ``DecisionTreeClassifier.tree``.
    feature_names : sequence of str
        Name for each feature index used in the tree's ``'feature'`` entries.
    filename : str
        Graphviz source filename passed to ``Digraph``; defaults to the
        previously hard-coded ``'decision_tree.gv'``.
    fmt : str
        Output format passed to ``Digraph``; defaults to ``'png'``.

    Returns
    -------
    Digraph
        The populated graph; nothing is written to disk until the caller
        renders it.
    """
    # Digraph comes from the notebook's top-level import cell.
    g = Digraph('G', filename=filename, format=fmt)
    # The root node is named '0'; children append '0' (left) or '1' (right).
    draw_node(g, '0', tree, feature_names)
    return g

def draw_node(g, name, tree, feature_names):
    """Add the subtree ``tree`` to graph ``g``, rooted at the node id ``name``.

    A leaf (anything that is not a dict) is drawn as an ellipse showing its
    label. An internal node shows the feature name and threshold on two lines,
    and its children get the ids ``name + '0'`` (left, edge ``<``) and
    ``name + '1'`` (right, edge ``>=``).
    """
    if type(tree) is not dict:
        g.node(name, label=str(tree), shape='ellipse')
        return

    caption = '\n'.join((feature_names[tree['feature']], str(tree['threshold'])))
    g.node(name, label=caption)

    # (id suffix, key in the node dict, edge label) for each child.
    branches = (('0', 'left', '<'), ('1', 'right', '>='))
    # Both subtrees are emitted first, then the edges from this node.
    for suffix, side, _ in branches:
        draw_node(g, name + suffix, tree[side], feature_names)
    for suffix, _, relation in branches:
        g.edge(name, name + suffix, label=relation)

# Take the feature names from the same columns that form X_train (every column
# except the last), so a node's feature index always maps to the right name.
# The previous hard-coded list included the target column 'class' and was not
# tied to the DataFrame's actual column layout.
feature_names = [str(column) for column in train_data.columns[:-1]]
tree_graph = draw_tree(dtc.tree, feature_names)

# Write the graph source and the rendered image to disk.
tree_graph.render()

'decision_tree.gv.png'