In [None]:
# Keegan Saunders
# CS 334 - HW6

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts
from sklearn import metrics

In [None]:
path = "house-votes-84.csv"
cols = ['class-name', 'handicapped-infants', 'water-project-cost-sharing', 'adoption-of-the-budget-resolution', 
        'physician-fee-freeze', 'el-salvador-aid', 'religious-groups-in-schools', 'anti-satellite-test-ban', 
        'aid-to-nicaraguan-contras', 'mx-missile', 'immigration', 'synfuels-corporation-cutback', 'education-spending', 
        'superfund-right-to-sue', 'crime', 'duty-free-exports', 'export-administration-act-south-africa']

# Every column except the party name is a vote used as a model feature.
features = cols[1:]

# header=0 discards the first row of the file and uses `cols` as the column
# names instead (same effect as reading it as data and dropping it afterwards).
df = pd.read_csv(path, names=cols, header=0)

# Binary label encoding: republican = 0, anything else (democrat) = 1.
# The comparison is vectorized and yields an integer column; an object-dtype
# label column is rejected later by sklearn.metrics as an "unknown" target type.
df['label'] = (df['class-name'] != 'republican').astype(int)

df = df.drop(columns='class-name')
df.head()

In [None]:
# Encode each vote numerically: 'y' -> 1, 'n' -> 0, anything else -> 0.5
# (0.5 places an unknown vote halfway between yes and no).
VOTE_CODES = {'y': 1.0, 'n': 0.0}

for col in features:
    votes = df[col]
    # Skip columns that are already numeric: re-running this cell would
    # otherwise map every (no longer 'y'/'n') value to 0.5.
    if not pd.api.types.is_numeric_dtype(votes):
        # Whole-column mapping replaces the cell-by-cell .loc loop and
        # leaves a proper float column instead of an object column.
        df[col] = votes.map(lambda vote: VOTE_CODES.get(vote, 0.5)).astype(float)

df.head()

In [None]:
class Node:
    """One node of a binary decision tree.

    A node is a leaf while ``left`` is None; otherwise samples whose value at
    ``feature_index`` is below ``threshold`` go to ``left`` and the rest go to
    ``right``.

    Parameters
    ----------
    pred : object, optional
        Majority class of the training samples that reached this node.
    predicted_class : object, optional
        Keyword alias for ``pred``; takes precedence when both are given.
    """

    def __init__(self, pred=None, predicted_class=None):
        if predicted_class is None:
            predicted_class = pred
        self.left = None
        self.right = None
        # Both attribute names are kept so that code using either spelling works.
        self.pred = predicted_class
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.threshold = 0


class DecisionTreeClassifier:
    """CART-style decision tree classifier using Gini impurity.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum number of splits on any root-to-leaf path. ``None`` grows the
        tree until no split lowers the impurity.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        """Build the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix.
        y : array-like of shape (n_samples,)
            Class labels; any sortable values (ints, strings, ...).

        Raises
        ------
        ValueError
            If X is not 2-D, if X and y disagree in length, or if there are
            no samples.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y).ravel()
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        if X.shape[0] != y.shape[0]:
            raise ValueError("X and y must contain the same number of samples")
        if y.shape[0] == 0:
            raise ValueError("cannot fit a tree on zero samples")

        # Map arbitrary labels onto 0..k-1 so the class counters can be plain
        # lists; len(set(y)) alone is only correct when labels are exactly 0..k-1.
        self.classes_, y_index = np.unique(y, return_inverse=True)
        y_index = np.asarray(y_index).ravel()
        self.num_classes = len(self.classes_)
        self.num_features = X.shape[1]
        self.tree_ = self._expand(X, y_index)

    def predict(self, X):
        """Return a list with the predicted label for every row of X."""
        X = np.asarray(X, dtype=float)
        return [self._predict(inputs) for inputs in X]

    def _find_split(self, X, y):
        """Find the split that minimises the weighted Gini impurity.

        ``y`` must hold integer class indices in ``range(self.num_classes)``.

        Returns
        -------
        (best_idx, best_thr)
            Column index and threshold of the best split, or ``(None, None)``
            when no split improves on the impurity of the unsplit node.
        """
        # Cannot split fewer than 2 samples.
        m = y.size
        if m <= 1:
            return None, None

        # Per-class sample counts and impurity of the unsplit node.
        num_parent = [int(np.sum(y == c)) for c in range(self.num_classes)]
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None

        for idx in range(self.num_features):
            # Sort the samples by this feature so every candidate split is a
            # prefix/suffix partition of the sorted order.
            order = np.argsort(X[:, idx], kind="stable")
            thresholds = X[order, idx]
            classes = y[order]
            num_left = [0] * self.num_classes
            num_right = num_parent.copy()

            for i in range(1, m):
                # Move sample i-1 from the right partition to the left one.
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1

                # Equal neighbouring values cannot be separated by a threshold.
                if thresholds[i] == thresholds[i - 1]:
                    continue

                gini_left = 1.0 - sum(
                    (num_left[x] / i) ** 2 for x in range(self.num_classes)
                )
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - i)) ** 2 for x in range(self.num_classes)
                )
                # Impurity of the split = size-weighted mean of both sides.
                gini = (i * gini_left + (m - i) * gini_right) / m

                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2

        return best_idx, best_thr

    def _expand(self, X, y, depth=0):
        """Recursively grow the subtree for the samples (X, y).

        ``y`` holds integer class indices; the node stores the corresponding
        original label from ``self.classes_``.
        """
        class_samples = [np.sum(y == i) for i in range(self.num_classes)]
        majority_index = int(np.argmax(class_samples))
        node = Node(predicted_class=self.classes_[majority_index])

        # max_depth=None means "no limit"; comparing an int with None raises.
        if self.max_depth is None or depth < self.max_depth:
            idx, thr = self._find_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                X_left, y_left = X[indices_left], y[indices_left]
                X_right, y_right = X[~indices_left], y[~indices_left]
                node.feature_index = idx
                node.threshold = thr
                node.left = self._expand(X_left, y_left, depth + 1)
                node.right = self._expand(X_right, y_right, depth + 1)
        return node

    def _predict(self, inputs):
        """Walk a single sample down the tree and return its leaf's label."""
        node = self.tree_
        while node.left is not None:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class

In [None]:
# Cast explicitly: object-dtype arrays make the tree's arithmetic slow and
# make sklearn.metrics treat the labels as an "unknown" target type.
X = df[features].to_numpy(dtype=float)
y = df['label'].to_numpy(dtype=int)

# 70/30 split; random_state fixes the shuffle so the result is reproducible.
X_train, X_test, y_train, y_test = tts(X, y, test_size=0.3, random_state=1)

In [None]:
clf = DecisionTreeClassifier(max_depth=2)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# accuracy_score refuses to compare targets of different "types": an
# object-dtype label array is classified as "unknown" while a list of ints is
# "binary", even when both only contain 0 and 1. Casting both sides to
# integer arrays makes them directly comparable.
print("Accuracy:", metrics.accuracy_score(np.asarray(y_test).astype(int),
                                          np.asarray(y_pred).astype(int)))