In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from ML_utils import load_data,data_split,load_data_test_set
from sklearn.tree import DecisionTreeClassifier
import numpy as np
# Opt in to pandas' future behaviour regarding silent downcasting.
pd.set_option('future.no_silent_downcasting', True)

# Read the raw training table and hand it to the project helper.
# NOTE(review): `load_data` lives in ML_utils; the meaning of ['state'] and
# `one_hot` is not visible here -- confirm against that module.
df = pd.read_csv('train.csv')
data = load_data(df, ['state'], one_hot=True)

# Partition the prepared array into train / validation / test subsets.
data_train, data_valid, data_test = data_split(data)

# Features are every column except the last one, cast to float32.
X_train = data_train[:, :-1].astype(np.float32)
X_val = data_valid[:, :-1].astype(np.float32)
X_test = data_test[:, :-1].astype(np.float32)

# Labels are the last column, cast to float32. The training labels are kept
# flat, whereas validation/test labels are shaped as a single column.
y_train = data_train[:, -1].astype(np.float32).reshape(-1,)
y_val = data_valid[:, -1].astype(np.float32).reshape(-1, 1)
y_test = data_test[:, -1].astype(np.float32).reshape(-1, 1)
class DecisionTree:
    """Binary decision tree classifier grown greedily with Gini impurity.

    Internal nodes are dicts with the keys ``'feature_index'``,
    ``'threshold'``, ``'left'`` and ``'right'``; leaves are bare class labels.
    Samples with ``x[feature_index] <= threshold`` go to the left child,
    all others to the right child.

    Parameters
    ----------
    max_depth : int or None, default None
        Maximum number of split levels. ``None`` means the tree grows until
        every leaf is pure or cannot be split any further.
    min_samples_split : int, default 2
        Nodes with fewer samples than this become leaves.
    """

    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def _gini_index(self, y):
        """Return the Gini impurity ``1 - sum(p_k ** 2)`` of the labels ``y``."""
        if len(y) == 0:
            # An empty node carries no impurity (and has zero weight anyway).
            return 0.0
        _, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def _majority_class(self, y):
        """Return the most frequent label in ``y`` (smallest label on ties)."""
        classes, counts = np.unique(y, return_counts=True)
        return classes[np.argmax(counts)]

    def _split_dataset(self, X, y, feature_index, threshold):
        """Split ``X``/``y`` on ``X[:, feature_index] <= threshold``.

        Returns ``(X_left, y_left, X_right, y_right)``.
        """
        column = X[:, feature_index]
        left_mask = column <= threshold
        right_mask = column > threshold
        return X[left_mask], y[left_mask], X[right_mask], y[right_mask]

    def _find_best_split(self, X, y):
        """Search every feature/midpoint pair for the lowest weighted Gini.

        Returns ``(feature_index, threshold)``, or ``(None, None)`` when no
        split separates the samples into two non-empty groups.
        """
        best_gini = np.inf
        best_feature_index = None
        best_threshold = None
        n_samples = len(y)

        for feature_index in range(X.shape[1]):
            column = X[:, feature_index]
            unique_values = np.unique(column)
            # Candidate thresholds are midpoints between consecutive values.
            thresholds = (unique_values[:-1] + unique_values[1:]) / 2

            for threshold in thresholds:
                # Only the labels are needed to score a split, so avoid
                # copying the feature matrix for every candidate.
                y_left = y[column <= threshold]
                y_right = y[column > threshold]

                # A midpoint can round onto one of its neighbours in low
                # precision and leave one side empty; such a split makes no
                # progress and must never be selected.
                if len(y_left) == 0 or len(y_right) == 0:
                    continue

                gini = (
                    len(y_left) * self._gini_index(y_left)
                    + len(y_right) * self._gini_index(y_right)
                ) / n_samples

                if gini < best_gini:
                    best_gini = gini
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the tree and return its root (dict or leaf label)."""
        classes = np.unique(y)
        if len(classes) == 1:
            return classes[0]

        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or len(y) < self.min_samples_split:
            return self._majority_class(y)

        best_feature_index, best_threshold = self._find_best_split(X, y)
        if best_feature_index is None:
            # Identical feature rows with conflicting labels cannot be
            # separated; fall back to a majority-vote leaf instead of crashing.
            return self._majority_class(y)

        X_left, y_left, X_right, y_right = self._split_dataset(
            X, y, best_feature_index, best_threshold
        )

        return {
            'feature_index': best_feature_index,
            'threshold': best_threshold,
            'left': self._build_tree(X_left, y_left, depth + 1),
            'right': self._build_tree(X_right, y_right, depth + 1),
        }

    def fit(self, X, y):
        """Grow the tree from a 2-D feature array ``X`` and labels ``y``.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, if there are no samples, or if ``X`` and
            ``y`` disagree on the number of samples.
        """
        X = np.asarray(X)
        y = np.asarray(y).reshape(-1)

        if X.ndim != 2:
            raise ValueError("X must be a 2-D array of shape (n_samples, n_features)")
        if len(y) == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        if X.shape[0] != len(y):
            raise ValueError("X and y must contain the same number of samples")

        self.tree = self._build_tree(X, y)

    def _predict_sample(self, x, tree):
        """Walk ``tree`` from the given node down to a leaf for sample ``x``."""
        node = tree
        while isinstance(node, dict):
            if x[node['feature_index']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node

    def predict(self, X):
        """Return an array with the predicted label for every row of ``X``.

        Raises
        ------
        RuntimeError
            If called before :meth:`fit`.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict called before fit")
        return np.array([self._predict_sample(x, self.tree) for x in X])
# Create a decision tree classifier.
dt = DecisionTree()

# Fit the model to the training data.
dt.fit(X_train, y_train)



In [12]:
# Predict the validation set and shape the result as a single column so it
# lines up element-wise with `y_val`.
dt_predictions = dt.predict(X_val).reshape(-1, 1)
def accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are flattened before comparison, so a column vector of shape
    ``(n, 1)`` and a flat vector of shape ``(n,)`` are compared element-wise
    instead of being broadcast into an ``(n, n)`` matrix.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels; must hold as many elements as ``y_true``.

    Returns
    -------
    numpy.float64
        Share of matching elements, in ``[0, 1]``.

    Raises
    ------
    ValueError
        If the inputs are empty or differ in number of elements.
    """
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()

    if y_true.size != y_pred.size:
        raise ValueError(
            "y_true and y_pred must have the same number of elements "
            f"(got {y_true.size} and {y_pred.size})"
        )
    if y_true.size == 0:
        raise ValueError("cannot compute accuracy of empty inputs")

    return np.sum(y_true == y_pred) / y_true.size
# Report the fraction of validation labels that the tree predicted correctly.
print(accuracy(y_val, dt_predictions))



0.8995290423861853


In [None]:
### USE THE MODEL ON THE TEST SET
df_test = pd.read_csv('test.csv')

# NOTE(review): `load_data_test_set` comes from ML_utils; presumably the list
# names columns that are excluded from the features -- confirm in that module.
data_test = load_data_test_set(df_test, ['state', 'id'], one_hot=True)
X_test = data_test
X_test = X_test.astype(np.float32)
y_pred = dt.predict(X_test)

# Map the numeric class label 1 to 'yes' and everything else to 'no'.
y_pred = np.where(y_pred == 1, 'yes', 'no')

# Prefer the real identifiers shipped with the test file so that every
# prediction stays attached to its own row; fall back to a 1-based running
# index only when no usable 'id' column is available.
if 'id' in df_test.columns and len(df_test) == y_pred.shape[0]:
    id_column = df_test['id'].to_numpy()
else:
    id_column = np.arange(1, y_pred.shape[0] + 1)

# Create a DataFrame
df_output = pd.DataFrame({
    'id': id_column,
    'churn': y_pred
})

# Save the DataFrame as a CSV file
df_output.to_csv('output.csv', index=False)
