In [None]:
# Mount Google Drive at /content/drive/ so files stored there can be read by
# path. `google.colab` is only importable inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, accuracy_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

class LogisticRegr:
    """L2-regularised logistic regression for 0/1 targets, usable as a node model.

    Wraps scikit-learn's ``LogisticRegression`` (``liblinear`` solver) and adds
    a fallback for training sets that contain a single class: in that case no
    solver is run and the model simply predicts that class.

    Attributes:
        model: the underlying ``LogisticRegression`` estimator.
        flag: True when the last ``fit`` saw exactly one distinct label.
        flag_y_pred: that single label while ``flag`` is True, otherwise None.
    """

    def __init__(self):
        self.model = LogisticRegression(penalty="l2", solver='liblinear')
        self.flag = False
        self.flag_y_pred = None

    def fit(self, X, y):
        """Fit on ``X``/``y``, or just remember the label if ``y`` has one class."""
        y_unique = np.unique(y)
        if len(y_unique) == 1:
            self.flag = True
            self.flag_y_pred = y_unique[0]
        else:
            # Clear any constant-prediction state left by an earlier fit so a
            # refit on two-class data actually uses the fitted estimator.
            self.flag = False
            self.flag_y_pred = None
            self.model.fit(X, y)

    def predict(self, X):
        """Return one hard class label per row of ``X``."""
        if self.flag:
            return np.full(len(X), self.flag_y_pred, dtype=int)
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return P(class == 1) for each row of ``X`` as a 1-D array.

        A 1-D ``X`` is treated as a single sample. Both code paths return the
        same shape ``(n_samples,)``; the single-class path yields 1.0 when the
        remembered label is 1 and 0.0 otherwise.
        """
        X = np.array(X)
        if X.ndim == 1:
            X = X.reshape(1, -1)

        if self.flag:
            # Same layout as the fitted path below (positive-class column only),
            # so callers can stack results from different nodes.
            return np.full(len(X), float(self.flag_y_pred == 1))
        return self.model.predict_proba(X)[:, 1]

    def loss(self, y, y_pred):
        """Log-loss of ``y_pred`` against ``y`` with the label set fixed to {0, 1}."""
        return log_loss(y, y_pred, labels=[0,1])


class ModelTree:
    """Decision tree that fits a separate model in every node.

    ``model`` is a zero-argument factory (typically a class) whose instances
    provide ``fit(X, y)``, ``predict(X)``, ``predict_proba(X)`` and
    ``loss(y, y_pred)``. A node is split on the (feature, threshold) pair whose
    two child models give the lowest sample-weighted loss, provided that loss
    is strictly lower than the node's own loss.

    Each node is a dict with the keys ``name``, ``index``, ``loss``, ``model``,
    ``data``, ``n_samples``, ``j_feature``, ``threshold``, ``children`` and
    ``depth``. Samples with ``x[j_feature] <= threshold`` belong to the left
    child, all others to the right child.

    Args:
        model: factory for node models.
        max_depth: nodes at this depth are never split.
        min_samples_leaf: minimum number of samples each child must receive;
            nodes with at most this many samples are not split either.
        verbose: print a summary line for every leaf as it is created.
    """

    def __init__(self, model, max_depth=5, min_samples_leaf=10, verbose=True):
        self.model = model
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.verbose = verbose
        self.tree = None

    def fit(self, X, y):
        """Build the tree from ``X`` (2-D) and ``y`` (1-D); return the root node dict."""
        # Column slicing and boolean masking below need real ndarrays.
        self.tree = self._build_tree(np.asarray(X), np.asarray(y))
        return self.tree

    def predict(self, X):
        """Return the leaf model's hard prediction for every row of ``X``."""
        assert self.tree is not None, "ModelTree not fitted"
        return np.array([self._predict_node(self.tree, x) for x in np.asarray(X)])

    def predict_proba(self, X):
        """Return the leaf model's ``predict_proba`` output for every row of ``X``."""
        assert self.tree is not None, "ModelTree not fitted"
        return np.array([self._proba_node(self.tree, x) for x in np.asarray(X)])

    def loss(self, y, y_pred):
        """Evaluate the node model's loss function on ``y`` / ``y_pred``."""
        return self.model().loss(y, y_pred)

    def _build_tree(self, X, y, depth=0, container=None):
        """Recursively grow the subtree for ``X``/``y`` and return its root node.

        ``container`` carries the running node counter shared by the whole
        recursion so that every node gets a unique ``index``.
        """
        if container is None:
            container = {"index_node_global": 0}

        node = self._create_node(X, y, depth, container)

        if depth >= self.max_depth or len(X) <= self.min_samples_leaf:
            if self.verbose:
                self._print_node(node, depth)
            return node

        split_result = self._split_node(node)

        if not split_result["did_split"]:
            if self.verbose:
                self._print_node(node, depth)
            return node

        (X_left, y_left), (X_right, y_right) = split_result["data"]
        # Left subtree is built first so node indices follow a pre-order walk.
        left_child = self._build_tree(X_left, y_left, depth + 1, container)
        right_child = self._build_tree(X_right, y_right, depth + 1, container)

        node.update({
            "j_feature": split_result["j_feature"],
            "threshold": split_result["threshold"],
            "children": {"left": left_child, "right": right_child},
        })
        return node

    def _fit_model(self, X, y):
        """Fit a fresh node model and return ``(training_loss, model)``.

        The loss is evaluated on the output of ``model.predict``.
        """
        model = self.model()
        model.fit(X, y)
        preds = model.predict(X)
        loss = model.loss(y, preds)

        return loss, model

    def _create_node(self, X, y, depth, container):
        """Fit a model on ``X``/``y`` and wrap it in a new, childless node dict."""
        loss, model = self._fit_model(X, y)

        node = {
            'name': 'node',
            'index': container['index_node_global'],
            'loss': loss,
            'model': model,
            'data': (X, y),
            'n_samples': len(X),
            'j_feature': None,
            'threshold': None,
            'children': {
                'left': None,
                'right': None
            },
            'depth': depth
        }

        container["index_node_global"] += 1
        return node

    def _split_data(self, feature, X, y, threshold):
        """Partition rows into ``X[:, feature] <= threshold`` (left) and the rest (right)."""
        goes_left = X[:, feature] <= threshold
        goes_right = ~goes_left
        return X[goes_left], y[goes_left], X[goes_right], y[goes_right]

    def _split_node(self, node):
        """Search every feature / observed value for the best split of ``node``.

        Returns a dict with ``did_split`` (bool), the best weighted ``loss``,
        the partitioned ``data`` as ``[(X_left, y_left), (X_right, y_right)]``,
        ``j_feature``, ``threshold`` and the sample count ``N``. ``did_split``
        stays False when no candidate beats the node's own loss.
        """
        X, y = node["data"]
        depth = node["depth"]
        N, d = X.shape

        best_split = {
            "did_split": False,
            "loss": node["loss"],
            "data": None,
            "j_feature": None,
            "threshold": None,
            "N": N
        }

        if depth >= self.max_depth:
            return best_split

        # A child with zero samples cannot be fitted, whatever min_samples_leaf says.
        min_leaf = max(1, self.min_samples_leaf)

        for j_feature in range(d):
            # Every distinct observed value is a candidate threshold.
            for threshold in np.unique(X[:, j_feature]):
                X_left, y_left, X_right, y_right = self._split_data(j_feature, X, y, threshold)

                if len(X_left) < min_leaf or len(X_right) < min_leaf:
                    continue

                left_loss, _ = self._fit_model(X_left, y_left)
                right_loss, _ = self._fit_model(X_right, y_right)
                loss_split = (len(X_left)*left_loss + len(X_right)*right_loss) / N

                if loss_split < best_split["loss"]:
                    best_split.update({
                        "did_split": True,
                        "loss": loss_split,
                        "data": [(X_left, y_left), (X_right, y_right)],
                        "j_feature": j_feature,
                        "threshold": threshold,
                    })

        return best_split

    def _find_leaf(self, node, x):
        """Walk from ``node`` down to the leaf responsible for sample ``x``."""
        while not (node["children"]["left"] is None and node["children"]["right"] is None):
            # Must mirror _split_data: values equal to the threshold go LEFT.
            side = "left" if x[node["j_feature"]] <= node["threshold"] else "right"
            node = node["children"][side]
        return node

    def _predict_node(self, node, x):
        """Hard prediction for the single sample ``x`` from the subtree at ``node``."""
        leaf = self._find_leaf(node, x)
        return leaf["model"].predict(x.reshape(1, -1))[0]

    def _proba_node(self, node, x):
        """``predict_proba`` output for the single sample ``x`` from the subtree at ``node``."""
        leaf = self._find_leaf(node, x)
        return leaf["model"].predict_proba(x.reshape(1, -1))[0]

    def _print_node(self, node, depth):
        """Print a one-line summary of ``node``."""
        print(f"Depth: {depth}, Node Index: {node['index']}, Loss: {node['loss']}, Samples: {node['n_samples']}")

# Load dataset
dataset = pd.read_csv('/content/drive/MyDrive/CAPSTONE/Colab/Code/Code/cleve.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values  # Assuming the target variable is in the last column

# Splitting the dataset BEFORE any statistics are learned from it, so nothing
# computed from the test rows can influence preprocessing or training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=9)

# Handle missing data: column means come from the training rows only and are
# then reused for the test rows (fitting on the full data would leak test
# information into the imputed values).
imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train[:, 11:13] = imputer.fit_transform(X_train[:, 11:13])
X_test[:, 11:13] = imputer.transform(X_test[:, 11:13])
# Keep the full matrix NaN-free as well, using the same training-set means.
X[:, 11:13] = imputer.transform(X[:, 11:13])

# Feature Scaling (fitted on the training rows, applied to both splits)
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)


# ModelTree expects a model factory, so the class itself is passed, not an instance.
model = LogisticRegr
tree = ModelTree(model=model, max_depth=10)
tree.fit(X_train, y_train)
predictions = tree.predict(X_test)

ImportError: cannot import name 'SimpleImputers' from 'sklearn.impute' (/usr/local/lib/python3.11/dist-packages/sklearn/impute/__init__.py)

In [None]:
from sklearn.utils import resample

class ModelForest:
    """Bagged ensemble of ``ModelTree`` instances for 0/1 targets.

    Every tree is trained on a bootstrap sample (drawn with replacement) of the
    training data. Hard predictions are the rounded mean of the trees' votes;
    probabilities are the mean of the trees' ``predict_proba`` outputs.

    Args:
        model: node-model factory forwarded to every ``ModelTree``.
        n_trees: number of trees to build.
        max_depth: forwarded to every ``ModelTree``.
        min_samples_leaf: forwarded to every ``ModelTree``.
        verbose: print progress here and forward the flag to every tree.
        max_features: ``"sqrt"``, ``"log2"`` or an int. It is validated and
            stored as ``self.n_features`` during ``fit`` but is NOT used to
            subsample features yet: every tree sees all columns.
        random_state: ``None`` (default) draws bootstrap samples from NumPy's
            global RNG; an int or ``np.random.RandomState`` makes the draws
            reproducible.
    """

    def __init__(self, model, n_trees=10, max_depth=5, min_samples_leaf=10, verbose=True, max_features="sqrt", random_state=None):
        self.model = model
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.verbose = verbose
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap sample of ``X``/``y``.

        Raises:
            ValueError: if ``max_features`` is not 'sqrt', 'log2' or an int.
        """
        _, n_features = X.shape

        if self.max_features == "sqrt":
            self.n_features = max(1, int(np.sqrt(n_features)))
        elif self.max_features == "log2":
            # log2(1) == 0, so floor the result at one feature.
            self.n_features = max(1, int(np.log2(n_features)))
        elif isinstance(self.max_features, int):
            self.n_features = self.max_features
        else:
            raise ValueError("max_features must be 'sqrt', 'log2', or an integer.")

        # One generator shared by all trees: each tree gets a different sample
        # while the whole sequence stays reproducible for a given seed.
        if self.random_state is None:
            rng = None
        elif isinstance(self.random_state, np.random.RandomState):
            rng = self.random_state
        else:
            rng = np.random.RandomState(self.random_state)

        self.trees = []
        for i in range(self.n_trees):
            if self.verbose:
                print(f"Building tree {i + 1}/{self.n_trees}")

            X_sample, y_sample = resample(X, y, replace=True, random_state=rng)

            tree = ModelTree(
                model=self.model,
                max_depth=self.max_depth,
                min_samples_leaf=self.min_samples_leaf,
                verbose=self.verbose
            )
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        """Return the majority 0/1 vote of all trees for every row of ``X``.

        ``np.round`` rounds halves to the nearest even value, so an exact tie
        between the trees resolves to class 0.
        """
        assert self.trees, "ModelForest not fitted"
        predictions = np.array([tree.predict(X) for tree in self.trees])
        return np.round(np.mean(predictions, axis=0)).astype(int)  # Majority voting

    def predict_proba(self, X):
        """Return the trees' ``predict_proba`` outputs averaged over the ensemble."""
        assert self.trees, "ModelForest not fitted"
        probas = np.array([tree.predict_proba(X) for tree in self.trees])
        return np.mean(probas, axis=0)  # Average probabilities

    def loss(self, y, y_pred):
        """Log-loss of ``y_pred`` against ``y`` with the label set fixed to {0, 1}."""
        return log_loss(y, y_pred, labels=[0, 1])

In [None]:
# Baseline: one model tree with the default depth, scored on the held-out split.
tree = ModelTree(model=LogisticRegr, min_samples_leaf=10)
tree.fit(X=X_train, y=y_train)
preds = tree.predict(X=X_test)
print(classification_report(y_true=y_test, y_pred=preds))

In [None]:
# The forest's bootstrap sampling draws from NumPy's global RNG; seed it so the
# report below is identical on every run of this cell.
np.random.seed(9)
forest = ModelForest(LogisticRegr, min_samples_leaf=10)
forest.fit(X_train, y_train)
preds = forest.predict(X_test)
print(classification_report(y_test, preds))

In [None]:
# The forest's bootstrap sampling draws from NumPy's global RNG; seed it so the
# report below is identical on every run of this cell.
np.random.seed(9)
forest = ModelForest(LogisticRegr, n_trees=20, min_samples_leaf=5)
forest.fit(X_train, y_train)
preds = forest.predict(X_test)
print(classification_report(y_test, preds))

In [None]:
# The forest's bootstrap sampling draws from NumPy's global RNG; seed it so the
# report below is identical on every run of this cell.
np.random.seed(9)
forest = ModelForest(LogisticRegr, n_trees=20, min_samples_leaf=2)
forest.fit(X_train, y_train)
preds = forest.predict(X_test)
print(classification_report(y_test, preds))