In [10]:
from google.colab import drive
drive.mount('/content/drive/Shared with me/CAPSTONE/Colab/Final/cleve.csv')

ValueError: Mountpoint must not contain a space.

In [None]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    log_loss, mean_squared_error, accuracy_score,
    confusion_matrix, classification_report, roc_auc_score,
    roc_curve, recall_score
)
from sklearn.utils import resample
from sklearn.base import BaseEstimator, ClassifierMixin


class LogisticRegr:
    def __init__(self, log_loss=True):
        self.use_log_loss = log_loss
        self.model = LogisticRegression(penalty="l2", solver='liblinear')
        self.flag = False
        self.flag_y_pred = None

    def fit(self, X, y):
        y_unique = np.unique(y)
        if len(y_unique) == 1:
            self.flag = True
            self.flag_y_pred = y_unique[0]
        else:
            self.model.fit(X, y)

    def predict(self, X):
        if self.flag:
            return np.full(len(X), self.flag_y_pred, dtype=int)
        else:
            return self.model.predict(X)

    def predict_proba(self, X):
        X = np.array(X)
        if X.ndim == 1:
            X = X.reshape(1, -1)

        if self.flag:
            proba = np.zeros((len(X), 2))
            proba[:, int(self.flag_y_pred)] = 1.0
            return proba
        else:
            return self.model.predict_proba(X)

    def loss(self, y, y_pred_proba_or_class):
        if self.use_log_loss:
            return log_loss(y, y_pred_proba_or_class, labels=[0, 1])
        else:
            return mean_squared_error(y, y_pred_proba_or_class)


class ModelTree:
    def __init__(self, model, max_depth=5, min_samples_leaf=10, log_loss=True, verbose=True):
        self.model = model
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.verbose = verbose
        self.log_loss = log_loss
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y)
        return self.tree

    def predict(self, X):
        assert self.tree is not None, "ModelTree not fitted"
        return np.array([self._predict_node(self.tree, x) for x in X])

    def predict_proba(self, X):
        assert self.tree is not None, "ModelTree not fitted"
        probas = np.array([self._proba_node(self.tree, x) for x in X])
        if probas.ndim == 1:
            probas = np.column_stack([1 - probas, probas])
        return probas

    def _build_tree(self, X, y, depth=0, container=None):
        if container is None:
            container = {"index_node_global": 0}

        node = self._create_node(X, y, depth, container)

        if depth >= self.max_depth or len(X) <= self.min_samples_leaf:
            if self.verbose:
                self._print_node(node, depth)
            return node

        split_result = self._split_node(node)

        if not split_result["did_split"]:
            if self.verbose:
                self._print_node(node, depth)
            return node

        node.update({
            "j_feature": split_result["j_feature"],
            "threshold": split_result["threshold"],
            "children": {
                "left": self._build_tree(*split_result["data"][0], depth + 1, container),
                "right": self._build_tree(*split_result["data"][1], depth + 1, container)
            }
        })

        return node

    def _fit_model(self, X, y):
        model = self.model(log_loss=self.log_loss)
        model.fit(X, y)
        preds = model.predict_proba(X) if self.log_loss else model.predict(X)
        loss = model.loss(y, preds)
        return loss, model

    def _create_node(self, X, y, depth, container):
        loss, model = self._fit_model(X, y)

        node = {
            'name': 'node',
            'index': container['index_node_global'],
            'loss': loss,
            'model': model,
            'data': (X, y),
            'n_samples': len(X),
            'j_feature': None,
            'threshold': None,
            'children': {
                'left': None,
                'right': None
            },
            'depth': depth
        }

        container["index_node_global"] += 1
        return node

    def _split_data(self, feature, X, y, threshold):
        left_idx = np.where(X[:, feature] <= threshold)[0]
        right_idx = np.setdiff1d(np.arange(len(X)), left_idx)

        left_X, left_y = X[left_idx], y[left_idx]
        right_X, right_y = X[right_idx], y[right_idx]

        return left_X, left_y, right_X, right_y

    def _split_node(self, node):
        X, y = node["data"]
        depth = node["depth"]
        N, d = X.shape

        best_split = {
            "did_split": False,
            "loss": node["loss"],
            "data": None,
            "j_feature": None,
            "threshold": None,
            "N": N
        }

        if depth >= self.max_depth:
            return best_split

        for j_feature in range(d):
            thresholds = np.unique(X[:, j_feature])

            for threshold in thresholds:
                X_left, y_left, X_right, y_right = self._split_data(j_feature, X, y, threshold)

                if (len(X_left) < self.min_samples_leaf or len(X_right) < self.min_samples_leaf):
                    continue

                left_loss, _ = self._fit_model(X_left, y_left)
                right_loss, _ = self._fit_model(X_right, y_right)
                loss_split = (len(X_left)*left_loss + len(X_right)*right_loss) / N

                if loss_split < best_split["loss"]:
                    best_split.update({
                        "did_split": True,
                        "loss": loss_split,
                        "data": [(X_left, y_left), (X_right, y_right)],
                        "j_feature": j_feature,
                        "threshold": threshold,
                    })

        return best_split

    def _predict_node(self, node, x):
        if node["children"]["left"] is None and node["children"]["right"] is None:
            return node["model"].predict(x.reshape(1, -1))[0]

        feature = node["j_feature"]
        threshold = node["threshold"]

        if x[feature] < threshold:
            return self._predict_node(node["children"]["left"], x)
        else:
            return self._predict_node(node["children"]["right"], x)

    def _proba_node(self, node, x):
        if node["children"]["left"] is None and node["children"]["right"] is None:
            return node["model"].predict_proba(x.reshape(1, -1))[0]

        feature = node["j_feature"]
        threshold = node["threshold"]

        if x[feature] < threshold:
            return self._proba_node(node["children"]["left"], x)
        else:
            return self._proba_node(node["children"]["right"], x)

    def _print_node(self, node, depth):
        print(f"Depth: {depth}, Node Index: {node['index']}, Loss: {node['loss']}, Samples: {node['n_samples']}")


class EnhancedModelForest(BaseEstimator, ClassifierMixin):
    def __init__(self, model, n_trees=10, max_depth=5, min_samples_leaf=10,
                 verbose=True, max_features="sqrt", boosting=False, learning_rate=0.1, log_loss=True):
        self.model = model
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.min_samples_leaf = min_samples_leaf
        self.verbose = verbose
        self.max_features = max_features
        self.boosting = boosting
        self.learning_rate = learning_rate
        self.log_loss = log_loss
        self.trees = []
        self.tree_weights = []

    def fit(self, X, y):
        n_samples, n_features = X.shape

        if self.max_features == "sqrt":
            self.n_features = int(np.sqrt(n_features))
        elif self.max_features == "log2":
            self.n_features = int(np.log2(n_features))
        elif isinstance(self.max_features, int):
            self.n_features = self.max_features
        else:
            raise ValueError("max_features must be 'sqrt', 'log2', or an integer.")

        self.trees = []
        self.tree_weights = []

        if not self.boosting:
            for i in range(self.n_trees):
                if self.verbose:
                    print(f"Building bagged tree {i + 1}/{self.n_trees}")
                X_sample, y_sample = resample(X, y, replace=True)
                tree = ModelTree(
                    model=self.model,
                    max_depth=self.max_depth,
                    min_samples_leaf=self.min_samples_leaf,
                    verbose=self.verbose,
                    log_loss=self.log_loss
                )
                tree.fit(X_sample, y_sample)
                self.trees.append(tree)
                self.tree_weights.append(1.0)
        else:
            sample_weights = np.ones(len(y)) / len(y)
            for i in range(self.n_trees):
                if self.verbose:
                    print(f"Building boosted tree {i + 1}/{self.n_trees}")
                X_sample, y_sample, sample_weights_sampled = self._boost_sample(X, y, sample_weights)
                tree = ModelTree(
                    model=self.model,
                    max_depth=self.max_depth,
                    min_samples_leaf=self.min_samples_leaf,
                    verbose=self.verbose,
                    log_loss=self.log_loss
                )
                tree.fit(X_sample, y_sample)
                preds = tree.predict(X)
                incorrect = (preds != y)
                error = np.sum(sample_weights * incorrect) / np.sum(sample_weights)
                if error > 0.5:
                    if self.verbose:
                        print(f"Discarding tree {i+1} with error {error:.4f}")
                    continue
                elif error == 0:
                    tree_weight = 1.0
                else:
                    tree_weight = self.learning_rate * np.log((1 - error) / error)
                sample_weights *= np.exp(tree_weight * incorrect)
                sample_weights /= np.sum(sample_weights)
                self.trees.append(tree)
                self.tree_weights.append(tree_weight)

    def _boost_sample(self, X, y, sample_weights):
        indices = np.random.choice(len(X), size=len(X), replace=True, p=sample_weights)
        return X[indices], y[indices], sample_weights[indices]

    def predict(self, X):
        if not self.trees:
            raise ValueError("Model not fitted yet.")
        if not self.boosting:
            predictions = np.array([tree.predict(X) for tree in self.trees])
            return np.round(np.mean(predictions, axis=0)).astype(int)
        else:
            predictions = np.array([tree.predict(X) for tree in self.trees])
            weights = np.array(self.tree_weights).reshape(-1, 1)
            weighted_pred = np.sum(predictions * weights, axis=0) / np.sum(weights)
            return np.round(weighted_pred).astype(int)

    def predict_proba(self, X):
        if not self.trees:
            raise ValueError("Model not fitted yet.")
        if not self.boosting:
            probas = np.array([tree.predict_proba(X) for tree in self.trees])
            return np.mean(probas, axis=0)
        else:
            probas = np.array([tree.predict_proba(X) for tree in self.trees])
            weights = np.array(self.tree_weights).reshape(-1, 1, 1)
            weighted_proba = np.sum(probas * weights, axis=0) / np.sum(weights)
            return weighted_proba

    def score(self, X, y):
        y_pred = self.predict(X)
        return accuracy_score(y, y_pred)

    def loss(self, y, y_pred_or_proba):
        if self.log_loss:
            return log_loss(y, y_pred_or_proba, labels=[0, 1])
        else:
            return mean_squared_error(y, y_pred_or_proba)


In [None]:
dataset = pd.read_csv('/content/drive/MyDrive/CAPSTONE/Colab/Final/cleve.csv')
X = dataset.iloc[:, :-1].values
y = dataset.iloc[:, -1].values

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X[:, 11:13] = imputer.fit_transform(X[:, 11:13])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=9)

NameError: name 'pd' is not defined

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import ParameterSampler
from scipy.stats import randint, uniform
import numpy as np

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', EnhancedModelForest(model=LogisticRegr))
])

param_distributions = {
    'model__n_trees': randint(5, 21),
    'model__max_depth': randint(3, 10),
    'model__min_samples_leaf': randint(5, 20),
    'model__boosting': [True, False],
    'model__learning_rate': uniform(0.01, 0.5),
    'model__log_loss': [True, False]
}

n_iter = 50
param_list = list(ParameterSampler(param_distributions, n_iter=n_iter, random_state=42))

best_score = -np.inf
best_params = None
best_model = None
report = None

for params in param_list:
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    score = accuracy_score(y_test, preds)

    if score > best_score:
        best_score = score
        best_params = params
        best_model = pipeline
        report = classification_report(y_test, preds)

print("\nBest parameters found:")
print(best_params)

print("\nTest accuracy:")
print(best_score)

print(report)


Building boosted tree 1/17
Depth: 3, Node Index: 3, Loss: 0.1638271133139882, Samples: 20
Depth: 3, Node Index: 4, Loss: 2.2204460492503136e-16, Samples: 12
Depth: 4, Node Index: 7, Loss: 0.07120887328449989, Samples: 13
Depth: 4, Node Index: 8, Loss: 2.2204460492503136e-16, Samples: 40
Depth: 4, Node Index: 10, Loss: 0.12219408203463303, Samples: 16
Depth: 4, Node Index: 11, Loss: 0.05972722924935247, Samples: 13
Depth: 4, Node Index: 15, Loss: 0.09398392445150706, Samples: 17
Depth: 4, Node Index: 16, Loss: 0.06958602709076306, Samples: 18
Depth: 5, Node Index: 19, Loss: 2.2204460492503136e-16, Samples: 12
Depth: 5, Node Index: 20, Loss: 0.09487955710031278, Samples: 21
Depth: 4, Node Index: 21, Loss: 2.2204460492503136e-16, Samples: 13
Depth: 3, Node Index: 23, Loss: 0.10319895565659981, Samples: 12
Depth: 3, Node Index: 24, Loss: 0.07764695512290512, Samples: 20
Building boosted tree 2/17
Depth: 4, Node Index: 4, Loss: 2.2204460492503136e-16, Samples: 25
Depth: 4, Node Index: 5, Lo

KeyboardInterrupt: 

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import ParameterSampler
from scipy.stats import randint, uniform
import numpy as np

# Assume EnhancedModelForest and LogisticRegr are defined
# pipeline: scaler -> model
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', EnhancedModelForest(model=LogisticRegr))
])

# Define search space
param_distributions = {
    'model__n_trees': randint(5, 21),
    'model__max_depth': randint(3, 10),
    'model__min_samples_leaf': randint(5, 20),
    'model__boosting': [True, False],
    'model__learning_rate': uniform(0.01, 0.5),
    'model__log_loss': [True, False]
}

# Sample parameter combinations
n_iter = 50
param_list = list(ParameterSampler(param_distributions, n_iter=n_iter, random_state=42))

best_score = -np.inf
best_params = None
best_model = None
report = None

for params in param_list:
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    score = accuracy_score(y_test, preds)

    if score > best_score:
        best_score = score
        best_params = params
        best_model = pipeline
        report = classification_report(y_test, preds)

print("\nBest parameters found:")
print(best_params)

print(f"\nTest accuracy: {best_score}")

print(report)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

# Define both parameter sets
param_sets = [
    {'model__boosting': False, 'model__learning_rate': 0.026525366450274195,
     'model__log_loss': False, 'model__max_depth': 3, 'model__min_samples_leaf': 5, 'model__n_trees': 7},

    {'model__boosting': False, 'model__learning_rate': 0.01703991135754223,
     'model__log_loss': True, 'model__max_depth': 3, 'model__min_samples_leaf': 12, 'model__n_trees': 7}
]

for i, params in enumerate(param_sets, start=1):
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', EnhancedModelForest(model=LogisticRegr))
    ])

    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    acc = accuracy_score(y_test, preds)
    report = classification_report(y_test, preds)

    print(f"\n--- Results for Param Set {i} ---")
    print(f"Parameters: {params}")
    print(f"Accuracy: {acc:.4f}")
    print("Classification Report:")
    print(report)


In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.utils.fixes import loguniform
from sklearn.model_selection import ParameterSampler
from scipy.stats import randint, uniform
import numpy as np

pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', EnhancedModelForest(model=LogisticRegr))
])

param_distributions = {
    'model__n_trees': randint(6, 9),  # zoom around 7
    'model__max_depth': [3],          # fixed at 3 (both sets agreed)
    'model__min_samples_leaf': randint(4, 14),  # range between 5–12 with buffer
    'model__boosting': [False],       # only False performed well
    'model__learning_rate': uniform(0.015, 0.015),  # zoom in around 0.017–0.0265
    'model__log_loss': [True, False]  # both worked, test again
}


# Sample parameter combinations
n_iter = 50
param_list = list(ParameterSampler(param_distributions, n_iter=n_iter, random_state=42))

best_score = -np.inf
best_params = None
best_model = None
report = None

for params in param_list:
    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    score = accuracy_score(y_test, preds)

    if score > best_score:
        best_score = score
        best_params = params
        best_model = pipeline
        report = classification_report(y_test, preds)

print("\nBest parameters found:")
print(best_params)

print(f"\nTest accuracy: {best_score}")

print(report)
