In [2]:
pip install scikit-learn


In [41]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Nodo del Árbol de Decisión
class DecisionNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

# Implementación del Árbol de Decisión
class DecisionTreeClassifierCustom:
    def __init__(self, max_depth=5):
        self.max_depth = max_depth
        self.root = None

    def fit(self, X, y):
        self.root = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        num_samples, num_features = X.shape
        unique_labels = np.unique(y)

        # Criterio de parada
        if len(unique_labels) == 1 or depth >= self.max_depth or num_samples <= 1:
            leaf_value = self._most_common_label(y)
            return DecisionNode(value=leaf_value)

        # Encontrar la mejor división
        best_feature, best_threshold = self._best_split(X, y, num_features)

        if best_feature is None:
            leaf_value = self._most_common_label(y)
            return DecisionNode(value=leaf_value)

        # Dividir los datos
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return DecisionNode(feature_index=best_feature, threshold=best_threshold, left=left_subtree, right=right_subtree)

    def _best_split(self, X, y, num_features):
        best_gain = -1
        split_index, split_threshold = None, None

        for feature_index in range(num_features):
            feature_values = X[:, feature_index]
            possible_thresholds = np.unique(feature_values)

            for threshold in possible_thresholds:
                gain = self._information_gain(y, feature_values, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_index = feature_index
                    split_threshold = threshold

        return split_index, split_threshold

    def _information_gain(self, y, feature_values, threshold):
        # Calcular la ganancia de información
        parent_entropy = self._entropy(y)

        left_indices = feature_values <= threshold
        right_indices = feature_values > threshold

        if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
            return 0

        n = len(y)
        n_left, n_right = len(y[left_indices]), len(y[right_indices])

        entropy_left = self._entropy(y[left_indices])
        entropy_right = self._entropy(y[right_indices])

        child_entropy = (n_left / n) * entropy_left + (n_right / n) * entropy_right

        information_gain = parent_entropy - child_entropy
        return information_gain

    def _entropy(self, y):
        hist = np.bincount(y)
        ps = hist / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _most_common_label(self, y):
        return np.bincount(y).argmax()

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value

        if x[node.feature_index] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

# Cargar el dataset
df = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

# Codificación de variables categóricas
labelencoder = LabelEncoder()
categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
for col in categorical_columns:
    df[col] = labelencoder.fit_transform(df[col])

# Separar características (X) y variable objetivo (y)
X = df.drop('NObeyesdad', axis=1).values
y = labelencoder.fit_transform(df['NObeyesdad'])  # Codificar la variable objetivo

# Dividir el conjunto de datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el modelo de Árbol de Decisión personalizado
clf_custom = DecisionTreeClassifierCustom(max_depth=9)
clf_custom.fit(X_train, y_train)

# Realizar predicciones
y_pred = clf_custom.predict(X_test)

# Evaluación del modelo
accuracy = accuracy_score(y_test, y_pred)
print(f'Precisión del modelo: {accuracy:.2f}')
print('\nMatriz de Confusión:')
print(confusion_matrix(y_test, y_pred))
print('\nReporte de Clasificación:')
print(classification_report(y_test, y_pred))


# validacion cruzada

In [46]:
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
from sklearn.model_selection import KFold

# Nodo del Árbol de Decisión
class DecisionNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

# Implementación del Árbol de Decisión
class DecisionTreeClassifierCustom:
    def __init__(self, max_depth=5, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.root = None

    def fit(self, X, y):
        self.root = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        num_samples, num_features = X.shape
        unique_labels = np.unique(y)

        # Criterio de parada
        if len(unique_labels) == 1 or depth >= self.max_depth or num_samples < self.min_samples_split or num_samples <= self.min_samples_leaf:
            leaf_value = self._most_common_label(y)
            return DecisionNode(value=leaf_value)

        # Encontrar la mejor división
        best_feature, best_threshold = self._best_split(X, y, num_features)

        if best_feature is None:
            leaf_value = self._most_common_label(y)
            return DecisionNode(value=leaf_value)

        # Dividir los datos
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        if len(left_indices) < self.min_samples_leaf or len(right_indices) < self.min_samples_leaf:
            leaf_value = self._most_common_label(y)
            return DecisionNode(value=leaf_value)

        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return DecisionNode(feature_index=best_feature, threshold=best_threshold, left=left_subtree, right=right_subtree)

    def _best_split(self, X, y, num_features):
        best_gain = -1
        split_index, split_threshold = None, None

        for feature_index in range(num_features):
            feature_values = X[:, feature_index]
            possible_thresholds = np.unique(feature_values)

            for threshold in possible_thresholds:
                gain = self._information_gain(y, feature_values, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_index = feature_index
                    split_threshold = threshold

        return split_index, split_threshold

    def _information_gain(self, y, feature_values, threshold):
        # Calcular la ganancia de información
        parent_entropy = self._entropy(y)

        left_indices = feature_values <= threshold
        right_indices = feature_values > threshold

        if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
            return 0

        n = len(y)
        n_left, n_right = len(y[left_indices]), len(y[right_indices])

        entropy_left = self._entropy(y[left_indices])
        entropy_right = self._entropy(y[right_indices])

        child_entropy = (n_left / n) * entropy_left + (n_right / n) * entropy_right

        information_gain = parent_entropy - child_entropy
        return information_gain

    def _entropy(self, y):
        hist = np.bincount(y)
        ps = hist / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _most_common_label(self, y):
        return np.bincount(y).argmax()

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value

        if x[node.feature_index] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

# Cargar el dataset
df = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

# Codificación de variables categóricas
labelencoder = LabelEncoder()
categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
for col in categorical_columns:
    df[col] = labelencoder.fit_transform(df[col])

# Separar características (X) y variable objetivo (y)
X = df.drop('NObeyesdad', axis=1).values
y = labelencoder.fit_transform(df['NObeyesdad'])  # Codificar la variable objetivo

# Validación cruzada para evaluar el modelo
kf = KFold(n_splits=5, shuffle=True, random_state=42)
accuracies = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Entrenar el modelo de Árbol de Decisión personalizado
    clf_custom = DecisionTreeClassifierCustom(max_depth=7, min_samples_split=5, min_samples_leaf=5)
    clf_custom.fit(X_train, y_train)

    # Realizar predicciones
    y_pred = clf_custom.predict(X_test)

    # Evaluación del modelo
    accuracy = accuracy_score(y_test, y_pred)
    accuracies.append(accuracy)

    # Imprimir la matriz de confusión y el reporte de clasificación para cada partición
    print(f"Matriz de Confusión para la partición actual:\n{confusion_matrix(y_test, y_pred)}")
    print(f"Reporte de Clasificación para la partición actual:\n{classification_report(y_test, y_pred)}")

# Imprimir los resultados de validación cruzada
print(f'Precisión promedio del modelo con validación cruzada: {np.mean(accuracies):.2f}')
print(f'Desviación estándar de la precisión: {np.std(accuracies):.2f}')


Matriz de Confusión para la partición actual:
[[54  2  0  0  0  0  0]
 [ 5 46  0  0  0 10  1]
 [ 0  0 76  2  0  0  0]
 [ 0  0  2 56  0  0  0]
 [ 0  0  0  0 63  0  0]
 [ 0  3  0  0  0 43 10]
 [ 0  0  1  0  0  2 47]]
Reporte de Clasificación para la partición actual:
              precision    recall  f1-score   support

           0       0.92      0.96      0.94        56
           1       0.90      0.74      0.81        62
           2       0.96      0.97      0.97        78
           3       0.97      0.97      0.97        58
           4       1.00      1.00      1.00        63
           5       0.78      0.77      0.77        56
           6       0.81      0.94      0.87        50

    accuracy                           0.91       423
   macro avg       0.91      0.91      0.90       423
weighted avg       0.91      0.91      0.91       423

Matriz de Confusión para la partición actual:
[[49  1  0  0  0  0  0]
 [ 3 51  0  0  0  6  0]
 [ 0  0 54  1  0  0  4]
 [ 0  0  4 57  0  0

# random forest

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Clase para el Árbol de Decisión
class DecisionTree:
    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        dataset = np.c_[X, y]
        self.tree = self._build_tree(dataset)

    def _build_tree(self, dataset, depth=0):
        X, y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = X.shape

        if (depth >= self.max_depth or num_samples < self.min_samples_split or len(set(y)) == 1):
            leaf_value = self._most_common_label(y)
            return leaf_value

        # Obtener mejor división
        best_feature, best_threshold = self._best_split(dataset, num_features)

        if best_feature is None:
            leaf_value = self._most_common_label(y)
            return leaf_value

        # Crear nodos hijos
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left_child = self._build_tree(dataset[left_indices], depth + 1)
        right_child = self._build_tree(dataset[right_indices], depth + 1)
        return {"feature": best_feature, "threshold": best_threshold, "left": left_child, "right": right_child}

    def _best_split(self, dataset, num_features):
        best_gain = -float("inf")
        best_feature, best_threshold = None, None
        for feature in range(num_features):
            thresholds = np.unique(dataset[:, feature])
            for threshold in thresholds:
                gain = self._information_gain(dataset, feature, threshold)

                if gain > best_gain:
                    best_gain, best_feature, best_threshold = gain, feature, threshold
        return best_feature, best_threshold

    def _information_gain(self, dataset, feature, threshold):
        parent_entropy = self._entropy(dataset[:, -1])

        left_split = dataset[dataset[:, feature] <= threshold]
        right_split = dataset[dataset[:, feature] > threshold]

        if len(left_split) == 0 or len(right_split) == 0:
            return 0

        n = len(dataset)
        n_left, n_right = len(left_split), len(right_split)
        child_entropy = (n_left / n) * self._entropy(left_split[:, -1]) + (n_right / n) * self._entropy(right_split[:, -1])

        info_gain = parent_entropy - child_entropy
        return info_gain

    def _entropy(self, y):
        hist = np.bincount(y.astype(int))
        ps = hist / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _most_common_label(self, y):
        counter = Counter(y)
        return counter.most_common(1)[0][0]

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        if not isinstance(node, dict):
            return node

        if x[node["feature"]] <= node["threshold"]:
            return self._traverse_tree(x, node["left"])
        else:
            return self._traverse_tree(x, node["right"])

# Clase para el Random Forest
class RandomForest:
    def __init__(self, n_estimators=10, max_depth=10, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        n_samples = X.shape[0]

        for _ in range(self.n_estimators):
            # Bootstrap sampling
            indices = np.random.choice(n_samples, n_samples, replace=True)
            X_sample = X[indices]
            y_sample = y[indices]

            # Crear un árbol y entrenarlo
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        # Votación mayoritaria
        y_pred = [Counter(tree_preds[:, i]).most_common(1)[0][0] for i in range(X.shape[0])]
        return y_pred

# Cargar los datos
df = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

# Preprocesamiento: codificación de variables categóricas
label_encoders = {}
categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS', ]

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Separación de características y etiquetas
X = df.drop('NObeyesdad', axis=1).values  # Características
y = df['NObeyesdad'].factorize()[0]  # Etiqueta como numérico

# División del dataset en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Crear el modelo Random Forest desde cero
rf_custom = RandomForest(n_estimators=10, max_depth=10, min_samples_split=2)

# Entrenar el modelo
rf_custom.fit(X_train, y_train)

# Realizar predicciones
y_pred = rf_custom.predict(X_test)

# Evaluación del modelo
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[83  3  0  0  7  0  0]
 [ 1 86  1  0  0  0  0]
 [ 0  4 74  1  0  0  0]
 [ 0  0  0 99  0  3  0]
 [ 4  0  0  0 82  0  0]
 [ 0  0  0  6  0 82  0]
 [ 0  0  0  1  0  0 97]]
              precision    recall  f1-score   support

           0       0.94      0.89      0.92        93
           1       0.92      0.98      0.95        88
           2       0.99      0.94      0.96        79
           3       0.93      0.97      0.95       102
           4       0.92      0.95      0.94        86
           5       0.96      0.93      0.95        88
           6       1.00      0.99      0.99        98

    accuracy                           0.95       634
   macro avg       0.95      0.95      0.95       634
weighted avg       0.95      0.95      0.95       634



# sin train mark

In [None]:
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix

# Clase para el Árbol de Decisión
class DecisionTree:
    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        dataset = np.c_[X, y]
        self.tree = self._build_tree(dataset)

    def _build_tree(self, dataset, depth=0):
        X, y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = X.shape

        if (depth >= self.max_depth or num_samples < self.min_samples_split or len(set(y)) == 1):
            leaf_value = self._most_common_label(y)
            return leaf_value

        # Obtener mejor división
        best_feature, best_threshold = self._best_split(dataset, num_features)

        if best_feature is None:
            leaf_value = self._most_common_label(y)
            return leaf_value

        # Crear nodos hijos
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left_child = self._build_tree(dataset[left_indices], depth + 1)
        right_child = self._build_tree(dataset[right_indices], depth + 1)
        return {"feature": best_feature, "threshold": best_threshold, "left": left_child, "right": right_child}

    def _best_split(self, dataset, num_features):
        best_gain = -float("inf")
        best_feature, best_threshold = None, None
        for feature in range(num_features):
            thresholds = np.unique(dataset[:, feature])
            for threshold in thresholds:
                gain = self._information_gain(dataset, feature, threshold)

                if gain > best_gain:
                    best_gain, best_feature, best_threshold = gain, feature, threshold
        return best_feature, best_threshold

    def _information_gain(self, dataset, feature, threshold):
        parent_entropy = self._entropy(dataset[:, -1])

        left_split = dataset[dataset[:, feature] <= threshold]
        right_split = dataset[dataset[:, feature] > threshold]

        if len(left_split) == 0 or len(right_split) == 0:
            return 0

        n = len(dataset)
        n_left, n_right = len(left_split), len(right_split)
        child_entropy = (n_left / n) * self._entropy(left_split[:, -1]) + (n_right / n) * self._entropy(right_split[:, -1])

        info_gain = parent_entropy - child_entropy
        return info_gain

    def _entropy(self, y):
        hist = np.bincount(y.astype(int))
        ps = hist / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _most_common_label(self, y):
        counter = Counter(y)
        return counter.most_common(1)[0][0]

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        if not isinstance(node, dict):
            return node

        if x[node["feature"]] <= node["threshold"]:
            return self._traverse_tree(x, node["left"])
        else:
            return self._traverse_tree(x, node["right"])

# Clase para el Random Forest
class RandomForest:
    def __init__(self, n_estimators=10, max_depth=10, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        n_samples = X.shape[0]

        for _ in range(self.n_estimators):
            # Bootstrap sampling
            indices = np.random.choice(n_samples, n_samples, replace=True)
            X_sample = X[indices]
            y_sample = y[indices]

            # Crear un árbol y entrenarlo
            tree = DecisionTree(max_depth=self.max_depth, min_samples_split=self.min_samples_split)
            tree.fit(X_sample, y_sample)
            self.trees.append(tree)

    def predict(self, X):
        tree_preds = np.array([tree.predict(X) for tree in self.trees])
        # Votación mayoritaria
        y_pred = [Counter(tree_preds[:, i]).most_common(1)[0][0] for i in range(X.shape[0])]
        return y_pred

# Cargar los datos
df = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

# Preprocesamiento: codificación de variables categóricas
label_encoders = {}
categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Separación de características y etiquetas
X = df.drop('NObeyesdad', axis=1).values  # Características
y = df['NObeyesdad'].factorize()[0]  # Etiqueta como numérico

# Implementación manual de la división del dataset en entrenamiento y prueba
np.random.seed(42)
shuffled_indices = np.random.permutation(len(X))
test_set_size = int(len(X) * 0.3)
test_indices = shuffled_indices[:test_set_size]
train_indices = shuffled_indices[test_set_size:]

X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

# Crear el modelo Random Forest desde cero
rf_custom = RandomForest(n_estimators=10, max_depth=10, min_samples_split=5)

# Entrenar el modelo
rf_custom.fit(X_train, y_train)

# Realizar predicciones
y_pred = rf_custom.predict(X_test)

# Evaluación del modelo
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))


[[81  6  0  0  6  0  0]
 [ 3 84  1  0  0  0  0]
 [ 0  3 76  0  0  0  0]
 [ 0  2  0 97  0  3  0]
 [ 6  0  0  0 80  0  0]
 [ 0  0  0  2  0 85  0]
 [ 0  0  0  1  0  0 97]]
              precision    recall  f1-score   support

           0       0.90      0.87      0.89        93
           1       0.88      0.95      0.92        88
           2       0.99      0.96      0.97        79
           3       0.97      0.95      0.96       102
           4       0.93      0.93      0.93        86
           5       0.97      0.98      0.97        87
           6       1.00      0.99      0.99        98

    accuracy                           0.95       633
   macro avg       0.95      0.95      0.95       633
weighted avg       0.95      0.95      0.95       633



# árbol de decicion todo implementado

In [36]:
import numpy as np
import pandas as pd
from collections import Counter

# Clase para el Árbol de Decisión
class DecisionTree:
    def __init__(self, max_depth=10, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        dataset = np.c_[X, y]
        self.tree = self._build_tree(dataset)

    def _build_tree(self, dataset, depth=0):
        X, y = dataset[:, :-1], dataset[:, -1]
        num_samples, num_features = X.shape

        if (depth >= self.max_depth or num_samples < self.min_samples_split or len(set(y)) == 1):
            leaf_value = self._most_common_label(y)
            return leaf_value

        # Obtener mejor división
        best_feature, best_threshold = self._best_split(dataset, num_features)

        if best_feature is None:
            leaf_value = self._most_common_label(y)
            return leaf_value

        # Crear nodos hijos
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        left_child = self._build_tree(dataset[left_indices], depth + 1)
        right_child = self._build_tree(dataset[right_indices], depth + 1)
        return {"feature": best_feature, "threshold": best_threshold, "left": left_child, "right": right_child}

    def _best_split(self, dataset, num_features):
        best_gain = -float("inf")
        best_feature, best_threshold = None, None
        for feature in range(num_features):
            thresholds = np.unique(dataset[:, feature])
            for threshold in thresholds:
                gain = self._information_gain(dataset, feature, threshold)

                if gain > best_gain:
                    best_gain, best_feature, best_threshold = gain, feature, threshold
        return best_feature, best_threshold

    def _information_gain(self, dataset, feature, threshold):
        parent_entropy = self._entropy(dataset[:, -1])

        left_split = dataset[dataset[:, feature] <= threshold]
        right_split = dataset[dataset[:, feature] > threshold]

        if len(left_split) == 0 or len(right_split) == 0:
            return 0

        n = len(dataset)
        n_left, n_right = len(left_split), len(right_split)
        child_entropy = (n_left / n) * self._entropy(left_split[:, -1]) + (n_right / n) * self._entropy(right_split[:, -1])

        info_gain = parent_entropy - child_entropy
        return info_gain

    def _entropy(self, y):
        hist = np.bincount(y.astype(int))
        ps = hist / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _most_common_label(self, y):
        counter = Counter(y)
        return counter.most_common(1)[0][0]

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.tree) for x in X])

    def _traverse_tree(self, x, node):
        if not isinstance(node, dict):
            return node

        if x[node["feature"]] <= node["threshold"]:
            return self._traverse_tree(x, node["left"])
        else:
            return self._traverse_tree(x, node["right"])

# Implementación de codificación manual de variables categóricas
def manual_label_encoder(column):
    unique_values = list(set(column))
    value_to_int = {val: idx for idx, val in enumerate(unique_values)}
    return [value_to_int[val] for val in column]

# Implementación manual de la matriz de confusión
def confusion_matrix_manual(y_true, y_pred):
    unique_classes = np.unique(y_true)
    matrix = np.zeros((len(unique_classes), len(unique_classes)), dtype=int)

    for true, pred in zip(y_true, y_pred):
        matrix[int(true)][int(pred)] += 1

    return matrix

# Implementación manual de precision, recall, f1-score
def precision_recall_f1(cm):
    precisions = []
    recalls = []
    f1_scores = []

    for i in range(len(cm)):
        tp = cm[i, i]
        fp = sum(cm[:, i]) - tp
        fn = sum(cm[i, :]) - tp

        # Precision, Recall, F1-Score
        precision = tp / (tp + fp) if (tp + fp) > 0 else 0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    return precisions, recalls, f1_scores

# Cargar los datos
df = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

# Preprocesamiento: codificación de variables categóricas
categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']

for col in categorical_columns:
    df[col] = manual_label_encoder(df[col])

# Codificación de la columna objetivo 'NObeyesdad'
df['NObeyesdad'] = manual_label_encoder(df['NObeyesdad'])

# Manejo de valores faltantes (solo en columnas numéricas)
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
df[numeric_columns] = df[numeric_columns].fillna(df[numeric_columns].median())

# Detectar y manejar valores anómalos usando el rango intercuartílico (IQR)
numeric_columns = df.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_columns:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df[col] = np.where(df[col] < lower_bound, lower_bound, df[col])
    df[col] = np.where(df[col] > upper_bound, upper_bound, df[col])

# Normalización de los datos
for col in numeric_columns:
    df[col] = (df[col] - df[col].min()) / (df[col].max() - df[col].min())

# Separación de características y etiquetas
X = df.drop('NObeyesdad', axis=1).values  # Características
y = df['NObeyesdad'].factorize()[0]  # Etiqueta como numérico

# Implementación manual de la división del dataset en entrenamiento y prueba
np.random.seed(42)
shuffled_indices = np.random.permutation(len(X))
test_set_size = int(len(X) * 0.3)
test_indices = shuffled_indices[:test_set_size]
train_indices = shuffled_indices[test_set_size:]

X_train, X_test = X[train_indices], X[test_indices]
y_train, y_test = y[train_indices], y[test_indices]

# Entrenar y evaluar un solo Árbol de Decisión
print("\nEvaluación con Árbol de Decisión único:")
dt_custom = DecisionTree(max_depth=10, min_samples_split=5)
dt_custom.fit(X_train, y_train)
y_pred_dt = dt_custom.predict(X_test)

# Evaluación del Árbol de Decisión
cm_dt = confusion_matrix_manual(y_test, y_pred_dt)
precisions_dt, recalls_dt, f1_scores_dt = precision_recall_f1(cm_dt)

print("Matriz de Confusión (Árbol de Decisión):")
print(cm_dt)

f1_score_avrg = 0 
for i, (p, r, f1) in enumerate(zip(precisions_dt, recalls_dt, f1_scores_dt)):
    f1_score_avrg += f1
    print(f"Clase {i} - Precision: {p:.2f}, Recall: {r:.2f}, F1-Score: {f1:.2f}")

f1_score_avrg /= len(precisions_dt)
print(f"F1-Score Promedio: {f1_score_avrg:.2f}")



Evaluación con Árbol de Decisión único:
Matriz de Confusión (Árbol de Decisión):
[[74 12  0  0  7  0  0]
 [ 2 85  1  0  0  0  0]
 [ 0  4 74  1  0  0  0]
 [ 0  0  1 98  0  3  0]
 [ 2  0  0  0 84  0  0]
 [ 0  0  0  5  0 82  0]
 [ 0  0  0  1  0  0 97]]
Clase 0 - Precision: 0.95, Recall: 0.80, F1-Score: 0.87
Clase 1 - Precision: 0.84, Recall: 0.97, F1-Score: 0.90
Clase 2 - Precision: 0.97, Recall: 0.94, F1-Score: 0.95
Clase 3 - Precision: 0.93, Recall: 0.96, F1-Score: 0.95
Clase 4 - Precision: 0.92, Recall: 0.98, F1-Score: 0.95
Clase 5 - Precision: 0.96, Recall: 0.94, F1-Score: 0.95
Clase 6 - Precision: 1.00, Recall: 0.99, F1-Score: 0.99
F1-Score Promedio: 0.94


# árbol de decision pero con mas parametros

In [51]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np

# Nodo del Árbol de Decisión
class DecisionNode:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

# Implementación del Árbol de Decisión
class DecisionTreeClassifierCustom:
    def __init__(self, max_depth=5, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        self.root = None

    def fit(self, X, y):
        self.root = self._build_tree(X, y, depth=0)

    def _build_tree(self, X, y, depth):
        num_samples, num_features = X.shape
        unique_labels = np.unique(y)

        # Criterio de parada
        if len(unique_labels) == 1 or depth >= self.max_depth or num_samples < self.min_samples_split or num_samples <= self.min_samples_leaf:
            leaf_value = self._most_common_label(y)
            return DecisionNode(value=leaf_value)

        # Encontrar la mejor división
        best_feature, best_threshold = self._best_split(X, y, num_features)

        if best_feature is None:
            leaf_value = self._most_common_label(y)
            return DecisionNode(value=leaf_value)

        # Dividir los datos
        left_indices = X[:, best_feature] <= best_threshold
        right_indices = X[:, best_feature] > best_threshold

        if len(left_indices) < self.min_samples_leaf or len(right_indices) < self.min_samples_leaf:
            leaf_value = self._most_common_label(y)
            return DecisionNode(value=leaf_value)

        left_subtree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_subtree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return DecisionNode(feature_index=best_feature, threshold=best_threshold, left=left_subtree, right=right_subtree)

    def _best_split(self, X, y, num_features):
        best_gain = -1
        split_index, split_threshold = None, None

        for feature_index in range(num_features):
            feature_values = X[:, feature_index]
            possible_thresholds = np.unique(feature_values)

            for threshold in possible_thresholds:
                gain = self._information_gain(y, feature_values, threshold)

                if gain > best_gain:
                    best_gain = gain
                    split_index = feature_index
                    split_threshold = threshold

        return split_index, split_threshold

    def _information_gain(self, y, feature_values, threshold):
        # Calcular la ganancia de información
        parent_entropy = self._entropy(y)

        left_indices = feature_values <= threshold
        right_indices = feature_values > threshold

        if len(y[left_indices]) == 0 or len(y[right_indices]) == 0:
            return 0

        n = len(y)
        n_left, n_right = len(y[left_indices]), len(y[right_indices])

        entropy_left = self._entropy(y[left_indices])
        entropy_right = self._entropy(y[right_indices])

        child_entropy = (n_left / n) * entropy_left + (n_right / n) * entropy_right

        information_gain = parent_entropy - child_entropy
        return information_gain

    def _entropy(self, y):
        hist = np.bincount(y)
        ps = hist / len(y)
        return -np.sum([p * np.log2(p) for p in ps if p > 0])

    def _most_common_label(self, y):
        return np.bincount(y).argmax()

    def predict(self, X):
        return np.array([self._traverse_tree(x, self.root) for x in X])

    def _traverse_tree(self, x, node):
        if node.value is not None:
            return node.value

        if x[node.feature_index] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

# Cargar el dataset
df = pd.read_csv('ObesityDataSet_raw_and_data_sinthetic.csv')

# Codificación de variables categóricas
labelencoder = LabelEncoder()
categorical_columns = ['Gender', 'family_history_with_overweight', 'FAVC', 'CAEC', 'SMOKE', 'SCC', 'CALC', 'MTRANS']
for col in categorical_columns:
    df[col] = labelencoder.fit_transform(df[col])

# Separar características (X) y variable objetivo (y)
X = df.drop('NObeyesdad', axis=1).values
y = labelencoder.fit_transform(df['NObeyesdad'])  # Codificar la variable objetivo

# Dividir el conjunto de datos en entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Entrenar el modelo de Árbol de Decisión personalizado
clf_custom = DecisionTreeClassifierCustom(max_depth=9, min_samples_split=6, min_samples_leaf=4)
clf_custom.fit(X_train, y_train)

# Realizar predicciones
y_pred = clf_custom.predict(X_test)

# Evaluación del modelo
accuracy = accuracy_score(y_test, y_pred)
print(f'Precisión del modelo: {accuracy:.2f}')
print('\nMatriz de Confusión:')
print(confusion_matrix(y_test, y_pred))
print('\nReporte de Clasificación:')
print(classification_report(y_test, y_pred))


Precisión del modelo: 0.96

Matriz de Confusión:
[[53  3  0  0  0  0  0]
 [ 5 54  0  0  0  3  0]
 [ 0  0 76  2  0  0  0]
 [ 0  0  2 56  0  0  0]
 [ 0  0  0  0 63  0  0]
 [ 0  0  0  0  0 55  1]
 [ 0  0  1  0  0  2 47]]

Reporte de Clasificación:
              precision    recall  f1-score   support

           0       0.91      0.95      0.93        56
           1       0.95      0.87      0.91        62
           2       0.96      0.97      0.97        78
           3       0.97      0.97      0.97        58
           4       1.00      1.00      1.00        63
           5       0.92      0.98      0.95        56
           6       0.98      0.94      0.96        50

    accuracy                           0.96       423
   macro avg       0.95      0.95      0.95       423
weighted avg       0.96      0.96      0.95       423

