In [None]:
import numpy as np
import pandas as pd

class DecisionTreeRootNodeDetector:
    def __init__(self):
        pass

    def entropy(self, labels):
        _, counts = np.unique(labels, return_counts=True)
        probabilities = counts / len(labels)
        return -np.sum(probabilities * np.log2(probabilities))

    def information_gain(self, parent_labels, splits_labels):
        parent_entropy = self.entropy(parent_labels)
        splits_entropy = 0
        total_samples = sum(len(split) for split in splits_labels)
        for split_labels in splits_labels:
            split_weight = len(split_labels) / total_samples
            splits_entropy += split_weight * self.entropy(split_labels)
        return parent_entropy - splits_entropy

    def find_root_node(self, features, labels):
        best_info_gain = -1
        best_feature_index = None

        for feature_index in range(features.shape[1]):
            feature_values = features[:, feature_index]
            unique_values = np.unique(feature_values)
            splits_labels = []
            for value in unique_values:
                split_indices = np.where(feature_values == value)[0]
                split_labels = labels[split_indices]
                splits_labels.append(split_labels)

            info_gain = self.information_gain(labels, splits_labels)
            if info_gain > best_info_gain:
                best_info_gain = info_gain
                best_feature_index = feature_index

        return best_feature_index

# Load dataset
dataset = pd.read_csv("/content/new_dataset.csv")  # Replace 'your_dataset.csv' with the path to your CSV file

# Convert non-numeric columns to strings
for col in dataset.columns:
    if not pd.api.types.is_numeric_dtype(dataset[col]):
        dataset[col] = dataset[col].astype(str)

# Extract features and labels
features = dataset.drop(columns=['indicator']).values
labels = dataset['indicator'].values

# Initialize and use the DecisionTreeRootNodeDetector
detector = DecisionTreeRootNodeDetector()
root_node_index = detector.find_root_node(features, labels)
print("Root node feature index:", root_node_index)


Root node feature index: 0


In [None]:
import numpy as np
import pandas as pd

class DecisionTreeRootNodeDetector:
    def __init__(self):
        pass

    def bin_continuous_feature(self, feature_values, binning_type='equal_width', num_bins=10):
        binned_values = np.zeros_like(feature_values, dtype=int)  # Initialize array for binned values

        for i, column in enumerate(feature_values.T):  # Iterate over each feature column
            numeric_values = column.astype(float)  # Convert to float to handle non-numeric values

            if binning_type == 'equal_width':
                min_value = np.min(numeric_values)
                max_value = np.max(numeric_values)
                bins = np.linspace(min_value, max_value, num_bins + 1)
            elif binning_type == 'frequency':
                bins = np.quantile(numeric_values, np.linspace(0, 1, num_bins + 1))
            else:
                raise ValueError("Invalid binning type. Choose 'equal_width' or 'frequency'.")

            binned_values[:, i] = np.digitize(numeric_values, bins) - 1  # Subtract 1 to start bin indexing from 0

        return binned_values

# Load C code embeddings dataset
dataset = pd.read_csv("/content/new_dataset.csv")  # Adjust the path to your dataset

# Convert the entire dataset to features
features = dataset.drop(columns=['indicator']).values

# Initialize DecisionTreeRootNodeDetector
detector = DecisionTreeRootNodeDetector()

# Binning example
binned_values = detector.bin_continuous_feature(features, binning_type='equal_width', num_bins=10)
print("Binned values (Equal Width Binning):", binned_values)


Binned values (Equal Width Binning): [[ 5  8  5 ... 10 10 10]
 [ 5  7  5 ... 10 10 10]
 [ 5  8  5 ... 10 10 10]
 ...
 [ 3  1  6 ... 10 10 10]
 [ 2  1  5 ... 10 10 10]
 [ 3  1  5 ... -1 -1 -1]]


In [2]:
import numpy as np
import pandas as pd

class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf

    def fit(self, X, y):
        self.tree_ = self._grow_tree(X, y)

    def predict(self, X):
        return [self._predict(inputs) for inputs in X]

    def _grow_tree(self, X, y, depth=0):
        num_samples, num_features = X.shape
        num_classes = len(np.unique(y))

        # Stopping criteria
        if (self.max_depth is not None and depth >= self.max_depth) or \
           (num_classes == 1) or \
           (num_samples < self.min_samples_split):
            leaf_value = self._most_common_label(y)
            return {'leaf': True, 'value': leaf_value}

        best_feature_index, best_threshold = self._find_best_split(X, y)

        if best_feature_index is None:
            leaf_value = self._most_common_label(y)
            return {'leaf': True, 'value': leaf_value}

        left_indices = np.where(X[:, best_feature_index] <= best_threshold)[0]
        right_indices = np.where(X[:, best_feature_index] > best_threshold)[0]

        left_subtree = self._grow_tree(X[left_indices, :], y[left_indices], depth + 1)
        right_subtree = self._grow_tree(X[right_indices, :], y[right_indices], depth + 1)

        return {'leaf': False, 'feature_index': best_feature_index, 'threshold': best_threshold,
                'left': left_subtree, 'right': right_subtree}

    def _find_best_split(self, X, y):
        best_info_gain = -1
        best_feature_index = None
        best_threshold = None

        num_samples, num_features = X.shape

        for feature_index in range(num_features):
            feature_values = X[:, feature_index]
            unique_values = np.unique(feature_values)
            thresholds = (unique_values[:-1] + unique_values[1:]) / 2

            for threshold in thresholds:
                left_indices = np.where(feature_values <= threshold)[0]
                right_indices = np.where(feature_values > threshold)[0]

                if len(left_indices) < self.min_samples_leaf or len(right_indices) < self.min_samples_leaf:
                    continue

                splits_labels = [y[left_indices], y[right_indices]]
                info_gain = self._information_gain(y, splits_labels)

                if info_gain > best_info_gain:
                    best_info_gain = info_gain
                    best_feature_index = feature_index
                    best_threshold = threshold

        return best_feature_index, best_threshold

    def _information_gain(self, parent_labels, splits_labels):
        parent_entropy = self._entropy(parent_labels)
        splits_entropy = 0
        total_samples = sum(len(split) for split in splits_labels)
        for split_labels in splits_labels:
            split_weight = len(split_labels) / total_samples
            splits_entropy += split_weight * self._entropy(split_labels)
        return parent_entropy - splits_entropy

    def _entropy(self, labels):
        _, counts = np.unique(labels, return_counts=True)
        probabilities = counts / len(labels)
        return -np.sum(probabilities * np.log2(probabilities))

    def _most_common_label(self, labels):
        return np.bincount(labels).argmax()

    def _predict(self, inputs):
        node = self.tree_
        while not node['leaf']:
            if inputs[node['feature_index']] <= node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['value']

# Load dataset
dataset = pd.read_csv("/content/new_dataset.csv")  # Adjust the path to your dataset

# Display first few rows of the dataset to understand its structure
print(dataset.head())

# Check for missing values
print("Missing values in dataset:")
print(dataset.isnull().sum())

# Drop rows with missing values
dataset.dropna(inplace=True)

# Convert non-numeric columns to numeric
dataset['indicator'] = dataset['indicator'].map({'code_only': 0, 'code_comm': 1})

# Extract features and labels
X = dataset.drop(columns=['indicator']).values
y = dataset['indicator'].values

# Initialize DecisionTree
tree = DecisionTree()

# Fit the model
tree.fit(X, y)

# Example prediction
example_input = X[0]
predicted_label = tree.predict([example_input])
print("Predicted label:", predicted_label[0])


          0         1         2         3         4         5         6  \
0 -0.560180  0.397133 -0.067608 -1.361568 -1.189112  0.362005 -2.113351   
1 -0.617345  0.004933  0.107997 -1.275459 -1.116872  0.391739 -2.048924   
2 -0.597761  0.410195 -0.095720 -1.338432 -1.206770  0.355120 -2.098167   
3 -0.546645  0.208720 -0.045373 -1.222537 -1.078728  0.421885 -2.104985   
4 -0.643125  0.430050 -0.008826 -1.351897 -1.171904  0.367173 -2.125630   

          7         8         9  ...       760       761       762       763  \
0 -0.945830  0.967215 -1.035563  ... -2.182985 -1.788343 -1.500597  0.575761   
1 -1.050489  0.850765 -1.035608  ... -2.184924 -1.784352 -1.194703  0.194767   
2 -0.965952  0.973628 -1.008978  ... -2.188181 -1.790508 -1.496068  0.613387   
3 -1.122851  0.968126 -0.895802  ... -2.176035 -1.723567 -1.477411  0.382508   
4 -0.877730  0.865652 -1.048714  ... -2.250539 -1.726540 -1.531785  0.549569   

        764       765       766       767  score  indicator  
0 -1.4