In [3]:
# You are not allowed to import any additional packages/libraries.
import numpy as np
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# This function computes the gini impurity of a label array.
def gini(y):
  a, counts = np.unique(y, return_counts=True)
  probabilities = counts / len(y)
  gini = 1 - np.sum(probabilities**2)
  return gini

# This function computes the entropy of a label array.
def entropy(y):
  a, counts = np.unique(y, return_counts=True)
  probabilities = counts / len(y)
  entropy_value = -np.sum(probabilities * np.log2(probabilities))
  return entropy_value


# The decision tree classifier class.
# Tips: You may need another node class and build the decision tree recursively.
class DecisionTree():
    def __init__(self, criterion='gini', max_depth=None):
        self.criterion = criterion
        self.max_depth = max_depth
        self.tree = None

    def gini(self,y):
      a, counts = np.unique(y, return_counts=True)
      probabilities = counts / len(y)
      gini = 1 - np.sum(probabilities**2)
      return gini

# This function computes the entropy of a label array.
    def entropy(self,y):
      a, counts = np.unique(y, return_counts=True)
      probabilities = counts / len(y)
      entropy_value = -np.sum(probabilities * np.log2(probabilities))
      return entropy_value
    # This function computes the impurity based on the criterion.
    def impurity(self, y):
        if self.criterion == 'gini':
            return gini(y)
        elif self.criterion == 'entropy':
            return entropy(y)


    # This function fits the given data using the decision tree algorithm.
    def fit(self, X, y, depth=0):
        num_samples, num_features = X.shape
        unique_classes = np.unique(y)

        # If only one class in the data or max depth reached, return a leaf node
        if len(unique_classes) == 1 or (self.max_depth is not None and depth == self.max_depth):
            return {'class': unique_classes[0]}

        # Find the best split
        best_split = self._find_best_split(X, y)

        # If no split is found, return a leaf node
        if best_split is None:
            return {'class': np.bincount(y).argmax()}

        # Split the data
        left_mask = X[:, best_split['feature']] <= best_split['threshold']
        right_mask = ~left_mask

        # Recursively build left and right subtrees
        left_subtree = self.fit(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self.fit(X[right_mask], y[right_mask], depth + 1)
        self.tree={
            'feature': best_split['feature'],
            'threshold': best_split['threshold'],
            'left': left_subtree,
            'right': right_subtree
        }
        return self.tree
        # return {
        #     'feature': best_split['feature'],
        #     'threshold': best_split['threshold'],
        #     'left': left_subtree,
        #     'right': right_subtree
        # }

    def _find_best_split(self, X, y):
        num_samples, num_features = X.shape
        best_split = None
        best_gini = float('inf')

        for feature in range(num_features):
            unique_values = np.unique(X[:, feature])
            for value in unique_values:
                left_mask = X[:, feature] <= value
                right_mask = ~left_mask

                left_gini = self.gini(y[left_mask])
                right_gini = self.gini(y[right_mask])
                gini = (len(y[left_mask]) / num_samples) * left_gini + (len(y[right_mask]) / num_samples) * right_gini
                if gini < best_gini:
                    best_gini = gini
                    best_split = {'feature': feature, 'threshold': value}

        return best_split


    # This function takes the input data X and predicts the class label y according to your trained model.
    def predict(self, X):
      return np.array([self._predict_tree(x, self.tree) for x in X])
    def _predict_tree(self, x, node):
      if 'class' in node:
        return node['class']
      if x[node['feature']] <= node['threshold']:
        return self._predict_tree(x, node['left'])
      else:
        return self._predict_tree(x, node['right'])

    # This function plots the feature importance of the decision tree.
    def plot_feature_importance_img(self, columns):
        if self.tree is None:
            raise RuntimeError("The model has not been trained. Call fit() before plotting feature importance.")

        feature_importance = self._calculate_feature_importance(self.tree,columns)
        sorted_idx = np.argsort(feature_importance)

        plt.figure(figsize=(10, 6))
        plt.barh(range(len(sorted_idx)), feature_importance[sorted_idx])
        plt.yticks(range(len(sorted_idx)), np.array(columns)[sorted_idx])
        plt.xlabel('Feature Importance')
        plt.ylabel('Feature')
        plt.title('Decision Tree Feature Importance')
        plt.show()

    def _calculate_feature_importance(self, tree,columns):
        if 'class' in tree:
            return np.zeros_like(columns, dtype=float)

        importance = np.zeros_like(columns, dtype=float)
        importance[tree['feature']] += 1

        importance_left = self._calculate_feature_importance(tree['left'],columns)
        importance_right = self._calculate_feature_importance(tree['right'],columns)

        return importance + importance_left + importance_right

# The AdaBoost classifier class.
class AdaBoost():
    def __init__(self, criterion='gini', n_estimators=200):
        self.criterion = criterion
        self.n_estimators = n_estimators

    # This function fits the given data using the AdaBoost algorithm.
    # You need to create a decision tree classifier with max_depth = 1 in each iteration.
    def fit(self, X, y):
        pass

    # This function takes the input data X and predicts the class label y according to your trained model.
    def predict(self, X):
        pass

# Do not modify the main function architecture.
# You can only modify the value of the random seed and the the arguments of your Adaboost class.
if __name__ == "__main__":
# Data Loading
    train_df = DataFrame(read_csv("train.csv"))
    test_df = DataFrame(read_csv("test.csv"))
    X_train = train_df.drop(["target"], axis=1)
    y_train = train_df["target"]
    X_test = test_df.drop(["target"], axis=1)
    y_test = test_df["target"]

    X_train = X_train.to_numpy()
    y_train = y_train.to_numpy()
    X_test = X_test.to_numpy()
    y_test = y_test.to_numpy()

# Set random seed to make sure you get the same result every time.
# You can change the random seed if you want to.
    np.random.seed(0)

# Decision Tree
    print("Part 1: Decision Tree")
    data = np.array([0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1])
    print(f"gini of [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1]: {gini(data)}")
    print(f"entropy of [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1]: {entropy(data)}")
    tree = DecisionTree(criterion='gini', max_depth=7)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    print("Accuracy (gini with max_depth=7):", accuracy_score(y_test, y_pred))
    tree = DecisionTree(criterion='entropy', max_depth=7)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    print("Accuracy (entropy with max_depth=7):", accuracy_score(y_test, y_pred))

# # AdaBoost
#     print("Part 2: AdaBoost")
#     # Tune the arguments of AdaBoost to achieve higher accuracy than your Decision Tree.
#     ada = AdaBoost(criterion='gini', n_estimators=200)
#     ada.fit(X_train, y_train)
#     y_pred = ada.predict(X_test)
#     print("Accuracy:", accuracy_score(y_test, y_pred))





Part 1: Decision Tree
gini of [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1]: 0.4628099173553719
entropy of [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1]: 0.9456603046006401
Accuracy (gini with max_depth=7): 0.7049180327868853
Accuracy (entropy with max_depth=7): 0.7049180327868853


In [5]:
# You are not allowed to import any additional packages/libraries.
import numpy as np
from pandas import DataFrame, read_csv
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

# This function computes the gini impurity of a label array.
def gini(y):
    a, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    gini = 1 - np.sum(probabilities**2)
    return gini

# This function computes the entropy of a label array.
def entropy(y):
    a, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    entropy_value = -np.sum(probabilities * np.log2(probabilities))
    return entropy_value
        
# The decision tree classifier class.
# Tips: You may need another node class and build the decision tree recursively.
class DecisionTree():
    def __init__(self, criterion='gini', max_depth=None):
        self.criterion = criterion
        self.max_depth = max_depth 
        self.tree = None
    
    # This function computes the impurity based on the criterion.
    def impurity(self, y):
        if self.criterion == 'gini':
            return gini(y)
        elif self.criterion == 'entropy':
            return entropy(y)
    def gini(self,y):
        a, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        gini = 1 - np.sum(probabilities**2)
        return gini
    def entropy(self,y):
        a, counts = np.unique(y, return_counts=True)
        probabilities = counts / len(y)
        entropy_value = -np.sum(probabilities * np.log2(probabilities))
        return entropy_value
    # This function fits the given data using the decision tree algorithm.
    def fit(self, X, y,depth=0):
        num_samples, num_features = X.shape
        unique_classes = np.unique(y)
        if len(unique_classes) == 1 or (self.max_depth is not None and depth == self.max_depth):
            return {'class': unique_classes[0]}
        best_split = self._find_best_split(X, y)
        if best_split is None:
            return {'class': np.bincount(y).argmax()}
        left_mask = X[:, best_split['feature']] <= best_split['threshold']
        right_mask = ~left_mask
        left_subtree = self.fit(X[left_mask], y[left_mask], depth + 1)
        right_subtree = self.fit(X[right_mask], y[right_mask], depth + 1)
        self.tree={
            'feature': best_split['feature'],
            'threshold': best_split['threshold'],
            'left': left_subtree,
            'right': right_subtree
        }
        return self.tree
    def _find_best_split(self, X, y):
        num_samples, num_features = X.shape
        best_split = None
        best_gini = float('inf')

        for feature in range(num_features):
            unique_values = np.unique(X[:, feature])
            for value in unique_values:
                left_mask = X[:, feature] <= value
                right_mask = ~left_mask
                left_gini = self.gini(y[left_mask])
                right_gini = self.gini(y[right_mask])
                gini = (len(y[left_mask]) / num_samples) * left_gini + (len(y[right_mask]) / num_samples) * right_gini
                if gini < best_gini:
                    best_gini = gini
                    best_split = {'feature': feature, 'threshold': value}

        return best_split
    # This function takes the input data X and predicts the class label y according to your trained model.
    def predict(self, X):
        return np.array([self._predict_tree(x, self.tree) for x in X])
    def _predict_tree(self, x, node):
        if 'class' in node:
            return node['class']
        if x[node['feature']] <= node['threshold']:
            return self._predict_tree(x, node['left'])
        else:
            return self._predict_tree(x, node['right'])
    # This function plots the feature importance of the decision tree.
    def plot_feature_importance_img(self, columns):
        pass

# The AdaBoost classifier class.
class Decision:
    def __init__(self,feature,threshold,polarity):
        self.feature =feature
        self.threshold=threshold
        self.polarity=polarity
    def predict(self,X):
        return self.polarity*(X[:,self.feature]>self.threshold)

class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y, weights):
        self.feature, self.threshold, self.polarity = self.create_weak_learner(X, y, weights)

    def create_weak_learner(self, X, y, weights):
        m, n = X.shape
        best_feature = 0
        threshold = 0
        polarity = 1
        min_error = float('inf')

        for feature in range(n):
            unique_values = np.unique(X[:, feature])
            for value in unique_values:
                for sign in [1, -1]:
                    predictions = sign * np.ones(m)
                    error = np.sum(weights * (predictions != y))

                    if error < min_error:
                        min_error = error
                        best_feature = feature
                        threshold = value
                        polarity = sign

        return best_feature, threshold, polarity

    def predict(self, X):
        return self.polarity * (X[:, self.feature] > self.threshold)

class AdaBoost():
    def __init__(self, criterion='gini', n_estimators=200,max_depth=None):
        self.criterion = criterion 
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.alphas=[]
        self.DTs=[]

    # This function fits the given data using the AdaBoost algorithm.
    # You need to create a decision tree classifier with max_depth = 1 in each iteration.
    def fit(self, X, y):
        m=len(y)
        weights=np.ones(m)/m
        for i in range(self.n_estimators):
            DT=self.create_weak_learner(X,y,weights)
            predictions = DT.predict(X)
            weighted_e=np.sum(weights*(predictions!=y))
            alpha =0.9*np.log((1-weighted_e)/max(weighted_e,1e-10))
            weights=weights*np.exp(-alpha*y*predictions)
            weights/=np.sum(weights)
            self.alphas.append(alpha)
            self.DTs.append(DT)
    def create_weak_learner(self,X,y,weights):
        m,n=X.shape
        best_feature=0
        threshold=0
        polarity=1
        min_error=float('inf')
        for feature in range(n):
            unique_values=np.unique(X[:,feature])
            for value in unique_values:
                for sign in [1,-1]:
                    predictions =sign*np.ones(m)
                    error =np.sum(weights*(predictions!=y))
                    if error<min_error:
                        min_error=error
                        best_feature=feature
                        threshold=value
                        polarity=sign
        return Decision(feature=best_feature,threshold=threshold,polarity=polarity)
        # {'feature':best_feature,'threshold':threshold,'polarity':polarity}
    def predict(self,X):
        predictions=np.zeros(len(X))
        for alpha ,DT in zip(self.alphas,self.DTs):
            predictions +=alpha*DT.predict(X)
        return np.sign(predictions).astype(int)

    # This function takes the input data X and predicts the class label y according to your trained model.


# Do not modify the main function architecture.
# You can only modify the value of the random seed and the the arguments of your Adaboost class.
if __name__ == "__main__":
# Data Loading
    train_df = DataFrame(read_csv("train.csv"))
    test_df = DataFrame(read_csv("test.csv"))
    X_train = train_df.drop(["target"], axis=1)
    y_train = train_df["target"]
    X_test = test_df.drop(["target"], axis=1)
    y_test = test_df["target"]

    X_train = X_train.to_numpy()
    y_train = y_train.to_numpy()
    X_test = X_test.to_numpy()
    y_test = y_test.to_numpy()

# Set random seed to make sure you get the same result every time.
# You can change the random seed if you want to.
    np.random.seed(0)

# Decision Tree
    print("Part 1: Decision Tree")
    data = np.array([0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1])
    print(f"gini of [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1]: {gini(data)}")
    print(f"entropy of [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1]: {entropy(data)}")
    tree = DecisionTree(criterion='gini', max_depth=7)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    print("Accuracy (gini with max_depth=7):", accuracy_score(y_test, y_pred))
    tree = DecisionTree(criterion='entropy', max_depth=7)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)
    print("Accuracy (entropy with max_depth=7):", accuracy_score(y_test, y_pred))

# AdaBoost
    print("Part 2: AdaBoost")
    # Tune the arguments of AdaBoost to achieve higher accuracy than your Decision Tree.
    ada = AdaBoost(criterion='gini', n_estimators=1000)
    ada.fit(X_train, y_train)
    y_pred = ada.predict(X_test)
    print("Accuracy:", accuracy_score(y_test, y_pred))


Part 1: Decision Tree
gini of [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1]: 0.4628099173553719
entropy of [0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1]: 0.9456603046006401
Accuracy (gini with max_depth=7): 0.7049180327868853
Accuracy (entropy with max_depth=7): 0.7049180327868853
Part 2: AdaBoost
Accuracy: 0.5081967213114754
