In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Function to read data from CSV files
def read_data(filename):
    """Load features and labels from a comma-separated file.

    The first line of the file is skipped as a header. Columns 0 and 1 are
    returned as the feature matrix and column 2 as the label vector.

    Parameters
    ----------
    filename : str or path-like
        Location of the CSV file.

    Returns
    -------
    X_data : numpy.ndarray of shape (n_samples, 2)
        Feature columns.
    y_data : numpy.ndarray of shape (n_samples,)
        Label column.
    """
    # Parse the file a single time instead of once per output array.
    table = np.genfromtxt(filename, delimiter=',', skip_header=1, usecols=(0, 1, 2))
    # genfromtxt collapses a one-row file to a 1-D array; force a 2-D table so
    # callers can always index rows and columns.
    table = table.reshape(-1, 3)
    return table[:, :2], table[:, 2]

# Function for weak classifier training
def weak_classifier(X, y, D):
    """Fit a weighted decision stump (single-feature threshold classifier).

    Every observed value of every feature is tried as a threshold. Samples
    with ``feature <= threshold`` form the left side, samples with
    ``feature > threshold`` the right side, and both label polarities
    (left=-1/right=+1 and left=+1/right=-1) are evaluated. The candidate
    with the smallest weighted error wins; on ties the first candidate
    encountered (lowest feature index, then lowest threshold, then
    left=-1) is kept.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features. Any number of feature columns is supported.
    y : array-like of shape (n_samples,)
        Training labels, compared against -1 and +1.
    D : array-like of shape (n_samples,)
        Per-sample weights.

    Returns
    -------
    tuple
        ``(beta_t, best_split_feature, best_split_value, left_class, right_class)``
        where ``beta_t = 0.5 * log((1 - err) / err)`` for the winning
        weighted error ``err``.

    Raises
    ------
    ValueError
        If no candidate split could be scored (for example, no samples or
        only NaN weights).
    """
    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    D = np.asarray(D, dtype=float)

    min_error = np.inf
    best_split_feature = None
    best_split_value = None
    left_class = None
    right_class = None

    for j in range(X.shape[1]):
        feature = X[:, j]
        # np.unique returns the distinct values in ascending order, which is
        # the same candidate order as scanning the samples sorted by feature.
        for threshold in np.unique(feature):
            goes_left = feature <= threshold
            goes_right = feature > threshold
            for maj_left in (-1, 1):
                maj_right = -maj_left
                misclassified = (goes_left & (y != maj_left)) | (goes_right & (y != maj_right))
                error = D[misclassified].sum()
                if error < min_error:
                    min_error = error
                    best_split_value = threshold
                    best_split_feature = j
                    left_class = maj_left
                    right_class = maj_right

    if best_split_feature is None:
        raise ValueError("weak_classifier could not score any split; check X and D")

    # A perfect stump has zero weighted error, which would make the log
    # argument divide by zero; clamp the error away from 0 and 1 so the
    # vote weight stays finite.
    eps = 1e-10
    clipped_error = np.clip(min_error, eps, 1 - eps)
    beta_t = 0.5 * np.log((1 - clipped_error) / clipped_error)
    return beta_t, best_split_feature, best_split_value, left_class, right_class

def update_weights(X, y, D, model_t):
    """Re-weight the training samples after one boosting round.

    A sample is misclassified when it falls on the left side of the stump
    (``feature <= split_value``) with a label different from the left class,
    or on the right side (``feature > split_value``) with a label different
    from the right class. Misclassified samples are scaled by
    ``exp(beta_t)``, all others by ``exp(-beta_t)``, and the result is
    divided by its sum.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training features. Any number of feature columns is supported.
    y : array-like of shape (n_samples,)
        Training labels.
    D : array-like of shape (n_samples,)
        Current sample weights.
    model_t : sequence
        ``(beta_t, split_feature, split_value, left_class, right_class)``.

    Returns
    -------
    numpy.ndarray of shape (n_samples,)
        Updated, normalised sample weights.
    """
    beta_t = model_t[0]
    # The feature index may arrive as a float when the model was stored in a
    # NumPy array, so cast it before using it as a column index.
    split_feat = int(model_t[1])
    split_value = model_t[2]
    majleft = model_t[3]
    majright = model_t[4]

    X = np.asarray(X, dtype=float)
    y = np.asarray(y)
    D = np.asarray(D, dtype=float)

    feature = X[:, split_feat]
    misclassified = ((feature <= split_value) & (y != majleft)) | (
        (feature > split_value) & (y != majright)
    )

    # Boost the weight of mistakes and shrink the weight of correct samples.
    scaled = D * np.where(misclassified, np.exp(beta_t), np.exp(-beta_t))

    # Normalise by the total of the rescaled weights.
    return scaled / scaled.sum()



In [None]:
def weak_predict(X_test, model):
    """Return the weighted votes of one decision stump for every test row.

    ``model`` is indexed as ``(beta_t, split_feature, split_value,
    maj_left, maj_right)``. A row votes ``maj_left`` when its split feature
    is ``<= split_value`` and ``maj_right`` otherwise; each vote is then
    multiplied by ``beta_t``.
    """
    vote_weight = model[0]
    feature_idx = int(model[1])
    threshold = model[2]
    label_left, label_right = model[3], model[4]

    # One raw class vote per row of X_test
    votes = [
        label_left if row[feature_idx] <= threshold else label_right
        for row in X_test
    ]

    # Scale the votes by the stump's weight
    return vote_weight * np.asarray(votes)


def adaboost_train(num_iter, X_train, y_train):
    """Train ``num_iter`` weak classifiers in sequence and return them as a list.

    The sample weights start out uniform (``1 / n_samples`` each). Each round
    fits a stump with ``weak_classifier`` on the current weights and then
    refreshes the weights with ``update_weights`` for the next round.
    """
    n_samples = len(X_train)
    # Uniform starting weights
    weights = np.full(n_samples, 1 / n_samples)

    stumps = []
    for _ in range(num_iter):
        stump = weak_classifier(X_train, y_train, weights)
        stumps.append(stump)
        # Re-weight the samples for the next round
        weights = update_weights(X_train, y_train, weights, stump)
    return stumps


In [None]:
def eval_model(X_test, y_test, hlist):
    """Return the accuracy of the ensemble ``hlist`` on the given test set.

    The weighted votes of every weak classifier are summed per sample; a
    sample counts as an error when the sign of its summed score differs
    from its label in ``y_test``.
    """
    # Running sum of weighted votes, one entry per test sample
    scores = np.zeros(len(X_test))
    for stump in hlist:
        scores += weak_predict(X_test, stump)

    # Count the samples whose score sign disagrees with the true label
    n_wrong = sum(
        1 for idx in range(len(scores)) if np.sign(scores[idx]) != y_test[idx]
    )

    return 1 - (n_wrong / len(scores))


In [None]:

def main(num_iter=400,
         train_path="/content/train_adaboost.csv",
         test_path="/content/test_adaboost.csv"):
    """Train an AdaBoost ensemble and print its accuracy on the test set.

    Parameters
    ----------
    num_iter : int, optional
        Number of weak classifiers to train (default 400).
    train_path : str, optional
        CSV file with the training data.
    test_path : str, optional
        CSV file with the test data.

    Notes
    -----
    Only the accuracy of the complete ensemble is reported, so the ensemble
    is evaluated once rather than once per prefix of the classifier list.
    """
    X_train, y_train = read_data(train_path)
    X_test, y_test = read_data(test_path)

    h_list = adaboost_train(num_iter, X_train, y_train)
    accuracy = eval_model(X_test, y_test, h_list)

    # Separate print arguments keep the output spacing of the original
    # message, including the double space before the number.
    print("accuracy after training", num_iter, "weak classifiers: ", accuracy * 100, "%")


# Entry point: executing this cell trains the ensemble and prints its test accuracy.
main()





accuracy after training 400 weak classifiers:  97.0 %
