In [None]:
import numpy as np
import matplotlib.pyplot as plt
from joblib import Parallel, delayed
import time
import time
import psutil

# Function to read data from CSV files
def read_data(filename):
    """Load the feature matrix and label vector from a CSV file.

    The first line of the file is skipped as a header. Columns 0 and 1 are
    returned as the features and column 2 as the labels.

    Parameters
    ----------
    filename : path of the comma-separated file, passed to ``np.genfromtxt``.

    Returns
    -------
    (X_data, y_data) : tuple of ndarrays
        ``X_data`` has shape ``(n_rows, 2)`` and ``y_data`` shape ``(n_rows,)``.
    """
    # Parse the file once instead of once per output array.
    table = np.genfromtxt(filename, delimiter=',', skip_header=1, usecols=(0, 1, 2))
    # genfromtxt collapses a single data row to a 1-D array; restore the
    # (rows, 3) layout so callers can always index X as 2-D and y as 1-D.
    table = table.reshape(-1, 3)
    return table[:, :2], table[:, 2]

# Function to calculate error for each feature and split value
def calculate_error(feature_index, X_sorted, min_error, best_split_value, best_split_feature, left_class, right_class):
    """Search one feature for the decision stump with the lowest weighted error.

    Every value of the feature is tried as a threshold, with both label
    assignments (-1 left / +1 right and +1 left / -1 right). The weighted
    error of a candidate is the sum of the weights of the rows whose label
    differs from the class assigned to their side of the threshold.

    Parameters
    ----------
    feature_index : column of ``X_sorted`` to split on.
    X_sorted : 2-D array whose last two columns are the label and the sample
        weight; all preceding columns are features. Row order only affects
        which candidate wins among equal errors, because a later candidate
        must be strictly better to replace the current best.
    min_error, best_split_value, best_split_feature, left_class, right_class :
        the best stump known so far; returned unchanged when no candidate
        has a strictly smaller error than ``min_error``.

    Returns
    -------
    tuple ``(min_error, best_split_value, best_split_feature, left_class, right_class)``
    """
    X_sorted = np.asarray(X_sorted)
    feature_values = X_sorted[:, feature_index]
    # Label and weight are addressed from the end so the function works for
    # any number of feature columns (for two features these are columns 2, 3).
    labels = X_sorted[:, -2]
    weights = X_sorted[:, -1]

    for threshold in feature_values:
        goes_left = feature_values <= threshold
        goes_right = feature_values > threshold
        for maj_left in (-1, 1):
            maj_right = -1 * maj_left
            misclassified = (goes_left & (labels != maj_left)) | (goes_right & (labels != maj_right))
            error = weights[misclassified].sum()
            if error < min_error:
                min_error = error
                best_split_value = threshold
                best_split_feature = feature_index
                left_class = maj_left
                right_class = maj_right
    return min_error, best_split_value, best_split_feature, left_class, right_class

# Function for weak classifier training
def weak_classifier(X, y, D):
    """Train one decision stump on the weighted training set.

    Each feature is searched independently (one joblib task per feature) by
    ``calculate_error``; the stump with the smallest weighted error over all
    features is kept.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of labels (compared against -1 / +1 by ``calculate_error``).
    D : 1-D array of sample weights.

    Returns
    -------
    tuple ``(beta_t, best_split_feature, best_split_value, left_class, right_class)``
        ``beta_t`` is ``0.5 * log((1 - err) / err)`` for the stump's weighted
        error ``err``. Note the order differs from ``calculate_error``'s
        return value.
    """
    # Columns: features..., label, weight.
    samples = np.hstack((X, y[:, np.newaxis], D[:, np.newaxis]))
    num_features = samples.shape[1] - 2
    # Define the number of parallel jobs (adjust as needed)
    num_jobs = -1  # Use all available cores
    results = Parallel(n_jobs=num_jobs)(
        delayed(calculate_error)(
            j,
            # Stable sort keeps the original relative order of equal values.
            samples[np.argsort(samples[:, j], kind="stable")],
            np.inf, 0, 0, 0, 0,
        )
        for j in range(num_features)
    )
    # Keep the best stump across ALL features (ties go to the lowest feature
    # index), not merely the result of whichever feature was searched last.
    min_error, best_split_value, best_split_feature, left_class, right_class = min(
        results, key=lambda result: result[0]
    )
    # A perfect (or perfectly wrong) stump would give a zero denominator or
    # log(0); clamp the error so beta_t stays finite.
    eps = 1e-10
    clipped_error = min(max(min_error, eps), 1 - eps)
    beta_t = 0.5 * np.log((1 - clipped_error) / clipped_error)
    return beta_t, best_split_feature, best_split_value, left_class, right_class

# Function to update sample weights
def update_weights(X, y, D, model_t):
    """Re-weight the samples after one boosting round and normalise them.

    A sample is misclassified when its label differs from the class the stump
    assigns to its side of the split. Misclassified samples are scaled by
    ``exp(beta_t)``, all others by ``exp(-beta_t)``, and the result is divided
    by its sum.

    Parameters
    ----------
    X : 2-D array of features, one row per sample.
    y : 1-D array of labels.
    D : 1-D array of current sample weights.
    model_t : sequence ``(beta_t, split_feature, split_value, left_class,
        right_class)`` as returned by ``weak_classifier``.

    Returns
    -------
    ndarray of shape ``(n_samples,)`` with the normalised new weights.
    """
    beta_t = model_t[0]
    split_feat = int(model_t[1])
    split_value = model_t[2]
    majleft = model_t[3]
    majright = model_t[4]

    X = np.asarray(X, dtype=float)
    y = np.asarray(y, dtype=float)
    D = np.asarray(D, dtype=float)

    feature_values = X[:, split_feat]
    on_left = feature_values <= split_value
    on_right = feature_values > split_value
    # Two explicit masks (rather than "not on_left") so a NaN feature value
    # falls on neither side and is treated as correctly classified.
    misclassified = (on_left & (y != majleft)) | (on_right & (y != majright))

    # Vectorised update: no chunking, so it also works for fewer than ten
    # samples and for any number of feature columns.
    scaled = D * np.exp(np.where(misclassified, beta_t, -beta_t))
    return scaled / scaled.sum()




In [None]:
def weak_predict(X_test, model):
    """Return one stump's weighted vote for every row of ``X_test``.

    ``model`` is indexed as ``(beta_t, split_feature, split_value, maj_left,
    maj_right)``. A row votes ``maj_left`` when its split feature is
    ``<= split_value`` and ``maj_right`` otherwise; the votes are returned as
    an array multiplied by ``beta_t``.
    """
    beta_t, split_value, maj_left, maj_right = model[0], model[2], model[3], model[4]
    # The index may arrive as a float when the model is a row of a NumPy array.
    split_feature = int(model[1])

    votes = [
        maj_left if sample[split_feature] <= split_value else maj_right
        for sample in X_test
    ]
    return beta_t * np.asarray(votes)


def adaboost_train(num_iter, X_train, y_train):
    """Run ``num_iter`` boosting rounds and return the trained stumps.

    The sample weights start uniform (``1 / len(X_train)`` each). In every
    round a stump is fitted with ``weak_classifier`` on the current weights,
    and the weights are then refreshed with ``update_weights``.

    Returns a list with one ``weak_classifier`` result per round, in order.
    """
    n_samples = len(X_train)
    D = np.full(n_samples, 1 / n_samples)

    hlist = []
    for _ in range(num_iter):
        stump = weak_classifier(X_train, y_train, D)
        hlist.append(stump)
        # Re-weight the samples for the next round using the stump just fitted.
        D = update_weights(X_train, y_train, D, stump)
    return hlist


In [None]:
def eval_model(X_test, y_test, hlist):
    """Return the accuracy of the boosted ensemble ``hlist`` on a test set.

    Each stump's weighted votes come from ``weak_predict`` (one joblib task
    per stump). The votes are summed per sample and the sign of the sum is
    the ensemble prediction; accuracy is one minus the fraction of samples
    whose prediction differs from ``y_test``.
    """
    # One task per weak classifier, spread over all available cores.
    weighted_votes = Parallel(n_jobs=-1)(
        delayed(weak_predict)(X_test, stump) for stump in hlist
    )
    ensemble_score = np.sum(weighted_votes, axis=0)

    # A score of exactly zero has sign 0 and therefore never matches a label.
    n_wrong = np.sum(np.sign(ensemble_score) != y_test)
    return 1 - (n_wrong / len(y_test))


In [None]:
def main():
    """Train a 400-round AdaBoost model, evaluate it and report the timing.

    Reads the train and test CSV files, trains the ensemble, computes the
    test accuracy of every prefix of the ensemble (1 stump, 2 stumps, ...),
    then prints the accuracy of the full ensemble and the elapsed wall-clock
    time in minutes.
    """
    t_start = time.time()  # Wall-clock reference for the timing report

    X_train, y_train = read_data("/content/train_adaboost.csv")
    X_test, y_test = read_data("/content/test_adaboost.csv")

    h_list = np.asarray(adaboost_train(400, X_train, y_train))

    # Accuracy after 1, 2, ..., len(h_list) boosting rounds.
    accuracytest = [
        eval_model(X_test, y_test, h_list[:n_stumps])
        for n_stumps in range(1, len(h_list) + 1)
    ]
    t_end = time.time()

    # NOTE(review): not used below — presumably the x-axis for an accuracy plot.
    iters = np.linspace(1, 400, 400)

    print("accuracy after training 400 weak classifiers: ", accuracytest[-1] * 100, "%")
    elapsed_minutes = round((t_end - t_start) / 60, 2)
    print("Time taken:", elapsed_minutes, "minutes")  # Print time taken in minutes




# Run the full train / evaluate / report pipeline when this cell executes.
main()





accuracy after training 400 weak classifiers:  76.0 %
Time taken: 3.35 minutes
