In [23]:
import pandas as pd
import math
import numpy as np

In [24]:
#calculate entropy
def entropy(data, outcome_label):
    """
    Shannon entropy (base 2) of the outcome column.

    data is the DataFrame holding the observations
    outcome_label is the name of the outcome column in data

    Returns the entropy in bits (0 for an empty or single-class column).
    """
    # normalize=True divides by the number of non-missing labels, so the
    # probabilities sum to 1 even when the column contains NaN (dividing by
    # len(column) would count rows that value_counts has already dropped).
    probabilities = data[outcome_label].value_counts(normalize=True)

    # A zero-frequency class (possible with a categorical dtype) carries no
    # information; filter it out so log2(0) is never evaluated.
    probabilities = probabilities[probabilities > 0]

    return -np.sum(probabilities * np.log2(probabilities))

#calculate information gain for each feature in each level
def information_gain_fun(data, features, outcome_label, n_subset):
    """
    Return the feature with the largest information gain among a random
    subset of the candidate features.

    data is the DataFrame holding features and outcome
    features is the list of candidate feature column names
    outcome_label is the name of the outcome column in data
    n_subset is how many features are drawn (without replacement) as
             candidates; all features are used when fewer are available

    Raises ValueError when features is empty.
    """
    if len(features) == 0:
        raise ValueError("features must contain at least one column name")

    total_entropy = entropy(data, outcome_label)

    if len(features) < n_subset:
        selected_features = features
    else:
        selected_features = np.random.choice(features, size = n_subset, replace=False)

    n_rows = len(data)
    best_feature = None
    best_gain = None

    # Track the running maximum instead of concatenating one-row DataFrames:
    # only the arg-max is needed, and the first feature wins ties.
    for feature in selected_features:
        expected_entropy = 0

        for category in sorted(data[feature].unique()):
            #rows that fall into this category of the feature
            sub_data = data[data[feature] == category]

            #entropy of the category weighted by its share of the rows
            proportion = len(sub_data) / n_rows
            expected_entropy += proportion * entropy(sub_data, outcome_label)

        #information gain of this feature
        information_gain = total_entropy - expected_entropy

        if best_gain is None or information_gain > best_gain:
            best_feature = feature
            best_gain = information_gain

    return best_feature


def ID3(data, features, outcome_label, current_depth, max_depth, n_subset):
    """
    Recursively grow a decision tree.

    data is the DataFrame holding features and outcome
    features is the list of feature names still available for splitting
    outcome_label is the name of the outcome column in data
    current_depth is the depth of the node being built
    max_depth is the depth at which growth stops
    n_subset is the number of candidate features sampled at each split

    Returns a label for a leaf, otherwise a nested dict of the form
    {feature: {feature_value: subtree_or_label}}.
    """
    labels = data[outcome_label]

    # Leaf: every remaining observation carries the same label.
    if labels.nunique() == 1:
        return labels.unique()[0]

    # Leaf: nothing left to split on, or depth limit reached -> majority label.
    if len(features) == 0 or current_depth >= max_depth:
        return labels.mode()[0]

    split_feature = information_gain_fun(data, features, outcome_label, n_subset)

    # The chosen feature is not offered again further down this branch.
    remaining_features = [name for name in features if name != split_feature]

    branches = {}
    for value in data[split_feature].unique():
        branch_rows = data[data[split_feature] == value]

        if branch_rows.empty:
            # No rows matched this value: fall back to the node's majority label.
            branches[value] = labels.mode()[0]
            continue

        branch_data = branch_rows.loc[:, branch_rows.columns != split_feature]
        branches[value] = ID3(
            branch_data,
            remaining_features,
            outcome_label,
            current_depth + 1,
            max_depth,
            n_subset,
        )

    return {split_feature: branches}

In [25]:
def predict_one(trees, predictor_data):
    """
    Walk a tree for a single observation.

    trees is either a leaf label or a nested dict {feature: {value: subtree}}
    predictor_data maps feature names to the observation's values

    Returns the leaf label reached, or None when the observation's value for
    a split feature has no branch in the tree.
    """
    node = trees

    # Descend until something that is not a dict (a leaf label) is reached.
    while isinstance(node, dict):
        split_feature = next(iter(node))   # the single key of this node
        branches = node[split_feature]
        observed_value = predictor_data[split_feature]

        if observed_value not in branches:
            return None

        node = branches[observed_value]

    return node
    
def predict(trees, verify_data, features): #test
    """
    Predict a label for every row of a DataFrame.

    trees is the tree (or leaf label) passed on to predict_one
    verify_data is the DataFrame whose rows are scored
    features is the list of feature columns handed to the tree

    Returns a list with one prediction per row, in row order.
    """
    return [
        predict_one(trees, row[features].to_dict())
        for _, row in verify_data.iterrows()
    ]
    
def calculate_error(predictions, true_labels):
    """
    Fraction of observations whose prediction differs from the true label.

    predictions is a sequence (list, array or Series) of predicted labels
    true_labels is a sequence of the same length with the true labels

    The two sequences are compared by position. A missing prediction (None)
    counts as an error because it never equals a label.

    Raises ValueError when the two sequences differ in length.
    """
    # Convert to arrays first: `list != list` yields one bool rather than an
    # element-wise result, and two Series would be matched on index labels.
    predicted = np.asarray(predictions)
    actual = np.asarray(true_labels)

    if predicted.shape != actual.shape:
        raise ValueError(
            f"predictions and true_labels differ in shape: {predicted.shape} vs {actual.shape}"
        )

    return np.mean(predicted != actual)


def predict_ensemble(classifiers, data, features):
    """
    Combine the predictions of several trees by an unweighted vote.

    classifiers is the list of trees
    data is the DataFrame whose rows are scored
    features is the list of feature columns handed to each tree

    Returns an array with the sign of the summed per-tree predictions for
    every row. With the 1 / -1 labels used in this notebook that is the
    majority label; a tied vote gives 0.
    """
    n_models, n_rows = len(classifiers), len(data)

    # One row of votes per tree, one column per observation.
    votes = np.zeros((n_models, n_rows))
    for model_index, tree in enumerate(classifiers):
        votes[model_index] = predict(tree, data, features)

    vote_totals = votes.sum(axis=0)
    return np.sign(vote_totals)

def calculate_bias_variance(predictions, true_labels):
    """
    Summarise predictions taken along axis 0 (one entry per repetition).

    predictions holds the predictions; statistics are taken over axis 0
    true_labels holds the true labels the averaged predictions are compared to

    Returns (bias, variance):
      bias is the mean of (average prediction - true label) squared
      variance is the mean of the variance of the predictions along axis 0
    """
    # Spread of the predictions across repetitions.
    spread = np.var(predictions, axis=0)

    # Squared gap between the averaged prediction and the truth.
    mean_prediction = np.mean(predictions, axis=0)
    squared_gap = (mean_prediction - true_labels) ** 2

    return np.mean(squared_gap), np.mean(spread)


In [26]:
#regard 'unknown' as its own attribute value (it is left untouched); continuous features are binarized at their median
def data_preprocessing_attribute(data, features, continuous):
    """
    Binarize the continuous features of a DataFrame at their median.

    data is the DataFrame to convert; it is modified in place
    features is the list of feature columns to look at
    continuous is the collection of feature names treated as continuous

    Each value of a continuous feature becomes "no" when it is below the
    column's median and 'yes' otherwise. Other columns are not touched.

    Returns the same DataFrame object that was passed in.
    """
    for column in (name for name in features if name in continuous):
        threshold = data[column].median()

        def side_of_median(value, cutoff=threshold):
            # Anything not strictly below the median lands on the 'yes' side.
            if value < cutoff:
                return "no"
            return 'yes'

        data[column] = data[column].map(side_of_median)

    return data

#load data
# Both CSV files have no header row and share the same column layout.
COLUMN_NAMES = ['age','job','marital','education','default','balance','housing','loan','contact','day','month','duration','campaign','pdays','previous','poutcome','y']

train_data = pd.read_csv(
    "D:\\EIC-Code\\00-Python\\Machine-Learning-HW\\DecisionTree\\bank\\train.csv",
    header = None,
    names = COLUMN_NAMES,
)

test_data = pd.read_csv(
    "D:\\EIC-Code\\00-Python\\Machine-Learning-HW\\DecisionTree\\bank\\test.csv",
    header = None,
    names = COLUMN_NAMES,
)

# Every column except the outcome 'y' is a feature.
features = COLUMN_NAMES[:-1]

continuous = ['age', 'balance', 'day','duration','campaign','pdays','previous']

#preprocess copies so the raw frames stay available
# NOTE(review): each set is binarized at its own medians; confirm the test set
# should not reuse the training medians instead.
train_data_att = data_preprocessing_attribute(train_data.copy(), features, continuous)
test_data_att = data_preprocessing_attribute(test_data.copy(), features, continuous)

#encode the outcome as 1 for 'yes' and -1 for anything else
for frame in (train_data_att, test_data_att):
    frame['y'] = frame['y'].map(lambda label: 1 if label == 'yes' else -1)

In [27]:

# Tree and forest hyper-parameters.
max_depth = 2 #len(train_data_att.columns) #range from 1-16
current_depth = 0            # depth at which tree growth starts
n_trees = 500                # trees grown per forest
n_subsets = [2, 4, 6]        # feature-subset sizes tried at each split

# Column names.
outcome_label = 'y'
weight_label = 'weight_columns'  # not referenced by any other cell shown here

In [28]:
#random forest algorithms
# For each feature-subset size, grow n_trees bagged trees and record the
# ensemble's accuracy on the training and test sets after every added tree.
#
# NOTE: despite their names, the *_errors lists hold accuracies (1 - error).
# NOTE: no random seed is set, so results differ from run to run.
random_trees = []
train_errors1 = []
train_errors2 = []
train_errors3 = []
test_errors1 = []
test_errors2 = []
test_errors3 = []

train_accuracy_lists = [train_errors1, train_errors2, train_errors3]
test_accuracy_lists = [test_errors1, test_errors2, test_errors3]

# Forest grown for each subset size, keyed by n_subset.
random_forests = {}

train_labels = train_data_att[outcome_label]
test_labels = test_data_att[outcome_label]


def _tree_votes(tree, frame):
    """Predictions of one tree as a float array; a missing prediction becomes NaN."""
    return np.array(
        [np.nan if vote is None else vote for vote in predict(tree, frame, features)],
        dtype=float,
    )


for n_subset in n_subsets:

    # Start a fresh ensemble for every subset size; otherwise trees grown with
    # a previous n_subset would keep voting in this one.
    random_trees = []
    random_forests[n_subset] = random_trees

    # Running vote totals. Adding each new tree's votes gives the same sums as
    # re-predicting with the whole ensemble, without the quadratic cost.
    train_votes = np.zeros(len(train_data_att))
    test_votes = np.zeros(len(test_data_att))

    # 2 -> first list, 4 -> second list, anything else -> third list.
    slot = {2: 0, 4: 1}.get(n_subset, 2)

    for i in range(n_trees):

        # Generate random indices with replacement
        indices = np.random.choice(range(len(train_data_att)), size=len(train_data_att), replace=True)

        # Create the bootstrap sample using the selected indices
        bootstrap_train_data_att = train_data_att.iloc[indices]

        decision_stump = ID3 (bootstrap_train_data_att, features, outcome_label, 0, max_depth, n_subset)#with only two levels
        random_trees.append(decision_stump)

        # Score the ensemble on the full training set: a bootstrap sample
        # changes every round and would make successive values incomparable.
        train_votes += _tree_votes(decision_stump, train_data_att)
        train_error = calculate_error(np.sign(train_votes), train_labels)

        #compare true and test in testing dataset
        test_votes += _tree_votes(decision_stump, test_data_att)
        test_error = calculate_error(np.sign(test_votes), test_labels)

        train_accuracy_lists[slot].append(1-train_error)
        test_accuracy_lists[slot].append(1-test_error)

        print(train_error, test_error)

KeyboardInterrupt: 

In [2]:
# Bias / variance of single trees.
# Repeat n_iterations times: draw 1000 training rows, grow a forest for each
# subset size, and keep the predictions of the forest's first tree on the test
# set. Bias and variance are then computed across the iterations.
#
# This cell relies on train_data_att, test_data_att, features, outcome_label,
# ID3, predict and calculate_bias_variance from earlier cells.
n_trees = 500
n_iterations = 100
max_depth = 2 #len(data.columns) #range from 1-16
n_subsets = [2, 4, 6]
bias1 = []
bias2 = []
bias3 = []
variance1 = []
variance2 = []
variance3 = []
stamp1 = []
stamp2 = []
stamp3 = []

bias_lists = [bias1, bias2, bias3]
variance_lists = [variance1, variance2, variance3]
stamp_lists = [stamp1, stamp2, stamp3]

# Per slot: first-tree test predictions (one row per iteration) and the full
# forests, kept so the trees after the first one are not thrown away.
first_tree_predictions = [[], [], []]
forest_lists = [[], [], []]

current_depth = 0
test_labels = test_data_att[outcome_label].to_numpy()

for iteration in range(n_iterations):
    # Generate random indices with replacement
    indices = np.random.choice(range(len(train_data_att)), size=1000, replace=True)

    # Create the bootstrap sample using the selected indices
    sample_train = train_data_att.iloc[indices]

    for n_subset in n_subsets:

        # 2 -> first list, 4 -> second list, anything else -> third list.
        slot = {2: 0, 4: 1}.get(n_subset, 2)

        forest = []
        for tree_index in range(n_trees):

            indices2 = np.random.choice(range(len(sample_train)), size=len(sample_train), replace=True)

            booststrap_train = sample_train.iloc[indices2]

            decision_stump = ID3 (booststrap_train, features, outcome_label, current_depth, max_depth, n_subset) #with only two levels
            forest.append(decision_stump)

        forest_lists[slot].append(forest)

        first_tree = forest[0]
        stamp_lists[slot].append(first_tree)

        # A missing prediction (None) becomes NaN so the row stays numeric.
        predictions_test = predict(first_tree, test_data_att, features)
        first_tree_predictions[slot].append(
            [np.nan if value is None else value for value in predictions_test]
        )

# Bias and variance are statistics across iterations, so they are computed
# once per slot after all iterations have run (one value per list), and the
# variance lists get the computed variance rather than a leftover test_error.
for slot in range(len(first_tree_predictions)):
    if not first_tree_predictions[slot]:
        continue

    prediction_matrix = np.array(first_tree_predictions[slot], dtype=float)
    bias_single, variance_single = calculate_bias_variance(prediction_matrix, test_labels)

    bias_lists[slot].append(bias_single)
    variance_lists[slot].append(variance_single)

NameError: name 'X_train' is not defined