In [72]:
import pandas as pd
import math
import numpy as np

In [73]:
#calculate entropy
def entropy(y):
    """
    Shannon entropy, in bits, of the label distribution in y.

    y is the outcome data series (it must provide ``value_counts()`` and
    ``len()``).

    Returns a Python float. An empty series and a series holding a single
    distinct label both give 0.
    """
    counts = y.value_counts().to_numpy(dtype=float)
    probabilities = counts / len(y)

    # Keep only labels that actually occur (a categorical series also reports
    # its unused categories with a count of 0), so log2 is always defined.
    probabilities = probabilities[probabilities > 0]

    # Vectorised sum: passing a generator expression to np.sum is deprecated
    # and only works through a fallback to the Python builtin sum.
    total = -np.sum(probabilities * np.log2(probabilities))

    # Adding 0.0 turns the "-0.0" of a pure node into a plain 0.0.
    return float(total) + 0.0

#calculate information gain for each feature in each level
def information_gain_fun(X, y):

    """
    Return the name of the column of X with the largest information gain.

    X is the feature data (one column per candidate feature)
    y is the outcome data series

    When X and y share the same index, rows are paired by position. This
    matters for bootstrap samples: their index contains repeated labels, and a
    label-based lookup would return every copy of a label once per copy,
    inflating the sub-sample and distorting its entropy. When the indexes
    differ, the labels of each category are looked up in y through the index.

    Ties are resolved in favour of the left-most column.
    Raises ValueError when X has no columns.
    """
    if len(X.columns) == 0:
        raise ValueError("X must contain at least one feature column")

    total_entropy = entropy(y)
    n_rows = len(X)
    same_rows = X.index.equals(y.index)

    best_feature = None
    best_gain = None

    for feature in X.columns:
        column = X[feature]
        expected_entropy = 0.0

        for category in column.unique():
            # Plain boolean array: True for the rows holding this category.
            in_category = (column == category).to_numpy()

            if same_rows:
                sub_y = y[in_category]
            else:
                sub_y = y.loc[column.index[in_category]]

            # Entropy of the category weighted by the share of rows it covers.
            proportion = in_category.sum() / n_rows
            expected_entropy += proportion * entropy(sub_y)

        information_gain = total_entropy - expected_entropy

        # Strict comparison keeps the first feature reaching the maximum.
        if best_feature is None or information_gain > best_gain:
            best_feature = feature
            best_gain = information_gain

    return best_feature

def ID3 (X, y, current_depth, max_depth):
    """
    Build a decision tree by recursively splitting on the feature chosen by
    information_gain_fun.

    X is the feature data (one row per observation)
    y is the outcome data series, row i of y belonging to row i of X
    current_depth is the depth of the node being built (0 at the root)
    max_depth is the depth at which splitting stops

    Returns a leaf label, or a nested dict of the form
    {feature: {feature_value: subtree_or_label}}.
    Raises ValueError when X and y differ in length or are empty.
    """
    if len(X) != len(y):
        raise ValueError("X and y must have the same number of rows")
    if len(y) == 0:
        raise ValueError("cannot build a tree from an empty sample")

    # Rows are paired by position. A fresh 0..n-1 index on both objects keeps
    # them aligned and removes the repeated labels of a bootstrap sample, which
    # label-based lookups would otherwise multiply.
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)

    #If all target labels are the same, return label
    if y.nunique() == 1:
        return y.unique()[0]

    # If no more features are available, or the depth limit is reached,
    # return the most common label
    if len(X.columns) == 0 or current_depth >= max_depth:
        return y.mode()[0]

    best_feature1 = information_gain_fun(X, y)
    remaining_features = X.drop(columns=best_feature1)
    split_column = X[best_feature1]

    branches = {}
    for value in split_column.unique():
        in_branch = (split_column == value).to_numpy()

        if not in_branch.any():
            # Happens for a missing value (NaN never equals itself): fall back
            # to the most common label of this node.
            branches[value] = y.mode()[0]
        else:
            # Each branch is grown from its own rows AND its own labels, so
            # the stopping rules and majority votes describe the branch rather
            # than the parent node.
            branches[value] = ID3(remaining_features.loc[in_branch], y[in_branch],
                                  current_depth + 1, max_depth)

    return {best_feature1: branches}

In [74]:
def predict(trees, predictor_data):
    """
    Follow a nested-dict decision tree down to a label for one observation.

    trees is either a leaf label or a dict {feature: {value: subtree_or_label}}
    predictor_data maps feature names to the observation's values

    Returns the label reached, or None when the observation holds a value
    that has no branch in the tree.
    """
    node = trees

    # A dict is an internal node; anything else is a leaf label.
    while isinstance(node, dict):
        split_feature = next(iter(node))  # the single key of the node
        branches = node[split_feature]
        observed_value = predictor_data[split_feature]

        if observed_value not in branches:
            return None

        node = branches[observed_value]

    return node
    
def calculate_error(predictions, true_labels):
    """
    Share of predictions that differ from the true labels.

    predictions and true_labels are compared element by element; the number
    of mismatches is divided by the number of true labels.
    """
    n_labels = len(true_labels)
    n_wrong = np.sum(predictions != true_labels)
    return n_wrong / n_labels

def predict_ensemble(bagging_trees, X_train):
    """
    Majority vote of a list of trees over every row of X_train.

    bagging_trees is a list of trees accepted by predict()
    X_train is the feature data to predict (despite the name, any feature
    table can be passed)

    Labels are expected to be numeric (+1 / -1). A tree for which predict()
    returns None (the row holds a value the tree has no branch for) abstains
    on that row instead of turning the whole vote into NaN.

    Returns a float array with one entry per row: the sign of the summed
    votes, so 0 for a tied vote or an empty list of trees.
    """
    # Convert the rows once; predict() only needs name -> value lookups, and
    # this avoids building a pandas Series per row for every tree.
    rows = X_train.to_dict("records")

    all_tree_predictions = np.zeros((len(bagging_trees), len(rows)))

    for i, stump in enumerate(bagging_trees):
        stump_preds = [predict(stump, row) for row in rows]
        all_tree_predictions[i] = [np.nan if pred is None else pred for pred in stump_preds]

    # Combine predictions; nansum skips the abstentions recorded as NaN.
    final_predictions = np.sign(np.nansum(all_tree_predictions, axis=0))

    return final_predictions

def calculate_bias_variance(predictions, true_labels):
    """
    Squared bias and variance of repeated predictions, averaged over samples.

    predictions holds one row per iteration and one column per sample. A 1-D
    input is treated as a single iteration (one row); previously it was
    averaged over the samples, which mixed different observations together.
    true_labels holds one label per sample.

    Returns (bias, variance):
    bias is the mean over samples of (average prediction - true label)^2,
    variance is the mean over samples of the variance across iterations.
    """
    # Plain float arrays: the arithmetic below is positional, so pandas index
    # labels must not take part in it.
    predictions = np.atleast_2d(np.asarray(predictions, dtype=float))
    true_labels = np.asarray(true_labels, dtype=float)

    # Calculate average predictions across iterations
    avg_predictions = predictions.mean(axis=0)

    # Bias: (Average prediction - True label)^2
    bias = np.mean((avg_predictions - true_labels) ** 2)

    # Variance: Variance of predictions across iterations
    variance = np.mean(predictions.var(axis=0))

    return bias, variance


In [75]:

#regard unknown as a particular attribute
def data_preprocessing_attribute(data, features, continuous):
    """
    Turn the continuous columns of data into two-valued columns, in place.

    Every column named in both features and continuous is replaced by "no"
    where the value is below that column's median and by "yes" otherwise.
    All other columns are left untouched.

    Returns the same data object that was passed in.
    """
    for name in features:
        if name not in continuous:
            continue

        threshold = data[name].median()
        below_median = data[name].lt(threshold)
        data[name] = below_median.map({True: "no", False: "yes"})

    return data

#load data
# Directory holding train.csv and test.csv; change this single constant to
# point the notebook at another copy of the data.
DATA_DIR = "D:\\EIC-Code\\00-Python\\Machine-Learning-HW\\DecisionTree\\bank"

features = ['age', 'job', 'marital','education', 'default', 'balance', 'housing','loan', 'contact', 'day','month', 
            'duration','campaign','pdays','previous', 'poutcome']

continuous = ['age', 'balance', 'day','duration','campaign','pdays','previous']

# The files are read without a header row: the feature columns come first,
# followed by the label column 'y'.
column_names = features + ['y']

train_data = pd.read_csv(DATA_DIR + "\\train.csv", header = None, names = column_names)
test_data = pd.read_csv(DATA_DIR + "\\test.csv", header = None, names = column_names)

#turn the continuous attributes into "no" (below the median) / "yes"
train_data_att = data_preprocessing_attribute(train_data.copy(), features, continuous)

# The test rows are thresholded with the TRAINING medians: a tree learns its
# "no"/"yes" split relative to the training threshold, and a threshold taken
# from the test set would both shift that meaning and leak test statistics.
test_data_att = test_data.copy()
for var in continuous:
    train_median = train_data[var].median()
    test_data_att[var] = np.where(test_data[var] < train_median, "no", "yes")

# Labels are encoded as +1 ('yes') / -1 (anything else) so that ensemble votes
# can be summed.
X_train = train_data_att[features]
y_train = train_data_att['y'].map(lambda label: 1 if label == 'yes' else -1) 

X_test = test_data_att[features]
y_test = test_data_att['y'].map(lambda label: 1 if label == 'yes' else -1) 

In [76]:
#bagging trees
bagging_trees = []
n_trees = 500
train_errors = []
test_errors = []

# Same for every tree: one level per feature, so the trees are grown until
# the features run out (not stumps, despite the variable name used below).
current_depth = 0
max_depth = len(X_train.columns)

# Running vote totals of the ensemble. Adding each new tree's votes gives the
# same majority vote as re-predicting with every tree trained so far, without
# repeating the work of all earlier iterations.
train_votes = np.zeros(len(X_train))
test_votes = np.zeros(len(X_test))

for i in range(n_trees):

    # Generate random indices with replacement
    indices = np.random.choice(range(len(X_train)), size=len(X_train), replace=True)

    # Create the bootstrap sample using the selected indices
    X_bootstrap_train = X_train.iloc[indices]
    y_bootstrap_train = y_train.iloc[indices]
    
    decision_stump = ID3(X_bootstrap_train, y_bootstrap_train, current_depth, max_depth)

    # The new tree has to join the ensemble before the ensemble is evaluated;
    # without this the list stays empty and every prediction is 0.
    bagging_trees.append(decision_stump)

    # nan_to_num: a tree without a prediction for a row abstains (0 votes)
    # instead of turning the running total into NaN.
    train_votes += np.nan_to_num(predict_ensemble([decision_stump], X_train))
    test_votes += np.nan_to_num(predict_ensemble([decision_stump], X_test))

    #compare true and predicted labels on the (fixed) training dataset, so
    #that the errors of successive ensemble sizes are comparable
    train_predictions = np.sign(train_votes)
    train_error = calculate_error(train_predictions, y_train)
    train_errors.append(train_error)

    #compare true and predicted labels on the testing dataset
    test_predictions = np.sign(test_votes)
    test_error = calculate_error(test_predictions, y_test)
    test_errors.append(test_error)
    print(train_error, test_error)

In [71]:
largest_bagging_trees = []  # the first tree of every bagging run
bias_singles = []
variance_singles = []
bias_alls = []
variance_alls = []
n_trees = 500
n_iterations = 100

# Same for every tree: one level per feature.
current_depth = 0
max_depth = len(X_train.columns)

# One row of test-set predictions per run, so that bias and variance can be
# measured ACROSS runs for every test observation.
single_tree_predictions = []  # first tree of each run
bagged_predictions = []       # majority vote of all trees of each run

for iteration in range(n_iterations):

    # Generate random indices with replacement
    indices1 = np.random.choice(range(len(X_train)), size = 1000, replace = True)

    # Create the sample this run's bagged trees are trained on
    X_sample_train = X_train.iloc[indices1]
    y_sample_train = y_train.iloc[indices1]

    # Running vote total of this run's trees on the test set.
    bag_votes = np.zeros(len(X_test))

    # The inner counter has its own name; reusing the outer loop variable
    # made the two loops overwrite each other's counter.
    for tree_number in range(n_trees):

        indices2 = np.random.choice(range(len(X_sample_train)), size=len(X_sample_train), replace=True)
        
        X_bootstrap_train = X_sample_train.iloc[indices2]
        y_bootstrap_train = y_sample_train.iloc[indices2]

        decision_stump = ID3 (X_bootstrap_train, y_bootstrap_train, current_depth, max_depth)

        # Test-set votes of this tree; a row the tree has no prediction for
        # counts as an abstention (0) rather than NaN.
        tree_predictions = np.nan_to_num(predict_ensemble([decision_stump], X_test))
        bag_votes += tree_predictions

        if tree_number == 0:
            single_tree_predictions.append(tree_predictions)
            largest_bagging_trees.append(decision_stump)

    bagged_predictions.append(np.sign(bag_votes))

# Bias and variance of the single trees: rows are runs, columns are test
# observations.
bias_single, variance_single = calculate_bias_variance (np.array(single_tree_predictions), y_test.to_numpy())
bias_singles.append(bias_single)
variance_singles.append(variance_single)

# Bias and variance of the bagged predictors, measured the same way.
all_predictions = np.array(bagged_predictions)
all_bias, all_variance = calculate_bias_variance (all_predictions, y_test.to_numpy())
bias_alls.append(all_bias)
variance_alls.append(all_variance)

KeyboardInterrupt: 