In [149]:
import numpy as np
import pandas as pd
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [165]:
#random gen dataset
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
x_all_categorical_arr = np.random.randint(0, 2, (100, 10))
x_all_numerical_arr = np.random.rand(100, 10)
y_categorical_arr = np.random.randint(0, 2, 100)
#balanced_arr = np.concatenate([balanced_categorical_arr, balanced_numerical_arr], axis=1)
print(f"all_categorical_arr: {all_categorical_arr.shape=}")
print(f"all_numerical_arr: {all_numerical_arr.shape=}")
#print(f"balanced_arr: {balanced_arr.shape=}")

all_categorical_arr: all_categorical_arr.shape=(100, 10)
all_numerical_arr: all_numerical_arr.shape=(100, 10)


In [166]:
x_data = spark.createDataFrame(all_numerical_arr)

In [172]:
y_data = spark.createDataFrame(y_categorical_arr,['y'])

In [173]:
#Index
x_indexed=x_data.withColumn("id",monotonically_increasing_id())
y_indexed=y_data.withColumn("id",monotonically_increasing_id())

In [195]:
# DEVELOPMENT: create joined df for computation with one id
joined_df = x_indexed.join(y_indexed, "id").drop('id')
x_train = joined_df.drop('y')
y_train = joined_df.select('y')
print(x_train.columns)
print(y_train.columns)

['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10']
['y']


In [None]:
# bootstrap function definition
RANDOM_SEED = 42

# weighted bootstrap subdataset


# boostrap data setsplit




In [None]:
# bootstrap sampling per tree
# create variations of the joined_df baseed on bootstramp algorithm

In [None]:
joined_df.distinct()

In [402]:
# Define entropy for classification evaluation crtieria
# receives a probably as input to calculate entropy
def class_entropy(y_df):

    #convert to rdd
    y_rdd = y_df.rdd
    
    #perform count
    total_count = y_rdd.count()
    
    if total_count ==0:
        return 0
    else:
        class_count = y_rdd.map(lambda x: (x, 1)).reduceByKey(lambda a, b: a + b)
        class_p = class_count.map(lambda p: p[1]/total_count)
    
        #evaluate probability
        entropy = class_p.map(lambda p: -p * np.log2(p))
    
        return entropy.collect()[0]

In [444]:
#for each partition, find the best split given a feature index
# x_train & y_train are RDD of x and y variables respectively
# feature_index represents the feature being trained
# parent_data_count is a global sum broadcasted at the start of computation

def find_split(joined_df, feature_index):
    
    #split x_train & y_train
    #x_train = joined_df.drop(joined_df.columns[-1])
    y_train = joined_df.select(joined_df.columns[-1])
    split_data = joined_df.select(col(joined_df.columns[feature_index]).alias("feature"),col(joined_df.columns[-1]).alias("y"))
    
    #init variables
    parent_entropy = class_entropy(y_train)
    parent_data_count = split_data.count() 
    schema = StructType([
        StructField("feature", IntegerType(), True),
        StructField("index", IntegerType(), True),
        StructField("split_value", FloatType(), True),
        StructField("info_gain", FloatType(), True),
    ])
    best_IG = 0
    best_split = 0
    best_idx = 0
    
    
    #for each point in x_train for feature_index, compute information gain
    for (idx,split_col) in enumerate(split_data.rdd.collect()): 
        
        #get split value
        split_value = split_col[0]
        
        #threshold
        split_data = split_data.withColumn("x_child_left", when(split_data[0] <= split_value,True).otherwise(False))
        split_data = split_data.withColumn("x_child_right", when(split_data[0] > split_value,True).otherwise(False))
        
        #join and get
        #joined_df = x_threshold.join(y_train,col('index'))
        y_child_left = split_data.filter(col("x_child_left")).select(col('y'))
        y_child_right = split_data.filter(col("x_child_right")).select(col('y'))
    
        #calculate entropy
        entropy_left = class_entropy(y_child_left)
        entropy_right = class_entropy(y_child_right)
        
        #calculate Information Gain
        num_left, num_right = y_child_left.count(), y_child_right.count()
        if num_left != 0 and num_right != 0:
            left_weighted = entropy_left * (num_left / parent_data_count)
            right_weighted = entropy_right * (num_right/parent_data_count)
            IG =  float(parent_entropy  - left_weighted - right_weighted)

            #check best
            if IG > best_IG:
                best_IG = IG
                best_split = split_value
                best_idx = idx
    
    IG_df = spark.createDataFrame([(feature_index,best_idx,float(best_split),float(best_IG))], schema)
        
    return IG_df
        

In [445]:
#test
a = find_split(joined_df, 4)

                                                                                

In [446]:
a.show()

+-------+-----+-----------+-----------+
|feature|index|split_value|  info_gain|
+-------+-----+-----------+-----------+
|      4|   48|  0.3609739|0.015505971|
+-------+-----+-----------+-----------+



In [None]:
# Define node class
class Node:
    """
    The Node class represents the node information from the tree

    Attributes:
        criteria (string): the split criterioa used "gini" or "info_gain"
        gini (float): The Gini impurity of the node if available
        info_gain (float): The information gain of the node if available
        num_data (int): The total number of samples in the node.
        num_samples_per_feature (list): The number of samples per class in the node.
        prediction (int): prediction of the node
        feature_index (int): The index of the feature used for splitting the node.
        split_value (float): The splitting value used for the feature_index
        left (Node): The left child node.
        right (Node): The right child node.
        partition (int): The partition number the Node is currently operating on
    """

    def __init__(self, criteria, num_data, num_samples_per_feature, predicted_class):
        self.criteria = criteria
        self.num_data = num_data
        self.num_samples_per_feature = num_samples_per_feature
        self.predicted_class = predicted_class
        self.feature_index = 0
        self.split_value = 0
        self.left = None
        self.right = None
        self.partition = None
        
    def set_gini(self, gini):
        self.criteria = "gini"
        self.gini = gini
    
    def set_info_gain(self,info_gain):
        self.criteria = "info_gain"
        self.info_gain = info_gain

In [None]:
# algo to build tree 
NUM_FEATURES = 10

# maintain of list of node - Parent & Child nodes

# each tree
# (i) for each feature: find_split
# (ii) Mapbypartition(find_split)