In [1]:
import numpy as np
import pandas as pd
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *
import random as rand
from pyspark.sql.window import Window
import time
from collections import deque

In [2]:
# Initialize Spark session
spark = SparkSession.builder.appName("randomforest").getOrCreate()


24/05/27 17:11:20 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
#random gen dataset
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
x_all_categorical_arr = np.random.randint(0, 2, (1000, 10))
x_all_numerical_arr = np.random.rand(1000, 10)
y_categorical_arr = np.random.randint(0, 2, 1000)
#balanced_arr = np.concatenate([balanced_categorical_arr, balanced_numerical_arr], axis=1)
print(f"all_categorical_arr: {x_all_categorical_arr.shape=}")
print(f"all_numerical_arr: {x_all_numerical_arr.shape=}")
#print(f"balanced_arr: {balanced_arr.shape=}")

all_categorical_arr: x_all_categorical_arr.shape=(1000, 10)
all_numerical_arr: x_all_numerical_arr.shape=(1000, 10)


In [4]:
x_data = spark.createDataFrame(x_all_numerical_arr)

In [5]:
y_data = spark.createDataFrame(y_categorical_arr,['y'])

In [6]:
#Index
x_indexed=x_data.withColumn("id",monotonically_increasing_id())
y_indexed=y_data.withColumn("id",monotonically_increasing_id())

In [7]:
# DEVELOPMENT: create joined df for computation with one id
joined_df = x_indexed.join(y_indexed, "id").drop('id')
x_train = joined_df.drop('y')
y_train = joined_df.select('y')
print(x_train.columns)
print(y_train.columns)

['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10']
['y']


In [19]:
# bootstrap function definition
RANDOM_SEED = 42

# weighted bootstrap subdataset

#partition the dataframe dataset
joined_df = joined_df.repartition(10)
joined_df.cache()

DataFrame[_1: double, _2: double, _3: double, _4: double, _5: double, _6: double, _7: double, _8: double, _9: double, _10: double, y: bigint]

In [20]:
sc.uiWebUrl

'http://192.168.0.197:4040'

In [21]:
# Define entropy for classification evaluation crtieria
# receives a probably as input to calculate entropy

def class_entropy(df,count_val=0, sum_val=0):
    #  entropy calculation for binary classification
    if count_val==0 or sum_val==0 or count_val == 0:
        counts = df.rdd.map(lambda x: (x[0],1)).reduceByKey(lambda x,y: x+y)
        sum_y = df.rdd.map(lambda x: (1,x[0])).reduceByKey(lambda x,y: x+y).collect()
        totals = counts.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x+y).collect()
    
    else:
        sum_y = [(1,sum_val)]
        totals = [(1,count_val)]

    #check if empty
    if len(totals)==0 or len(sum_y) == 0:
        total = 0
        prob_1 = 0
        prob_0 = 0
        y = 0
        
    #calculate probabilities
    if len(sum_y) > 0 and len(totals)> 0:
        total = totals[0][1]
        y = sum_y[0][1]
        prob_1 = y/total
        prob_0 = 1-prob_1

    #calculate entropy
    if prob_1 == 0 or prob_0 ==0:
        entropy = 0
    else: 
        entropy = -prob_1 * np.log2(prob_1) - prob_0 * np.log2(prob_0)
    
    return entropy, total, y

In [680]:
counts_a = joined_df.select('y').rdd.map(lambda x: (x[0],1)).reduceByKey(lambda x,y: x+y)

In [610]:
sum_y = joined_df.select('y').rdd.map(lambda x: (1,x[0])).reduceByKey(lambda x,y: x+y).collect()[0][1]
totals = test.rdd.map(lambda x: (1,x[1])).reduceByKey(lambda x,y: x+y).collect()

In [17]:
class_entropy(joined_df.select('y'))

(0.9993506898146103, 1000, 485)

In [18]:
class_entropy(joined_df.select('y'),count_val = 500, sum_val= 50)

(0.4689955935892812, 500, 50)

In [22]:
# Build tree from splitting

# each tree
# (i) for each feature: find_split
# (ii) Mapbypartition(find_split)

def feature_split(dataset, feature_array):
    '''
    Input: 
    partition: a pyspark dataframe partition to be called
    feature_array: a feature array for the tree that is intiialized earlier on
    '''
    joined_df.select('y')
    #define schema
    schema = StructType([
        StructField("feature", IntegerType(), True),
        StructField("split_value", FloatType(), True),
        StructField("info_gain", FloatType(), True),
        StructField("left_count", IntegerType(), True),
        StructField("left_sum", IntegerType(), True)
        
    ])
    feature_df = spark.createDataFrame([], schema)
    
    # for each feature array, get a split and append the dataframe 
    for feature_index in feature_array:
        
        # find split
        result_df = new_split_2(dataset, feature_index)
        
        #add feature  
        feature_df = feature_df.union(result_df)
        
    return feature_df


In [381]:
'''
It 
#to delete
parent_entropy, total_count = class_entropy(joined_df.select('y'))
global broadcast_parent_entropy 
broadcast_parent_entropy = spark.sparkContext.broadcast(parent_entropy)
global broadcast_parent_count 
broadcast_parent_count = spark.sparkContext.broadcast(total_count)

result_df, info_gain, distinct_values, joined  = new_split_2(joined_df, 0)
'''

In [386]:
info_gain.show()

+-------------------+----------+--------+-------------------+------------------+--------------------+
|        split_value|left_count|left_sum|          left_prob|      left_entropy|           info_gain|
+-------------------+----------+--------+-------------------+------------------+--------------------+
|0.23754413087432746|        31|      15| 0.4838709677419355|0.9992492479956565|  0.6899441748741564|
| 0.2310747965880714|        29|      14| 0.4827586206896552|0.9991421039919088|  0.7099602315951563|
| 0.9629920038589946|        97|      49| 0.5051546391752577|0.9999233329473267|0.029785808793903024|
| 0.7234201136885849|        71|      36| 0.5070422535211268|0.9998568991526107| 0.28981304335445635|
| 0.5670162609866678|        54|      27|                0.5|               1.0| 0.45971144175280987|
| 0.9699120461072704|        98|      49|                0.5|               1.0| 0.01971144175280992|
|0.26899340443509767|        34|      18| 0.5294117647058824|0.9975025463691153|  

In [387]:
split = result_df.rdd.collect()[0][1]
print(split)

0.06707647442817688


In [418]:
joined_df.select('_1','y').where(col('_1') <= split).count()

6

In [385]:
info_gain.show()

+-------------------+----------+--------+-------------------+------------------+--------------------+
|        split_value|left_count|left_sum|          left_prob|      left_entropy|           info_gain|
+-------------------+----------+--------+-------------------+------------------+--------------------+
|0.23754413087432746|        31|      15| 0.4838709677419355|0.9992492479956565|  0.6899441748741564|
| 0.2310747965880714|        29|      14| 0.4827586206896552|0.9991421039919088|  0.7099602315951563|
| 0.9629920038589946|        97|      49| 0.5051546391752577|0.9999233329473267|0.029785808793903024|
| 0.7234201136885849|        71|      36| 0.5070422535211268|0.9998568991526107| 0.28981304335445635|
| 0.5670162609866678|        54|      27|                0.5|               1.0| 0.45971144175280987|
| 0.9699120461072704|        98|      49|                0.5|               1.0| 0.01971144175280992|
|0.26899340443509767|        34|      18| 0.5294117647058824|0.9975025463691153|  

In [399]:
joined.count()

793

+---------------+
|sum(left_count)|
+---------------+
|            793|
+---------------+



In [23]:
def new_split_2(joined_df, feature_index):
    
    # Select relevant columns
    feature_col_name = joined_df.columns[feature_index]
    y_col_name = joined_df.columns[-1]
    split_data = joined_df.select(feature_col_name, y_col_name)\
        .withColumnRenamed(feature_col_name,"feature")\
        .withColumnRenamed(y_col_name,"y")
    
    
    # Sample distinct values for splitting
    distinct_values = split_data.select("feature")\
        .withColumnRenamed("feature","split_value")\
        .distinct()\
        .sample(False, 0.1)
    
    #broadcast join
    joined = split_data.join(distinct_values.hint('broadcast'),\
                             split_data["feature"] <= distinct_values["split_value"], "inner")

    # Calculate conditional entropy 
    conditional_counts = joined.groupBy("split_value").agg(
        F.count("y").alias("left_count"),
        F.sum("y").alias("left_sum")
    )
      
    # Calculate probabilities and entropy
    conditional_counts = conditional_counts.withColumn("left_prob", F.col("left_sum") / F.col("left_count"))
    conditional_counts = conditional_counts.withColumn(
        "left_entropy",
        -F.col("left_prob") * F.log2(F.col("left_prob"))\
        - (1 - F.col("left_prob")) * F.log2(1 - F.col("left_prob"))
    )


    # Calculate Information Gain
    info_gain = conditional_counts.withColumn(
        "info_gain",
        broadcast_parent_entropy.value - (F.col("left_entropy") * (F.col("left_count") / broadcast_parent_count.value))
    )
    
    # Get the best split
    best_split = info_gain.orderBy(F.desc("info_gain")).first()
    
    schema = StructType([
        StructField("feature", IntegerType(), True),
        StructField("split_value", FloatType(), True),
        StructField("info_gain", FloatType(), True),
        StructField("left_count", IntegerType(), True),
        StructField("left_sum", IntegerType(), True)
        
    ])
    
    # Prepare output DataFrame
    if best_split is None or best_split["info_gain"] is None or best_split["split_value"] is None:
        result_df = spark.createDataFrame([(feature_index, float(0), float(0),int(0),int(0))], schema)
    else:
        result_df = spark.createDataFrame([(feature_index, \
                                            float(best_split["split_value"]), \
                                            best_split["info_gain"], \
                                            info_gain.select("left_count").first()[0], \
                                            info_gain.select('left_sum').first()[0])], schema)
    
    
    return result_df

In [24]:
def split_node_2(df,feature_array):

    #init
    y_label = df.columns[-1]
    node_parent = {}
    best_gain = -1
    best_feature= 0 
    best_split = 0
    
    #get first tree
    feature_df = feature_split(df,feature_array)
    
    #find best
    best_split_info = feature_df.orderBy(col("info_gain").desc()).first()
    
    if best_split_info is not None:
        best_feature = best_split_info[0]
        best_split = best_split_info[1]
        best_gain = best_split_info[2] if best_split_info[2] is not None else 0
        left_count = best_split_info[3]
        left_sum = best_split_info[4]
    
    #generate split
    left_df = df.filter(col(df.columns[best_feature]) <= best_split)
    right_df = df.filter(col(df.columns[best_feature]) > best_split)
        
    #assign dictinary value of; key: feature, split_value, gain, and child
    node_parent['feature'] = best_feature
    node_parent['split_value'] = best_split
    node_parent['gain'] = best_gain
    node_parent['left_count'] = left_count
    node_parent['left_sum'] = left_sum
    
    return node_parent, left_df, right_df

In [671]:
node_parent, left_df, right_df  = split_node_2(joined_df,[0])

In [672]:
node_parent

{'feature': 0,
 'split_value': 0.08279866725206375,
 'gain': -0.08812908828258514,
 'left_count': 74,
 'left_sum': 37}

In [46]:
def build_decision_tree_bfs(df, feature_array, max_depth):
    """ Build the decision tree using a breadth-first approach. """
    label_col = df.columns[-1]
    global broadcast_parent_entropy
    global broadcast_parent_count
    global broad_left_entropy
    global broad_left_count
    global broad_right_entropy
    global broad_right_count
    global y_sum
    
    # Initialize a queue for BFS with root node
    queue = deque([{"node_df": df, "depth": 0, "path": [],"parent_count":0,"parent_entropy":0,"parent_ysum":0}])
    root = None
    
    
    while queue:
        # Get the front node of the queue
        current = queue.popleft()
        current_df = current["node_df"]
        current_depth = current["depth"]
        path = current["path"]
        
        if current_depth==0:
            #calculate first parent entropy
            parent_entropy, parent_count, sum_y = class_entropy(df.select(label_col))
            
            #broadcast
            broadcast_parent_entropy = spark.sparkContext.broadcast(parent_entropy)
            broadcast_parent_count = spark.sparkContext.broadcast(parent_count)
            y_sum = spark.sparkContext.broadcast(sum_y)
        else:
            #update entropies & counts
            if path[-1] == "right":
                parent_count = broadcast_total_count.value - node_parent['left_count']
                y_count = y_sum.value - node_parent['left_sum']
            else:
                parent_count = node_parent['left_count']
                y_count = node_parent['left_sum']
            
            #compute the parent_entropy
            parent_entropy, parent_count, sum_y = class_entropy(current_df.select(label_col), \
                                                                      parent_count, \
                                                                      y_count)
            #broadcast 
            broadcast_parent_entropy = spark.sparkContext.broadcast(parent_entropy)
            broadcast_parent_count = spark.sparkContext.broadcast(parent_count)
            y_sum = spark.sparkContext.broadcast(sum_y)
        
        
        if current_df is None or current_df.rdd.isEmpty():
            continue
        
        # Check for stopping conditions
        #distinct_count = current_df.select(label_col).distinct().count()
        if current_depth == max_depth:
            most_common_label = current_df.groupBy(label_col).count().orderBy("count", ascending=False).first()[label_col]
            node = {"label": most_common_label}
        else:
 
            # Split the node
            node_parent, left_df, right_df = split_node_2(current_df, feature_array)
            node = {
                "feature": node_parent['feature'], 
                "threshold": node_parent['split_value'], 
                "left": None, 
                "right": None
            }
            
            # Enqueue children nodes
            if left_df is not None and not left_df.rdd.isEmpty():
                queue.append({"node_df": left_df,\
                              "depth": current_depth + 1,\
                              "path": path + ["left"],\
                              "parent_count": broadcast_parent_count.value,\
                              "parent_entropy": broadcast_parent_entropy.value,\
                              "parent_ysum":y_sum.value\
                             })
                left_df.cache()
            if right_df is not None and not right_df.rdd.isEmpty():
                queue.append({"node_df": right_df, "depth": current_depth + 1, "path": path + ["right"]})
                right_df.cache()
        
        # Set the node in the correct position in the tree
        if not path:  # the root node
            root = node
        else:
            parent = root
            # Navigate to the correct parent node to attach the current node
            for p in path[:-1]:
                parent = parent[p]
            parent[path[-1]] = node
    
    # Unpersist cached DataFrames
    for item in queue:
        if item["node_df"] is not None:
            item["node_df"].unpersist()
    
    return root

In [26]:
def random_forest_train_bfs(df, num_trees, num_feature, max_depth):

    #init
    trees = {}
    
    # Pre-calculate the feature columns
    columns = df.columns
    feature_columns = df.columns[:-1]
    column_size = len(columns)
    
    #set as indices list
    column_indices = range(len(feature_columns))
    feature_array = [*column_indices]

    # def train_trees
    def train_tree(df, column_size, feature_array, num_feature, max_depth):
            
        #sample feature
        sampled = rand.sample(feature_array, num_feature)
            
        #add y column
        sampled.append(column_size-1)

        # Create a  DataFrame from the part of the partition
        selected_columns = [columns[i] for i in sampled]
        sampled_df = joined_df.select(selected_columns)
            
        # build information of new sample_trees
        sample_df_columns = sampled_df.columns
        sampled_df_features = sample_df_columns[:-1]
        sampled_columns = range(len(sampled_df_features))
        sampled_array = [*sampled_columns]
        
        tree = build_decision_tree_bfs(sampled_df, sampled_array, max_depth)
            
        return tree
    
    # Apply our function to each partition of the DataFrame
    for tree_idx in range(num_trees):
        tree = train_tree(df, column_size, feature_array, num_feature, max_depth)
        trees[tree_idx] = tree
        df.cache()


    # Unpersist the DataFrame as it's no longer needed
    df.unpersist()

    return trees

In [27]:
def random_forest_train_dfs(df, num_trees, num_feature, max_depth):

    #init
    trees = {}
    
    # Pre-calculate the feature columns
    columns = df.columns
    feature_columns = df.columns[:-1]
    column_size = len(columns)
    
    #set as indices list
    column_indices = range(len(feature_columns))
    feature_array = [*column_indices]

    # def train_trees
    def train_tree(df, column_size, feature_array, num_feature, max_depth):
            
        #sample feature
        sampled = rand.sample(feature_array, num_feature)
            
        #add y column
        sampled.append(column_size-1)

        # Create a  DataFrame from the part of the partition
        selected_columns = [columns[i] for i in sampled]
        sampled_df = joined_df.select(selected_columns)
            
        # build information of new sample_trees
        sample_df_columns = sampled_df.columns
        sampled_df_features = sample_df_columns[:-1]
        sampled_columns = range(len(sampled_df_features))
        sampled_array = [*sampled_columns]
        
        tree = build_decision_tree(sampled_df, sampled_array, max_depth)
            
        return tree
    
    
    # Apply our function to each partition of the DataFrame
    for tree_idx in range(num_trees):
        tree = train_tree(df, column_size, feature_array, num_feature, max_depth)
        trees[tree_idx] = tree
        df.cache()

    # Unpersist the DataFrame as it's no longer needed
    df.unpersist()

    return trees

In [28]:
def build_decision_tree(df, feature_array, max_depth, current_depth=0):
    
    """ Recursively build the decision tree. """
    
    #init
    global broadcast_parent_entropy
    global broadcast_parent_count
    global broad_left_entropy
    global broad_left_count
    global broad_right_entropy
    global broad_right_count
    global y_sum
    
    #set y label
    label_col = df.columns[-1]
    #distinct_count = df.select(label_col).distinct().count()
    
    if df is None or df.rdd.isEmpty():
        return df
    elif current_depth == max_depth:
        # Return the most common label
        most_common_label = df.groupBy(label_col).count().orderBy("count", ascending=False).first()[label_col]
        return {"label": df.select(label_col).first()[0]}
        #return most_common_label 
    #elif distinct_count == 1:
    #   return {"label": df.select(label_col).first()[0]}
    
    #get first split entropy
    if current_depth ==0:
        parent_entropy, total_count,sum_y = class_entropy(df.select(label_col))
        broadcast_parent_entropy = spark.sparkContext.broadcast(parent_entropy)
        broadcast_parent_count = spark.sparkContext.broadcast(total_count)
        y_sum = spark.sparkContext.broadcast(sum_y)
    
    #split node
    node_parent, left_df, right_df = split_node_2(df,feature_array)
    
    # Cache potentially reused DataFrames
    if left_df is not None and not left_df.rdd.isEmpty():
        left_df.cache()
    if right_df is not None and not right_df.rdd.isEmpty():
        right_df.cache()

    
    #split if not empty    
    if left_df is None or left_df == []:
        #right
        broad_right_count = broadcast_total_count.value - node_parent['left_count']
        y_count = y_sum.value - node_parent['left_sum']
        broadcast_parent_count = spark.sparkContext.broadcast(broad_right_count)
        
        #update entropies & counts
        broad_right_entropy, broad_right_count, sum_y = class_entropy(right_df.select(label_col), \
                                                                      broadcast_total_count.value, \
                                                                      y_count)
        broadcast_parent_entropy = spark.sparkContext.broadcast(broad_right_entropy)
        y_sum = spark.sparkContext.broadcast(sum_y)
        
        #right tree
        right_subtree = build_decision_tree(right_df, feature_array, max_depth, current_depth + 1)
        
    elif right_df is None or right_df ==[]:
        
        #left
        broad_left_count = node_parent['left_count']
        y_count = node_parent['left_sum']
        broadcast_parent_count = spark.sparkContext.broadcast(broad_left_count)
        
        #update entropies & counts
        broad_left_entropy, broad_left_count,sum_y = class_entropy(left_df.select(label_col),broadcast_total_count.value, y_count)
        broadcast_parent_entropy = spark.sparkContext.broadcast(broad_left_entropy)
        y_sum = spark.sparkContext.broadcast(sum_y)
        
        #left
        left_subtree = build_decision_tree(left_df, feature_array, max_depth, current_depth + 1)
        
    elif (left_df is None) and (right_df is None):
        return {"feature": node_parent['feature'] , "threshold": node_parent['split_value'], "left": None, "right": None}
    else:
        #left
        broad_left_count = node_parent['left_count']
        y_count = node_parent['left_sum']
        broadcast_parent_count = spark.sparkContext.broadcast(broad_left_count)
        
        #update entropies & counts
        broad_left_entropy, broad_left_count, sum_y = class_entropy(left_df.select(label_col),broadcast_total_count.value, y_count)
        broadcast_parent_entropy = spark.sparkContext.broadcast(broad_left_entropy)
        y_sum = spark.sparkContext.broadcast(sum_y)
        
        #left subtree
        left_subtree = build_decision_tree(left_df, feature_array, max_depth, current_depth + 1)
        
        #right
        broad_right_count = broadcast_total_count.value - node_parent['left_count']
        y_count = y_sum.value - node_parent['left_sum']
        broadcast_parent_count = spark.sparkContext.broadcast(broad_right_count)
        
        #update entropies & counts
        broad_right_entropy, broad_right_count, sum_y = class_entropy(right_df.select(label_col), broadcast_total_count.value,y_sum.value)
        broadcast_parent_entropy = spark.sparkContext.broadcast(broad_right_entropy)
        y_sum = spark.sparkContext.broadcast(sum_y)
        
        right_subtree = build_decision_tree(right_df, feature_array, max_depth, current_depth + 1)

    # Clear cache after use
    if left_df is not None:
        left_df.unpersist()
    if right_df is not None:
        right_df.unpersist()
        
    
    # return node structure
    return {
        "feature": node_parent['feature'] , 
        "threshold": node_parent['split_value'], 
        "left": left_subtree, 
        "right": right_subtree
    }

In [696]:
dfs_tree = build_decision_tree(joined_df, [0,2,3], max_depth=2)

  entropy = -prob_1 * np.log2(prob_1) - prob_0 * np.log2(prob_0)


# Evaluation of dfs vs bfs

In [None]:
import time
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", True)
spark.conf.set("spark.sql.adaptive.coalescePartitions.parallelismFirst", False)

#DFS approach 
dfs_start_PFirst = time.time()
dfs_trees_PFirst = random_forest_train_dfs(joined_df, num_trees=3 , num_feature = 3, max_depth=5)
dfs_end_PFirst = time.time()
dfs_time_PFirst = dfs_end_PFirst - dfs_start_PFirst

  entropy = -prob_1 * np.log2(prob_1) - prob_0 * np.log2(prob_0)
24/05/26 17:57:21 WARN CacheManager: Asked to cache already cached data.        
                                                                                

In [None]:
print(dfs_time_PFirst)

In [None]:
dfs_trees_PFirst

In [734]:
import time
spark.conf.set("spark.sql.adaptive.coalescePartitions.enabled", True)
spark.conf.set("spark.sql.adaptive.coalescePartitions.parallelismFirst", False)

#DFS approach 
dfs_start = time.time()
dfs_trees = random_forest_train_dfs(joined_df, num_trees=3 , num_feature = 3, max_depth=5)
dfs_end = time.time()
dfs_time = dfs_end - dfs_start

  entropy = -prob_1 * np.log2(prob_1) - prob_0 * np.log2(prob_0)
24/05/26 17:50:42 WARN CacheManager: Asked to cache already cached data.        
24/05/26 17:52:32 WARN CacheManager: Asked to cache already cached data.        


In [735]:
print(dfs_time)

371.3925850391388


In [736]:
# show values
dfs_trees

{0: {'feature': 2,
  'threshold': 0.0018727439455688,
  'left': {'feature': 0,
   'threshold': 0.0,
   'left': DataFrame[_8: double, _1: double, _3: double, y: bigint],
   'right': {'feature': 0,
    'threshold': 0.0,
    'left': DataFrame[_8: double, _1: double, _3: double, y: bigint],
    'right': {'feature': 0,
     'threshold': 0.0,
     'left': DataFrame[_8: double, _1: double, _3: double, y: bigint],
     'right': {'feature': 2,
      'threshold': 0.0018727439455688,
      'left': DataFrame[_8: double, _1: double, _3: double, y: bigint],
      'right': DataFrame[_8: double, _1: double, _3: double, y: bigint]}}}},
  'right': {'feature': 2,
   'threshold': 0.03417558968067169,
   'left': {'feature': 1,
    'threshold': 0.17401044070720673,
    'left': {'feature': 0,
     'threshold': 0.0,
     'left': DataFrame[_8: double, _1: double, _3: double, y: bigint],
     'right': {'feature': 0,
      'threshold': 0.0,
      'left': DataFrame[_8: double, _1: double, _3: double, y: bigint],


In [467]:
#measuring time for one DFS tree
dfs_tree_start = time.time()
dfs_tree = build_decision_tree(joined_df, [0,2,3], max_depth=5)
dfs_tree_end = time.time()
dfs_tree_time = dfs_tree_end - dfs_tree_start

                                                                                

In [468]:
print(dfs_tree_time)

37.03761076927185


In [45]:
#BFS approach 
bfs_start = time.time()
bfs_trees = random_forest_train_bfs(joined_df, num_trees=3 , num_feature = 3, max_depth=5)
bfs_end = time.time()
bfs_time = bfs_end - bfs_start

  entropy = -prob_1 * np.log2(prob_1) - prob_0 * np.log2(prob_0)
24/05/27 18:54:12 WARN CacheManager: Asked to cache already cached data.        
24/05/27 18:54:17 WARN BlockManager: Block rdd_3025_8 already exists on this machine; not re-adding it
24/05/27 18:55:11 WARN CacheManager: Asked to cache already cached data.        
24/05/27 18:56:13 WARN CacheManager: Asked to cache already cached data.        


In [731]:
print(bfs_time)

268.3045129776001


In [732]:
bfs_trees

{0: {'feature': 1,
  'threshold': 0.009205667302012444,
  'left': {'feature': 0,
   'threshold': 0.47529298067092896,
   'left': {'feature': 0,
    'threshold': 0.47529298067092896,
    'left': {'feature': 0,
     'threshold': 0.0,
     'left': None,
     'right': {'feature': 0,
      'threshold': 0.0,
      'left': None,
      'right': {'label': 0}}},
    'right': None},
   'right': {'feature': 0,
    'threshold': 0.0,
    'left': None,
    'right': {'feature': 0,
     'threshold': 0.0,
     'left': None,
     'right': {'feature': 0,
      'threshold': 0.0,
      'left': None,
      'right': {'label': 0}}}}},
  'right': {'feature': 0,
   'threshold': 0.012003026902675629,
   'left': {'feature': 0,
    'threshold': 0.0,
    'left': None,
    'right': {'feature': 1,
     'threshold': 0.9095953702926636,
     'left': {'feature': 0,
      'threshold': 0.0,
      'left': None,
      'right': {'label': 1}},
     'right': {'feature': 0,
      'threshold': 0.0,
      'left': None,
      'righ

In [47]:
#measuring time for one BFS tree
bfs_tree_start = time.time()
bfs_tree = build_decision_tree_bfs(joined_df, [0,2,3], max_depth=5)
bfs_tree_end = time.time()
bfs_tree_time = bfs_tree_end - bfs_tree_start

  entropy = -prob_1 * np.log2(prob_1) - prob_0 * np.log2(prob_0)
                                                                                

In [640]:
print(bfs_tree_time)

33.81710481643677


In [723]:
print("DFS RF with Parallelism True: ", dfs_time)
print("DFS RF with Parallliesm False: ", dfs_time_PFirst)
print("BFS RF: ", bfs_time)
print("DFS Tree: ", dfs_tree_time)
print("BFS Tree: ", bfs_tree_time)

DFS RF with Parallelism True:  192.8394010066986
DFS RF with Parallliesm False:  161.65961813926697
BFS RF:  135.3711497783661
DFS Tree:  37.03761076927185
BFS Tree:  33.81710481643677
