In [1]:
import numpy as np
import pandas as pd
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *
import random as rand

In [2]:
# Initialize Spark session
spark = SparkSession.builder.appName("randomforest").getOrCreate()

24/05/04 22:28:00 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
#random gen dataset
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
x_all_categorical_arr = np.random.randint(0, 2, (100, 10))
x_all_numerical_arr = np.random.rand(100, 10)
y_categorical_arr = np.random.randint(0, 2, 100)
#balanced_arr = np.concatenate([balanced_categorical_arr, balanced_numerical_arr], axis=1)
print(f"all_categorical_arr: {x_all_categorical_arr.shape=}")
print(f"all_numerical_arr: {x_all_numerical_arr.shape=}")
#print(f"balanced_arr: {balanced_arr.shape=}")

all_categorical_arr: x_all_categorical_arr.shape=(100, 10)
all_numerical_arr: x_all_numerical_arr.shape=(100, 10)


In [4]:
x_data = spark.createDataFrame(x_all_numerical_arr)

In [5]:
y_data = spark.createDataFrame(y_categorical_arr,['y'])

In [6]:
#Index
x_indexed=x_data.withColumn("id",monotonically_increasing_id())
y_indexed=y_data.withColumn("id",monotonically_increasing_id())

In [7]:
# DEVELOPMENT: create joined df for computation with one id
joined_df = x_indexed.join(y_indexed, "id").drop('id')
x_train = joined_df.drop('y')
y_train = joined_df.select('y')
print(x_train.columns)
print(y_train.columns)

['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10']
['y']


In [8]:
# bootstrap function definition
RANDOM_SEED = 42

# weighted bootstrap subdataset

#partition the dataframe dataset
joined_df = joined_df.repartition(10)

In [9]:
# Define entropy for classification evaluation crtieria
# receives a probably as input to calculate entropy

def class_entropy(df):
    #  entropy calculation for binary classification
    col_name = "y"
    counts = df.groupBy(col_name).count()
    total = df.count()
    return counts.withColumn("prob", F.col("count") / total).select(
        F.sum(-F.col("prob") * F.log2(F.col("prob"))).alias("entropy")
    ).first()["entropy"]

def prob(df):
    #  probably calcluation for binary classification
    count = np.count_nonzero(df)   
    if count == 0:
        return float(0)
    else:
        total = len(df)
        prob = np.divide(count,total)
        return float(prob)

class_entropy_udf = udf(class_entropy, ArrayType(DoubleType()))
prob_udf = udf(prob,FloatType())

In [40]:
def new_split(joined_df, feature_index):
    
    # Select relevant columns
    feature_col_name = joined_df.columns[feature_index]
    y_col_name = joined_df.columns[-1]
    split_data = joined_df.select(feature_col_name, y_col_name)\
        .withColumnRenamed(feature_col_name,"feature")\
        .withColumnRenamed(y_col_name,"y")
    
    # Calculate parent entropy
    parent_entropy = class_entropy(joined_df.select("y"))
    parent_data_count = joined_df.count()
    
    # Calculate potential splits and their Information Gain
    distinct_values = split_data.select("feature")\
        .withColumnRenamed("feature","split_value")\
        .distinct().orderBy("split_value")\
        .sample(False, min(1.0, 10/parent_data_count))
    
    # Cartesian join to get split mask
    splits_info = distinct_values.crossJoin(split_data)\
        .withColumn(
        "is_left", F.col("feature") <= F.col("split_value")
    )
    
    #aggregate list
    entropies = splits_info.groupBy("split_value", "is_left").agg(
        F.count("y").alias("count"),
        F.sum("y").alias("sum"),
        prob_udf(F.collect_list("y")).alias("prob")
    )
    entropies = entropies.withColumn("entropy",\
                                    -F.col("prob") * F.log2(F.col("prob")) \
                                    -(1-F.col("prob")) * F.log2((1-F.col("prob")))
                                    )
    # Calculate Information Gain for each split
    info_gain = entropies.groupBy("split_value").agg(
        (parent_entropy - F.sum(F.col("entropy") * (F.col("count") / parent_data_count))).alias("info_gain")
    )
    
    # Get the best split
    best_split = info_gain.orderBy(F.desc("info_gain")).first()
    
    schema = StructType([
        StructField("feature", IntegerType(), True),
        StructField("split_value", FloatType(), True),
        StructField("info_gain", FloatType(), True),
    ])
    
    # Prepare output DataFrame
    if best_split is None:
        result_df = spark.createDataFrame([(feature_index, float(0), float(0))], schema)
    else:
        result_df = spark.createDataFrame([(feature_index, float(best_split["split_value"]), best_split["info_gain"])], schema)
    
    return result_df

IndentationError: unindent does not match any outer indentation level (<tokenize>, line 38)

In [31]:
# Build tree from splitting

# each tree
# (i) for each feature: find_split
# (ii) Mapbypartition(find_split)

def feature_split(dataset, feature_array):
    
    ''' 
    Input: 
    partition: a pyspark dataframe partition to be called by foreachPartition,
    feature_array: a broadcasted feature array for the tree that is intiialized earlier on
    '''
    #define schema
    schema = StructType([
        StructField("feature", IntegerType(), True),
        StructField("split_value", FloatType(), True),
        StructField("info_gain", FloatType(), True),
    ])
    feature_df = spark.createDataFrame([], schema)
    
    # for each feature array, get a split and append the dataframe 
    for feature_index in feature_array:
        
        # find split
        feature_split = new_split(dataset, feature_index)
        
        #add feature  
        feature_df = feature_df.union(feature_split)
        
    return feature_df

In [32]:
def split_node(df,feature_array):

    #init
    y_label = df.columns[-1]
    node_parent = {}
    best_gain = -1
    best_feature= 0 
    best_split = 0
    
    #get first tree
    feature_df = feature_split(df,feature_array)
    feature_list = feature_df.rdd.collect()
    
    #init
    for feature in feature_list:
        
        #assign val
        feature_idx = feature[0]
        split_val = feature[1]
        gain = feature[2]
        
        if gain is None:
            gain = 0
        
        #check best
        if gain > best_gain:
            best_gain = gain
            best_feature = feature_idx
            best_split = split_val
        
    #generate split
    left_df = df.filter(col(joined_df.columns[best_feature]) <= best_split)
    right_df = df.filter(col(joined_df.columns[best_feature]) > best_split)
        
    #assign dictinary value of; key: feature, split_value, gain, and child
    node_parent['feature'] = best_feature
    node_parent['split_value'] = best_split
    node_parent['gain'] = best_gain
    
    
    return node_parent, left_df, right_df

In [33]:
def build_decision_tree(df, feature_array, max_depth, current_depth=0):
    
    """ Recursively build the decision tree. """
    #set y label
    label_col = df.columns[-1]
    distinct_count = df.select(label_col).distinct().count()
    
    if current_depth == max_depth or distinct_count == 1 :
        return df
        # Return the most common label
        most_common_label = df.groupBy(label_col).count().orderBy("count", ascending=False).first()[label_col]
        #return most_common_label
    
    #split node
    node_parent, left_df, right_df = split_node(df,feature_array)
    
    #split if not empty
    if left_df is None or left_df == []:
        right_subtree = build_decision_tree(right_df, feature_array, max_depth, current_depth + 1)
    elif right_df is None or right_df ==[]:
        left_subtree = build_decision_tree(left_df, feature_array, max_depth, current_depth + 1)
    elif (left_df is None) and (right_df is None):
        return {"feature": node_parent['feature'] , "threshold": node_parent['split_value'], "left": None, "right": None}
    else:
        left_subtree = build_decision_tree(left_df, feature_array, max_depth, current_depth + 1)
        right_subtree = build_decision_tree(right_df, feature_array, max_depth, current_depth + 1)

    return {"feature": node_parent['feature'] , "threshold": node_parent['split_value'], "left": left_subtree, "right": right_subtree}
    

In [35]:
node_parent, left_df, right_df = split_node(joined_df,[0,1,2])

In [36]:
tree = build_decision_tree(joined_df, [0,2,3], max_depth=5)
tree

                                                                                

IllegalArgumentException: requirement failed: Sampling fraction (1.25) must be on interval [0, 1] without replacement

In [15]:
def random_forest_train(df, num_trees, sample_size, num_feature, max_depth):
    #init
    trees = []
    feature_matrix = {}
    
    # Broadcast key parameters
    max_depth_broadcast = sc.broadcast(max_depth)
    
    #set rf parameters
    for i in range(num_trees):
    
        #sample features
        feature_array = rand.sample(range(len(df.columns[:-1])),5)
        feature_broadcast = sc.broadcast(feature_array)
        
        #sample df
        sample_df = df.sample(True, sample_size)
    
        # Apply function on DataFrame
        #foreach = df.foreachPartition(process_row)
        tree = build_decision_tree(sample_df,feature_array,max_depth)
        trees.append(tree)
    
    return trees

In [18]:
trees = random_forest_train(joined_df, num_trees=3, sample_size=0.5 ,num_feature = 3, max_depth=5)

                                                                                

In [19]:
trees

[{'feature': 1,
  'threshold': 0.11191961914300919,
  'left': DataFrame[_1: double, _2: double, _3: double, _4: double, _5: double, _6: double, _7: double, _8: double, _9: double, _10: double, y: bigint],
  'right': {'feature': 9,
   'threshold': 0.11323804408311844,
   'left': DataFrame[_1: double, _2: double, _3: double, _4: double, _5: double, _6: double, _7: double, _8: double, _9: double, _10: double, y: bigint],
   'right': {'feature': 1,
    'threshold': 0.665036678314209,
    'left': {'feature': 9,
     'threshold': 0.9338750243186951,
     'left': {'feature': 9,
      'threshold': 0.23754332959651947,
      'left': DataFrame[_1: double, _2: double, _3: double, _4: double, _5: double, _6: double, _7: double, _8: double, _9: double, _10: double, y: bigint],
      'right': DataFrame[_1: double, _2: double, _3: double, _4: double, _5: double, _6: double, _7: double, _8: double, _9: double, _10: double, y: bigint]},
     'right': DataFrame[_1: double, _2: double, _3: double, _4: do