In [4]:
import numpy as np
import pandas as pd
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.sql import functions as F, SparkSession
import findspark 
import typing as t


In [7]:
def init_spark(app_name="HelloWorldApp", execution_mode="local[*]"):
    findspark.init()
    spark = t.cast(
        SparkSession,
        SparkSession.builder.master(execution_mode).appName(app_name).getOrCreate(),
    )
    sc = spark.sparkContext
    return spark, sc

spark, sc = init_spark()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/05/04 23:06:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [8]:
#random gen dataset
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
x_all_categorical_arr = np.random.randint(0, 2, (100, 10))
x_all_numerical_arr = np.random.rand(100, 10)
y_categorical_arr = np.random.randint(0, 2, 100)
#balanced_arr = np.concatenate([balanced_categorical_arr, balanced_numerical_arr], axis=1)
print(f"all_categorical_arr: {x_all_categorical_arr.shape=}")
print(f"all_numerical_arr: {x_all_numerical_arr.shape=}")
#print(f"balanced_arr: {balanced_arr.shape=}")

all_categorical_arr: x_all_categorical_arr.shape=(100, 10)
all_numerical_arr: x_all_numerical_arr.shape=(100, 10)


In [9]:
x_data = spark.createDataFrame(x_all_numerical_arr)

In [10]:
y_data = spark.createDataFrame(y_categorical_arr,['y'])

In [11]:
#Index
x_indexed=x_data.withColumn("id",monotonically_increasing_id())
y_indexed=y_data.withColumn("id",monotonically_increasing_id())

In [12]:
# DEVELOPMENT: create joined df for computation with one id
joined_df = x_indexed.join(y_indexed, "id").drop('id')
x_train = joined_df.drop('y')
y_train = joined_df.select('y')
print(x_train.columns)
print(y_train.columns)

['_1', '_2', '_3', '_4', '_5', '_6', '_7', '_8', '_9', '_10']
['y']


In [7]:
# bootstrap function definition
RANDOM_SEED = 42

# weighted bootstrap subdataset

#partition the dataframe dataset
joined_df.repartition(10).rdd.getNumPartitions()

                                                                                

10

In [8]:
# bootstrap sampling per tree
# create variations of the joined_df baseed on bootstramp algorithm

In [9]:
# boostrap data setsplit

In [43]:
# Define entropy for classification evaluation crtieria
# receives a probably as input to calculate entropy


def class_entropy(df, total):
    # Example entropy calculation for binary classification
    col_name = "y"
    counts = df.agg(F.count(col_name).alias("count"))
    return (
        counts.withColumn("prob", F.col("count") / total)
        .select(F.sum(-F.col("prob") * F.log2(F.col("prob"))).alias("entropy"))
        .first()["entropy"]
    )


def prob(df):
    # Example entropy calculation for binary classification
    count = np.count_nonzero(df)
    if count == 0:
        return float(0)
    else:
        total = len(df)
        prob = np.divide(count, total)
        return float(prob)


class_entropy_udf = udf(class_entropy, ArrayType(DoubleType()))
prob_udf = udf(prob, FloatType())

In [34]:
import line_profiler
import line_profiler
import functools

def profile_lines(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        profiler = line_profiler.LineProfiler()
        profiler.add_function(func)
        profiler.enable_by_count()
        
        result = func(*args, **kwargs)
        
        profiler.print_stats()
        return result
    
    return wrapper

In [39]:
@profile_lines
def new_split(joined_df, feature_index):
    
    # Select relevant columns
    feature_col_name = joined_df.columns[feature_index]
    y_col_name = joined_df.columns[-1]
    split_data = joined_df.select(feature_col_name, y_col_name)\
        .withColumnRenamed(feature_col_name,"feature")\
        .withColumnRenamed(y_col_name,"y")
    
    # Calculate parent entropy
    parent_data_count = joined_df.count()
    parent_entropy = class_entropy(joined_df.select("y"), parent_data_count)
    
    # Calculate potential splits and their Information Gain
    distinct_values = split_data.select("feature")\
        .withColumnRenamed("feature","split_value")\
        .distinct().orderBy("split_value")
    
    # Cartesian join to get split mask
    splits_info = distinct_values.crossJoin(split_data)\
        .withColumn(
        "is_left", F.col("feature") <= F.col("split_value")
    )
    
    #aggregate list
    entropies = splits_info.groupBy("split_value", "is_left").agg(
        F.count("y").alias("count"),
        F.sum("y").alias("sum"),
        prob_udf(F.collect_list("y")).alias("prob")
    )
    entropies = entropies.withColumn("entropy",\
                                    -F.col("prob") * F.log2(F.col("prob")) \
                                    -(1-F.col("prob")) * F.log2((1-F.col("prob")))
                                    )
    # Calculate Information Gain for each split
    info_gain = entropies.groupBy("split_value").agg(
        (parent_entropy - F.sum(F.col("entropy") * (F.col("count") / parent_data_count))).alias("info_gain")
    )
    
    # Get the best split
    best_split = info_gain.orderBy(F.desc("info_gain")).first()
    
    schema = StructType([
        StructField("feature", IntegerType(), True),
        StructField("split_value", FloatType(), True),
        StructField("info_gain", FloatType(), True),
    ])
    
    # Prepare output DataFrame
    result_df = spark.createDataFrame([(feature_index, float(best_split["split_value"]), best_split["info_gain"])], schema)
    
    return result_df

In [46]:
new_split(joined_df, 2)

Timer unit: 1e-09 s

Total time: 1.30635 s
File: /var/folders/f3/y3235fxs3cl733b7qyjn10y80000gn/T/ipykernel_34469/503364529.py
Function: new_split at line 1

Line #      Hits         Time  Per Hit   % Time  Line Contents
     1                                           @profile_lines
     2                                           def new_split(joined_df, feature_index):
     3                                               
     4                                               # Select relevant columns
     5         1      96000.0  96000.0      0.0      feature_col_name = joined_df.columns[feature_index]
     6         1       2000.0   2000.0      0.0      y_col_name = joined_df.columns[-1]
     7         2   16635000.0    8e+06      1.3      split_data = joined_df.select(feature_col_name, y_col_name)\
     8         1    1559000.0    2e+06      0.1          .withColumnRenamed(feature_col_name,"feature")\
     9         1     940000.0 940000.0      0.1          .withColumnRenamed(y_co

DataFrame[feature: int, split_value: float, info_gain: float]

In [15]:
test = new_split(joined_df,0).show()

+-------+-----------+-----------+
|feature|split_value|  info_gain|
+-------+-----------+-----------+
|      0|0.057842676|0.021017073|
+-------+-----------+-----------+



In [16]:
# Build tree from splitting

# each tree
# (i) for each feature: find_split
# (ii) Mapbypartition(find_split)

def feature_split(dataset, feature_array):
    
    ''' 
    Input: 
    partition: a pyspark dataframe partition to be called by foreachPartition,
    feature_array: a broadcasted feature array for the tree that is intiialized earlier on
    '''
    #define schema
    schema = StructType([
        StructField("feature", IntegerType(), True),
        StructField("split_value", FloatType(), True),
        StructField("info_gain", FloatType(), True),
    ])
    feature_df = spark.createDataFrame([], schema)
    
    # for each feature array, get a split and append the dataframe 
    for feature_index in feature_array:
        
        # find split
        feature_split = new_split(dataset, feature_index)
        
        #add feature  
        feature_df = feature_df.union(feature_split)
        
    return feature_df

In [17]:
test = feature_split(joined_df,[0,1,2,3])

In [99]:
def grow_tree(df,feature_array, max_depth=3):
    

    #init
    y_label = df.columns[-1]
    node = {}
    np.zeros()

    #get first tree
    feature_df = feature_split(df,feature_array)
    feature_list = feature_df.collect()
    
    #init
    feature_idx = feature[0]
    best_split = feature[1]
    gain = feature[2]
    

    #generate split
    left_df = df.filter(col(joined_df.columns[feature_idx]) <= best_split)
    right_df = df.filter(col(joined_df.columns[feature_idx]) <= best_split)
    
    
    return (feature_df, left_df,right_df)

In [91]:
import random
import math as m

def random_forest_train(df, num_trees, max_depth=3):
    trees = []
    num_features = int(len(df.columns[:-1]))
    
    for _ in range(num_trees):
        
        #sample dataset with replacement
        # to be replaced with sampling method from Jason
        sampled_df = df.sample(withReplacement=True, fraction=1.0)
        
        # sample features 
        # to be replaced with a more updated version if available
        feature_array = random.sample(range(num_features), k=int(m.log(num_features, 2) + 1))
        
        tree = grow_tree(sampled_df, feature_array, max_depth)
        trees.append(tree)
        
        #create node
        
    return trees