In [1]:
import time
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from sparkmeasure import StageMetrics

In [2]:
data_path = "/opt/spark/work-dir/data/FRACTAL"
sample_fraction = 0.000001
num_executors = 2

In [3]:
spark = SparkSession.builder.appName("fractal-rf") \
    .master("spark://spark-master:7077") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.hadoop.fs.s3a.aws.credentials.provider", "com.amazonaws.auth.DefaultAWSCredentialsProviderChain") \
    .config("spark.executor.memory", "8g") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.cores", "2") \
    .config("spark.executor.instances", str(num_executors)) \
    .config("spark.driver.maxResultSize", "4g") \
    .config("spark.eventLog.enabled", "true") \
    .config("spark.eventLog.dir", "/opt/spark/spark-events") \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/11/10 20:39:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
stage_metrics = StageMetrics(spark)
print(f"Spark UI: {spark.sparkContext.uiWebUrl}")

Spark UI: http://2b8282c8bdeb:4040


In [5]:
def prepare_data(df):
    df = df.withColumn("z_raw", col("xyz")[2]).withColumn(
        "ndvi",
        when(
            (col("Infrared") + col("Red")) != 0,
            (col("Infrared") - col("Red")) / (col("Infrared") + col("Red")),
        ).otherwise(0),
    )
    return df.select(
        "z_raw",
        "Intensity",
        "Red",
        "Green",
        "Blue",
        "Infrared",
        "ndvi",
        col("Classification").alias("label"),
    )


def load_sample(spark, path, fraction, cols):
    df = spark.read.parquet(path).select(*cols).sample(fraction=fraction, seed=62)
    df = prepare_data(df).cache()
    print(f"Loaded {df.count()} rows from {path}")
    return df

In [6]:
cols = ["xyz", "Intensity", "Classification", "Red", "Green", "Blue", "Infrared"]

stage_metrics.begin()
start_time = time.time()

train = load_sample(spark, f"{data_path}/train/", sample_fraction, cols)
val = load_sample(spark, f"{data_path}/val/", sample_fraction, cols)
test = load_sample(spark, f"{data_path}/test/", sample_fraction, cols)

                                                                                

Loaded 7350 rows from /opt/spark/work-dir/data/FRACTAL/train/


                                                                                

Loaded 912 rows from /opt/spark/work-dir/data/FRACTAL/val/




Loaded 969 rows from /opt/spark/work-dir/data/FRACTAL/test/


                                                                                

In [8]:
z_assembler = VectorAssembler(
    inputCols=["z_raw"], outputCol="z_vec", handleInvalid="skip"
)
z_scaler = StandardScaler(
    inputCol="z_vec", outputCol="z", withMean=False, withStd=True
)
assembler = VectorAssembler(
    inputCols=["z", "Intensity", "Red", "Green", "Blue", "Infrared", "ndvi"],
    outputCol="features",
    handleInvalid="skip",
)

rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=5,
    maxDepth=20,
    seed=62
)

pipeline = Pipeline(stages=[z_assembler, z_scaler, assembler, rf])

In [9]:
evaluator = MulticlassClassificationEvaluator(
    labelCol="label", predictionCol="prediction", metricName="accuracy"
)

model = pipeline.fit(train)

25/11/10 20:44:28 WARN DAGScheduler: Broadcasting large task binary with size 1113.2 KiB
25/11/10 20:45:26 WARN DAGScheduler: Broadcasting large task binary with size 1807.0 KiB
25/11/10 20:46:47 WARN DAGScheduler: Broadcasting large task binary with size 2.7 MiB
25/11/10 20:48:28 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
25/11/10 20:50:27 WARN DAGScheduler: Broadcasting large task binary with size 5.4 MiB
25/11/10 20:52:36 WARN DAGScheduler: Broadcasting large task binary with size 7.0 MiB
25/11/10 20:54:39 WARN DAGScheduler: Broadcasting large task binary with size 8.4 MiB
25/11/10 20:56:28 WARN DAGScheduler: Broadcasting large task binary with size 9.7 MiB
25/11/10 20:58:15 WARN DAGScheduler: Broadcasting large task binary with size 10.9 MiB
25/11/10 20:59:51 WARN DAGScheduler: Broadcasting large task binary with size 11.9 MiB
25/11/10 21:01:13 WARN DAGScheduler: Broadcasting large task binary with size 12.7 MiB
25/11/10 21:02:26 WARN DAGScheduler: Broadcas

## Results

In [10]:
val_predictions = model.transform(val)
val_accuracy = evaluator.evaluate(val_predictions)

test_predictions = model.transform(test)
test_accuracy = evaluator.evaluate(test_predictions)

print(f"Val: {val_accuracy:.4f}, Test: {test_accuracy:.4f}")

25/11/10 21:04:21 WARN DAGScheduler: Broadcasting large task binary with size 9.3 MiB
25/11/10 21:04:23 WARN DAGScheduler: Broadcasting large task binary with size 9.3 MiB

Val: 0.6601, Test: 0.6667


                                                                                

In [11]:
stage_metrics.end()
total_time = time.time() - start_time
print(f"Total time: {total_time:.2f}s")

Total time: 1514.09s


In [12]:
train.unpersist()
val.unpersist()
test.unpersist()
val_predictions.unpersist()
test_predictions.unpersist()
spark.stop()