In [1]:
import os
# Configure the JVM and Spark locations and the spark-submit arguments
# (accelerator jars, local[*] master) in a single update of the process
# environment.
os.environ.update({
    'JAVA_HOME': "/usr/lib/jvm/java-8-openjdk-amd64",
    'SPARK_HOME': "/home/bdai/spark_work/spark-3.2.4-bin-hadoop3.2",
    'PYSPARK_SUBMIT_ARGS': "--jars /home/bdai/spark_work/rapids-4-spark_2.12-23.06.0.jar,/home/bdai/spark_work/cudf-23.06.0-cuda12.jar --master local[*] pyspark-shell",
})

In [2]:
import findspark
findspark.init()
from pyspark.sql import SparkSession
from pyspark.conf import SparkConf
from pyspark.sql.types import *
from pyspark.sql.functions import col, split
from pyspark.ml.feature import StringIndexer
import pyspark.sql.functions as fn
import shutil
import io
import numpy as np
import pandas as pd
from PIL import Image
import warnings
import time
import torch

# Install a blanket 'ignore' filter so Python-level warnings are not shown.
# This only affects the Python `warnings` module; it does not touch Spark's
# own logging.
warnings.filterwarnings('ignore')

In [3]:
def timing(start):
    """Print the wall-clock time elapsed since ``start``.

    Args:
        start: A timestamp previously taken with ``time.time()``.

    Returns:
        None. The elapsed seconds are printed with two decimals.
    """
    elapsed = time.time() - start
    print(f'Elapsed time: {elapsed:.2f} s')
# start = time.time()

# Start Session

In [5]:
start = time.time()

# Build (or reuse) the session with the RAPIDS SQL plugin registered and the
# driver memory set to 15g.
spark = (
    SparkSession.builder
    .appName('SparkRAPIDS')
    .config('spark.plugins', 'com.nvidia.spark.SQLPlugin')
    .config("spark.driver.memory", "15g")
    .getOrCreate()
)

# Register the accelerator jars with the running context.
# NOTE(review): the same two jars are already listed under --jars in
# PYSPARK_SUBMIT_ARGS; confirm these addPyFile calls are still needed.
for jar_path in (
    '/home/bdai/spark_work/rapids-4-spark_2.12-23.06.0.jar',
    '/home/bdai/spark_work/cudf-23.06.0-cuda12.jar',
):
    spark.sparkContext.addPyFile(jar_path)

# Turn on the RAPIDS SQL switches used by this notebook.
for conf_key in (
    'spark.rapids.sql.enabled',
    'spark.rapids.sql.incompatibleOps.enabled',
    'spark.rapids.sql.format.csv.read.enabled',
    'spark.rapids.sql.format.csv.enabled',
):
    spark.conf.set(conf_key, 'true')

timing(start)

23/08/01 09:18:28 WARN Utils: Your hostname, bdai-desktop resolves to a loopback address: 127.0.1.1; using 165.132.118.198 instead (on interface enp0s31f6)
23/08/01 09:18:28 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
23/08/01 09:18:28 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/08/01 09:18:29 WARN RapidsPluginUtils: RAPIDS Accelerator 23.06.0 using cudf 23.06.0.
23/08/01 09:18:29 WARN RapidsPluginUtils: RAPIDS Accelerator is enabled, to disable GPU support set `spark.rapids.sql.enabled` to false.
23/08/01 09:18:29 WARN RapidsPluginUtils: spark.rapids.sql.explain is set to `NOT_ON_GPU`. Set it to 'NONE' to suppress the diagnostics logging about the query placement on the 

Elapsed time: 10.52 s


# 1. Extract

In [6]:
# Disabled one-time step: location of the zipped dataset archive.
# imagezip_path = "/home/bdai/covid_data/covidx-cxr2.zip"

# Root directory of the unpacked dataset.
image_path = "/home/bdai/spark_work/covid_dataset"

# Disabled one-time step: unpack the archive at imagezip_path into image_path.
# shutil.unpack_archive(imagezip_path, image_path)

In [15]:
def get_dir_size(path='.'):
    """Return the total size in bytes of the files under ``path``.

    The directory is walked recursively. Regular files (including symlinks
    that resolve to files) contribute ``st_size``. Symbolic links that point
    to directories are NOT descended into: following them would double count
    their targets and, when a link points back into the tree, recurse until
    the OS raises "too many levels of symbolic links".

    Args:
        path: Directory to measure. Defaults to the current directory.

    Returns:
        int: Sum of the file sizes in bytes (0 for an empty directory).

    Raises:
        OSError: If ``path`` (or a sub-directory) cannot be scanned.
    """
    total = 0
    with os.scandir(path) as it:
        for entry in it:
            if entry.is_file():
                total += entry.stat().st_size
            # Only real directories are walked; symlinked ones are skipped
            # so that link cycles cannot cause unbounded recursion.
            elif entry.is_dir(follow_symlinks=False):
                total += get_dir_size(entry.path)
    return total
# Measure the dataset directory and report it in units of 1024**3 bytes,
# rounded to two decimals.
total_bytes = get_dir_size(image_path)
dir_size = round(total_bytes / (1 << 30), 2)

print("Total dataset size : {} GBs".format(dir_size))

Total dataset size : 13.07 GBs


In [7]:
start = time.time()

# Read every file under train/ and test/ (including sub-directories) with the
# binaryFile source; the rows carry the `path` and `content` columns used later.
train_images = spark.read.format("binaryFile").option("recursiveFileLookup", "true").load(image_path + "/train")
test_images = spark.read.format("binaryFile").option("recursiveFileLookup", "true").load(image_path + "/test")

# Label files, one line per image: [patient id] [filename] [class] [data source]
# Derived from image_path (instead of repeating the literal directory) so the
# images and their labels always come from the same dataset root.
train_txt = spark.read.text(image_path + "/train.txt")
test_txt = spark.read.text(image_path + "/test.txt")

timing(start)

Elapsed time: 1.86 s


# 2. Transform

In [8]:
def extract_size(content):
    """Extract the image size from its raw encoded bytes.

    Args:
        content: The encoded image as a bytes-like object.

    Returns:
        The ``size`` attribute of the opened PIL image.
    """
    buffer = io.BytesIO(content)
    return Image.open(buffer).size

@fn.pandas_udf("width: int, height: int")
def extract_size_udf(content_series):
    """Pandas UDF that maps a series of raw image bytes to their sizes.

    Args:
        content_series: pandas Series whose elements are encoded image bytes.

    Returns:
        pandas DataFrame with one row per input element and two positional
        columns holding the values returned by ``extract_size``, matching the
        declared ``width``/``height`` struct.
    """
    size_pairs = content_series.apply(extract_size).tolist()
    return pd.DataFrame(size_pairs)


def transform_merge(image, text, label_order="alphabetAsc"):
    """Join image rows with their label lines and add a numeric label.

    Args:
        image: DataFrame with at least ``path`` and ``content`` columns
            (as produced by the ``binaryFile`` reader).
        text: DataFrame with a single ``value`` column whose lines are
            space-separated: ``patient_id file_name class ...``.
        label_order: ``stringOrderType`` handed to ``StringIndexer``. The
            default ``"alphabetAsc"`` makes the class -> label mapping depend
            only on the set of class names, so separate calls (e.g. train and
            test) agree whenever they contain the same classes. Pass
            ``"frequencyDesc"`` for the previous frequency-based ordering,
            which can assign different labels to the same class in datasets
            with different class balance.

    Returns:
        DataFrame with columns ``path``, ``file_name``, ``size`` (struct of
        width/height), ``content``, ``class`` and ``label``.
    """
    # The file name is the last "/"-separated component of the path.
    image = image.withColumn("file_name", fn.substring_index(fn.col("path"), "/", -1))

    # Split each label line once and pick the first three fields.
    fields = split(col("value"), " ")
    text = text.select(fields.getItem(0).alias("patient_id"),
                       fields.getItem(1).alias("file_name"),
                       fields.getItem(2).alias("class"))

    # Inner join: only images that have a label line (and vice versa) survive.
    df = image.join(text, ['file_name'], how='inner')
    df = df.select(fn.col("path"),
                   fn.col("file_name"),
                   extract_size_udf(fn.col("content")).alias("size"),
                   fn.col("content"),
                   fn.col("class"))

    indexer = StringIndexer(inputCol="class", outputCol="label",
                            stringOrderType=label_order)
    df = indexer.fit(df).transform(df)

    return df




In [9]:
start = time.time()

# Pair each image set with its label file and build the labelled frames.
train_df = transform_merge(image=train_images, text=train_txt)
test_df = transform_merge(image=test_images, text=test_txt)

timing(start)

23/08/01 09:18:59 WARN GpuOverrides: 
!Exec <ObjectHashAggregateExec> cannot run on GPU because not all expressions can be replaced
  !Expression <AggregateExpression> stringindexeraggregator(org.apache.spark.ml.feature.StringIndexerAggregator@2692b2d, Some(createexternalrow(class#28.toString, StructField(class,StringType,true))), Some(interface org.apache.spark.sql.Row), Some(StructType(StructField(class,StringType,true))), encodeusingserializer(input[0, java.lang.Object, true], true), decodeusingserializer(input[0, binary, true], Array[org.apache.spark.util.collection.OpenHashMap], true), encodeusingserializer(input[0, java.lang.Object, true], true), BinaryType, true, 0, 0) cannot run on GPU because expression AggregateExpression stringindexeraggregator(org.apache.spark.ml.feature.StringIndexerAggregator@2692b2d, Some(createexternalrow(class#28.toString, StructField(class,StringType,true))), Some(interface org.apache.spark.sql.Row), Some(StructType(StructField(class,StringType,true))

Elapsed time: 7.06 s


In [None]:
# temp = test_df.select("content").collect()

# from torchvision import transforms
# for i in range(100):
#     temp_image = Image.open(io.BytesIO(temp[i]["content"]))
#     trans = transforms.ToTensor()
#     print(trans(temp_image).shape)

# 3. Load

In [11]:
start = time.time()

# Temporarily switch the session-wide parquet codec to "uncompressed" for the
# two table writes below, remembering the previous value.
codec_key = "spark.sql.parquet.compression.codec"
compression = spark.conf.get(codec_key)
spark.conf.set(codec_key, "uncompressed")
try:
    for frame, table_name in ((train_df, "covid_train_binary"),
                              (test_df, "covid_test_binary")):
        frame.write.format("parquet").mode("overwrite").option("mergeSchema", True).saveAsTable(table_name)
finally:
    # Restore the original codec even if a write fails, so later cells do not
    # silently keep writing uncompressed parquet.
    spark.conf.set(codec_key, compression)

timing(start)

23/08/01 09:20:01 WARN GpuOverrides: 
!Exec <ProjectExec> cannot run on GPU because not all expressions can be replaced
  @Expression <AttributeReference> path#0 could run on GPU
  @Expression <AttributeReference> file_name#20 could run on GPU
  @Expression <Alias> pythonUDF0#4343 AS size#43 could run on GPU
    @Expression <AttributeReference> pythonUDF0#4343 could run on GPU
  @Expression <AttributeReference> content#3 could run on GPU
  @Expression <AttributeReference> class#28 could run on GPU
  @Expression <Alias> UDF(class#28) AS label#4171 could run on GPU
    !Expression <ScalaUDF> UDF(class#28) cannot run on GPU because neither UDF implemented by class org.apache.spark.ml.feature.StringIndexerModel$$Lambda$4881/797806008 provides a GPU implementation, nor the conf `spark.rapids.sql.rowBasedUDF.enabled` is enabled
      @Expression <AttributeReference> class#28 could run on GPU
  !Exec <ArrowEvalPythonExec> cannot run even partially on the GPU because unsupported data types in 

Elapsed time: 205.00 s


                                                                                

In [1]:
# Stop the Spark session. This relies on the `spark` variable created by the
# session cell; run on a fresh kernel (as the captured output shows) it raises
# NameError.
spark.stop()

NameError: name 'spark' is not defined