# Hello World: Loading Images into Spark

In [3]:
# Prerequisites
from pyspark.sql import SparkSession
from pyspark.ml import image

In [5]:
# Obtain the notebook's SparkSession (getOrCreate hands back an existing
# session when there is one), configured with the "local" master and the
# application name "hello_reading_images".
spark = (
    SparkSession.builder
    .master("local")
    .appName("hello_reading_images")
    .getOrCreate()
)
# Report the Spark version this session is running.
print("Spark Version: ", spark.version)

Spark Version:  3.5.0


In [6]:
# Directory containing the training images (relative path).
images_file_path = "data/train_images/"

# Read the directory through Spark's "image" data source.
df_images = (
    spark.read
    .format("image")
    .load(images_file_path)
)
# Print the column layout of the loaded DataFrame.
df_images.printSchema()

root
 |-- image: struct (nullable = true)
 |    |-- origin: string (nullable = true)
 |    |-- height: integer (nullable = true)
 |    |-- width: integer (nullable = true)
 |    |-- nChannels: integer (nullable = true)
 |    |-- mode: integer (nullable = true)
 |    |-- data: binary (nullable = true)
 |-- label: integer (nullable = true)



In [7]:
# Preview selected metadata fields of the image struct together with the label;
# image.origin and image.data are not selected.
df_images.select(
    "image.height",
    "image.width",
    "image.nChannels",
    "image.mode",
    "label",
).show()

+------+-----+---------+----+-----+
|height|width|nChannels|mode|label|
+------+-----+---------+----+-----+
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    1|
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    1|
|   288|  384|        3|  16|    1|
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    1|
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    1|
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    0|
|   288|  384|        3|  16|    0|
+------+-----+---------+----+-----+
only showing top 20 rows



### Read as Binary Files

In [9]:
# Directory to read with the binary-file reader (relative path).
bin_file_path = "data/train_images/"

# Read through the "binaryFile" data source, with the pathGlobalFilter option
# set to the glob "*.jpg".
df_binary = (
    spark.read
    .format("binaryFile")
    .option("pathGlobalFilter", "*.jpg")
    .load(bin_file_path)
)

# Preview the resulting rows.
df_binary.show()

+--------------------+--------------------+------+--------------------+-----+
|                path|    modificationTime|length|             content|label|
+--------------------+--------------------+------+--------------------+-----+
|file:/home/jovyan...|2024-12-19 03:06:...| 55037|[FF D8 FF E0 00 1...|    0|
|file:/home/jovyan...|2024-12-19 03:06:...| 54634|[FF D8 FF E0 00 1...|    1|
|file:/home/jovyan...|2024-12-19 03:06:...| 54624|[FF D8 FF E0 00 1...|    0|
|file:/home/jovyan...|2024-12-19 03:06:...| 54505|[FF D8 FF E0 00 1...|    0|
|file:/home/jovyan...|2024-12-19 03:06:...| 54475|[FF D8 FF E0 00 1...|    0|
|file:/home/jovyan...|2024-12-19 03:06:...| 54449|[FF D8 FF E0 00 1...|    0|
|file:/home/jovyan...|2024-12-19 03:06:...| 54440|[FF D8 FF E0 00 1...|    0|
|file:/home/jovyan...|2024-12-19 03:06:...| 54377|[FF D8 FF E0 00 1...|    0|
|file:/home/jovyan...|2024-12-19 03:06:...| 54365|[FF D8 FF E0 00 1...|    1|
|file:/home/jovyan...|2024-12-19 03:06:...| 54330|[FF D8 FF E0 0