In [0]:
# Check if the mount point exists by listing it; the listing is this cell's output.
# mount_path is the root that the later cells build their image, label and output paths from.
mount_path = "/mnt/vision-test"
dbutils.fs.ls(mount_path)

In [0]:
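# Unmount the directory (intentionally left commented out; uncomment to run).
# NOTE: `mount_name` is not defined anywhere in this notebook, so `mount_path` is used here
# instead — presumably the same location; confirm before running.
# dbutils.fs.unmount(mount_path)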

# Process Images

In [0]:
# Convert the list of FileInfo objects to a spark DataFrame
def create_file_into_df(source_dir: str):
    """List a directory under the mount and return its entries as a Spark DataFrame.

    Args:
        source_dir: Directory name relative to ``mount_path`` (e.g. ``"images"``).

    Returns:
        A Spark DataFrame with two string columns, ``name`` and ``path``, holding
        one row per entry returned by ``dbutils.fs.ls``. An empty directory
        yields an empty DataFrame with the same two columns.
    """
    entries = dbutils.fs.ls(f"{mount_path}/{source_dir}")
    rows = [(entry.name, entry.path) for entry in entries]

    # The schema is spelled out instead of inferred: Spark cannot infer column
    # types from an empty list, so an empty directory would otherwise raise.
    return spark.createDataFrame(rows, schema="name string, path string")


file_info_df = create_file_into_df("images")
display(file_info_df)


In [0]:
# Read the label file
from pyspark.sql.functions import concat, lit

label_files = dbutils.fs.ls(f"{mount_path}/labels")
if not label_files:
    # Fail with a message that names the directory instead of a bare IndexError.
    raise FileNotFoundError(f"No label file found under {mount_path}/labels")

# Only the first entry of the listing is read.
label_path = label_files[0].path

# NOTE(review): with inferSchema=True an all-numeric image_name column would be
# read as a number and lose any leading zeros before ".jpg" is appended —
# confirm the image names are not purely numeric.
label_df = (
    spark.read.csv(label_path, header=True, inferSchema=True)
    .withColumnRenamed("image_name", "name")
    # Append the extension so "name" can be matched against the listed file names.
    .withColumn("name", concat("name", lit(".jpg")))
)
display(label_df)

In [0]:
# Join the two DataFrames
# The file listing is rebuilt here rather than joining file_info_df onto itself,
# so re-running this cell does not join the label columns a second time.
file_info_df = create_file_into_df("images").join(label_df, on="name", how="left")
display(file_info_df)

In [0]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image

# Rows are distributed across the cluster in Spark, so a few are fetched to the
# driver (with take) before they can be plotted.

def display_image(df, num_images: int = 5):
    """Plot up to ``num_images`` images from ``df`` side by side.

    Args:
        df: Spark DataFrame whose rows expose ``path`` (a ``dbfs:/`` URI) and
            ``name`` (used as the subplot title).
        num_images: Maximum number of rows to fetch and plot.

    Returns:
        None. The figure is rendered with ``plt.show()``; nothing is drawn when
        ``df`` yields no rows.
    """
    images = df.take(num_images)
    if not images:
        # Nothing to plot.
        return

    # One column per fetched row, so a short DataFrame does not leave blank slots.
    # squeeze=False keeps `axes` two-dimensional even for a single image.
    fig, axes = plt.subplots(1, len(images), figsize=(10, 10), squeeze=False)
    for ax, row in zip(axes[0], images):
        # PIL reads through the local /dbfs/ path, not the dbfs:/ URI.
        img_path = row.path.replace('dbfs:/', '/dbfs/')
        # The context manager closes the underlying file once it has been drawn.
        with Image.open(img_path) as img:
            ax.imshow(img)
        ax.set_title(row.name)
        ax.axis("off")
    plt.show()

display_image(file_info_df)

In [0]:
# Preview the crop-and-resize steps on a single image.
# take(1) brings only one row to the driver; an empty DataFrame simply skips the loop.
for row in file_info_df.take(1):
    # PIL reads through the local /dbfs/ path, not the dbfs:/ URI.
    img_path = row.path.replace('dbfs:/','/dbfs/')
    # The context manager closes the source file once processing is done.
    with Image.open(img_path) as source_img:
        width, height = source_img.size
        print(f"Image: {row.name}, size: {width} x {height}")
        new_size = min(width, height)

        # Centered square crop; the box is (left, upper, right, lower).
        img = source_img.crop(((width-new_size)/2, (height-new_size)/2, (width+new_size)/2, (height+new_size)/2))
        print(f"{img.size = }")

        # Resize : 256 x 256
        img = img.resize((256, 256), Image.NEAREST)
        print(f"{img.size = }")
        plt.imshow(img)
        plt.show()

In [0]:
import io
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import BinaryType

# Edge length, in pixels, of the square images produced by resize_image_udf.
IMAGE_SIZE = 256

@pandas_udf(BinaryType())
def resize_image_udf(df_series: pd.Series) -> pd.Series:
    """Center-crop, resize and JPEG-encode the images referenced by a path column.

    Args:
        df_series: Series of ``dbfs:/`` image paths.

    Returns:
        Series of JPEG-encoded bytes, one IMAGE_SIZE x IMAGE_SIZE image per path.
    """
    # Image modes Pillow's JPEG writer accepts as-is; anything else is converted to RGB.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    def resize_image(path):
        """Resize image and serialize back as jpeg"""
        # PIL reads through the local /dbfs/ path, not the dbfs:/ URI.
        img_path = path.replace('dbfs:/','/dbfs/')
        # The context manager closes the source file; otherwise one handle per
        # image stays open on the worker.
        with Image.open(img_path) as source_img:
            width, height = source_img.size
            new_size = min(width, height)

            # Centered square crop; the box is (left, upper, right, lower).
            img = source_img.crop(((width-new_size)/2, (height-new_size)/2, (width+new_size)/2, (height+new_size)/2))

            # Resize
            img = img.resize((IMAGE_SIZE,IMAGE_SIZE), Image.NEAREST)

        # JPEG has no alpha or palette support; saving e.g. RGBA or P mode would raise.
        if img.mode not in jpeg_modes:
            img = img.convert("RGB")

        # Save back to jpg
        output = io.BytesIO()
        img.save(output, format="JPEG")
        return output.getvalue()

    return df_series.apply(resize_image)

# Column metadata that marks the binary column as JPEG content so the
# notebook can render an image preview for it.
image_meta = {"spark.contentAnnotation": '{"mimeType": "image/jpeg"}'}

# Compute the resized JPEG bytes from each path and attach the preview
# metadata to the resulting "image" column in the same step.
df = file_info_df.withColumn(
    "image",
    resize_image_udf("path").alias("image", metadata=image_meta),
)
display(df)

In [0]:
# Persist the resized images as parquet, replacing any previous output at this path.
df.write.save(f"{mount_path}/images_resized", format="parquet", mode="overwrite")

In [0]:
from pyspark.sql.functions import regexp_replace, lit 

@pandas_udf(BinaryType())
def flip_image_horizontally_udf(df_series: pd.Series) -> pd.Series:
    """Mirror JPEG-encoded images left-to-right.

    Args:
        df_series: Series of encoded image bytes.

    Returns:
        Series of JPEG-encoded bytes holding the horizontally flipped images;
        null inputs are passed through as null.
    """
    def flip_image(binary_content):
        """Flip image horizontally and re-serialize back as jpeg"""
        if binary_content is None:
            # Keep nulls as nulls instead of failing to decode an empty buffer.
            return None

        # The context manager releases the decoder once the flipped copy exists.
        with Image.open(io.BytesIO(binary_content)) as source_img:
            img = source_img.transpose(Image.FLIP_LEFT_RIGHT)

        # Save back as jpeg
        output = io.BytesIO()
        img.save(output, format="JPEG")
        return output.getvalue()

    return df_series.apply(flip_image)

# Build the augmented rows: flipped image bytes, a "_flipped" file name, and a
# placeholder path because the flipped images have no source file.
df_flipped = (
    df.withColumn("image", flip_image_horizontally_udf("image").alias('image', metadata=image_meta))
    # The pattern is a regex: escape the dot and anchor at the end so only the
    # trailing ".jpg" extension is rewritten, not any "<char>jpg" inside the name.
    .withColumn("name", regexp_replace("name", r"\.jpg$", "_flipped.jpg"))
    .withColumn("path", lit("n/a"))
)

display(df_flipped)


In [0]:
# Add the flipped images to the same parquet location as the resized ones.
# Append mode: every run of this cell adds the rows again.
df_flipped.write.save(f"{mount_path}/images_resized", format="parquet", mode="append")

In [0]:
# Read the combined (resized + flipped) dataset back from parquet to check the result.
new_df = spark.read.load(f"{mount_path}/images_resized", format="parquet")
display(new_df)