In [None]:
import os

from pyspark.sql import SparkSession

# Connection settings for the MinIO (S3-compatible) object store.
# Each value can be overridden through an environment variable so that real
# credentials never need to be committed with the notebook. The fallbacks are
# the original local-development values, so the notebook behaves exactly as
# before when the variables are unset.
minio_endpoint = os.environ.get("MINIO_ENDPOINT", "http://minio:9000")
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "myuserserviceaccount")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "myuserserviceaccountpassword")

# Maven coordinates passed to `spark.jars.packages`: AWS S3 SDK, the Hadoop
# S3A connector and Delta Lake.
spark_packages = ",".join([
    "com.amazonaws:aws-java-sdk-s3:1.12.765",
    "org.apache.hadoop:hadoop-aws:3.4.0",
    "io.delta:delta-spark_2.12:3.2.0",
])

# Build (or reuse) the Spark session with S3A pointed at MinIO and the Delta
# Lake SQL extension / catalog enabled.
spark = (
    SparkSession.builder
    .appName('test read minio data')
    .config("spark.jars.packages", spark_packages)
    .config("spark.hadoop.fs.s3a.endpoint", minio_endpoint)
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # Path-style addressing (endpoint/bucket/key) instead of virtual-host style.
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    .getOrCreate()
)

# Other configs that can be added to the builder chain above if needed
# (S3A committer settings, currently not enabled):
#   .config("spark.hadoop.fs.s3a.committer.name", "directory")
#   .config("spark.hadoop.fs.s3a.committer.staging.conflict-mode", "replace")
#   .config("spark.hadoop.fs.s3a.committer.staging.tmp.path", "/tmp/staging")

# Keep the notebook output readable: only ERROR-level Spark logs are shown.
spark.sparkContext.setLogLevel("ERROR")

# Last expression of the cell: display the session summary.
spark

In [2]:
# Name of the object-store bucket used to build the s3a:// paths in the cells below.
bucket = 'my-bucket'

In [3]:
from datetime import datetime

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DateType
from pyspark.sql.functions import lit, col
from delta.tables import DeltaTable

In [4]:
# Three sample user records; `updated_at` is given as a datetime and stored
# in a DateType column (see the schema below).
data = [
    dict(id=1, name='Alice', updated_at=datetime(2022, 1, 1)),
    dict(id=2, name='Braga', updated_at=datetime(2022, 2, 2)),
    dict(id=3, name='Steve', updated_at=datetime(2022, 3, 3)),
]

# Explicit schema so column types are fixed rather than inferred;
# the third StructField argument is `nullable`.
schema = StructType([
    StructField('id', IntegerType(), True),
    StructField('name', StringType(), True),
    StructField('updated_at', DateType(), True),
])

df = spark.createDataFrame(data, schema=schema)

# Last expression of the cell: render the rows as a pandas DataFrame.
df.toPandas()

                                                                                

Unnamed: 0,id,name,updated_at
0,1,Alice,2022-01-01
1,2,Braga,2022-02-02
2,3,Steve,2022-03-03


In [5]:
# Write the sample users as a Delta table in the bucket, replacing any data
# already at that location ("overwrite" mode, with the "mergeSchema" option on).
# The target is the same `s3a://<bucket>/delta-lake/users` path that the read
# cell loads; the original had a stray double slash after the bucket name.
(
    df.write.format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(f"s3a://{bucket}/delta-lake/users")
)

                                                                                

In [6]:
# Load the Delta table back from the bucket and display it as a pandas DataFrame.
# No ordering is requested, so rows may come back in a different order than
# they were written.
users_df = spark.read.format("delta").load(f"s3a://{bucket}/delta-lake/users")
users_df.toPandas()


                                                                                

Unnamed: 0,id,name,updated_at
0,2,Braga,2022-02-02
1,3,Steve,2022-03-03
2,1,Alice,2022-01-01
