In [1]:
#Setting Spark with MinIO
import os

from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import (
    col,
    date_format,
    regexp_replace,
    row_number,
    split,
    to_date,
    trim,
    upper,
)
from pyspark.sql.types import FloatType, IntegerType, StringType, StructField, StructType
def _require_env(name):
    """Return the value of environment variable ``name``.

    Raises:
        RuntimeError: if the variable is unset or empty. Failing here gives a
            clear message instead of an obscure Spark/Hadoop error later on.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} must be set before starting Spark"
        )
    return value


# MinIO / S3 connection settings are read from the environment so that no
# credentials are stored in the notebook itself.
S3_ACCESS_KEY = _require_env("S3_ACCESS_KEY")
S3_SECRET_KEY = _require_env("S3_SECRET_KEY")
S3_ENDPOINT = _require_env("S3_ENDPOINT")

# Build (or reuse) the Spark session, wiring the s3a connector to the endpoint
# and credentials above. Path-style access is enabled on the s3a filesystem.
spark = (
    SparkSession.builder
    .appName("MinIOReader")
    .config("spark.jars.packages", "org.apache.hadoop:hadoop-aws:3.3.4")
    .config("spark.hadoop.fs.s3a.endpoint", S3_ENDPOINT)
    .config("spark.hadoop.fs.s3a.access.key", S3_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", S3_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .getOrCreate()
)

# The cleaning cells call to_date(..., "yyyy-MM-dd") on full ISO timestamp
# strings; presumably the LEGACY parser is what lets that pattern accept the
# trailing time part -- TODO confirm before removing this setting.
spark.conf.set("spark.sql.legacy.timeParserPolicy", "LEGACY")

# Drop anything cached by a previous run of this notebook in the same session.
spark.catalog.clearCache()


25/11/15 16:51:40 WARN Utils: Your hostname, harshithts-HP-Pavilion-Gaming-Laptop-15-ec2xxx resolves to a loopback address: 127.0.1.1; using 192.168.1.2 instead (on interface wlo1)
25/11/15 16:51:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


:: loading settings :: url = jar:file:/home/harshithts/Documents/sturctruing/env/lib/python3.12/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/harshithts/.ivy2/cache
The jars for the packages stored in: /home/harshithts/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-17e99a46-f492-4abe-84da-5a43fa51894f;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
:: resolution report :: resolve 186ms :: artifacts dl 5ms
	:: modules in use:
	com.amazonaws#aws-java-sdk-bundle;1.12.262 from central in [default]
	org.apache.hadoop#hadoop-aws;3.3.4 from central in [default]
	org.wildfly.openssl#wildfly-openssl;1.0.7.Final from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	-------------------------

In [2]:
# Explicit schemas for the three bronze JSON datasets.
# Every field is declared nullable; timestamps are kept as raw strings and are
# parsed later in the cleaning cells.

Profile_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("user_id", StringType()),
        ("name", StringType()),
        ("country", StringType()),
        ("account_type", StringType()),
        ("credit_score", IntegerType()),
        ("status", StringType()),
        ("timestamp", StringType()),
    )
])

merchant_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("merchant_id", StringType()),
        ("merchant_name", StringType()),
        ("category", StringType()),
        ("country", StringType()),
        ("risk_level", StringType()),
        ("timestamp", StringType()),
    )
])

# NOTE(review): current_balance is FloatType (single precision); for monetary
# amounts a double or decimal type may be more appropriate -- confirm with the
# consumers of the silver data before changing it.
account_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("account_id", StringType()),
        ("user_id", StringType()),
        ("current_balance", FloatType()),
        ("currency", StringType()),
        ("last_updated", StringType()),
    )
])



In [3]:

# Read the raw (bronze) JSON files from the bucket.
BUCKET_NAME = os.getenv("BUCKET_NAME")
if not BUCKET_NAME:
    # Without this check the f-strings below would silently build
    # "s3a://None/..." paths and fail much later with a confusing error.
    raise RuntimeError("Environment variable BUCKET_NAME must be set")


def _read_bronze_json(folder, schema):
    """Load every ``*.json`` file under ``bronze/<folder>`` of the bucket.

    Args:
        folder: name of the folder directly below ``bronze/``.
        schema: StructType applied to the files instead of schema inference.

    Returns:
        A (lazy) DataFrame over the matching JSON files.
    """
    return (
        spark.read.format("json")
        .schema(schema)
        .load(f"s3a://{BUCKET_NAME}/bronze/{folder}/*.json")
    )


profile_df = _read_bronze_json("profiles-folder", Profile_schema)
merchant_df = _read_bronze_json("merchant-folder", merchant_schema)
account_df = _read_bronze_json("account-folder", account_schema)




25/11/15 16:51:48 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties


In [4]:
# Coalesce each DataFrame down to 2 partitions.
profile_df = profile_df.coalesce(2)
merchant_df = merchant_df.coalesce(2)
account_df = account_df.coalesce(2)

for frame in (profile_df, merchant_df, account_df):
    print("Number of partitions:", frame.rdd.getNumPartitions())

# Upper bound on the rows shipped to the driver per partition for the preview.
PREVIEW_ROWS_PER_PARTITION = 5


def _preview_partition(idx, rows, limit=PREVIEW_ROWS_PER_PARTITION):
    """Yield ``(partition index, total row count, first ``limit`` rows)``.

    The rows are counted where the partition lives, so only a bounded sample
    is sent back to the driver instead of the partition's full contents.
    """
    sample = []
    total = 0
    for row in rows:
        if total < limit:
            sample.append(row)
        total += 1
    yield idx, total, sample


partition_data = profile_df.rdd.mapPartitionsWithIndex(_preview_partition).collect()

# NOTE(review): the previewed rows contain personal data (names); consider
# clearing this cell's output before sharing the notebook.
for idx, total, sample in partition_data:
    print(f"\n🧩 Partition {idx}: {len(sample)} sample rows (of {total} total)")
    for r in sample:
        print(r)




Number of partitions: 2
Number of partitions: 2
Number of partitions: 2


[Stage 0:>                                                          (0 + 2) / 2]


ðŸ§© Partition 0: 5 sample rows
Row(user_id='U0003', name='Michael Wright', country='Syrian Arab Republic', account_type='Savings', credit_score=766, status='suspended', timestamp='2025-03-09T01:55:51.245730')
Row(user_id='U0006', name='Kelsey Garcia', country='Norfolk Island', account_type='Savings', credit_score=576, status='closed', timestamp='2025-08-05T03:56:01.088296')
Row(user_id='U0001', name='Melissa Alexander', country='Mongolia', account_type='Savings', credit_score=771, status='closed', timestamp='2025-09-17T11:34:59.637391')
Row(user_id='U0004', name='Colin Rivera', country='Mauritania', account_type='Current', credit_score=776, status='suspended', timestamp='2025-05-02T05:29:44.735715')
Row(user_id='U0010', name='Michael Murray', country='Comoros', account_type='Current', credit_score=767, status='suspended', timestamp='2025-11-03T10:48:17.294047')

ðŸ§© Partition 1: 5 sample rows
Row(user_id='U0005', name='Elizabeth Pierce', country='Nigeria', account_type='Savings', cr

                                                                                

In [5]:
# Keep only profiles whose user_id starts with "U".
profile_df = profile_df.filter(col("user_id").startswith("U"))

# Derive date, month name, year and time-of-day from the timestamp string,
# and clean up the country column.
profile_df = (
    profile_df
    .withColumn("date", to_date(col("timestamp"), "yyyy-MM-dd"))
    .withColumn("month", date_format("date", "MMMM"))
    .withColumn("year", date_format("date", "yyyy"))
    .withColumn("time", date_format(col("timestamp"), "HH:mm:ss"))
    # NOTE(review): this removes ALL spaces, so "Syrian Arab Republic" becomes
    # "SyrianArabRepublic" -- confirm that this is intended and not just a trim.
    .withColumn("country", regexp_replace(trim(col("country")), " ", ""))
)

# Split the full name into first name and "everything after the first name".
# Limiting the split to 2 parts keeps multi-word surnames intact instead of
# keeping only the second token; trimming and splitting on runs of whitespace
# avoids empty tokens caused by stray spaces.
name_parts = split(trim(col("name")), r"\s+", 2)
profile_df = (
    profile_df
    .withColumn("first_name", name_parts.getItem(0))
    .withColumn("last_name", name_parts.getItem(1))
)

# user_ids that occur more than once (lazy; kept for inspection only).
profile_duplictes = profile_df.groupBy("user_id").count().filter("count > 1")

# Keep exactly one row per user_id: the one with the most recent timestamp.
# dropDuplicates() would keep an arbitrary row, so results could change
# between runs. Rows with identical timestamps are still tie-broken arbitrarily.
profile_latest_first = Window.partitionBy("user_id").orderBy(col("timestamp").desc())
profile_df = (
    profile_df
    .withColumn("_row_rank", row_number().over(profile_latest_first))
    .filter(col("_row_rank") == 1)
    .drop("_row_rank")
)

profile_df = profile_df.select(
    'user_id', 'first_name', 'last_name', 'country', 'account_type',
    'credit_score', 'status', 'date', 'month', 'year', 'time',
)
profile_df.show()


+-------+----------+---------+------------------+------------+------------+---------+----------+---------+----+--------+
|user_id|first_name|last_name|           country|account_type|credit_score|   status|      date|    month|year|    time|
+-------+----------+---------+------------------+------------+------------+---------+----------+---------+----+--------+
|  U0001|   Melissa|Alexander|          Mongolia|     Savings|         771|   closed|2025-09-17|September|2025|11:34:59|
|  U0002|   Lindsey|  Wallace|           Denmark|     Savings|         566|   closed|2025-01-01|  January|2025|00:00:53|
|  U0003|   Michael|   Wright|SyrianArabRepublic|     Savings|         766|suspended|2025-03-09|    March|2025|01:55:51|
|  U0004|     Colin|   Rivera|        Mauritania|     Current|         776|suspended|2025-05-02|      May|2025|05:29:44|
|  U0005| Elizabeth|   Pierce|           Nigeria|     Savings|         845|   active|2025-07-28|     July|2025|16:50:39|
|  U0006|    Kelsey|   Garcia|  

In [6]:
def trim_all_string_columns(merchant_df):
    """Return ``merchant_df`` with spaces trimmed from both ends of every string column.

    Non-string columns and the column order are left unchanged. The work is
    done in a single ``select`` rather than one ``withColumn`` call per column.
    """
    string_cols = {
        field.name
        for field in merchant_df.schema.fields
        if isinstance(field.dataType, StringType)
    }
    return merchant_df.select(
        [
            trim(col(name)).alias(name) if name in string_cols else col(name)
            for name in merchant_df.columns
        ]
    )


# Trim first, so that padded ids such as " M0001" or "M0001 " are validated
# and de-duplicated on their cleaned value rather than their raw value.
merchant_df = trim_all_string_columns(merchant_df)

# Keep only merchants whose merchant_id starts with "M".
merchant_df = merchant_df.filter(col("merchant_id").startswith("M"))

# merchant_ids that occur more than once (lazy; kept for inspection only).
duplicates_key_df = (
    merchant_df.groupBy("merchant_id")
    .count()
    .filter("count > 1")
    .select("merchant_id")
)

# Keep exactly one row per merchant_id: the one with the most recent timestamp.
# dropDuplicates() would keep an arbitrary row, so results could change
# between runs. Rows with identical timestamps are still tie-broken arbitrarily.
merchant_latest_first = Window.partitionBy("merchant_id").orderBy(col("timestamp").desc())
merchant_df = (
    merchant_df
    .withColumn("_row_rank", row_number().over(merchant_latest_first))
    .filter(col("_row_rank") == 1)
    .drop("_row_rank")
)

# Derive date, month name and year from the timestamp string.
merchant_df = (
    merchant_df
    .withColumn("date", to_date(col("timestamp"), "yyyy-MM-dd"))
    .withColumn("month", date_format("date", "MMMM"))
    .withColumn("year", date_format("date", "yyyy"))
)
merchant_df = merchant_df.select(
    'merchant_id', 'merchant_name', 'category', 'country', 'risk_level',
    'date', 'month', 'year',
)


merchant_df.show()



+-----------+--------------------+----------+-------+----------+----------+---------+----+
|merchant_id|       merchant_name|  category|country|risk_level|      date|    month|year|
+-----------+--------------------+----------+-------+----------+----------+---------+----+
|      M0001|Hill, Gonzales an...|    Travel|  India|    Medium|2025-01-01|  January|2025|
|      M0002|      Martinez-Owens|    Retail|  India|       Low|2025-04-24|    April|2025|
|      M0003|Waller, Mueller a...|Restaurant| Russia|      High|2025-03-31|    March|2025|
|      M0004|     Oconnor-Carroll|Restaurant|    USA|       Low|2025-09-26|September|2025|
|      M0005|       Bennett-Moore|    Travel|  India|       Low|2025-10-25|  October|2025|
|      M0006|         Torres-Long|    Retail|  India|       Low|2025-06-24|     June|2025|
|      M0007|        Jones-Chavez|    Travel| Russia|    Medium|2025-01-02|  January|2025|
|      M0008|       Harris-Murphy|Restaurant| Russia|      High|2025-02-23| February|2025|

In [7]:
# Normalise the id and currency columns to upper case BEFORE validating the
# prefixes, so that a value such as "a0001" is repaired instead of being
# silently dropped by the prefix filter.
account_df = (
    account_df
    .withColumn("account_id", upper(col("account_id")))
    .withColumn("user_id", upper(col("user_id")))
    .withColumn("currency", upper(col("currency")))
)

# Keep only rows whose account_id starts with "A" and whose user_id starts with "U".
account_df = account_df.filter(
    col("account_id").startswith("A") & col("user_id").startswith("U")
)

# Derive date, month name, year and time-of-day from the last_updated string.
account_df = (
    account_df
    .withColumn("date", to_date(col("last_updated"), "yyyy-MM-dd"))
    .withColumn("month", date_format("date", "MMMM"))
    .withColumn("year", date_format("date", "yyyy"))
    # NOTE(review): this column is created as "Time" (capital T) while the
    # select below refers to it as 'time'; the name is left unchanged here
    # because renaming it would alter the schema of the written data.
    .withColumn("Time", date_format(col("last_updated"), "HH:mm:ss"))
)

# account_ids that occur more than once (lazy; kept for inspection only).
duplicate_keys = account_df.groupBy("account_id").count().filter("count > 1")

# Keep exactly one row per account_id: the most recently updated one.
# dropDuplicates() would keep an arbitrary row, so results could change
# between runs. Rows with identical last_updated values are still tie-broken
# arbitrarily.
account_latest_first = Window.partitionBy("account_id").orderBy(col("last_updated").desc())
account_df = (
    account_df
    .withColumn("_row_rank", row_number().over(account_latest_first))
    .filter(col("_row_rank") == 1)
    .drop("_row_rank")
)

# Keep only accounts with a positive balance and a known currency. This runs
# after de-duplication so that it is applied to each account's latest record.
account_df = account_df.filter((col("current_balance") > 0) & col("currency").isNotNull())

# NOTE(review): only the preview below is narrowed to these columns;
# account_df itself keeps every column it has at this point.
account_df.select(
    'account_id', 'user_id', 'current_balance', 'currency',
    'date', 'month', 'year', 'time', 'last_updated',
).show()

+----------+-------+---------------+--------+----------+---------+----+--------+--------------------+
|account_id|user_id|current_balance|currency|      date|    month|year|    time|        last_updated|
+----------+-------+---------------+--------+----------+---------+----+--------+--------------------+
|     A0001|  U0001|       93336.67|     INR|2025-06-09|     June|2025|06:39:38|2025-06-09T06:39:...|
|     A0002|  U0002|      171217.88|     INR|2025-07-26|     July|2025|10:07:27|2025-07-26T10:07:...|
|     A0003|  U0003|      198817.06|     INR|2025-03-18|    March|2025|20:57:35|2025-03-18T20:57:...|
|     A0004|  U0004|      134945.77|     INR|2025-01-30|  January|2025|22:42:45|2025-01-30T22:42:...|
|     A0005|  U0005|      133217.58|     INR|2025-08-22|   August|2025|09:30:15|2025-08-22T09:30:...|
|     A0006|  U0006|      117262.16|     INR|2025-09-20|September|2025|10:20:54|2025-09-20T10:20:...|
|     A0007|  U0007|      154211.77|     INR|2025-04-03|    April|2025|07:02:39|20

In [8]:
# Save mode used for the silver layer. The default "append" preserves the
# previous behaviour, but note that the bronze cells re-read every *.json file
# on each run, so re-running this notebook with "append" writes the same rows
# again. Set SILVER_WRITE_MODE=overwrite for an idempotent full rebuild.
SILVER_WRITE_MODE = os.getenv("SILVER_WRITE_MODE") or "append"

# Persist each cleaned DataFrame as parquet under silver/<folder>/.
for silver_df, silver_folder in (
    (profile_df, "processed-profiles"),
    (merchant_df, "processed-merchants"),
    (account_df, "processed-accounts"),
):
    (
        silver_df.write.format("parquet")
        .mode(SILVER_WRITE_MODE)
        .save(f"s3a://{BUCKET_NAME}/silver/{silver_folder}/")
    )


                                                                                