In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, mean, stddev, min, max, when, isnan
from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType
from pyspark.ml.stat import Correlation
import pandas as pd
import numpy as np



In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Build (or reuse) the Spark session for this notebook with explicit
# shuffle-partition, memory and core settings.
# NOTE(review): if a session already exists when this cell runs, getOrCreate()
# returns it and the memory/core settings may not take effect — confirm on the
# target cluster.
session_builder = (
    SparkSession.builder
    .appName("Data processing")
    .config("spark.sql.shuffle.partitions", "200")
    .config("spark.executor.memory", "4g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.cores", "2")
)
spark = session_builder.getOrCreate()

In [0]:
# Load the credit-card transactions CSV; the first row holds the column names
# and column types are inferred from the file contents.
data = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv("/FileStore/tables/creditcard.csv")
)


In [0]:
# List the files uploaded under the FileStore tables directory to confirm
# which CSVs are available (the listing is shown as the cell output).
dbutils.fs.ls('FileStore/tables/')

Out[4]: [FileInfo(path='dbfs:/FileStore/tables/creditcard-1.csv', name='creditcard-1.csv', size=150828752, modificationTime=1733142386000),
 FileInfo(path='dbfs:/FileStore/tables/creditcard.csv', name='creditcard.csv', size=150828752, modificationTime=1732695303000),
 FileInfo(path='dbfs:/FileStore/tables/train_identity.csv', name='train_identity.csv', size=26529680, modificationTime=1733142316000),
 FileInfo(path='dbfs:/FileStore/tables/train_transaction.csv', name='train_transaction.csv', size=683351067, modificationTime=1733142468000)]

In [0]:
# Load creditcard.csv with a header row and inferred column types.
# NOTE(review): this repeats the load performed in an earlier cell with the
# same path and options; one of the two cells is probably redundant.
csv_reader = spark.read.options(header=True, inferSchema=True)
data = csv_reader.csv('/FileStore/tables/creditcard.csv')

In [0]:
# Count how many rows fall into each value of the "Class" column and print the table.
class_counts = data.groupBy("Class").count()
class_counts.show()

+-----+------+
|Class| count|
+-----+------+
|    1|   492|
|    0|284315|
+-----+------+



In [0]:
# Implement Efficient Data Partitioning Strategies

In [0]:
# Calculate the number of partitions from an estimated data size, aiming for
# partitions of at most `target_partition_size` bytes each.
# NOTE(review): SizeEstimator is applied to the JVM DataFrame handle
# (`data._jdf`), which may not reflect the size of the underlying rows —
# confirm the estimate is meaningful for this dataset.
data_size_in_bytes = spark._jvm.org.apache.spark.util.SizeEstimator.estimate(data._jdf)
target_partition_size = 128 * 1024 * 1024  # 128 MB

# Ceiling division, so a remainder smaller than the target still gets its own
# partition instead of being folded into an oversized one.
optimal_partitions = (data_size_in_bytes + target_partition_size - 1) // target_partition_size

# repartition() requires a positive partition count. Plain floor division
# returned 0 whenever the estimate was below the 128 MB target, which makes the
# following repartition call fail. A comparison is used rather than max(1, ...)
# because this notebook imports `max` from pyspark.sql.functions, shadowing the
# Python builtin.
if optimal_partitions < 1:
    optimal_partitions = 1


In [0]:
# Redistribute the rows into `optimal_partitions` partitions keyed on "Class".
# NOTE(review): the class-distribution output shows only two distinct Class
# values, so rows can land in at most two of the partitions — confirm that this
# skew is intended.
class_column = col("Class")
partitioned_data = data.repartition(optimal_partitions, class_column)

In [0]:

# Persist the repartitioned data as Parquet, one sub-directory per "Class"
# value, replacing any previous output at the same location.
(
    partitioned_data.write
    .mode("overwrite")
    .partitionBy("Class")
    .parquet("partitioned_data/")
)


In [0]:
# Task 3: Create Aggregations and Statistical Summaries

In [0]:
# Per-class summary of the "Amount" column: row count, mean, standard
# deviation, minimum and maximum.
# `count`, `mean`, `stddev`, `min` and `max` here are the pyspark.sql.functions
# versions imported at the top of the notebook, not the Python builtins.
amount_summary_exprs = [
    count("*").alias("count"),
    mean("Amount").alias("avg_amount"),
    stddev("Amount").alias("std_amount"),
    min("Amount").alias("min_amount"),
    max("Amount").alias("max_amount"),
]
agg_data = data.groupBy("Class").agg(*amount_summary_exprs)

agg_data.show()



+-----+------+------------------+------------------+----------+----------+
|Class| count|        avg_amount|        std_amount|min_amount|max_amount|
+-----+------+------------------+------------------+----------+----------+
|    1|   492|122.21132113821139| 256.6832882977121|       0.0|   2125.87|
|    0|284315| 88.29102242231887|250.10509222589212|       0.0|  25691.16|
+-----+------+------------------+------------------+----------+----------+



In [0]:
# Summary statistics (count, mean, stddev, min, max) for every column,
# rendered with the notebook's table display.
summary_stats = data.describe()
summary_stats.display()

summary,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
count,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0,284807.0
mean,94813.85957508069,9.516248586879277e-16,-4.1513836117258586e-17,-1.3156692677161953e-15,1.4976914722303293e-15,8.941441625255696e-16,1.3971002539462024e-15,-2.6664656275316093e-16,1.528826849318273e-16,-2.2894481447166985e-15,2.176532506335261e-15,1.8074485571052584e-15,-1.2837355476259965e-15,1.8042551850962386e-15,9.899453227961665e-16,4.9720802180439706e-15,1.4585726651198357e-15,-4.0715493115003616e-16,1.2214647934501085e-15,1.0039163253356284e-15,4.770099438473463e-16,3.718282533002536e-16,-4.630389413078844e-16,2.985802828433598e-16,4.504351011597839e-15,7.153153300204558e-16,1.636403568872131e-15,-3.5685932200797293e-16,-1.2593860860572196e-16,88.34961925093698,0.00172748563062
stddev,47488.145954566266,1.9586958038574889,1.6513085794769968,1.51625500517777,1.4158685749409217,1.3802467340314388,1.332271089757576,1.2370935981826656,1.194352902669203,1.09863208922432,1.0888497654025182,1.0207130277115588,0.9992013895301411,0.9952742301251544,0.9585956112570638,0.9153160116104386,0.87625288738837,0.8493370636743882,0.8381762095288418,0.8140405007685789,0.7709250248871173,0.734524014371313,0.725701560440911,0.6244602955949902,0.6056470678271607,0.5212780705409428,0.4822270132610572,0.4036324949650301,0.3300832641602508,250.1201092401884,0.0415271896354648
min,0.0,-56.407509631329,-72.7157275629303,-48.3255893623954,-5.68317119816995,-113.743306711146,-26.1605059358433,-43.5572415712451,-73.2167184552674,-13.4340663182301,-24.5882624372475,-4.79747346479757,-18.6837146333443,-5.79188120632084,-19.2143254902614,-4.49894467676621,-14.1298545174931,-25.1627993693248,-9.49874592104677,-7.21352743017759,-54.497720494566,-34.8303821448146,-10.933143697655,-44.8077352037913,-2.83662691870341,-10.2953970749851,-2.60455055280817,-22.5656793207827,-15.4300839055349,0.0,0.0
max,172792.0,2.45492999121121,22.0577289904909,9.38255843282114,16.8753440335975,34.8016658766686,73.3016255459646,120.589493945238,20.0072083651213,15.5949946071278,23.7451361206545,12.0189131816199,7.8483920756446,7.12688295859376,10.5267660517847,8.87774159774277,17.3151115176278,9.25352625047285,5.04106918541184,5.59197142733558,39.4209042482199,27.2028391573154,10.5030900899454,22.5284116897749,4.58454913689817,7.51958867870916,3.5173456116238,31.6121981061363,33.8478078188831,25691.16,1.0


In [0]:
# Data quality check

In [0]:
# Null check: for each column, count the rows whose value is null.
# count() skips nulls, and when() without otherwise() yields null for
# non-matching rows, so only the rows where the column is null are counted.
null_count_exprs = []
for column_name in data.columns:
    is_missing = col(column_name).isNull()
    null_count_exprs.append(count(when(is_missing, column_name)).alias(column_name))

null_values = data.select(null_count_exprs)
null_values.show()

+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+
|Time| V1| V2| V3| V4| V5| V6| V7| V8| V9|V10|V11|V12|V13|V14|V15|V16|V17|V18|V19|V20|V21|V22|V23|V24|V25|V26|V27|V28|Amount|Class|
+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+
|   0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|     0|    0|
+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+



In [0]:
# Duplicates check: rows beyond the first occurrence of each distinct row,
# computed as total row count minus distinct row count.
total_rows = data.count()
distinct_rows = data.distinct().count()
duplicates_count = total_rows - distinct_rows

In [0]:
# Report the duplicate-row count computed in the previous cell.
print(f"Duplicate Rows: {duplicates_count}")


Duplicate Rows: 1081


In [0]:
# Amount range check: negative amounts are treated as invalid, so count the
# rows whose "Amount" is below zero.
negative_amount = data["Amount"] < 0
invalid_amounts = data.where(negative_amount).count()


In [0]:
# Report the number of rows with a negative Amount counted in the previous cell.
print(f"invalid_amounts Rows: {invalid_amounts}")


invalid_amounts Rows: 0
