# Intro to Spark
## Husayn El Sharif
## Refer to: https://academy.zerotomastery.io/courses/data-engineering-bootcamp/lectures/60978439
### Use environment spark4_env001 (see spark4_env001.yml)

In [1]:
# imports

from pyspark.sql import SparkSession # SparkSession (from PySpark's SQL module) is used below to create the session

from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType # Schema building blocks: the struct wrappers plus the column data types used in the explicit schema

In [2]:
# Build the SparkSession used by every later cell.
# getOrCreate() hands back the already-running session if there is one,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("MyFirstSparkApp")  # Application name
    .master("local[*]")  # Run locally, using all available cores
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/12/25 08:52:22 WARN Utils: Your hostname, Husayn-SLS2, resolves to a loopback address: 127.0.1.1; using 10.255.255.254 instead (on interface lo)
25/12/25 08:52:22 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/12/25 08:52:23 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Report the version string exposed by the active session (spark.version)
print(f"The Apache Spark version is: {spark.version}")

The Apache Spark version is: 4.1.0


In [4]:
# Explicit schema for the CSV, declared up front instead of being inferred.
# Each entry below is (column name, Spark data type); the trailing True passed
# to StructField marks every column as nullable.
schema = StructType([
    StructField(column_name, data_type(), True)
    for column_name, data_type in [
        ("step", IntegerType),
        ("type", StringType),
        ("amount", DoubleType),
        ("nameOrig", StringType),
        ("oldbalanceOrg", DoubleType),
        ("newbalanceOrig", DoubleType),
        ("nameDest", StringType),
        ("oldbalanceDest", DoubleType),
        ("newbalanceDest", DoubleType),
        ("isFraud", IntegerType),
        ("isFlaggedFraud", IntegerType),
    ]
])

In [5]:
# Load the CSV into a DataFrame, letting Spark work out the column types itself.
# Reader options:
#   header      - treat the first row as column names
#   inferSchema - derive data types from the data (can be slow for large datasets)
#   quote       - double quote is the quoting character
#   mode        - PERMISSIVE sets fields of malformed lines to null
#                 (alternatives: DROPMALFORMED drops malformed lines,
#                 FAILFAST throws an error)
df_inferred_schema = (
    spark.read
    .options(header=True, inferSchema=True, quote='"', mode="PERMISSIVE")
    .csv("example_csv_data/example_data.csv")
)

                                                                                

In [6]:
# Load the same CSV, this time supplying the schema defined earlier rather than
# inferring it (much faster, since the types do not have to be derived from the data).
# Reader options:
#   header - treat the first row as column names
#   quote  - double quote is the quoting character
#   mode   - PERMISSIVE: values that do not fit the schema come back as null
#            (alternatives: DROPMALFORMED drops malformed lines,
#            FAILFAST throws an error)
df_defined_schema = (
    spark.read
    .schema(schema)
    .options(header=True, quote='"', mode="PERMISSIVE")
    .csv("example_csv_data/example_data.csv")
)

In [7]:
# Helpful methods
df_defined_schema.count()  # Number of rows in the DataFrame; displayed as the cell output because it is the last expression

1048575

In [8]:
df_defined_schema.show(10)  # Print the first 10 rows of the DataFrame as a text table

+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|step|    type|  amount|   nameOrig|oldbalanceOrg|newbalanceOrig|   nameDest|oldbalanceDest|newbalanceDest|isFraud|isFlaggedFraud|
+----+--------+--------+-----------+-------------+--------------+-----------+--------------+--------------+-------+--------------+
|   1| PAYMENT| 9839.64|C1231006815|     170136.0|     160296.36|M1979787155|           0.0|           0.0|      0|             0|
|   1| PAYMENT| 1864.28|C1666544295|      21249.0|      19384.72|M2044282225|           0.0|           0.0|      0|             0|
|   1|TRANSFER|   181.0|C1305486145|        181.0|           0.0| C553264065|           0.0|           0.0|      1|             0|
|   1|CASH_OUT|   181.0| C840083671|        181.0|           0.0|  C38997010|       21182.0|           0.0|      1|             0|
|   1| PAYMENT|11668.14|C2048537720|      41554.0|      29885.86|M1230701703|      

In [9]:
# Keep only three of the columns, referring to them by name
df2 = df_defined_schema.select(
    "step",
    "type",
    "amount",
)
df2.show(n=5)  # Preview the first 5 rows of the narrowed DataFrame

+----+--------+--------+
|step|    type|  amount|
+----+--------+--------+
|   1| PAYMENT| 9839.64|
|   1| PAYMENT| 1864.28|
|   1|TRANSFER|   181.0|
|   1|CASH_OUT|   181.0|
|   1| PAYMENT|11668.14|
+----+--------+--------+
only showing top 5 rows


In [10]:
# Select two columns unchanged plus a derived one: the amount expressed in cents.
# "amount" is a double in the schema, so amount_cents is a double as well
# (e.g. 983964.0), not an integer.
df3 = df_defined_schema.select(
    "step",
    "type",
    (df_defined_schema.amount * 100).alias("amount_cents"),
)

df3.show(n=5)  # Preview the first 5 rows of the new DataFrame

+----+--------+------------+
|step|    type|amount_cents|
+----+--------+------------+
|   1| PAYMENT|    983964.0|
|   1| PAYMENT|    186428.0|
|   1|TRANSFER|     18100.0|
|   1|CASH_OUT|     18100.0|
|   1| PAYMENT|   1166814.0|
+----+--------+------------+
only showing top 5 rows


In [11]:
# Stop the Spark session
spark.stop() # Shut the session down once the work is done so its resources are released; especially important in production environments