In [1]:
"""Big Data Infrastructure: report the PySpark version available to this kernel."""
import pyspark

# Printing the version up front records which Spark API the notebook ran against.
print(pyspark.__version__)

3.4.4


In [None]:
import pyspark

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

# 1. Start (or reuse) the Spark session that drives this pipeline
spark = SparkSession.builder.appName("RBC_Risk_Pipeline").getOrCreate()

# 2. Load the raw loan data
# header=True takes the column names from the first row;
# inferSchema=True lets Spark detect column data types automatically
df = spark.read.csv("data/raw_loans.csv", header=True, inferSchema=True)

# 3. SQL-style transformation: risk categorization by credit score
# Branches are evaluated in order, so "Medium Risk" covers 550 <= credit_score < 700.
# NOTE(review): a null credit_score fails both comparisons and therefore falls
# through to "Low Risk" -- confirm that is acceptable for rows with a missing score.
df_processed = df.withColumn(
    "risk_rating",
    when(col("credit_score") < 550, "High Risk")
    .when(col("credit_score") < 700, "Medium Risk")
    .otherwise("Low Risk"),
)

# 4. Feature engineering: Debt-to-Income (DTI) ratio = loan_amount / customer_income
# NOTE(review): rows with a zero or missing customer_income will not get a usable
# DTI -- confirm how those rows should be handled downstream.
df_processed = df_processed.withColumn("DTI", col("loan_amount") / col("customer_income"))

# 5. Save the output in Parquet format
# Parquet is a columnar (stores data column-wise), compressed (smaller files and
# faster reads) and efficient storage format optimized for big data processing.
# Spark can read Parquet files much faster than CSV files.
# .write.mode("overwrite") replaces any existing output at this path.
df_processed.write.mode("overwrite").parquet("data/processed_loans.parquet")

print("Spark Pipeline Complete: Data cleaned and Risk Ratings assigned.")

# DataFrame.show() prints the rows itself and returns None, so it must not be
# wrapped in print() -- that would emit a stray "None" line after the table.
df_processed.show(5)

Spark Pipeline Complete: Data cleaned and Risk Ratings assigned.
+-------+---------------+------------+-----------+--------------------+-----------------+---------------+-----------+-------------------+
|loan_id|customer_income|credit_score|loan_amount|       interest_rate|employment_status|current_balance|risk_rating|                DTI|
+-------+---------------+------------+-----------+--------------------+-----------------+---------------+-----------+-------------------+
|   1000|          45795|         791|      31548| 0.23334959810163441|         Employed|          34447|   Low Risk| 0.6888961677038978|
|   1001|          30860|         498|      40434| 0.18672668626404276|    Self-Employed|          18131|  High Risk| 1.3102397926117952|
|   1002|         133694|         661|      30039| 0.11104239166864148|         Employed|          28429|Medium Risk|0.22468472781127052|
|   1003|         149879|         740|      37530| 0.18686620450360128|         Employed|          18705|  