In [4]:
# Print the schema of each silver-layer DataFrame, in the order
# users -> cards -> transactions, so the column names and types can be
# checked before the gold-layer tables are built.
for _silver_df in (df_users, df_cards, df_transactions):
    _silver_df.printSchema()


root
 |-- address: string (nullable = true)
 |-- birth_month: string (nullable = true)
 |-- birth_year: string (nullable = true)
 |-- credit_score: string (nullable = true)
 |-- current_age: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- id: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- num_credit_cards: string (nullable = true)
 |-- per_capita_income: float (nullable = true)
 |-- retirement_age: string (nullable = true)
 |-- total_debt: float (nullable = true)
 |-- yearly_income: float (nullable = true)
 |-- errors: string (nullable = true)

root
 |-- acct_open_date: date (nullable = true)
 |-- card_brand: string (nullable = true)
 |-- card_number: string (nullable = true)
 |-- card_on_dark_web: string (nullable = true)
 |-- card_type: string (nullable = true)
 |-- client_id: string (nullable = true)
 |-- credit_limit: double (nullable = true)
 |-- cvv: string (nullable = true)
 |-- expires: date (nullable = true)

In [5]:

# Gold Layer Star Schema Creation

from awsglue.context import GlueContext
from pyspark.context import SparkContext

# Glue & Spark Context
# Obtain the SparkContext via getOrCreate(), wrap it in a GlueContext, and
# take the SparkSession from that wrapper; all reads below go through `spark`.
sc = SparkContext.getOrCreate()
glueContext = GlueContext(sc)
spark = glueContext.spark_session

# --- 1. Read data from Silver Layer ---
# Root S3 prefix of the silver layer; each dataset lives in its own
# "<name>-topic/" sub-folder and is stored as Parquet.
silver_bucket = "s3://aws-project-1-mskish/silver-layer"

# Load the three silver datasets in one statement. The reads are evaluated
# left to right: users, cards, transactions.
df_users, df_cards, df_transactions = (
    spark.read.parquet(f"{silver_bucket}/users-topic/"),
    spark.read.parquet(f"{silver_bucket}/cards-topic/"),
    spark.read.parquet(f"{silver_bucket}/transactions-topic/"),
)

# --- 2. Build Dimensions & Fact ---

# Dimension: Users
# One projection over the silver users data: the source `id` column is
# exposed as `user_id`, followed by the descriptive attributes kept for the
# dimension. Column order matches the list below.
# NOTE(review): the schema printed for df_users shows credit_score,
# current_age, num_credit_cards and retirement_age (and id) as strings;
# they are passed through untouched here. Consider casting them in the gold
# layer -- confirm with downstream consumers first.
dim_users = df_users.select(
    df_users["id"].alias("user_id"),
    "gender",
    "address",
    "yearly_income",
    "credit_score",
    "current_age",
    "num_credit_cards",
    "per_capita_income",
    "retirement_age",
    "total_debt",
)

# Dimension: Cards
# One projection over the silver cards data: the source `id` column is
# exposed as `card_id`; the remaining columns are carried over unchanged and
# in the order listed.
# NOTE(review): client_id presumably refers to the user that owns the card
# (dim_users.user_id) -- verify against the silver data.
dim_cards = df_cards.select(
    df_cards["id"].alias("card_id"),
    "client_id",
    "card_type",
    "card_brand",
    "credit_limit",
    "acct_open_date",
    "expires",
    "num_cards_issued",
    "has_chip",
)

# Fact: Transactions
# One projection over the silver transactions data. Two columns are renamed
# on the way out: `id` -> `transaction_id` and `date` -> `transaction_date`.
# All other columns keep their names; output column order matches the list.
# NOTE(review): client_id and card_id look like the keys into dim_users and
# dim_cards respectively -- confirm before relying on them for joins.
fact_transactions = df_transactions.select(
    df_transactions["id"].alias("transaction_id"),
    "client_id",
    "card_id",
    "amount",
    df_transactions["date"].alias("transaction_date"),
    "merchant_id",
    "merchant_city",
    "merchant_state",
    "use_chip",
    "zip",
)

# --- 3. Write to Gold Layer ---
# Root S3 prefix of the gold layer; each table gets its own sub-folder.
gold_bucket = "s3://aws-project-1-mskish/gold-layer"

# Persist the two dimensions and the fact table as Parquet. The "overwrite"
# save mode replaces whatever already exists at each target path, so
# re-running the cell rewrites the tables rather than appending to them.
dim_users.write.parquet(f"{gold_bucket}/dim_users/", mode="overwrite")
dim_cards.write.parquet(f"{gold_bucket}/dim_cards/", mode="overwrite")
fact_transactions.write.parquet(f"{gold_bucket}/fact_transactions/", mode="overwrite")

# Only reached when all three writes finished without raising.
print("✅ Gold layer star schema created successfully.")


✅ Gold layer star schema created successfully.
