In [0]:
!pip install faker

In [0]:
import random
from faker import Faker

In [0]:
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

## Data Generator

In [0]:
# Initialize Faker with fixed seeds so a fresh "Run All" regenerates the same rows.
import random as _seed_random  # aliased: the star import above may rebind `random`

SEED = 42
Faker.seed(SEED)         # seeds the generator shared by Faker instances
_seed_random.seed(SEED)  # seeds the stdlib generator used for the Salary column
fake = Faker()

In [0]:
# Explicit schema for the synthetic customer table: every column is required
# (nullable=False); all are strings except Salary, which is a double.
schema = StructType([
    StructField(column_name, column_type, False)
    for column_name, column_type in (
        ("ID", StringType()),
        ("Name", StringType()),
        ("Email", StringType()),
        ("Phone", StringType()),
        ("SSN", StringType()),
        ("Credit Card", StringType()),
        ("Address", StringType()),
        ("DOB", StringType()),
        ("Salary", DoubleType()),
        ("Account Number", StringType()),
    )
])

In [0]:
# Build 100 synthetic customer records, one tuple per row, in schema column order.
#
# The earlier `from pyspark.sql.functions import *` rebinds the name `round` to
# Spark's column function, which rejects a plain Python float. Newer PySpark
# releases may also export a `random` function that hides the stdlib module.
# Bind the stdlib versions explicitly so this cell works on a fresh "Run All".
import builtins
import random as _py_random

data = [
    (
        fake.uuid4(),                                             # ID
        fake.name(),                                              # Name
        fake.email(),                                             # Email
        fake.phone_number(),                                      # Phone
        fake.ssn(),                                               # SSN
        fake.credit_card_number(),                                # Credit Card
        fake.address().replace("\n", ", "),                       # Address, flattened to one line
        fake.date_of_birth(minimum_age=18, maximum_age=90).strftime("%Y-%m-%d"),  # DOB
        builtins.round(_py_random.uniform(30000, 150000), 2),     # Salary, 2 decimals
        fake.bban()                                               # Account Number
    )
    for _ in range(100)
]

In [0]:
# Materialise the generated tuples as a Spark DataFrame using the explicit schema.
df_spark = spark.createDataFrame(data=data, schema=schema)

In [0]:
# Show one generated record as a quick sanity check of the unmasked data.
display(df_spark.head(1))

# Masking Concepts

In [0]:
def maskEmail(email: str):
  """Mask the local part of an e-mail address, keeping its first two characters.

  Example: ``"john.doe@example.com"`` -> ``"jo*****@example.com"``.

  Args:
    email: Address to mask. ``None`` is accepted because a null column value
      reaches a Python UDF as ``None``.

  Returns:
    The masked address; ``None`` when ``email`` is ``None``; ``"*****"`` when
    the value contains no ``@``, so a malformed value is never echoed back.
  """
  if email is None:
    return None
  # Split once; everything after the first '@' is kept as the domain.
  local_part, separator, domain = email.partition('@')
  if not separator:
    # Not a well-formed address: redact fully instead of raising IndexError.
    return "*****"
  return f"{local_part[:2]}*****@{domain}"

def maskPhoneNumber(phone: str):
  """Mask a phone number, exposing only its last four characters.

  Example: ``"555-123-4567"`` -> ``"***-**-4567"``.

  Args:
    phone: Phone number text. ``None`` is accepted because a null column value
      reaches a Python UDF as ``None``.

  Returns:
    ``"***-**-"`` followed by the last four characters of ``phone`` (the whole
    value if it is shorter than four characters), or ``None`` for ``None``.
  """
  if phone is None:
    # Propagate nulls instead of failing on `None[-4:]`.
    return None
  return f"***-**-{phone[-4:]}"

def maskAccountNumber(acct: str):
  """Mask an account number, exposing only its last four characters.

  Example: ``"GB12345678901234"`` -> ``"********1234"``.

  Args:
    acct: Account number text. ``None`` is accepted because a null column
      value reaches a Python UDF as ``None``.

  Returns:
    Eight asterisks followed by the last four characters of ``acct`` (the
    whole value if it is shorter than four characters), or ``None`` for
    ``None``.
  """
  if acct is None:
    # Propagate nulls instead of failing on `None[-4:]`.
    return None
  return f"********{acct[-4:]}"

def maskCreditCard(cc: str):
  """Mask a credit-card number, exposing only its last four characters.

  Example: ``"4111111111111111"`` -> ``"************1111"``.

  Args:
    cc: Card number text. ``None`` is accepted because a null column value
      reaches a Python UDF as ``None``.

  Returns:
    Twelve asterisks followed by the last four characters of ``cc`` (the
    whole value if it is shorter than four characters), or ``None`` for
    ``None``.
  """
  if cc is None:
    # Propagate nulls instead of failing on `None[-4:]`.
    return None
  return f"************{cc[-4:]}"

In [0]:
# Wrap each Python masking helper as a Spark UDF that returns a string column.
mailMasker_udf = udf(maskEmail, returnType=StringType())
phoneMasker_udf = udf(maskPhoneNumber, returnType=StringType())
acctMasker_udf = udf(maskAccountNumber, returnType=StringType())
ccMasker_udf = udf(maskCreditCard, returnType=StringType())

In [0]:
# Append one masked column per sensitive field; the original columns are kept
# alongside so masked and unmasked values can be compared row by row.
masked_df = (
    df_spark
    .withColumn("masked_email", mailMasker_udf(col("Email")))
    .withColumn("masked_phone", phoneMasker_udf(col("Phone")))
    .withColumn("masked_acct", acctMasker_udf(col("Account Number")))
    .withColumn("masked_cc", ccMasker_udf(col("Credit Card")))
)

In [0]:
# Show one record with its masked columns next to the originals.
display(masked_df.head(1))