## This code generates fake data and stores it in DBFS

In [0]:
%pip install faker
from faker import Faker
from random import randint, choice
from time import sleep
import pandas as pd
from datetime import datetime
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType

# Initialize the Faker generator.
# This single module-level instance is what generate_data() calls for every
# fake value (names, addresses, timestamps, UUIDs). No seed is set in the
# code visible here, so generated values are expected to differ between runs.
fake = Faker()



# Define the schema for each table.
# Every column in every table is nullable, so each schema is spelled out as
# (column name, Spark type) pairs and wrapped into StructFields in one place.
transactions_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("transaction_id", StringType()),
        ("customer_id", StringType()),
        ("branch_id", StringType()),
        ("channel", StringType()),
        ("transaction_type", StringType()),
        ("amount", DoubleType()),
        ("currency", StringType()),
        ("timestamp", TimestampType()),
        ("status", StringType()),
    )
])

customers_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("customer_id", StringType()),
        ("name", StringType()),
        ("email", StringType()),
        ("phone", StringType()),
        ("address", StringType()),
        ("credit_score", IntegerType()),
        ("join_date", TimestampType()),
        ("last_update", TimestampType()),
    )
])

branches_schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("branch_id", StringType()),
        ("name", StringType()),
        ("location", StringType()),
        ("timezone", StringType()),
    )
])

# Registry mapping each table name to its schema; generate_data() looks up
# the column names to fill from here.
tables = dict(
    transactions=transactions_schema,
    customers=customers_schema,
    branches=branches_schema,
)

# Function to generate data for a specific table
def generate_data(table_name, num_entries=1):
    """Generate fake rows for one of the tables registered in ``tables``.

    Values are chosen per column name, in schema column order. A column name
    without a dedicated rule falls back to a random UUID string.

    NOTE: the ``*_id`` columns are drawn at random from fixed ranges, so they
    are not guaranteed to be unique within a table.

    Args:
        table_name: Key into the module-level ``tables`` dict
            ("transactions", "customers" or "branches").
        num_entries: Number of rows to generate.

    Returns:
        A pandas DataFrame with one column per schema field, in schema order.
        The columns are present even when ``num_entries`` is 0.

    Raises:
        KeyError: If ``table_name`` is not a key of ``tables``.
    """
    # Function-scope import: the file-level imports only bring in ``datetime``.
    from datetime import timedelta

    # Branch location and timezone are kept as pairs so a branch row can never
    # combine, for example, 'London, UK' with 'PST'.
    branch_sites = [
        ('New York, USA', 'EST'),
        ('London, UK', 'GMT'),
        ('Toronto, Canada', 'EST'),
        ('Sydney, Australia', 'AEST'),
        ('Los Angeles, USA', 'PST'),
    ]

    # The column list does not change between rows, so look it up once.
    columns = tables[table_name].fieldNames()

    data = []
    for _ in range(num_entries):
        row = {}
        # One site per branch row, shared by the 'location' and 'timezone' columns.
        branch_site = choice(branch_sites) if table_name == "branches" else None
        for column in columns:
            if column == 'transaction_id':
                row[column] = "T" + str(randint(1000, 9999))
            elif column == 'customer_id':
                row[column] = "C" + str(randint(3000, 8000))
            elif column == 'branch_id':
                row[column] = "B" + str(randint(101, 110)).zfill(3)
            elif column == 'channel':
                row[column] = choice(['online', 'branch', 'mobile', 'ATM'])
            elif column == 'transaction_type':
                row[column] = choice(['deposit', 'withdrawal', 'transfer', 'balance_check'])
            elif column == 'amount':
                # The schema declares this column as DoubleType, so emit a
                # float rather than an int (same values: 100.0 .. 100000.0).
                row[column] = float(randint(1, 1000) * 100)
            elif column == 'currency':
                row[column] = choice(['USD', 'EUR', 'GBP'])
            elif column == 'timestamp':
                row[column] = fake.date_time_this_year()
            elif column == 'status':
                row[column] = choice(['success', 'pending', 'failed'])
            elif column == 'name':
                if table_name == "branches":
                    row[column] = fake.company_suffix() + " Branch"
                else:
                    row[column] = fake.name()
            elif column == 'email':
                first_name = fake.first_name().lower()
                row[column] = f"{first_name}85@example.net"
            elif column == 'phone':
                row[column] = ''.join(str(randint(0, 9)) for _ in range(10))
            elif column == 'address':
                row[column] = fake.address()
            elif column == 'credit_score':
                row[column] = randint(300, 850)
            elif column == 'join_date':
                row[column] = fake.date_time_between(start_date='-5y', end_date=datetime.today())
            elif column == 'last_update':
                # Within the last year, but never earlier than this row's
                # join_date (which is filled first when the schema lists it
                # before last_update).
                now = datetime.today()
                earliest = now - timedelta(days=365)
                joined = row.get('join_date')
                if joined is not None and joined > earliest:
                    earliest = joined
                row[column] = fake.date_time_between(start_date=earliest, end_date=now)
            elif column == 'location':
                if table_name == "branches":
                    row[column] = branch_site[0]
                else:
                    row[column] = fake.city()
            elif column == 'timezone':
                if table_name == "branches":
                    row[column] = branch_site[1]
                else:
                    row[column] = choice(['UTC', 'EST', 'CST', 'MST', 'PST'])
            else:
                row[column] = fake.uuid4()
        data.append(row)
    # Passing ``columns`` keeps the expected columns even when no rows exist.
    return pd.DataFrame(data, columns=columns)

# Build the static reference tables as pandas DataFrames:
# 1000 customer rows and 10 branch rows.
customers_pd_df = generate_data("customers", num_entries=1000)
branches_pd_df = generate_data("branches", num_entries=10)

# Convert to Spark DataFrames, applying the explicit schema of each table.
customers_spark_df = spark.createDataFrame(
    customers_pd_df,
    schema=customers_schema,
)
branches_spark_df = spark.createDataFrame(
    branches_pd_df,
    schema=branches_schema,
)

# Persist both tables in Delta format under raw_data.*; each run overwrites
# the existing table contents and schema.
(
    customers_spark_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("raw_data.customers")
)
(
    branches_spark_df.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable("raw_data.branches")
)

# Function to stream transactions
def stream_data(batch_size=5, interval_seconds=10, max_batches=None):
    """Repeatedly generate transaction rows and append them to ``raw_data.transactions``.

    Each iteration generates ``batch_size`` rows with ``generate_data``,
    converts them to a Spark DataFrame using ``transactions_schema`` and
    appends them to the Delta table. Called with no arguments, it writes
    5 rows every 10 seconds and never returns.

    Args:
        batch_size: Number of transaction rows generated per batch.
        interval_seconds: Seconds to sleep between consecutive batches.
        max_batches: Stop after this many batches. ``None`` (the default)
            keeps looping until the caller interrupts it.
    """
    batches_written = 0
    while max_batches is None or batches_written < max_batches:
        # Pause between batches only, so a bounded run does not end with an
        # idle sleep after its final write.
        if batches_written:
            sleep(interval_seconds)

        # Generate transactions data
        transactions_pd_df = generate_data("transactions", num_entries=batch_size)
        transactions_pd_df['timestamp'] = pd.to_datetime(transactions_pd_df['timestamp'])
        transactions_spark_df = spark.createDataFrame(transactions_pd_df, schema=transactions_schema)
        transactions_spark_df.write.format("delta").mode("append").option("mergeSchema", "true").saveAsTable("raw_data.transactions")
        print("Generated and saved transactions data")
        batches_written += 1

# Start the data streaming.
# Called with no arguments, stream_data() loops indefinitely, so this call
# does not return on its own; interrupt the cell to stop it.
stream_data()
