In [0]:
!pip install faker

In [0]:

import uuid
import random
import time
import pandas as pd
import numpy as np
from faker import Faker

## Notebook parameter
- **landing_zone_volume**: Volume to store the generated fake data.
- This value is set by Databricks Asset Bundles (DABs).

In [0]:
# Declare the "landing_zone_volume" text widget (empty default) so the target
# Volume can be supplied as a notebook parameter.
dbutils.widgets.text("landing_zone_volume", "")

## Create directories in the Volume to store the fake parquet datasets

In [0]:
# Read the target Volume path from the notebook widget, ignoring stray whitespace.
volume = dbutils.widgets.get("landing_zone_volume").strip()

# The widget defaults to an empty string; without this check the directories
# below would silently resolve to "/customers" and "/orders".
if not volume:
    raise ValueError(
        "The 'landing_zone_volume' notebook parameter must be set to a Volume path."
    )

# One sub-directory per generated dataset.
customers_directory = f"{volume}/customers"
orders_directory = f"{volume}/orders"

print("Here is where the parquet files are going to be stored:\n\n-", customers_directory, orders_directory)

In [0]:
# mkdirs is idempotent: an existing directory, and any files already in it,
# is left untouched, so re-running this cell is safe.
for target_directory in (customers_directory, orders_directory):
    dbutils.fs.mkdirs(target_directory)

## Generate the data using Faker

In [0]:
# Shared Faker instance used by the data-generation functions in this notebook.
fake = Faker()

# Create a function to generate test data
def generate_customers_data(n_customers=1000):
    """Generate a DataFrame of fake customer records.

    Each row has a 5% chance of getting a null ``customer_id``; the remaining
    rows get a fresh UUID4 string.

    Args:
        n_customers: Number of customer rows to generate. Zero or a negative
            value yields an empty DataFrame that still carries every column.

    Returns:
        pandas.DataFrame with the columns ``customer_id``, ``name``, ``email``,
        ``address``, ``phone`` and ``registration_date``.
    """
    columns = ["customer_id", "name", "email", "address", "phone", "registration_date"]

    customers = []
    for _ in range(n_customers):
        customers.append({
            # Candidate UUID is kept with probability 0.95, otherwise None.
            "customer_id": np.random.choice([str(uuid.uuid4()), None], p=[0.95, 0.05]),
            "name": fake.name(),
            "email": fake.email(),
            # Faker addresses are multi-line; flatten them onto a single line.
            "address": fake.address().replace("\n", ", "),
            "phone": fake.phone_number(),
            "registration_date": fake.date_between(start_date='-5y', end_date='today')
        })

    # Passing the columns explicitly keeps the schema even when no rows were
    # generated (an empty list alone would produce a column-less DataFrame).
    return pd.DataFrame(customers, columns=columns)

In [None]:
def generate_orders_data(customers_df, n_orders=10000):
    """Generate a DataFrame of fake orders that reference existing customers.

    Each row has a 5% chance of getting a null ``order_id`` and a 15% chance of
    being marked as not fulfilled. ``customer_id`` is drawn uniformly from
    ``customers_df["customer_id"]`` (including any null ids present there).

    Args:
        customers_df: Object exposing a ``"customer_id"`` column/sequence to
            sample customer ids from.
        n_orders: Number of order rows to generate. Zero or a negative value
            yields an empty DataFrame that still carries every column.

    Returns:
        pandas.DataFrame with the columns ``order_id``, ``customer_id``,
        ``order_date``, ``amount``, ``product`` and ``fulfilled``.

    Raises:
        ValueError: If orders are requested but ``customers_df`` has no rows.
    """
    columns = ["order_id", "customer_id", "order_date", "amount", "product", "fulfilled"]

    # Loop-invariant: look up and convert the id column once instead of on
    # every iteration.
    customer_ids = np.asarray(customers_df["customer_id"])
    if n_orders > 0 and len(customer_ids) == 0:
        raise ValueError("customers_df must contain at least one row to generate orders")

    orders = []
    for _ in range(n_orders):
        orders.append({
            # Candidate UUID is kept with probability 0.95, otherwise None.
            "order_id": np.random.choice([str(uuid.uuid4()), None], p=[0.95, 0.05]),
            "customer_id": np.random.choice(customer_ids),
            "order_date": fake.date_between(start_date='-4y', end_date='today'),
            "amount": round(np.random.uniform(10, 1000), 2),
            "product": fake.word(),
            "fulfilled": np.random.choice([True, False], p=[0.85, 0.15])
        })

    # Passing the columns explicitly keeps the schema even when no rows were
    # generated (an empty list alone would produce a column-less DataFrame).
    return pd.DataFrame(orders, columns=columns)

## Generate N parquet files into the landing volume directory

In [0]:
# Number of rows written to each customers / orders file.
customers = 1000
orders = 10000

# Number of customer batches, and number of orders files written per batch.
num_of_customer_files = 5
num_of_orders_files = 5

for batch_idx in range(num_of_customer_files):
  # The timestamp only has one-second resolution, so the batch index is appended
  # to keep two files written within the same second from overwriting each other.
  timestr = time.strftime("%Y%m%d_%H%M%S")
  customer_df = generate_customers_data(customers)
  customers_filepath = f"{customers_directory}/{timestr}_{batch_idx:03d}.parquet"
  customer_df.to_parquet(customers_filepath, index=False)
  print("CUSTOMERS FILE CREATED: ", customers_filepath)

  # Every orders file of this batch samples its customer ids from the customers
  # generated just above.
  for file_idx in range(num_of_orders_files):
    timestr = time.strftime("%Y%m%d_%H%M%S")
    orders_df = generate_orders_data(customer_df, orders)
    # Batch and file indices make the name unique within a run.
    orders_filepath = f"{orders_directory}/{timestr}_{batch_idx:03d}_{file_idx:03d}.parquet"
    orders_df.to_parquet(orders_filepath, index=False)
    print("ORDERS FILE CREATED: ", orders_filepath)

  print("", "NEXT BATCH GENERATION", "")