In [0]:
!pip install faker

In [0]:
import pandas as pd
from faker import Faker
import random
import time

## Notebook parameter
- **landing_zone_directory**: Directory inside the Volume where the generated fake data is stored.
- This value is supplied by Databricks Asset Bundles (DABs).

In [0]:
# Declare the "landing_zone_directory" text widget with an empty default so the
# target directory can be passed in as a notebook parameter.
dbutils.widgets.text("landing_zone_directory", "")

## Create a directory in the Volume to store the fake parquet dataset

In [0]:
# Read the target directory from the "landing_zone_directory" notebook widget.
directory = dbutils.widgets.get("landing_zone_directory")

# The widget defaults to an empty string. Fail fast in that case instead of
# letting later cells create directories or write parquet files relative to
# the filesystem root ("/<timestamp>.parquet").
if not directory.strip():
    raise ValueError(
        "The 'landing_zone_directory' widget must be set to a non-empty path."
    )

print("Here is where the parquet files are going to be stored:\n\n-", directory)

In [0]:
# Make sure the landing directory exists before any parquet files are written.
# This command is idempotent: it does nothing if the directory already exists,
# and re-running it deletes neither the directory nor the files inside it.
dbutils.fs.mkdirs(directory)

## Generate the data using Faker

In [0]:
# Shared Faker instance; create_test_data draws every fake field from it.
fake = Faker()


def create_test_data(num_entries=10):
    """Build a pandas DataFrame of randomly generated customer/order rows.

    Each row combines Faker-generated personal, contact and employment fields
    with a random order (quantity between 1 and 10, total price between 10.0
    and 500.0 rounded to two decimals). Dates are rendered as "YYYY-MM-DD"
    strings.

    Args:
        num_entries: Number of rows to generate.

    Returns:
        pandas.DataFrame with one row per generated entry.
    """

    def _build_record():
        # Field order matches the column order of the resulting DataFrame.
        birthdate = fake.date_of_birth(minimum_age=18, maximum_age=65)
        return {
            "Full Name": fake.name(),
            "Street Address": fake.street_address(),
            "City": fake.city(),
            "State": fake.state(),
            "Zip Code": fake.zipcode(),
            "Email Address": fake.email(),
            "Contact Number": fake.phone_number(),
            "Birthdate": birthdate.strftime("%Y-%m-%d"),
            "Order ID": fake.uuid4(),
            "Product Name": fake.word(),
            "Purchase Date": fake.date_this_year().strftime("%Y-%m-%d"),
            "Quantity Purchased": random.randint(1, 10),
            "Total Price": round(random.uniform(10.0, 500.0), 2),
            "Company Name": fake.company(),
            "Job Position": fake.job(),
            "Profile": fake.text(max_nb_chars=200),
        }

    records = [_build_record() for _ in range(num_entries)]
    return pd.DataFrame(records)

## Generate N parquet files into the landing volume directory

In [0]:
# Number of parquet files to write and number of fake rows per file.
num_of_files = 5
num_entries = 10000

for file_index in range(num_of_files):
    timestr = time.strftime("%Y%m%d_%H%M%S")
    data_df = create_test_data(num_entries)
    # The timestamp only has one-second resolution. When a batch is generated
    # in under a second (e.g. a small num_entries), consecutive files would get
    # the same name and silently overwrite each other, so the loop index is
    # appended to keep every file name in this run unique.
    file_name = f"{directory}/{timestr}_{file_index:03d}.parquet"
    data_df.to_parquet(file_name, index=False)
    print(f"CREATED: {file_name}")