# pip installs

In [0]:
!pip install faker

In [0]:

import uuid
import random
import time
import pandas as pd
import numpy as np
from faker import Faker

## Notebook parameter
- **landing_zone_volume**: Volume to store the generated fake data.
    - This value is supplied by Databricks Asset Bundles (DABs).

In [0]:
# Declare the "landing_zone_volume" text widget with an empty default value.
dbutils.widgets.text("landing_zone_volume", "")

## Create the directories in the Volume to store the fake Parquet datasets

In [0]:
# Read the target Volume path from the notebook widget; surrounding whitespace
# and a trailing slash are dropped so the paths below never contain "//".
volume = dbutils.widgets.get("landing_zone_volume").strip().rstrip("/")

# The widget defaults to an empty string. Without this check the directories
# below would silently resolve to "/customers", "/products" and "/transactions".
if not volume:
    raise ValueError("The 'landing_zone_volume' widget must be set to a Volume path.")

# One sub-directory per generated dataset.
customers_directory = f"{volume}/customers"
products_directory = f"{volume}/products"
transactions_directory = f"{volume}/transactions"

print("Here is where the files are going to be stored:\n\n-", customers_directory, products_directory, transactions_directory)

In [0]:
# mkdirs is idempotent: an existing directory, and any files already in it,
# are left untouched, so this cell can be executed repeatedly.
for target_directory in (customers_directory, products_directory, transactions_directory):
    dbutils.fs.mkdirs(target_directory)

## Generate the data using Faker

In [0]:
# Initialize the Faker object
fake = Faker()

# Draw a seed between 1 and 100 (inclusive) for this execution, then apply the
# same seed to Faker, NumPy and the standard-library random module.
seed = random.randint(1, 100)

Faker.seed(seed)
np.random.seed(seed)
random.seed(seed)

# Countries assigned to customers and their selection weights
# (same order as COUNTRIES; the weights sum to 1.0).
COUNTRIES = ['USA', 'UK', 'Canada', 'Germany', 'France', 'Australia', 'Japan', 'Brazil', 'India', 'Mexico']
COUNTRY_WEIGHTS = [0.25, 0.15, 0.10, 0.10, 0.08, 0.08, 0.08, 0.06, 0.06, 0.04]

# Values used for the customer segment and product category columns.
CUSTOMER_SEGMENTS = ['Enterprise', 'SMB', 'Startup', 'Individual']
PRODUCT_CATEGORIES = ['Software', 'Hardware', 'Consulting', 'Training', 'Support']

In [None]:
def generate_customers(n_customers=1000):
    """Build a pandas DataFrame of fake customer records.

    Each row contains:
      - customer_id: 'CUST' followed by a random number (1 to 9999999)
        zero-padded to 8 digits, replaced by None with probability 0.05.
        IDs are drawn independently, so duplicates are possible.
      - name, email: values produced by Faker.
      - country: drawn from COUNTRIES using COUNTRY_WEIGHTS.
      - registration_date: a date between three years ago and today.
      - customer_segment: uniformly chosen from CUSTOMER_SEGMENTS.

    Args:
        n_customers: Number of rows to generate.

    Returns:
        pandas.DataFrame with one row per customer.
    """
    # The columns are built in the same order as they appear in the frame so
    # that the sequence of random draws stays stable for a given seed.
    customer_ids = [
        np.random.choice([f'CUST{random.randint(1, 9999999):08d}', None], p=[0.95, 0.05])
        for _ in range(n_customers)
    ]
    names = [fake.name() for _ in range(n_customers)]
    emails = [fake.email() for _ in range(n_customers)]
    countries = random.choices(COUNTRIES, weights=COUNTRY_WEIGHTS, k=n_customers)
    registration_dates = [
        fake.date_between(start_date='-3y', end_date='today') for _ in range(n_customers)
    ]
    segments = [random.choice(CUSTOMER_SEGMENTS) for _ in range(n_customers)]

    return pd.DataFrame({
        'customer_id': customer_ids,
        'name': names,
        'email': emails,
        'country': countries,
        'registration_date': registration_dates,
        'customer_segment': segments,
    })

In [None]:
def generate_products(n_products=1000):
    """Build a pandas DataFrame of fake product records.

    Each row contains:
      - product_id: 'PROD' followed by a random number (1 to 9999999)
        zero-padded to 8 digits, replaced by None with probability 0.02.
        IDs are drawn independently, so duplicates are possible.
      - product_name: a capitalized Faker word plus one of
        'Pro', 'Suite', 'Platform', 'Tool' or 'System'.
      - category: uniformly chosen from PRODUCT_CATEGORIES.
      - price: uniform in [50, 5000), rounded to 2 decimals.
      - cost: price multiplied by a uniform factor in [0.3, 0.7),
        rounded to 2 decimals.

    Args:
        n_products: Number of rows to generate.

    Returns:
        pandas.DataFrame with one row per product.
    """
    name_suffixes = ['Pro', 'Suite', 'Platform', 'Tool', 'System']

    products = pd.DataFrame({
        'product_id': [
            np.random.choice([f'PROD{random.randint(1, 9999999):08d}', None], p=[0.98, 0.02])
            for _ in range(n_products)
        ],
        'product_name': [
            f"{fake.word().capitalize()} {random.choice(name_suffixes)}"
            for _ in range(n_products)
        ],
        'category': [random.choice(PRODUCT_CATEGORIES) for _ in range(n_products)],
        'price': np.random.uniform(50, 5000, n_products).round(2),
    })
    # Cost depends on the price, so it is added once the price column exists.
    products['cost'] = (products['price'] * np.random.uniform(0.3, 0.7, n_products)).round(2)

    return products

In [None]:
def generate_transactions(customers_df, products_df, n_products, n_transactions=30000):
    """Build a pandas DataFrame of fake transactions linking customers to products.

    Customer and product IDs are picked from the 'customer_id' and 'product_id'
    columns of the given frames. Those values are used as-is, including any
    None entries they contain. About 20% of the products are marked as
    "popular" and receive roughly 80% of the transactions.

    Each row contains:
      - transaction_id: 'TXN' followed by a random number (1 to 9999999)
        zero-padded to 8 digits, replaced by None with probability 0.03.
      - customer_id, product_id: picked from the input frames.
      - quantity: one of 1, 2, 3, 5, 10 (weighted towards small values).
      - transaction_date: a date between two years ago and today.
      - discount_applied: one of 0, 0.05, 0.10, 0.15, 0.20 (weighted towards 0).

    Args:
        customers_df: DataFrame with a 'customer_id' column.
        products_df: DataFrame with a 'product_id' column.
        n_products: Catalogue size used to derive how many products are popular
            (20% of this value, clamped to the number of available products).
        n_transactions: Number of rows to generate.

    Returns:
        pandas.DataFrame with one row per transaction. An empty result still
        carries the expected columns.
    """
    columns = [
        'transaction_id',
        'customer_id',
        'product_id',
        'quantity',
        'transaction_date',
        'discount_applied',
    ]
    quantity_options, quantity_weights = [1, 2, 3, 5, 10], [0.5, 0.25, 0.15, 0.07, 0.03]
    discount_options, discount_weights = [0, 0.05, 0.10, 0.15, 0.20], [0.6, 0.2, 0.1, 0.07, 0.03]

    customer_id_pool = customers_df['customer_id'].tolist()
    products_id_pool = products_df['product_id'].tolist()

    # Create some popular products (Pareto principle). The sample size is
    # clamped so that a tiny catalogue still yields at least one popular
    # product (otherwise random.choice would fail on an empty list) and an
    # n_products larger than the pool cannot make random.sample raise.
    popular_count = min(len(products_id_pool), max(1, int(n_products * 0.2)))
    popular_products = random.sample(products_id_pool, k=popular_count)

    transactions = []
    for _ in range(n_transactions):
        transaction_id = np.random.choice(
            [f'TXN{random.randint(1, 9999999):08d}', None], p=[0.97, 0.03]
        )
        customer_id = random.choice(customer_id_pool)

        # 80% chance to pick from popular products
        if random.random() < 0.8:
            product_id = random.choice(popular_products)
        else:
            product_id = random.choice(products_id_pool)

        quantity = random.choices(quantity_options, weights=quantity_weights)[0]
        transaction_date = fake.date_between(start_date='-2y', end_date='today')
        discount_applied = random.choices(discount_options, weights=discount_weights)[0]

        transactions.append({
            'transaction_id': transaction_id,
            'customer_id': customer_id,
            'product_id': product_id,
            'quantity': quantity,
            'transaction_date': transaction_date,
            'discount_applied': discount_applied,
        })

    return pd.DataFrame(transactions, columns=columns)

## Save the pandas dataframes into the respective directory

In [0]:
# Generate the three datasets and write each one as a single Parquet file.
# Every file is named after the local time (second resolution) at which its
# dataset was generated.

n_customers = 500
n_products = 1000
n_transactions = 10000

### Customers Parquet file ####
timestr = time.strftime("%Y%m%d_%H%M%S")
customers_df = generate_customers(n_customers)
customers_filepath = f"{customers_directory}/{timestr}.parquet"
customers_df.to_parquet(customers_filepath, index=False)
print("CUSTOMERS FILE CREATED: ", customers_filepath)

### Products Parquet file ####
timestr = time.strftime("%Y%m%d_%H%M%S")
products_df = generate_products(n_products)
products_filepath = f"{products_directory}/{timestr}.parquet"
products_df.to_parquet(products_filepath, index=False)
print("PRODUCTS FILE CREATED: ", products_filepath)

### Transactions Parquet file ####
# One transactions file is written per execution; its rows reference the
# customers and products generated above.
timestr = time.strftime("%Y%m%d_%H%M%S")
transactions_df = generate_transactions(customers_df, products_df, n_products, n_transactions)
transactions_filepath = f"{transactions_directory}/{timestr}.parquet"
transactions_df.to_parquet(transactions_filepath, index=False)
print("TRANSACTIONS FILE CREATED: ", transactions_filepath)