# In [0]:
import time
import random
import json
from pyspark.sql import SparkSession

# Spark session (only if needed outside Databricks notebooks).
# getOrCreate() hands back the session that is already active when there is
# one and builds a new one otherwise, so this line is safe in both settings.
spark = SparkSession.builder.getOrCreate()

# Define output path: directory prefix for the generated JSON files.
# The file name is appended to this string as-is, so keep the trailing slash.
output_path = "/mnt/input-data/streaming-data/"

# Sample data pools
emp_names = ["Alice", "Bob", "Charlie", "David", "Eve", "Frank", "Grace", "Helen", "Ivy", "Jack", "Karen", "Leo", "Mona", "Nina", "Oscar", "Paul", "Quincy", "Rita", "Steve", "Tina"]
countries = ["USA", "Canada", "UK", "India", "Germany"]
designations = ["Software Engineer", "Data Scientist", "Manager", "Analyst", "Consultant"]
departments = ["IT", "HR", "Finance", "Marketing", "Operations"]

# Counter to create unique employee IDs
empid_counter = 1000


def generate_batch_records(batch_size=10):
    """Generate a batch of random employee records.

    Each record is a dict with the keys ``empid``, ``empname``, ``country``,
    ``designation`` and ``dept``. ``empid`` values are taken from the
    module-level ``empid_counter``, which is advanced by one per record so IDs
    stay unique across calls.

    Names are unique within a batch as long as ``batch_size`` does not exceed
    the number of entries in ``emp_names``. Larger batches are filled in
    rounds of unique names, so a name only repeats once the pool has been
    used up.

    Args:
        batch_size: Number of records to generate. Must be non-negative.

    Returns:
        A list of ``batch_size`` record dicts.

    Raises:
        ValueError: If ``batch_size`` is negative, or if records are
            requested while ``emp_names`` is empty.
    """
    global empid_counter

    if batch_size < 0:
        raise ValueError("batch_size must be non-negative")
    if batch_size and not emp_names:
        raise ValueError("emp_names is empty; cannot generate records")

    # random.sample() refuses to draw more items than the pool holds, so draw
    # in rounds of at most len(emp_names) unique names. For batches that fit
    # in the pool this is a single sample() call.
    selected_names = []
    while len(selected_names) < batch_size:
        take = min(batch_size - len(selected_names), len(emp_names))
        selected_names.extend(random.sample(emp_names, take))

    records = [
        {
            "empid": empid,
            "empname": name,
            "country": random.choice(countries),
            "designation": random.choice(designations),
            "dept": random.choice(departments),
        }
        for empid, name in enumerate(selected_names, start=empid_counter)
    ]
    empid_counter += len(records)

    return records

# Produce a series of files, 10 records apiece, pausing between writes.
for i in range(100):  # raise this count to create more files
    file_path = f"{output_path}employee_batch_{i}.json"
    batch_records = generate_batch_records(batch_size=10)

    # Newline-delimited JSON: one serialized record per line.
    json_data = "\n".join(map(json.dumps, batch_records))

    # overwrite=True replaces a file already present at the same path.
    dbutils.fs.put(file_path, json_data, overwrite=True)
    print(f"Written file: {file_path} with {len(batch_records)} records.")

    # Pause 3 seconds before the next file to simulate streaming.
    time.sleep(3)
