In [0]:
%pip install dbldatagen

In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import expr, from_unixtime, floor, rand, lit
from pyspark.sql.types import *
import dbldatagen as dg
from datetime import datetime

# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# a) define schema
# Layout: two required string identifiers, eight required double-valued
# measurements, then a nullable string fault code (field order matters).
identifier_fields = [
    StructField(field_name, StringType(), False)
    for field_name in ("site_id", "tracker_row")
]
measurement_fields = [
    StructField(field_name, DoubleType(), False)
    for field_name in (
        "angle_actual",
        "angle_target",
        "rpm",
        "torque",
        "motor_temp",
        "ambient_temp",
        "wind_speed",
        "irradiance",
    )
]
fault_field = StructField("fault_code", StringType(), True)

schema = StructType(identifier_fields + measurement_fields + [fault_field])

# b) size the dataset: total rows = number of tracker rows × number of
#    one-second slots in the window (only needed for the full volume)
NUM_SITES = 10
ROWS_PER_SITE = 50
total_trackers = NUM_SITES * ROWS_PER_SITE

# Identifiers are 1-based and zero-padded: site_001..site_010, row_00001..row_00500.
site_ids = [f"site_{i:03d}" for i in range(1, NUM_SITES + 1)]
tracker_rows = [f"row_{i:05d}" for i in range(1, total_trackers + 1)]

# Every tracker row, every second, from 2025-01-01 up to (not including)
# 2025-02-01: 31 days = 2,678,400 seconds, i.e. 1,339,200,000 rows in total.
window_length = datetime(2025, 2, 1) - datetime(2025, 1, 1)
seconds_in_window = int(window_length.total_seconds())
TOTAL_ROWS = seconds_in_window * NUM_SITES * ROWS_PER_SITE

# c) launch dbldatagen
# Per-column generation rules as (column name, withColumnSpec keyword
# arguments), applied below in exactly the order listed.
column_rules = [
    ("site_id",      dict(values=site_ids,     random=True)),
    ("tracker_row",  dict(values=tracker_rows, random=True)),
    ("angle_actual", dict(minValue=0.0, maxValue=180.0, random=True)),
    # Target angle is derived from the actual angle plus an offset in [-5, 5).
    ("angle_target", dict(expr="angle_actual + (rand()-0.5)*10")),
    ("rpm",          dict(minValue=0.0, maxValue=60.0, random=True)),
    ("torque",       dict(minValue=0.0, maxValue=500.0, random=True)),
    ("motor_temp",   dict(minValue=15.0, maxValue=85.0, random=True)),
    ("ambient_temp", dict(minValue=-5.0, maxValue=45.0, random=True)),
    ("wind_speed",   dict(minValue=0.0, maxValue=25.0, random=True)),
    ("irradiance",   dict(minValue=0.0, maxValue=1200.0, random=True)),
    # Fault codes are weighted towards "OK" (0.85 vs 0.05 for each error code).
    ("fault_code",   dict(
        values=["OK","E01_motor_overload","E02_sensor_fail","E03_comm_loss"],
        weights=[0.85,0.05,0.05,0.05],
        random=True,
    )),
]

# Start from the schema so every column exists, then override how each one
# is generated.
telemetry_spec = dg.DataGenerator(
    spark, name="sensor_logs", rows=TOTAL_ROWS, partitions=200
).withSchema(schema)
for column_name, rule in column_rules:
    telemetry_spec = telemetry_spec.withColumnSpec(column_name, **rule)

# Build the Spark DataFrame described by the spec.
raw = telemetry_spec.build()

# d) attach a random timestamp anywhere inside the generation window.
#    The window length comes from `seconds_in_window`, which covers
#    2025-01-01 (inclusive) to 2025-02-01 (exclusive) - January 2025 only,
#    not the whole of Q1.
# NOTE: `seconds_in_q1` is a misnomer kept only so existing references keep
# working; it holds the length of the window above, not of a full quarter.
seconds_in_q1 = seconds_in_window
# Epoch seconds of the window start. The datetime is naive, so .timestamp()
# interprets it in the driver's local time zone.
epoch_start   = int(datetime(2025,1,1).timestamp())

# Every action on `with_ts` re-evaluates the plan. With an unseeded rand()
# each evaluation would draw different timestamps, so a preview would not
# match what is later written. A fixed seed keeps the values repeatable
# (for an unchanged partitioning of `raw`).
TIMESTAMP_SEED = 42

# Uniform offset in [0, window length) added to the window start, floored to
# whole seconds.
random_epoch_seconds = floor(
    lit(epoch_start) + rand(TIMESTAMP_SEED) * seconds_in_q1
).cast("long")

# NOTE: from_unixtime renders the epoch value as a 'yyyy-MM-dd HH:mm:ss'
# string, so `event_timestamp` is a string column, not a TimestampType.
with_ts = raw.withColumn(
    "event_timestamp",
    from_unixtime(random_epoch_seconds)
)


In [0]:
# Preview the generated telemetry (including `event_timestamp`) in the notebook
# output. `display` is not imported in this file - presumably the Databricks
# notebook built-in; confirm when running elsewhere.
display(with_ts)

In [0]:
# Count the generated rows. This is a Spark action, so it executes the whole
# generation plan; the result is expected to equal TOTAL_ROWS - TODO confirm.
with_ts.count()

In [0]:
# Persist the generated telemetry as Parquet, replacing anything already
# stored at the target location.
output_path = "/Volumes/soni/default/create_a_volume_to_write_files/"
(
    with_ts.write
    .mode("overwrite")
    .parquet(output_path)
)

In [0]:
# Optional cleanup: uncomment the next line to recursively delete the generated Parquet output.
#dbutils.fs.rm("/Volumes/soni/default/create_a_volume_to_write_files/", True)