# Let's generate some data to play with in Scala

Four datasets:
- Products
- Sensors
- Locations
- Readings

## Generate products dataset

In [285]:
import pandas as pd

# Product descriptions come from a Chat-GPT generated list, one per line.
with open("products.txt", "r", encoding="UTF-8") as products_file:
    all_products = products_file.read().strip().split("\n")

# Pair every description with a 1-based product_id, following file order.
all_products_dataset = [
    [product_id, description]
    for product_id, description in enumerate(all_products, start=1)
]

all_products_df = pd.DataFrame(
    all_products_dataset,
    columns=["product_id", "product_description"],
)
all_products_df.to_csv("output/products.csv", header=True, index=False)

## Generate Sensors dataset

In [286]:
# Sensor type descriptions: a Chat-GPT generated list (with plant names),
# stored one entry per line.
with open("sensor_types.txt", "r", encoding="UTF-8") as sensor_types_file:
    raw_sensor_types = sensor_types_file.read()

all_sensor_types = raw_sensor_types.strip().split("\n")

# Number the sensor types starting at 1, keeping the order of the file.
all_sensor_types_dataset = [
    list(numbered_type) for numbered_type in enumerate(all_sensor_types, start=1)
]

all_sensor_types_df = pd.DataFrame(
    all_sensor_types_dataset,
    columns=["sensor_type", "sensor_description"],
)
all_sensor_types_df.to_csv("output/sensor_types.csv", header=True, index=False)

## Generate locations dataset

Two-dimensional 500x500 space

In [287]:
import random

# all_locations remembers the (x, y) points already taken so that no two
# locations share coordinates; locations_dataset holds the rows to export.
all_locations = []
locations_dataset = []

# Ten locations with ids 1..10, each coordinate drawn from 1..500 inclusive.
for location_id in range(1, 11):
    # Keep drawing until the point is not already in use.
    while True:
        location = (random.randint(1, 500), random.randint(1, 500))
        if location not in all_locations:
            break

    all_locations.append(location)
    x, y = location
    locations_dataset.append([location_id, x, y])

locations_df = pd.DataFrame(locations_dataset, columns=["location_id", "x", "y"])
locations_df.to_csv("output/locations.csv", header=True, index=False)

### Generate distance helper
Using euclidean distance

In [288]:
import math

# Euclidean distance for every ordered pair of distinct locations, keyed by
# (from_location_id, to_location_id). Each row of locations_dataset is
# [location_id, x, y].
distances_helper = {
    (from_id, to_id): math.dist((from_x, from_y), (to_x, to_y))
    for from_id, from_x, from_y in locations_dataset
    for to_id, to_x, to_y in locations_dataset
    if from_id != to_id
}

### Create events in which a product is transported from one location to another

following all locations in-between, using the shortest path.

Each move has a 1% chance of the product being lost.

In [289]:
# Number of transport processes to simulate. Each process produces one bucket
# holding a START event, zero or more MOVE events and, unless the product is
# lost on the way, a final END event.
MAX_EVENTS = 100000

# For every location, the ids of all other locations ordered from nearest to
# farthest. Built once up front: the ordering only depends on the current
# location, so there is no need to re-filter and re-sort distances_helper on
# every hop of every process. sorted() is stable, so ties keep the insertion
# order of distances_helper.
nearest_first = {}
for (from_id, to_id), distance in distances_helper.items():
    nearest_first.setdefault(from_id, []).append((to_id, distance))
nearest_first = {
    from_id: [to_id for to_id, _ in sorted(pairs, key=lambda pair: pair[1])]
    for from_id, pairs in nearest_first.items()
}

events_buckets = []

# Event layout:
# Process ID, Event Type, Product ID, Sensor Type ID, Source Location ID, Destination Location ID, Current Location ID
process_id = 1
for _ in range(MAX_EVENTS):
    product_id = random.randint(1, len(all_products_dataset))
    sensor_type_id = random.randint(1, len(all_sensor_types_dataset))
    source_location_id = random.randint(1, len(locations_dataset))
    destination_location_id = random.randint(1, len(locations_dataset))

    # A transport must end somewhere other than where it starts.
    while destination_location_id == source_location_id:
        destination_location_id = random.randint(1, len(locations_dataset))

    # The source stays fixed for the whole process; only the current location
    # advances as the product moves.
    current_location_id = source_location_id

    event_bucket = [
        [process_id, "START", product_id, sensor_type_id, source_location_id, destination_location_id, current_location_id]
    ]

    mark_as_lost = False

    while current_location_id != destination_location_id and not mark_as_lost:
        destination_distance = distances_helper[(current_location_id, destination_location_id)]

        # Walk the neighbours from nearest to farthest and take the first one
        # that is either the destination itself or strictly closer to it.
        for candidate in nearest_first[current_location_id]:
            if candidate == destination_location_id:
                current_location_id = candidate
                event_bucket.append([process_id, "END", product_id, sensor_type_id, source_location_id, destination_location_id, current_location_id])
                break

            # Strict "<" so that every MOVE makes real progress: with "<=" two
            # locations equidistant from the destination could bounce the
            # product back and forth between them.
            if distances_helper[(candidate, destination_location_id)] < destination_distance:
                current_location_id = candidate

                # 1% chance per move that the product is lost; a lost product
                # still records this MOVE but never gets an END event.
                mark_as_lost = random.random() < 0.01

                event_bucket.append([process_id, "MOVE", product_id, sensor_type_id, source_location_id, destination_location_id, current_location_id])
                break

    events_buckets.append(event_bucket)

    process_id += 1
    

### Scramble events

In [290]:
from copy import deepcopy

# Interleave the events of all processes at random while keeping the relative
# order of events inside each process. The buckets are consumed from a deep
# copy so that events_buckets itself stays intact.
events_buckets_copy = deepcopy(events_buckets)
all_events = []

while len(events_buckets_copy) > 0:
    random_bucket_index = random.randrange(len(events_buckets_copy))
    chosen_bucket = events_buckets_copy[random_bucket_index]

    if chosen_bucket:
        # Take the oldest remaining event of the chosen process.
        all_events.append(chosen_bucket.pop(0))
    else:
        # Exhausted process: drop its bucket and draw again.
        del events_buckets_copy[random_bucket_index]


## Save events

In [291]:
# Build the flat events table. The row position in the scrambled stream
# (0-based) is exposed as the event_id column.
events = (
    pd.DataFrame(
        all_events,
        columns=[
            "process_id",
            "event_type",
            "product_id",
            "sensor_type_id",
            "source_location_id",
            "destination_location_id",
            "current_location_id",
        ],
    )
    .reset_index()
    .rename(columns={"index": "event_id"})
)

# Renumber the processes 1..N following their first appearance in the
# scrambled stream.
codes, uniques = pd.factorize(events["process_id"])
events["process_id"] = codes + 1

events.to_csv("output/events.csv", header=True, index=False)
