In [0]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import random
import datetime

# Get (or create) the Spark session used by the code below
spark = (
    SparkSession.builder
    .appName('hls_demo_2_sample_data')
    .getOrCreate()
)

# Sample data generation function for Sales Data
def generate_sales_data(num_rows=1000):
    """Generate random sales records for the sample Sales dataset.

    Args:
        num_rows: Number of records to generate (default 1000).

    Returns:
        A list of ``(product_code, product_name, region, sales_qty,
        sales_amount, order_date)`` tuples. ``order_date`` is a naive
        ``datetime`` between 0 and 365 days before the moment of the call.
    """
    # Every product keeps one fixed code so that product_code is usable as a
    # key. The codes are the same five that generate_inventory_data draws
    # from, which lets sales rows be matched to inventory rows.
    product_codes = {
        'Aspirin': 'PROD_1234',
        'Ibuprofen': 'PROD_2345',
        'Paracetamol': 'PROD_3456',
        'Amlodipine': 'PROD_4567',
        'Metformin': 'PROD_5678',
    }
    products = list(product_codes)
    regions = ['North America', 'Europe', 'Asia', 'Africa', 'South America']

    # Take the reference time once so all rows are dated relative to the
    # same instant.
    now = datetime.datetime.now()

    data = []

    for _ in range(num_rows):
        product = random.choice(products)
        region = random.choice(regions)
        sales_qty = random.randint(10, 1000)  # Units sold
        unit_price = random.randint(5, 30)  # Price per unit
        sales_amount = sales_qty * unit_price
        order_date = now - datetime.timedelta(days=random.randint(0, 365))

        data.append(
            (product_codes[product], product, region, sales_qty, sales_amount, order_date)
        )

    return data

# Random row generator for the Inventory dataset
def generate_inventory_data(num_rows=1000):
    """Return ``num_rows`` random inventory records as a list of tuples.

    Each tuple is ``(product_code, location, stock_qty, reorder_point,
    last_received_date)``; the reorder point is always half the stock
    quantity (integer division).
    """
    sku_pool = ('PROD_1234', 'PROD_2345', 'PROD_3456', 'PROD_4567', 'PROD_5678')
    warehouse_pool = ('WH_A', 'WH_B', 'WH_C', 'WH_D', 'WH_E')

    def build_row():
        sku = random.choice(sku_pool)
        warehouse = random.choice(warehouse_pool)
        on_hand = random.randint(50, 5000)
        # Received some time in the 0-365 days before now
        age = datetime.timedelta(days=random.randint(0, 365))
        received_at = datetime.datetime.now() - age
        return (sku, warehouse, on_hand, on_hand // 2, received_at)

    return [build_row() for _ in range(num_rows)]

# Random row generator for the Clinical Trial dataset
def generate_clinical_trial_data(num_rows=1000):
    """Return ``num_rows`` random clinical-trial records as a list of tuples.

    Each tuple is ``(trial_id, drug_name, participant_count, success_rate,
    start_date, end_date)``. The start date lies 0-365 days before now and
    the end date 30-180 days after the start date.
    """
    trial_pool = ('TRIAL_A', 'TRIAL_B', 'TRIAL_C', 'TRIAL_D', 'TRIAL_E')
    drug_pool = ('Aspirin', 'Ibuprofen', 'Paracetamol', 'Metformin')

    def rows():
        for _ in range(num_rows):
            trial = random.choice(trial_pool)
            drug = random.choice(drug_pool)
            enrolled = random.randint(100, 1000)
            rate = random.uniform(0.5, 0.95)
            began = datetime.datetime.now() - datetime.timedelta(
                days=random.randint(0, 365)
            )
            duration = datetime.timedelta(days=random.randint(30, 180))
            yield (trial, drug, enrolled, rate, began, began + duration)

    return list(rows())

# Build 1000 sample rows for each of the three datasets
sales_data = generate_sales_data(num_rows=1000)
inventory_data = generate_inventory_data(num_rows=1000)
clinical_trial_data = generate_clinical_trial_data(num_rows=1000)

# Turn each list of tuples into a Spark DataFrame with named columns
sales_df = spark.createDataFrame(
    sales_data,
    schema=['product_code', 'product_name', 'region', 'sales_qty', 'sales_amount', 'order_date'],
)
inventory_df = spark.createDataFrame(
    inventory_data,
    schema=['product_code', 'location', 'stock_qty', 'reorder_point', 'last_received_date'],
)
clinical_trial_df = spark.createDataFrame(
    clinical_trial_data,
    schema=['trial_id', 'drug_name', 'participant_count', 'success_rate', 'start_date', 'end_date'],
)

# Preview the first 8 rows of each DataFrame
sales_df.show(n=8)
inventory_df.show(n=8)
clinical_trial_df.show(n=8)

# Define paths to save data (Bronze layer paths).
# Every output lives under the same base directory; deriving each path from
# it keeps the six locations consistent with one another.
bronze_base_path = '/Volumes/workspace/hls_demo_2/raw_data'

# Parquet outputs
bronze_path_sales_parquet = f'{bronze_base_path}/sales_data_bronze.parquet'
bronze_path_inventory_parquet = f'{bronze_base_path}/inventory_data_bronze.parquet'
bronze_path_clinical_trial_parquet = f'{bronze_base_path}/clinical_trial_data_bronze.parquet'

# Delta outputs
bronze_path_sales_delta = f'{bronze_base_path}/sales_data_bronze_delta'
bronze_path_inventory_delta = f'{bronze_base_path}/inventory_data_bronze_delta'
bronze_path_clinical_trial_delta = f'{bronze_base_path}/clinical_trial_data_bronze_delta'

# Write each DataFrame as Parquet, replacing any previous output
sales_df.write.parquet(bronze_path_sales_parquet, mode='overwrite')
inventory_df.write.parquet(bronze_path_inventory_parquet, mode='overwrite')
clinical_trial_df.write.parquet(bronze_path_clinical_trial_parquet, mode='overwrite')

# Also write each DataFrame in Delta format, replacing any previous output
sales_df.write.save(bronze_path_sales_delta, format='delta', mode='overwrite')
inventory_df.write.save(bronze_path_inventory_delta, format='delta', mode='overwrite')
clinical_trial_df.write.save(bronze_path_clinical_trial_delta, format='delta', mode='overwrite')


In [0]:

# Read the Delta outputs back to check what was written
sales_delta_df = spark.read.load(bronze_path_sales_delta, format='delta')
inventory_delta_df = spark.read.load(bronze_path_inventory_delta, format='delta')
clinical_trial_delta_df = spark.read.load(bronze_path_clinical_trial_delta, format='delta')

# Preview the first 5 rows of each reloaded DataFrame
sales_delta_df.show(n=5)
inventory_delta_df.show(n=5)
clinical_trial_delta_df.show(n=5)