In [17]:
import os
import pandas as pd
import numpy as np

# Function to generate synthetic data for a single server
def generate_server_data(server_id, num_servers, num_hours=24):
    """Generate one row of synthetic metrics per minute for a single server.

    Parameters
    ----------
    server_id
        Value embedded in the ``server_id`` column as ``Server_<server_id>``.
    num_servers : int
        Block size used to space out the provisional ``request_id`` values.
    num_hours : int, default 24
        Length of the simulated window. ``num_hours * 60`` rows are produced,
        one per minute starting at 2023-08-01 00:00.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: ``timestamp``, ``request_id``, ``server_id``,
        ``project_id``, ``number_of_active_requests``, ``cpu_usage``,
        ``memory_usage``, ``execution_time``.

    Notes
    -----
    Values are drawn from NumPy's global random state (``np.random``), so
    seeding it with ``np.random.seed`` makes the output reproducible.
    """
    num_samples = num_hours * 60

    # 'min' is the current spelling of the minute alias; the older 'T' alias
    # is deprecated and emits a FutureWarning on recent pandas releases.
    timestamps = pd.date_range(start='2023-08-01', periods=num_samples, freq='min')

    # Active requests: N(100, 50) truncated to integers, floored at 1.
    request_counts = np.random.normal(100, 50, num_samples).astype(int)
    request_counts = np.maximum(request_counts, 1)

    # CPU and memory usage (%): N(50, 25) clipped to the range [0, 100].
    cpu_usage = np.clip(np.random.normal(50, 25, num_samples), 0, 100)
    memory_usage = np.clip(np.random.normal(50, 25, num_samples), 0, 100)

    # Execution time (ms): N(10, 25) clipped to the range [1, 50].
    execution_time = np.clip(np.random.normal(10, 25, num_samples), 1, 50)

    # Provisional ids: 1..num_servers, then a gap of num_servers, and so on.
    # NOTE: these do not depend on server_id, so every server gets the same
    # values; generate_data_for_all_servers renumbers the combined frame.
    request_id_offset = (np.arange(num_samples) // num_servers) * num_servers
    request_id = np.arange(1, num_samples + 1) + request_id_offset

    # Project number drawn uniformly from {1, 2, 3} (upper bound is exclusive).
    project_id = np.random.randint(1, 4, num_samples)

    data = {
        'timestamp': timestamps,
        'request_id': request_id,
        'server_id': [f'Server_{server_id}'] * num_samples,
        'project_id': [f'Project_{i}' for i in project_id],
        'number_of_active_requests': request_counts,
        'cpu_usage': cpu_usage,
        'memory_usage': memory_usage,
        'execution_time': execution_time,
    }
    return pd.DataFrame(data)

# Function to generate synthetic data for all servers in a round-robin manner
def generate_data_for_all_servers(num_servers=10, num_hours=24):
    """Generate synthetic per-minute data for every server and merge it.

    Parameters
    ----------
    num_servers : int, default 10
        Number of servers to simulate; server ids run from 1 to ``num_servers``.
    num_hours : int, default 24
        Length of the simulated window, forwarded to ``generate_server_data``.

    Returns
    -------
    pandas.DataFrame
        All servers' rows sorted by ``timestamp`` with a fresh 0..N-1 index
        and ``request_id`` renumbered 1..N in that order.

    Raises
    ------
    ValueError
        If ``num_servers`` is less than 1 (there would be nothing to concatenate).
    """
    if num_servers < 1:
        raise ValueError('num_servers must be at least 1')

    all_server_data = [
        generate_server_data(server_id, num_servers, num_hours)
        for server_id in range(1, num_servers + 1)
    ]

    all_data = pd.concat(all_server_data, ignore_index=True)

    # Every server shares the same timestamps, so the sort is full of ties.
    # A stable sort keeps tied rows in concatenation order (server 1, 2, ...),
    # which makes the renumbered request ids cycle through the servers
    # round-robin; the default quicksort gives no such guarantee.
    all_data = all_data.sort_values(by='timestamp', kind='mergesort').reset_index(drop=True)
    all_data['request_id'] = range(1, len(all_data) + 1)

    return all_data

# Function to identify incidents based on latency greater than 20ms
def identify_incidents(data, threshold=20):
    """Flag slow requests and return them as a numbered incident table.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``execution_time``, ``timestamp`` and ``request_id``
        columns. A boolean ``incident`` column is added to (or overwritten
        on) this frame in place.
    threshold : float, default 20
        A row is an incident when its ``execution_time`` is strictly greater
        than this value.

    Returns
    -------
    pandas.DataFrame
        Columns ``incident_id`` (1..K in row order), ``timestamp`` and
        ``request_id`` for the flagged rows; the index of ``data`` is retained.
    """
    # Vectorized comparison instead of a per-row lambda; it also yields a
    # proper bool column when `data` is empty.
    data['incident'] = data['execution_time'] > threshold

    # .copy() detaches the selection from `data` so the assignment below
    # neither touches the input nor triggers SettingWithCopyWarning.
    incident_data = data.loc[data['incident'], ['timestamp', 'request_id']].copy()
    incident_data['incident_id'] = range(1, len(incident_data) + 1)
    return incident_data[['incident_id', 'timestamp', 'request_id']]

# Function to generate aggregated server metrics
def generate_aggregated_metrics(data):
    """Aggregate request-level metrics per server and per hour.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``server_id``, a datetime ``timestamp``, ``cpu_usage``,
        ``memory_usage`` and ``number_of_active_requests`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per (server, hour) with columns ``server_id``, ``timestamp``
        (formatted as a ``'%Y-%m-%d %H:%M:%S'`` string), ``avg_cpu_usage``,
        ``avg_memory_usage`` and ``total_requests``.
    """
    # 'h' is the current spelling of the hourly alias; the older 'H' alias
    # is deprecated and emits a FutureWarning on recent pandas releases.
    hourly = pd.Grouper(key='timestamp', freq='h')

    aggregated_data = (
        data.groupby(['server_id', hourly])
        .agg(
            avg_cpu_usage=('cpu_usage', 'mean'),
            avg_memory_usage=('memory_usage', 'mean'),
            total_requests=('number_of_active_requests', 'sum'),
        )
        .reset_index()
    )

    # Convert the hour-bucket timestamps to plain strings.
    aggregated_data['timestamp'] = aggregated_data['timestamp'].dt.strftime('%Y-%m-%d %H:%M:%S')

    return aggregated_data


In [18]:
# Seed NumPy's global generator so that re-running the notebook from a fresh
# kernel produces the same CSV files.
SEED = 42
np.random.seed(SEED)

# Generate synthetic data for all servers in a round-robin manner
all_data = generate_data_for_all_servers(num_servers=10, num_hours=24)

# Identify incidents (this also adds an 'incident' column to all_data)
incident_data = identify_incidents(all_data)

# Generate aggregated server metrics
aggregated_data = generate_aggregated_metrics(all_data)

# Save dataframes to CSV files
data_folder = 'data'

# Create the output folder if needed; exist_ok avoids the check-then-create race
os.makedirs(data_folder, exist_ok=True)

# Explicit column list keeps the helper 'incident' column out of request.csv
request_columns = [
    'request_id', 'timestamp', 'project_id', 'server_id',
    'number_of_active_requests', 'cpu_usage', 'memory_usage',
    'execution_time',
]
all_data[request_columns].to_csv(os.path.join(data_folder, 'request.csv'), index=False)
incident_data.to_csv(os.path.join(data_folder, 'incident.csv'), index=False)
aggregated_data.to_csv(os.path.join(data_folder, 'aggregated_server_metrics.csv'), index=False)