In [None]:
from analyserlib import distributionanalyzer as da

import pandas as pd
import os.path
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Retrieve Chameleon traces

- From disk (adapt the path)

In [None]:
# Trace archive downloaded and extracted from
# https://www.scienceclouds.org/cloud-traces/chameleon-openstack-kvm-cloud-trace-2020-09-04/
# Adapt the path below to wherever the CSV lives on your machine.
instance_events_path = '/usr/local/src/chameleon/new_kvm_instance_events.csv'
instance_events_df = pd.read_csv(
    instance_events_path,
    parse_dates=['START_TIME', 'FINISH_TIME'],  # parse both timestamp columns as datetimes
)

# Adapt Chameleon traces

Configuration options are originally stored as JSON; we start by converting them to a DataFrame.

In [None]:
# Build one row per instance with the configuration read from the PROPERTIES
# column of the first event recorded for that instance.
# drop_duplicates(keep="first") keeps exactly that first row, in order of first
# appearance, so the events frame is scanned once instead of once per instance.
# Rows without an INSTANCE_UUID cannot be attributed to a VM and are skipped.
first_event_per_instance = (
    instance_events_df
    .dropna(subset=["INSTANCE_UUID"])
    .drop_duplicates(subset="INSTANCE_UUID", keep="first")
)

distribution = {"instance": [], "cpu": [], "mem": [], "disk": []}
for instance, raw_properties in zip(first_event_per_instance["INSTANCE_UUID"],
                                    first_event_per_instance["PROPERTIES"]):
    # PROPERTIES uses single quotes; swap them for double quotes so it parses as JSON
    line_data = json.loads(raw_properties.replace("'", '"'))
    distribution["instance"].append(instance)
    distribution["cpu"].append(line_data['vcpus'])
    distribution["mem"].append(line_data['memory_mb']/1024)  # memory_mb scaled down by 1024
    distribution["disk"].append(line_data['root_gb'])

distribution_df = pd.DataFrame(distribution)
distribution_df.head()

Then, we convert the list of libvirt events to a DataFrame with VM start and stop timestamp columns.

In [None]:
# Column-oriented accumulator for VM sessions: one independent, initially
# empty list per output column, filled by register_vm_session.
vm_with_start_stop = {
    column: []
    for column in ('instance', 'cpu', 'mem', 'disk', 'start', 'stop')
}

def register_vm_session(instance, start, stop):
    """Append one VM session to the ``vm_with_start_stop`` accumulator.

    The cpu, mem and disk values are copied from the first row of
    ``distribution_df`` whose ``instance`` column equals ``instance``.

    Args:
        instance: identifier looked up in ``distribution_df['instance']``.
        start: value appended to the ``start`` column.
        stop: value appended to the ``stop`` column.

    Raises:
        IndexError: if ``distribution_df`` has no row for ``instance``.
    """
    vm_with_start_stop['instance'].append(instance)
    # Select the matching configuration rows once and reuse them for every column
    matching_rows = distribution_df.loc[distribution_df['instance'] == instance]
    for column in ('cpu', 'mem', 'disk'):
        vm_with_start_stop[column].append(matching_rows[column].iloc[0])
    vm_with_start_stop['start'].append(start)
    vm_with_start_stop['stop'].append(stop)

# Rebuild VM sessions by pairing, for each instance, a start event with the
# stop event that follows it (events are read in file order).
start_events = ('compute_start_instance', 'compute__do_build_and_run_instance')
stop_events = ('compute_stop_instance', 'compute_terminate_instance')

# Sessions still open after an instance's last event are closed at the
# largest START_SEC found in the whole trace.
max_val = int(instance_events_df['START_SEC'].max())

# groupby(sort=False) yields instances in order of first appearance and keeps
# each instance's rows in file order, without re-filtering the full frame for
# every instance.
for instance, sub_pd in instance_events_df.groupby('INSTANCE_UUID', sort=False):

    start, stop = (None, None)
    first_loop = True  # True until a first start/stop pair has been handled for this instance
    for event, event_sec in zip(sub_pd['EVENT'], sub_pd['START_SEC']):

        if event in start_events:
            start = int(event_sec)

        if event in stop_events:
            stop = int(event_sec)

        if first_loop and (start is None) and (stop is not None):  # VM was active before the beginning of dataset
            start = 0
            register_vm_session(instance, start, stop)
            start, stop = (None, None)
            first_loop = False

        if (stop is not None) and (start is None):  # Disregard multiple shutdowns instructions
            stop = None

        if (start is not None) and (stop is not None):
            if start < stop:
                register_vm_session(instance, start, stop)
            else:
                print('Unordered start/stop encountered on', instance)
            start, stop = (None, None)
            first_loop = False

    # A start without a later stop: the session runs until the end of the trace
    if (start is not None) and (stop is None):
        stop = max_val
        register_vm_session(instance, start, stop)

vm_df = pd.DataFrame(vm_with_start_stop)
vm_df.head()

# Display distribution information

We use analyserlib to display the distribution of configuration options in the Chameleon dataset.

In [None]:
# Sampling parameters handed to analyserlib.
# NOTE(review): begin/end are left as None; presumably analyserlib then derives
# the bounds from the data — confirm against the library.
timestamp_begin, timestamp_end = None, None
timestamp_step = 3600 # 1 hour

df_cpu, df_mem = da.get_cpu_and_mem_average_distribution(
    vm_df,
    timestamp_begin=timestamp_begin,
    timestamp_end=timestamp_end,
    timestamp_step=timestamp_step,
    # names of the configuration columns in vm_df
    col_flavor_cpu='cpu',
    col_flavor_mem='mem',
    # names of the session boundary columns in vm_df
    col_vm_created='start',
    col_vm_deleted='stop',
)

In [None]:
# Show the CPU distribution returned by analyserlib, preceded by a caption line
print("CPU distribution observed:", df_cpu, sep="\n")

In [None]:
# Show the memory distribution returned by analyserlib, preceded by a caption line
print("Memory distribution observed:", df_mem, sep="\n")

The VM distribution analysis is now complete. We write the associated distribution scenario.

In [None]:
# Write the observed distributions as a scenario file, refusing to overwrite
# a file that is already present at that path.
output_file="scenario-vm-distribution.yml"
if not os.path.exists(output_file):
    da.convert_distribution_to_scenario(df_cpu, df_mem,
                                        col_flavor_cpu='cpu', col_flavor_mem='mem', # column names in your dataset
                                        output_file=output_file)
    print("Scenario written to", output_file)
else:
    # Keep the existing file untouched; delete it manually to regenerate
    print("File already exists! Aborting")

In [None]:
# Dump the scenario file back to the cell output for a quick visual check
print("Reading generated distribution as yaml file")
with open(output_file) as scenario_file:
    scenario_text = scenario_file.read()
print(scenario_text)