In [10]:
import os
import numpy as np
import pandas as pd

# Column names for the headerless VM table CSV, listed in file order.
vmtable_columns = [
    "vm_id",
    "subscription_id",
    "deployment_id",
    "timestamp_vm_created",
    "timestamp_vm_deleted",
    "max_cpu",
    "avg_cpu",
    "p95_max_cpu",
    "vm_category",
    "vm_virtual_core_count_bucket",
    "vm_memory_gb_bucket",
]

# Read the raw table (the file carries no header row) and name its columns.
vmtable = pd.read_csv("trace_data/vmtable/vmtable.csv", header=None)
vmtable.columns = vmtable_columns
vmtable.head()

Unnamed: 0,vm_id,subscription_id,deployment_id,timestamp_vm_created,timestamp_vm_deleted,max_cpu,avg_cpu,p95_max_cpu,vm_category,vm_virtual_core_count_bucket,vm_memory_gb_bucket
0,71fJw0x+SDRdAxKPwLyHZhTgQpYw2afS6tjJhfT6kHnmLH...,GB6uQC1NSArW5n+TtOybL7GQ1yByjuWtZnsj+5QccZ525R...,2sh/ZjaYdfpslv4iYBfNzFe4rs982kHVvNGJGeQ8MIBCDr...,558300,1673700,91.776885,0.728879,20.75963,Delay-insensitive,8,32
1,rKggHO/04j31UFy65mDTwtjdMQL/G03xWfl3xGeiilB4/W...,ub4ty8ygwOECrIz7eaZ/9hDwnCsERvZ3nJJ03sDSpD85et...,+ZraIDUNaWYDZMBiBtZm7xSjr+j3zcHGjup1+wyKxHFmyJ...,424500,425400,37.879261,3.325358,37.879261,Unknown,4,32
2,YrR8gPtBmfNaOdnNEW5If1SdTqQgGQHEnLHGPjySt53bKW...,9LrdYRcUfGbmL2fFfLR/JUg2OTkjGRe3iluwIhDRPnPDPa...,GEyIElfPSFupze8T+T1niQMepeqG88VpLNuxUMyIDbz8VF...,1133100,1133700,0.304368,0.220553,0.304368,Unknown,4,32
3,xzQ++JF1UAkh70CDhmzkiOo+DQn+E2TLErCFKEmSswv1pl...,0XnZZ8sMN5HY+Yg+0dykYB5oenlgsrCpzpgFSvn/MX42Ze...,7aCQS6fPUw9rwCPiqvghk/WCEbMV3KgNJjA+sssdfY5Ybl...,0,2591400,98.573424,30.340054,98.212503,Interactive,2,4
4,vZEivnhabRmImDr+JqKqZnpIM3WxtypwoxjfjnklR/idyR...,HUGaZ+piPP4eHjycCBki2yq0raJywdzrVuriR6nQceH3hA...,/s/D5VtTQDxyS6wq7N/VQAMczx61Ny1Ut3a3iFmDSOCXxp...,228300,229800,82.581449,13.876299,82.581449,Unknown,2,4


In [15]:
# Row label of the smallest creation timestamp among VMs whose creation
# timestamp is not 0.
created_after_t0 = vmtable["timestamp_vm_created"].ne(0)
vmtable.loc[created_after_t0, "timestamp_vm_created"].idxmin()

np.int64(14177)

In [18]:
# Show the full record of the VM with the smallest non-zero creation timestamp.
# The row label is derived from the data instead of being pasted from another
# cell's output, so this cell stays correct if the table or its index changes.
nonzero_created = vmtable.loc[vmtable["timestamp_vm_created"] != 0, "timestamp_vm_created"]
vmtable.loc[nonzero_created.idxmin()]

vm_id                           UvzMIKkIvQK682qHAxTk45QalDO2KcmbW9NrG8TnjysfzV...
subscription_id                 wA+NjFBhsKDx+8aBOS/6S1q0poOnhIjD8c8SswoR9isthg...
deployment_id                   0Kj50+yzaaJ1lVYZISpt8OkEnAiE1Li8HaUSAI3OL/J6C+...
timestamp_vm_created                                                          300
timestamp_vm_deleted                                                          600
max_cpu                                                                 47.481999
avg_cpu                                                                 21.448534
p95_max_cpu                                                             47.481999
vm_category                                                               Unknown
vm_virtual_core_count_bucket                                                    2
vm_memory_gb_bucket                                                             8
Name: 14177, dtype: object

In [17]:
# Untruncated vm_id of row 14177 (the row label returned by the idxmin lookup).
vmtable.at[14177, "vm_id"]

'UvzMIKkIvQK682qHAxTk45QalDO2KcmbW9NrG8TnjysfzVem6Ru7JrfeX8yWvd+B'

In [3]:
# Fraction of VMs whose creation timestamp is at or below 0.
vmtable["timestamp_vm_created"].le(0).mean()

np.float64(0.08433016217852549)

In [8]:
# Fraction of VMs whose deletion timestamp is at or below 3 days
# (3 * 86400 seconds).
three_days_in_seconds = 3 * 86400
vmtable["timestamp_vm_deleted"].le(three_days_in_seconds).mean()

np.float64(0.08545275394836226)

In [6]:
# Fraction of VMs whose category is exactly "Interactive".
is_interactive = vmtable["vm_category"] == "Interactive"
is_interactive.mean()

np.float64(0.029113931564193996)

In [7]:
# Fraction of VMs that are both deleted at or before the 3-day mark and
# categorised as "Interactive".
deleted_within_3d = vmtable["timestamp_vm_deleted"].le(3 * 86400)
is_interactive = vmtable["vm_category"] == "Interactive"
(deleted_within_3d & is_interactive).mean()

np.float64(0.0)

In [9]:
# Fraction of VMs that are deleted at or before the 3-day mark and are NOT
# categorised as "Interactive".
deleted_within_3d = vmtable["timestamp_vm_deleted"].le(3 * 86400)
is_not_interactive = vmtable["vm_category"] != "Interactive"
(deleted_within_3d & is_not_interactive).mean()

np.float64(0.08545275394836226)

In [None]:
# Keep only the VMs whose creation timestamp falls before the n-day mark.
# NOTE(review): this cell rebinds `vmtable`; re-running it without reloading
# the table first will filter/sample the already-sampled frame.
ndays = 3
nseconds = ndays * 86400
# Non-inplace reset_index returns a fresh frame, so the column assignment
# below does not write into a filtered slice (SettingWithCopyWarning).
vmtable = (
    vmtable.loc[vmtable["timestamp_vm_created"] < nseconds]
    .reset_index(drop=True)
)

# Add a numeric vCPU column derived from the core-count bucket label.
# The open-ended ">24" bucket is counted as 24 cores.
vcpu_by_bucket = {"2": 2, "4": 4, "8": 8, "12": 12, "24": 24, ">24": 24}
# astype(str) keeps the lookup working even if pandas parsed the bucket
# column as integers instead of strings.
vmtable["vCPU"] = vmtable["vm_virtual_core_count_bucket"].astype(str).map(vcpu_by_bucket)
# Fail loudly instead of silently carrying NaN vCPU values downstream.
unmapped_buckets = vmtable.loc[vmtable["vCPU"].isna(), "vm_virtual_core_count_bucket"].unique()
assert len(unmapped_buckets) == 0, f"unmapped core-count buckets: {unmapped_buckets}"

# Sample only a subset of the VMs.
sample_n = 2000
random_seed = 42

# Adjust the sampling weights (default weight is 1).
weights = np.ones(len(vmtable))
is_started_on_0 = vmtable["timestamp_vm_created"] == 0
is_on_demand_VM = vmtable["vm_category"].str.lower().eq("interactive")
# On-demand VMs that start at t=0 get a low weight.
weights[(is_started_on_0 & is_on_demand_VM).to_numpy()] = 0.1
# On-demand VMs that do not start at t=0 get a high weight.
weights[((~is_started_on_0) & is_on_demand_VM).to_numpy()] = 10

vmtable = vmtable.sample(
    n=sample_n,
    replace=False,
    weights=weights,
    random_state=random_seed,
    ignore_index=True,
)

# Compute the percentages outside the f-strings: reusing double quotes inside
# an f-string replacement field is a syntax error before Python 3.12.
started_on_0_pct = (vmtable["timestamp_vm_created"] == 0).mean().item() * 100
on_demand_pct = vmtable["vm_category"].str.lower().eq("interactive").mean().item() * 100

print("sample VM statistics")
print(f"t=0에서 시작하는 VM 비율: {started_on_0_pct:.2f}%")
print(f"on-demand VM 비율: {on_demand_pct:.2f}%")
vmtable

sample VM statistics
t=0에서 시작하는 VM 비율: 69.85%
on-demand VM 비율: 12.75%


Unnamed: 0,vm_id,subscription_id,deployment_id,timestamp_vm_created,timestamp_vm_deleted,max_cpu,avg_cpu,p95_max_cpu,vm_category,vm_virtual_core_count_bucket,vm_memory_gb_bucket,vCPU
0,beyIgmrwrFUOt/P2n0UAncEA8BMU9MaH43ftA8RC1l8Ex3...,FljdJZ7hWYgtnvhdvpOLCI5GkvaO2Doc7vRqxwikLuDT2J...,RTmUrQcFulvMzmkZf3p0PpI1nymq8lhGbsOsG0CmklYBdE...,0,106800,96.165657,4.663349,62.781278,Unknown,2,8,2
1,YkGNEW9KIQxxK3oFbKGAdMKZ17x/nN6ExoG1Df15rAG+V2...,Fjsxy24DC5ExGRbSolHQNBAJh10g0pkCefSKC6Ox6SS1mh...,5DUqD/SlhPqdLfZkJb8tkiypmuCEoNdTZYQJnP0vagqm1c...,0,2591400,98.247278,4.374038,46.226845,Interactive,2,8,2
2,YQsrgFpCZATla+gaEiCGSgmjyzf8FqgFjHqtH4Ak8RFHjK...,ipxXXsn0bCXkjmJOZtlV6bilgztEtGEBUXTFv0z3wY6BF8...,zRkZ3zlbYh4wIWdlCcGVukwKRgQ5GClzlwQXgwO7J6ThmX...,40500,41400,49.397398,11.682085,49.397398,Unknown,2,8,2
3,nS5hSIkSENlqCDYFRFw2Psd+iAg6V8ckOHOmueWl/5+732...,SEjRM1kadaMGVu2dHxeF42Um7ji29emJ8/0FBK5XqTXvK6...,sZU82xgr/oPZ1dDSla9gpK1pVx/69WMUfo+2LX1SlTSK62...,0,507300,66.672558,6.180197,33.690730,Unknown,4,32,4
4,gAVIQd07iF9fLUcu0Zxur3a52DatFnIqZN9XsWg7mpEYKG...,mgvu8t+NettJI1yyn2NWJdEo+NvXBOZMi9N9+wWGrAI9Kb...,JFIj7efWP7GnSpm20fAUMiNz2Ia7E63b+txBdV+kkl6u86...,7200,10200,97.649245,43.871106,97.649245,Unknown,2,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...
1995,Q+Vnp8jS+1C3IPrCm2zOObLFOFuazEQO4BHZSh2I7JVkLz...,Od+648kXhgKnQXJ5kd1D/qV2ZArdlHso2obf1ao5awu1d9...,bOpL1j/heogv9zmaqHpuqvzLP0G+95TJhpmsChjeMy/3eJ...,0,35100,97.082209,9.016426,93.482739,Unknown,2,4,2
1996,UUyTWNyD+caBWX5bFXAFXt2uRwW8iDOx0yu4RvdUqGt80n...,gcon9q5Xnb6gUKSDuuKEPFoCaCW2deAmSX4O2jOE6rJad/...,UUbZEnY0WXAjdi0NBCvmCkXAvOCCAZ6XHRg5ki5i+4DQ5k...,29700,34200,82.790247,16.717243,82.790247,Unknown,2,8,2
1997,6RvYUyfAZXI93cpiOERNJguk8z1CRxzqXbs/sq6B2qDhNl...,LMFVo44E78wxMvPNvBPmeMv9CSQnU4HLjsTiwoQOg++HxW...,Ssk+Vwc6Py9GerihTQCR5jtRlBfWB9vrfKtW9IF6z386SP...,28800,29100,0.060531,0.023656,0.060531,Unknown,4,32,4
1998,bxpNz5RHl6+dh7RNZQxq9nzV+io2uuDfICeaZt7RlcGW/A...,dZicRxSsj49MaFhtMt88SLEamULu1hP1ABfIbAWKjcp1+4...,ZI8BNoCcZlNMqYucv2Z/4+bjmqJHU02D+VZwn6e/s/CfV3...,0,2591400,75.188811,0.773046,8.985911,Delay-insensitive,8,32,8


In [None]:
from tqdm.notebook import trange

cpu_readings_path = "trace_data/vm_cpu_readings/"
fname = "vm_cpu_readings-file-{}-of-195.csv.gz"
sample_vm_id = vmtable['vm_id']

# Collect the CPU readings of the sampled VMs, one readings file at a time.
# The number of directory entries is used as the number of files to visit;
# files are addressed by their 1-based index via `fname`.
sample_vm_data = []
for i in trange(1, len(os.listdir(cpu_readings_path)) + 1):
    file_path = os.path.join(cpu_readings_path, fname.format(i))
    df = pd.read_csv(file_path, header=None, compression="gzip")
    df.columns = ["timestamp", "vm_id", "min_cpu", "max_cpu", "avg_cpu"]

    # Boolean filtering already yields a new frame; no per-file index reset is
    # needed because the final concat uses ignore_index=True.
    sample_vm_data.append(df.loc[df["vm_id"].isin(sample_vm_id)])

    # Stop once this file reaches the end of the time window. The check uses
    # the whole file rather than only the sampled rows, so the early exit does
    # not depend on a sampled VM happening to have a reading past the cutoff.
    # NOTE(review): like the original early exit, this assumes the files are
    # ordered by timestamp - confirm against the dataset layout.
    reached_window_end = df["timestamp"].max() >= nseconds
    del df
    if reached_window_end:
        break

sample_vm_df = pd.concat(sample_vm_data, axis=0, ignore_index=True)
# Copy so the result is an independent frame that can be modified later
# without SettingWithCopyWarning.
sample_vm_df = sample_vm_df.loc[sample_vm_df["timestamp"] < nseconds].copy()
sample_vm_df

  0%|          | 0/24 [00:00<?, ?it/s]

Unnamed: 0,timestamp,vm_id,min_cpu,max_cpu,avg_cpu
0,0,gvLGWcpJytecikJmCgPyMW1ro2Qt+pvDY7Fj3dxhXzmFdi...,4.560651,11.479948,6.898977
1,0,LlWsN1b+Ls5jt3q0x0fCoh2WtML9kTzqHWvA+fS8/Pazre...,0.466489,0.618680,0.505132
2,0,we9LGwHQtzRXXDSn66G4xda++VOxftsmzJft0OwcGPhpzz...,4.401925,29.059240,9.581130
3,0,/4ERLqziC+E2H/5AcorXTYRMcS5KouBHPQTLGnMeoBKx7n...,5.373990,49.986778,12.907325
4,0,+i4lJZoTtcz0Wtws41sWpyJjP9K1eAq/wJrwiIfXrOS+lN...,0.244150,0.493670,0.351295
...,...,...,...,...,...
391861,86100,xgeQ3kSXguRKFFLu6r04fLyRGrGyGEoSssRiYgPIKn/O9c...,2.764120,11.527814,4.164978
391862,86100,M/xM7tVslBFoME8cYbhRGCpyfoHc2T6NT7ihNOH0F0uEhg...,1.235171,6.194472,2.835712
391863,86100,K91shwmGOT65nkiuZ/41chtOTS3mrUHlZO0a1WjimeIgjY...,5.967137,8.899772,7.304947
391864,86100,LKjuTANhsLfJ/dmdJ9088bPhDNArPYeVQKuq6OHwJ9m4XT...,0.948806,6.566265,1.808365


In [3]:
# Aggregate the readings per VM and per hour (timestamp // 3600).
# .assign() adds the hour bucket on a new frame instead of writing a column
# into the filtered `sample_vm_df` in place.
# NOTE(review): this cell rebinds `sample_vm_df` to the aggregated frame, so
# it cannot be re-run without re-running the cell that builds the raw readings.
hourly_usage = (
    sample_vm_df
    .assign(hour=sample_vm_df["timestamp"] // 3600)
    .groupby(["vm_id", "hour"], as_index=False)
    .agg(
        min_cpu=("min_cpu", "min"),
        avg_cpu=("avg_cpu", "mean"),
        max_cpu=("max_cpu", "max"),
    )
)

# Join the vCPU count and VM category of each VM.
# validate="many_to_one" raises if vm_id is duplicated in `vmtable`, which
# would otherwise silently multiply the hourly rows.
hourly_usage = hourly_usage.merge(
    vmtable[["vm_id", "vCPU", "vm_category"]],
    on="vm_id",
    how="left",
    validate="many_to_one",
)

# Convert the CPU percentages into core usage: percent * vCPU / 100.
sample_vm_df = hourly_usage.assign(
    min_core_usage=lambda d: d["min_cpu"] * d["vCPU"] / 100,
    avg_core_usage=lambda d: d["avg_cpu"] * d["vCPU"] / 100,
    max_core_usage=lambda d: d["max_cpu"] * d["vCPU"] / 100,
)

sample_vm_df

Unnamed: 0,vm_id,hour,min_cpu,avg_cpu,max_cpu,vCPU,vm_category,min_core_usage,avg_core_usage,max_core_usage
0,+1CC6EC+vHdQR7jJCNztKCoocDCBe4i10o4/L2VIickBBi...,0,2.676840,4.676953,34.496406,2,Delay-insensitive,0.053537,0.093539,0.689928
1,+1CC6EC+vHdQR7jJCNztKCoocDCBe4i10o4/L2VIickBBi...,1,2.640696,4.641649,33.071980,2,Delay-insensitive,0.052814,0.092833,0.661440
2,+1CC6EC+vHdQR7jJCNztKCoocDCBe4i10o4/L2VIickBBi...,2,2.630494,4.638883,35.553100,2,Delay-insensitive,0.052610,0.092778,0.711062
3,+1CC6EC+vHdQR7jJCNztKCoocDCBe4i10o4/L2VIickBBi...,3,2.661216,4.652888,33.991978,2,Delay-insensitive,0.053224,0.093058,0.679840
4,+1CC6EC+vHdQR7jJCNztKCoocDCBe4i10o4/L2VIickBBi...,4,2.774083,5.593173,87.126481,2,Delay-insensitive,0.055482,0.111863,1.742530
...,...,...,...,...,...,...,...,...,...,...
33532,zzXejkted3tPChwkEWSRnsqUt9+KyZIBav6AVdUb5A4hXB...,19,5.816900,9.492391,56.868283,2,Interactive,0.116338,0.189848,1.137366
33533,zzXejkted3tPChwkEWSRnsqUt9+KyZIBav6AVdUb5A4hXB...,20,5.774605,9.548711,59.471706,2,Interactive,0.115492,0.190974,1.189434
33534,zzXejkted3tPChwkEWSRnsqUt9+KyZIBav6AVdUb5A4hXB...,21,5.907553,9.556688,61.024761,2,Interactive,0.118151,0.191134,1.220495
33535,zzXejkted3tPChwkEWSRnsqUt9+KyZIBav6AVdUb5A4hXB...,22,5.986690,10.712679,64.268974,2,Interactive,0.119734,0.214254,1.285379


In [4]:
# Persist sample_vm_df to CSV, omitting the row index.
output_csv = "sample_vm_data.csv"
sample_vm_df.to_csv(output_csv, index=False)