# Setup

In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [28]:
procs_added = pd.read_csv("procs_added.txt", index_col=None, names=["tick", "machineID", "procType", "sla", "actualComp", "migrated"])
procs_current = pd.read_csv("procs_current.txt", index_col=None, names=["tick", "machineID", "isActive", "sla", "actualComp", "compDone"])
procs_done = pd.read_csv("procs_done.txt", index_col=None, names=["tick", "machineID", "procType", "sla", "ticksPassed", "actualComp"])
procs_killed = pd.read_csv("procs_killed.txt", index_col=None, names=["tick", "machineID", "sla", "compDone", "memUsed"])

util_metrics = pd.read_csv("usage.txt", index_col=None, names=["tick", "machineID", "maxTicksPassedToSlaRatio", "memUsage", "qlen", "ticksInQ", "ticksLeftOver"])

# prepare
procs_current["compLeft"] = procs_current["sla"] - procs_current["compDone"]
procs_added["compLeft"] = procs_added["sla"]
all_procs = pd.concat([procs_current[['tick', "compLeft"]], procs_added[["tick", "compLeft"]]])
load_num_procs_per_tick = procs_added[["tick"]].groupby("tick").size().reset_index(name='numProcsCurrent')
load_work_per_tick = procs_added.groupby("tick").sum().reset_index()

procs_done['timePassedAsPct'] = (100 * procs_done["ticksPassed"]) / procs_done["sla"]

procs_late = procs_done.where(procs_done["timePassedAsPct"] > 100.0001).dropna().reset_index(drop=True)
procs_late = procs_late.where(procs_late["ticksPassed"] > procs_late["actualComp"]).dropna().reset_index(drop=True)

proc_timings = pd.merge(procs_done, load_num_procs_per_tick, on='tick', how='left')

ticks_left = util_metrics.groupby("tick")["ticksLeftOver"].agg(['min', 'max']).reset_index()
ticks_left["range"] = ticks_left["max"] - ticks_left["min"]


In [42]:
# ticks running while proc becoming late
procs_late["tickStarted"] = procs_late["tick"] - np.floor(procs_late["ticksPassed"])
late_procs_running = procs_late[["tickStarted", "tick", "sla"]]
late_procs_running = late_procs_running.rename(columns={"tickStarted": "tickStarted", "tick": "tickDone", "sla":"sla"})

total_ticks_left_over_per_tick = util_metrics[["tick", "ticksLeftOver"]].groupby("tick").sum().reset_index()

merged_df = pd.merge(late_procs_running, total_ticks_left_over_per_tick, how='cross')
filtered_rows = merged_df[(merged_df['tick'] >= merged_df['tickStarted']) & (merged_df['tick'] <= merged_df['tickDone'])]
sum_counts = filtered_rows.groupby(['tickStarted', 'tickDone']).agg({'ticksLeftOver': 'sum'}).reset_index()
result_df = pd.merge(late_procs_running, sum_counts, on=['tickStarted', 'tickDone'], how='left')

result_df.sort_values("ticksLeftOver", ascending=False)


Unnamed: 0,tickStarted,tickDone,sla,ticksLeftOver
166,547.0,638.0,35.680206,266.525248
202,771.0,775.0,4.071756,29.392433
200,771.0,775.0,4.637166,29.392433
103,492.0,496.0,3.560344,27.335371
102,492.0,496.0,3.425912,27.335371
...,...,...,...,...
29,129.0,130.0,0.998717,3.074105
184,729.0,730.0,1.283739,3.054908
219,823.0,826.0,3.460138,2.969951
229,871.0,873.0,2.631032,2.910953


In [74]:
# ticks running while proc becoming late
procs_late["tickStarted"] = procs_late["tick"] - np.floor(procs_late["ticksPassed"])
late_procs_running = procs_late[["tickStarted", "tick", "sla", "machineID"]]
late_procs_running = late_procs_running.rename(columns={"tickStarted": "tickStarted", "tick": "tickDone", "sla":"sla", "machineID": "machineIDRunning"})
late_procs_running.machineIDRunning = late_procs_running.machineIDRunning.astype(int)

relevant_utils = util_metrics[["tick", "ticksLeftOver", "machineID"]]

merged_df = pd.merge(late_procs_running, relevant_utils, how='cross')
filtered_rows = merged_df[(merged_df['tick'] >= merged_df['tickStarted']) & (merged_df['tick'] <= merged_df['tickDone']) & (merged_df["machineID"] == merged_df["machineIDRunning"])]
sum_counts = filtered_rows.groupby(['tickStarted', 'tickDone']).agg({'ticksLeftOver': 'sum'}).reset_index()
result_df = pd.merge(late_procs_running, sum_counts, on=['tickStarted', 'tickDone'], how='left')

print(filtered_rows.where((filtered_rows["sla"] > 4.462) & (filtered_rows["sla"] < 4.464)).dropna())

result_df.where(result_df["ticksLeftOver"] > 0).sort_values("ticksLeftOver", ascending=False).dropna()

       tickStarted  tickDone       sla  machineIDRunning  tick  ticksLeftOver  \
24120         16.0      20.0  4.463281               6.0  16.0       0.000000   
24130         16.0      20.0  4.463281               6.0  17.0       0.000000   
24138         16.0      20.0  4.463281               6.0  18.0       0.000000   
24151         16.0      20.0  4.463281               6.0  19.0       0.000000   
24157         16.0      20.0  4.463281               6.0  20.0       0.152562   

       machineID  
24120        6.0  
24130        6.0  
24138        6.0  
24151        6.0  
24157        6.0  


Unnamed: 0,tickStarted,tickDone,sla,machineIDRunning,ticksLeftOver
181,713.0,717.0,4.299078,1.0,1.238347
180,713.0,717.0,4.288896,7.0,1.238347
103,492.0,496.0,3.560344,5.0,1.047262
102,492.0,496.0,3.425912,1.0,1.047262
77,337.0,342.0,3.533154,6.0,0.996919
...,...,...,...,...,...
262,965.0,969.0,4.413695,0.0,0.023305
169,664.0,668.0,4.950041,3.0,0.018812
207,798.0,802.0,4.968095,7.0,0.015653
66,293.0,295.0,2.670878,3.0,0.014716
