# Trace Generator
This notebook takes data from Google's Borg traces and converts it into a format accepted by the trace simulator. The input is a CSV of job events organized by job (collection) id and time. The events for each job are consolidated into a single event in the output trace.

## Setup

In [42]:
import numpy as np
import pandas as pd

In [43]:
# Event type codes compared against the `type` column of the input CSV
JOB_SUBMIT, JOB_QUEUE = 0, 1
JOB_SCHEDULE = 3
JOB_FINISH = 6

# Load the job events from "<filename>.csv"; `df` is a short alias for the same frame
filename = "google-borg-small1"
df = dataframe = pd.read_csv(f"{filename}.csv")

# Columns of the output trace, plus an empty frame with those columns
trace_cols = [
    'id', 'priority', 'scheduling_class',
    'submission_time', 'schedule_time', 'deadline', 'exec_time',
]
trace_df = pd.DataFrame([], columns=trace_cols)


## Merge collection events into single trace event

In [44]:
def _first_event_time(events, event_type):
    """Return the `time` of the first row in `events` whose `type` equals
    `event_type`, or None when the job has no such event."""
    times = events.loc[events['type'] == event_type, 'time']
    return None if times.empty else times.iloc[0]


# Collect one record per job and build the frame once at the end.
# (DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# is quadratic; rebuilding from scratch also makes this cell safe to re-run.)
records = []

# sort=False keeps jobs in order of first appearance, like Series.unique()
for job_id, events in df.groupby('collection_id', sort=False):
    # Jobs can either be submitted or queued; prefer the submit event
    submission_time = _first_event_time(events, JOB_SUBMIT)
    if submission_time is None:
        submission_time = _first_event_time(events, JOB_QUEUE)
    if submission_time is None:
        print("skipping job: ", job_id)
        print(events)
        continue

    # A job that was never scheduled or never finished has no execution
    # window; skip it instead of failing on an empty selection.
    schedule_time = _first_event_time(events, JOB_SCHEDULE)
    finish_time = _first_event_time(events, JOB_FINISH)
    if schedule_time is None or finish_time is None:
        print("skipping job (no schedule/finish event): ", job_id)
        print(events)
        continue

    # Rest of the fields are the same for all jobs
    records.append({
        'id': job_id,
        # Index the column first so the value keeps the column's dtype
        # (a whole-row Series would upcast mixed dtypes).
        'priority': events['priority'].iloc[0],
        'scheduling_class': events['scheduling_class'].iloc[0],
        'submission_time': submission_time,
        'schedule_time': schedule_time,
        'deadline': finish_time,
        # TODO: factor in CPUs and resource usage into exec time. Most likely will end up
        # as cpu_cycles instead of time
        'exec_time': finish_time - schedule_time,
    })

trace_df = pd.DataFrame(records, columns=trace_cols)


In [45]:
# Preview the merged trace (one row per job)
trace_df

Unnamed: 0,id,priority,scheduling_class,submission_time,schedule_time,deadline,exec_time
0,375013460953,117,2,2022521137,2081569691,2368934175,287364484
1,375019419843,119,2,3146197519,3146269400,4081906439,935637039
2,375038054232,115,2,8979517264,9922247474,9955267294,33019820
3,375047148516,25,2,11928999375,11929257670,12214142968,284885298
4,375092499097,117,2,34243733916,34271266690,34589031820,317765130
...,...,...,...,...,...,...,...
995,400415463180,115,2,2668569723893,2668575491098,2668745370029,169878931
996,400425219739,117,2,2670515580305,2670607755554,2670959228445,351472891
997,400434516106,117,2,2671848875736,2671899745624,2672201780797,302035173
998,400435402429,115,2,2672323516786,2673259084153,2673364939386,105855233


## Write trace dataframe to CSV

In [46]:
# Write the merged trace to "<filename>-trace.csv" (the index is written too)
trace_df.to_csv(f"{filename}-trace.csv")