# Summary of Results and Algorithm:


## Read input

In [1]:
# Imports
import pandas as pd
import numpy as np
# Read data into dataframe
df = pd.read_csv('./google-cluster-data-1.csv',sep=' ')

In [2]:
# Print the length of the df
print("length of df: ", len(df))

length of df:  3535029


Get CPU and MEM information

In [3]:
# Show amount of CPU for each task
df['NrmlTaskCores'].values

array([0.      , 0.      , 0.021875, ..., 0.      , 0.      , 0.      ])

In [4]:
# Save CPU amount, memory requirements, and task IDs
cpus = df['NrmlTaskCores'].values
mems = df['NrmlTaskMem'].values
task_ids = df['TaskID'].values

In [5]:
# Calculate and save execution times for two cases: mod 2 and mod 10
temp1 = df['TaskID'].values
ExecutionTime1 = []
for i in temp1:
    burstTime = (i%2+1)*300
    ExecutionTime1.append(burstTime)
    
temp2 = df['TaskID'].values
ExecutionTime2 = []
for i in temp2:
    burstTime = (i%10+1)*300
    ExecutionTime2.append(burstTime)

In [6]:
# Calculate and print total # of subtasks for two cases: mod 2 and mod 10
totalSubtask1 = 0
for i in ExecutionTime1:
    totalSubtask1 += i/300
print("Total Subtasks for setting 1:", int(totalSubtask1))

totalSubtask2 = 0
for i in ExecutionTime2:
    totalSubtask2 += i/300
    
print("Total Subtasks for setting 2:", int(totalSubtask2))

Total Subtasks for setting 1: 5298274
Total Subtasks for setting 2: 19472018


In [7]:
# Show dataframe
df.head()

Unnamed: 0,Time,ParentID,TaskID,JobType,NrmlTaskCores,NrmlTaskMem,Unnamed: 6
0,90000,757745334,1488529826,0,0.0,0.03113,
1,90000,975992247,1488529821,0,0.0,0.0,
2,90000,1468458091,1488529832,1,0.021875,0.002353,
3,90000,1460281235,1488529840,0,0.0,0.0,
4,90000,1164728954,1488529835,0,0.003125,0.001638,


In [8]:
# Save calculated execution times to columns in the dataframe
df['executionTime1'] = ExecutionTime1
df['executionTime2'] = ExecutionTime2

### Setting 1

In [19]:
# Calculates energy consumption for one vm
def getEnergyConsumption(VMCPUUsage, totalCPUCount, threshold):
    
    #here we want to calculate the cpu usage rate per individual VM so i look at the individual
    #cpu usage rate and divide it over 7/100 or 25/100
    cpuUsageRate = (totalCPUCount- VMCPUUsage)/(totalCPUCount)
    
    # Based on the equations provided
    static = 0
    if cpuUsageRate > 0:
        static = 5 
    else:
        static = 0
        
    dynamic = 0
    if cpuUsageRate < threshold:
        dynamic = cpuUsageRate*100
    else:
        dynamic = threshold * 100 + (((cpuUsageRate - threshold)**2) * 200)

    return static + dynamic

In [10]:
# Calculates cost for one vm by mapping possible time values to cost
def getCost(time, power):
    costChart = [0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.6, 0.6, 0.8, 0.8, 0.8, 0.8]
    mapping = [90000,91875,93750,95625,97500,99375,101250,103125,105000,106875,108750,110625]
    cost = 0
    for i in range(12):
        if time == mapping[i]:
            cost = costChart[i]
            break
        elif time < mapping[i]:
            cost = costChart[i-1]
        elif time > mapping[11]:
            cost = costChart[11]
    return cost*power
    

In [11]:
# Schedule all tasks in a way that minimizes the power consumption
def GreedyPower(input_data, nextQueue, threshhold, executionTime, cpuTotal, memTotal):
    length = len(input_data)
    nextQueue = []
    rejected =[]
    VMs = [[cpuTotal, memTotal] for i in range(100)]
    energyUsage = 0
    totalPower = 0
    totalCost = 0
    totalTurnaroundTime = 0
    
    # Iterates through the tasks
    for index, row in input_data.iterrows():
        cpu, mem = row['NrmlTaskCores'], row['NrmlTaskMem']
        task_id = row['TaskID']
        found_VM = False
        # Arbitrarely high value to compare the first energy value to 
        lowestPower = 234567898765
        lowestPowerIndex = -1
        VM_id = 0
        
        # Iterate through all of the vms
        for j in range(100):
            VM_id = j
            # Check whether this is even schedulable
            if VMs[VM_id][0] >= cpu and VMs[VM_id][1] >= mem:
                energy = getEnergyConsumption(VMs[VM_id][0], cpuTotal, threshhold)
                if energy < lowestPower:
                    lowestPower = energy
                    lowestPowerIndex = VM_id
                    
        # Is schedulable 
        if lowestPowerIndex != -1:
            cpu = row['NrmlTaskCores']
            mem = row['NrmlTaskMem']
            
            # Adjust the corresponding cpu and memory values for the VMs based on the allocated task
            VMs[lowestPowerIndex][0] = VMs[lowestPowerIndex][0] - cpu
            VMs[lowestPowerIndex][1] = VMs[lowestPowerIndex][1] - mem
            
            # Because we scheduled the task, update power, cost, and time
            totalPower += lowestPower
            totalCost += getCost(row['Time'], lowestPower)
            totalTurnaroundTime += row[executionTime]
            
        # If the task is rejected
        else:
            rejected.append(row['TaskID'])
            # Hard rejection of task by reducing the rest of the time for the subtasks to 0
            row[executionTime] = 0
                
        # Adjust remaining execution time for the overall task and append rest to nextQueue
        if row[executionTime] > 300:
            row[executionTime] -= 300
            nextQueue.append(row)
            
    return rejected, nextQueue, totalPower, totalCost, totalTurnaroundTime

In [12]:
def GreedyCost(input_data, nextQueue, threshhold, executionTime,cpuTotal, memTotal):
    length = len(input_data)
    nextQueue = []
    VMs = [[cpuTotal, memTotal] for i in range(100)]
    cost = 0
    rejected =[]
    totalPower = 0
    totalCost = 0
    totalTurnaroundTime = 0
    
    for index, row in input_data.iterrows():
        cpu, mem = row['NrmlTaskCores'], row['NrmlTaskMem']
        task_id = row['TaskID']
        found_VM = False
        lowestCost = float('inf')
        lowestCostIndex = -1
        VM_id = 0
        
        #iterate through all of the vms
        for j in range(100):
            VM_id = j
            #check whether this is even schedulable
            if VMs[VM_id][0] >= cpu and VMs[VM_id][1] >= mem:
                energy = getEnergyConsumption(VMs[VM_id][0], cpuTotal, threshhold)
                cost = getCost(row['Time'], energy)
                if cost < lowestCost:
                    lowestCost = cost
                    lowestCostIndex = VM_id
                    
        #is schedulable
        if lowestCostIndex != -1:
            cpu = row['NrmlTaskCores']
            mem = row['NrmlTaskMem']
            
            VMs[lowestCostIndex][0] = VMs[lowestCostIndex][0] - cpu
            VMs[lowestCostIndex][1] = VMs[lowestCostIndex][1] - mem
            
            # Because we scheduled the task, update power, cost, and time
            totalPower += getEnergyConsumption(VMs[VM_id][0], cpuTotal, threshhold)
            totalCost += lowestCost
            totalTurnaroundTime += row[executionTime]
        else:
            rejected.append(row['TaskID'])
            # Hard rejection of task by reducing the rest of the time for the subtasks to 0
            row[executionTime] = 0
                
        if row[executionTime] > 300:
            row[executionTime] -= 300
            nextQueue.append(row)
            
    return rejected, nextQueue, totalPower, totalCost, totalTurnaroundTime

In [23]:
# Schedule all tasks in a way that minimizes the turnover time
def GreedyTurnaroundTime(input_data, nextQueue, threshhold, executionTime,cpuTotal, memTotal):
    length = len(input_data)
    nextQueue = []
    VMs = [[cpuTotal, memTotal] for i in range(100)]
    energyUsage = 0
    rejected=[]
    totalPower = 0
    totalCost = 0
    totalTurnaroundTime = 0
    
    for index, row in input_data.iterrows():
        cpu, mem = row['NrmlTaskCores'], row['NrmlTaskMem']
        task_id = row['TaskID']
        found_VM = False
        index = -1
        VM_id = 0
        
        #iterate through all of the vms
        for j in range(100):
            VM_id = j
            #check whether this is even schedulable
            if VMs[VM_id][0] >= cpu and VMs[VM_id][1] >= mem:
                index = VM_id
                break
                    
        # Is schedulable 
        if index != -1:
            cpu = row['NrmlTaskCores']
            mem = row['NrmlTaskMem']
            
            VMs[index][0] = VMs[index][0] - cpu
            VMs[index][1] = VMs[index][1] - mem
            
            # Because we scheduled the task, update power, cost, and time
            power = getEnergyConsumption(VMs[index][0], cpuTotal, threshhold)
            totalPower += power
            totalCost += getCost(row['Time'], power)
            totalTurnaroundTime += row[executionTime]
        else:
            rejected.append(row['TaskID'])
            # Hard rejection of task by reducing the rest of the time for the subtasks to 0
            row[executionTime] = 0
                
        if row[executionTime] > 300:
            row[executionTime] -= 300
            nextQueue.append(row)
            
    return rejected, nextQueue, totalPower, totalCost, totalTurnaroundTime

In [None]:
def RR(input_data, nextQueue, threshhold, executionTime,cpuTotal, memTotal):
    rejected = []
    
    length = len(input_data)
    nextQueue = []
    VMs = [[cpuTotal, memTotal] for i in range(100)]
    energyUsage = 0
    totalPower = 0
    totalCost = 0
    totalTurnaroundTime = 0
    
    # Iterate through all of the tasks
    
    for index, row in input_data.iterrows():
        cpu, mem = row['NrmlTaskCores'], row['NrmlTaskMem']
        task_id = row['TaskID']
        # Beginning id for filling the VMs
        VM_id = 0
        # get the VM id we need
        # E.g., VM_id+1, +2,....
        # search next available VM. 
        # If found, update resource availability, otherwise, put task_id[i] in rejected
        # Iterate through all of the VMs
        for j in range(100):
            # find the available cpu and mem units based on VM_id
            cpu_vm = VMs[VM_id][0]
            mem_vm = VMs[VM_id][1] 
            # If a given VM has space, then remove the CPU and MEM units for the given task and go to the next task
            if cpu_vm >= cpu and mem_vm >= mem:
                # We can put task_id[i] into VM[VM_id]
                # update VM resource availability. 
                VMs[VM_id][0] = cpu_vm - cpu
                VMs[VM_id][1] = mem_vm - mem
                # Increase the VM_id so that the next VM can be checked
                VM_id = VM_id + 1
                # Reset VM_id back to 0 if it hits 100
                if VM_id == 100:
                    VM_id = 0
                # Because we scheduled the task, update power, cost, and time
                power = getEnergyConsumption(VMs[index][0], cpuTotal, threshhold)
                totalPower += power
                totalCost += getCost(row['Time'], power)
                totalTurnaroundTime += row[executionTime]
                # Skip to the next task since this one has been allocated
                break
            # If on the last VM and the task still hasn't been alloted to a VM, then put the task id in rejected
            elif j == 99:
                rejected.append(row['TaskID'])
                # Hard rejection of task by reducing the rest of the time for the subtasks to 0
                row[executionTime] = 0
            # Increase the VM_id so that the next VM can be checked
            VM_id = VM_id + 1
            # Reset VM_id back to 0 if it hits 100
            if VM_id == 100:
                VM_id = 0
                
        if row[executionTime] > 300:
            row[executionTime] -= 300
            nextQueue.append(row)


    # If none of the tasks were rejected, print 0
    if len(rejected) == 0:
        print('0')
        
    return rejected, nextQueue, totalPower, totalCost, totalTurnaroundTime

In [14]:
# Run greedy power for CPU = 7, Mem = 11
VMs = [[7, 11] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
print(len(setty))
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0

# Run task queue based on optimization for power
for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
    print(i)
    returnObj = GreedyPower(taskQueue, nextQueue, 0.5, 'executionTime1', 7, 11)
    rejected += returnObj[0]
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]

# Save rejected values
if len(rejected) > 0:
    np.save("taskReject_1_i", rejected)
np.save("VMs_1_i", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)

76
90000
90300
90600
90900
91200
91500
91800
92100
92400
92700
93000
93300
93600
93900
94200
94500
94800
95100
95400
95700
96000
96300
96600
96900
97200
97500
97800
98100
98400
98700
99000
99300
99600
99900
100200
100500
100800
101100
101400
101700
102000
102300
102600
102900
103200
103500
103800
104100
104400
104700
105000
105300
105600
105900
106200
106500
106800
107100
107400
107700
108000
108300
108600
108900
109200
109500
109800
110100
110400
110700
111000
111300
111600
111900
112200
112500
Total Power:  307953.97321428085
Total Cost:  246363.17857143344
Total Turnaround Time:  9228000.0


In [None]:
# Total Power:  307953.97321428085
# Total Cost:  246363.17857143344
# Total Turnaround Time:  9228000.0

In [1]:
# Run greedy cost for CPU = 7, Mem = 11
VMs = [[7, 11] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
print(len(setty))
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0
counter = 0

for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
    print(i)
    returnObj = GreedyCost(taskQueue, nextQueue, 0.5, 'executionTime1', 7, 11)
    rejected += returnObj[0]
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]

if len(rejected) > 0:
    np.save("taskReject_1_ii", rejected)
np.save("VMs_1_ii", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)

NameError: name 'df' is not defined

In [None]:
# Total Power:  316117.8571428556
# Total Cost:  246363.17857143344
# Total Turnaround Time:  9228000.0

In [24]:
# Run greedy turnaround time for CPU = 7, Mem = 11
VMs = [[7, 11] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
print(len(setty))
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0

for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
    print(i)
    returnObj = GreedyTurnaroundTime(taskQueue, nextQueue, 0.5, 'executionTime1', 7, 11)
    rejected += returnObj[0]
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    print(totalPower)
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]

if len(rejected) > 0:
    np.save("taskReject_1_iii", rejected)
np.save("VMs_1_iii", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)

76
90000
22919.535315688652
90300
1882001.7864316404
90600
3896461.45970202
90900
3849354.704361001
91200
3630417.778260625
91500
3483680.839883505
91800
3508891.2578921984
92100
3512003.656807987
92400
3715475.7514348985
92700
3687827.1780935014
93000
3836016.366669653
93300
3986372.742586695
93600
3916680.4265787667
93900
3915884.3158488926
94200
3347376.2738761185
94500
3638063.558593777
94800
3590613.673310199
95100
3664875.0304930625
95400
3855283.948382145
95700
3831571.150670221
96000
3852244.9473857833
96300
3836737.024513875
96600
3678393.8628429575
96900
3560987.0678416174
97200
3676764.65947903
97500
3440069.7801339724
97800
3513081.9381777253
98100
3804831.497130301
98400
3809878.8374524587
98700
3924381.4645247795
99000
4062223.5033482024
99300
3813711.607142932
99600
3765143.5520964745
99900
3737670.8363758526
100200
3753425.2754310686
100500
3819803.7830438423
100800
3647350.7141262633
101100
3862537.887874655
101400
3814387.9295279686
101700
3800782.7470506714
102000
36

In [None]:
# Total Power:  960296.1957509384
# Total Cost:  768236.9566007662
# Total Turnaround Time:  9228000.0

In [None]:
# Run RR for CPU = 7, Mem = 11
VMs = [[7, 11] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
print(len(setty))
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0

for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
    print(i)
    returnObj = RR(taskQueue, nextQueue, 0.5, 'executionTime1', 7, 11)
    rejected += returnObj[0]
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    print(totalPower)
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]

if len(rejected) > 0:
    np.save("taskReject_1_iv", rejected)
np.save("VMs_1_iv", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)

### Setting 2

In [25]:
# Run greedy power for CPU = 25 and Mem = 40
VMs = [[25, 40] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0

for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
   
    returnObj = GreedyPower(taskQueue, nextQueue, 0.9, 'executionTime2', 25, 40)
    rejected += returnObj[0]
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    print(totalPower)
    print(totalCost)
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]
if len(rejected) > 0:
    np.save("taskReject_2_i", rejected)
np.save("VMs_2_i", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)

2247.5874999999955
0
399532.850000019
1123.7937499999978
1502563.3874997825
316346.03875002003
2951298.4499997506
1197240.8512497684
4596514.874999123
2355427.559999737
6522249.424997352
3671292.3674998283
8352022.749996844
5211759.354999637
10053457.200001486
6676273.973750834
11688990.775570305
8038286.83875346
12254245.570070898
9348033.338244058
12696643.773068428
9801734.495600289
12729772.940585809
10157315.018457029
12796424.86502135
10183818.352463795
13028893.421611352
10237139.89202085
12631854.341482885
10423114.737290615
12605732.797592962
10105483.473178333
12851324.73687851
10084586.238073265
12614720.021481581
10281059.78950234
12585154.397857038
10091776.017183434
12557440.158987546
10068123.518286116
12527453.074745623
10045952.127184728
12478885.342010492
10021962.459802138
12481592.889913538
9983108.273610063
12393392.921144655
9985274.311924856
12189752.34492891
9914714.33691039
11729251.790828465
9751801.875942988
11462886.69999862
9316054.735164963
11662374.813724

In [None]:
# Total Power:  8643180.575000409
# Total Cost:  6914544.460000166
# Total Turnaround Time:  241750800.0

In [26]:
# Run greedy cost for CPU = 25 and Mem = 40
VMs = [[25, 40] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0

for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
   
    returnObj = GreedyCost(taskQueue, nextQueue, 0.9, 'executionTime2', 25, 40)
    rejected += returnObj[0]
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]

if len(rejected) > 0:
    np.save("taskReject_2_ii", rejected)
np.save("VMs_2_ii", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)

Total Power:  8735041.937501175
Total Cost:  6914823.979997629
Total Turnaround Time:  241750800.0


In [None]:
# Total Power:  8735041.937501175
# Total Cost:  6914823.979997629
# Total Turnaround Time:  241750800.0

In [27]:
# Run greedy cost for CPU = 25 and Mem = 40
VMs = [[25, 40] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0

for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
   
    returnObj = GreedyTurnaroundTime(taskQueue, nextQueue, 0.9, 'executionTime2', 25, 40)
    rejected += returnObj[0]
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]

if len(rejected) > 0:
    np.save("taskReject_2_iii", rejected)
np.save("VMs_2_iii", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)

Total Power:  14009945.789884511
Total Cost:  11207956.631909952
Total Turnaround Time:  241750800.0


In [None]:
# Run greedy cost for CPU = 25 and Mem = 40
VMs = [[25, 40] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0

for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
   
    returnObj = RR(taskQueue, nextQueue, 0.9, 'executionTime2', 25, 40)
    rejected += returnObj[0]
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]

if len(rejected) > 0:
    np.save("taskReject_2_iv", rejected)
np.save("VMs_2_iv", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)

In [None]:
# Total Power:  14009945.789884511
# Total Cost:  11207956.631909952
# Total Turnaround Time:  241750800.0

In [None]:
# RESULTS DISCUSSION
# For setting 1, the amount of power consumed is less when we use the algorithm that optimizes for power and the
# cost is incurred is less when we use the algorithm that optimizes for cost. The turnaround time is the same for all
# three algorithms because execution time is independent of cost and power. The same patterns apply to setting 2.


# Why optimizing cost and energy? What's the difference? 

# Optimizing for cost and energy is similar in the sense that cost in part depends on energy. However, cost also
# depends on time of day. So optimizing for energy purely depends on the VMs available, while optimizing for cost 
# depends on more parameters.


# When does it make a difference to optimize energy and cost? Provide a discussion.

# If the energy consumption is relatively stable over time, but there are high differences in price at certain times,
# then it makes a difference. This is because optimizing for cost would result in different results during times
# when the cost is higher than if just power was optimized for. In a real engineering scenario, it makes sense to 
# optimize depending on the greatest constraint.