# Summary of Results and Algorithm:


## Read input

In [1]:
# Imports
import pandas as pd
import numpy as np
# Read data into dataframe
df = pd.read_csv('./google-cluster-data-1.csv',sep=' ')

In [2]:
# Print the length of the df
print("length of df: ", len(df))

length of df:  3535029


Get CPU and MEM information

In [3]:
# Show amount of CPU for each task
df['NrmlTaskCores'].values

array([0.      , 0.      , 0.021875, ..., 0.      , 0.      , 0.      ])

In [4]:
# Save CPU amount, memory requirements, and task IDs
cpus = df['NrmlTaskCores'].values
mems = df['NrmlTaskMem'].values
task_ids = df['TaskID'].values

In [5]:
# Calculate and save execution times for two cases: mod 2 and mod 10
temp1 = df['TaskID'].values
ExecutionTime1 = []
for i in temp1:
    burstTime = (i%2+1)*300
    ExecutionTime1.append(burstTime)
    
temp2 = df['TaskID'].values
ExecutionTime2 = []
for i in temp2:
    burstTime = (i%10+1)*300
    ExecutionTime2.append(burstTime)

In [6]:
# Calculate and print total # of subtasks for two cases: mod 2 and mod 10
totalSubtask1 = 0
for i in ExecutionTime1:
    totalSubtask1 += i/300
print("Total Subtasks for setting 1:", int(totalSubtask1))

totalSubtask2 = 0
for i in ExecutionTime2:
    totalSubtask2 += i/300
    
print("Total Subtasks for setting 2:", int(totalSubtask2))

Total Subtasks for setting 1: 5298274
Total Subtasks for setting 2: 19472018


In [7]:
# Show dataframe
df.head()

Unnamed: 0,Time,ParentID,TaskID,JobType,NrmlTaskCores,NrmlTaskMem,Unnamed: 6
0,90000,757745334,1488529826,0,0.0,0.03113,
1,90000,975992247,1488529821,0,0.0,0.0,
2,90000,1468458091,1488529832,1,0.021875,0.002353,
3,90000,1460281235,1488529840,0,0.0,0.0,
4,90000,1164728954,1488529835,0,0.003125,0.001638,


In [8]:
# Save calculated execution times to columns in the dataframe
df['executionTime1'] = ExecutionTime1
df['executionTime2'] = ExecutionTime2

### Setting 1

In [9]:
# Calculates energy consumption for one vm
def getEnergyConsumption(VMCPUUsage, totalCPUCount, threshold):
    
    #here we want to calculate the cpu usage rate per individual VM so i look at the individual
    #cpu usage rate and divide it over 7/100 or 25/100
    cpuUsageRate = (7- VMCPUUsage)/(totalCPUCount)
    
    static = 0
    if cpuUsageRate > 0:
        static = 5 
    else:
        static = 0
        
    dynamic = 0
    if cpuUsageRate < threshold:
        dynamic = cpuUsageRate*100
    else:
        dynamic = threshold * 100 + (((cpuUsageRate - threshold)**2) * 200)
    #print(dynamic, static)
    return static + dynamic

In [10]:
# Calculates cost for one vm
def getCost(time, power):
    costChart = [0.5, 0.5, 0.6, 0.6, 0.6, 0.7, 0.7, 0.6, 0.6, 0.8, 0.8, 0.8, 0.8]
    mapping = [90000,91875,93750,95625,97500,99375,101250,103125,105000,106875,108750,110625]
    cost = 0
    for i in range(12):
        if time == mapping[i]:
            cost = costChart[i]
            break
        elif time < mapping[i]:
            cost = costChart[i-1]
        elif time > mapping[11]:
            cost = costChart[11]
    return cost*power
    

In [11]:
# Schedule all tasks in a way that minimizes the power consumption
def GreedyPower(input_data, nextQueue, threshhold, executionTime, cpuTotal, memTotal):
    length = len(input_data)
    nextQueue = []
    rejected =[]
    VMs = [[cpuTotal, memTotal] for i in range(100)]
    energyUsage = 0
    totalPower = 0
    totalCost = 0
    totalTurnaroundTime = 0
    
    for index, row in input_data.iterrows():
        #print(row)
        cpu, mem = row['NrmlTaskCores'], row['NrmlTaskMem']
        task_id = row['TaskID']
        found_VM = False
        lowestPower = 234567898765
        lowestPowerIndex = -1
        VM_id = 0
        
        #iterate through all of the vms
        for j in range(100):
            VM_id = j
            #check whether this is even schedulable
            if VMs[VM_id][0] >= cpu and VMs[VM_id][1] >= mem:
                energy = getEnergyConsumption(VMs[VM_id][0], cpuTotal, threshhold)
                #print("energy", energy, " lowest ", lowestPower)
                if energy < lowestPower:
                    lowestPower = energy
                    lowestPowerIndex = VM_id
                    
        #is schedulable thank the lord
        if lowestPowerIndex != -1:
            cpu = row['NrmlTaskCores']
            mem = row['NrmlTaskMem']
            
            VMs[lowestPowerIndex][0] = VMs[lowestPowerIndex][0] - cpu
            VMs[lowestPowerIndex][1] = VMs[lowestPowerIndex][1] - mem
            
            # Because we scheduled the task, update power, cost, and time:
            totalPower += lowestPower
            totalCost += getCost(row['Time'], lowestPower)
            totalTurnaroundTime += row[executionTime]
        else:
            rejected.append(row['TaskID'])
                
        if row[executionTime] > 300:
            row[executionTime] -= 300
            nextQueue.append(row)
            
    return rejected, nextQueue, totalPower, totalCost, totalTurnaroundTime

In [12]:
def GreedyCost(input_data, nextQueue, threshhold, executionTime,cpuTotal, memTotal):
    length = len(input_data)
    nextQueue = []
    VMs = [[cpuTotal, memTotal] for i in range(100)]
    cost = 0
    rejected =[]
    totalPower = 0
    totalCost = 0
    totalTurnaroundTime = 0
    
    for index, row in input_data.iterrows():
        cpu, mem = row['NrmlTaskCores'], row['NrmlTaskMem']
        task_id = row['TaskID']
        found_VM = False
        lowestCost = float('inf')
        lowestCostIndex = -1
        VM_id = 0
        
        #iterate through all of the vms
        for j in range(100):
            VM_id = j
            #check whether this is even schedulable
            if VMs[VM_id][0] >= cpu and VMs[VM_id][1] >= mem:
                energy = getEnergyConsumption(VMs[VM_id][0], cpuTotal, threshhold)
                cost = getCost(row['Time'], energy) # time, power arguments go here: FILL IN 
                if cost < lowestCost:
                    #print("ew")
                    lowestCost = cost
                    lowestCostIndex = VM_id
                    
        #is schedulable thank the lord
        if lowestCostIndex != -1:
            cpu = row['NrmlTaskCores']
            mem = row['NrmlTaskMem']
            
            VMs[lowestCostIndex][0] = VMs[lowestCostIndex][0] - cpu
            VMs[lowestCostIndex][1] = VMs[lowestCostIndex][1] - mem
            
            # Because we scheduled the task, update power, cost, and time:
            totalPower += getEnergyConsumption(VMs[VM_id][0], cpuTotal, threshhold)
            totalCost += lowestCost
            totalTurnaroundTime += row[executionTime]
        else:
            rejected.append(row['TaskID'])
                
        if row[executionTime] > 300:
            row[executionTime] -= 300
            nextQueue.append(row)
            
    return rejected, nextQueue, totalPower, totalCost, totalTurnaroundTime

In [13]:
# Schedule all tasks in a way that minimizes the turnover time
def GreedyTurnaroundTime(input_data, nextQueue, threshhold, executionTime,cpuTotal, memTotal):
    length = len(input_data)
    nextQueue = []
    VMs = [[cpuTotal, memTotal] for i in range(100)]
    energyUsage = 0
    rejected=[]
    totalPower = 0
    totalCost = 0
    totalTurnaroundTime = 0
    
    for index, row in input_data.iterrows():
        cpu, mem = row['NrmlTaskCores'], row['NrmlTaskMem']
        task_id = row['TaskID']
        found_VM = False
        index = -1
        VM_id = 0
        
        #iterate through all of the vms
        for j in range(100):
            VM_id = j
            #check whether this is even schedulable
            if VMs[VM_id][0] >= cpu and VMs[VM_id][1] >= mem:
                #energy = getEnergyConsumption(VMs[VM_id][0], cpuTotal, threshhold)
                index = VM_id
                    
        #is schedulable thank the lord
        if index != -1:
            cpu = row['NrmlTaskCores']
            mem = row['NrmlTaskMem']
            
            VMs[index][0] = VMs[index][0] - cpu
            VMs[index][1] = VMs[index][1] - mem
            
            # Because we scheduled the task, update power, cost, and time:
            power = getEnergyConsumption(VMs[VM_id][0], cpuTotal, threshhold)
            totalPower += power
            totalCost += getCost(row['Time'], power)
            totalTurnaroundTime += row[executionTime]
        else:
            rejected.append(row['TaskID'])
                
        if row[executionTime] > 300:
            row[executionTime] -= 300
            nextQueue.append(row)
            
    return rejected, nextQueue, totalPower, totalCost, totalTurnaroundTime

In [14]:
# Run greedy power for CPU = 7, Mem = 11
VMs = [[7, 11] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
print(len(setty))
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0

for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
    print(i)
    print(len(taskQueue))
    returnObj = GreedyPower(taskQueue, nextQueue, 0.5, 'executionTime1', 7, 11)
    print(len(returnObj[0]))
    rejected += returnObj[0]
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]

if len(rejected) > 0:
    np.save("taskReject_1_i", rejected)
np.save("VMs_1_i", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)

76
90000
596
0
90300
48759
0
90600
76850
0
90900
72525
0
91200
70932
0
91500
73567
0
91800
73433
0
92100
72660
0
92400
74503
145
92700
72426
0
93000
73826
0
93300
74475
0
93600
75295
0
93900
75543
1
94200
69300
0
94500
70197
0
94800
73575
0
95100
72675
0
95400
73787
0
95700
74433
0
96000
73512
0
96300
71394
0
96600
70162
0
96900
69935
0
97200
69627
0
97500
65843
0
97800
67284
0
98100
72416
0
98400
70015
0
98700
71627
0
99000
77332
0
99300
72144
0
99600
70064
0
99900
72532
0
100200
72338
0
100500
71789
0
100800
71844
0
101100
71833
0
101400
71721
0
101700
74389
275
102000
71496
2
102300
72208
0
102600
76357
76
102900
70149
0
103200
66853
0
103500
71279
0
103800
71372
0
104100
72883
0
104400
72431
0
104700
73179
0
105000
73440
0
105300
73308
0
105600
69745
0
105900
68469
0
106200
68329
0
106500
69076
0
106800
66184
0
107100
69092
0
107400
66227
0
107700
68862
0
108000
71140
0
108300
68579
0
108600
70502
0
108900
70084
0
109200
70871
0
109500
72573
0
109800
70833
0
110100
69501
0
110400
7

In [15]:
print(len(rejected))

499


In [16]:
# Run greedy cost for CPU = 7, Mem = 11
VMs = [[7, 11] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
print(len(setty))
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0
counter = 0

for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
    print(i)
    
    
    returnObj = GreedyCost(taskQueue, nextQueue, 0.5, 'executionTime1', 7, 11)
    rejected += returnObj[0]
    print(len(returnObj[0]))
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]

if len(rejected) > 0:
    np.save("taskReject_1_ii", rejected)
np.save("VMs_1_ii", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)

76
90000
0
90300
0
90600
0
90900
0
91200
0
91500
0
91800
0
92100
0
92400
102
92700
0
Total Power:  2458493.2638313756
Total Cost:  1943809.778762856
Total Turnaround Time:  28787400.0


In [17]:
# Run greedy turnaround time for CPU = 7, Mem = 11
VMs = [[7, 11] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
print(len(setty))
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0

for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
    print(i)
    print(len(taskQueue))
    returnObj = GreedyTurnaroundTime(taskQueue, nextQueue, 0.5, 'executionTime1', 7, 11)
    rejected += returnObj[0]
    print(len(returnObj[0]))
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]

if len(rejected) > 0:
    np.save("taskReject_1_iii", rejected)
np.save("VMs_1_iii", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)

76
90000
596
0
90300
48759
0
90600
76850
3673
90900
72525
3338
91200
70932
3955
91500
73567
5508
91800
73433
6147
92100
72660
5922
92400
74503
7948
92700
72426
5287
93000
73826
4556
93300
74475
5185
93600
75295
5147
93900
75543
5775
94200
69300
4341
94500
70197
3797
94800
73575
6523
95100
72675
4663
95400
73787
4073
95700
74433
4264
96000
73512
4392
96300
71394
4396
96600
70162
4540
96900
69935
4415
97200
69627
3506
97500
65843
2767
97800
67284
2605
98100
72416
3418
98400
70015
3573
98700
71627
2941
99000
77332
5562
99300
72144
4058
99600
70064
3150
99900
72532
4577
100200
72338
5290
100500
71789
5262
100800
71844
5447
101100
71833
4918
101400
71721
4810
101700
74389
6276
102000
71496
4919
102300
72208
4938
102600
76357
6596
102900
70149
3826
103200
66853
2392
103500
71279
2406
103800
71372
2980
104100
72883
3681
104400
72431
4797
104700
73179
4108
105000
73440
4303
105300
73308
4451
105600
69745
2921
105900
68469
2625
106200
68329
3159
106500
69076
4246
106800
66184
2735
107100
69092


### Setting 2

In [18]:
# Run greedy power for CPU = 25 and Mem = 40
VMs = [[25, 40] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0

for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
   
    returnObj = GreedyPower(taskQueue, nextQueue, 0.9, 'executionTime2', 25, 40)
    rejected += returnObj[0]
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]

if len(rejected) > 0:
    np.save("taskReject_2_i", rejected)
np.save("VMs_2_i", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)

Total Power:  -7988742.8499986
Total Cost:  -6390994.279997127
Total Turnaround Time:  241750800.0


In [None]:
# Run greedy cost for CPU = 25 and Mem = 40
VMs = [[25, 40] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0

for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
   
    returnObj = GreedyCost(taskQueue, nextQueue, 0.9, 'executionTime2', 25, 40)
    rejected += returnObj[0]
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]

if len(rejected) > 0:
    np.save("taskReject_2_ii", rejected)
np.save("VMs_2_ii", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)

In [None]:
# Run greedy cost for CPU = 25 and Mem = 40
VMs = [[25, 40] for i in range(100)]
setty = set(df['Time'].values)
setty = sorted(setty)
rejected = []
nextQueue = []
totalPower = 0
totalCost = 0
totalTurnaroundTime = 0

for i in setty:
    taskQueue = df.loc[df['Time'] == i]
    taskQueue = taskQueue.append(nextQueue)
   
    returnObj = GreedyTurnaroundTime(taskQueue, nextQueue, 0.9, 'executionTime2', 25, 40)
    rejected += returnObj[0]
    nextQueue = returnObj[1]
    totalPower = returnObj[2]
    totalCost = returnObj[3]
    totalTurnaroundTime = returnObj[4]

if len(rejected) > 0:
    np.save("taskReject_2_iii", rejected)
np.save("VMs_2_iii", VMs)
print("Total Power: ", totalPower)
print("Total Cost: ", totalCost)
print("Total Turnaround Time: ", totalTurnaroundTime)