In [3]:
import time
import numpy as np
import pandas as pd
import math
from heapdict import heapdict
np.set_printoptions(suppress=True)

In this programming problem and the next you'll code up the greedy algorithms from lecture for minimizing the weighted sum of completion times..

Download the text file below.

jobs.txt
This file describes a set of jobs with positive and integral weights and lengths. It has the format

[number_of_jobs]

[job_1_weight] [job_1_length]

[job_2_weight] [job_2_length]

...

For example, the third line of the file is "74 59", indicating that the second job has weight 74 and length 59.

You should NOT assume that edge weights or lengths are distinct.

Your task in this problem is to run the greedy algorithm that schedules jobs in decreasing order of the difference (weight - length). Recall from lecture that this algorithm is not always optimal. IMPORTANT: if two jobs have equal difference (weight - length), you should schedule the job with higher weight first. Beware: if you break ties in a different way, you are likely to get the wrong answer. You should report the sum of weighted completion times of the resulting schedule --- a positive integer --- in the box below.

ADVICE: If you get the wrong answer, try out some small test cases to debug your algorithm (and post your test cases to the discussion forum).

In [4]:
def scheduling_difference(filename):
    data = pd.read_csv('week9_file/'+filename,skiprows=[0], sep=" ", header=None)
    data.columns = ["weight", "length"]
    data['difference'] = data['weight'] - data['length']
    #sort by the difference and break tie by the weight
    data.sort_values(by=['difference','weight'],ascending=False,inplace=True)
    #completion time formula is equivalent to cummulative sum and dot product with the weight
    completion_time = (data['length'].cumsum()).dot(data['weight'])
    return completion_time

In [5]:
start_time = time.time()
print(scheduling_difference('week9_1.txt'))
print("--- %s seconds ---" % (time.time() - start_time))

69119377652
--- 0.037480831146240234 seconds ---


For this problem, use the same data set as in the previous problem.

Your task now is to run the greedy algorithm that schedules jobs (optimally) in decreasing order of the ratio (weight/length). In this algorithm, it does not matter how you break ties. You should report the sum of weighted completion times of the resulting schedule --- a positive integer --- in the box below.

In [6]:
def scheduling_ratio(filename):
    data = pd.read_csv('week9_file/'+filename,skiprows=[0], sep=" ", header=None)
    data.columns = ["weight", "length"]
    data['ratio'] = data['weight'] / data['length']
    #sort by the ratio
    data.sort_values(by=['ratio'],ascending=False,inplace=True)
    completion_time = (data['length'].cumsum()).dot(data['weight'])
    return completion_time

In [7]:
start_time = time.time()
print(scheduling_ratio('week9_1.txt'))
print("--- %s seconds ---" % (time.time() - start_time))

67311454237
--- 0.01528477668762207 seconds ---


In this programming problem you'll code up Prim's minimum spanning tree algorithm.

Download the text file below.

edges.txt
This file describes an undirected graph with integer edge costs. It has the format

[number_of_nodes] [number_of_edges]

[one_node_of_edge_1] [other_node_of_edge_1] [edge_1_cost]

[one_node_of_edge_2] [other_node_of_edge_2] [edge_2_cost]

...

For example, the third line of the file is "2 3 -8874", indicating that there is an edge connecting vertex #2 and vertex #3 that has cost -8874.

You should NOT assume that edge costs are positive, nor should you assume that they are distinct.

Your task is to run Prim's minimum spanning tree algorithm on this graph. You should report the overall cost of a minimum spanning tree --- an integer, which may or may not be negative --- in the box below.

IMPLEMENTATION NOTES: This graph is small enough that the straightforward O(mn) time implementation of Prim's algorithm should work fine. OPTIONAL: For those of you seeking an additional challenge, try implementing a heap-based version. The simpler approach, which should already give you a healthy speed-up, is to maintain relevant edges in a heap (with keys = edge costs). The superior approach stores the unprocessed vertices in the heap, as described in lecture. Note this requires a heap that supports deletions, and you'll probably need to maintain some kind of mapping between vertices and their positions in the heap.

In [8]:
def read_input(filename):
    first = True
    with open('week9_file/'+filename) as f:
        G = {}
        for line in f:
            line = line.split() # to deal with blank 
            if line and not first:            # lines (ie skip them)
                v = int(line[0])
                w = int(line[1])
                weight = int(line[2])
                if G.get(v) == None:
                    G[v] = [(v,w,weight)]
                else:
                    G[v].append((v,w,weight))
                if G.get(w) == None:
                    G[w] = [(w,v,weight)]
                else:
                    G[w].append((w,v,weight))
            else:
                num_nodes = int(line[0])
                num_edges = int(line[1])
                first = False
                    
    return G,num_nodes,num_edges

In [9]:
def prims_algorithm(filename):
    G,num_nodes,num_edges = read_input(filename)
    X = [1] 
    T = [] #invariant: X = vertoces spanned by tree-so-far T
    W = []
    #increases number of spanned vertices in cheapest way possible
    while len(X) != num_nodes: #while X != V
        vw_min = []
        for vertices in X:
            if G.get(vertices) != None:
                vw_pair_list = [vw_pair for vw_pair in G[vertices] if vw_pair[1] not in X]
                if len(vw_pair_list) == 0:
                    pass
                else:
                    u_min,v_min,weight_vw_min = min(vw_pair_list, key = lambda t: t[2])
                    vw_min.append((u_min,v_min,weight_vw_min))
        
        
        #let e = (u,v) be the cheapest edge of G with u element X, v not element X
        u_min_all,v_min_all,weight_vw_min_all = min(vw_min, key = lambda t: t[2])
        #add e to T
        T.append((u_min_all,v_min_all))
        #add v to X
        X.append(v_min_all)
        W.append(weight_vw_min_all)
    
    return sum(W)

In [10]:
start_time = time.time()
print(prims_algorithm('week9_2_test1.txt')) #-97121
print("--- %s seconds ---" % (time.time() - start_time))
start_time = time.time()
print(prims_algorithm('week9_2_test2.txt')) #-64386
print("--- %s seconds ---" % (time.time() - start_time))
start_time = time.time()
print(prims_algorithm('week9_2_test3.txt')) #-7430
print("--- %s seconds ---" % (time.time() - start_time))
start_time = time.time()
print(prims_algorithm('week9_2_test4.txt')) #-12829
print("--- %s seconds ---" % (time.time() - start_time))

-97121
--- 0.004764080047607422 seconds ---
-64386
--- 0.00504302978515625 seconds ---
-7430
--- 0.0028688907623291016 seconds ---
-12829
--- 0.0015628337860107422 seconds ---


In [11]:
start_time = time.time()
print(prims_algorithm('week9_2.txt')) #-3612829
print("--- %s seconds ---" % (time.time() - start_time))

-3612829
--- 6.299896955490112 seconds ---


In [181]:
def prims_algorithm_heap(filename):
    #Invariants: 1) elements in heap = vertices of V-X. 2) For v element V-X, key[v] = cheapest edge (u,v)
    #with U element X (or infinity if no such edges exist)
    G,num_nodes,num_edges = read_input(filename)
    X = [1]
    T = []
    W = []
    hd = heapdict()
    for i in list(G.keys()):
        if i not in X:
            hd[i] = math.inf
    
    for v,w,weight_vw in G[1]:
        hd[w] = weight_vw
    
    while any(hd): 
        v_min,weight_min = hd.popitem()
        W.append(weight_min) 
        #if w element V - X is the only whose key might have change 
        for (v,w,weight_wv) in G[v_min]:
            if hd.get(w) != None: #if it is in V-X
                #recompute key[w] := min(key[w],Cvw)
                hd[w] = min(hd[w],weight_wv)
                
    return sum(W)

In [182]:
start_time = time.time()
print(prims_algorithm_heap('week9_2_test1.txt')) #-97121
print("--- %s seconds ---" % (time.time() - start_time))
start_time = time.time()
print(prims_algorithm_heap('week9_2_test2.txt')) #-64386
print("--- %s seconds ---" % (time.time() - start_time))
start_time = time.time()
print(prims_algorithm_heap('week9_2_test3.txt')) #-7430
print("--- %s seconds ---" % (time.time() - start_time))
start_time = time.time()
print(prims_algorithm_heap('week9_2_test4.txt')) #-12829
print("--- %s seconds ---" % (time.time() - start_time))

-97121
--- 0.01171731948852539 seconds ---
-64386
--- 0.0029458999633789062 seconds ---
-7430
--- 0.001644134521484375 seconds ---
-12829
--- 0.001481771469116211 seconds ---


In [184]:
start_time = time.time()
print(prims_algorithm_heap('week9_2.txt')) #-3612829
print("--- %s seconds ---" % (time.time() - start_time))

-3612829
--- 0.10876202583312988 seconds ---


In [93]:
G,num_nodes,num_edges = read_input('week9_2.txt')