1. Mounting Google Drive- The output files are saved in ./drive/MyDrive/EDBT23/files directory

In [None]:
# Mount Google Drive at /content/drive; later cells read inputs from and
# write results to paths below ./drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


2. Removing all previous results- Run this when working on a new dataset or completely re-running the existing dataset

In [None]:
# Wipe everything a previous run produced (graphs, weights, results). -f keeps
# the cell from failing when there is nothing to delete.
!rm -rf ./drive/MyDrive/EDBT23/files
# The evaluation step resumes from this checkpoint; remove it as well so a
# fresh run does not silently skip test tasks.
!rm -f ./drive/MyDrive/temp.pkl

3a. Importing the dataset from the mounted Google Drive. The cell below uses GENOME; change GENOME to CYBERSHAKE/MONTAGE as applicable. Not necessary for WEBLOG: for WEBLOG, skip this step and step 4 and continue from step 3b.


In [None]:
# Copy the archive from Drive, unpack it and keep one workflow family as ./dataset.
!cp ./drive/MyDrive/EDBT23/SyntheticWorkflows.tar.gz ./
!tar -xf ./SyntheticWorkflows.tar.gz
!rm ./SyntheticWorkflows.tar.gz
# Drop any ./dataset left by an earlier run: otherwise mv would nest the new
# folder inside the old one and the old files would be parsed again.
!rm -rf ./dataset
!mv ./SyntheticWorkflows/GENOME/ ./dataset/
!rm -rf SyntheticWorkflows

4. Extracting information from the XML files of the dataset imported above. Separating the training and test sets. Creating and saving the list of files for each task, and the list of tasks for each file (the inverse mapping). Saving task runtime and file size.
NOTE: Matrix operations are not used in this work. Not necessary for WEBLOG.

In [None]:
import requests 
import xml.etree.ElementTree as ET 
import os
import joblib as jl
import numpy as np
from scipy import linalg 
from scipy.spatial import distance
from scipy.sparse import lil_matrix


def matrixOperation(train_tasks):
	"""Assign integer IDs to the training tasks and their files and save the mappings.

	Tasks are numbered in the iteration order of ``train_tasks``; files are
	numbered in the order in which they are first seen while walking the tasks.
	Four joblib pickles are written to the current working directory:
	``Id_File_Mapping.pkl``, ``Id_Task_Mapping.pkl``, ``File_Id_Mapping.pkl``
	and ``Task_Id_Mapping.pkl``.

	Args:
		train_tasks: dict mapping a task ID to the iterable of files of that task.

	Returns:
		None.
	"""
	file_id_mapping = {}
	id_file_mapping = {}
	task_id_mapping = {}
	id_task_mapping = {}
	print ("Creating task ID and file ID")
	for task, task_files in train_tasks.items():
		# The next free ID is always the number of entries assigned so far.
		task_index = len(task_id_mapping)
		task_id_mapping[task] = task_index
		id_task_mapping[task_index] = task
		for file_name in task_files:
			if file_name not in file_id_mapping:
				file_index = len(file_id_mapping)
				file_id_mapping[file_name] = file_index
				id_file_mapping[file_index] = file_name

	outputs = (
		("Id_File_Mapping.pkl", id_file_mapping),
		("Id_Task_Mapping.pkl", id_task_mapping),
		("File_Id_Mapping.pkl", file_id_mapping),
		("Task_Id_Mapping.pkl", task_id_mapping),
	)
	for path, mapping in outputs:
		# The with-block closes the file; no explicit close() is needed.
		with open(path, 'wb') as fp:
			jl.dump(mapping, fp)
	print ("Done saving data")
	
			
def test_train_split(data, task_runtime):
	print ("Starting train-test split")
	train_task = {}
	test_task = {}
	count = 0
	data_file = {}
	file_runtime = {}
	marker = 0
	for task1 in data.keys():
		for task2 in data:		
			if len(set(data[task1]).intersection(set(data[task2]))) != 0:
				if count >= (0.2*len(data.values())):
					marker = 1		
					break
				else:
					test_task[task1] = data[task1]
					train_task[task2] = data[task2]
					count += 2				
					break
		if marker == 1:
			break
	for task1 in data.keys():
		if task1 not in test_task.keys(): 
			if task1 not in train_task.keys():
				train_task[task1] = data[task1]
	print ("Finished splitting data")		
	for tasks in train_task.keys():
		file_colls = train_task[tasks]
		runtime = task_runtime[tasks]
		for files in file_colls:
			try:
				temp = data_file[files]
				temp.append(tasks)
				data_file[files] = temp
				temp = file_runtime[files]
				file_runtime[files] = temp + runtime
			except:
				temp = [tasks]
				data_file[files] = temp
				file_runtime[files] = 0
	return train_task, test_task, data_file, file_runtime


def parseXML(xmlfile, data, file_size, task_runtime): 
	"""Read one workflow XML file and record its tasks, input files and runtimes.

	For every child of the root that has ``id`` and a numeric ``runtime``
	attribute, the task ID ``xmlfile + "_" + id`` is added to ``task_runtime``
	and ``data``. Each grandchild whose ``link`` attribute is ``"input"``
	contributes its ``file`` to the task's file list and its ``size`` (stored
	as read, i.e. as a string) to ``file_size``.

	A child that lacks a required attribute or has a non-numeric runtime is
	skipped from the point of failure on; entries written before the failure
	are kept. A file that cannot be opened or parsed is reported and skipped.

	Args:
		xmlfile: path of the XML file; also used as prefix of the task IDs.
		data: dict updated in place, task ID -> list of input file names.
		file_size: dict updated in place, file name -> size attribute.
		task_runtime: dict updated in place, task ID -> runtime as float.

	Returns:
		None.
	"""
	try:
		root = ET.parse(xmlfile).getroot()
	except (ET.ParseError, OSError, ValueError):
		print ("This XML file could not be processed:", xmlfile)
		return

	for child in root:
		try:
			task_id = xmlfile+"_"+child.attrib['id']
			# float() doubles as the check that the runtime is numeric.
			task_runtime[task_id] = float(child.attrib['runtime'])
			data[task_id] = list()
			for grn_child in child:
				if grn_child.attrib['link'] == "input":
					data[task_id].append(grn_child.attrib['file'])
					file_size[grn_child.attrib['file']] = grn_child.attrib['size']
		except (KeyError, ValueError):
			# Missing attribute or non-numeric runtime: move on to the next child.
			continue
def main(): 
	"""Extract the workflow data from ./dataset, split it and save the results.

	Parses every ``*dax`` file of the dataset directory, splits the tasks into
	training and test sets, writes ``TrainingTasks.pkl``, ``TestingTasks.pkl``,
	``TrainingFiles.pkl``, ``TrainingFileRuntime.pkl`` and ``FileSize.pkl`` to
	the current working directory and finally saves the ID mappings of the
	training tasks.
	"""
	data = {}          # task ID -> list of input files
	file_size = {}     # file name -> size attribute
	task_runtime = {}  # task ID -> runtime

	print ("Extracting from XML")
	# Joined exactly as before ("././dataset"): the path is part of every task ID.
	dataset_dir = os.path.join("./", "./dataset")
	# os.listdir order depends on the file system; sorting keeps the order of
	# the tasks, and therefore the train/test split, the same between runs.
	for entry in sorted(os.listdir(dataset_dir)):
		if entry.endswith("dax"):
			parseXML(os.path.join(dataset_dir, entry), data, file_size, task_runtime)

	train_task, test_task, data_file, file_runtime = test_train_split(data, task_runtime)
	print ("Finished XML   extraction")
	print("Number of training tasks: ", len(train_task))
	print("Number of testing tasks: ", len(test_task))

	print("Saving extracted data")
	outputs = (
		("TrainingTasks.pkl", train_task),
		("TestingTasks.pkl", test_task),
		("TrainingFiles.pkl", data_file),
		("TrainingFileRuntime.pkl", file_runtime),
		("FileSize.pkl", file_size),
	)
	for path, payload in outputs:
		# with-block: the file is closed even if dumping fails.
		with open(path, "wb") as new_file:
			jl.dump(payload, new_file)
	print("Finished saving extracted data") 

	# The training tasks are still in memory; no need to read the pickle back.
	matrixOperation(train_task)
      
if __name__ == "__main__": 
  
	# Run the whole extraction step: parse the XML files, split the tasks and save the pickles.
	main() 

Extracting from XML
Starting train-test split
Finished splitting data
Finished XML   extraction
Number of training tasks:  540416
Number of testing tasks:  58991
Saving extracted data


3b. Importing dataset- WEBLOG from mounted google drive.

In [None]:
# Copy the task/file pickles from Drive into the working directory, where the
# graph-creation and evaluation cells load ./TrainingFiles.pkl and ./TestingTasks.pkl.
!cp ./drive/MyDrive/EDBT23/TestingTasks.pkl ./
!cp ./drive/MyDrive/EDBT23/TrainingTasks.pkl ./
!cp ./drive/MyDrive/EDBT23/TrainingFiles.pkl ./

5. Creating directory structure for storing the results of networkx graph and final results 

In [None]:
# Create the output tree. -p creates missing parents and does not fail when a
# folder already exists, so the cell can be re-run safely.
!mkdir -p ./drive/MyDrive/EDBT23/files/Nodes/Graphs/subgraphs
!mkdir -p ./drive/MyDrive/EDBT23/files/Nodes/Weights
!mkdir -p ./drive/MyDrive/EDBT23/files/results

6. From the extracted data above, create the networkx graph and record the weight of the edges of the graph

In [None]:
import networkx as nx
import joblib as jl
#from memory_profiler import profile
import os
import re

def graphCreator(files_list1, filename1):
	"""Build the file graph from shared tasks and save it with its edge weights.

	Two files are connected when they have at least one task in common; the
	edge attribute ``weight`` is the number of common tasks. Every ordered pair
	is examined, including a file with itself, so a file with at least one task
	gets a self-loop and the weight dictionary holds both ``a///...b`` and
	``b///...a``.

	The graph is written to ``.../Nodes/Graphs/Graph.pkl`` and the weight
	dictionary (key ``file_a + "///..." + file_b``) to
	``.../Nodes/Weights/Weight.pkl``.

	Args:
		files_list1: dict mapping a file name to the collection of tasks using it.
		filename1: not used; kept so that existing calls keep working.

	Returns:
		None.
	"""
	graph_file = os.path.join("./drive/MyDrive/EDBT23/files/Nodes/Graphs", "Graph.pkl")
	weight_file = os.path.join("./drive/MyDrive/EDBT23/files/Nodes/Weights", "Weight.pkl")

	# Build every task set once instead of twice per examined pair.
	task_sets = {name: set(tasks) for name, tasks in files_list1.items()}

	def shared_task_counts():
		"""Yield (file_a, file_b, n_common) for each ordered pair sharing a task."""
		for it1, tasks1 in task_sets.items():
			for it2, tasks2 in task_sets.items():
				common = len(tasks1 & tasks2)
				if common != 0:
					yield it1, it2, common

	g = nx.Graph()
	print ("Creating edges of the graph")
	for it1, it2, common in shared_task_counts():
		g.add_edge(it1, it2, weight = common)
	print ("Successfully created a graph")
	print("Saving the created graph")
	with open(graph_file, "wb") as new_file:
		jl.dump(g, new_file)
	# Release the graph before the weight dictionary is built.
	del g

	weight = {}
	for it1, it2, common in shared_task_counts():
		weight[it1+"///..."+it2] = common
	print ("Successfully computed the weights")
	print("Saving the computed weights")
	with open(weight_file, "wb") as new_file:
		jl.dump(weight, new_file)

if __name__=="__main__":

	# Load the file -> tasks mapping produced by the extraction step (or copied
	# from Drive) and build the graph from it.
	filename = "./TrainingFiles.pkl"
	print ("Loading graph")
	with open(filename, "rb") as new_file:
		file_list1 = jl.load(new_file)

	graphCreator(file_list1, filename)
	

Loading graph
Creating edges of the graph
Successfully created a graph
Saving the created graph
Successfully computed the weights
Saving the computed weights


7. Finding and saving all the component subgraphs which have number of nodes greater than 1. 
Also, finding edge-to-node ratio- mean, max, min and SD.
Saving edge-to-node ratio 

In [None]:
import networkx as nx
import joblib as jl
import networkx.algorithms.components as comp
import os
import re, math
import matplotlib.pyplot as plt

def find_subgraphs(filename, G): 
	"""Save every connected component with more than one node and report its edge-to-node ratio.

	Each qualifying component is copied and written to the ``subgraphs`` folder
	as ``<name>_<n>.pkl``, where ``<name>`` is the base name of ``filename``
	without extension and ``n`` counts the saved components from 1. The
	edge-to-node ratio of every saved component is stored, keyed by ``n``, in
	``.../results/ratio.pkl``; mean, population standard deviation, maximum and
	minimum of the ratios are printed.

	Args:
		filename: path of the pickle the graph was loaded from; only its base
			name is used to name the component files.
		G: networkx graph to split into connected components.

	Returns:
		None.
	"""
	directory = "./drive/MyDrive/EDBT23/files/Nodes/Graphs/subgraphs"
	stem, extension = os.path.splitext(os.path.basename(filename))
	rat_list = {}
	for component in nx.connected_components(G):
		if len(component) <= 1:
			continue
		subgraph = G.subgraph(component).copy()
		count = len(rat_list) + 1
		target = os.path.join(directory, stem + "_" + str(count) + extension)
		with open(target, "wb") as new_file:
			jl.dump(subgraph, new_file)
		rat_list[count] = subgraph.number_of_edges()/subgraph.number_of_nodes()
	# One summary line instead of one line per saved component.
	print ("Saved", len(rat_list), "subgraphs")

	print ("Saving the edge-node ratio")
	with open("./drive/MyDrive/EDBT23/files/results/ratio.pkl", "wb") as new_file:
		jl.dump(rat_list, new_file)

	if not rat_list:
		# Without a component of more than one node there is nothing to average.
		print ("No connected component with more than one node; no ratio statistics")
		return

	ratio2 = list(rat_list.values())
	mean2 = sum(ratio2)/len(ratio2)
	sd2 = math.sqrt(sum((value - mean2)**2 for value in ratio2)/len(ratio2))

	print ("Mean of edge-to-node ratio", mean2)
	print("Standard Deviation of edge-node ratio: ", sd2)
	print ("Maximum edge-to-node ratio: ", max(ratio2))
	print ("Minimum edge-to-node ratio: ", min(ratio2))
 
if __name__=="__main__":

	# Process every pickled graph stored directly in the Graphs folder.
	directory = "./drive/MyDrive/EDBT23/files/Nodes/Graphs"
	for filename1 in os.listdir(directory):
		if not filename1.endswith("pkl"):
			continue
		print ("Loading graph of node: ", filename1)
		filename = os.path.join(directory, filename1)
		with open(filename, "rb") as new_file:
			g = jl.load(new_file)

		print(nx.number_connected_components(g))
		find_subgraphs(filename, g)

Loading graph of node:  Graph.pkl
9986
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving the graph
Saving th

8. Find betweenness centrality for all subgraphs having number of nodes greater than 1. 
Saving the betweenness centrality as a dictionary-
Key: betweenness centrality
Value: List of edges which have a betweenness centrality value = key
Sorting the dictionary in descending order (of Key)
Saving the dictionary containing sorted betweenness centrality.

In [None]:
import joblib as jl
import os
from multiprocessing import Pool
import time
import itertools
import networkx as nx
import sys
import re
import collections
from scipy import stats
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
import traceback
import numpy as np


def non_parallel(g):
	"""Compute edge betweenness centrality and group the edges by their value.

	Args:
		g: networkx graph.

	Returns:
		dict mapping a betweenness value to the list of edges having exactly
		that value, each edge written as ``str(u) + "///..." + str(v)``.
	"""
	started = time.time()
	# NOTE(review): the graph built earlier in this notebook stores the number
	# of shared tasks under the edge attribute "weight", not "common" - confirm
	# which attribute (if any) is meant to be used here.
	centrality = nx.edge_betweenness_centrality(g, weight="common")
	elapsed = time.time() - started
	print ("Time to find betweenness centrality of all nodes (non-parallel): ", elapsed)

	grouped = {}
	for (node_a, node_b), value in centrality.items():
		grouped.setdefault(value, []).append(str(node_a) +"///..." +str(node_b))
	return grouped

if __name__=="__main__":

	directory = "./drive/MyDrive/EDBT23/files/Nodes/Graphs/subgraphs"
	for filename1 in os.listdir(directory):
		# Only the component graphs (Graph_<n>.pkl) are inputs; the folder also
		# receives the .tup files written below.
		if not (filename1.startswith("Graph") and filename1.endswith(".pkl")):
			continue
		with open(os.path.join(directory, filename1), "rb") as new_file:
			G = jl.load(new_file)
		tuples = non_parallel(G)

		# Name the outputs after the graph they belong to ("_<n>"), so that every
		# subgraph gets its own files and SortedTuple_<n>.tup can be matched back
		# to Graph_<n>.pkl.
		suffix = os.path.splitext(filename1)[0][len("Graph"):]

		# Betweenness value -> edges, in the order returned by non_parallel.
		with open(os.path.join(directory, "Tuple"+suffix+".tup"), "wb") as new_file:
			jl.dump(tuples, new_file)

		# Same content ordered by descending betweenness value.
		sorted_tuples = dict(sorted(tuples.items(), reverse = True))
		with open(os.path.join(directory, "SortedTuple"+suffix+".tup"), "wb") as new_file:
			jl.dump(sorted_tuples, new_file)
				


Loading and sorting the edges by weight
Loading the node-edge ratio
Time to find betweenness centrality of all nodes (non-parallel):  73.57196712493896
Time to find betweenness centrality of all nodes (non-parallel):  9.065217018127441
Time to find betweenness centrality of all nodes (non-parallel):  9.115142345428467
Time to find betweenness centrality of all nodes (non-parallel):  6.95945143699646
Time to find betweenness centrality of all nodes (non-parallel):  0.00018024444580078125
Time to find betweenness centrality of all nodes (non-parallel):  0.22660207748413086
Time to find betweenness centrality of all nodes (non-parallel):  0.07778406143188477
Time to find betweenness centrality of all nodes (non-parallel):  0.08601593971252441
Time to find betweenness centrality of all nodes (non-parallel):  0.07709789276123047
Time to find betweenness centrality of all nodes (non-parallel):  0.08193564414978027
Time to find betweenness centrality of all nodes (non-parallel):  0.0805428028

9. Compute threshold according to equation 1 in the paper- considering the entire graph
Delete edges from the graph
Find the subgraph 
For each test tasks
  For each subgraph
    Find TP, FP, FN and compute F-score

In [None]:
from matplotlib.units import ma
import joblib as jl
import os, re
import networkx as nx
from scipy.stats import pearsonr

# Test tasks (task ID -> files) and edge weights ("a///...b" -> shared tasks).
with open("./TestingTasks.pkl", 'rb') as fh:
  test_set = jl.load(fh)

with open("./drive/MyDrive/EDBT23/files/Nodes/Weights/Weight.pkl", 'rb') as fh:
  weight = jl.load(fh)

# Collect, for every subgraph, the edges with its highest betweenness value.
subgraph_dir = "./drive/MyDrive/EDBT23/files/Nodes/Graphs/subgraphs"
betweenness_coll = {}
for filename in os.listdir(subgraph_dir):
  if filename[0:6] != "Sorted":
    continue
  # SortedTuple_<n>.tup belongs to the subgraph stored as Graph_<n>.pkl.
  filename1 = os.path.splitext(filename)[0].replace("SortedTuple", "Graph", 1) + ".pkl"
  if not os.path.isfile(os.path.join(subgraph_dir, filename1)):
    # Report and skip betweenness files whose subgraph is missing.
    print(filename1)
    continue
  with open(os.path.join(subgraph_dir, filename), 'rb') as fh:
    betweenness = jl.load(fh)
  if not betweenness:
    continue
  # The file is stored in descending order, so the first key is the maximum.
  top_value = next(iter(betweenness))
  # Several subgraphs can share the same top value: add to the list rather
  # than replacing the edges collected from an earlier subgraph.
  betweenness_coll.setdefault(top_value, []).extend(betweenness[top_value])
  del betweenness

# Flatten the collected edges, highest betweenness value first.
sorted_betweeness_coll = dict(sorted(betweenness_coll.items(), reverse = True))
selected_edges = [edge for edge_group in sorted_betweeness_coll.values() for edge in edge_group]

# Threshold = mean weight of the selected edges (sum divided by the number of
# edges); 0 when no edge was selected.
sum_of_wts = sum(weight[edge] for edge in selected_edges)
thr = sum_of_wts/len(selected_edges) if selected_edges else 0

# Reload the full graph and drop the selected edges lighter than the threshold.
with open("./drive/MyDrive/EDBT23/files/Nodes/Graphs/Graph.pkl", "rb") as new_file:
  G = jl.load(new_file)

for edges in selected_edges:
  if weight[edges] < thr:
    edge = edges.split("///...")
    G.remove_edge(edge[0], edge[1])

all_nodes = G.nodes()

# Number the components with more than one node from 1. Only the node sets are
# needed afterwards, so the components are not copied into separate graphs.
node_coll = {}
count = 1
for component in nx.connected_components(G):
  if len(component) > 1:
    node_coll[count] = component
    count +=1
# Per-task metrics are checkpointed after every task so that an interrupted run
# can resume. NOTE(review): the checkpoint lives in ./drive/MyDrive/temp.pkl,
# outside the EDBT23/files folder; delete it before starting a fresh run.
metric_keys = ('tp_f', 'tn_f', 'fp_f', 'fn_f', 'f1_f', 'maxx_f', 'maxx_f1_f')
file_wrapper = {}
task_name = ''
try:
  with open("./drive/MyDrive/temp.pkl", 'rb') as checkpoint_file:
    loaded = jl.load(checkpoint_file)
  if all(key in loaded for key in metric_keys) and loaded.get('name') in test_set:
    file_wrapper = loaded
    task_name = loaded['name']
  else:
    # Resuming from a task that is not in this test set would skip every task.
    print("Checkpoint does not match the current test set; starting from scratch")
except FileNotFoundError:
  pass
except Exception as err:
  print("Ignoring unreadable checkpoint:", err)

for key in metric_keys:
  file_wrapper.setdefault(key, {})
tp_f = file_wrapper['tp_f']
tn_f = file_wrapper['tn_f']
fp_f = file_wrapper['fp_f']
fn_f = file_wrapper['fn_f']
f1_f = file_wrapper['f1_f']
maxx_f = file_wrapper['maxx_f']
maxx_f1_f = file_wrapper['maxx_f1_f']

# Built once: these do not change from task to task.
all_node_set = set(all_nodes)
n_all = len(all_node_set)
component_sets = [set(nodes) for nodes in node_coll.values()]

flag = 0
for tasks in test_set:
  # Skip forward to the task stored in the checkpoint (it is evaluated again).
  if task_name == tasks or task_name == '':
    flag = 1
  if flag == 0:
    continue
  task_name = tasks
  test_file_set = set(test_set[tasks])
  if not test_file_set:
    # A task without files has no defined TP fraction; leave it out.
    continue
  tp = {}
  tn = {}
  fp = {}
  fn = {}
  f1 = {}
  max_f1 = 0   # number of the component with the best F1 score
  maxx = -1    # best F1 score seen so far
  for count, files in enumerate(component_sets, start=1):
    # TP is a fraction of the task's files, FP/FN are fractions of all nodes,
    # TN is an absolute count.
    tp[count] = len(test_file_set & files)/len(test_file_set)
    tn[count] = len(all_node_set - test_file_set - files)
    fp[count] = len(files - test_file_set)/n_all
    fn[count] = len(test_file_set - files)/n_all
    f1[count] = (2*tp[count])/((2*tp[count])+fp[count]+fn[count])
    if f1[count] > maxx:
      max_f1 = count
      maxx = f1[count]
  tp_f[tasks] = tp
  tn_f[tasks] = tn
  fp_f[tasks] = fp
  fn_f[tasks] = fn
  f1_f[tasks] = f1
  maxx_f[tasks] = maxx
  maxx_f1_f[tasks] = max_f1
  file_wrapper['name'] = tasks
  with open("./drive/MyDrive/temp.pkl", 'wb') as checkpoint_file:
    jl.dump(file_wrapper, checkpoint_file)

with open("./drive/MyDrive/final.pkl", 'wb') as final_file:
  jl.dump(file_wrapper, final_file)

10. Find the results


In [None]:
import matplotlib.pyplot as plt
import math
from scipy import stats

import joblib as jl  # used below; not among the import lines at the top of this cell

# Load the per-task metrics written by the evaluation step.
with open("./drive/MyDrive/final.pkl", 'rb') as results_file:
  file_wrapper = jl.load(results_file)
tp_f = file_wrapper['tp_f']
tn_f = file_wrapper['tn_f']
fp_f = file_wrapper['fp_f']
fn_f = file_wrapper['fn_f']
f1_f = file_wrapper['f1_f']
maxx_f = file_wrapper['maxx_f']
maxx_f1_f = file_wrapper['maxx_f1_f']
if not f1_f:
  raise ValueError("final.pkl contains no evaluated test tasks")

# Count how often each component has the best F1 score, most frequent first.
comp_counter = {}
for component in maxx_f1_f.values():
  comp_counter[component] = comp_counter.get(component, 0) + 1
sorted_comp_counter = {k: v for k, v in sorted(comp_counter.items(), reverse=True, key=lambda item: item[1])}
best_component = next(iter(sorted_comp_counter))

print ("Loading the node-edge ratio")
with open("./drive/MyDrive/EDBT23/files/results/ratio.pkl", "rb") as new_file:
  ratio = jl.load(new_file)


def _summarise(values):
  """Return (mean, population standard deviation, maximum, minimum) of a non-empty list."""
  mean = sum(values)/len(values)
  sd = math.sqrt(sum((mean - value)**2 for value in values)/len(values))
  return mean, sd, max(values), min(values)


def _significant_tau(metric, ratio_by_component, alpha=0.05):
  """Return Kendall's tau between ratio and metric values, or None if p >= alpha.

  The two value lists are paired by component number; only numbers present in
  both dictionaries are used.
  NOTE(review): ratio.pkl is keyed by the component numbering of the graph
  before edge removal, the metrics by the numbering after edge removal -
  confirm that the two numberings are meant to be paired.
  """
  shared = [component for component in metric if component in ratio_by_component]
  if len(shared) < 2:
    return None
  tau, p_value = stats.kendalltau([ratio_by_component[component] for component in shared],
                                  [metric[component] for component in shared])
  return tau if p_value < alpha else None


# (label in the score line, label in the tau line, per-task metric table)
metric_tables = (
  ("F1", "F score", f1_f),
  ("TP", "TP", tp_f),
  ("FP", "FP", fp_f),
  ("FN", "FN", fn_f),
)

# all_nodes is defined by the evaluation cell, which has to be run first.
print ("Number of files in all subgraph: ", len(all_nodes))

# Scores of the most frequently best component, summarised over all test tasks.
for label, _, table in metric_tables:
  mean, sd, highest, lowest = _summarise([table[tasks][best_component] for tasks in f1_f])
  print ("Mean " + label + " score: ", mean, "    Standard deviation: ", sd, "Maximum " + label + " score: ", highest, "Minimum " + label + " score: ", lowest)

# Mean of the significant (p < 0.05) per-task correlations; nan if there is none.
for _, tau_label, table in metric_tables:
  taus = [tau for tau in (_significant_tau(table[tasks], ratio) for tasks in f1_f) if tau is not None]
  mean_tau = sum(taus)/len(taus) if taus else float('nan')
  print ("Mean Tau correlation between " + tau_label + " and edge-to-node ratio: ", mean_tau)


In [None]:
# Remove the unsorted betweenness dumps from the subgraphs folder; the
# evaluation cell only reads the SortedTuple_*.tup files.
!rm -f ./drive/MyDrive/EDBT23/files/Nodes/Graphs/subgraphs/Tuple_*.tup

rm: cannot remove './drive/MyDrive/files/Graph/subgraphs/Tuple_*.tup': No such file or directory


In [None]:
#TODO1
#Open files: Weight, Sorted Betweenness
#Find thresholds- algorithmic, percentage
#From the weight and full graph file, delete the edges whose weight < threshold
#Find the disconnected components

#TODO2
#For every test set task
#For every subconnected components
#Find and record F1 score

TODO: Result of the final experiment

In [None]:
#TODO1
