# Data Extractor

This Jupyter notebook extracts data from .stat files, compiles the data into usable statistics, and exports the results to a series of CSV files. Runtime and flowtime cost data are extracted and compiled to provide the average runtime, success rate, flowtime ratio, minimum solved problem size, and maximum solved problem size. 

All data is stored in the "Data" directory. This directory is automatically created if it does not already exist. 

## Dependencies and User-Defined Variables

In [None]:
import os
import re
import csv

# Folder, relative to the current working directory, that is scanned for results
extract_dir = "cbs_analysis"

# Resolutions to process
resolutions = [1, 2, 4]
# (first, last) scenario numbers to process; both ends are inclusive
experiment_range = (1, 20)

# Maps to process, one family per line
map_names = [
    "empty-8-8", "empty-16-16", "empty-32-32", "empty-48-48",
    "random-32-32-10", "random-32-32-20", "random-64-64-10", "random-64-64-20",
    "room-32-32-4", "room-64-64-8", "room-64-64-16",
    "maze-32-32-2", "maze-32-32-4", "maze-128-128-10", "maze-128-128-2",
    "Berlin_1_256", "Boston_0_256", "Paris_1_256",
    "ht_chantry", "ht_mansion_n", "lak303d", "lt_gallowstemplar_n", "den312d", "ost003d",
    "brc202d", "den520d", "w_woundedcoast",
]

## Initialize Data Structures and Directory


In [None]:
# Locate the extraction folder and expand the inclusive scenario range
extract_path = os.path.join(os.getcwd(), extract_dir)

scenario_range = list(range(experiment_range[0], experiment_range[1] + 1))
# Listing the folder raises immediately if the extraction folder is missing
items = os.listdir(extract_path)

# file_tracker[map name][resolution] -> names of the files found for that pair
file_tracker = {name: dict() for name in map_names}

for map_name in map_names:
    for res in resolutions:
        map_dir = os.path.join(extract_path, map_name, "res" + str(res))
        # NOTE: os.walk also descends into sub-folders, but only the bare file
        # name is kept (the folder it was found in is dropped).
        file_tracker[map_name][res] = [
            file
            for _, _, files in os.walk(map_dir)
            for file in files
            if "stat" in file
        ]
        
# Compiled once so the sort key does not look the patterns up for every file
_SCEN_PATTERN = re.compile(r'scen-(\d+)')
_QUERY_PATTERN = re.compile(r'query-(\d+)')


def custom_sort(file_name):
    """Return the sort key for a result file name.

    Files are ordered by the number following ``scen-``, then by the number
    following ``query-``, and finally by the full name as a tie-breaker.

    A name that lacks one of the two markers no longer raises; the missing
    number is treated as infinity so such files sort after all well-formed
    names.

    Args:
        file_name: Name of the file (not a full path).

    Returns:
        A tuple ``((scen_number, query_number), file_name)``.
    """
    scen_match = _SCEN_PATTERN.search(file_name)
    query_match = _QUERY_PATTERN.search(file_name)

    # float("inf") compares greater than every int, pushing unmatched names last
    scen_number = int(scen_match.group(1)) if scen_match else float("inf")
    query_number = int(query_match.group(1)) if query_match else float("inf")
    return (scen_number, query_number), file_name
        
# Put every (map, resolution) file list into scenario / query order
for map_name in map_names:
    files_by_res = file_tracker[map_name]
    for res in resolutions:
        files_by_res[res] = sorted(files_by_res[res], key=custom_sort)

In [None]:
# Raw flowtime / runtime values, addressed as all_data[algorithm][resolution][map]
all_data = {
    algorithm: {
        resolution: {map_name: dict() for map_name in map_names}
        for resolution in resolutions
    }
    for algorithm in ["CBS", "PBS"]
}

# Processed statistics (success rate, average time, total time, solved
# scenarios, min agents solved, max agents solved), addressed as
# stats[option][resolution][map]
stats = {
    option: {
        resolution: {map_name: dict() for map_name in map_names}
        for resolution in resolutions
    }
    for option in ["CBS", "PBS", "CostRatio"]
}

## Populate Maps 

In [None]:
def ParseFile(file_path, algorithm, time_limit=60.0):
    """Read the runtime and cost reported for one algorithm from a file.

    The file is scanned line by line. A line whose first whitespace-separated
    token is ``<algorithm>Query::<algorithm>Query`` supplies the runtime, and
    one whose first token is ``<algorithm>Query::<algorithm>Query::Cost``
    supplies the cost; in both cases the value is the second token.

    Args:
        file_path: Path of the file to read.
        algorithm: Algorithm prefix, e.g. ``"CBS"`` or ``"PBS"``.
        time_limit: Largest runtime that is still accepted; a larger runtime
            makes the result count as missing.

    Returns:
        ``(time, cost)`` as floats, or ``(None, None)`` when either value is
        absent from the file or the runtime exceeds ``time_limit``.
    """
    query_name = algorithm + "Query"
    time_key = query_name + "::" + query_name
    cost_key = time_key + "::Cost"

    time = None
    cost = None

    with open(file_path, "r") as f:
        for line in f:
            fields = line.split()
            if len(fields) < 2:
                continue
            if fields[0] == time_key:
                time = float(fields[1])
            elif fields[0] == cost_key:
                cost = float(fields[1])
            # Track the two values themselves rather than a hit counter, so a
            # repeated line can never be mistaken for "both values found".
            if time is not None and cost is not None:
                break

    if time is None or cost is None:
        return None, None

    if time > time_limit:
        return None, None

    return time, cost

In [None]:
# populate raw data statistics

for name in map_names:
    for res in file_tracker[name]:
        for file in file_tracker[name][res]:
            # The part of the name before the first "." must consist of exactly
            # eight "-"-separated fields; the scenario number, agent count and
            # algorithm sit at positions 1, 3 and 6. Any other shape is skipped
            # (more than eight fields used to crash the unpacking below).
            fields = file.split(".")[0].split("-")
            if len(fields) != 8:
                continue
            _, scen_num, _, num_agents, _, _, algorithm, _ = fields

            try:
                scen_num = int(scen_num)
                num_agents = int(num_agents)
            except ValueError:
                # Not a result file in the expected naming scheme
                continue

            if scen_num not in scenario_range:
                continue
            if algorithm not in all_data:
                # Only the algorithms registered in all_data are collected
                continue

            file_path = os.path.join(extract_path, name, "res" + str(res), file)
            time, cost = ParseFile(file_path, algorithm)

            per_agents = all_data[algorithm][res][name]
            if num_agents not in per_agents:
                per_agents[num_agents] = {
                    # Indexed by scen_num - 1 below, so the list must reach the
                    # last scenario number even when the range does not start
                    # at 1.
                    "flowtime": [None] * experiment_range[1],
                    "time": list(),
                    "flowtime_export": dict(),
                    "time_export": dict(),
                }

            # The entry above is created even for failed runs so that the agent
            # count is still registered for this map.
            if time is None or cost is None:
                continue

            entry = per_agents[num_agents]
            entry["flowtime"][scen_num - 1] = cost
            entry["time"].append(time)
            entry["flowtime_export"][str(scen_num)] = cost
            entry["time_export"][str(scen_num)] = time

In [None]:
# calculate processed statistics: Cost ratio (PBS flowtime / CBS flowtime)
for res in resolutions:
    for name in map_names:
        cbs_data = all_data["CBS"][res][name]
        pbs_data = all_data["PBS"][res][name]

        # A ratio needs both algorithms, so only agent counts recorded for both
        # are used. (Taking the shorter of the two key lists raised KeyError
        # whenever it held an agent count the other algorithm lacked.)
        all_agents = sorted(set(cbs_data) & set(pbs_data))

        for num_agents in all_agents:
            ratios = stats["CostRatio"][res][name][num_agents] = dict()

            cbs_flowtimes = cbs_data[num_agents]["flowtime"]
            pbs_flowtimes = pbs_data[num_agents]["flowtime"]

            for scen, (cbs_cost, pbs_cost) in enumerate(zip(cbs_flowtimes, pbs_flowtimes)):
                if cbs_cost is None or pbs_cost is None:
                    continue
                if cbs_cost == 0:
                    # The ratio is undefined for a zero CBS cost
                    continue
                # List position scen holds scenario number scen + 1
                ratios[str(scen + 1)] = pbs_cost / cbs_cost
                
# calculate processed statistics: success rate, total time, num scenarios solved,
# average time, min solved agents, max solved agents

# Number of scenarios in the inclusive experiment range. Dividing by
# experiment_range[1] alone is only correct when the range starts at 1.
num_scenarios = experiment_range[1] - experiment_range[0] + 1

for algorithm in ["CBS", "PBS"]:
    for res in resolutions:
        for name in map_names:
            per_agents = all_data[algorithm][res][name]
            all_agents = sorted(per_agents)

            if not all_agents:
                continue
            # Largest agent count that has an entry for this map
            max_agents = all_agents[-1]

            min_agents = all_agents[0]
            # Solved count at the smallest agent count; used as the baseline
            num_solved_scenarios = len(per_agents[min_agents]["time"])
            if num_solved_scenarios == 0:
                continue

            map_stats = stats[algorithm][res][name]
            map_stats["success_rate"] = dict()
            map_stats["total_time"] = dict()
            map_stats["num_solved_scen"] = dict()
            map_stats["average_time"] = dict()

            for num_agents in all_agents:
                solve_times = per_agents[num_agents]["time"]
                num_solved = len(solve_times)

                # min_agents ends up as the largest agent count whose solved
                # count still equals the baseline.
                if num_solved_scenarios == num_solved:
                    min_agents = num_agents

                str_agent_num = str(num_agents)

                map_stats["success_rate"][str_agent_num] = num_solved / num_scenarios
                if num_solved == 0:
                    # No runtimes to aggregate for this agent count
                    continue

                all_time = sum(solve_times)
                map_stats["total_time"][str_agent_num] = all_time
                map_stats["num_solved_scen"][str_agent_num] = num_solved
                map_stats["average_time"][str_agent_num] = all_time / num_solved

            map_stats["min_agents"] = min_agents
            map_stats["max_agents"] = max_agents

## Export to CSV

In [None]:
# All exported CSV files go into a "Data" folder under the current working directory
data_dir = os.path.join(os.getcwd(), "Data")
raw_file, time_file, cost_file, count_file, success_file = [
    os.path.join(data_dir, csv_name)
    for csv_name in (
        "raw_data.csv",
        "time_stats.csv",
        "cost_ratios.csv",
        "solved_count.csv",
        "success_rate.csv",
    )
]

# Create the output folder on first use
if not os.path.isdir(data_dir):
    os.mkdir(data_dir)
    print("Making a new directory to hold all scraped data.")


In [None]:
# Output Raw Data

# Rows are collected in a list and joined once; the first entry is the header.
raw_rows = ["Algorithm,Resolution,Map,Scenario,Num Agents,Flowtime,Time\n"]

for algorithm in ["CBS", "PBS"]:
    for res in resolutions:
        for name in map_names:
            per_agents = all_data[algorithm][res][name]
            for num_agents in sorted(per_agents):
                flowtimes = per_agents[num_agents]["flowtime_export"]
                runtimes = per_agents[num_agents]["time_export"]
                # The export dicts are keyed by the scenario number as a string
                for scen_num, flowtime in flowtimes.items():
                    columns = [
                        algorithm,
                        str(int(res)),
                        name,
                        scen_num,
                        str(int(num_agents)),
                        str(flowtime),
                        str(runtimes[scen_num]),
                    ]
                    raw_rows.append(",".join(columns) + "\n")

csv_output = "".join(raw_rows)
with open(raw_file, "w") as f:
    f.write(csv_output)

In [None]:
# Output Statistics
time_rows = ["Algorithm,Resolution,Map,Agents,Total Time,Number Solved Scenarios,Average Time\n"]

for algorithm in ["CBS", "PBS"]:
    for res in resolutions:
        for name in map_names:
            map_stats = stats[algorithm][res][name]
            # Maps for which no statistics were computed have no "total_time" entry
            if "total_time" not in map_stats:
                continue
            for agent_num, total_time in map_stats["total_time"].items():
                columns = [
                    algorithm,
                    str(int(res)),
                    name,
                    agent_num,
                    str(total_time),
                    str(map_stats["num_solved_scen"][agent_num]),
                    str(map_stats["average_time"][agent_num]),
                ]
                time_rows.append(",".join(columns) + "\n")

csv_output = "".join(time_rows)
with open(time_file, "w") as f:
    f.write(csv_output)

In [None]:
# Output min / max agent counts; every (algorithm, resolution, map) gets a row
count_rows = ["Algorithm,Resolution,Map,Min Agents,Max Agents\n"]

for algorithm in ["CBS", "PBS"]:
    for res in resolutions:
        for name in map_names:
            map_stats = stats[algorithm][res][name]
            if len(map_stats) == 0:
                # No statistics were computed for this combination: report 0,0
                agent_bounds = ["0", "0"]
            else:
                agent_bounds = [
                    str(int(map_stats["min_agents"])),
                    str(int(map_stats["max_agents"])),
                ]
            columns = [algorithm, str(int(res)), name] + agent_bounds
            count_rows.append(",".join(columns) + "\n")

csv_output = "".join(count_rows)
with open(count_file, "w") as f:
    f.write(csv_output)

In [None]:
# Output success rates, one row per agent count
success_rows = ["Algorithm,Resolution,Map,Agents,Success Rate\n"]

for algorithm in ["CBS", "PBS"]:
    for res in resolutions:
        for name in map_names:
            map_stats = stats[algorithm][res][name]
            # Combinations without any computed statistics are left out
            if len(map_stats) == 0:
                continue
            for agent_num, success_rate in map_stats["success_rate"].items():
                columns = [algorithm, str(int(res)), name, agent_num, str(success_rate)]
                success_rows.append(",".join(columns) + "\n")

csv_output = "".join(success_rows)
with open(success_file, "w") as f:
    f.write(csv_output)

In [None]:
# Output Cost Ratios
ratio_rows = ["Resolution,Map,Scenario,Agents,Cost Ratio\n"]

for res in resolutions:
    for name in map_names:
        for num_agents, ratios in stats["CostRatio"][res][name].items():
            # ratios is keyed by the scenario number as a string
            for scen_num, ratio in ratios.items():
                columns = [str(int(res)), name, scen_num, str(num_agents), str(ratio)]
                ratio_rows.append(",".join(columns) + "\n")

csv_output = "".join(ratio_rows)
with open(cost_file, "w") as f:
    f.write(csv_output)