In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
import pydot
import os
from fnmatch import fnmatch
from typing import Tuple, Dict, List
from tqdm.notebook import tqdm

# Result CSV path template; params are (project name, config name).
# Built with os.path.join so the separators match the host OS instead of
# being hard-coded Windows backslashes.
LOCATION = os.path.join("..", "out", "%s", "%s.csv")
# Merged graph path template; params are (project, datapoint, concepts).
DATAPOINT = os.path.join("..", "data", "corpora_clean", "%s", "%s", "%d", "merged.dot")

# Shared plot settings used by the figure cells below
sns.set_style("whitegrid")
figsize=(5,4)
dpi=300
palette="dark"

# Fix plot fonts: font type 42 makes the PDF/PS backends embed TrueType fonts,
# and STIX is used for both regular text and math text.
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42
matplotlib.rcParams['mathtext.fontset'] = 'stix'
matplotlib.rcParams['font.family'] = 'STIXGeneral'

In [2]:
# Projects whose result files are loaded by the cells below.
projects = [
    "Commandline", "CommonMark", "Hangfire",
    "Humanizer", "Lean", "Nancy",
    "Newtonsoft.Json", "Ninject", "RestSharp",
]

# Method labels in the order they should appear on plot axes and in printouts:
# single edge kinds first, then pairs, then all three.
methods_ordered = ["WL %s" % edge_kinds for edge_kinds in ("c", "d", "n", "dc", "dn", "cn", "dcn")]

In [3]:
import itertools

# Every on/off combination of the three edge kinds (data, control, name).
# The first element of the product is (False, False, False), i.e. nothing
# kept, and is skipped.
l = [False, True]
configs = list(itertools.product(l, repeat=3))[1:]

result_columns = ['Datapoint', 'Concepts', 'Accuracy', 'Overlap', 'Time']

# Collect the per-file frames and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole
# frame on every iteration.
loaded_frames = []
for project in tqdm(projects):
    for with_data, with_call, with_name in tqdm(configs, leave=False):
        # Build the method suffix ("d", "c", "n", ...) and the matching
        # edge-kind part of the result file name.
        suffix = ""
        edges_kept = ""
        if with_data:
            suffix += "d"
            edges_kept += "data"
        if with_call:
            suffix += "c"
            edges_kept += "control"
        if with_name:
            suffix += "n"
            edges_kept += "name"

        location = LOCATION % (project, ("wl_%s_1_results_%s" % (edges_kept, suffix)))
        # Only the read itself is guarded, so unrelated errors are not hidden
        try:
            temp_df = pd.read_csv(location, index_col=False)
        except FileNotFoundError:
            continue # If it's not there, it's not there yet :)

        # Tag every row with where it came from
        temp_df['Project'] = project
        temp_df['Method'] = "WL %s" % suffix
        loaded_frames.append(temp_df)

if loaded_frames:
    df = pd.concat(loaded_frames, ignore_index=True, sort=False)
    # Keep the expected result columns first (and present), then any extras
    df = df.reindex(columns=result_columns + [c for c in df.columns if c not in result_columns])
else:
    # No result file found yet: keep an empty frame with the full schema so
    # the lines below (and later cells) do not fail on missing columns.
    df = pd.DataFrame(columns=result_columns + ['Project', 'Method'])

# Ordered categorical so sorting and plotting follow methods_ordered
df['Method'] = pd.Categorical(df['Method'], methods_ordered)
df = df.sort_values('Method')
df

In [4]:
# Box plot of accuracy per method, split by number of concepts
# (rows with a single concept are excluded).
fig = plt.figure(figsize=figsize, dpi=dpi)
ax = fig.gca()
sns.boxplot(data=df[df.Concepts > 1], x="Method", y="Accuracy", hue="Concepts", palette=palette, ax=ax)

# Restyle every box as a coloured outline without fill.
# matplotlib >= 3.5 stores the boxes drawn by seaborn in ax.patches (as
# PathPatch objects); older versions store them in ax.artists. Looking only
# at ax.artists silently restyles nothing on newer versions.
box_patches = [p for p in ax.patches if type(p) == matplotlib.patches.PathPatch]
if not box_patches:
    box_patches = list(ax.artists)

# Each box owns the same number of Line2D objects (whiskers, caps, median,
# fliers); derive the count instead of assuming 6.
lines_per_box = len(ax.lines) // len(box_patches) if box_patches else 0

for i, box in enumerate(box_patches):
    # Set the linecolor on the box to the facecolor, and set the facecolor to None
    col = box.get_facecolor()
    box.set_edgecolor(col)
    box.set_facecolor('None')

    # Use the same colour for the lines that belong to this box
    for line in ax.lines[i * lines_per_box:(i + 1) * lines_per_box]:
        line.set_color(col)
        line.set_mfc(col)
        line.set_mec(col)

# Also fix the legend (there is none if nothing was plotted)
legend = ax.get_legend()
if legend is not None:
    for legpatch in legend.get_patches():
        col = legpatch.get_facecolor()
        legpatch.set_edgecolor(col)
        legpatch.set_facecolor('None')

plt.xticks(
    horizontalalignment='center',
    fontweight='light',
    fontsize='x-small',
)

plt.ylim((-0.0,1.0))

plt.savefig("Accuracy_ablation.pdf",
            # High resolution, as commonly recommended for publication plots
            dpi=1000,
            # Crop the saved figure tightly around its contents
            bbox_inches='tight',
           )

In [12]:
# Median accuracy on rows with more than one concept: first per method,
# then per method and concept count.
multi_concept_rows = df[df.Concepts > 1]
for m in methods_ordered:
    slice_ = multi_concept_rows[multi_concept_rows.Method == m]
    print('%s: %2.3f' % (m, slice_.Accuracy.median()))
    for c in (2, 3, 4):
        per_count = slice_[slice_.Concepts == c]
        print('%s, %d: %2.3f' % (m, c, per_count.Accuracy.median()))

In [5]:
# Box plot of run time per method, split by number of concepts
# (rows with a single concept are excluded).
fig = plt.figure(figsize=figsize, dpi=dpi)
ax = fig.gca()
sns.boxplot(data=df[df.Concepts > 1], x="Method", y="Time", hue="Concepts", palette=palette, ax=ax)

# Restyle every box as a coloured outline without fill.
# matplotlib >= 3.5 stores the boxes drawn by seaborn in ax.patches (as
# PathPatch objects); older versions store them in ax.artists. Looking only
# at ax.artists silently restyles nothing on newer versions.
box_patches = [p for p in ax.patches if type(p) == matplotlib.patches.PathPatch]
if not box_patches:
    box_patches = list(ax.artists)

# Each box owns the same number of Line2D objects (whiskers, caps, median,
# fliers); derive the count instead of assuming 6.
lines_per_box = len(ax.lines) // len(box_patches) if box_patches else 0

for i, box in enumerate(box_patches):
    # Set the linecolor on the box to the facecolor, and set the facecolor to None
    col = box.get_facecolor()
    box.set_edgecolor(col)
    box.set_facecolor('None')

    # Use the same colour for the lines that belong to this box
    for line in ax.lines[i * lines_per_box:(i + 1) * lines_per_box]:
        line.set_color(col)
        line.set_mfc(col)
        line.set_mec(col)

# Also fix the legend (there is none if nothing was plotted)
legend = ax.get_legend()
if legend is not None:
    for legpatch in legend.get_patches():
        col = legpatch.get_facecolor()
        legpatch.set_edgecolor(col)
        legpatch.set_facecolor('None')

plt.xticks(
    horizontalalignment='center',
    fontweight='light',
    fontsize='x-small',
)

# plt.ylim((0, 75))

plt.savefig("Time_ablation.pdf",
            # High resolution, as commonly recommended for publication plots
            dpi=1000,
            # Crop the saved figure tightly around its contents
            bbox_inches='tight',
           )

In [6]:
# Per-method summary, printed and stored as
# (atomic, 1 - multi, acc_atomic, acc, mean time) in results_atomic.
results_atomic = dict()
for m in methods_ordered:
    wl_df = df[df.Method == m]
    single_concept = wl_df[wl_df.Concepts == 1]
    multi_concept = wl_df[wl_df.Concepts > 1]

    # Share of rows whose Overlap is missing, for each of the two groups
    atomic = single_concept.Overlap.isnull().mean()
    multi = multi_concept.Overlap.isnull().mean()
    # Mean accuracy for each of the two groups
    acc_atomic = single_concept.Accuracy.mean()
    acc = multi_concept.Accuracy.mean()
    # Mean time over all rows of this method
    mean_time = wl_df.Time.mean()

    print('Method %s' % m)
    report = (
        ('Correctly atomic:', '%1.4f', atomic),
        ('Incorrectly atomic:', '%1.4f', multi),
        ('Atomic graph labelling accuracy:', '%1.4f', acc_atomic),
        ('Untangle accuracy:', '%1.4f', acc),
        ('Untangle time:', '%-1.4f', mean_time),
    )
    for label, fmt, value in report:
        print(label)
        print(fmt % value)
    print()

    # Note: the second entry is stored as 1 - multi, not multi itself
    results_atomic[m] = (atomic, 1 - multi, acc_atomic, acc, mean_time)

In [7]:
import numpy as np
from scipy import spatial
from functools import reduce


def filter_(pts, pt):
    """
    Drop from pts every point that is Pareto dominated by pt.

    A row is dominated when it is <= pt in every coordinate and < pt in
    at least one, so larger values count as better. Rows equal to pt are
    kept. Returns the remaining rows of pts, in their original order.
    """
    no_better_anywhere = np.all(pts <= pt, axis=-1)
    worse_somewhere = np.any(pts < pt, axis=-1)
    dominated = np.logical_and(no_better_anywhere, worse_somewhere)
    return pts[np.logical_not(dominated)]


def get_pareto_undominated_by(pts1, pts2=None):
    """
    Keep the points of pts1 that no point of pts2 Pareto dominates.

    When pts2 is omitted, pts1 is filtered against itself. Each point of
    pts2 is applied in turn through filter_.
    """
    reference = pts1 if pts2 is None else pts2
    survivors = pts1
    for ref_pt in reference:
        survivors = filter_(survivors, ref_pt)
    return survivors


def get_pareto_frontier(pts):
    """
    Return the Pareto-optimal rows of pts, using the convex hull heuristic.

    Points are peeled off in rounds: the vertices of the convex hull are
    extracted, the undominated ones are kept as part of the frontier, and
    the remaining points are filtered against them. Once fewer than 10
    points remain they are filtered against each other directly.

    pts is a 2-D array-like with one point per row. The result is an array
    of the frontier rows (stacked group by group, so not necessarily in
    input order). An empty input yields an empty array instead of raising
    from np.vstack on an empty list.
    """
    pts = np.asarray(pts)
    if pts.shape[0] == 0:
        return pts.copy()

    pareto_groups = []

    # loop while there are points remaining
    while pts.shape[0]:
        # brute force if there are few points:
        if pts.shape[0] < 10:
            pareto_groups.append(get_pareto_undominated_by(pts))
            break

        # compute vertices of the convex hull
        hull_vertices = spatial.ConvexHull(pts).vertices

        # get corresponding points
        hull_pts = pts[hull_vertices]

        # get points in pts that are not convex hull vertices
        nonhull_mask = np.ones(pts.shape[0], dtype=bool)
        nonhull_mask[hull_vertices] = False
        pts = pts[nonhull_mask]

        # get points in the convex hull that are on the Pareto frontier
        pareto = get_pareto_undominated_by(hull_pts)
        pareto_groups.append(pareto)

        # filter remaining points to keep those not dominated by
        # Pareto points of the convex hull
        pts = get_pareto_undominated_by(pts, pareto)

    return np.vstack(pareto_groups)

In [8]:
# One objective row per method, oriented so that larger is better in every
# column, which is what filter_ assumes (it discards points that are <=
# another point). The stored tuples are
# (atomic, 1 - multi, atomic accuracy, accuracy, time).
# NOTE(review): the first four are treated as higher-is-better scores and
# time as lower-is-better, so only time is negated. The earlier `1 - values`
# flipped every column at once, which leaves the columns inconsistently
# oriented whichever direction the frontier function prefers.
method_names = list(results_atomic.keys())
objectives = np.asarray(list(results_atomic.values()), dtype=float)
objectives[:, -1] *= -1

# Frontier rows, in objective space (time column negated)
ablation_result = get_pareto_frontier(objectives)

if len(ablation_result) == len(method_names):
    print('All solutions are pareto optimal')
else:
    # Map the frontier rows back to method names by exact row match
    # (zero tolerance; NaN rows compare equal to themselves).
    pareto_methods = [
        name
        for name, row in zip(method_names, objectives)
        if np.isclose(ablation_result, row, rtol=0, atol=0, equal_nan=True).all(axis=1).any()
    ]
    print('Pareto optimal solutions are:')
    print(pareto_methods)
    print(ablation_result)

In [9]:
# Summary table: one row per method, one column per metric.
# Note that the 'Multi' column holds the stored 1 - multi value.
metric_names = ['Atomic', 'Multi', 'Atomic Accuracy', 'Accuracy', 'Time']
metrics_by_method = pd.DataFrame(data=results_atomic, index=metric_names)
results_df = metrics_by_method.transpose()
# Expose the method name as a regular column so it can be used as plot hue
results_df['Method'] = results_df.index
results_df

In [10]:
# One scatter plot per pair of summary metrics, each saved to its own PDF.
metric_columns = ['Atomic', 'Multi', 'Atomic Accuracy', 'Accuracy', 'Time']
for x, y in itertools.combinations(metric_columns, 2):
    fig, ax = plt.subplots(figsize=figsize, dpi=dpi)
    sns.scatterplot(data=results_df, x=x, y=y, hue="Method", palette=palette, ax=ax)

    # Time goes on a log axis; every other metric is shown on [0, 1]
    if y == "Time":
        ax.set_yscale("log")
    else:
        ax.set_ylim((0, 1))
    ax.set_xlim((0, 1))

    fig.savefig(
        "Comparison_%s_%s_ablation.pdf" % (x, y),
        # High resolution, as commonly recommended for publication plots
        dpi=1000,
        # Crop the saved figure tightly around its contents
        bbox_inches='tight',
    )