In [1]:
from analysis_funcs import *

In [2]:
data_file = './data/jekyll/jekyll_parsed_steps.json'
with open(data_file, 'r') as f:
    test_data = json.load(f)

In [3]:
# Convert JSON steps into strings
step_definition_strings = stringify_test_cases(test_data, "step_definition")
step_name_strings = stringify_test_cases(test_data, "step_name")
scenario_nums, scenario_title_strings = stringify_test_titles(test_data)

# Calculate matrix for only NCD
matrix = calculate_pairwise_ncd(step_name_strings)

In [4]:
# Determine number of clusters
num_clusters = len(set([test['feature_file'] for test in test_data]))
true_cluster_labels = true_clusters(test_data)

In [5]:
predicted_clusters = kmeans_clustering(matrix, num_clusters, scenario_title_strings)

In [6]:
predicted_clusters

{0: ['Alter the payload for one page but not another',
  'Modify page contents before writing to disk',
  'Work with a page after writing it to disk',
  'Alter the payload for certain posts',
  'Modify post contents before writing to disk',
  'Work with a post after writing it to disk',
  'Allow hooks to have a named priority',
  'Rouge renders code block once'],
 1: ['Unrendered collection',
  'Filter documents by where',
  'Basic site with unpublished page',
  'Include a nested file relative to a page at root',
  'Include a file from a variable',
  'Rebuild when layout is changed',
  'Rebuild when an include is changed',
  'Markdown in pagination on index',
  'Use custom permalink schema with cased file name',
  "Don't place asset files in layout",
  'Render content of another page',
  'Overriding a theme with SCSS',
  'A theme with an include',
  'A theme without data',
  'A theme with data overridden by data in source directory',
  'A theme with a layout',
  'A theme with *just* la

In [7]:
true_cluster_labels

{'cache.feature': ['Default Cache directory',
  'Custom Cache directory',
  'Disk usage in safe mode',
  'Disabling disk usage in non-safe mode'],
 'collections.feature': ['Unrendered collection',
  'Rendered collection',
  'Rendered collection at a custom URL',
  'Rendered document in a layout',
  'Collections specified as an array',
  'Collections specified as an hash',
  'Rendered collection with document with future date',
  'Access rendered collection with future dated document via Liquid',
  'Access rendered and published collection documents via Liquid',
  'Unrendered collection with future dated document',
  'Access unrendered collection with future dated document via Liquid',
  'Access unrendered but publishable collection documents via Liquid',
  'Access rendered collection with future date and unpublished flag via Liquid',
  'Access unrendered collection with future date and unpublished flag via Liquid',
  'All the documents',
  'Documents have an output attribute, which is 

In [8]:
len(true_cluster_labels)

28

In [9]:
len(predicted_clusters)

28

In [11]:
# match the predicted cluster labels with the true cluster labels using Hungarian Algorithm
matches, precision, recall, f1, mean_avg_precision = cluster_similarity(true_cluster_labels, predicted_clusters)

In [12]:
matches

[('cache.feature', 24),
 ('collections.feature', 12),
 ('collections_dir.feature', 16),
 ('create_sites.feature', 14),
 ('data.feature', 15),
 ('drafts.feature', 26),
 ('embed_filters.feature', 17),
 ('frontmatter_defaults.feature', 6),
 ('highlighting.feature', 27),
 ('hooks.feature', 0),
 ('include_relative_tag.feature', 4),
 ('include_tag.feature', 20),
 ('incremental_rebuild.feature', 8),
 ('layout_data.feature', 9),
 ('link_tag.feature', 2),
 ('markdown.feature', 5),
 ('pagination.feature', 13),
 ('permalinks.feature', 10),
 ('plugins.feature', 3),
 ('post_data.feature', 21),
 ('post_excerpts.feature', 11),
 ('post_url_tag.feature', 18),
 ('rendering.feature', 22),
 ('site_configuration.feature', 23),
 ('site_data.feature', 25),
 ('theme.feature', 1),
 ('theme_configuration.feature', 19),
 ('theme_gem.feature', 7)]

In [13]:
mean_avg_precision

0.07142857142857142

In [14]:
precision

0.027972027972027972

## Double check precision calculation

In [15]:
true_clusters = true_cluster_labels
predicted_clusters = predicted_clusters

In [16]:
true_keys = list(true_clusters.keys())
pred_keys = list(predicted_clusters.keys())

In [17]:
true_keys

['cache.feature',
 'collections.feature',
 'collections_dir.feature',
 'create_sites.feature',
 'data.feature',
 'drafts.feature',
 'embed_filters.feature',
 'frontmatter_defaults.feature',
 'highlighting.feature',
 'hooks.feature',
 'include_relative_tag.feature',
 'include_tag.feature',
 'incremental_rebuild.feature',
 'layout_data.feature',
 'link_tag.feature',
 'markdown.feature',
 'pagination.feature',
 'permalinks.feature',
 'plugins.feature',
 'post_data.feature',
 'post_excerpts.feature',
 'post_url_tag.feature',
 'rendering.feature',
 'site_configuration.feature',
 'site_data.feature',
 'theme.feature',
 'theme_configuration.feature',
 'theme_gem.feature']

In [18]:
pred_keys

[0,
 1,
 2,
 3,
 4,
 5,
 6,
 7,
 8,
 9,
 10,
 11,
 12,
 13,
 14,
 15,
 16,
 17,
 18,
 19,
 20,
 21,
 22,
 23,
 24,
 25,
 26,
 27]

In [22]:
overlap_matrix = np.zeros((len(true_keys), len(pred_keys)))

for i, true_key in enumerate(true_keys):
    true_values = set(true_clusters[true_key])
    for j, pred_key in enumerate(pred_keys):
        pred_values = set(predicted_clusters[pred_key])
        overlap = true_values & pred_values
        overlap_matrix[i, j] = len(overlap)
    
cost_matrix = -overlap_matrix

# Apply the Hungarian algorithm to find the optimal assignment
row_ind, col_ind = linear_sum_assignment(cost_matrix)

true_labels = []
pred_labels = []

for i, true_key in enumerate(true_keys):
    for value in true_clusters[true_key]:
        true_labels.append(i)
        
for j, pred_key in enumerate(pred_keys):
    for value in predicted_clusters[pred_key]:
        pred_labels.append(col_ind[j])

In [28]:
row_ind

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27], dtype=int64)

In [24]:
col_ind

array([24, 12, 16, 14, 15, 26, 17,  6, 27,  0,  4, 20,  8,  9,  2,  5, 13,
       10,  3, 21, 11, 18, 22, 23, 25,  1, 19,  7], dtype=int64)

In [25]:
pred_keys.index(0)

0

In [26]:
predicted_clusters

{0: ['Alter the payload for one page but not another',
  'Modify page contents before writing to disk',
  'Work with a page after writing it to disk',
  'Alter the payload for certain posts',
  'Modify post contents before writing to disk',
  'Work with a post after writing it to disk',
  'Allow hooks to have a named priority',
  'Rouge renders code block once'],
 1: ['Unrendered collection',
  'Filter documents by where',
  'Basic site with unpublished page',
  'Include a nested file relative to a page at root',
  'Include a file from a variable',
  'Rebuild when layout is changed',
  'Rebuild when an include is changed',
  'Markdown in pagination on index',
  'Use custom permalink schema with cased file name',
  "Don't place asset files in layout",
  'Render content of another page',
  'Overriding a theme with SCSS',
  'A theme with an include',
  'A theme without data',
  'A theme with data overridden by data in source directory',
  'A theme with a layout',
  'A theme with *just* la

In [27]:
true_clusters

{'cache.feature': ['Default Cache directory',
  'Custom Cache directory',
  'Disk usage in safe mode',
  'Disabling disk usage in non-safe mode'],
 'collections.feature': ['Unrendered collection',
  'Rendered collection',
  'Rendered collection at a custom URL',
  'Rendered document in a layout',
  'Collections specified as an array',
  'Collections specified as an hash',
  'Rendered collection with document with future date',
  'Access rendered collection with future dated document via Liquid',
  'Access rendered and published collection documents via Liquid',
  'Unrendered collection with future dated document',
  'Access unrendered collection with future dated document via Liquid',
  'Access unrendered but publishable collection documents via Liquid',
  'Access rendered collection with future date and unpublished flag via Liquid',
  'Access unrendered collection with future date and unpublished flag via Liquid',
  'All the documents',
  'Documents have an output attribute, which is 

In [35]:
i = 0
total_correct = 0
total_scenarios = 0
average_precisions = []
for true_key, true_labels in true_clusters.items():
    cur_correct = 0
    pred_cluster_index = col_ind[i]
    pred_labels = set(predicted_clusters[pred_cluster_index])

    for label in true_labels:
        total_scenarios += 1
        if label in pred_labels:
            total_correct += 1
            cur_correct += 1

    average_precisions.append(cur_correct/len(true_labels))
            
print(np.mean(average_precisions))
print(total_correct/total_scenarios)

0.0699602918090313
0.05244755244755245
