In [None]:
# Task 3

### Question 3.1 - How would you measure diversity?

First, we look at how the diversity of a subset of answers can be measured. To calculate the diversity, we need to differentiate between two types of answers: _categorical_ and _continuous_.

An example for a categorical column is gender. We can measure the diversity in this case with the _entropy_ measure. Since entropy measures how spread out or balanced the selected values are, it is ideal. In the end we want to normalize the entropy to account for classes that are not in the selected subset.

For continuous data columns like _age_ we can measure the diversity using the _standard deviation_. We could also use the _entropy_ and treat every age as a separate class. However, this would mean that we don't account for the absolute age difference between workers, just for different ages. We therefore decided that the standard deviation is the better metric.
To get a score between 0 and 1, it has to be normalized: the std. deviation of the subset is divided by the maximum possible std. deviation of the full dataset. This maximum is half the difference between the maximum and minimum values of the original dataset.

In the end we calculate our total diversity score by calculating the diversity of each relevant column and then generating their mean.

In [None]:
import pandas as pd
import numpy as np
from scipy.stats import entropy


Excerpt of the original dataframe:

In [None]:
# Load the raw answers and flag every answer as correct (1) or incorrect (0).
df = pd.read_csv('../../data/answerList_data.csv')

# Correct means: 'NO' where GroundTruth is 0.0, or 'YES' where GroundTruth is 1.0.
# Every other combination ends up as 0.
df['correctness'] = (
    (df['GroundTruth'].eq(0.0) & df['Answer.option'].eq('NO'))
    | (df['GroundTruth'].eq(1.0) & df['Answer.option'].eq('YES'))
).astype(int)

df

For our analysis we pick the Failing Method "HIT01_8" and look at question no. 1.
We then filter out incorrect answers and are left with 12 correct answers, given to question 1.

In [None]:
# Narrow down to question 1, then keep only the rows flagged as correct.
df_question_1 = df.loc[df['Question.ID'].eq(1)]
df_question_1_correct = df_question_1.loc[df_question_1['correctness'].eq(1)]

df_question_1_correct

For demonstration we can print the gender diversity and its normalized value of all picked answers compared to the full dataset:

```python
def categorical_diversity(series):
    counts = series.value_counts(normalize=True)
    return entropy(counts, base=2)


def categorical_diversity_norm(series, series_full):
    k = series_full.nunique()
    return categorical_diversity(series) / (np.log2(k) if k > 1 else 1)
```

In [None]:
def categorical_diversity(series):
    """Return the Shannon entropy (base 2) of the value distribution in `series`.

    The relative frequency of each distinct value is used as its probability.
    """
    class_shares = series.value_counts(normalize=True)
    return entropy(class_shares, base=2)


def categorical_diversity_norm(series, series_full):
    """Return the entropy of `series` divided by the largest entropy `series_full` allows.

    The divisor is log2 of the number of distinct values in `series_full`.
    When `series_full` has at most one distinct value the divisor is 1, which
    avoids dividing by log2(1) == 0.
    """
    n_classes = series_full.nunique()
    max_entropy = np.log2(n_classes) if n_classes > 1 else 1
    return categorical_diversity(series) / max_entropy


# Entropy (base 2) of the gender column among the correct answers to question 1.
gender_diversity = categorical_diversity(df_question_1_correct['Worker.gender'])
# The same entropy, scaled by log2 of the number of distinct genders in the full dataset.
gender_diversity_norm = categorical_diversity_norm(df_question_1_correct['Worker.gender'], df['Worker.gender'])

print('Gender diversity of all answers:', gender_diversity)
print('Normalized gender diversity', gender_diversity_norm)

For continuous variables, we use the following method:
```python
def coefficient_variation(series, series_full):
    if len(series) < 2:
        return 0
    return series.std() / ((series_full.max() - series_full.min()) / 2)
```

For demonstration purposes we print the age diversity of all picked answers compared to the full dataset:

In [None]:
def coefficient_variation(series, series_full):
    """Return the spread of `series`, scaled to [0, 1] by the value range of `series_full`.

    Despite the name, this is not the statistical coefficient of variation
    (std / mean): it is the standard deviation of `series` divided by half the
    range (max - min) of `series_full`.

    The population standard deviation (ddof=0) is used on purpose. For values
    confined to [min, max] it can never exceed half the range, so the result
    stays within [0, 1] whenever the values of `series` lie inside the range of
    `series_full`. pandas' default sample standard deviation (ddof=1) can
    exceed that bound, e.g. about 1.41 for two values sitting at the extremes.

    Returns 0 when `series` has fewer than two entries, or when `series_full`
    has no spread at all (otherwise the division would be 0 / 0).
    """
    if len(series) < 2:
        return 0
    max_std = (series_full.max() - series_full.min()) / 2
    if max_std == 0:
        # A constant reference column allows no diversity.
        return 0
    return series.std(ddof=0) / max_std

# Spread of the workers' ages among the correct answers to question 1,
# scaled by the age range found in the full dataset.
age_diversity = coefficient_variation(df_question_1_correct['Worker.age'], df['Worker.age'])

print('Normalized age diversity', age_diversity)

In [None]:
def combined_diversity_score(df_selection, df_all, categorical_cols, continuous_cols):
    """Return the equally weighted mean of the per-column diversity scores.

    Categorical columns are scored with `categorical_diversity_norm`, continuous
    columns with `coefficient_variation`; in both cases `df_selection[col]` is
    compared against `df_all[col]`. Every column gets the weight
    1 / (number of categorical columns + number of continuous columns).
    """
    n_factors = len(categorical_cols) + len(continuous_cols)

    def weighted(metric, col):
        # One column's score, scaled by the shared weight.
        return 1 / n_factors * metric(df_selection[col], df_all[col])

    categorical_part = sum(weighted(categorical_diversity_norm, col) for col in categorical_cols)
    continuous_part = sum(weighted(coefficient_variation, col) for col in continuous_cols)

    return categorical_part + continuous_part

def list_diversity_scores(df_selection, df_all, categorical_cols, continuous_cols):
    """Return the weighted diversity score of every column as a list.

    The list holds the categorical columns first, then the continuous ones,
    each in the order given. Every entry is already multiplied by
    1 / (total number of columns), so the entries add up to the combined score.
    """
    n_factors = len(categorical_cols) + len(continuous_cols)

    weighted_scores = []
    for col in categorical_cols:
        weighted_scores.append(1 / n_factors * categorical_diversity_norm(df_selection[col], df_all[col]))
    for col in continuous_cols:
        weighted_scores.append(1 / n_factors * coefficient_variation(df_selection[col], df_all[col]))

    return weighted_scores

For our analysis, we chose the following categorical columns
- Worker.profession
- Worker.gender

and the following continuous columns
- Worker.age
- Worker.yearsOfExperience

To combine multiple factors into a single diversity score, we take both categorical and continuous columns, give each an equal weight and sum them up.
The diversity score is then normalized by the number of factors.

The combined diversity score of our picked answers compared to the full dataset:

In [None]:
# Columns that enter the combined diversity score, grouped by how they are measured.
categorical_cols = ['Worker.profession', 'Worker.gender']
continuous_cols = ['Worker.yearsOfExperience', 'Worker.age']

# Diversity of the correct answers to question 1, measured against the full dataset.
diversity_score = combined_diversity_score(df_question_1_correct, df, categorical_cols, continuous_cols)

print('Combined Diversity', diversity_score)

In [None]:
def select_n(df, n, random_state=None):
    """Return `n` randomly chosen rows of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to draw rows from.
    n : int
        Number of rows to draw (without replacement).
    random_state : optional
        Passed straight to `DataFrame.sample`. Supply a fixed value to get the
        same rows on every call; the default `None` keeps the previous,
        unseeded behaviour.
    """
    return df.sample(n=n, random_state=random_state)

In [None]:
# Seed NumPy's global random generator so this demo prints the same samples on
# every run. `DataFrame.sample` falls back to that generator when it is not
# given a `random_state` of its own.
np.random.seed(42)

# Draw one random subset per size 1..9 and show its per-column and total diversity,
# measured against all correct answers to question 1.
for n in range(1, 10):
    sample_df = select_n(df_question_1_correct, n)
    div_scores = list_diversity_scores(sample_df, df_question_1_correct, categorical_cols, continuous_cols)
    print('Sample df\n', sample_df[categorical_cols+continuous_cols])
    print('Diversity score\n', div_scores)
    print('Total diversity score', combined_diversity_score(sample_df, df_question_1_correct, categorical_cols, continuous_cols))
    print('\n\n')


### Subsets

We are now going to look at subsets of the 12 previously picked answers.
We will evaluate these subsets according to three different measures:
- diversity (with regard to the workers who gave the answers)
- readability
- semantic similarity of the subset to our hand-crafted ground truth


In [None]:


# Hand-written reference explanation of the bug; subset texts are compared against it.
ground_truth = "The IF statement in line 279 checks whether minutesOffset is set to a value between 0 and 59. If not, an IllegalArgumentException is thrown. This is a bug because the minutesOffset may also be negative. The IF statement should check for the minutesOffset to be between -59 and 59."

# The pool of answers that subsets are drawn from: the correct answers to question 1.
bug_group = df_question_1_correct

To check for similarity, we embed the two sentences to be compared with the model
"all-MiniLM-L6-v2" and use cosine similarity on the transformed embeddings.
```python
def calculate_cosine_similarity(hyp, ref):
    hyp_embedding = model.encode(hyp)
    ref_embedding = model.encode(ref)
    return np.dot(hyp_embedding, ref_embedding) / (np.linalg.norm(hyp_embedding) * np.linalg.norm(ref_embedding))

```

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model used by every similarity computation below.
model = SentenceTransformer('all-MiniLM-L6-v2')


def calculate_cosine_similarity(hyp, ref, ref_embedding=None):
    """Return the cosine similarity between the embeddings of `hyp` and `ref`.

    `hyp` is always embedded with the module-level `model`. `ref` is embedded
    only when `ref_embedding` is None; otherwise the supplied embedding is used
    as-is and `ref` is ignored, so a fixed reference text can be embedded once
    and reused across many calls.
    """
    hypothesis_vec = model.encode(hyp)
    reference_vec = ref_embedding if ref_embedding is not None else model.encode(ref)
    norm_product = np.linalg.norm(hypothesis_vec) * np.linalg.norm(reference_vec)
    return np.dot(hypothesis_vec, reference_vec) / norm_product


To calculate a readability score, we are using the Flesch Reading Ease formula.

In [None]:
import textstat
def calculate_flesch_reading_ease(text):
    """Return the Flesch Reading Ease score that `textstat` computes for `text`."""
    score = textstat.flesch_reading_ease(text)
    return score

def calculate_automated_readability_index(text):
    """Return the Automated Readability Index that `textstat` computes for `text`."""
    score = textstat.automated_readability_index(text)
    return score

In [None]:
from scipy.stats import entropy

def coefficient_variation(series, series_full):
    """Return the spread of `series`, scaled to [0, 1] by the value range of `series_full`.

    Despite the name, this is not the statistical coefficient of variation
    (std / mean): it is the standard deviation of `series` divided by half the
    range (max - min) of `series_full`.

    The population standard deviation (ddof=0) is used on purpose. For values
    confined to [min, max] it can never exceed half the range, so the result
    stays within [0, 1] whenever the values of `series` lie inside the range of
    `series_full`. pandas' default sample standard deviation (ddof=1) can
    exceed that bound, e.g. about 1.41 for two values sitting at the extremes.

    Returns 0 when `series` has fewer than two entries, or when `series_full`
    has no spread at all (otherwise the division would be 0 / 0).
    """
    if len(series) < 2:
        return 0
    max_std = (series_full.max() - series_full.min()) / 2
    if max_std == 0:
        # A constant reference column allows no diversity.
        return 0
    return series.std(ddof=0) / max_std

def categorical_diversity(series):
    """Return the Shannon entropy (base 2) of the value distribution in `series`.

    The relative frequency of each distinct value is used as its probability.
    """
    class_shares = series.value_counts(normalize=True)
    return entropy(class_shares, base=2)


def categorical_diversity_norm(series, series_full):
    """Return the entropy of `series` divided by the largest entropy `series_full` allows.

    The divisor is log2 of the number of distinct values in `series_full`.
    When `series_full` has at most one distinct value the divisor is 1, which
    avoids dividing by log2(1) == 0.
    """
    n_classes = series_full.nunique()
    max_entropy = np.log2(n_classes) if n_classes > 1 else 1
    return categorical_diversity(series) / max_entropy

def combined_diversity_score(df_selection, df_all, categorical_cols, continuous_cols):
    """Return the equally weighted mean of the per-column diversity scores.

    Categorical columns are scored with `categorical_diversity_norm`, continuous
    columns with `coefficient_variation`; in both cases `df_selection[col]` is
    compared against `df_all[col]`. Every column gets the weight
    1 / (number of categorical columns + number of continuous columns).
    """
    n_factors = len(categorical_cols) + len(continuous_cols)

    def weighted(metric, col):
        # One column's score, scaled by the shared weight.
        return 1 / n_factors * metric(df_selection[col], df_all[col])

    categorical_part = sum(weighted(categorical_diversity_norm, col) for col in categorical_cols)
    continuous_part = sum(weighted(coefficient_variation, col) for col in continuous_cols)

    return categorical_part + continuous_part

Since the full dataset of answers is relatively small (12), we can search through all possible subsets of lengths 1 to 12 and compare their scores in all three categories. Below are the number of combinations to check for a subset of size `n`

In [None]:
# use itertools to get all possible subsets of bug_group rows
import itertools

def get_subsets(df):
    """Yield every non-empty subset of the rows of `df`, each as a DataFrame.

    Subsets come out grouped by size, from 1 up to `len(df)`; within one size
    they follow the order of `itertools.combinations` over the index labels.
    Rows are picked by label with `df.loc`, so each yielded frame keeps the
    original index labels. An empty `df` yields nothing.
    """
    for size in range(1, len(df) + 1):
        # Only the labels are needed to select rows, so combine the index
        # directly rather than building a Series per row with `iterrows()`.
        for labels in itertools.combinations(df.index, size):
            yield df.loc[list(labels)]

# Materialize every subset once; later cells look subsets up in this list by position.
all_subsets = list(get_subsets(bug_group))

In [None]:
# Columns that enter the diversity score of each subset, grouped by how they are measured.
categorical_cols = ['Worker.profession', 'Worker.gender']
continuous_cols = ['Worker.yearsOfExperience', 'Worker.age']

Below is an excerpt of the three scores for each possible subset. The text-content to be scored per subset is defined by the concatenation of the explanations of the subset. Keep in mind, that no permutations are included. The order is random.

In [None]:
from tqdm import tqdm

# Embed the reference text once; it is reused for every subset in the loop below.
ground_truth_embedding = model.encode(ground_truth)

# One (readability, similarity, diversity) tuple per subset, in the order of `all_subsets`.
results = []
for subset in tqdm(all_subsets):
    # Join the explanations of all answers in the subset into one text, separated by blank lines.
    subset_text = subset['Answer.explanation'].str.cat(sep='\n\n')
    readability_score = calculate_flesch_reading_ease(subset_text)
    similarity_score = calculate_cosine_similarity(subset_text, ground_truth, ref_embedding=ground_truth_embedding)
    # Diversity is measured against the whole answer pool (`bug_group`), not the full dataset.
    # NOTE: this overwrites the `diversity_score` variable set in an earlier cell.
    diversity_score = combined_diversity_score(subset, bug_group, categorical_cols, continuous_cols)
    results.append((readability_score, similarity_score, diversity_score))

# Row i of this frame holds the scores of all_subsets[i].
df_results = pd.DataFrame(results, columns=['readability', 'similarity', 'diversity'])
df_results

In [None]:
def normalize_column(series):
    """Min-max scale `series` so its smallest value maps to 0 and its largest to 1.

    When every value is identical there is no range to scale by; the result is
    then 0.0 for every entry instead of the NaN that 0 / 0 would produce.
    """
    lowest = series.min()
    value_range = series.max() - lowest
    if value_range == 0:
        # Constant column: every entry equals the minimum, i.e. 0 after shifting.
        return (series - lowest).astype(float)
    return (series - lowest) / value_range
# Put the three raw scores on a common scale so they can be averaged.
df_results['readability.norm'] = normalize_column(df_results['readability'])
df_results['similarity.norm'] = normalize_column(df_results['similarity'])
df_results['diversity.norm'] = normalize_column(df_results['diversity'])

In [None]:
# Total score: unweighted mean of the three normalized scores.
df_results['score'] = (df_results['readability.norm'] + df_results['similarity.norm'] + df_results['diversity.norm']) / 3

# Sort by total score, best first. The index labels are kept, so each row can
# still be mapped back to its position in `all_subsets`.
df_results = df_results.sort_values('score', ascending=False)


In [None]:
# @hidden_cell
def print_subset(i):
    """Print the normalized scores and the joined explanations of the i-th ranked subset.

    `i` is a row position in `df_results` (0 is the first row), not an index
    label. The index label found at that position is used to look the subset
    up in `all_subsets`.
    """
    score_row = df_results.iloc[i]
    subset = all_subsets[df_results.index[i]]

    labelled_columns = (
        ('Readability:', 'readability.norm'),
        ('Similarity:', 'similarity.norm'),
        ('Diversity:', 'diversity.norm'),
        ('Score:', 'score'),
    )
    for label, column in labelled_columns:
        print(label, score_row[column])

    # The explanations of all answers in the subset, separated by blank lines.
    print(subset['Answer.explanation'].str.cat(sep='\n\n'))

In [None]:
# @hidden_cell
# Show the rows at positions 0 and 1 of the sorted results.
print_subset(0)
print_subset(1)

In [None]:
# @hidden_cell

# Index label of the top-ranked row, which is also its position in `all_subsets`.
best_subset_index = df_results.index[0]
# Only the last expression of a cell is displayed, so this line shows nothing.
best_subset_index

best_subset = all_subsets[best_subset_index]
best_subset

To understand which scores can be maximized while taking loss of other scores into account, we can use pareto fronts.

First, we remove all dominated solutions from the dataframe. A solution is dominated if there is another solution that is at least as good in every score and better in at least one.

In [None]:
def is_pareto_efficient(costs):
    """Return a boolean mask of the Pareto-efficient rows of `costs` (higher is better).

    Parameters
    ----------
    costs : array-like of shape (n_points, n_scores)
        One row per candidate, one column per score to maximize.

    Returns
    -------
    numpy.ndarray of bool, shape (n_points,)
        True for rows that survive. A row is dropped as soon as some surviving
        row is at least as good in every column; of several identical rows
        only the first one is kept.
    """
    costs = np.asarray(costs)
    is_efficient = np.ones(costs.shape[0], dtype=bool)
    for i, c in enumerate(costs):
        if is_efficient[i]:
            # Among the rows still in the running, keep only those that beat
            # `c` in at least one column.
            is_efficient[is_efficient] = np.any(costs[is_efficient] > c, axis=1)
            # `c` cannot beat itself, so the line above dropped it; restore it.
            is_efficient[i] = True
    return is_efficient

# Keep only the subsets that are Pareto-efficient with respect to the three normalized scores.
pareto_mask = is_pareto_efficient(df_results[["readability.norm", "similarity.norm", "diversity.norm"]].values)
pareto_df = df_results[pareto_mask]
pareto_df

We reduced the number of interesting subsets from 4095 to 83.

Ordered by the total score we see that the best score is 0.876. Printed is the best possible concatenated explanation, regarding all three scores:

In [None]:
print(best_subset['Answer.explanation'].str.cat(sep='\n\n'))

For reference, here is the chosen ground truth from Task 2:

>"The IF statement in line 279 checks whether minutesOffset is set to a value between 0 and 59. If not, an IllegalArgumentException is thrown. This is a bug because the minutesOffset may also be negative. The IF statement should check for the minutesOffset to be between -59 and 59."

In [None]:
import sys

import matplotlib

# The native 'MacOSX' backend exists only on macOS. Requesting it elsewhere
# raises, which would stop a Run All on Linux or Windows, so other platforms
# keep matplotlib's default backend.
if sys.platform == 'darwin':
    matplotlib.use('MacOSX')


In [None]:
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt

# Colour each point by the sum of its three normalized scores.
color_values = pareto_df["readability.norm"] + pareto_df["similarity.norm"] + pareto_df["diversity.norm"]

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One point per Pareto-efficient subset, one axis per score.
ax.scatter(
    pareto_df["readability.norm"],
    pareto_df["similarity.norm"],
    pareto_df["diversity.norm"],
    c=color_values,
    cmap='plasma',
    label='Pareto-optimal points',
)

ax.set(xlabel='Readability', ylabel='Similarity', zlabel='Diversity')
ax.legend()
plt.show()

| ![image-4](attachment:image-4.png) | ![image](attachment:image.png) |
|---------------------------|------------------------|
| ![image-2](attachment:image-2.png) | ![image-3](attachment:image-3.png) |


In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Colour each point by the sum of its three normalized scores.
color_values = pareto_df["readability.norm"] + pareto_df["similarity.norm"] + pareto_df["diversity.norm"]

fig = plt.figure(figsize=(8, 6))
ax = fig.add_subplot(1, 1, 1, projection='3d')

# One point per Pareto-efficient subset, one axis per score.
ax.scatter(
    pareto_df["readability.norm"],
    pareto_df["similarity.norm"],
    pareto_df["diversity.norm"],
    c=color_values,
    cmap='plasma',
    label='Pareto-optimal points',
)

ax.set(xlabel='Readability', ylabel='Similarity', zlabel='Diversity')
ax.legend()
plt.show()

In [None]:
%matplotlib inline
# Three side-by-side 2D projections of the Pareto-efficient subsets.
fig, axes = plt.subplots(1, 3, figsize=(18, 5))
scatters = []

# (x column, y column, x label, y label) for each panel, left to right.
projections = [
    ("similarity.norm", "diversity.norm", 'Similarity', 'Diversity'),
    ("similarity.norm", "readability.norm", 'Similarity', 'Readability'),
    ("readability.norm", "diversity.norm", 'Readability', 'Diversity'),
]

for panel, (x_col, y_col, x_label, y_label) in zip(axes, projections):
    sc = panel.scatter(pareto_df[x_col], pareto_df[y_col], c=color_values, cmap='plasma', edgecolors='k')
    panel.set_xlabel(x_label)
    panel.set_ylabel(y_label)
    panel.set_title(f'Projection: {x_label} vs {y_label}')
    scatters.append(sc)

# All panels share the same colour values, so one colour bar (taken from the
# first panel) serves the whole figure.
cbar = fig.colorbar(scatters[0], ax=axes, orientation='vertical', fraction=0.02, pad=0.05)
cbar.set_label('Color Value (Readability + Similarity + Diversity)')


# Render the figure
plt.show()

### 3.2
Max Score (similarity, readability) while compromising diversity

`df_results['score_similarity_readability'] = (df_results['readability.norm'] + df_results['similarity.norm']) / 2`


In [None]:
# Score that ignores diversity: unweighted mean of normalized readability and similarity.
df_results['score_similarity_readability'] = (df_results['readability.norm'] + df_results['similarity.norm']) / 2

# Show the three subsets with the highest readability/similarity score.
df_results.sort_values('score_similarity_readability', ascending=False).head(3)

Indeed, by sacrificing diversity we can achieve a higher score for readability and similarity: 0.891

### 3.3 Highest diversity at maximum similarity (compromising on readability)

In [None]:
# Show the five subsets with the highest normalized similarity, best first.
df_results.sort_values('similarity.norm', ascending=False).head(5)

Sorting by normed similarity and looking at the perfect maximum of 1.000, we get a diversity of 0.258.

Compromising slightly on similarity, picking the third entry at 0.978, we can achieve a diversity score of 0.693.