# Calculate diversity of generations
From paper, page 9:

The diversity is measured using the average overlap of the longest sub-sequence (LCS) among sampled answers:

\begin{equation}
1 - \frac{1}{\binom{M}{2}} \sum_{s \neq s' \in C} \text{ROUGE-L}(s, s') 
\end{equation}

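$\binom{M}{2} = \frac{M(M-1)}{2}$ is the number of possible pairs of $s \neq s'$ (order doesn't matter), where $M$ is the number of sampled answers in $C$; each unordered pair is counted once in the sum.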


In [16]:
from itertools import combinations
from rouge_score import rouge_scorer
# https://thepythoncode.com/article/calculate-rouge-score-in-python#rouge-l
# Module-level scorer used by calculate_diversity: it is configured for the
# "rougeL" metric only, with use_stemmer=True.
scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)

def calculate_diversity(strings):
    """Return one minus the mean pairwise ROUGE-L F-measure of *strings*.

    Every unordered pair of entries is scored once with the module-level
    ``scorer``; the pair scores are averaged and the average is subtracted
    from 1.

    Args:
        strings: A sized collection of text strings to compare.

    Returns:
        ``0`` when fewer than two strings are supplied (no pair exists),
        otherwise ``1 - mean(ROUGE-L F-measure over all pairs)``.
    """
    # Guard clause: without at least one pair there is nothing to average.
    if len(strings) <= 1:
        return 0

    overlap_total = 0
    pair_count = 0
    # enumerate(..., start=1) leaves pair_count equal to the number of pairs
    # visited once the loop finishes.
    for pair_count, (first, second) in enumerate(combinations(strings, 2), start=1):
        pair_scores = scorer.score(first, second)
        overlap_total += pair_scores["rougeL"].fmeasure

    mean_overlap = overlap_total / pair_count
    return 1 - mean_overlap
    
# Example usage: ten answer strings containing exact repeats and near-duplicates.
strings = ["Nikkei 225", "The Nikkei", "Nikkei 225", "Nikkei", "Nikkei", "Nikkei 225", "The Nikkei", 
           "The Nikkei", "Nikkei 225", "The Nikkei"]

# Compute the diversity score over every unordered pair of the strings above.
diversity = calculate_diversity(strings)
# Bare expression: in a notebook cell this displays the computed value.
diversity

0.29629629629629606