In [1]:
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
import os
from datetime import datetime, timedelta
from diversity_oracle import DiversityOracle
from utils import save_to_csv

[nltk_data] Downloading package benepar_en3 to
[nltk_data]     /home/borito1907/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


In [2]:
import numpy as np
from scipy.stats import mannwhitneyu

def compute_mann_whitney_u(scores, window1_size, window2_size):
    """
    Computes the Mann-Whitney U test between two adjacent windows of scores.

    The scores are split into two consecutive, non-overlapping windows: the
    first ``window1_size`` values and the remaining ``window2_size`` values.
    The two windows are then compared with a two-sided Mann-Whitney U test.

    Args:
        scores (list or np.array): A list or array of scores.
        window1_size (int): The size of the first (leading) window.
        window2_size (int): The size of the second (trailing) window.

    Returns:
        tuple: A tuple containing the U statistic and the p-value, as
            returned by ``scipy.stats.mannwhitneyu`` with
            ``alternative='two-sided'``.

    Raises:
        ValueError: If either window size is not positive, or if the window
            sizes do not add up to the number of scores.
    """
    # Accept lists and arrays alike without copying when already an array
    scores = np.asarray(scores)

    num_scores = len(scores)

    # A zero or negative size would pass the sum check below for some inputs
    # but yield an empty or wrongly-sliced window.
    if window1_size < 1 or window2_size < 1:
        raise ValueError("Both window sizes must be positive.")

    # Check if the windows are valid
    if window1_size + window2_size != num_scores:
        raise ValueError("The sum of the window sizes should be equal to the number of scores.")

    # The second window starts where the first one ends. Slicing from
    # window2_size instead is only correct when both sizes are equal.
    window1 = scores[:window1_size]
    window2 = scores[window1_size:]

    # Compute the Mann-Whitney U test
    u_stat, p_val = mannwhitneyu(window1, window2, alternative='two-sided')

    return u_stat, p_val

In [None]:
# Read div_df.csv from the selected run folder under the plots directory.
plots_folder = "./plots/plots_1/"
folder = '1_evan_1_4_1/'
csv_path = os.path.join(plots_folder, folder, 'div_df.csv')
div_df = pd.read_csv(csv_path)

# Columns of div_df that get tested below.
important_metrics = [
    'TokenSemantics',
    'PartOfSpeechSequence',
    'mattr',
    'hdd',
    'mtld',
    'normalized_unique_unigrams',
    'normalized_unique_bigrams',
    'normalized_unique_trigrams',
]
# Second name bound to the very same list object.
elements = important_metrics

# Each of the two compared windows holds this many rows, so every test
# runs on the last 2 * window_size rows of the column.
window_size = 25

for metric in important_metrics:
    metric_scores = div_df[metric].tail(2 * window_size).to_numpy()

    u_stat, p_val = compute_mann_whitney_u(
        metric_scores,
        window1_size=window_size,
        window2_size=window_size,
    )
    print(f"Metric: {metric}")
    print(f"U_stat, p_val: {u_stat}, {p_val}")

In [None]:
# Read div_df.csv from the selected run folder under the plots directory.
plots_folder = "./plots/plots_1/"
folder = '1_evan_1_4_1/'
csv_path = os.path.join(plots_folder, folder, 'div_df.csv')
div_df = pd.read_csv(csv_path)

# Single column to test, repeated over several window sizes.
metric = "normalized_unique_unigrams"
window_sizes = range(10, 30, 5)  # 10, 15, 20, 25

for window_size in window_sizes:
    # Last 2 * window_size rows: first half vs. second half.
    metric_scores = div_df[metric].tail(2 * window_size).to_numpy()

    u_stat, p_val = compute_mann_whitney_u(
        metric_scores,
        window1_size=window_size,
        window2_size=window_size,
    )
    print(f"Window Size: {window_size}")
    print(f"Metric: {metric}")
    print(f"U_stat, p_val: {u_stat}, {p_val}")