## Imports and Helper Functions

In [1]:
import re
import json
import pandas as pd
import matplotlib.pyplot as plt
import os
from datetime import datetime, timedelta
from diversity_oracle import DiversityOracle
from utils import save_to_csv, find_csv, count_words

[nltk_data] Downloading package benepar_en3 to
[nltk_data]     /home/borito1907/nltk_data...
[nltk_data]   Package benepar_en3 is already up-to-date!


In [2]:
# Module-level oracle instances shared by get_diversity_df, which calls
# normalized_div_oracle when its `normalized` flag is true and div_oracle
# otherwise. The two differ only in the `normalized` constructor argument.
div_oracle = DiversityOracle(
    metrics={},
    verbose=False,
    normalized=False,
)
normalized_div_oracle = DiversityOracle(
    metrics={},
    verbose=False,
    normalized=True,
)

def get_success_dfs(csv_files):
    """Load each CSV and keep only the rows where quality was preserved.

    Args:
        csv_files: Iterable of paths readable by ``pd.read_csv``. Every file
            must contain a ``quality_preserved`` column.

    Returns:
        list: One filtered DataFrame per input file, in input order. Rows keep
        their original index labels (the index is not reset).
    """
    frames = (pd.read_csv(path) for path in csv_files)
    # The explicit comparison is element-wise over the column and is kept
    # as-is so the selected rows match exactly.
    return [frame[frame['quality_preserved'] == True] for frame in frames]

def create_corpuses(dfs, normalized):
    """Build one text corpus per step from the ``mutated_text`` columns.

    The number of steps is the length of the shortest DataFrame, so every
    step can draw from every DataFrame.

    Args:
        dfs: List of DataFrames, each with a ``mutated_text`` column.
        normalized: If true, the corpus for step ``i`` holds only the
            ``i``-th text (by position) of each DataFrame. Otherwise it is
            cumulative: the first ``i + 1`` texts of each DataFrame,
            grouped DataFrame by DataFrame.

    Returns:
        list[list]: One corpus per step; empty when ``dfs`` is empty or the
        shortest DataFrame has no rows.
    """
    # default=0 turns "no DataFrames" into "zero steps" instead of letting
    # min() raise ValueError on an empty sequence.
    min_length = min((len(df) for df in dfs), default=0)

    # Positional (.iloc) access on the column: the DataFrames may carry a
    # non-contiguous index, and selecting the column first avoids building a
    # full row Series for every lookup.
    if normalized:
        return [
            [df['mutated_text'].iloc[i] for df in dfs]
            for i in range(min_length)
        ]

    return [
        [text for df in dfs for text in df['mutated_text'].iloc[:i + 1]]
        for i in range(min_length)
    ]

def get_diversity_df(csv_files, normalized):
    """Score every corpus derived from ``csv_files`` and tabulate the metrics.

    The CSVs are loaded with ``get_success_dfs`` and turned into per-step
    corpuses with ``create_corpuses``. Each corpus is passed to
    ``normalized_div_oracle`` when ``normalized`` is true, otherwise to
    ``div_oracle``.

    Args:
        csv_files: Paths of the CSV files to analyse.
        normalized: Selects both the corpus construction mode and the oracle.

    Returns:
        pd.DataFrame: One row per corpus and one column per metric name,
        holding that metric's diversity score.
    """
    corpuses = create_corpuses(get_success_dfs(csv_files), normalized)
    oracle = normalized_div_oracle if normalized else div_oracle

    # Each oracle result is an iterable of dicts carrying 'metric_name' and
    # 'diversity_score'; flatten it into a single {name: score} row.
    rows = [
        {entry['metric_name']: entry['diversity_score'] for entry in oracle(corpus)}
        for corpus in corpuses
    ]
    return pd.DataFrame(rows)

def plot_metric(df, column_name):
    """Display a line plot of one DataFrame column against the index.

    Args:
        df: DataFrame holding the values; its index is used as the x-axis.
        column_name: Column to plot; also used in the title and y-axis label.
    """
    _, ax = plt.subplots(figsize=(10, 6))

    # Blue solid line with a circle marker at every step.
    ax.plot(df.index, df[column_name], marker='o', linestyle='-', color='b')

    ax.set_title(f"Evolution of {column_name}")
    ax.set_xlabel('Step Number')
    ax.set_ylabel(column_name)
    ax.grid(True)

    plt.show()

def save_plots(df, folder):
    """Save one line plot per DataFrame column as ``<folder>/<column>.png``.

    Every column is plotted against the DataFrame index and written as a
    300-dpi PNG. ``folder`` is not created here.

    Args:
        df: DataFrame whose columns are each plotted in their own figure.
        folder: Directory that receives the PNG files.
    """
    for column_name in df.columns:
        fig = plt.figure(figsize=(10, 6))
        try:
            plt.plot(df.index, df[column_name], marker='o', linestyle='-', color='b')
            plt.title(f'Evolution of {column_name} Values Over Rows')
            plt.xlabel('Step Number')
            plt.ylabel(column_name)
            plt.grid(True)

            # Save the figure as a PNG file
            filename = f'{folder}/{column_name}.png'
            plt.savefig(filename, dpi=300)
        finally:
            # A fresh figure is opened for every column, so it has to be
            # closed, not just cleared: plt.clf() leaves the figure registered
            # with pyplot and open figures pile up across columns and files.
            plt.close(fig)

## Batch Analysis

In [None]:

# Normalized Values
# For every file in the third-round directory: look up its results CSV,
# compute the diversity metrics with normalized=True, then save the metric
# table and one plot per metric in a folder named after the input file.
txt_file_directory = "./third_round/"
plots_folder = "./plots/plots_3/"

for txt_filename in os.listdir(txt_file_directory):
    print(f"Filename: {txt_filename}")
    # Build the path from the current filename. Without this assignment the
    # loop either raises NameError or reuses a stale txt_filepath left behind
    # by another cell, processing the same file on every iteration.
    txt_filepath = os.path.join(txt_file_directory, txt_filename)
    csv_filename = find_csv(txt_filepath)
    directory = "./eval/results/"
    csv_filepath = os.path.join(directory, csv_filename)

    csv_files = [csv_filepath]

    div_df = get_diversity_df(csv_files, True)

    # Strip the extension with splitext instead of a fixed 4-character slice,
    # so names that do not end in ".txt" are not mangled.
    directory_path = os.path.join(plots_folder, os.path.splitext(txt_filename)[0])
    # exist_ok=True already covers the "directory exists" case.
    os.makedirs(directory_path, exist_ok=True)
    csv_path = os.path.join(directory_path, 'normalized_div_df.csv')

    save_to_csv(div_df, csv_path)
    save_plots(div_df, directory_path)

In [None]:
# Non-normalized Values
# For every file in the third-round directory: look up its results CSV,
# compute the diversity metrics with normalized=False, then save the metric
# table and one plot per metric in a folder named after the input file.
txt_file_directory = "./third_round/"
plots_folder = "./plots/plots_3/"

for txt_filename in os.listdir(txt_file_directory):
    print(f"Filename: {txt_filename}")
    txt_filepath = os.path.join(txt_file_directory, txt_filename)
    csv_filename = find_csv(txt_filepath)
    directory = "./eval/results/"
    csv_filepath = os.path.join(directory, csv_filename)

    csv_files = [csv_filepath]

    div_df = get_diversity_df(csv_files, False)

    # Strip the extension with splitext instead of a fixed 4-character slice,
    # so names that do not end in ".txt" are not mangled.
    directory_path = os.path.join(plots_folder, os.path.splitext(txt_filename)[0])
    # exist_ok=True already covers the "directory exists" case.
    os.makedirs(directory_path, exist_ok=True)
    csv_path = os.path.join(directory_path, 'div_df.csv')

    save_to_csv(div_df, csv_path)
    save_plots(div_df, directory_path)

## Mann-Whitney U-Test

In [None]:
import numpy as np
from scipy.stats import mannwhitneyu

def compute_mann_whitney_u(scores, window1_size, window2_size):
    """
    Computes the two-sided Mann-Whitney U test between two adjacent windows of scores.

    The scores are split into a leading window of ``window1_size`` values and a
    trailing window made of the remaining ``window2_size`` values; the two
    windows together cover every score exactly once.

    Args:
        scores (list or np.array): A list or array of scores.
        window1_size (int): The size of the first (leading) window.
        window2_size (int): The size of the second (trailing) window.

    Returns:
        tuple: A tuple containing the U statistic and the p-value.

    Raises:
        ValueError: If the window sizes do not add up to the number of scores.
    """
    # Ensure that the input is a numpy array
    scores = np.array(scores)

    # Compute the number of scores
    num_scores = len(scores)

    # Check if the windows are valid
    if window1_size + window2_size != num_scores:
        raise ValueError("The sum of the window sizes should be equal to the number of scores.")

    # Define the two windows. The second window starts where the first one
    # ends; slicing from window2_size instead would make the windows overlap
    # or skip scores whenever the two sizes differ.
    window1 = scores[:window1_size]
    window2 = scores[window1_size:]

    # Compute the Mann-Whitney U test
    u_stat, p_val = mannwhitneyu(window1, window2, alternative='two-sided')

    return u_stat, p_val