## Test for KL Leibler Divergence

In [1]:
# %env

In [2]:
%pwd

'/Users/liamroy/Documents/Studies/Monash_31194990/PHD/Studies/Study_03/LLM_motion/llm_audio_testcase/stats'

In [3]:
import os

import pandas as pd
pd.set_option('future.no_silent_downcasting', True)

import numpy as np

from openpyxl import Workbook
from openpyxl import load_workbook

In [19]:
def normalize(data):
    return data / np.sum(data)

def kl_divergence(P, Q):
    return np.sum(P * np.log(P / Q))

def jensen_shannon_distance(P, Q):
    P = normalize(P)
    Q = normalize(Q)
    
    M = 0.5 * (P + Q)
    
    kl_p_m = kl_divergence(P, M)
    kl_q_m = kl_divergence(Q, M)
    
    jsd = 0.5 * kl_p_m + 0.5 * kl_q_m
    
    return np.sqrt(jsd)

In [20]:
conditions = ["llm", "human"]
states = ["stuck", "accomplished", "progressing"]

for state in states:
    print(f"\n\n@@@@@@@@@@@@@@@ Kullback–Leibler Divergence For State: {state} @@@@@@@@@@@@@@@\n")

    # FIRST DO LLM COND:
    # Load the data from the Excel file, specifying the sheet name
    llm_file_path = './../llm_audio_rawdata.xlsx'  # Replace with your file path
    llm_data = pd.read_excel(llm_file_path,sheet_name='llm_' + state + '_00', usecols="D:F", nrows=80)

    mapping = {'A': 0, 'B': 1, 'C': 2}
    llm_data = llm_data.replace(mapping).infer_objects()

    # Convert data to numeric values, coercing errors (e.g., empty cells or non-numeric text will become NaN)
    llm_data = llm_data.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

    # Ensure the data values are between 0 and 2 as required
    assert llm_data.values.max() <= 2 and llm_data.values.min() >= 0, "Values out of range (0, 1, 2)"

    # Initialize a 3x3x3 numpy array to hold the histogram counts
    llm_data_histogram = np.zeros((3, 3, 3), dtype=int)

    # Iterate through each row in the DataFrame and bin the data
    for row in llm_data.itertuples(index=False):
        x, y, z = row  # Unpack the three-vector (each value will be 0, 1, or 2)
        llm_data_histogram[x, y, z] += 1

    # print(f"LLM DATA |3x3x3 Histogram Matrix for {state}:")
    # print(llm_data_histogram)


    # THEN DO HUMAN COND:
    # Load the data from the Excel file, specifying the sheet name
    human_file_path = './../llm_audio_rawdata.xlsx'  # Replace with your file path
    human_data = pd.read_excel(human_file_path,sheet_name='human_' + state + '_00', usecols="D:F", nrows=24)

    mapping = {'A': 0, 'B': 1, 'C': 2}
    human_data = human_data.replace(mapping).infer_objects()

    # Convert data to numeric values, coercing errors (e.g., empty cells or non-numeric text will become NaN)
    human_data = human_data.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

    # Ensure the data values are between 0 and 2 as required
    assert human_data.values.max() <= 2 and human_data.values.min() >= 0, "Values out of range (0, 1, 2)"

    # Initialize a 3x3x3 numpy array to hold the histogram counts
    human_data_histogram = np.zeros((3, 3, 3), dtype=int)

    # Iterate through each row in the DataFrame and bin the data
    for row in human_data.itertuples(index=False):
        x, y, z = row  # Unpack the three-vector (each value will be 0, 1, or 2)
        human_data_histogram[x, y, z] += 1

    # print(f"HUMAN DATA | 3x3x3 Histogram Matrix for {state}:")
    # print(human_data_histogram)


    # Then, flatten each 3x3x3 matrix into a 1D vector of length 27
    llm_vector = llm_data_histogram.flatten()
    human_vector = human_data_histogram.flatten()

    # Then, convert to probability distributions. Since KL Divergence operates on 
    # probability distributions, normalize each vector so that the sum of its elements is 1
    llm_prob = normalize(llm_vector) 
    human_prob = normalize(human_vector)

    # Next, add smoothing. Since some bins in B_prob are zero, add a small 
    # smoothing value (1e−9) to avoid division by zero in the KL Divergence calculation
    llm_prob_smooth = np.where(llm_prob == 0, 1e-9, llm_prob)
    human_prob_smooth = np.where(human_prob == 0, 1e-9, human_prob)  

    # Finally, compute KL Divergence using the formula:KL_divergence = np.sum(seriesA * np.log(seriesA / seriesB))
    # This quantifies how how much Series B diverges from Series A, capturing the "distance" between the two distributions. 
    # Lower values indicate greater similarity, meaning Series A is more contained within Series B.

    KL_divergence = kl_divergence(llm_prob_smooth, human_prob_smooth)
 
    print(f"\n >>> KL Divergence for the state {state} is: {KL_divergence:.3f}\n")





@@@@@@@@@@@@@@@ Kullback–Leibler Divergence For State: stuck @@@@@@@@@@@@@@@


 >>> KL Divergence for the state stuck is: 0.661



@@@@@@@@@@@@@@@ Kullback–Leibler Divergence For State: accomplished @@@@@@@@@@@@@@@


 >>> KL Divergence for the state accomplished is: 1.284



@@@@@@@@@@@@@@@ Kullback–Leibler Divergence For State: progressing @@@@@@@@@@@@@@@


 >>> KL Divergence for the state progressing is: 20.168



## Now do Jensen-Shannon Distance (JSD)

In [37]:
for state in states:

    # FIRST DO LLM COND:
    # Load the data from the Excel file, specifying the sheet name
    llm_file_path = './../llm_audio_rawdata.xlsx'  # Replace with your file path
    llm_data = pd.read_excel(llm_file_path,sheet_name='llm_' + state + '_00', usecols="D:F", nrows=80)

    mapping = {'A': 0, 'B': 1, 'C': 2}
    llm_data = llm_data.replace(mapping).infer_objects()

    # Convert data to numeric values, coercing errors (e.g., empty cells or non-numeric text will become NaN)
    llm_data = llm_data.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

    # Ensure the data values are between 0 and 2 as required
    assert llm_data.values.max() <= 2 and llm_data.values.min() >= 0, "Values out of range (0, 1, 2)"

    # Initialize a 3x3x3 numpy array to hold the histogram counts
    llm_data_histogram = np.zeros((3, 3, 3), dtype=int)

    # Iterate through each row in the DataFrame and bin the data
    for row in llm_data.itertuples(index=False):
        x, y, z = row  # Unpack the three-vector (each value will be 0, 1, or 2)
        llm_data_histogram[x, y, z] += 1

    # THEN DO HUMAN COND:
    # Load the data from the Excel file, specifying the sheet name
    human_file_path = './../llm_audio_rawdata.xlsx'  # Replace with your file path
    human_data = pd.read_excel(human_file_path,sheet_name='human_' + state + '_00', usecols="D:F", nrows=24)

    mapping = {'A': 0, 'B': 1, 'C': 2}
    human_data = human_data.replace(mapping).infer_objects()

    # Convert data to numeric values, coercing errors (e.g., empty cells or non-numeric text will become NaN)
    human_data = human_data.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

    # Ensure the data values are between 0 and 2 as required
    assert human_data.values.max() <= 2 and human_data.values.min() >= 0, "Values out of range (0, 1, 2)"

    # Initialize a 3x3x3 numpy array to hold the histogram counts
    human_data_histogram = np.zeros((3, 3, 3), dtype=int)

    # Iterate through each row in the DataFrame and bin the data
    for row in human_data.itertuples(index=False):
        x, y, z = row  # Unpack the three-vector (each value will be 0, 1, or 2)
        human_data_histogram[x, y, z] += 1


    # PRINT SANITY CHECK
    # print("llm_data_histogram:\n", llm_data_histogram)
    # print("human_data_histogram:\n", human_data_histogram)


    # Then, flatten each 3x3x3 matrix into a 1D vector of length 27
    llm_vector = llm_data_histogram.flatten()
    human_vector = human_data_histogram.flatten()

    # Then, convert to probability distributions. Since KL Divergence operates on 
    # probability distributions, normalize each vector so that the sum of its elements is 1
    llm_prob = normalize(llm_vector) 
    human_prob = normalize(human_vector)

    # Next, add smoothing. Since some bins in B_prob are zero, add a small 
    # smoothing value (1e−9) to avoid division by zero in the KL Divergence calculation
    llm_prob_smooth = np.where(llm_prob == 0, 1e-9, llm_prob)
    human_prob_smooth = np.where(human_prob == 0, 1e-9, human_prob)  
    
    print(f"\n\n@@@ Jensen-Shannon Distance Betweem LLM-Generated Audio State '{state}' and Human After RL Learning @@@\n")

    JS_distance = jensen_shannon_distance(llm_prob_smooth, human_prob_smooth)
    print(f">>> Jensen-Shannon Distance for the state {state} is: {JS_distance:.3f}\n")




@@@ Jensen-Shannon Distance Betweem LLM-Generated Audio State 'stuck' and Human After RL Learning @@@

>>> Jensen-Shannon Distance for the state stuck is: 0.444



@@@ Jensen-Shannon Distance Betweem LLM-Generated Audio State 'accomplished' and Human After RL Learning @@@

>>> Jensen-Shannon Distance for the state accomplished is: 0.553



@@@ Jensen-Shannon Distance Betweem LLM-Generated Audio State 'progressing' and Human After RL Learning @@@

>>> Jensen-Shannon Distance for the state progressing is: 0.819



## Finally, Jensen-Shannon Distance for LLM-Audio and Uniform Distribution


In [40]:
for state in states:

    # FIRST DO LLM COND:
    # Load the data from the Excel file, specifying the sheet name
    llm_file_path = './../llm_audio_rawdata.xlsx'  # Replace with your file path
    llm_data = pd.read_excel(llm_file_path,sheet_name='llm_' + state + '_00', usecols="D:F", nrows=80)

    mapping = {'A': 0, 'B': 1, 'C': 2}
    llm_data = llm_data.replace(mapping).infer_objects()

    # Convert data to numeric values, coercing errors (e.g., empty cells or non-numeric text will become NaN)
    llm_data = llm_data.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

    # Ensure the data values are between 0 and 2 as required
    assert llm_data.values.max() <= 2 and llm_data.values.min() >= 0, "Values out of range (0, 1, 2)"

    # Initialize a 3x3x3 numpy array to hold the histogram counts
    llm_data_histogram = np.zeros((3, 3, 3), dtype=int)

    # Iterate through each row in the DataFrame and bin the data
    for row in llm_data.itertuples(index=False):
        x, y, z = row  # Unpack the three-vector (each value will be 0, 1, or 2)
        llm_data_histogram[x, y, z] += 1

    # THEN CREATE A UNIFORM DISTRIBUTION:
    # Load the data from the Excel file, specifying the sheet name
    uniform_histogram = np.ones((3, 3, 3), dtype=int)
        

    # PRINT SANITY CHECK
    # print("llm_data_histogram:\n", llm_data_histogram)
    # print("uniform_histogram:\n", uniform_histogram)


    # Then, flatten each 3x3x3 matrix into a 1D vector of length 27
    llm_vector = llm_data_histogram.flatten()
    uniform_vector = uniform_histogram.flatten()

    # Then, convert to probability distributions. Since KL Divergence operates on 
    # probability distributions, normalize each vector so that the sum of its elements is 1
    llm_prob = normalize(llm_vector) 
    uniform_prob = normalize(uniform_vector)

    # Next, add smoothing. Since some bins in B_prob are zero, add a small 
    # smoothing value (1e−9) to avoid division by zero in the KL Divergence calculation
    llm_prob_smooth = np.where(llm_prob == 0, 1e-9, llm_prob)
    uniform_prob_smooth = np.where(uniform_prob == 0, 1e-9, uniform_prob)  

    print(f"\n\n@@@ Jensen-Shannon Distance Betweem LLM-Generated Audio State '{state}' and Uniform Distribution @@@\n")

    JS_distance = jensen_shannon_distance(llm_prob_smooth, human_prob_smooth)
    print(f">>> Jensen-Shannon Distance for the state {state} is: {JS_distance:.3f}\n")




@@@ Jensen-Shannon Distance Betweem LLM-Generated Audio State 'stuck' and Uniform Distribution @@@

>>> Jensen-Shannon Distance for the state stuck is: 0.833



@@@ Jensen-Shannon Distance Betweem LLM-Generated Audio State 'accomplished' and Uniform Distribution @@@

>>> Jensen-Shannon Distance for the state accomplished is: 0.833



@@@ Jensen-Shannon Distance Betweem LLM-Generated Audio State 'progressing' and Uniform Distribution @@@

>>> Jensen-Shannon Distance for the state progressing is: 0.819

