In [5]:
import pandas as pd
from scipy.stats import ks_2samp

# Load two CSV files with sample distributions
def load_csv_data(file_path, column=4):
    """
    Load one column of a CSV file and return its non-missing values.

    Args:
        file_path (str): Path to the CSV file.
        column (int | str): Column to read. An integer is a zero-based
            position in the header row; a string is a column name.
            Defaults to 4, i.e. the fifth column.

    Returns:
        list: Values of the selected column with missing (NaN) entries
            dropped. An empty list is returned, and the error printed, if
            the file cannot be read or the column does not exist.
    """
    try:
        data = pd.read_csv(file_path)
        # A positional index is resolved to its header label; a name is
        # used as given.
        column_name = data.columns[column] if isinstance(column, int) else column
        return data[column_name].dropna().tolist()
    except Exception as e:
        # Deliberately best-effort: the caller treats an empty list as
        # "loading failed", so any read/lookup error is reported, not raised.
        print(f"Error reading the file {file_path}: {e}")
        return []

# Perform Kolmogorov-Smirnov test
def perform_ks_test(data1, data2):
    """
    Run SciPy's two-sample Kolmogorov-Smirnov test on two samples.

    Args:
        data1 (list): First distribution sample.
        data2 (list): Second distribution sample.

    Returns:
        dict: The test statistic under "ks_statistic" and the p-value
            under "p_value".
    """
    outcome = ks_2samp(data1, data2)
    # Read the named fields of the result rather than unpacking it.
    return dict(ks_statistic=outcome.statistic, p_value=outcome.pvalue)

# Read the human and LLM samples from their aggregated CSV files.
data1 = load_csv_data("Aggregated_Human.csv")
data2 = load_csv_data("Aggregated_LLM.csv")

if data1 and data2:
    # Both samples are non-empty, so the two-sample KS test can be run.
    results = perform_ks_test(data1, data2)

    # Report the statistic and the p-value.
    print("Kolmogorov-Smirnov Test Results:")
    print(f"KS Statistic: {results['ks_statistic']}")
    print(f"P-Value: {results['p_value']}")

    # Verdict against the 0.001 significance threshold.
    print(
        "The distributions are significantly different (p < 0.001)."
        if results['p_value'] < 0.001
        else "No significant difference between the distributions (p >= 0.001)."
    )
else:
    # load_csv_data returned an empty list for at least one file.
    print("Error: Failed to load data from one or both files.")


Kolmogorov-Smirnov Test Results:
KS Statistic: 0.16037761117673371
P-Value: 0.00475421678084968
No significant difference between the distributions (p >= 0.001).
