In [None]:
import os
import numpy as np
import pandas as pd

In [None]:
# Root directory of the experiment results. It is handed to get_predictions(),
# which walks it recursively looking for prediction ".csv" files.
predictions_path = "../results/ex2_vit_uc_merced/"

In [None]:
def get_predictions_from_csv(predictions_path):
    """Load one run's predictions from a csv file.

    The file must provide the columns "true_value" and "predicted_value";
    both are cast to int. Any other columns are ignored.

    Args:
        predictions_path: Location of the csv file to read.

    Returns:
        A list with one (true_value, predicted_value) tuple per csv row,
        in file order.
    """
    frame = pd.read_csv(predictions_path)

    # Pull out the two columns of interest as plain Python int lists
    wanted_columns = ("true_value", "predicted_value")
    column_values = [frame[name].astype(int).to_list() for name in wanted_columns]

    # Pair the columns up row by row:
    # [(true_value, predicted_value), (true_value, predicted_value), ...]
    return list(zip(*column_values))

In [None]:
def get_predictions(path):
    """Load the predictions of every run found under ``path``.

    ``path`` is walked recursively (symlinks are followed). Every ".csv" file
    is expected to be named ``<environment>_seed_<run_name>.csv``. Results can
    be excluded by putting "ignore" in a directory name or in the file name
    below ``path``.

    Args:
        path: Root directory of the results. A trailing slash is optional.

    Returns:
        A nested dict ``{environment: {run_name: run_predictions}}`` where
        ``run_predictions`` is whatever get_predictions_from_csv() returns for
        that file. Files are processed in sorted path order.

    Raises:
        ValueError: If a csv file name does not contain "_seed_".
    """
    #
    # Get the list of csv files in the path
    #
    csv_files = []

    for root, _dirs, files in os.walk(path, followlinks=True):
        for file in files:
            if not file.endswith(".csv"):
                continue

            full_file_name = os.path.join(root, file)

            # Allow results to be ignored by including "ignore" in the directory
            # or file name. Only the part below `path` is inspected so that the
            # location of the results directory itself cannot hide every file.
            if "ignore" in os.path.relpath(full_file_name, path):
                continue

            csv_files.append(full_file_name)

    predictions = {}

    # Read the csv files and group the results by environment and run
    for full_file_name in sorted(csv_files):
        # Use only the file name, without directories and without the ".csv"
        # extension, so nesting depth and dots in the name do not matter
        short_file_name = os.path.splitext(os.path.basename(full_file_name))[0]

        # If the run was deterministic or fixed seed, the run name will be the seed value
        name_parts = short_file_name.split("_seed_")
        if len(name_parts) < 2:
            raise ValueError(
                "Expected a file named '<environment>_seed_<run_name>.csv', got: %s"
                % full_file_name
            )
        environment, run_name = name_parts[0], name_parts[1]

        predictions.setdefault(environment, {})[run_name] = get_predictions_from_csv(
            full_file_name
        )

    return predictions


# Load every run once: predictions[environment][run_name] is a list of
# (true_value, predicted_value) tuples
predictions = get_predictions(predictions_path)

In [None]:
def zhuang_calculate_churn(x_predictions, y_predictions):
    """Compare the predicted values of two runs position by position.

    Note that, despite the name, the value returned is the fraction of
    positions on which both runs predict the SAME value; the churn between the
    two runs is 1 minus this value.

    Args:
        x_predictions: List of (true_value, predicted_value) tuples of run x.
        y_predictions: List of (true_value, predicted_value) tuples of run y.

    Returns:
        Number of positions with equal predicted values divided by the number
        of predictions in ``x_predictions``.
    """

    def predicted_values(run_predictions):
        # Transpose the list of pairs: row 0 holds the true values,
        # row 1 holds the predicted values
        transposed = list(zip(*run_predictions))
        return np.array(transposed[1])

    x_predicted = predicted_values(x_predictions)
    y_predicted = predicted_values(y_predictions)

    matching = np.sum(x_predicted == y_predicted)

    return matching / len(x_predicted)

In [None]:
# Maximum number of runs (seeds) per environment used for the churn calculation
num_seeds = 100

# Maps each environment to its mean churn over all pairs of runs
zhuang_churn_results = {}

for environment, runs in predictions.items():

    # Get all of the seeds from the environment as a list then only select the first num_seeds
    seeds = list(runs)[:num_seeds]

    # Compare every unordered pair of distinct seeds exactly once.
    # zhuang_calculate_churn returns the fraction of matching predictions.
    agreement = [
        zhuang_calculate_churn(runs[x_seed], runs[y_seed])
        for i, x_seed in enumerate(seeds)
        for y_seed in seeds[i + 1:]
    ]

    if agreement:
        # Churn is the complement of the mean agreement
        zhuang_churn_results[environment] = 1 - sum(agreement) / len(agreement)
    else:
        # Fewer than two runs: there is no pair to compare, so the churn is undefined
        zhuang_churn_results[environment] = np.nan

In [None]:
# Convert the zhuang_churn_results dictionary to a DataFrame with one row per
# environment and the columns "Environment" and "Churn Rate"
df_zhuang_churn = (
    pd.DataFrame.from_dict(zhuang_churn_results, orient="index", columns=["Churn Rate"])
    .rename_axis("Environment")
    .reset_index()
)

# Single series, so no legend is needed
ax = df_zhuang_churn.plot(x="Environment", kind="bar", rot=0, figsize=(10, 5), legend=False)
ax.set_title("Zhuang Label Churn - %s Runs" % num_seeds)
# The bars are environments and their height is the churn rate
ax.set_xlabel("Environment")
ax.set_ylabel("Churn Rate")
ax.xaxis.set_tick_params(rotation=45)

In [None]:
# Show the churn table: one row per environment
df_zhuang_churn

In [None]:
def calculate_zhuang_churn_by_label(x_predictions, y_predictions):
    """Compute the churn between two runs separately for every true label.

    Predictions are grouped by the true value stored in ``x_predictions``;
    the prediction of run y at the same position is placed in the same group.

    Args:
        x_predictions: List of (true_value, predicted_value) tuples of run x.
        y_predictions: List of (true_value, predicted_value) tuples of run y,
            indexed in parallel with ``x_predictions``.

    Returns:
        A dict mapping ``str(true_value)`` to the fraction of that label's
        predictions on which the two runs disagree.
    """
    # label -> (predicted values of run x, predicted values of run y)
    grouped = {}

    for position, x_item in enumerate(x_predictions):
        label = str(x_item[0])

        # Create the pair of buckets the first time a label is seen
        x_bucket, y_bucket = grouped.setdefault(label, ([], []))

        x_bucket.append(x_item[1])
        y_bucket.append(y_predictions[position][1])

    churn_by_label = {}

    for label, (x_bucket, y_bucket) in grouped.items():
        x = np.array(x_bucket)
        y = np.array(y_bucket)

        # Churn is the complement of the share of identical predictions
        agreement = np.sum(x == y) / len(x)
        churn_by_label[label] = 1 - agreement

    return churn_by_label

In [None]:
# Collect every distinct true label that occurs in any run of any environment.
# The labels are sorted so that the row order of per-label tables is
# deterministic and, for labels numbered 0..N-1, labels[i] == i regardless of
# the order in which the samples appear in the csv files.
labels = sorted(
    {
        individual_prediction[0]
        for run_list in predictions.values()
        for prediction_list in run_list.values()
        for individual_prediction in prediction_list
    }
)

In [None]:
# Number of seeds used for the churn calculation
num_seeds = 100

# One row per label; a churn column is added below for every environment
df_zhuang_churn_by_label = pd.DataFrame({"labels": labels})

# Loop through the environments and calculate the churn
for environment, runs in predictions.items():

    # Get all of the seeds from the environment as a list then only select the first num_seeds
    seeds = list(runs)[:num_seeds]

    # Per-label churn for every unordered pair of distinct seeds (each pair once)
    pairwise_churn = [
        calculate_zhuang_churn_by_label(runs[x_seed], runs[y_seed])
        for i, x_seed in enumerate(seeds)
        for y_seed in seeds[i + 1:]
    ]

    if pairwise_churn:
        # Average over all pairs. The churn is looked up by the label itself
        # (not by its position in the list) so that every value lands in the
        # row of the label it belongs to.
        mean_churn = [
            sum(pair[str(label)] for pair in pairwise_churn) / len(pairwise_churn)
            for label in labels
        ]
    else:
        # Fewer than two runs: there is no pair to compare, so the churn is undefined
        mean_churn = [np.nan] * len(labels)

    df_zhuang_churn_by_label[environment] = mean_churn


In [None]:
# Grouped bar chart: one group per label, one bar per environment
ax = df_zhuang_churn_by_label.plot(x="labels", kind="bar", rot=0, figsize=(10, 5))
# Re-create the legend in a fixed corner (works on every matplotlib version)
ax.legend(loc="upper right")
ax.set_title("Zhuang Label Churn - %s Runs" % num_seeds)
ax.set_xlabel("Label Index")
# The bar height is the mean churn rate of the label
ax.set_ylabel("Churn Rate");

In [None]:
# Show the per-label churn table: one row per label, one column per environment
df_zhuang_churn_by_label

In [None]:
# Summarise each environment column: mean and standard deviation of the
# per-label churn, rounded to four decimals. The "labels" column is not data.
environment_columns = [
    column for column in df_zhuang_churn_by_label.keys() if column != "labels"
]

for environment in environment_columns:
    churn_column = df_zhuang_churn_by_label[environment]
    mean = round(churn_column.mean(), 4)
    std = round(churn_column.std(), 4)
    print(f"Environment: {environment} - Mean: {mean} - Std: {std}")