In [198]:
import ast
import math

import pandas as pd
from sklearn.utils import resample

def unbiased_oversampling(df_test_subtask):
    """Balance the two ``correct`` classes by replicating whole classes.

    Every row of class 0 is repeated ``k0`` times and every row of class 1 is
    repeated ``k1`` times, where ``k0`` and ``k1`` are the smallest integers
    that make both classes reach the same size (the least common multiple of
    the two class counts). No row is sampled at random, so each original row
    of a class is represented equally often in the result.

    Parameters
    ----------
    df_test_subtask : pandas.DataFrame
        Frame with a ``correct`` column. Only rows where ``correct`` equals 0
        or 1 are used; rows with any other value are not carried over.

    Returns
    -------
    pandas.DataFrame
        The replicated class-0 rows followed by the replicated class-1 rows.
        Each class part is re-indexed from 0, and the two parts are then
        concatenated without re-indexing, so index labels repeat across the
        two classes.

    Raises
    ------
    ValueError
        If there are no rows with ``correct == 0`` or no rows with
        ``correct == 1``; the classes cannot be balanced by replication then.
    """
    # Build each class mask once and reuse it for counting and selection.
    is_class_0 = df_test_subtask.correct == 0
    is_class_1 = df_test_subtask.correct == 1
    count_class_0 = int(is_class_0.sum())
    count_class_1 = int(is_class_1.sum())

    if count_class_0 == 0 or count_class_1 == 0:
        raise ValueError(
            "unbiased_oversampling needs at least one row of each class "
            f"(got {count_class_0} with correct == 0 and "
            f"{count_class_1} with correct == 1)"
        )

    # lcm / count_0 == count_1 / gcd (and symmetrically for class 1), so the
    # replication factors are exact integers without any float division.
    common_divisor = math.gcd(count_class_0, count_class_1)
    replication_factor_0 = count_class_1 // common_divisor
    replication_factor_1 = count_class_0 // common_divisor

    # Replicate each instance in both classes
    df_class_0_replicated = pd.concat(
        [df_test_subtask[is_class_0]] * replication_factor_0, ignore_index=True
    )
    df_class_1_replicated = pd.concat(
        [df_test_subtask[is_class_1]] * replication_factor_1, ignore_index=True
    )

    # Combine the replicated classes (index labels are intentionally left as
    # produced above so the returned frame keeps its established layout).
    return pd.concat([df_class_0_replicated, df_class_1_replicated])


def highest_prob_option(s):
    """Return the key with the largest value in the dict literal ``s``.

    ``s`` is parsed with ``ast.literal_eval``. Candidates are compared as
    ``(value, key)`` pairs, so when several keys share the largest value the
    largest key among them is returned.
    """
    scores = ast.literal_eval(s)
    _, best_option = max((value, option) for option, value in scores.items())
    return best_option

def absolute_prob(s):
    """Return the value attached to the winning key of the dict literal ``s``.

    ``s`` is parsed with ``ast.literal_eval``; the winner is the maximum
    ``(value, key)`` pair, and the value of that pair is returned.
    """
    option_probs = ast.literal_eval(s)
    pairs = zip(option_probs.values(), option_probs.keys())
    # The value inside the winning pair is exactly what the dict stores for
    # the winning key, so no second lookup is required.
    top_value, _ = max(pairs)
    return top_value


def normalised_prob(s):
    """Return the winning value divided by the sum of all values in ``s``.

    ``s`` is a dict literal parsed with ``ast.literal_eval``; the winning
    value is the one belonging to the maximum ``(value, key)`` pair.
    """
    option_probs = ast.literal_eval(s)
    top_value, _ = max(zip(option_probs.values(), option_probs.keys()))
    total = sum(option_probs.values())
    return top_value / total


# Load the raw results, drop the "Unnamed: 0" column (presumably a saved
# index -- confirm against the CSV), derive the per-row columns, and write
# the enriched table to disk.
df = (
    pd.read_csv("./SS_data/SS_data_v0.csv")
    .drop(columns="Unnamed: 0")
    .assign(
        # Option chosen by the model, its raw score and its normalised score,
        # all derived from the dict literal stored in "confidence".
        output=lambda d: d["confidence"].map(highest_prob_option),
        prob=lambda d: d["confidence"].map(absolute_prob),
        nor_prob=lambda d: d["confidence"].map(normalised_prob),
        # Multiplying the boolean comparison by 1 stores it as 0/1 integers.
        correct=lambda d: (d["correct_option"] == d["output"]) * 1,
        # Label derived from substrings of the model name; "7b" is checked
        # before "13b".
        system_param_num=lambda d: d["model"].apply(
            lambda name: "7 billion"
            if "7b" in name
            else ("13 billion" if "13b" in name else "unknown")
        ),
    )
)
df.to_csv("SS_data.csv", index=False)

In [199]:
import random

# Seed the global RNG so the out-of-distribution (OOD) task draw is
# repeatable, then hold out 10 of the distinct tasks.
random.seed(42)
task_pool = list(df["task"].unique())
OOD_tasks = random.sample(task_pool, 10)

# Rows belonging to the held-out tasks.
ood_mask = df["task"].isin(OOD_tasks)
df_OOD = df[ood_mask]
df_OOD.to_csv("SS_data_ood.csv", index=False)

# Every task that was not drawn above is in-distribution (ID).
ID_tasks = [task for task in task_pool if task not in OOD_tasks]
id_mask = df["task"].isin(ID_tasks)
df_ID = df[id_mask]
df_ID.to_csv("SS_data_id.csv", index=False)