In [1]:
import pandas as pd

In [2]:
# Path to the CSV written during soft gating of the MoE; point it at your own copy of that file.
MOE_PREDICTIONS_CSV = "/Users/kseniadvorkina/Documents/backup/scripts/MoE/sentence/moe_predictions_test_sentence.csv"
df = pd.read_csv(MOE_PREDICTIONS_CSV)


In [3]:
# Per-text dataset: run this cell if you want the year prediction aggregated by text.
# The next cell builds the per-batch dataset instead.

# One output row per "row_idx_df"; every keyword below names the flat output column directly.
agg_df = (
    df.groupby("row_idx_df")
    .agg(
        loss_mean=("loss", "mean"),
        loss_std=("loss", "std"),
        batch_count=("loss", "count"),
        perplexity_mean=("perplexity", "mean"),
        perplexity_std=("perplexity", "std"),
        weight_0=("weight_0", "mean"),
        weight_1=("weight_1", "mean"),
        weight_2=("weight_2", "mean"),
        actual_year=("actual_year", "first"),
    )
    .reset_index()
)

# Row-wise total of the three averaged weights
weight_sum = agg_df[['weight_0', 'weight_1', 'weight_2']].sum(axis=1)

# Divide every weight column by that total so the three weights of a row add up to 1
agg_df[['weight_0', 'weight_1', 'weight_2']] = (
    agg_df[['weight_0', 'weight_1', 'weight_2']].div(weight_sum, axis=0)
)


# Map a year onto the label of the expert period that contains it
def assign_period(year):
    """Return the period label for ``year``, or "Out of range" if no period contains it.

    The first two periods are half-open (upper bound excluded); the last one
    also includes its upper bound, 1920.
    """
    half_open_periods = (
        (1710, 1780, "1710–1780"),
        (1780, 1850, "1780–1850"),
    )
    for lower, upper, label in half_open_periods:
        if lower <= year < upper:
            return label
    if 1850 <= year <= 1920:
        return "1850–1920"
    return "Out of range"

# Weight columns, one per expert
weight_cols = ["weight_0", "weight_1", "weight_2"]

# Period label associated with each expert's weight column
expert_period_map = dict(zip(weight_cols, ("1710–1780", "1780–1850", "1850–1920")))

# Period label of the actual year
agg_df["actual_period"] = agg_df["actual_year"].apply(assign_period)

# Column name of the largest weight in each row, then the period label mapped to it
agg_df["dominant_expert"] = agg_df.loc[:, weight_cols].idxmax(axis=1)
agg_df["predicted_period"] = agg_df["dominant_expert"].map(expert_period_map)


In [5]:
# Per-batch dataset: run this cell if you want one row per batch.
# Work on an independent copy so that `df` itself is left untouched.
all_df = df.copy(deep=True)


# Map a year onto the label of the expert period that contains it
def assign_period(year):
    """Return the period label for ``year``; "Out of range" outside 1710..1920 (inclusive)."""
    label = "Out of range"
    if 1710 <= year <= 1920:
        # Inside the covered span: pick the period by its upper bound.
        if year < 1780:
            label = "1710–1780"
        elif year < 1850:
            label = "1780–1850"
        else:
            label = "1850–1920"
    return label

# Weight columns, one per expert: weight_0, weight_1, weight_2
weight_cols = [f"weight_{expert}" for expert in range(3)]

# Period label associated with each expert's weight column
expert_period_map = {
    weight_cols[0]: "1710–1780",
    weight_cols[1]: "1780–1850",
    weight_cols[2]: "1850–1920",
}

# Period label of the actual year
all_df["actual_period"] = all_df["actual_year"].apply(assign_period)

# Column name of the largest weight in each row, then the period label mapped to it
all_df["dominant_expert"] = all_df[weight_cols].idxmax(axis="columns")
all_df["predicted_period"] = all_df["dominant_expert"].map(expert_period_map)


### Year Prediction (final function)

In [6]:
# Year range covered by each expert, keyed by the expert's index (weight_<index>)
index_to_year = {
    0: "[1710-1780]",
    1: "[1780-1850]",
    2: "[1850-1920]"
}

def predict_year_and_calculate_error(df):
    """
    Predict a year for every row from its three expert weights and score it.

    For each row the two largest weights are selected (on a tie the expert with
    the lower index comes first) and the prediction is interpolated from them:

    * Case "A" - the two experts cover adjacent periods. The prediction starts
      at the year the two periods share and moves up to 70 years towards the
      heavier expert, in proportion to (w_heavy - w_light) / (w_heavy + w_light).
    * Case "B" - the two experts are not adjacent (first and last period). The
      prediction starts at 1815 and moves up to 105 years towards the heavier
      of the two, using the same normalised difference.

    The prediction is rounded with the built-in ``round`` and compared with
    ``int(actual_year)``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "weight_0", "weight_1", "weight_2" and "actual_year".

    Returns
    -------
    pandas.DataFrame
        The SAME object that was passed in (it is modified in place), with the
        columns "predicted_year", "absolute_error" and "case" added or overwritten.

    Raises
    ------
    KeyError
        If one of the required columns is missing.
    ValueError
        If a row has a NaN weight or its two largest weights sum to zero, since
        no prediction can be interpolated from such a row.
    """
    # 70 equals the width of each period in index_to_year; 1815 and 105 are the
    # midpoint and the half-width of the overall 1710-1920 span.
    adjacent_span = 70
    outer_centre = 1815
    outer_span = 105

    # Parse the "[start-end]" labels once instead of once per row.
    period_bounds = {
        idx: tuple(int(part) for part in label.strip("[]").split("-"))
        for idx, label in index_to_year.items()
    }

    predicted_years = []
    abs_errors = []
    cases = []

    # Iterate column-wise: avoids building a Series per row as iterrows() does.
    rows = zip(df.index, df["weight_0"], df["weight_1"], df["weight_2"], df["actual_year"])
    for row_label, w0, w1, w2, actual_year in rows:
        probs = [w0, w1, w2]

        # NaN is the only value that is not equal to itself.
        if any(w != w for w in probs):
            raise ValueError(f"Row {row_label!r}: expert weights contain NaN: {probs}")

        # Stable sort with reverse=True keeps the lower index first among equal weights.
        top2 = sorted(enumerate(probs), key=lambda x: x[1], reverse=True)[:2]
        (idx_a, w_a), (idx_b, w_b) = top2

        l_a, r_a = period_bounds[idx_a]
        l_b, r_b = period_bounds[idx_b]

        w_s = w_a + w_b
        if w_s == 0:
            raise ValueError(
                f"Row {row_label!r}: the two largest expert weights sum to zero: {probs}"
            )

        # Adjacent periods share a boundary year (end of one == start of the other).
        if r_a == l_b or r_b == l_a:
            case = "A"
            if l_a < l_b:
                # a is the earlier period: start at the shared boundary r_a
                pred_year = r_a + adjacent_span * (w_b - w_a) / w_s
            else:
                # b is the earlier period: start at the shared boundary r_b
                pred_year = r_b + adjacent_span * (w_a - w_b) / w_s
        else:
            # The top two are experts 0 and 2, so w_s == probs[0] + probs[2].
            case = "B"
            pred_year = outer_centre + outer_span * (probs[2] - probs[0]) / w_s

        pred_year = round(pred_year)
        abs_error = abs(pred_year - int(actual_year))

        predicted_years.append(pred_year)
        abs_errors.append(abs_error)
        cases.append(case)

    # Append new columns to the DataFrame (in place)
    df["predicted_year"] = predicted_years
    df["absolute_error"] = abs_errors
    df["case"] = cases

    return df

In [7]:
# Run the year prediction on the per-batch frame first, then on the per-text frame
year_prediction_batches, year_prediction_texts = (
    predict_year_and_calculate_error(frame) for frame in (all_df, agg_df)
)