In [1]:
import sys
from pathlib import Path

# Make the directory two levels above the current working directory importable,
# so that the `from src....` imports in the following cells can be resolved.
notebook_dir = Path("./").resolve()
src_path = str(notebook_dir.parent.parent)
already_importable = src_path in sys.path
if not already_importable:
    sys.path.append(src_path)

sys.path

['/Library/Frameworks/Python.framework/Versions/3.11/lib/python311.zip',
 '/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11',
 '/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/lib-dynload',
 '',
 '/Users/anton/dev/MARS/correctness-model-internals/venv/lib/python3.11/site-packages',
 '/Users/anton/dev/MARS/correctness-model-internals']

In [2]:
from collections import defaultdict

import pandas as pd

from src.classifying import (
    ActivationsHandler,
    combine_activations_handlers,
    get_correctness_direction_classifier,
    get_logistic_regression_classifier,
    get_between_class_variance_and_within_class_variance,
)
from src.visualisations.utils import plot_interactive_lineplot
from src.utils.data import load_activations, load_labels, get_experiment_activations_configs_df_subset


In [49]:
# Root directory under which the `activations/` and `evaluations/` trees are written and read.
BASE_PATH = "../../data_for_classification"
# Value passed as `pca_components` to `reduce_dims`; a falsy value removes the PCA suffix below.
PCA_COMPONENTS = 100

# Default dimensionality of the generated dummy activations.
N_DIMS = 500

# Suffixes appended to the file names of the saved figures.
PCA_COMPONENTS_LABEL = f"_pca_{PCA_COMPONENTS}" if PCA_COMPONENTS else ""
# Misspelled legacy name, kept as an alias because the plotting cells reference it.
PCA_COMONENTS_LABEL = PCA_COMPONENTS_LABEL
N_DIMS_LABEL = f"_{N_DIMS}_dims"

In [22]:
import numpy as np
import torch as pt
from pathlib import Path
import os

def generate_dummy_data(base_path: str, n_layers: int = 32, n_files: int = 4,
                       points_per_file: int = 25, n_dims: int = N_DIMS,
                       seed: "int | None" = None):
    """
    Generate dummy activation data with specific separation patterns across layers.

    Written to disk:
      * ``<base_path>/evaluations/dummy_model/test_dataset/base/main_generations_evaluated.csv``
        with columns ``index`` and ``correct`` (first half of the points False, the rest True);
      * for every layer,
        ``<base_path>/activations/dummy_model/test_dataset/base/main/prompt_only/layer_<k>/batch_<offset>.pt``
        holding a float tensor of up to ``points_per_file`` rows, where ``<offset>`` is the
        row index of the first point in that file.

    Within a layer, both classes are drawn from a standard normal distribution and the
    True class is shifted by ``5 * separation`` along a random unit vector. The separation
    factor grows quadratically up to layer 15 and decays afterwards (never below 0.1).

    Parameters:
    -----------
    base_path : str
        Base path for saving the data
    n_layers : int
        Number of layers to generate
    n_files : int
        Number of files per layer
    points_per_file : int
        Number of points per file
    n_dims : int
        Number of dimensions for each point
    seed : int | None
        Seed for a dedicated random generator so that repeated calls write identical
        data. With ``None`` (the default) NumPy's global random state is used.
    """

    # Without a seed keep drawing from NumPy's global state; with a seed use a dedicated
    # generator so reruns are reproducible. Both expose `normal(loc, scale, size)`.
    rng = np.random if seed is None else np.random.default_rng(seed)

    # Create base directories
    base_path = Path(base_path)
    activations_path = base_path / "activations" / "dummy_model" / "test_dataset" / "base" / "main" / "prompt_only"
    labels_path = base_path / "evaluations" / "dummy_model" / "test_dataset" / "base"

    os.makedirs(activations_path, exist_ok=True)
    os.makedirs(labels_path, exist_ok=True)

    # Generate indices and labels. The True class takes the remainder so that an odd
    # total still yields exactly one activation row per label.
    total_points = n_files * points_per_file
    n_false = total_points // 2
    n_true = total_points - n_false
    indices = np.arange(total_points)
    labels = np.zeros(total_points, dtype=bool)
    labels[n_false:] = True  # First half False, remainder True

    # Save labels
    labels_df = pd.DataFrame({
        'index': indices,
        'correct': labels
    })
    labels_df.to_csv(labels_path / "main_generations_evaluated.csv", index=False)

    # Function to calculate separation factor for each layer
    def get_separation_factor(layer):
        # Max separation at layer 15 (the constants 15 and 16 fit the default 32 layers)
        if layer <= 15:
            return (layer / 15) ** 2  # Quadratic increase to layer 15
        else:
            return max(0.1, 1 - ((layer - 15) / 16) ** 2)  # Quadratic decrease after

    # Generate data for each layer
    for layer in range(n_layers):
        layer_path = activations_path / f"layer_{layer}"
        os.makedirs(layer_path, exist_ok=True)

        separation = get_separation_factor(layer)

        # Base distributions
        false_base = rng.normal(0, 1, (n_false, n_dims))
        true_base = rng.normal(0, 1, (n_true, n_dims))

        # Shift the True class along a random unit direction (a new direction per layer)
        separation_vector = rng.normal(0, 1, n_dims)
        separation_vector = separation_vector / np.linalg.norm(separation_vector)
        true_base += separation * 5 * separation_vector

        # Stack False rows first, then True rows, so row i corresponds to label i.
        # No shuffling is done here.
        all_data = np.vstack([false_base, true_base])

        # Split into files
        for file_idx in range(n_files):
            start_idx = file_idx * points_per_file
            end_idx = start_idx + points_per_file

            file_data = pt.tensor(all_data[start_idx:end_idx]).float()
            pt.save(file_data, layer_path / f"batch_{file_idx*points_per_file}.pt")


# Write the dummy data under BASE_PATH, the same root the loading cells read from,
# instead of repeating the path literal (which could drift from BASE_PATH).
generate_dummy_data(BASE_PATH)

In [50]:
# Experiment filters; set a filter to None to get all values of that field.
# Example values for a real run:
#   MODEL_ID = "llama3.1_8b_chat", DATASET_ID = "gsm8k", PROMPT_ID = "cot_3_shot",
#   SUBSET_ID = "main", INPUT_TYPE = "prompt_only"
MODEL_ID = "dummy_model"
DATASET_ID, PROMPT_ID, SUBSET_ID, INPUT_TYPE = None, None, None, None

experiment_filters = dict(
    model_id=MODEL_ID,
    dataset_id=DATASET_ID,
    prompt_id=PROMPT_ID,
    subset_id=SUBSET_ID,
    input_type=INPUT_TYPE,
)
activation_exp_configs_df = get_experiment_activations_configs_df_subset(
    base_path=BASE_PATH,
    **experiment_filters,
)
activation_exp_configs_df

Unnamed: 0,model_id,dataset_id,prompt_id,subset_id,input_type,layer,path
384,dummy_model,test_dataset,base,main,prompt_only,0,../../data_for_classification/activations/dumm...
385,dummy_model,test_dataset,base,main,prompt_only,7,../../data_for_classification/activations/dumm...
386,dummy_model,test_dataset,base,main,prompt_only,9,../../data_for_classification/activations/dumm...
387,dummy_model,test_dataset,base,main,prompt_only,17,../../data_for_classification/activations/dumm...
388,dummy_model,test_dataset,base,main,prompt_only,28,../../data_for_classification/activations/dumm...
389,dummy_model,test_dataset,base,main,prompt_only,10,../../data_for_classification/activations/dumm...
390,dummy_model,test_dataset,base,main,prompt_only,26,../../data_for_classification/activations/dumm...
391,dummy_model,test_dataset,base,main,prompt_only,19,../../data_for_classification/activations/dumm...
392,dummy_model,test_dataset,base,main,prompt_only,21,../../data_for_classification/activations/dumm...
393,dummy_model,test_dataset,base,main,prompt_only,8,../../data_for_classification/activations/dumm...


In [51]:
# For every experiment configuration and every layer, reduce the activations with PCA
# and record the between-class and within-class variance (and their ratio) of the
# False/True ("correct") groups.
between_over_within_class_variance_dict = defaultdict(list)
group_columns = ["model_id", "dataset_id", "prompt_id", "subset_id", "input_type"]
class_labels = (False, True)

for group_key, config_df in activation_exp_configs_df.groupby(group_columns):
    model_id, dataset_id, prompt_id, subset_id, input_type = group_key
    print(f"\n{model_id=}, {dataset_id=}, {prompt_id=}, {subset_id=}, {input_type=}")
    labels_df = load_labels(
        base_path=BASE_PATH,
        model_id=model_id,
        dataset_id=dataset_id,
        prompt_id=prompt_id,
        subset_id=subset_id,
    )

    # Shuffled index order, fixed by the first layer and reused for all later layers.
    check_indices = None
    sorted_layers = config_df["layer"].astype(int).sort_values()
    for layer in sorted_layers:
        print(f"{layer=}", end=", ")
        activations, indices = load_activations(
            base_path=BASE_PATH,
            model_id=model_id,
            dataset_id=dataset_id,
            prompt_id=prompt_id,
            subset_id=subset_id,
            input_type=input_type,
            layer=layer,
        )

        if check_indices is None:
            check_indices = indices.sample(frac=1, replace=False)

        # Every layer has to provide exactly the same set of indices.
        same_indices = set(indices) == set(check_indices)
        if not same_indices:
            raise RuntimeError("indices across layers are not the same")

        # NOTE(review): labels are selected positionally (`iloc`) with the index values;
        # this assumes label row positions coincide with the indices - confirm.
        activations_handler = ActivationsHandler(
            activations=activations[check_indices],
            labels=labels_df.iloc[check_indices]["correct"].astype(bool),
        )

        activations_handler, _ = activations_handler.reduce_dims(pca_components=PCA_COMPONENTS)

        balanced_handler = activations_handler.sample_equally_across_groups(group_labels=class_labels)
        between_class_variance, within_class_variance = get_between_class_variance_and_within_class_variance(
            balanced_handler,
            group_labels=class_labels,
        )

        # One result row per (configuration, layer); insertion order defines column order.
        layer_row = {
            "model_id": model_id,
            "dataset_id": dataset_id,
            "prompt_id": prompt_id,
            "subset_id": subset_id,
            "input_type": input_type,
            "layer": layer,
            "between_class_variance": between_class_variance,
            "within_class_variance": within_class_variance,
            "between_over_within_class_variance": between_class_variance / within_class_variance,
        }
        for column, value in layer_row.items():
            between_over_within_class_variance_dict[column].append(value)

between_over_within_class_variance_df = pd.DataFrame(between_over_within_class_variance_dict)



model_id='dummy_model', dataset_id='test_dataset', prompt_id='base', subset_id='main', input_type='prompt_only'
layer=0, layer=1, layer=2, layer=3, layer=4, layer=5, layer=6, layer=7, layer=8, layer=9, layer=10, layer=11, layer=12, layer=13, layer=14, layer=15, layer=16, layer=17, layer=18, layer=19, layer=20, layer=21, layer=22, layer=23, layer=24, layer=25, layer=26, layer=27, layer=28, layer=29, layer=30, layer=31, 

In [52]:
# One entry per experiment configuration: the variance ratio indexed by layer.
variance_ratio_by_config = {}
for config_key, config_rows in between_over_within_class_variance_df.groupby(
    ["model_id", "dataset_id", "prompt_id", "subset_id", "input_type"]
):
    ratio_by_layer = config_rows.set_index("layer")[["between_over_within_class_variance"]]
    variance_ratio_by_config[str(config_key)] = ratio_by_layer

plot_interactive_lineplot(
    df_dict=variance_ratio_by_config,
    x_label="Layer",
    y_label="Between Class Variance / Within Class Variance",
    save_path=f"./classification_data/figures/dummy_data_between_over_within_class_variance{N_DIMS_LABEL}{PCA_COMONENTS_LABEL}.html"
)

In [53]:

# 5-fold cross-validation of two classifiers (correctness direction and logistic
# regression) on the activations of every layer of every experiment configuration.
# Each (configuration, layer, fold) contributes one row to `res_df`, with the metrics of
# each classifier stored under `direction_*` and `logistic_regression_*` columns.
res_dict = defaultdict(list)
group_columns = ["model_id", "dataset_id", "prompt_id", "subset_id", "input_type"]
class_labels = [False, True]
# Column prefix -> builder. Each builder returns a sequence whose first element exposes
# `classification_metrics`; only that element is used here.
classifier_builders = {
    "direction": get_correctness_direction_classifier,
    "logistic_regression": get_logistic_regression_classifier,
}

for (model_id, dataset_id, prompt_id, subset_id, input_type), config_df in activation_exp_configs_df.groupby(group_columns):
    print(f"\n{model_id=}, {dataset_id=}, {prompt_id=}, {subset_id=}, {input_type=}")
    labels_df = load_labels(
        base_path=BASE_PATH,
        model_id=model_id,
        dataset_id=dataset_id,
        prompt_id=prompt_id,
        subset_id=subset_id,
    )

    # Shuffled index order, fixed by the first layer and reused for all later layers.
    check_indices = None
    expected_indices = None
    for layer in config_df["layer"].astype(int).sort_values():
        print(f"{layer=}", end=", ")
        activations, indices = load_activations(
            base_path=BASE_PATH,
            model_id=model_id,
            dataset_id=dataset_id,
            prompt_id=prompt_id,
            subset_id=subset_id,
            input_type=input_type,
            layer=layer,
        )

        if check_indices is None:
            check_indices = indices.sample(frac=1, replace=False)
            # Built once: the reference set does not change between layers.
            expected_indices = set(check_indices)

        if set(indices) != expected_indices:
            raise RuntimeError("indices across layers are not the same")

        # NOTE(review): labels are selected positionally (`iloc`) with the index values;
        # this assumes label row positions coincide with the indices - confirm.
        labels_df_subset = labels_df.iloc[check_indices]
        activations = activations[check_indices]

        activations_handler = ActivationsHandler(
            activations=activations,
            labels=labels_df_subset["correct"].astype(bool),
        )

        activations_handler_folds = list(
            activations_handler.split_dataset(split_sizes=[1/5] * 5)
        )

        for fold_i, held_out_fold in enumerate(activations_handler_folds):
            # Train on every fold except the held-out one.
            activations_handler_train = combine_activations_handlers(
                [ah for j, ah in enumerate(activations_handler_folds) if j != fold_i]
            )

            # The `pca_info` returned for the training folds is passed on to the held-out
            # fold - presumably so it is projected with the training-set PCA (confirm in
            # `reduce_dims`).
            activations_handler_train, pca_info = activations_handler_train.reduce_dims(pca_components=PCA_COMPONENTS)
            activations_handler_train = activations_handler_train.sample_equally_across_groups(
                group_labels=class_labels
            )
            activations_handler_test, _ = held_out_fold.reduce_dims(pca_components=PCA_COMPONENTS, pca_info=pca_info)
            activations_handler_test = activations_handler_test.sample_equally_across_groups(
                group_labels=class_labels
            )

            # Identifying columns of this row; insertion order defines column order.
            fold_row = {
                "model_id": model_id,
                "dataset_id": dataset_id,
                "prompt_id": prompt_id,
                "subset_id": subset_id,
                "input_type": input_type,
                "layer": layer,
                "fold": fold_i,
            }
            for column, value in fold_row.items():
                res_dict[column].append(value)

            # Fit and evaluate both classifiers on the same train/test split.
            for prefix, build_classifier in classifier_builders.items():
                classifier = build_classifier(
                    activations_handler_train=activations_handler_train,
                    activations_handler_test=activations_handler_test,
                )[0]
                for key, value in classifier.classification_metrics.items():
                    res_dict[f"{prefix}_{key}"].append(value)


res_df = pd.DataFrame(res_dict)
res_df


model_id='dummy_model', dataset_id='test_dataset', prompt_id='base', subset_id='main', input_type='prompt_only'
layer=0, layer=1, layer=2, layer=3, layer=4, layer=5, layer=6, layer=7, layer=8, layer=9, layer=10, 


Precision is ill-defined and being set to 0.0 due to no predicted samples. Use `zero_division` parameter to control this behavior.



layer=11, layer=12, layer=13, layer=14, layer=15, layer=16, layer=17, layer=18, layer=19, layer=20, layer=21, layer=22, layer=23, layer=24, layer=25, layer=26, layer=27, layer=28, layer=29, layer=30, layer=31, 

Unnamed: 0,model_id,dataset_id,prompt_id,subset_id,input_type,layer,fold,direction_optimal_cut,direction_optimal_train_set_cut,direction_test_roc_auc,...,direction_f1_score,direction_precision_score,direction_recall_score,logistic_regression_optimal_cut,logistic_regression_optimal_train_set_cut,logistic_regression_test_roc_auc,logistic_regression_accuracy_score,logistic_regression_f1_score,logistic_regression_precision_score,logistic_regression_recall_score
0,dummy_model,test_dataset,base,main,prompt_only,0,0,-0.627963,-0.627963,0.570000,...,0.714286,0.555556,1.000000,0.5,0.953964,0.600000,0.600000,0.666667,0.571429,0.800000
1,dummy_model,test_dataset,base,main,prompt_only,0,1,-0.003010,-0.003010,0.580247,...,0.600000,0.545455,0.666667,0.5,0.955916,0.388889,0.388889,0.521739,0.428571,0.666667
2,dummy_model,test_dataset,base,main,prompt_only,0,2,1.212223,1.212223,0.472222,...,0.000000,0.000000,0.000000,0.5,0.955552,0.583333,0.583333,0.545455,0.600000,0.500000
3,dummy_model,test_dataset,base,main,prompt_only,0,3,0.931577,0.931577,0.551020,...,0.200000,0.333333,0.142857,0.5,0.956117,0.714286,0.714286,0.714286,0.714286,0.714286
4,dummy_model,test_dataset,base,main,prompt_only,0,4,0.049378,0.049378,0.470000,...,0.500000,0.500000,0.500000,0.5,0.956098,0.400000,0.400000,0.400000,0.400000,0.400000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155,dummy_model,test_dataset,base,main,prompt_only,31,0,0.941114,0.941114,0.340000,...,0.000000,0.000000,0.000000,0.5,0.958071,0.450000,0.450000,0.476190,0.454545,0.500000
156,dummy_model,test_dataset,base,main,prompt_only,31,1,0.486672,0.486672,0.407407,...,0.000000,0.000000,0.000000,0.5,0.956543,0.666667,0.666667,0.666667,0.666667,0.666667
157,dummy_model,test_dataset,base,main,prompt_only,31,2,0.741965,0.741965,0.361111,...,0.000000,0.000000,0.000000,0.5,0.952206,0.333333,0.333333,0.200000,0.250000,0.166667
158,dummy_model,test_dataset,base,main,prompt_only,31,3,1.041441,1.041441,0.224490,...,0.000000,0.000000,0.000000,0.5,0.958171,0.571429,0.571429,0.625000,0.555556,0.714286


In [54]:
# res_file = "./classification_data/res_df_llama31_8B_4_memory_datasets.csv"

In [55]:
# res_df.to_csv(res_file, index=False)

In [56]:
# res_df = pd.read_csv(res_file)

In [57]:
# For each classifier, plot the per-fold metric against layer, with one entry per
# experiment configuration, and save the figure as HTML.
config_columns = ["model_id", "dataset_id", "prompt_id", "subset_id", "input_type"]
# Other metrics that can be listed here: "accuracy_score", "precision_score", "recall_score".
metrics_to_plot = ["f1_score"]

for classifier in ["direction", "logistic_regression"]:
    for metric in metrics_to_plot:
        metric_column = f"{classifier}_{metric}"
        plot_dict = {}
        for conf, res_df_ in res_df.groupby(config_columns):
            print(f"{conf=}")
            # Wide layout: rows are layers, columns are (result column, fold) pairs.
            per_fold_wide = res_df_.drop(columns=config_columns).pivot(
                index="layer",
                columns="fold",
            )
            plot_dict[str(conf)] = per_fold_wide[[metric_column]]

        figure = plot_interactive_lineplot(
            plot_dict,
            x_label="Layer",
            y_label=metric_column.replace("_", " ").title(),
            save_path=f"./classification_data/figures/dummy_data_{classifier}_{metric}{N_DIMS_LABEL}{PCA_COMONENTS_LABEL}.html"
        )
        figure.show()



conf=('dummy_model', 'test_dataset', 'base', 'main', 'prompt_only')


conf=('dummy_model', 'test_dataset', 'base', 'main', 'prompt_only')
