In [None]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

## Install libraries

```bash
conda create -n edu4 python=3.11 jupyter matplotlib
conda activate edu4
```

```bash
pip install -U -r requirements.txt
```

```bash
pip install -U numpy
pip install -U scikit-learn
```

## Update repository

In [None]:
# ! git pull

## Add import path

In [None]:
import gc
import os
import sys

In [None]:
def add_library_level(level=4):
    """Append the working directory and its ancestors to ``sys.path``.

    Depth 0 is the current working directory, depth 1 its parent, and so on
    up to depth ``level - 1``. This lets the notebook import packages that
    live a few directories above it (such as the ``src`` package imported
    further down).

    Args:
        level: Number of directory levels to register, counted from the
            current working directory upwards. Values <= 0 register nothing.

    Returns:
        None. ``sys.path`` is extended in place and every newly added
        directory is reported on stdout as a path relative to the working
        directory.
    """
    for depth in range(level):
        # Depth 0 has to be spelled '.'; joining zero '..' components would
        # give an empty string and the log line would name no directory.
        if depth:
            path = os.path.join(*([os.pardir] * depth))
        else:
            path = os.curdir
        module_path = os.path.abspath(path)
        # Skip directories that are already registered so that re-running
        # the cell does not pile up duplicates.
        if module_path not in sys.path:
            sys.path.append(module_path)
            print(f'Appending {path}')

In [None]:
add_library_level(level=5)

## Organize imports

In [None]:
import multiprocessing

In [None]:
from src.lattmc.fca.utils import *
from src.lattmc.fca.fca_utils import *

In [None]:
from src.lattmc.tc.transcoder_analyzers import ConceptAnalysis, init_analyzer

In [None]:
from src.lattmc.sae.nlp_sae_utils import init_device, gen_concept

In [None]:
import logging

#### Number of CPU cores

In [None]:
workers = multiprocessing.cpu_count()
workers

In [None]:
SEED = 2025
# Apply the seed to the RNGs of the numeric libraries this notebook has in
# scope, so repeated runs start from the same random state.
# torch.manual_seed goes first because it returns a Generator; ending the
# cell on np.random.seed (returns None) keeps the cell output empty.
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
logging.basicConfig(level=logging.INFO)

In [None]:
device = init_device()
device

In [None]:
torch.__version__

In [None]:
np.__version__

In [None]:
# np.set_printoptions(precision=4, suppress=True)

## Initialize Paths

In [None]:
# Directory layout, relative to the working directory:
#   data/transcoders/gpt2/owt_tokens/owt_tokens_torch.pt
PATH = Path('data')
GPT2 = PATH.joinpath('transcoders', 'gpt2')
OWT_TOKENS_DIR = GPT2.joinpath('owt_tokens')
TOKENS_PATH = OWT_TOKENS_DIR.joinpath('owt_tokens_torch.pt')

# Create the token directory (and any missing parents); a no-op when it
# already exists.
OWT_TOKENS_DIR.mkdir(parents=True, exist_ok=True)

## Load transcoders

In [None]:
# Layer indices handed to init_analyzer in the next cell.
# Use list(range(12)) instead to select indices 0-11.
layers = [0, 4, 6, 8, 10, 11]

In [None]:
# Build the shared analyzer from the selected layer indices, the token file
# path and the GPT-2 transcoder directory, on the device chosen above.
# NOTE(review): presumably this loads the transcoders and the token tensor;
# confirm in src.lattmc.tc.transcoder_analyzers.init_analyzer.
tr_analyzer = init_analyzer(layers, TOKENS_PATH, GPT2, device=device)

In [None]:
# gc is already imported in the import cell near the top of the notebook.
# Collect unreachable Python objects first, then ask PyTorch to release the
# cached CUDA memory it is no longer using.
gc.collect()
torch.cuda.empty_cache()

## Experiments Pos and Negs Black

In [None]:
tok_indx = 143

In [None]:
text_detoken = tr_analyzer.to_clean(tr_analyzer.tokens[tok_indx])
text_detoken

In [None]:
concept_an = ConceptAnalysis(text_detoken, tr_analyzer)

In [None]:
concept_an.analyze_concepts()

In [None]:
# NOTE(review): i and j are not referenced by any later cell visible in this
# notebook; confirm whether they are still needed.
i, j = 27, 28
# Indices passed to concept_an.gen_text in later cells; 28 and 127 are also
# the keys used to index into detected_vs below (presumably token positions
# in text_detoken; confirm).
t_idcs = [28, 127]

In [None]:
layer_0 = 0

In [None]:
# Short hand-written probe text, analysed with the same tr_analyzer as the
# corpus sample (concept_an) so the two can be compared layer by layer.
black_text = ' black intelligents and black district'
concept_bl = ConceptAnalysis(black_text, tr_analyzer)
concept_bl.analyze_concepts()

In [None]:
t_idbl = [1, 5]

In [None]:
concept_an.gen_text(t_idcs, layer_0)

In [None]:
concept_bl.gen_text(t_idbl, layer_0)

In [None]:
det_22 = concept_an.detected_vs[layer_0][22]
det_22

In [None]:
v_b1 = det_22[28][32]
v_b2 = det_22[127][32]

In [None]:
v_b3 = det_22[127][96]
v_b4 = det_22[127][119]

In [None]:
v_b = join(v_b1, v_b2)

In [None]:
topK(v_b1, 20)

In [None]:
topK(v_b3, 20)

In [None]:
topK(v_b4, 20)

In [None]:
np.all(v_b1 == v_b2)

In [None]:
layer_8 = 8

In [None]:
concept_an.gen_text(t_idcs, layer_8)

In [None]:
concept_bl.gen_text(t_idbl, layer_8)

In [None]:
concept_an.detected_vs[layer_8]

In [None]:
layer_11 = 11

In [None]:
concept_an.gen_text(t_idcs, layer_11)

In [None]:
concept_bl.gen_text(t_idbl, layer_11)

In [None]:
intersect(concept_an.c_is[11].A, concept_bl.c_is[11].A)

In [None]:
concept_an.detected_vs[layer_11][3]

In [None]:
concept_an.detected_vs[layer_11][8]

In [None]:
concept_an.detected_vs[layer_11][9]

In [None]:
# Pair taken from detected_vs[layer_11][3].
# NOTE(review): the next two cells rebind v_white / v_black, so on a
# top-to-bottom run this pair never reaches the topK / meet cells below.
v_white = concept_an.detected_vs[layer_11][3][28][47]
v_black = concept_an.detected_vs[layer_11][3][127][76]

In [None]:
# Pair taken from detected_vs[layer_11][8].
# NOTE(review): the following cell rebinds v_white / v_black, so on a
# top-to-bottom run this pair never reaches the topK / meet cells below.
v_white = concept_an.detected_vs[layer_11][8][28][62]
v_black = concept_an.detected_vs[layer_11][8][127][34]

In [None]:
# Pair taken from detected_vs[layer_11][9]. Being the last of three cells
# that bind these names, this is the pair the topK / meet cells below
# operate on when the notebook is run top to bottom.
v_white = concept_an.detected_vs[layer_11][9][28][74]
v_black = concept_an.detected_vs[layer_11][9][127][92]

In [None]:
topK(v_white, 10)

In [None]:
topK(v_black, 10)

In [None]:
v_meet = meet(v_white, v_black)

In [None]:
vals, indcs = topK(v_meet, 100)
vals, indcs

In [None]:
c_meet = concept_an.gen_concept(indcs, vals, layer_11)
c_meet

In [None]:
concept_an.gen_and_print(indcs, vals, layer_11)

In [None]:
c_meet = concept_an.gen_concept(indcs[1], vals[1], layer_11)
c_meet

In [None]:
concept_an.gen_and_print(indcs[1:4], vals[1:4], layer_11, with_text=True, limit=20)

In [None]:
concept_an.gen_and_print([21836], [17], layer_11, with_text=True, limit=20)

In [None]:
# Call concept_an.transcoder on corpus entry 8 for layer_11 and keep the
# first element of what it returns.
# NOTE(review): the indices 8 (here) and 62 (next cell) match
# detected_vs[layer_11][8][28][62] used earlier; presumably this is a
# cross-check of that vector. Confirm.
vs = concept_an.transcoder(concept_an.corpus[8], layer_11)[0]

In [None]:
topK(vs[62], 10)

## Embed Text

In [None]:
emb_text = ' wonderful options'
# NOTE(review): this rebinds concept_bl, which earlier held the analysis of
# black_text. Any cell from the previous section that is re-run after this
# point will see the ' wonderful options' analysis instead.
concept_bl = ConceptAnalysis(emb_text, tr_analyzer)
concept_bl.analyze_concepts()

In [None]:
# Indices passed to concept_bl.gen_text in the cells below.
# NOTE(review): rebinds t_idbl, which was [1, 5] in the previous section.
t_idbl = [1, 2]

In [None]:
concept_bl.gen_text(t_idbl, layer_0)

In [None]:
concept_bl.gen_text(t_idbl, layer_8)

In [None]:
concept_bl.gen_text(t_idbl, layer_11)