In [9]:
import os
import openai
import dotenv
dotenv.load_dotenv()

True

In [10]:
from trulens_eval import Tru

tru = Tru()
tru.reset_database()

In [11]:
from llama_index import SimpleDirectoryReader
from llama_index import Document

documents = SimpleDirectoryReader(
    input_files=["./data/dynalist-2023-12-8.txt"]
).load_data()

document = Document(text="\n\n".join([doc.text for doc in documents]))

In [12]:
len(document.text)

255302

In [13]:
from utils import build_sentence_window_index

In [6]:
from llama_index.llms import OpenAI

llm = OpenAI(model="gpt-3.5-turbo", temperature=0.1)

sentence_index = build_sentence_window_index(
    document,
    llm,
    embed_model="local:BAAI/bge-small-en-v1.5",
    save_dir="sentence_index"
)

In [7]:
from utils import read_eval_questions
eval_questions = read_eval_questions()
eval_questions

In [8]:
from utils import get_sentence_window_query_engine
sentence_window_engine = get_sentence_window_query_engine(sentence_index)

Downloading config.json:   0%|          | 0.00/799 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

Downloading (…)tencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

In [14]:
output = sentence_window_engine.query(
    "What is mechanistic interpretability?")
output.response

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


"Mechanistic interpretability refers to the field of study that focuses on reverse engineering neural networks from the learned weights to human-interpretable algorithms. It involves understanding the actual mechanisms and algorithms that make up the network. In contrast to other forms of interpretability, which explain how the network's outputs relate to high-level concepts without referencing the network's functioning, mechanistic interpretability aims to uncover the inner workings of the network."

## Feedback functions

In [15]:
import nest_asyncio

nest_asyncio.apply()

In [16]:
from trulens_eval import OpenAI as fOpenAI

provider = fOpenAI()

### (1) Answer Relevance

In [17]:
from trulens_eval import Feedback

f_qa_relevance = Feedback(
    provider.relevance_with_cot_reasons,
    name="Answer Relevance"
).on_input_output()

✅ In Answer Relevance, input prompt will be set to __record__.main_input or `Select.RecordInput` .
✅ In Answer Relevance, input response will be set to __record__.main_output or `Select.RecordOutput` .


### (2) Context Relevance

In [18]:
from trulens_eval import TruLlama

context_selection = TruLlama.select_source_nodes().node.text

In [19]:
import numpy as np

f_qs_relevance = (
    Feedback(provider.qs_relevance,
             name="Context Relevance")
    .on_input()
    .on(context_selection)
    .aggregate(np.mean)
)

✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input statement will be set to __record__.app.query.rets.source_nodes[:].node.text .


In [20]:
import numpy as np

f_qs_relevance = (
    Feedback(provider.qs_relevance_with_cot_reasons,
             name="Context Relevance")
    .on_input()
    .on(context_selection)
    .aggregate(np.mean)
)

✅ In Context Relevance, input question will be set to __record__.main_input or `Select.RecordInput` .
✅ In Context Relevance, input statement will be set to __record__.app.query.rets.source_nodes[:].node.text .


### (3) Groundedness

In [21]:
from trulens_eval.feedback import Groundedness

grounded = Groundedness(groundedness_provider=provider)

In [22]:
f_groundedness = (
    Feedback(grounded.groundedness_measure_with_cot_reasons,
             name="Groundedness"
            )
    .on(context_selection)
    .on_output()
    .aggregate(grounded.grounded_statements_aggregator)
)

✅ In Groundedness, input source will be set to __record__.app.query.rets.source_nodes[:].node.text .
✅ In Groundedness, input statement will be set to __record__.main_output or `Select.RecordOutput` .


## Evaluation

In [23]:
from trulens_eval import TruLlama
from trulens_eval import FeedbackMode

tru_recorder = TruLlama(
    sentence_window_engine,
    app_id="00_Mech_Interpretability",
    feedbacks=[
        f_qa_relevance,
        f_qs_relevance,
        f_groundedness
    ]
)

In [24]:
for question in eval_questions:
    with tru_recorder as recording:
        output = sentence_window_engine.query(question)
        print(output.response)
        print()

boto3,botocore is/are required for using BedrockEndpoint. You should be able to install it/them with
	pip install boto3 botocore
Mechanistic interpretability refers to the field of study that focuses on reverse engineering neural networks from the learned weights to human-interpretable algorithms. It involves understanding the actual mechanisms and algorithms that make up the network. In contrast to other forms of interpretability, which explain how the network's outputs relate to high-level concepts without referencing the network's functioning, mechanistic interpretability aims to uncover the inner workings of the network.



huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Yes, mechanistic interpretability is applicable to the real world. Mechanistic interpretability is a field of study that focuses on reverse engineering neural networks to understand the underlying mechanisms and algorithms that compose them. This approach aims to make neural networks more interpretable and understandable to humans. By gaining insights into the inner workings of neural networks, researchers can better understand why AI systems make certain decisions and how they arrive at their outputs. This understanding can have real-world applications in various domains, such as healthcare, finance, and autonomous systems, where interpretability and transparency are crucial for trust, accountability, and safety.

Superposition refers to a situation where a model represents more features than the number of dimensions in its activation space. In other words, the model is able to simulate a larger model by using a set of interpretable directions that is larger than the number of dimensi

In [25]:
records, feedback = tru.get_records_and_feedback(app_ids=[])
records.head()

Unnamed: 0,app_id,app_json,type,record_id,input,output,tags,record_json,cost_json,perf_json,ts,Answer Relevance,Context Relevance,Groundedness,Answer Relevance_calls,Context Relevance_calls,Groundedness_calls,latency,total_tokens,total_cost
0,00_Mech_Interpretability,"{""app_id"": ""00_Mech_Interpretability"", ""tags"":...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_a554613dc706075d8ffb2475485efa03,"""What is mechanistic interpretability?""","""Mechanistic interpretability refers to the fi...",-,"{""record_id"": ""record_hash_a554613dc706075d8ff...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2023-12-08T14:23:38.032519"", ""...",2023-12-08T14:23:41.470001,1.0,0.9,1.0,[{'args': {'prompt': 'What is mechanistic inte...,[{'args': {'question': 'What is mechanistic in...,"[{'args': {'source': 'More generally, if somet...",3,970,0.001498
1,00_Mech_Interpretability,"{""app_id"": ""00_Mech_Interpretability"", ""tags"":...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_e2cf6815609e57f9acf5ebf92126b2e9,"""Is mechanistic interpretability appliable to ...","""Yes, mechanistic interpretability is applicab...",-,"{""record_id"": ""record_hash_e2cf6815609e57f9acf...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2023-12-08T14:23:41.647744"", ""...",2023-12-08T14:23:45.612452,1.0,0.8,0.8,[{'args': {'prompt': 'Is mechanistic interpret...,[{'args': {'question': 'Is mechanistic interpr...,"[{'args': {'source': 'Where possible, I link t...",3,1009,0.001574
2,00_Mech_Interpretability,"{""app_id"": ""00_Mech_Interpretability"", ""tags"":...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_195632953a5788d3f4014d80bf371b5b,"""What is superposition and how can I understan...","""Superposition refers to a situation where a m...",-,"{""record_id"": ""record_hash_195632953a5788d3f40...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2023-12-08T14:23:45.756853"", ""...",2023-12-08T14:23:50.768928,0.9,0.4,0.833333,[{'args': {'prompt': 'What is superposition an...,[{'args': {'question': 'What is superposition ...,[{'args': {'source': 'We can both use polysema...,5,715,0.001147
3,00_Mech_Interpretability,"{""app_id"": ""00_Mech_Interpretability"", ""tags"":...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_3fb3f67700492d460b6ef182224eeab5,"""How activation functions relate to interpreta...","""Activation functions can play a role in the i...",-,"{""record_id"": ""record_hash_3fb3f67700492d460b6...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2023-12-08T14:23:50.920176"", ""...",2023-12-08T14:23:56.127787,0.9,0.8,0.633333,[{'args': {'prompt': 'How activation functions...,[{'args': {'question': 'How activation functio...,[{'args': {'source': 'A final takeaway is that...,5,844,0.00135
4,00_Mech_Interpretability,"{""app_id"": ""00_Mech_Interpretability"", ""tags"":...",RetrieverQueryEngine(llama_index.query_engine....,record_hash_e9993a4ad2473a1963a0e16ddea2c660,"""What is feature?""","""A feature is a property of an input to a mode...",-,"{""record_id"": ""record_hash_e9993a4ad2473a1963a...","{""n_requests"": 1, ""n_successful_requests"": 1, ...","{""start_time"": ""2023-12-08T14:23:56.269574"", ""...",2023-12-08T14:24:01.023450,0.9,0.85,1.0,"[{'args': {'prompt': 'What is feature?', 'resp...","[{'args': {'question': 'What is feature?', 'st...",[{'args': {'source': 'This isn't necessarily a...,4,875,0.001378


In [26]:
import pandas as pd

pd.set_option("display.max_colwidth", None)
records[["input", "output"] + feedback]

Unnamed: 0,input,output,Answer Relevance,Context Relevance,Groundedness
0,"""What is mechanistic interpretability?""","""Mechanistic interpretability refers to the field of study that focuses on reverse engineering neural networks from the learned weights to human-interpretable algorithms. It involves understanding the actual mechanisms and algorithms that make up the network. In contrast to other forms of interpretability, which explain how the network's outputs relate to high-level concepts without referencing the network's functioning, mechanistic interpretability aims to uncover the inner workings of the network.""",1.0,0.9,1.0
1,"""Is mechanistic interpretability appliable to real world?""","""Yes, mechanistic interpretability is applicable to the real world. Mechanistic interpretability is a field of study that focuses on reverse engineering neural networks to understand the underlying mechanisms and algorithms that compose them. This approach aims to make neural networks more interpretable and understandable to humans. By gaining insights into the inner workings of neural networks, researchers can better understand why AI systems make certain decisions and how they arrive at their outputs. This understanding can have real-world applications in various domains, such as healthcare, finance, and autonomous systems, where interpretability and transparency are crucial for trust, accountability, and safety.""",1.0,0.8,0.8
2,"""What is superposition and how can I understand it?""","""Superposition refers to a situation where a model represents more features than the number of dimensions in its activation space. In other words, the model is able to simulate a larger model by using a set of interpretable directions that is larger than the number of dimensions. This set of directions is called an overcomplete basis. It is important to note that in the case of superposition, there cannot be an interpretable basis, meaning that features as neurons cannot perfectly hold. To understand superposition, it is helpful to think of it as a form of lossy compression, where the model is able to represent more features but at the cost of adding noise and interference between features. Finding the optimal balance between representing more features and minimizing noise and interference is crucial.""",0.9,0.4,0.833333
3,"""How activation functions relate to interpretability of a model?""","""Activation functions can play a role in the interpretability of a model. In the context provided, the SoLU activation function is mentioned as a function that seems to make neurons more interpretable. It is suggested that using SoLU as a replacement for other activation functions like GELU or ReLU can reduce the amount of neuron superposition in the model and make neurons more monosemantic. This can potentially make it easier to identify and understand the specific contributions of individual neurons to the model's computations. By localizing the effects of different activations and identifying which parts of the model matter for specific tasks, it becomes possible to form a clean mechanistic story and reverse engineer the underlying circuit represented by the model. Therefore, activation functions can have an impact on the interpretability of a model by influencing the behavior and characteristics of individual neurons.""",0.9,0.8,0.633333
4,"""What is feature?""","""A feature is a property of an input to a model or some subset of that input. It can be a meaningful and articulable property of the input that the network encodes as a direction in activation space. Features can vary depending on the type of model being used, such as curve detector or car detector neurons in a convolutional neural network. However, the concept of a feature is not limited to human-understandable properties and can encompass any \""independent units\"" that a neural network representation can be decomposed into. Defining a feature in a satisfying way can be challenging, but it is an important aspect of understanding and interpreting machine learning models.""",0.9,0.85,1.0
5,"""What is the difference between activation and a feature?""","""Activation refers to the intermediate values computed when running a neural network, specifically the outputs of each layer. On the other hand, a feature can be defined as an \""independent unit\"" that a neural network representation can be decomposed into. In other words, features are the meaningful, articulable properties of the input that the network encodes as directions in activation space. While activations are the values themselves, features are the properties or characteristics that these values represent.""",1.0,0.55,0.75
6,"""What is a circuit?""","""A circuit, in the context provided, refers to a computational subgraph within a model. It is a subset of nodes and edges that are sufficient for performing a specific computation. In this framework, nodes represent components of the model, such as attention heads and neurons, while edges represent the flow of information between these components. The output of each layer in the model is the sum of the outputs of its components, and the input to each layer is the sum of the outputs of every previous layer. This allows for the consideration of subsets of nodes and edges, making it easy to understand the effect of adding or removing terms. Overall, a circuit represents a part of the model that performs a comprehensible computation to generate interpretable features.""",1.0,,
7,"""What is an Induction Head?""","""An Induction Head is a type of head that implements the induction behavior. It attends to the token immediately after an earlier copy of the current token and predicts that the token attended to will come next. It is a statement about the attention pattern and does not provide information about the output of the head (OV circuit).""",1.0,,
8,"""How Induction Head was found?""","""The induction heads were discovered by studying two-layer attention-only models. These models provided a simpler setting to analyze compared to real language models. Through this study, a deep principle of transformers was uncovered, which turned out to be generalizable. The discovery of induction heads in these toy models has sparked excitement and the desire for further research in studying similar models and exploring what can be learned from them.""",0.0,,
9,"""Why induction head is so important?""","""Induction heads are important because they play a crucial role in language modeling and in-context learning. They allow transformers to use tokens from far back in the context to predict the next token, which improves the model's ability to understand and generate coherent and contextually relevant language. This is why there is a visible bump in the loss curve when induction heads form during training.""",,,


In [27]:
tru.get_leaderboard(app_ids=[])

Unnamed: 0_level_0,Answer Relevance,Context Relevance,Groundedness,latency,total_cost
app_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
00_Mech_Interpretability,0.855556,0.716667,0.836111,3.153846,0.00133


In [28]:
tru.run_dashboard()

Starting dashboard ...
Config file already exists. Skipping writing process.
Credentials file already exists. Skipping writing process.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Accordion(children=(VBox(children=(VBox(children=(Label(value='STDOUT'), Output())), VBox(children=(Label(valu…

Dashboard started at http://192.168.68.103:8501 .


<Popen: returncode: None args: ['streamlit', 'run', '--server.headless=True'...>