In [None]:
# Goal: Find where in the LLM truth is encoded most and least strongly
## Assumption: The layer whose embeddings give the best probe model is the layer that best encodes truth
## Hypothesis 2: When multiple sentences are combined into a paragraph, the truth signal will spike across the individual sentences

In [1]:
from huggingface_hub import notebook_login
# Display the Hugging Face Hub login widget.
# NOTE(review): presumably only needed for gated/private downloads — confirm
# whether the model loaded later in this notebook actually requires it.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [82]:
# ## Data
# Load the labelled statements and pull out the text and label columns.
import pandas as pd
df = pd.read_csv('./data/animals_true_false.csv')
sentences, labels = df['statement'], df['label']
df

Unnamed: 0,statement,label
0,The giant anteater uses walking for locomotion.,1
1,The eagle has a habitat of urban/wild.,0
2,The tortoise has an iridescent tail with eye-l...,0
3,"Human uses for hyena include conservation, res...",0
4,The platypus uses swimming for locomotion.,1
...,...,...
1003,The pigeon has the atomic number of mammal.,0
1004,The kangaroo is a mammal.,1
1005,The raccoon has a diet of omnivore.,1
1006,The chimpanzee has a habitat of mountain.,0


In [29]:
import pandas as pd
from itertools import zip_longest

# Read the data
df = pd.read_csv('./data/animals_true_false.csv')
sentences = df['statement']
labels = df['label']

# Group sentences by class
grouped = df.groupby('label')['statement'].apply(list)

# Merge consecutive sentences into pairs within each class
merged_sentences = []
merged_classes = []

# The loop variable must not be named `sentences`: that would overwrite the
# Series assigned above and leave it out of step with `labels` in later cells.
for class_val, class_sentences in grouped.items():
    # An odd-sized class leaves its final sentence unpaired. filter(None, ...)
    # drops the '' filler so that sentence does not gain a trailing space
    # (which would otherwise become the last token of the merged text).
    pairs = [
        ' '.join(filter(None, pair))
        for pair in zip_longest(class_sentences[::2], class_sentences[1::2], fillvalue='')
    ]
    merged_sentences.extend(pairs)
    merged_classes.extend([class_val] * len(pairs))

# Create a new DataFrame with merged sentences and their classes
merged_df = pd.DataFrame({'merged_sentence': merged_sentences, 'class': merged_classes})

In [61]:
# merged_df
# sentences = merged_df['merged_sentence']
# labels = merged_df['class']

In [62]:
from transformers import AutoTokenizer, AutoModelForCausalLM
def init_model(model_name):
  """Load a causal language model and its tokenizer by name.

  Args:
    model_name: Identifier passed unchanged to both `from_pretrained` calls.

  Returns:
    A `(model, tokenizer)` tuple — note the model comes first.
  """
  # Tokenizer is loaded first, then the model weights.
  tok = AutoTokenizer.from_pretrained(model_name)
  lm = AutoModelForCausalLM.from_pretrained(model_name)
  return lm, tok

In [83]:

def generate_embeddings(data, model, tokenizer, num_layers=49):
    """Collect the last-token hidden state of every layer for each item.

    Each item is wrapped in a fixed prompt, tokenized, and passed through
    `model.generate` for exactly one new token with hidden states requested.

    Args:
        data: Iterable of statements (anything that formats into an f-string).
        model: Object exposing `generate(...)` that returns an output with a
            `hidden_states` attribute.
        tokenizer: Callable returning an object with `input_ids` and
            `attention_mask`.
        num_layers: Number of layer slots to pre-create in the result. The
            default of 49 is the count this notebook uses for gpt2-xl. Models
            that return more hidden states get extra slots on demand instead
            of raising KeyError.

    Returns:
        Dict mapping layer index -> list with one last-token hidden state per
        item, in input order. Slots that receive no hidden state stay empty.
    """
    # Pre-seed the expected layers so callers can index them even if `data` is empty.
    layer_embeddings = {layer: [] for layer in range(num_layers)}
    for item in data:
        input_text = f'Evaluate the truth of these statements: {item}'
        tokens = tokenizer(input_text, return_tensors='pt')
        output = model.generate(
            tokens.input_ids,
            attention_mask=tokens.attention_mask,
            output_hidden_states=True,
            return_dict_in_generate=True,
            max_new_tokens=1,
            min_new_tokens=1,
        )
        # hidden_states[0] is the first generation step; it holds one entry per layer.
        first_step_states = output.hidden_states[0]
        for layer, hidden in enumerate(first_step_states):
            # [0] selects the only sequence in the batch, [-1] its final token.
            layer_embeddings.setdefault(layer, []).append(hidden[0][-1])

    return layer_embeddings

In [1]:
import os

import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras import Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

def probe(embeddings, layer, input_dim, targets=None):
  """Train and evaluate a small MLP probe on one layer's embeddings.

  Args:
    embeddings: Sequence of per-statement vectors for `layer`; converted with
      np.array, so all vectors must have the same length.
    layer: Layer index, used to name the saved embedding file and the
      results-log entry.
    input_dim: Length of each embedding vector. Declared as the network's
      input shape so the classifier is tied to this width.
    targets: Optional binary labels aligned with `embeddings`. When omitted,
      the notebook-level `labels` variable is used, as before.

  Side effects:
    Saves the embedding matrix via save_embeddings() and appends the test
    loss and accuracy via save_data(). Returns nothing.
  """
  X = np.array(embeddings)
  # Fall back to the notebook-level `labels` so existing three-argument calls keep working.
  y = np.array(labels if targets is None else targets)

  save_embeddings(X, layer)

  # Fixed random_state keeps the train/test partition identical across layers.
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

  # Define and compile the classifier (named so it cannot be confused with
  # the notebook-level language model `model`).
  classifier = Sequential()
  classifier.add(Input(shape=(input_dim,)))
  classifier.add(Dense(256, activation='relu'))
  classifier.add(Dropout(0.5))
  classifier.add(Dense(128, activation='relu'))
  classifier.add(Dropout(0.5))
  classifier.add(Dense(64, activation='relu'))
  classifier.add(Dropout(0.5))
  classifier.add(Dense(1, activation='sigmoid'))  # Assuming binary classification
  classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

  # Train the model
  classifier.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

  # Evaluate the model on the held-out split and log the result for this layer
  test_loss, test_acc = classifier.evaluate(X_test, y_test)
  save_data(test_loss, test_acc, layer)

def save_embeddings(embeddings, layer):
  """Write one layer's embedding matrix to ./data/embeddings/gpt2/xl_<layer>.npy.

  The target directory is created on demand: np.save does not create missing
  parent directories and would otherwise raise FileNotFoundError.

  Args:
    embeddings: Array-like to store.
    layer: Layer index used in the file name.
  """
  save_path = f'./data/embeddings/gpt2/xl_{layer}.npy'
  os.makedirs(os.path.dirname(save_path), exist_ok=True)
  np.save(save_path, embeddings)

def save_data(test_loss, test_acc, layer):
  """Append a '<layer>: <loss> <accuracy>' line to ./gpt2_xl_location.txt."""
  with open('./gpt2_xl_location.txt', 'a') as log:
    # print() supplies the trailing newline for each record.
    print(f'{layer}: {test_loss} {test_acc}', file=log)



In [87]:
# Main: load GPT-2 XL, then extract per-layer embeddings for every statement.
# The embedding pass calls the model once per statement, so it is slow, but it
# must run in every fresh kernel: the probing loop below reads `embeddings`
# and this is the only place that variable is assigned.
model, tokenizer = init_model('openai-community/gpt2-xl')
embeddings = generate_embeddings(sentences, model, tokenizer)

In [91]:
# Hidden-state width of the loaded model; the probing loop passes this value
# to probe() as `input_dim`.
model.config.hidden_size

1600

In [None]:
# Probe every layer that produced embeddings. Layer indices come from the
# `embeddings` dict rather than a hard-coded count, so the loop stays correct
# if a model with a different depth is loaded.
for layer, layer_vectors in embeddings.items():
  if not layer_vectors:
    continue  # no hidden state was recorded for this layer index
  probe(layer_vectors, layer, model.config.hidden_size)