In [None]:
# Notebook to (i) extract the L17 & L21 hidden states of the last token from the Llama 3 model
#                 on the SF/NP-v-control data (SF = strict-father, NP = nurturing-parent)
#             (ii) fit logistic regression models on these features

# pip install nnsight transformers torch scikit-learn seaborn


In [None]:
# load llama3
import os

import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Read the Hugging Face access token from the HF_TOKEN environment variable instead of
# pasting it into the notebook, so a real secret never ends up in a saved/committed file.
# If HF_TOKEN is unset, XT is None and that value is passed on as the token.
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
XT = os.environ.get("HF_TOKEN")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=XT)
# Reuse the EOS token id as the padding id so that padded (batched) encoding works
tokenizer.pad_token_id = tokenizer.eos_token_id
# Half-precision weights, placed on the available device(s) via device_map="auto"
model = AutoModelForCausalLM.from_pretrained(MODEL_ID,
                                             token=XT, device_map="auto", torch_dtype=torch.float16)

#!nvidia-smi

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

In [3]:
# Function to get the output of Layer X
def get_hidden_LT(texts, model, tokenizer, layer=17, token_pos=-1):
    """Return the hidden state of a single token per input text at a given layer.

    Args:
        texts: A string or a list of strings to encode.
        model: Model that is called with ``output_hidden_states=True`` and whose
            output exposes ``hidden_states``.
        tokenizer: Tokenizer used to encode ``texts`` (padded, truncated to 512 tokens).
        layer: Index into ``outputs.hidden_states``.
        token_pos: Position of the token to extract, counted among the non-padding
            tokens of each text (-1 == last real token, 0 == first real token).

    Returns:
        NumPy array of shape (hidden_dim,) when one text is encoded, or
        (n_texts, hidden_dim) when several texts are encoded.

    Raises:
        IndexError: If ``token_pos`` is out of range for one of the texts.
    """
    # Send the inputs to the model's own device rather than assuming "cuda"
    device = getattr(model, "device", None)
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)
    # Hidden states of the requested layer, indexed below as (batch, seq, hidden)
    hidden_states = outputs.hidden_states[layer]

    attention_mask = inputs.get("attention_mask")
    if attention_mask is None:
        # No mask available: index the raw sequence axis (a single position, not a slice)
        picked = hidden_states[:, token_pos]
    else:
        # Select among the real tokens only, so padding never gets picked in a batch
        # (works for both left- and right-padded inputs)
        keep = attention_mask.to(hidden_states.device).bool()
        picked = torch.stack(
            [states[row_keep][token_pos] for states, row_keep in zip(hidden_states, keep)]
        )
    # Drop the batch axis for a single text so callers still get a flat vector
    return picked.squeeze(0).cpu().numpy()


In [None]:
# Load Data
import pandas as pd
# Read the probing stories and print how many examples each frame label has
df_ = pd.read_csv("../1-Generation-and-Dataset/data-stories-for-probing-20240525.csv")
print(df_.frame_label.value_counts())

# Shared binary target: either parenting frame -> 1, control -> 0
frame_to_target = {'strict-father': 1, 'nurturing-parent':1, 'control': 0}

# Task 1 (SF-v-control): drop the nurturing-parent stories
df1 = df_.loc[df_["frame_label"] != 'nurturing-parent']
labels1 = np.array(df1["frame_label"].map(frame_to_target))
texts1 = df1["story"].tolist()

# Task 2 (NP-v-control): drop the strict-father stories
df2 = df_.loc[df_["frame_label"] != 'strict-father']
labels2 = np.array(df2["frame_label"].map(frame_to_target))
texts2 = df2["story"].tolist()
#print(df_.story.apply(lambda x: len(x.split())).max())  # longest text is 213 words  ≈ 425 tokens

frame_label
nurturing-parent    39
control             38
strict-father       38
Name: count, dtype: int64


In [5]:
# One forward pass per story; each call yields that story's feature vector
# (get_hidden_LT defaults: layer=17, token_pos=-1)
features1 = [get_hidden_LT(story, model, tokenizer) for story in texts1]
features2 = [get_hidden_LT(story, model, tokenizer) for story in texts2]

# Sanity check: number of examples and the shape of one feature vector per task
print(len(features1), features1[0].shape)  # 76 (4096,)
print(len(features2), features2[0].shape)  # 77 (4096,)

76 (4096,)
77 (4096,)


In [6]:
# # Export as Pickle
import pickle
# with open('hidden-sf-17.pkl', 'wb') as p:
#     dat = {'features': features1, 'labels': labels1, 'info':'SFvctrl @L17'}
#     pickle.dump(dat, p)

# with open('hidden-np-17.pkl', 'wb') as p:
#     dat = {'features': features2, 'labels': labels2, 'info':'NPvctrl @L17'}
#     pickle.dump(dat, p)

In [7]:
# CLASSIFY SF
print("Classifying SF")
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import RFE

# Hold out 20% of the SF-v-control examples for testing (fixed seed)
X1, y1 = np.array(features1), np.array(labels1)
X_train1, X_test1, y_train1, y_test1 = train_test_split(
    X1, y1, test_size=0.2, random_state=42
)

# Recursive feature elimination around a liblinear logistic regression keeps 5
# feature columns; the selection is fitted on the training split only
clf1 = LogisticRegression(solver='liblinear')
rfe1 = RFE(estimator=clf1, n_features_to_select=5)
print("going for rfe")
rfe1.fit(X_train1, y_train1)
print("done, now clf")

# Reduce both splits to the selected columns, then fit the classifier on them
X_train1_rfe, X_test1_rfe = rfe1.transform(X_train1), rfe1.transform(X_test1)
clf1.fit(X_train1_rfe, y_train1)

# Predict on the held-out and the training split
y_test_pred1, y_train_pred1 = clf1.predict(X_test1_rfe), clf1.predict(X_train1_rfe)

print(
    "Test/Train Accuracy≈F1",
    accuracy_score(y_test1, y_test_pred1),
    accuracy_score(y_train1, y_train_pred1),
)
# For per-class metrics: print(classification_report(y_test1, y_test_pred1))
print("Selected Feats:", np.flatnonzero(rfe1.support_))  # also rfe1.ranking_

# WOW 0.94 => RFE-10: lowers it to 0.88, not bad (same as RFE-100), RFE5= 0.94/0.93, RFE3=0.88/0.85, RFE1=0.81/0.78
# Selected Feats: [ 133** 1292 2040* 2692*** 3261]

Classifying SF
going for rfe


done, now clf
Test/Train Accuracy≈F1 0.9375 0.9333333333333333
Selected Feats: [ 133 1292 2040 2692 3261]


In [8]:
# CLASSIFY NP
print("Classifying NP")

# Hold out 20% of the NP-v-control examples for testing (fixed seed)
X2, y2 = np.array(features2), np.array(labels2)
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    X2, y2, test_size=0.2, random_state=42
)

# Recursive feature elimination around a liblinear logistic regression keeps 5
# feature columns; the selection is fitted on the training split only
clf2 = LogisticRegression(solver='liblinear')
rfe2 = RFE(estimator=clf2, n_features_to_select=5)
print("going for rfe")
rfe2.fit(X_train2, y_train2)
print("done, now clf")

# Reduce both splits to the selected columns, then fit the classifier on them
X_train2_rfe, X_test2_rfe = rfe2.transform(X_train2), rfe2.transform(X_test2)
clf2.fit(X_train2_rfe, y_train2)

# Predict on the held-out and the training split
y_testpred2, y_trainpred2 = clf2.predict(X_test2_rfe), clf2.predict(X_train2_rfe)

print(
    "Test/Train Accuracy≈F1",
    accuracy_score(y_test2, y_testpred2),
    accuracy_score(y_train2, y_trainpred2),
)
# For per-class metrics: print(classification_report(y_test2, y_testpred2))
print("Selected Feats:", np.flatnonzero(rfe2.support_))

# => RFE40=0.94!, RFE30=0.88, RFE20=0.94!, RFE10=0.88, RFE5=0.88/0.90, RFE2=0.94/0.86, RFE1=0.88/0.80
# Selected Feats: [ 529*** 540 1658** 1707 2209]

Classifying NP
going for rfe


done, now clf
Test/Train Accuracy≈F1 0.875 0.9016393442622951
Selected Feats: [ 529  540 1658 1707 2209]


In [9]:
# import matplotlib.pyplot as plt

# # Extract the coefficients (beta values)
# coefficients1 = clf1.coef_[0]

# # Plot the coefficients to see their magnitude
# plt.figure(figsize=(10, 6))
# plt.bar(range(len(coefficients1)), coefficients1)
# plt.xlabel('Feature Index')
# plt.ylabel('Coefficient Value')
# plt.title('Feature Importance (Coefficient Values)')
# plt.show()

# # Identify the most important features (largest absolute coefficients)
# important_features = np.argsort(np.abs(coefficients1))[::-1]
# print("Top 10 most important features (by absolute value of coefficients):")
# print(important_features[:10])
# print("Corresponding coefficient values:")
# print(coefficients1[important_features[:10]])