In [1]:
import sys
sys.path.append("../../")

from shared.sae_actions import load_pretrained_sae, sae_featurize_data 
from shared.models import MiniPileDataset
from shared.features import Feature, FeatureSample

%load_ext autoreload
%autoreload 2

* 'allow_mutation' has been removed


In [2]:
# Load SAE
sae = load_pretrained_sae("../training_sae/saes/spam_messages_test_20241203_013809")

  return torch.load(io.BytesIO(b))


featurize train

In [3]:
import glob
import json
import os

def load_features(feature_dir):
    """Load features from JSON files and convert to Feature objects"""
    # Get all JSON files from the features directory
    feature_files = glob.glob(os.path.join(feature_dir, "*.json"))

    # Load all feature JSONs into Feature objects
    features = []
    for file in feature_files:
        with open(file) as f:
            feature_dict = json.load(f)
            
            # Convert samples to FeatureSample objects
            high_act_samples = [
                FeatureSample(text=s["text"], act=s["act"]) 
                for s in feature_dict["high_act_samples"]
            ]
            low_act_samples = [
                FeatureSample(text=s["text"], act=s["act"])
                for s in feature_dict["low_act_samples"] 
            ]
            
            feature = Feature(
                index=feature_dict["index"],
                label=feature_dict["label"],
                attributes=feature_dict["attributes"],
                reasoning=feature_dict["reasoning"],
                density=feature_dict["density"],
                confidence=feature_dict["confidence"],
                high_act_samples=high_act_samples,
                low_act_samples=low_act_samples
            )
            features.append(feature)

    # Sort features by index to maintain order
    features.sort(key=lambda x: x.index)

    # Extract labels and ids for easy reference
    autointerp_feature_labels = [f.label for f in features]
    autointerp_feature_ids = [f.index for f in features]

    return features, autointerp_feature_labels, autointerp_feature_ids

# Load features
feature_dir = "../feature_extraction/features/20241203_021311"
features, autointerp_feature_labels, autointerp_feature_ids = load_features(feature_dir)


In [5]:
import pandas as pd
import numpy as np
import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize

def prepare_dataset(sentences_file, embeddings_file, sae, autointerp_feature_ids=None, feature_registry_file="feature_registry.npy", label_column='label', text_key="text", split_sentences=True):
    # Load dataset
    df = pd.read_csv(sentences_file)
    
    # Create MiniPileDataset
    mini_pile_dataset = MiniPileDataset(sentences_file, embeddings_file, key=text_key)
    
    # Featurize data
    try:
        X = np.memmap(
            feature_registry_file,
            dtype="float32",
            mode="r",
            shape=(sae.encoder.weight.shape[0], len(mini_pile_dataset.sentences)),
        )
    except FileNotFoundError:
        X = sae_featurize_data(mini_pile_dataset, sae, output_file=feature_registry_file)
    
    X = X.T
    
    # Apply num_features if specified
    if autointerp_feature_ids is not None:
        X = X[:, autointerp_feature_ids]
    
    y = np.where(df[label_column] == 'ham', 0, 1)
    
    if split_sentences:
        X_split = []
        y_split = []
        for i, text in enumerate(df[text_key]):
            sentences = sent_tokenize(text)
            X_split.extend([X[i]] * len(sentences))
            y_split.extend([y[i]] * len(sentences))
        X = np.array(X_split)
        y = np.array(y_split)
    
    return X, y

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [6]:
sentences_file = "../data_preparation/data/spam_messages_train.csv"
embeddings_file = "../data_preparation/embedding_chunks/embedded_chunks/spam_messages_train_20241106_234743/embeddings.npy"
X_train, y_train = prepare_dataset(sentences_file, embeddings_file, sae, autointerp_feature_ids=autointerp_feature_ids, feature_registry_file="feature_registry_train_all.npy")

In [7]:
sentences_file = "../data_preparation/data/spam_messages_val.csv"
embeddings_file = "../data_preparation/embedding_chunks/embedded_chunks/spam_messages_val_20241107_005540/embeddings.npy"
X_val, y_val = prepare_dataset(sentences_file, embeddings_file, sae, autointerp_feature_ids=autointerp_feature_ids, feature_registry_file="feature_registry_val_all.npy")

In [8]:
sentences_file = "../data_preparation/data/spam_messages_test.csv"
embeddings_file = "../data_preparation/embedding_chunks/embedded_chunks/spam_messages_test_20241107_005747/embeddings.npy"
X_test, y_test = prepare_dataset(sentences_file, embeddings_file, sae, autointerp_feature_ids=autointerp_feature_ids, feature_registry_file="feature_registry_test_all.npy")

decision tree intervention

In [9]:
from itertools import combinations


class FeatureMixer:
  def __init__(self, feature_group):
    self.feature_group = feature_group

  def grid(self, k_features_per_combo: int =2):
    """Perform a grid search over all possible combinations of features"""

    # Get all possible combinations of features
    return list(combinations(self.feature_group, k_features_per_combo))

In [11]:
from sklearn import tree
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.metrics import balanced_accuracy_score, accuracy_score, f1_score
import numpy as np
import tqdm
import concurrent.futures

# Grid search may take a while, you can curate the feature list to speed this process up significantly
def train_tree(x, y, depth):
  train_x, test_x, train_y, test_y = train_test_split(x, y, train_size=0.5, random_state=42)

  # Create a nice regularized tree
  model = tree.DecisionTreeClassifier(
      max_depth=depth,
      min_samples_leaf=len(train_x) // 20,
      random_state=42
  )

  model.fit(train_x, train_y)

  pred = model.predict(test_x)

  # Calculate the f1 score of the model
  accuracy = balanced_accuracy_score(test_y, pred)
  score = f1_score(test_y, pred)

  return model, pred, score, accuracy


def find_best_combo(features, k_features_per_combo = 2):
  combos = FeatureMixer(features).grid(k_features_per_combo=k_features_per_combo)
  best_combo = None
  best_model = None
  best_score = 0
  best_accuracy = 0

  MAX_WORKERS = 8

  futures_list = []

  with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    for combo in combos:
      def _test_combo(combo):
        def _select_feature_acts(combo, row):
          output = []
          for index, feature in enumerate(combo):
            for feature_act in row:
              if feature_act.feature.index == feature.index:
                output.append(feature_act.activation)
                break

          return output

        model, pred, score, accuracy = train_tree(X_train, y_train, depth=len(combo))

        return model, pred, score, accuracy, combo

      futures_list.append(executor.submit(_test_combo, combo))

    for future in tqdm.tqdm(futures_list):
      model, pred, score, accuracy, combo = future.result()

      if score > best_score:
        best_score = score
        best_combo = combo
        best_model = model
        best_accuracy = accuracy

  return best_combo, best_score, best_model, best_accuracy


best_combo_at_k = {}
for i in range(2):
  best_combo, best_score, best_model, best_accuracy = find_best_combo(features, k_features_per_combo = i + 1)
  print(i + 1, best_combo, best_score, best_accuracy, best_model)
  best_combo_at_k[i + 1] = (best_combo, best_score, best_model)


100%|██████████| 200/200 [00:20<00:00,  9.90it/s]


1 (Feature(index=166, label='Formal professional correspondence', attributes='formal language and structure, reference to organizational context, complete and structured sentences, calls to action or responses', reasoning='Upon analyzing both sets of samples, the texts from the first set consistently display elements of formality, official language, and often contain organizational or professional context. This includes mentions of formal titles, structured communication styles, and references to institutional practices. In contrast, the texts from the second set are casual, informal, and often contain unfinished thoughts or informal language. Moreover, the first set often includes complete structured sentences and explicit calls to action, such as invitations or requests for responses, which are absent in the second set.', density=0.004195095938602364, confidence=20.0, high_act_samples=[FeatureSample(text='dear sir / madam , my name is ahmed abdalla , director and board member , trans

 39%|███▊      | 7676/19900 [19:45<36:21,  5.60it/s]  

In [None]:
# Function to visualize the decision tree
def visualize_tree(tree_model, features, class_names=['negative', 'positive']):
    import graphviz
    dot_data = tree.export_graphviz(
        tree_model, 
        out_file=None, 
        feature_names=[feature.label for feature in features],
        class_names=class_names,
        filled=True,
        rounded=True,
        special_characters=True
    )
    graph = graphviz.Source(dot_data)
    return graph

In [None]:
# Anyways let's look at the best overall tree
BEST_TREE_INDEX = 3
best_features = best_combo_at_k[BEST_TREE_INDEX][0]
best_score = best_combo_at_k[BEST_TREE_INDEX][1]
best_tree = best_combo_at_k[BEST_TREE_INDEX][2]

# Visualize the tree
print(best_tree)
print(best_features)
print(best_score)
visualize_tree(best_tree, best_features)

feature selection

In [20]:
from sklearn.ensemble import RandomForestClassifier
from boruta import BorutaPy

boruta_selector = BorutaPy(
  RandomForestClassifier(n_jobs=-1, class_weight="balanced", max_depth=5),
  n_estimators="auto",
  verbose=0,
  random_state=1,
)
boruta_selector.fit(X_train[:,:200], y_train)

In [21]:
boruta_selector.n_features_

95

In [9]:
boruta_selector.ranking_

array([ 50,  41,   1,   2,   1,   2,  44,  19,   3,  77, 113,   2,  47,
        66,  94,  48, 113,   1, 113,   1,   2,  69,   1,   1,   1,   1,
         1,  15,   1,  44,  88,   1,   1,   1,   1,  99, 113,   1,   1,
        30,  81,   1,  85,  65,  71,   1,  91,  54,   1,  46,   1,   1,
        32,  28,   1,   1,   1, 105,   1,   1,   1, 103,  38,   1,  60,
        20,   1,  82,   1, 104,  63,   1,  21,   1,   1,  53,  56,   1,
         1,  40,   1,   1,   3, 113,  34,   1,  33,  77,   1,   1,   1,
        15,  70,  34,   5,  84,  95,  44,   1,   1,  38, 113,   7,  83,
         1,  17,  28,  26,  85,  60,  93, 113,  59,   1,   1,  15,  42,
         7,  10,   1,   1,  52,  10,   3,  79,  75,   1,  36,  95,  71,
       113,   1,   1,   1,  22,  91, 106,   1,  89,  67, 101,   1,  63,
       102,  13, 113,  18, 113, 113,   1,   1,  98,   1,  90,   1,   1,
         5, 113, 113,  74,  49, 113,   1,   1,  50,  77,  68,   1,   2,
        23,  79,  25,   1,  37,  99,  56,  12,   1,   1,   1,   

In [22]:
features_folder = "../feature_extraction/features/20241106_222955"

def get_feature_labels_from_mask(mask):
    import json
    import os
    from tabulate import tabulate
    
    feature_info = []
    for i, is_selected in enumerate(mask):
        if is_selected:
            feature_file = os.path.join(features_folder, f"feature_{i}.json")
            with open(feature_file) as f:
                feature_data = json.load(f)
                feature_info.append({
                    'index': i,
                    'label': feature_data['label'],
                    'confidence': feature_data.get('confidence', 'N/A')
                })
    
    # Print nicely formatted table
    headers = ['Index', 'Label', 'Confidence']
    table = [[info['index'], info['label'], info['confidence']] for info in feature_info]
    print(tabulate(table, headers=headers, tablefmt='grid'))
    
    return [info['label'] for info in feature_info]


In [23]:
print("\nSelected Features:")
get_feature_labels_from_mask(boruta_selector.support_weak_)


Selected Features:
+---------+----------------------------------------------+--------------+
|   Index | Label                                        |   Confidence |
|       4 | Contains contact information                 |           80 |
+---------+----------------------------------------------+--------------+
|      20 | Spam or promotional indication               |           26 |
+---------+----------------------------------------------+--------------+
|      21 | Fragmented, urgent, obfuscated text patterns |            6 |
+---------+----------------------------------------------+--------------+
|      39 | Text with special patterns and symbols       |           75 |
+---------+----------------------------------------------+--------------+
|      97 | Contains direct call to action               |           30 |
+---------+----------------------------------------------+--------------+
|     121 | Spam-like and promotional structure          |           30 |
+---------+-------

['Contains contact information',
 'Spam or promotional indication',
 'Fragmented, urgent, obfuscated text patterns',
 'Text with special patterns and symbols',
 'Contains direct call to action',
 'Spam-like and promotional structure',
 'Financial gain discussion',
 'Repetitive text sequences',
 'Promotional or sensitive content presence',
 'Structured financial and web content']

ok let's actually train it and evaluate it

In [24]:
from classifier_model import BinaryClassifierModel

# Initialize model
model = BinaryClassifierModel()

# Train model
model.train_model(
    X_train=X_train, 
    y_train=y_train,
    X_val=X_val,
    y_val=y_val,
    use_feature_selection=False
)

# Evaluate model
model.evaluate_model(X_test, y_test)

Scaling features...
Will try 5 different max_depths and 3 different n_estimators
Total combinations to try: 15


Training Progress:   0%|          | 0/15 [00:00<?, ?it/s]


Iteration Results (max_depth=10, n_estimators=10):
Accuracy: 0.6544
Precision: 0.9940
Recall: 0.2747
F1: 0.4304
AUC-ROC: 0.7318

Iteration Results (max_depth=10, n_estimators=50):
Accuracy: 0.6546
Precision: 0.9909
Recall: 0.2760
F1: 0.4317
AUC-ROC: 0.7341

Iteration Results (max_depth=10, n_estimators=100):
Accuracy: 0.6560
Precision: 0.9923
Recall: 0.2785
F1: 0.4349
AUC-ROC: 0.7347

Iteration Results (max_depth=20, n_estimators=10):
Accuracy: 0.6584
Precision: 0.9925
Recall: 0.2835
F1: 0.4410
AUC-ROC: 0.7327

Iteration Results (max_depth=20, n_estimators=50):
Accuracy: 0.6569
Precision: 0.9926
Recall: 0.2802
F1: 0.4370
AUC-ROC: 0.7350

Iteration Results (max_depth=20, n_estimators=100):
Accuracy: 0.6570
Precision: 0.9924
Recall: 0.2805
F1: 0.4374
AUC-ROC: 0.7374

Iteration Results (max_depth=30, n_estimators=10):
Accuracy: 0.6563
Precision: 0.9793
Recall: 0.2828
F1: 0.4389
AUC-ROC: 0.7318

Iteration Results (max_depth=30, n_estimators=50):
Accuracy: 0.6552
Precision: 0.9721
Recall: 

let's try it with a logistic regression model

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import numpy as np

# Initialize scaler and model
scaler = MinMaxScaler(feature_range=(0, 1))
model = LogisticRegression(max_iter=1000)

# Scale the features
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Train model
model.fit(X_train_scaled, y_train)

# Get predictions on validation set
y_pred_val = model.predict(X_val_scaled)
y_pred_proba_val = model.predict_proba(X_val_scaled)[:, 1]

# Calculate validation metrics
print("\nValidation Metrics:")
print(f"Accuracy: {accuracy_score(y_val, y_pred_val):.4f}")
print(f"Precision: {precision_score(y_val, y_pred_val):.4f}")
print(f"Recall: {recall_score(y_val, y_pred_val):.4f}")
print(f"F1: {f1_score(y_val, y_pred_val):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_val, y_pred_proba_val):.4f}")

# Get predictions on test set
y_pred_test = model.predict(X_test_scaled)
y_pred_proba_test = model.predict_proba(X_test_scaled)[:, 1]

# Calculate test metrics
print("\nTest Metrics:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_test):.4f}")
print(f"Precision: {precision_score(y_test, y_pred_test):.4f}")
print(f"Recall: {recall_score(y_test, y_pred_test):.4f}")
print(f"F1: {f1_score(y_test, y_pred_test):.4f}")
print(f"AUC-ROC: {roc_auc_score(y_test, y_pred_proba_test):.4f}")

# Print feature indexes with the highest weights and their labels
feature_weights = model.coef_[0]
top_feature_indices = np.argsort(np.abs(feature_weights))[::-1][:10]  # Get top 10 feature indices
print("\nTop 10 Features with Highest Weights:")

# Create boolean mask for get_feature_labels_from_mask
mask = np.zeros(len(feature_weights), dtype=bool)
mask[top_feature_indices] = True

# Get labels and print table with weights
feature_labels = get_feature_labels_from_mask(mask)
for idx, label in zip(top_feature_indices, feature_labels):
    print(f"Feature {idx} ({label}): weight = {feature_weights[idx]:.4f}")



Validation Metrics:
Accuracy: 0.6817
Precision: 0.9637
Recall: 0.2387
F1: 0.3826
AUC-ROC: 0.6811

Test Metrics:
Accuracy: 0.6944
Precision: 0.9498
Recall: 0.2355
F1: 0.3774
AUC-ROC: 0.6744

Top 10 Features with Highest Weights:
+---------+----------------------------------------------+--------------+
|   Index | Label                                        |   Confidence |
|      60 | Business and finance communication in Polish |           79 |
+---------+----------------------------------------------+--------------+
|      68 | Business and operational references          |           16 |
+---------+----------------------------------------------+--------------+
|      71 | Incomplete URL presence                      |           79 |
+---------+----------------------------------------------+--------------+
|      81 | Mentions Vikings and sports content          |           80 |
+---------+----------------------------------------------+--------------+
|      89 | Finance and economi