In [1]:
from torchvision import transforms
import glob
import pathlib
import torch
import torch.utils.data as dt
import torchvision

In [None]:
import numpy as np

def soft_thresholding(x, lambd):
    """Apply the element-wise soft-thresholding (shrinkage) operator.

    Each entry of ``x`` has its magnitude reduced by ``lambd``; entries whose
    magnitude does not exceed ``lambd`` become zero. The sign of every
    surviving entry is preserved.

    Args:
        x: Scalar or array of values to shrink.
        lambd: Threshold subtracted from the magnitudes.

    Returns:
        The shrunk values, as produced by NumPy broadcasting of ``x`` and
        ``lambd``.
    """
    shrunk_magnitude = np.abs(x) - lambd
    # Negative magnitudes mean the entry fell inside the threshold band.
    clipped_magnitude = np.maximum(shrunk_magnitude, 0)
    return np.sign(x) * clipped_magnitude

def fista_update(X, D, alpha, lambd, max_iter=100):
    """Compute sparse codes for ``X`` under dictionary ``D`` using FISTA.

    Runs ``max_iter`` accelerated proximal-gradient iterations on
    ``0.5 * ||X - D @ Z||**2 + lambd * ||Z||_1``, always starting from zero.

    Args:
        X: Data matrix; ``D @ Z`` must have the same shape as ``X``.
        D: Dictionary matrix (2-D).
        alpha: Only its shape and dtype are used, to allocate the zero
            starting point. Its values are NOT used as a warm start.
        lambd: L1 regularisation weight.
        max_iter: Number of FISTA iterations to run.

    Returns:
        The sparse code matrix ``Z`` after the final iteration. If ``D`` has
        zero spectral norm (e.g. an all-zero dictionary) the zero matrix is
        returned, since the data-fit term no longer depends on ``Z``.
    """
    Z = np.zeros_like(alpha)

    # Squared spectral norm of D; used as the step-size denominator.
    L = np.linalg.norm(D, ord=2) ** 2
    if L == 0:
        # Dividing by L would turn every entry into NaN. With a zero
        # dictionary the objective reduces to lambd * ||Z||_1 plus a constant,
        # which is minimised by Z = 0.
        return Z

    # Loop-invariant pieces of the gradient step, computed once.
    scaled_Dt = (1 / L) * D.T
    threshold = lambd / L

    t = 1
    Y = np.copy(Z)

    for _ in range(max_iter):
        # Z is rebound (not mutated) below, so no copy is needed here.
        Z_old = Z
        # Gradient step on the quadratic term followed by the L1 prox.
        Z = soft_thresholding(Y + scaled_Dt @ (X - D @ Y), threshold)
        # Momentum update: extrapolate along Z - Z_old.
        t_new = (1 + np.sqrt(1 + 4 * t ** 2)) / 2
        Y = Z + ((t - 1) / t_new) * (Z - Z_old)
        t = t_new

    return Z

def dictionary_update(X, Z):
    """Refit the dictionary to the data given fixed sparse codes.

    Args:
        X: Data matrix.
        Z: Sparse code matrix; its pseudo-inverse must be right-multipliable
            with ``X``.

    Returns:
        ``X`` multiplied by the Moore-Penrose pseudo-inverse of ``Z``.
    """
    codes_pinv = np.linalg.pinv(Z)
    return np.matmul(X, codes_pinv)

# Demo: alternating minimisation on random Gaussian data.
np.random.seed(0)
X = np.random.standard_normal((100, 50))      # data matrix
D = np.random.standard_normal((100, 30))      # initial dictionary
alpha = np.random.standard_normal((30, 50))   # initial sparse codes
lambd = 0.1                                   # regularisation parameter

# Ten rounds: re-solve the sparse codes, then refit the dictionary to them.
for _ in range(10):
    alpha = fista_update(X, D, alpha, lambd)
    D = dictionary_update(X, alpha)

print("Learned dictionary:", D)
print("Sparse codes:", alpha)

In [3]:
from datasets import load_dataset
# One FineWeb-Edu crawl snapshot, opened with streaming=True.
fw = load_dataset(
    "HuggingFaceFW/fineweb-edu",
    name="CC-MAIN-2024-10",
    split="train",
    streaming=True,
)

In [None]:
# Print the first 21 streamed records (the loop stops once the count exceeds 20).
i = 0  # stays 0 if the stream yields nothing
for i, x in enumerate(fw, start=1):
    print(x)
    if i > 20:
        break

In [5]:
from transformers import BertModel

# BERT base (uncased) requested in float16 with the "sdpa" attention implementation.
model = BertModel.from_pretrained(
    "bert-base-uncased",
    torch_dtype=torch.float16,
    attn_implementation="sdpa",
)

In [None]:
# # Function to extract activations from the BERT model
# def extract_activations(text, model, tokenizer):
#     inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
#     with torch.no_grad():
#         outputs = model(**inputs)
#     return outputs.last_hidden_state

# # Extract activations for the first 10 entries
# activations = []
# for i, entry in enumerate(fw):
#     if i >= 10:
#         break
#     text = entry['text']  # Assuming the dataset has a 'text' field
#     activation = extract_activations(text, model, tokenizer)
#     activations.append(activation)

# # Print the activations
# for i, activation in enumerate(activations):
#     print(f"Entry {i} activations: {activation}")

In [None]:
import torch
from transformers import AutoTokenizer, BloomModel

# Pull the tokenizer and the model from the same Hugging Face checkpoint,
# then switch the model to evaluation mode.
BLOOM_CHECKPOINT = "bigscience/bloom-560m"
tokenizer = AutoTokenizer.from_pretrained(BLOOM_CHECKPOINT)
model = BloomModel.from_pretrained(BLOOM_CHECKPOINT)
model.eval()

def extract_embeddings(text, model, tokenizer):
    """Return one mean-pooled embedding vector per hidden state of ``model``.

    The text is tokenized, run through the model without gradient tracking,
    and every tensor in ``outputs.hidden_states`` is averaged over its first
    two axes.

    Args:
        text: Input passed straight to ``tokenizer``.
        model: Callable accepting the tokenizer output as keyword arguments
            plus ``output_hidden_states=True``, and returning an object with a
            ``hidden_states`` sequence of equally-shaped tensors.
        tokenizer: Callable returning a mapping of model inputs as PyTorch
            tensors.

    Returns:
        A tensor whose first axis indexes the hidden states, so iterating it
        yields one pooled vector per hidden state.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs, output_hidden_states=True)

    # The model returns an output object, not a tensor, so the pooling has to
    # be applied to each hidden-state tensor rather than to `outputs` itself.
    # Assumes each tensor is (batch, seq_len, hidden) — TODO confirm for the
    # model in use.
    # NOTE(review): padding positions are included in the mean; for padded
    # batches consider weighting by the attention mask.
    pooled = [hidden.mean(dim=(0, 1)) for hidden in outputs.hidden_states]
    return torch.stack(pooled)

# Example usage with the streaming dataset: because of the unconditional
# `break` at the end of the loop body, only the first record is embedded.
for x in fw:
    text = x['text']  # Assuming the dataset has a 'text' field
    # Uses the `model` and `tokenizer` currently bound in the notebook.
    embeddings = extract_embeddings(text, model, tokenizer)
    # Presumably iterating `embeddings` yields one entry per layer — verify
    # against what extract_embeddings returns.
    for i, layer_embedding in enumerate(embeddings):
        print(f"Layer {i} embeddings: {layer_embedding}")

    break  # Remove this break to process the entire dataset