In [1]:
import os
import warnings

import pandas as pd
import torch
import torch.nn.functional as F
from tokenizers import Tokenizer
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

# Upper bound on tokens per sequence; the tokenizer warns once a sequence exceeds 512 for this model.
MAX_CONTEXT_LENGTH = 512
# Approximate number of tokens repeated between consecutive chunks of a long document.
OVERLAP = 50
PASSAGE_TOKEN = [6019, 1024] #Tokenized version of "passage: "
tokenizer = AutoTokenizer.from_pretrained('intfloat/e5-large-v2')
# create_embedding() looks up this module-level `model`. While the line was commented
# out, every call raised NameError on a fresh kernel (Restart Kernel -> Run All).
model = AutoModel.from_pretrained('intfloat/e5-large-v2')

def average_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    '''Average the hidden states of each sequence over its attended positions.

    Positions whose attention-mask entry is zero are zeroed out before summing
    along dim 1, and the sum is divided by the per-sequence mask total, so
    masked positions do not influence the mean.

    Args:
        last_hidden_states: Hidden states, expected as (batch, seq_len, hidden).
        attention_mask: Mask with one entry per position, expected as
            (batch, seq_len); non-zero marks a position to keep.

    Returns:
        One pooled vector per sequence.
    '''
    # Trailing axis lets the per-position mask broadcast across the hidden dimension.
    ignored_positions = attention_mask.unsqueeze(-1).bool().logical_not()
    kept_states = last_hidden_states.masked_fill(ignored_positions, 0.0)
    attended_counts = attention_mask.sum(dim=1).unsqueeze(-1)
    return kept_states.sum(dim=1) / attended_counts

def preprocess_tokens(tokens, max_length=None, overlap=None, prefix_tokens=None):
    '''Split an over-long token list into overlapping chunks that fit the model.

    No padding is done here. A sequence that already fits is returned unchanged
    as a single-element list. Otherwise each emitted chunk is the first
    ``max_length - 2`` tokens of the remaining sequence followed by the end
    token, and the next round restarts ``max_length - overlap - 1`` tokens in,
    re-prefixed with the start token and ``prefix_tokens`` so that every chunk
    still begins with the "passage: " marker.

    The splitting is iterative, so the number of chunks is not limited by
    Python's recursion depth.

    Args:
        tokens: List of token ids, including the leading/trailing special tokens.
        max_length: Maximum chunk length; defaults to MAX_CONTEXT_LENGTH.
        overlap: Approximate number of tokens shared by neighbouring chunks;
            defaults to OVERLAP.
        prefix_tokens: Token ids re-inserted after the start token of every
            follow-up chunk; defaults to PASSAGE_TOKEN.

    Returns:
        List of token-id lists, in document order.

    Raises:
        ValueError: If splitting is required but the stride is too small for
            the remaining sequence to shrink (splitting would never finish).
    '''
    BOS_TOKEN = 101
    EOS_TOKEN = 102
    max_length = MAX_CONTEXT_LENGTH if max_length is None else max_length
    overlap = OVERLAP if overlap is None else overlap
    prefix = list(PASSAGE_TOKEN if prefix_tokens is None else prefix_tokens)

    if len(tokens) <= max_length:
        return [tokens]

    chunk_head = [BOS_TOKEN] + prefix
    stride = max_length - overlap - 1
    # Each round drops `stride` tokens and re-adds the head, so the stride has to
    # be longer than the head or the remainder never gets shorter.
    if stride <= len(chunk_head):
        raise ValueError(
            "max_length - overlap - 1 must exceed the chunk prefix length "
            f"({len(chunk_head)}); got max_length={max_length}, overlap={overlap}"
        )

    chunks = []
    remaining = tokens
    while len(remaining) > max_length:
        chunks.append(remaining[:max_length - 2] + [EOS_TOKEN])
        remaining = chunk_head + remaining[stride:]
    chunks.append(remaining)
    return chunks

def create_embedding(batch_dict):
    '''Embed one tokenized batch and return L2-normalised vectors.

    Args:
        batch_dict: Tokenizer output for one batch. It is unpacked as keyword
            arguments to the module-level ``model`` and must contain
            'attention_mask' (e.g. the result of
            ``tokenizer(texts, padding=True, truncation=True, return_tensors='pt')``).

    Returns:
        Tensor with one row per input sequence; each row has unit L2 norm.

    Note:
        Relies on the module-level ``model`` being loaded before the call.
    '''
    # Inference only: without no_grad every call would record an autograd graph
    # and return tensors that keep it alive.
    with torch.no_grad():
        outputs = model(**batch_dict)
        embeddings = average_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

        # normalize embeddings
        return F.normalize(embeddings, p=2, dim=1)

def preprocess_files(text):
    '''Split one document's text into passage strings that each fit the model.

    The text is tokenized, cut into overlapping chunks by preprocess_tokens,
    and every chunk is decoded back to a string with the special tokens
    stripped.

    Args:
        text: Full document text (the caller prepends "passage: ").

    Returns:
        List of decoded passage strings, one per chunk, in document order.
    '''
    tokens = tokenizer.encode(text)
    # preprocess_tokens returns [tokens] untouched when the sequence already fits,
    # so no separate length check is needed here.
    chunks = preprocess_tokens(tokens)
    return [tokenizer.decode(chunk, skip_special_tokens=True) for chunk in chunks]


def process_files(root_folder, destination):
    '''Collect model-ready passages from every ``.txt`` file under ``root_folder``.

    Each file is read as UTF-8, prefixed with "passage: " and handed to
    preprocess_files; the resulting passages are concatenated into one list.
    Files that are not valid UTF-8 are skipped with a warning that names the
    file, so missing documents are visible instead of silently dropped.

    Directories and files are visited in sorted order so that the position of a
    passage in the returned list is the same on every run.

    Args:
        root_folder: Directory that is walked recursively.
        destination: Currently unused; kept so existing calls keep working.

    Returns:
        Flat list of passage strings.
    '''
    documents = []
    for dirpath, dirnames, filenames in os.walk(root_folder):
        # Sorting in place also fixes the order in which os.walk descends.
        dirnames.sort()
        for filename in sorted(filenames):
            if not filename.endswith(".txt"):
                continue
            file_path = os.path.join(dirpath, filename)
            try:
                with open(file_path, "r", encoding="utf-8") as f:
                    contents = f.read()
            except UnicodeDecodeError as err:
                warnings.warn(f"Skipping {file_path}: not valid UTF-8 ({err})", stacklevel=2)
                continue
            documents.extend(preprocess_files("passage: " + contents))
    return documents

# Folder that is walked recursively for .txt files. NOTE(review): machine-specific
# absolute path; the notebook only runs where this directory exists.
ROOT = r"C:\Users\DSU\OneDrive - Dakota State University\Documents\cnh-clm\MITREattack"
# Output path handed to vectorize_docs; also passed to process_files, which does not read it.
DESTINATION = r"mitre.txt"
# Flat list of decoded passage strings gathered from the .txt files under ROOT.
documents = process_files(ROOT, DESTINATION)

Token indices sequence length is longer than the specified maximum sequence length for this model (543 > 512). Running this sequence through the model will result in indexing errors


In [None]:
def vectorize_docs(documents, embedding_file_path, batch_size=32):
    '''Embed ``documents`` in batches and write one vector per line to a text file.

    Each output line has the form ``<index>: [v1,v2,...]`` where ``<index>`` is
    the position of the document in ``documents``. The file is overwritten.

    Args:
        documents: List of passage strings to embed.
        embedding_file_path: Path of the text file to write.
        batch_size: Number of documents tokenized and embedded per model call.

    Raises:
        ValueError: If ``batch_size`` is not positive.

    Note:
        Uses create_embedding, which needs the module-level ``model`` to be loaded.
    '''
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    with open(embedding_file_path, "w", encoding="utf-8") as f:
        for start in range(0, len(documents), batch_size):
            batch = documents[start:start + batch_size]
            batch_dict = tokenizer(batch, padding=True, max_length=MAX_CONTEXT_LENGTH, truncation=True, return_tensors='pt')
            # Inference only; one row comes back per document in the batch, so the
            # final (possibly shorter) batch cannot produce extra indices.
            with torch.no_grad():
                embeddings = create_embedding(batch_dict)
            for offset, emb in enumerate(embeddings.tolist()):
                values = ",".join(str(p) for p in emb)
                f.write(f"{start + offset}: [{values}]\n")

# Write one "<index>: [v1,v2,...]" line per entry of `documents` to DESTINATION.
vectorize_docs(documents, DESTINATION)

In [None]:
# Scratch check: tokenize the first 32 documents as one batch (padded, truncated at
# 512 tokens) and embed them. create_embedding needs the module-level `model`.
batch_dict = tokenizer(documents[0: 32], padding=True, max_length=512, truncation=True, return_tensors='pt')
embeddings = create_embedding(batch_dict)

In [109]:
# Sanity check of batch tokenization on two strings of ten repeated words each.
# NOTE: this rebinds `batch_dict`, replacing the batch built from `documents`.
s = ["one " * 10, "two " * 10]
batch_dict = tokenizer(s, padding=True, max_length=512, truncation=True, return_tensors='pt')
#x = preprocess_tokens(batch_dict["input_ids"].tolist()[0])
# The recorded output is torch.Size([2, 12]): two rows of twelve ids for ten words each.
batch_dict["input_ids"].shape

torch.Size([2, 12])

In [78]:
# Decode raw ids to confirm the special tokens; the recorded output shows
# 101 -> [CLS], 102 -> [SEP] and 0 -> [PAD].
tokenizer.decode([101, 2028, 102, 0])

'[CLS] one [SEP] [PAD]'