In [None]:
import h5py
import numpy as np
import torch
from transformers import DistilBertTokenizer
from tqdm import tqdm

# Initialize tokenizer
# A single module-level DistilBERT tokenizer is shared by the HTML chunking
# helper and by the per-URL encoding in the main loop below.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

"""
Preprocess an HDF5 file by tokenizing HTML content into input IDs and attention masks.
Save the processed data into a new HDF5 file.
"""

# Paths for the original and new HDF5 files
# The original file is opened read-only; the new file is opened with mode 'w',
# so any existing file at that path is overwritten.
original_file_path = '/Users/imack/transfer/phishing_output.h5'
new_file_path = '/Users/imack/transfer/phishing_output_tokenized.h5'

def tokenize_with_overlap(html_content, max_chunk_length=512, stride=256):
    """
    Tokenize an HTML document into overlapping, fixed-length padded chunks.

    The whole document is tokenized once (without special tokens) using the
    module-level ``tokenizer``; a window of ``max_chunk_length`` tokens is then
    slid over the token sequence in steps of ``stride``. Windows shorter than
    ``max_chunk_length`` are right-padded with ``tokenizer.pad_token_id``.

    Args:
        html_content: The document text to tokenize.
        max_chunk_length: Number of token positions in every returned chunk.
        stride: Offset between the starts of consecutive chunks.

    Returns:
        A ``(chunks, attention_masks)`` tuple of ``int32`` arrays, both of
        shape ``(num_chunks, max_chunk_length)``. Mask entries are 1 for real
        tokens and 0 for padding. A document that produces no tokens yields a
        single all-padding chunk with an all-zero mask, so callers always
        receive at least one row.

    Raises:
        ValueError: If ``max_chunk_length`` or ``stride`` is not positive.
    """
    if max_chunk_length <= 0 or stride <= 0:
        raise ValueError("max_chunk_length and stride must be positive integers")

    # Tokenize the entire document. The tokenizer may warn that the sequence
    # exceeds the model maximum; the windowing below enforces the limit.
    encoded = tokenizer(html_content, add_special_tokens=False, return_tensors='np')
    # reshape(-1) instead of squeeze(): squeeze() collapses a single-token
    # document to a 0-d array, on which len() and slicing fail.
    tokens = np.asarray(encoded["input_ids"]).reshape(-1)

    # One window start per stride step; an empty document still gets one
    # (all-padding) window so downstream code never sees zero chunks.
    starts = list(range(0, len(tokens), stride)) or [0]

    # Pre-fill with padding / zeros, then copy the real tokens into each row.
    chunks = np.full((len(starts), max_chunk_length), tokenizer.pad_token_id, dtype=np.int32)
    attention_masks = np.zeros((len(starts), max_chunk_length), dtype=np.int32)
    for row, start in enumerate(starts):
        window = tokens[start:start + max_chunk_length]
        chunks[row, :len(window)] = window
        attention_masks[row, :len(window)] = 1

    return chunks, attention_masks

# Create the new HDF5 file.
# For every top-level group in the original file this writes a group of the
# same name containing:
#   labels               - copied unchanged
#   urls                 - copied unchanged
#   html_input_ids       - per document, all padded chunks flattened to 1-D (variable length)
#   html_attention_masks - same layout as html_input_ids
#   url_input_ids        - (num_docs, 128) URL tokens padded/truncated to 128
#   url_attention_masks  - (num_docs, 128) matching attention masks
with h5py.File(original_file_path, 'r') as original_file, h5py.File(new_file_path, 'w') as outfile:
    # Iterate through slices (train, dev, test)
    for slice_name in original_file.keys():
        slice_group = original_file[slice_name]
        print(f"slice_name: {slice_name}, slice_group: {slice_group}")

        # Prepare new group in the new HDF5 file
        new_group = outfile.create_group(slice_name)

        html_contents = slice_group['html_content'][:]
        urls = slice_group['urls'][:]
        num_docs = len(html_contents)

        # Copy labels and URLs directly. The URLs must come from the 'urls'
        # dataset (previously the labels were written here by mistake); the
        # source dtype is passed explicitly so string data round-trips as-is.
        new_group.create_dataset('labels', data=slice_group['labels'][:])
        new_group.create_dataset('urls', data=urls, dtype=slice_group['urls'].dtype)

        # Create new datasets for processed data. The number of documents is
        # known, so rows are allocated once instead of resizing per document;
        # maxshape is kept so the datasets remain extendable.
        vlen_int32 = h5py.special_dtype(vlen=np.dtype('int32'))
        input_ids_dataset = new_group.create_dataset(
            "html_input_ids",
            shape=(num_docs,),
            maxshape=(None,),
            dtype=vlen_int32
        )
        attention_masks_dataset = new_group.create_dataset(
            "html_attention_masks",
            shape=(num_docs,),
            maxshape=(None,),
            dtype=vlen_int32
        )

        url_input_ids_dataset = new_group.create_dataset(
            "url_input_ids",
            shape=(num_docs, 128),  # Fixed-length for URLs
            maxshape=(None, 128),
            dtype=np.int32
        )
        url_attention_masks_dataset = new_group.create_dataset(
            "url_attention_masks",
            shape=(num_docs, 128),
            maxshape=(None, 128),
            dtype=np.int32
        )

        # Process each HTML document together with its URL
        for i, (html_content, raw_url) in tqdm(enumerate(zip(html_contents, urls)), total=num_docs):
            chunks, attention_masks = tokenize_with_overlap(html_content.decode('utf-8'))
            url = raw_url.decode('utf-8')

            # Flatten the (num_chunks, chunk_length) arrays to 1-D for the
            # variable-length datasets. reshape(-1) also copes with a result
            # that contains zero chunks, where np.concatenate would raise.
            input_ids_dataset[i] = np.asarray(chunks, dtype=np.int32).reshape(-1)
            attention_masks_dataset[i] = np.asarray(attention_masks, dtype=np.int32).reshape(-1)

            encoded_url_input = tokenizer(
                url,
                padding='max_length',
                truncation=True,
                max_length=128,
                return_tensors='np'
            )

            # return_tensors='np' yields a leading batch axis of size 1; drop it.
            url_input_ids_dataset[i, :] = encoded_url_input['input_ids'][0]
            url_attention_masks_dataset[i, :] = encoded_url_input['attention_mask'][0]

# Report only after both files are closed, so the output is fully flushed.
print("Preprocessing complete. Saved to", new_file_path)



slice_name: dev, slice_group: <HDF5 group "/dev" (4 members)>


  0%|          | 0/7126 [00:00<?, ?it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (8661 > 512). Running this sequence through the model will result in indexing errors
  1%|          | 42/7126 [00:19<20:12,  5.84it/s]  