# Pythia Analysis - train small models on HDFS data

* use tokenized version of preprocessed HDFS events
* start with very small pythia models, test increasing size
* start with fine-tuning, then consider resetting weights and training from scratch
* experiment with different tokenizers
  * https://chatgpt.com/share/67448f53-29a0-800f-9913-af22d6ed0894


In [1]:
try:
  from google.colab import userdata

  !git clone https://github.com/honicky/deep-log-analysis.git
  !mv deep-log-analysis/* .
  !rm -rf deep-log-analysis
except:
  pass

In [2]:
try:
    import logparser.Drain as Drain
except ImportError:
    %pip install requests git+https://github.com/logpai/logparser

%pip install transformers torch torchvision torchaudio wandb python-dotenv datasets


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [3]:
%load_ext autoreload
%autoreload 2
import dataloaders as dl

# Load secrets

If we are in colab, we get them from the `userdata` module, otherwise we get them from a .env file


In [4]:
import os
try:
  from google.colab import userdata
  os.environ["HF_WRITE_TOKEN"] = userdata.get('HF_WRITE_TOKEN')
  os.environ["WANDB_API_KEY"] = userdata.get('WANDB_API_KEY')
except ImportError:
  from dotenv import load_dotenv
  load_dotenv()


# Load the validation dataset

We will do some hyperparameter tuning on the validation set, in particular the threshold for the perplexity to be considered anomalous, and the number of anomalies per block to be considered anomalous.

In [5]:
from datasets import load_dataset
encoded_blocks_dataset = load_dataset("honicky/hdfs-logs-encoded-blocks")



  from .autonotebook import tqdm as notebook_tqdm


In [6]:
encoded_blocks_val_pdf = encoded_blocks_dataset['validation'].to_pandas()
encoded_blocks_val_pdf.head()

Unnamed: 0,event_encoded,tokenized_block,block_id,label,__index_level_0__
0,<|sep|>0 /10.251.125.193:49078 /10.251.125.193...,"[50277, 17, 1227, 740, 15, 21451, 15, 9312, 15...",blk_8706546487798466885,Normal,370570
1,<|sep|>0 /10.251.74.192:36984 /10.251.74.192:5...,"[50277, 17, 1227, 740, 15, 21451, 15, 3566, 15...",blk_3164806166289090589,Normal,387094
2,<|sep|>0 /10.251.67.113:44473 /10.251.67.113:5...,"[50277, 17, 1227, 740, 15, 21451, 15, 2251, 15...",blk_6334862664379948501,Normal,524461
3,<|sep|>0 /10.250.15.67:36719 /10.250.15.67:500...,"[50277, 17, 1227, 740, 15, 9519, 15, 1010, 15,...",blk_-4209139676364491359,Normal,491282
4,<|sep|>0 /10.251.111.228:56317 /10.251.111.228...,"[50277, 17, 1227, 740, 15, 21451, 15, 10768, 1...",blk_-7362312881779468190,Normal,671


In [7]:
encoded_blocks_val_pdf.label.value_counts()

label
Normal     55800
Anomaly     1706
Name: count, dtype: int64

# Load the model and tokenizer

Make sure that the tokenizer has the <|sep|> token

In [8]:
from transformers import GPTNeoXForCausalLM, GPTNeoXTokenizerFast

model = GPTNeoXForCausalLM.from_pretrained("honicky/pythia-14m-hdfs-logs")
tokenizer = GPTNeoXTokenizerFast.from_pretrained("honicky/pythia-14m-hdfs-logs")

tokenizer


GPTNeoXTokenizerFast(name_or_path='honicky/pythia-14m-hdfs-logs', vocab_size=50254, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'sep_token': '<|sep|>', 'pad_token': '<|endoftext|>', 'additional_special_tokens': ['<|sep|>']}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<|padding|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	50254: AddedToken("                        ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50255: AddedToken("                       ", rstrip=False, lstrip=False, single_word=False, normalized=True, special=False),
	50256: AddedToken("                      ", rstrip=False, lstrip=F

In [9]:
import model_utils
import numpy as np
import torch
from tqdm.auto import tqdm


def calculate_perplexity(token_ids, model):
    device = model_utils.get_device()
    model = model.to(device)
    model.eval()
    
    # Convert token IDs to tensor and truncate if needed
    max_length = model_utils.training_params()["MAX_LENGTH"]
    input_ids = torch.tensor([token_ids[:max_length]], device=device)
    
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids)
        # Get logits and shift them for calculating loss
        logits = outputs.logits[0, :-1, :]  # Remove last position
        labels = input_ids[0, 1:]  # Remove first position
        
        # Calculate per-token loss
        loss_fct = torch.nn.CrossEntropyLoss(reduction='none')
        token_losses = loss_fct(logits, labels)
        
        token_perplexities = torch.exp(token_losses)
        
    return token_perplexities

# Calculate perplexity for each row in the validation dataset
perplexities = []
for tokens in tqdm(encoded_blocks_val_pdf['tokenized_block'], desc="Calculating perplexities"):
    perp = calculate_perplexity(tokens, model).to("cpu")
    perplexities.append(perp)

# Add perplexities as a new column
encoded_blocks_val_pdf['perplexities'] = [
    np.array(p) for p in perplexities
]

  input_ids = torch.tensor([token_ids[:max_length]], device=device)
Calculating perplexities: 100%|██████████| 57506/57506 [34:11<00:00, 28.04it/s] 


In [19]:

encoded_blocks_val_pdf['perplexities'] = [
    np.array(p) for p in perplexities
]

  np.array(p) for p in perplexities


In [34]:
def is_anomaly(perplexities, threshold=1000, num_anomalies=2):
    return np.sum(perplexities > threshold) >= num_anomalies





0        False
1        False
2        False
3        False
4        False
         ...  
57501    False
57502    False
57503    False
57504    False
57505    False
Name: perplexities, Length: 57506, dtype: bool

In [39]:
import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc
import numpy as np
# ... imports and setup remain the same ...

# Create arrays to store results
thresholds = 2 ** np.linspace(1, np.log2(100000), 100)
num_anomalies_range = range(1, 11)
fig = go.Figure()

y_true = (encoded_blocks_val_pdf['label'] == 'Anomaly').astype(int)

# Calculate ROC curve for each num_anomalies
for num_anom in num_anomalies_range:
    y_pred_scores = []
    predictions_per_threshold = []  # Store predictions for each threshold
    
    # Calculate predictions for each threshold
    for _, row in encoded_blocks_val_pdf.iterrows():
        # Get binary predictions for each threshold
        thresh_predictions = [np.sum(row['perplexities'] > thresh) >= num_anom for thresh in thresholds]
        predictions_per_threshold.append(thresh_predictions)
        y_pred_scores.append(np.mean(thresh_predictions))
    
    predictions_per_threshold = np.array(predictions_per_threshold)
    
    fpr, tpr, _ = roc_curve(y_true, y_pred_scores)
    auc_score = auc(fpr, tpr)
    
    # Calculate precision and recall for our original thresholds
    precision_per_threshold = []
    recall_per_threshold = []
    
    for i in range(len(thresholds)):
        thresh_preds = predictions_per_threshold[:, i]
        true_positives = np.sum((y_true == 1) & (thresh_preds == 1))
        false_positives = np.sum((y_true == 0) & (thresh_preds == 1))
        false_negatives = np.sum((y_true == 1) & (thresh_preds == 0))
        
        # Calculate precision and recall
        precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
        recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
        
        precision_per_threshold.append(precision)
        recall_per_threshold.append(recall)
    
    # Create hover text
    hover_text = [
        f'Threshold: {thresh:.2f}<br>'
        f'Precision: {prec:.3f}<br>'
        f'Recall: {rec:.3f}'
        for thresh, prec, rec in zip(thresholds, precision_per_threshold, recall_per_threshold)
    ]
    
    # Calculate FPR for our thresholds
    fpr_per_threshold = []
    for i in range(len(thresholds)):
        thresh_preds = predictions_per_threshold[:, i]
        false_positives = np.sum((y_true == 0) & (thresh_preds == 1))
        true_negatives = np.sum((y_true == 0) & (thresh_preds == 0))
        fpr_per_threshold.append(false_positives / (false_positives + true_negatives))
    
    # Add ROC curve to plot with hover information
    fig.add_trace(go.Scatter(
        x=fpr_per_threshold, 
        y=recall_per_threshold,  # recall is the same as TPR
        name=f'num_anomalies={num_anom} (AUC={auc_score:.3f})',
        mode='lines+markers',
        hovertext=hover_text,
        hoverinfo='text',
        marker=dict(size=5)
    ))

# Add diagonal line
fig.add_trace(go.Scatter(
    x=[0, 1], 
    y=[0, 1],
    name='Random',
    line=dict(dash='dash', color='gray'),
    mode='lines'
))

# Update layout
fig.update_layout(
    title='ROC Curves for Different Numbers of Anomalies',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    legend_title='Parameters',
    width=800,
    height=600
)

fig.show()

In [40]:
import plotly.graph_objects as go
import numpy as np

# Create arrays to store results
thresholds = 2 ** np.linspace(1, np.log2(100000), 100)
num_anomalies_range = range(1, 11)
fig = go.Figure()

y_true = (encoded_blocks_val_pdf['label'] == 'Anomaly').astype(int)

# HTML template for confusion matrix table
table_template = """
<b>Threshold:</b> {thresh:.2f}<br>
<b>Precision:</b> {prec:.3f}<br>
<b>Recall:</b> {rec:.3f}<br>
<table style="border-collapse: collapse; margin-top: 10px;">
    <tr>
        <td style="border: 1px solid gray; padding: 5px;"></td>
        <td style="border: 1px solid gray; padding: 5px;"><b>Predicted +</b></td>
        <td style="border: 1px solid gray; padding: 5px;"><b>Predicted -</b></td>
    </tr>
    <tr>
        <td style="border: 1px solid gray; padding: 5px;"><b>Actual +</b></td>
        <td style="border: 1px solid gray; padding: 5px;">{tp}</td>
        <td style="border: 1px solid gray; padding: 5px;">{fn}</td>
    </tr>
    <tr>
        <td style="border: 1px solid gray; padding: 5px;"><b>Actual -</b></td>
        <td style="border: 1px solid gray; padding: 5px;">{fp}</td>
        <td style="border: 1px solid gray; padding: 5px;">{tn}</td>
    </tr>
</table>
"""

# Calculate Precision-Recall curve for each num_anomalies
for num_anom in num_anomalies_range:
    predictions_per_threshold = []
    
    # Calculate predictions for each threshold
    for _, row in encoded_blocks_val_pdf.iterrows():
        thresh_predictions = [np.sum(row['perplexities'] > thresh) >= num_anom for thresh in thresholds]
        predictions_per_threshold.append(thresh_predictions)
    
    predictions_per_threshold = np.array(predictions_per_threshold)
    
    # Calculate precision and recall for each threshold
    precision_per_threshold = []
    recall_per_threshold = []
    
    # Create hover text with confusion matrix table
    hover_text = []
    for i, thresh in enumerate(thresholds):
        thresh_preds = predictions_per_threshold[:, i]
        tp = np.sum((y_true == 1) & (thresh_preds == 1))
        tn = np.sum((y_true == 0) & (thresh_preds == 0))
        fp = np.sum((y_true == 0) & (thresh_preds == 1))
        fn = np.sum((y_true == 1) & (thresh_preds == 0))
        
        precision = tp / (tp + fp) if (tp + fp) > 0 else 1.0
        recall = tp / (tp + fn) if (tp + fn) > 0 else 0
        
        precision_per_threshold.append(precision)
        recall_per_threshold.append(recall)
        
        hover_text.append(table_template.format(
            thresh=thresh,
            prec=precision,
            rec=recall,
            tp=tp,
            tn=tn,
            fp=fp,
            fn=fn
        ))
    
    # Calculate average precision (AP)
    ap = 0
    for i in range(len(recall_per_threshold)-1):
        ap += (recall_per_threshold[i+1] - recall_per_threshold[i]) * precision_per_threshold[i]
    
    # Add Precision-Recall curve to plot
    fig.add_trace(go.Scatter(
        x=recall_per_threshold,
        y=precision_per_threshold,
        name=f'num_anomalies={num_anom} (AP={ap:.3f})',
        mode='lines+markers',
        hovertext=hover_text,
        hoverinfo='text',
        marker=dict(size=5)
    ))

# Update layout
fig.update_layout(
    title='Precision-Recall Curves for Different Numbers of Anomalies',
    xaxis_title='Recall',
    yaxis_title='Precision',
    legend_title='Parameters',
    width=800,
    height=600,
    yaxis_range=[0, 1.05],
    xaxis_range=[0, 1]
)

fig.show()

In [37]:

import plotly.graph_objects as go
from sklearn.metrics import roc_curve, auc
import numpy as np

# Create arrays to store results
thresholds = 2 ** np.linspace(1, np.log2(100000), 100)
# Calculate percentage steps between 0.1% and 1%
percentages = np.linspace(0.001, 0.01, 20)  # 20 steps from 0.1% to 1%
fig = go.Figure()

# Convert labels to binary (1 for Anomaly, 0 for Normal)
y_true = (encoded_blocks_val_pdf['label'] == 'Anomaly').astype(int)

# Calculate ROC curve for each percentage
for percentage in percentages:
    y_pred_scores = []
    for _, row in encoded_blocks_val_pdf.iterrows():
        # Calculate required number of anomalies based on block length
        block_length = len(row['perplexities'])
        num_anom = max(1, int(block_length * percentage))
        
        # Calculate scores using the dynamic num_anom
        scores = [np.sum(row['perplexities'] > thresh) >= num_anom for thresh in thresholds]
        y_pred_scores.append(np.mean(scores))
    
    fpr, tpr, _ = roc_curve(y_true, y_pred_scores)
    auc_score = auc(fpr, tpr)
    
    # Add ROC curve to plot with percentage in name
    fig.add_trace(go.Scatter(
        x=fpr, 
        y=tpr,
        name=f'{percentage:.1%} of block (AUC={auc_score:.3f})',
        mode='lines'
    ))


# Add diagonal line
fig.add_trace(go.Scatter(
    x=[0, 1], 
    y=[0, 1],
    name='Random',
    line=dict(dash='dash', color='gray'),
    mode='lines'
))

# Update layout
fig.update_layout(
    title='ROC Curves for Different Numbers of Anomalies',
    xaxis_title='False Positive Rate',
    yaxis_title='True Positive Rate',
    legend_title='Parameters',
    width=800,
    height=600
)

fig.show()