In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import warnings
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import RobertaTokenizer, RobertaModel, AdamW, get_linear_schedule_with_warmup
from datetime import datetime
import numpy as np
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, confusion_matrix
import logging
import torch.nn as nn
from tqdm import tqdm, trange
import re
from sklearn.model_selection import KFold
import seaborn as sns
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

df = pd.read_csv("dataset.csv")
df = df.head(22000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22000 entries, 0 to 21999
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   timestamp      22000 non-null  object
 1   LineId         22000 non-null  int64 
 2   Component      22000 non-null  object
 3   Content        22000 non-null  object
 4   EventId        22000 non-null  object
 5   EventTemplate  22000 non-null  object
 6   log            22000 non-null  object
dtypes: int64(1), object(6)
memory usage: 1.2+ MB


In [3]:
def preprocess_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def preprocess_df(df):
    features = ['log']
    target_labels = ['timestamp', 'LineId', 'Component', 'Content', 'EventId', 'EventTemplate']
    label_encoders = {}
    label_columns = []

    df['log'] = df['log'].apply(preprocess_text)

    for col in target_labels:
        le = LabelEncoder()
        df[col + '_encoded'] = le.fit_transform(df[col])
        label_encoders[col] = le
        label_columns.append(col + '_encoded')

        logger.info(f"Column: {col}")
        logger.info(f"Unique values: {len(np.unique(df[col + '_encoded']))}")
        logger.info(f"Min: {df[col + '_encoded'].min()}, Max: {df[col + '_encoded'].max()}")
        logger.info("---")

    return df, label_columns, label_encoders

df, label_columns, label_encoders = preprocess_df(df)

INFO:__main__:Column: timestamp
INFO:__main__:Unique values: 13167
INFO:__main__:Min: 0, Max: 13166
INFO:__main__:---
INFO:__main__:Column: LineId
INFO:__main__:Unique values: 2000
INFO:__main__:Min: 0, Max: 1999
INFO:__main__:---
INFO:__main__:Column: Component
INFO:__main__:Unique values: 299
INFO:__main__:Min: 0, Max: 298
INFO:__main__:---
INFO:__main__:Column: Content
INFO:__main__:Unique values: 11001
INFO:__main__:Min: 0, Max: 11000
INFO:__main__:---
INFO:__main__:Column: EventId
INFO:__main__:Unique values: 341
INFO:__main__:Min: 0, Max: 340
INFO:__main__:---
INFO:__main__:Column: EventTemplate
INFO:__main__:Unique values: 944
INFO:__main__:Min: 0, Max: 943
INFO:__main__:---


In [4]:
class LogDataset(Dataset):
    def __init__(self, logs, labels, tokenizer, max_len):
        self.logs = logs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.num_labels = [len(np.unique(labels[:, i])) for i in range(labels.shape[1])]

    def __len__(self):
        return len(self.logs)

    def __getitem__(self, item):
        log = str(self.logs[item])
        labels = self.labels[item]

        encoding = self.tokenizer.encode_plus(
            log,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'log_text': log,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(labels, dtype=torch.long)
        }

In [5]:
class MultiTaskRobertaModel(nn.Module):
    def __init__(self, model_name, num_labels):
        super(MultiTaskRobertaModel, self).__init__()
        self.roberta = RobertaModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.classifiers = nn.ModuleList([nn.Linear(self.roberta.config.hidden_size, num_label) for num_label in num_labels])
        
    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.roberta(input_ids=input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        pooled_output = self.dropout(sequence_output[:, 0])

        logits = [classifier(pooled_output) for classifier in self.classifiers]

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            losses = [loss_fct(logit, label) for logit, label in zip(logits, labels.T)]
            loss = sum(losses)
            return loss, logits
        return logits

print("Unique label values:", np.unique(df[label_columns].values))
print("Max label value:", np.max(df[label_columns].values))
print("Min label value:", np.min(df[label_columns].values))

Unique label values: [    0     1     2 ... 13164 13165 13166]
Max label value: 13166
Min label value: 0


In [6]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
num_labels = [len(np.unique(df[col])) for col in label_columns]
print("Number of labels for each task:", num_labels)
model = MultiTaskRobertaModel('roberta-base', num_labels)

logs = df['log'].values.tolist()
labels = df[label_columns].values

dataset = LogDataset(logs, labels, tokenizer, max_len=128)

Number of labels for each task: [13167, 2000, 299, 11001, 341, 944]


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

num_epochs = 10
batch_size = 16
learning_rate = 2e-5
num_warmup_steps = 0
num_folds = 5

kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)

False


In [8]:
best_model = None
best_val_loss = float('inf')

for fold, (train_idx, val_idx) in enumerate(kf.split(dataset)):
    print(f"Fold {fold + 1}/{num_folds}")
    
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
    val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)
    
    train_loader = DataLoader(dataset, batch_size=batch_size, sampler=train_subsampler)
    val_loader = DataLoader(dataset, batch_size=batch_size, sampler=val_subsampler)
    
    model = MultiTaskRobertaModel('roberta-base', num_labels).to(device)
    
    optimizer = AdamW(model.parameters(), lr=learning_rate, correct_bias=False)
    total_steps = len(train_loader) * num_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps, num_training_steps=total_steps)

    for epoch in trange(num_epochs, desc="Epochs"):
        model.train()
        total_loss = 0
        all_preds = [[] for _ in range(len(label_columns))]
        all_labels = [[] for _ in range(len(label_columns))]
        
        progress_bar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{num_epochs}")
        for batch in progress_bar:
            optimizer.zero_grad()
            inputs = {
                'input_ids': batch['input_ids'].to(device),
                'attention_mask': batch['attention_mask'].to(device),
            }
            labels = batch['labels'].to(device)
            loss, logits = model(**inputs, labels=labels)
            total_loss += loss.item()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            
            for i, logit in enumerate(logits):
                preds = logit.argmax(dim=-1).detach().cpu().numpy()
                all_preds[i].extend(preds)
                all_labels[i].extend(labels[:, i].cpu().numpy())
            
            progress_bar.set_postfix({'loss': f"{loss.item():.4f}"})
            
        avg_loss = total_loss / len(train_loader)
        
        model.eval()
        val_loss = 0
        val_preds = [[] for _ in range(len(label_columns))]
        val_labels = [[] for _ in range(len(label_columns))]
        
        with torch.no_grad():
            for batch in val_loader:
                inputs = {
                    'input_ids': batch['input_ids'].to(device),
                    'attention_mask': batch['attention_mask'].to(device),
                }
                labels = batch['labels'].to(device)
                loss, logits = model(**inputs, labels=labels)
                val_loss += loss.item()
                
                for i, logit in enumerate(logits):
                    preds = logit.argmax(dim=-1).detach().cpu().numpy()
                    val_preds[i].extend(preds)
                    val_labels[i].extend(labels[:, i].cpu().numpy())
        
        avg_val_loss = val_loss / len(val_loader)
        
        logger.info(f"Fold {fold + 1}, Epoch {epoch + 1}/{num_epochs}")
        logger.info(f"Training Loss: {avg_loss:.4f}, Validation Loss: {avg_val_loss:.4f}")
        
        for i, (preds, labels) in enumerate(zip(val_preds, val_labels)):
            precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
            accuracy = accuracy_score(labels, preds)
            
            logger.info(f"Task {i+1} - Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")
            
            cm = confusion_matrix(labels, preds)
            plt.figure(figsize=(10, 8))
            sns.heatmap(cm, annot=True, fmt='d')
            plt.title(f'Confusion Matrix for Task {i+1}')
            plt.ylabel('Actual')
            plt.xlabel('Predicted')
            plt.savefig(f'confusion_matrix_fold{fold+1}_task{i+1}_epoch{epoch+1}.png')
            plt.close()

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model = model.state_dict()
            torch.save(best_model, './best_model.pth')
            print(f"New best model saved with validation loss: {best_val_loss}")


print(f"Best model had a validation loss of {best_val_loss}")

model.load_state_dict(torch.load('./best_model.pth'))

tokenizer.save_pretrained('./tokenizer')

Fold 1/5


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/10:   2%|▏         | 19/1100 [01:05<1:01:48,  3.43s/it, loss=42.1495]
Epochs:   0%|          | 0/10 [01:05<?, ?it/s]


KeyboardInterrupt: 

In [None]:
device = torch.device("cpu")
model = MultiTaskRobertaModel('roberta-base', [len(le.classes_) for le in label_encoders.values()])
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.eval()
tokenizer = RobertaTokenizer.from_pretrained('./tokenizer')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load('best_model.pth', map_location=device))


In [None]:
from collections import OrderedDict

def predict_log_details(log_text, model, tokenizer, label_encoders, device):
    model.eval()
    inputs = tokenizer(preprocess_text(log_text), return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    with torch.no_grad():
        logits = model(**inputs)
    
    predictions = [logit.cpu().numpy().argmax(axis=1)[0] for logit in logits]
    confidence_scores = [torch.softmax(logit, dim=1).max().item() for logit in logits]
    
    interpreted_predictions = OrderedDict([
        ('LineId', label_encoders['LineId'].inverse_transform([predictions[1]])[0]),
        ('timestamp', label_encoders['timestamp'].inverse_transform([predictions[0]])[0]),
        ('Component', label_encoders['Component'].inverse_transform([predictions[2]])[0]),
        ('Content', label_encoders['Content'].inverse_transform([predictions[3]])[0]),
        ('EventId', label_encoders['EventId'].inverse_transform([predictions[4]])[0]),
        ('EventTemplate', label_encoders['EventTemplate'].inverse_transform([predictions[5]])[0])
    ])
    
    result = OrderedDict()
    result['input_log'] = log_text
    result['predictions'] = OrderedDict()
    
    for i, (key, value) in enumerate(interpreted_predictions.items()):
        result['predictions'][key] = {
            'value': value,
            'confidence': f"{confidence_scores[i]:.4f}"
        }
    
    return result

new_log = "5,2016-09-28,04:30:31,Info,CBS,Ending TrustedInstaller initialization.,E17,Ending TrustedInstaller initialization."
predicted_details = predict_log_details(new_log, model, tokenizer, label_encoders, device)

def print_predictions(predictions):
    print("Input Log:")
    print(f"  {predictions['input_log']}")
    print("\nPredictions:")
    for key, value in predictions['predictions'].items():
        print(f"  {key}:")
        print(f"    Value: {value['value']}")
        print(f"    Confidence: {value['confidence']}")
        print()

print_predictions(predicted_details)

Input Log:
  5,2016-09-28,04:30:31,Info,CBS,Ending TrustedInstaller initialization.,E17,Ending TrustedInstaller initialization.

Predictions:
  LineId:
    Value: 1852
    Confidence: 0.0313

  timestamp:
    Value: 2016-09-29 00:00:46
    Confidence: 0.0021

  Component:
    Value: CBS
    Confidence: 0.2770

  Content:
    Value: Interrupting SendWorker
    Confidence: 0.0122

  EventId:
    Value: E7
    Confidence: 0.1519

  EventTemplate:
    Value: Interrupting SendWorker
    Confidence: 0.0196



In [None]:
def batch_predict_log_details(log_texts, model, tokenizer, label_encoders, device, batch_size=32):
    model.eval()
    all_predictions = []
    
    for i in range(0, len(log_texts), batch_size):
        batch_texts = log_texts[i:i+batch_size]
        batch_texts = [preprocess_text(text) for text in batch_texts]
        inputs = tokenizer(batch_texts, return_tensors="pt", truncation=True, padding=True, max_length=128)
        inputs = {k: v.to(device) for k, v in inputs.items()}
        
        with torch.no_grad():
            logits = model(**inputs)
        
        batch_predictions = [logit.cpu().numpy().argmax(axis=1) for logit in logits]
        confidence_scores = [torch.softmax(logit, dim=1).max(dim=1)[0].cpu().numpy() for logit in logits]
        
        for j, original_text in enumerate(log_texts[i:i+batch_size]):
            predictions = [pred[j] for pred in batch_predictions]
            scores = [score[j] for score in confidence_scores]
            
            result = OrderedDict()
            result['input_log'] = original_text
            result['predictions'] = OrderedDict()
            
            for k, (key, le) in enumerate(label_encoders.items()):
                result['predictions'][key] = {
                    'value': le.inverse_transform([predictions[k]])[0],
                    'confidence': f"{scores[k]:.4f}"
                }
            
            all_predictions.append(result)
    
    return all_predictions

In [None]:
batch_logs = ["03-17 16:13:38.811  1702  2395 D WindowManager: printFreezingDisplayLogsopening app wtoken = AppWindowToken{9f4ef63 token=Token{a64f992 ActivityRecord{de9231d u0 com.tencent.qt.qtl/.activity.info.NewsDetailXmlActivity t761}}}, allDrawn= false, startingDisplayed =  false, startingMoved =  false, isRelaunching =  false",
              "1117838976 2005.06.03 R02-M1-N0-C:J12-U11 2005-06-03-15.49.36.156884 R02-M1-N0-C:J12-U11 RAS KERNEL INFO instruction cache parity error corrected",
              "2015-10-18 18:01:50,572 INFO [main] org.apache.hadoop.yarn.event.AsyncDispatcher: Registering class org.apache.hadoop.mapreduce.v2.app.commit.CommitterEventType for class org.apache.hadoop.mapreduce.v2.app.commit.CommitterEventHandler",
              "081109 204842 663 INFO dfs.DataNode$DataXceiver: Receiving block blk_1724757848743533110 src: /10.251.111.130:49851 dest: /10.251.111.130:50010",
              "20171223-22:15:29:950|Step_SPUtils|30002312|setTodayTotalDetailSteps=1514038440000##7008##548365##8661##12456##27174269",
              "360778 node-130 unix.hw state_change.unavailable 1141108031 1 Component State Change: Component \042alt0\042 is in the unavailable state (HWID=2478)",
              "Jun 15 02:04:59 combo sshd(pam_unix)[20892]: authentication failure; logname= uid=0 euid=0 tty=NODEVssh ruser= rhost=220-135-151-1.hinet-ip.hinet.net  user=root",
              "Jul  1 09:04:37 authorMacBook-Pro symptomsd[215]: __73-[NetworkAnalyticsEngine observeValueForKeyPath:ofObject:change:context:]_block_invoke unexpected switch value 2",
              "Dec 10 07:08:28 LabSZ sshd[24208]: reverse mapping checking getaddrinfo for ns.marryaldkfaczcz.com [173.234.31.186] failed - POSSIBLE BREAK-IN ATTEMPT!",
              "nova-compute.log.1.2017-05-16_13:55:31 2017-05-16 00:00:04.500 2931 INFO nova.compute.manager [req-3ea4052c-895d-4b64-9e2d-04d64c4d94ab - - - - -] [instance: b9000564-fe1a-409b-b8cc-1e88b294cd1d] VM Started (Lifecycle Event)",
              "[10.30 16:49:08] chrome.exe - proxy.cse.cuhk.edu.hk:5070 close, 1190 bytes (1.16 KB) sent, 1671 bytes (1.63 KB) received, lifetime 00:02",
              "17/06/09 20:10:41 INFO storage.DiskBlockManager: Created local directory at /opt/hdfs/nodemanager/usercache/curi/appcache/application_1485248649253_0147/blockmgr-70293f72-844a-4b39-9ad6-fb0ad7e364e4",
              "1131566461 2005.11.09 dn700 Nov 9 12:01:01 dn700/dn700 crond(pam_unix)[2912]: session opened for user root by (uid=0)",
              "2016-09-28 04:30:31, Info                  CBS    SQM: Failed to start upload with file pattern: C:\Windows\servicing\sqm\*_std.sqm, flags: 0x2 [HRESULT = 0x80004005 - E_FAIL]",
              "2015-07-29 19:13:27,721 - WARN  [RecvWorker:188978561024:QuorumCnxManager$RecvWorker@762] - Connection broken for id 188978561024, my id = 1, error = ",
              ]

  "2016-09-28 04:30:31, Info                  CBS    SQM: Failed to start upload with file pattern: C:\Windows\servicing\sqm\*_std.sqm, flags: 0x2 [HRESULT = 0x80004005 - E_FAIL]",


In [None]:
batch_predictions = batch_predict_log_details(batch_logs, model, tokenizer, label_encoders, device)

In [None]:
def print_batch_predictions(predictions):
    for i, prediction in enumerate(predictions):
        print(f"Log {i + 1}:")
        print(f"  Input: {prediction['input_log']}")
        print("  Predictions:")
        for key, value in prediction['predictions'].items():
            print(f"    {key}:")
            print(f"      Value: {value['value']}")
            print(f"      Confidence: {value['confidence']}")
        print()

print_batch_predictions(batch_predictions)

Log 1:
  Input: 03-17 16:13:38.811  1702  2395 D WindowManager: printFreezingDisplayLogsopening app wtoken = AppWindowToken{9f4ef63 token=Token{a64f992 ActivityRecord{de9231d u0 com.tencent.qt.qtl/.activity.info.NewsDetailXmlActivity t761}}}, allDrawn= false, startingDisplayed =  false, startingMoved =  false, isRelaunching =  false
  Predictions:
    timestamp:
      Value: 2017-05-16 00:01:17.920
      Confidence: 0.0005
    LineId:
      Value: 1745
      Confidence: 0.0026
    Component:
      Value: symptomsd
      Confidence: 0.0694
    Content:
      Value: -[NetworkAnalyticsEngine _writeJournalRecord:fromCellFingerprint:key:atLOI:ofKind:lqm:isFaulty:] Hashing of the primary key failed. Dropping the journal record.
      Confidence: 0.0008
    EventId:
      Value: E7
      Confidence: 0.0432
    EventTemplate:
      Value: -[UABestAppSuggestionManager notifyBestAppChanged:type:options:bundleIdentifier:activityType:dynamicIdentifier:when:confidence:deviceName:deviceIdentifier:de