In [None]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset, Dataset
import re
from torch.utils.data import Dataset, DataLoader
from transformers import RobertaTokenizer, RobertaForSequenceClassification, AdamW
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

In [9]:
def read(structured,raw):
    structured_df = pd.read_csv(structured)
    with open(raw, 'r') as file:
        data = file.read()
        
    lines = data.split('\n')
    cleaned_lines = [line.strip() for line in lines if line.strip()]
    raw_df = pd.DataFrame(cleaned_lines, columns=['log'])
    
    struct_df_reset = structured_df.reset_index(drop=True)
    raw_df_reset = raw_df.reset_index(drop=True) 
    combined_df = pd.concat([raw_df_reset,struct_df_reset], axis=1)
    df = combined_df.head(10)
    return df

In [None]:
def preprocess_df(df):
  # Convert Date and Time to timestamp
  df['DateTime'] = pd.to_datetime(df['Date'] + ' ' + df['Time'])
  df['Timestamp'] = df['DateTime'].astype('int64') // 10**9
  
  # Encode categorical variables
  label_encoders = {}
  categorical_columns = ['Level', 'Component', 'EventId', 'EventTemplate']
  for col in categorical_columns:
      le = LabelEncoder()
      df[col + '_encoded'] = le.fit_transform(df[col])
      label_encoders[col] = le
  
  # Select features for labels
  label_columns = ['LineId', 'Timestamp', 'Level_encoded', 'Component_encoded', 'EventId_encoded', 'EventTemplate_encoded']
  
  return df, label_columns, label_encoders

In [14]:
android_structured = "structured_data\Android_2k.log_structured.csv"
android_raw = "raw_data\Android_2k.log"
android_df = read(android_structured,android_raw)

apache_structured = "structured_data\Apache_2k.log_structured.csv"
apache_raw = "raw_data\Apache_2k.log"
apache_df = read(apache_structured,apache_raw)

bgl_structured = "structured_data\BGL_2k.log_structured.csv"
bgl_raw = "raw_data\BGL_2k.log"
bgl_df = read(bgl_structured,bgl_raw)

handoop_structured = "structured_data\Hadoop_2k.log_structured.csv"
handoop_raw = "raw_data\Hadoop_2k.log"
handoop_df = read(handoop_structured,handoop_raw)

hdfs_structured = "structured_data\HDFS_2k.log_structured.csv"
hdfs_raw = "raw_data\HDFS_2k.log"
hdfs_df = read(hdfs_structured,hdfs_raw)

healthapp_structured = "structured_data\HealthApp_2k.log_structured.csv"
healthapp_raw = "raw_data\HealthApp_2k.log"
healthapp_df = read(healthapp_structured,healthapp_raw)

hpc_structured = "structured_data\HPC_2k.log_structured.csv"
hpc_raw = "raw_data\HPC_2k.log"
hpc_df = read(hpc_structured,hpc_raw)

linux_structured = "structured_data\Linux_2k.log_structured.csv"
linux_raw = "raw_data\Linux_2k.log"
linux_df = read(linux_structured,linux_raw)

mac_structured = "structured_data\Mac_2k.log_structured.csv"
mac_raw = "raw_data\Mac_2k.log"
mac_df = read(mac_structured,mac_raw)

openssh_structured = "structured_data\OpenSSH_2k.log_structured.csv"
openssh_raw = "raw_data\OpenSSH_2k.log"
openssh_df = read(openssh_structured,openssh_raw)

openstack_structured = "structured_data\OpenStack_2k.log_structured.csv"
openstack_raw = "raw_data\OpenStack_2k.log"
openstack_df = read(openstack_structured,openstack_raw)

proxifier_structured = "structured_data\Proxifier_2k.log_structured.csv"
proxifier_raw = "raw_data\Proxifier_2k.log"
proxifier_df = read(proxifier_structured,proxifier_raw)

spark_structured = "structured_data\Spark_2k.log_structured.csv"
spark_raw = "raw_data\Spark_2k.log"
spark_df = read(spark_structured,spark_raw)

thunderbird_structured = "structured_data\Thunderbird_2k.log_structured.csv"
thunderbird_raw = "raw_data\Thunderbird_2k.log"
thunderbird_df = read(thunderbird_structured,thunderbird_raw)

windows_structured = "structured_data\Windows_2k.log_structured.csv"
windows_raw = "raw_data\Windows_2k.log"
windows_df = read(windows_structured,windows_raw)

zookeeper_structured = "structured_data\Zookeeper_2k.log_structured.csv"
zookeeper_raw = "raw_data\Zookeeper_2k.log"
zookeeper_df = read(zookeeper_structured,zookeeper_raw)

df = windows_df


In [7]:
df, label_columns, label_encoders = preprocess_df(df)

In [87]:
class LogDataset(Dataset):
    def __init__(self, logs, labels, tokenizer, max_len):
        self.logs = logs
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
    
    def __len__(self):
        return len(self.logs)
    
    def __getitem__(self, item):
        log = str(self.logs[item])
        labels = self.labels[item]
        
        encoding = self.tokenizer.encode_plus(
            log,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        
        return {
            'log_text': log,
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.FloatTensor(labels)
        }

In [88]:
logs = df['log'].tolist()
labels = df[label_columns].values.tolist()

In [89]:
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=len(labels[0]))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [90]:
dataset = LogDataset(logs, labels, tokenizer, max_len=128)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

In [91]:
optimizer = AdamW(model.parameters(), lr=2e-5)



In [92]:
num_epochs = 10
for epoch in range(num_epochs):
    for batch in dataloader:
        optimizer.zero_grad()
        inputs = {'input_ids': batch['input_ids'], 'attention_mask': batch['attention_mask'], 'labels': batch['labels']}
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

In [93]:
model.save_pretrained('./log_analysis_model')

In [94]:
def predict_log_details(log_text):
    inputs = tokenizer(log_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    outputs = model(**inputs)
    predictions = outputs.logits.sigmoid()
    raw_predictions = predictions.tolist()[0]

    # Interpret predictions
    interpreted_predictions = {}
    
    # LineId
    interpreted_predictions['LineId'] = round(raw_predictions[0])
    
    # Timestamp to DateTime
    timestamp = int(raw_predictions[1])
    interpreted_predictions['DateTime'] = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
    
    # Level
    level_encoded = round(raw_predictions[2])
    interpreted_predictions['Level'] = label_encoders['Level'].inverse_transform([level_encoded])[0]
    
    # Component
    component_encoded = round(raw_predictions[3])
    interpreted_predictions['Component'] = label_encoders['Component'].inverse_transform([component_encoded])[0]
    
    # EventId
    eventid_encoded = round(raw_predictions[4])
    interpreted_predictions['EventId'] = label_encoders['EventId'].inverse_transform([eventid_encoded])[0]
    
    # EventTemplate
    eventtemplate_encoded = round(raw_predictions[5])
    interpreted_predictions['EventTemplate'] = label_encoders['EventTemplate'].inverse_transform([eventtemplate_encoded])[0]

    return interpreted_predictions

In [95]:
new_log = "32,2016-09-28,04:30:31,Info,CBS,Warning: Unrecognized packageExtended attribute.,E50,Warning: Unrecognized packageExtended attribute."
predicted_details = predict_log_details(new_log)
print(predicted_details)

{'LineId': 1, 'DateTime': '1970-01-01 05:30:00', 'Level': 'Info', 'Component': 'CSI', 'EventId': 'E17', 'EventTemplate': '<*>@<*>/<*>/<*>:<*>:<*>:<*>.<*> WcpInitialize (wcp.dll version <*>) called (stack @<*>)'}
