In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
import numpy as np
import random
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from sklearn.cluster import KMeans

# Select the compute device: use a CUDA GPU when one is available, else the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed every random number generator this notebook imports so that a
# Restart-and-Run-All gives repeatable splits, shuffles and weight inits.
# torch.manual_seed on its own leaves Python's `random` and NumPy unseeded.
random_state = 20
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)

# Release cached GPU memory that is no longer referenced by any tensor.
torch.cuda.empty_cache()

In [None]:
cd /home/whut4/liyafei/newtype

In [None]:
def concatenate_arrays(arr, window_size=20, embedding_dim=384):
    """Group consecutive embeddings into fixed-length, non-overlapping windows.

    Every run of ``window_size`` consecutive entries of ``arr`` is concatenated
    along axis 0 and reshaped to ``(window_size, embedding_dim)``. Trailing
    entries that do not fill a complete window are dropped.

    Args:
        arr: Sequence of array-likes, one per log entry. Each window's entries
            must together hold ``window_size * embedding_dim`` values.
        window_size: Number of consecutive entries per window (default 20).
        embedding_dim: Length of each entry's embedding (default 384).

    Returns:
        ``np.ndarray`` of shape ``(len(arr) // window_size, window_size,
        embedding_dim)``. When ``arr`` holds fewer than ``window_size`` entries
        the result is an empty array with that shape (first dimension 0)
        instead of an error.

    Raises:
        ValueError: If ``window_size`` is not positive, or if a window cannot
            be reshaped to ``(window_size, embedding_dim)``.
    """
    if window_size <= 0:
        raise ValueError("window_size must be a positive integer")

    num_windows = len(arr) // window_size  # only complete windows are kept

    if num_windows == 0:
        # np.stack rejects an empty list, so build the empty result directly,
        # keeping the element dtype when at least one entry is available.
        dtype = np.asarray(arr[0]).dtype if len(arr) else np.float64
        return np.empty((0, window_size, embedding_dim), dtype=dtype)

    windows = []
    for window_index in range(num_windows):
        start = window_index * window_size
        stop = start + window_size
        merged = np.concatenate(arr[start:stop], axis=0)
        # Normalise to one row per log entry regardless of whether the entries
        # were stored as (embedding_dim,) or (1, embedding_dim).
        windows.append(merged.reshape((window_size, embedding_dim)))

    return np.stack(windows)

In [None]:
def load_data(D1, portion):
    """Load the normal and abnormal log windows of one dataset as tensors.

    Reads ``<D1>/<D1>_normal.pickle`` and ``<D1>/<D1>_abnormal.pickle``
    (relative to the current working directory), groups each list of
    embeddings into windows with ``concatenate_arrays``, keeps the leading
    ``portion`` of the windows of each class and converts them to tensors.

    Args:
        D1: Dataset name; used both as the folder and as the file prefix.
        portion: Fraction of windows to keep from the start of each class
            (``1`` keeps everything).

    Returns:
        Tuple ``(normal_logs, abnormal_logs)`` of ``torch.Tensor``.

    Note:
        ``pickle.load`` can execute arbitrary code while unpickling; only point
        this function at files from a trusted source.
    """
    # Read both pickles first so a missing file fails before any processing.
    raw_logs = []
    for path_template in ("{}/{}_normal.pickle", "{}/{}_abnormal.pickle"):
        with open(path_template.format(D1, D1), "rb") as handle:
            raw_logs.append(pickle.load(handle))

    # Window the embeddings, then truncate each class to the requested share.
    windowed = [concatenate_arrays(logs) for logs in raw_logs]
    truncated = [windows[:int(len(windows) * portion)] for windows in windowed]

    normal_tensor, abnormal_tensor = [torch.tensor(kept) for kept in truncated]
    return normal_tensor, abnormal_tensor

In [None]:
def random_sample_and_remove(tensor, num):
    """Randomly split a tensor along dim 0 into a sample and the remainder.

    Args:
        tensor: Tensor whose first dimension indexes the items to sample from.
        num: Number of items to draw. If it is at least ``tensor.size(0)`` all
            items are drawn and the remainder is empty.

    Returns:
        Tuple ``(tensor_A, tensor_remaining)``: the ``num`` randomly chosen
        items and all other items, both in random order and on the same device
        as ``tensor``.
    """
    # Build the permutation on the tensor's own device: index_select requires
    # the index tensor to live on the same device as the tensor it indexes, so
    # a CPU permutation would fail for a tensor stored elsewhere.
    shuffled = torch.randperm(tensor.size(0), device=tensor.device)

    # The first `num` shuffled positions form the sample; the rest is what is left.
    picked, left_over = shuffled[:num], shuffled[num:]

    tensor_A = torch.index_select(tensor, 0, picked)
    tensor_remaining = torch.index_select(tensor, 0, left_over)
    return tensor_A, tensor_remaining

In [None]:
class LogDataset(Dataset):
    """Binary-labelled dataset built from normal and abnormal samples.

    The normal samples come first and are labelled 0; the abnormal samples
    follow and are labelled 1. Labels are ``torch.long``.
    """

    def __init__(self, normal_data, abnormal_data):
        """Stack both groups along dim 0 and create the matching labels."""
        self.data = torch.cat((normal_data, abnormal_data), dim=0)

        normal_count = normal_data.size(0)
        abnormal_count = abnormal_data.size(0)

        # Start from all-zero (normal) labels and mark the abnormal tail with 1.
        labels = torch.zeros(normal_count + abnormal_count, dtype=torch.long)
        labels[normal_count:] = 1
        self.labels = labels

    def __len__(self):
        """Return the total number of samples."""
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at ``idx``."""
        sample = self.data[idx]
        label = self.labels[idx]
        return sample, label

In [None]:
class LSTMClassifier(nn.Module):
    """Bidirectional multi-layer LSTM followed by a linear classification head.

    Args:
        input_size: Size of each element of the input sequence.
        hidden_size: Hidden size of each LSTM direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output classes.
        dropout: Dropout probability passed to ``nn.LSTM``.
    """

    def __init__(self, input_size=384, hidden_size=500, num_layers=2, num_classes=2, dropout=0.3):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )

        # Both directions are concatenated, so the head sees 2 * hidden_size features.
        self.fc = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Batch-first input, as configured by ``batch_first=True``.

        Returns:
            Tuple ``(out, features)``: the class scores and the LSTM output at
            the last sequence position that they were computed from.
        """
        sequence_output, _ = self.lstm(x)

        # The last time step of the LSTM output is used as the feature vector.
        last_step = sequence_output[:, -1, :]

        return self.fc(last_step), last_step

In [None]:
class BidirectionalTransformerClassifier(nn.Module):
    """Two Transformer encoders, one fed the sequence as-is and one reversed.

    The outputs of both encoders are concatenated per position and the last
    position is passed to a two-class linear head.

    Args:
        num_layers: Number of layers in each encoder.
        embedding_size: Feature size of the input (``d_model``).
        num_heads: Number of attention heads per layer.
        dropout_rate: Dropout probability used inside the encoder layers.

    NOTE(review): no positional encoding is added inside this class; confirm
    the input embeddings already carry order information if that matters.
    """

    def __init__(self, num_layers=2, embedding_size=384, num_heads=4, dropout_rate=0.2):
        super().__init__()

        self.embedding_size = embedding_size

        # Both directions use identically configured encoder layers.
        layer_config = dict(d_model=embedding_size, nhead=num_heads, dropout=dropout_rate)

        # Encoder applied to the sequence in its original order.
        self.forward_encoder_layer = nn.TransformerEncoderLayer(**layer_config)
        self.forward_transformer_encoder = nn.TransformerEncoder(
            self.forward_encoder_layer, num_layers=num_layers
        )

        # Encoder applied to the time-reversed sequence.
        self.backward_encoder_layer = nn.TransformerEncoderLayer(**layer_config)
        self.backward_transformer_encoder = nn.TransformerEncoder(
            self.backward_encoder_layer, num_layers=num_layers
        )

        # The head consumes the forward and backward features side by side.
        self.fc = nn.Linear(2 * embedding_size, 2)

    def forward(self, x):
        """Classify a batch of sequences.

        Args:
            x: Tensor of shape (batch_size, sequence_length, embedding_size).

        Returns:
            Tuple ``(logits, fea)``: the two-class scores and the concatenated
            forward/backward features of the last sequence position.
        """
        # The encoders are built without batch_first, so move the sequence axis first.
        seq_first = x.permute(1, 0, 2)

        fwd_encoded = self.forward_transformer_encoder(seq_first)

        # Reverse along the sequence axis for the second encoder, then undo the
        # reversal on its output so positions line up with the forward output.
        bwd_encoded = self.backward_transformer_encoder(seq_first.flip(0))
        merged = torch.cat((fwd_encoded, bwd_encoded.flip(0)), dim=-1)

        # Last sequence position -> (batch_size, 2 * embedding_size).
        fea = merged[-1]

        return self.fc(fea), fea

In [None]:
# Dataset pair for this run, written as '<first dataset>-><second dataset>'.
# Other pairs (swap the active assignment to use one of them):
#   'HDFS->Hadoop', 'Hadoop->HDFS'
#   'TB->BGL'
#   'TB->Spirit',   'Spirit->TB'
#   'BGL->Spirit',  'Spirit->BGL'
direction = 'BGL->TB'

# Number of samples of the second dataset assigned to its training split.
train_size_B = 100

# Names before (D1) and after (D2) the arrow; they select the data folders.
D1_name, D2_name = (direction.split('->')[side] for side in (0, 1))

In [None]:
# Load every window of the first dataset (portion = 1 keeps all of them).
D1_normal_logs, D1_abnormal_logs = load_data(D1_name, portion = 1)

# Build the labelled dataset (normal samples -> 0, abnormal samples -> 1).
dataset_A = LogDataset(D1_normal_logs, D1_abnormal_logs)

# Split into training and test sets (80% / 20%); the split is random.
train_size_A = int(0.8 * len(dataset_A))
test_size_A = len(dataset_A) - train_size_A
train_dataset_A, test_dataset_A = random_split(dataset_A, [train_size_A, test_size_A])

# Create the data loaders; only the training loader shuffles its samples.
batch_size = 32
train_loader_A = DataLoader(train_dataset_A, batch_size=batch_size, shuffle=True)
test_loader_A = DataLoader(test_dataset_A, batch_size=batch_size, shuffle=False)


# Same pipeline for the second dataset, except that the training split has a
# fixed size (train_size_B samples) instead of a fixed fraction.
D2_normal_logs, D2_abnormal_logs = load_data(D2_name, portion = 1)
dataset_B = LogDataset(D2_normal_logs, D2_abnormal_logs)
# NOTE(review): this becomes negative when dataset_B holds fewer than
# train_size_B samples; random_split is then expected to reject the lengths.
test_size_B = len(dataset_B) - train_size_B

train_dataset_B, test_dataset_B = random_split(dataset_B, [train_size_B, test_size_B])

train_loader_B = DataLoader(train_dataset_B, batch_size=batch_size, shuffle=True)
test_loader_B = DataLoader(test_dataset_B, batch_size=batch_size, shuffle=False)

In [None]:
# Build the transformer classifier with its default hyper-parameters, then
# move its parameters to the selected device (nn.Module.to returns the module).
model = BidirectionalTransformerClassifier()
model = model.to(device)