In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import pickle
import numpy as np
import random
from sklearn.metrics import precision_recall_fscore_support
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn.functional as F
from sklearn.metrics import accuracy_score

# Pick the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Seed every random number generator this notebook imports, not only torch:
# `random` is used by generate_two_digit_decimal and NumPy is imported too,
# so seeding torch alone leaves part of the run non-reproducible.
random_state = 20
random.seed(random_state)
np.random.seed(random_state)
torch.manual_seed(random_state)

# Release cached GPU memory left over from earlier runs in the same kernel.
torch.cuda.empty_cache()

In [None]:
cd /home/whut4/liyafei/newtype

In [None]:
def concatenate_arrays(arr, group_size=20, feature_dim=384):
    """Group consecutive arrays into fixed-size windows.

    Every run of ``group_size`` consecutive entries of ``arr`` is concatenated
    along axis 0 and reshaped to ``(group_size, feature_dim)``. Trailing
    entries that do not fill a complete window are dropped.

    Args:
        arr: Sequence of array-likes; each window of ``group_size`` entries
            must hold exactly ``group_size * feature_dim`` values in total.
        group_size: Number of consecutive entries per window (default 20).
        feature_dim: Width of each row in a window (default 384).

    Returns:
        ``numpy.ndarray`` of shape ``(len(arr) // group_size, group_size,
        feature_dim)``. When there are not enough entries for a single window
        an empty array with that shape is returned instead of raising.

    Raises:
        ValueError: If ``group_size`` or ``feature_dim`` is not positive, or
            if a window cannot be reshaped to the requested size.
    """
    if group_size <= 0 or feature_dim <= 0:
        raise ValueError("group_size and feature_dim must be positive")

    num_groups = len(arr) // group_size  # number of complete windows

    if num_groups == 0:
        # np.stack([]) raises, so build the empty result explicitly; reuse the
        # element dtype when there is at least one element to read it from.
        dtype = np.asarray(arr[0]).dtype if len(arr) else None
        return np.empty((0, group_size, feature_dim), dtype=dtype)

    windows = []
    for group in range(num_groups):
        start = group * group_size
        merged = np.concatenate(arr[start:start + group_size], axis=0)
        windows.append(merged.reshape((group_size, feature_dim)))

    return np.stack(windows)

In [None]:
def load_data(D1, portion):
    """Load the normal and abnormal log windows of one dataset as tensors.

    Reads ``<D1>/<D1>_normal.pickle`` and ``<D1>/<D1>_abnormal.pickle``
    relative to the current working directory, groups each with
    ``concatenate_arrays`` and keeps the leading ``portion`` of the windows.

    Args:
        D1: Dataset name; used both as directory and as file-name prefix.
        portion: Fraction of windows to keep, counted from the start.

    Returns:
        Tuple ``(normal_logs, abnormal_logs)`` of ``torch.Tensor``.
    """
    path_templates = ("{}/{}_normal.pickle", "{}/{}_abnormal.pickle")

    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # trusted files.
    raw_logs = []
    for template in path_templates:
        with open(template.format(D1, D1), "rb") as handle:
            raw_logs.append(pickle.load(handle))

    grouped = [concatenate_arrays(logs) for logs in raw_logs]
    kept = [windows[:int(len(windows) * portion)] for windows in grouped]
    normal_tensor, abnormal_tensor = (torch.tensor(windows) for windows in kept)

    return normal_tensor, abnormal_tensor

In [None]:
def random_sample_and_remove(tensor, num):
    """Randomly split ``tensor`` along dim 0 into a sample and the remainder.

    Args:
        tensor: Tensor whose first dimension indexes the items to split.
        num: Number of items to draw for the sample.

    Returns:
        Tuple ``(sampled, remaining)``: the rows at the first ``num`` positions
        of a random permutation, and the rows at all later positions.
    """
    # One permutation drives both halves, so the two outputs never overlap.
    shuffled = torch.randperm(tensor.size(0))
    picked, leftover = shuffled[:num], shuffled[num:]

    sampled = tensor.index_select(0, picked)
    remaining = tensor.index_select(0, leftover)
    return sampled, remaining

In [None]:
class LogDataset(Dataset):
    """Dataset of log windows labelled 0 (normal) or 1 (abnormal).

    Normal samples come first in ``data``, followed by the abnormal samples;
    ``labels`` is an int64 tensor aligned with ``data`` along dim 0.
    """

    def __init__(self, normal_data, abnormal_data):
        """Concatenate both groups along dim 0 and build the matching labels."""
        normal_count = normal_data.size(0)
        abnormal_count = abnormal_data.size(0)

        self.data = torch.cat([normal_data, abnormal_data], dim=0)

        # Start from all zeros (normal) and mark the abnormal tail with ones.
        labels = torch.zeros(normal_count + abnormal_count, dtype=torch.long)
        labels[normal_count:] = 1
        self.labels = labels

    def __len__(self):
        """Return the total number of samples."""
        return self.data.size(0)

    def __getitem__(self, idx):
        """Return the ``(sample, label)`` pair stored at ``idx``."""
        sample = self.data[idx]
        label = self.labels[idx]
        return sample, label

In [None]:
class LSTMClassifier(nn.Module):
    """Bidirectional LSTM encoder followed by a linear classification head.

    Args:
        input_size: Feature width of every time step.
        hidden_size: Hidden units per LSTM direction.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.
        dropout: Dropout passed to ``nn.LSTM`` (applied between layers).
    """

    def __init__(self, input_size=384, hidden_size=500, num_layers=2, num_classes=2, dropout=0.3):
        super().__init__()

        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )

        # Both directions are concatenated, so the head sees 2 * hidden_size.
        self.fc = nn.Linear(2 * hidden_size, num_classes)

    def forward(self, x):
        """Encode ``x`` (batch-first) and classify it.

        Returns:
            Tuple ``(logits, features)`` where ``features`` is the LSTM output
            at the final time step and ``logits`` is the head applied to it.
        """
        sequence_output, _ = self.lstm(x)

        # The representation of the last time step doubles as the feature
        # vector that the training code compares across domains.
        last_step = sequence_output[:, -1]
        logits = self.fc(last_step)

        return logits, last_step

In [None]:
def _paired_mse(features_a, features_b):
    """MSE over the first ``min(len)`` rows of both inputs, or None if no rows pair up."""
    pair_count = min(features_a.size(0), features_b.size(0))
    if pair_count == 0:
        return None
    return F.mse_loss(features_a[:pair_count], features_b[:pair_count])


def feature_distance(features1, features2, labels_D1, labels_D2):
    """Measure how far apart two batches of features are.

    Features of each class (label 0 = normal, label 1 = abnormal) are paired
    across the two batches by position, truncated to the smaller count, and
    compared with mean squared error.

    A class with no sample in either batch is left out of the average.
    Calling ``F.mse_loss`` on empty tensors yields NaN, which would otherwise
    turn the whole training loss (and then the weights) into NaN.

    Args:
        features1: Feature matrix of the first batch, one row per sample.
        features2: Feature matrix of the second batch, one row per sample.
        labels_D1: Labels aligned with ``features1``.
        labels_D2: Labels aligned with ``features2``.

    Returns:
        Tuple ``(average_distance, total_features_distance)``:
        the mean of the per-class MSEs over the classes that could be paired
        (zero if none could), and the MSE between the two batches as a whole,
        truncated to the smaller batch (zero if either batch is empty).
    """
    zero = features1.new_zeros(())

    class_distances = []
    for class_id in (0, 1):
        distance = _paired_mse(features1[labels_D1 == class_id],
                               features2[labels_D2 == class_id])
        if distance is not None:
            class_distances.append(distance)

    if class_distances:
        # With both classes present this is (normal + abnormal) / 2, as before.
        average_distance = sum(class_distances) / len(class_distances)
    else:
        average_distance = zero

    total_features_distance = _paired_mse(features1, features2)
    if total_features_distance is None:
        total_features_distance = zero

    return average_distance, total_features_distance

In [None]:
def feature_distance2(features1, features2, labels_D1, labels_D2):
    """Broadcast-based variant of the class-wise feature distance.

    For label 0 (normal) and label 1 (abnormal) the rows selected from each
    batch are compared with mean squared error, and the two results are
    averaged. The whole batches are compared as well.

    NOTE(review): nothing is truncated here. The label vectors and the
    feature matrices go through ``torch.broadcast_tensors``, which requires
    broadcast-compatible shapes, and the per-class selections are handed to
    ``F.mse_loss`` as they are -- so this presumably expects both batches to
    contain matching numbers of samples per class; confirm before using it.

    Returns:
        Tuple ``(average_distance, total_features_distance)``.
    """
    class_distances = []
    for class_id in (0, 1):  # 0 = normal, 1 = abnormal
        mask_first, mask_second = torch.broadcast_tensors(labels_D1 == class_id,
                                                          labels_D2 == class_id)
        class_distances.append(F.mse_loss(features1[mask_first], features2[mask_second]))

    normal_distance, abnormal_distance = class_distances

    # Distance between the complete feature batches.
    expanded_first, expanded_second = torch.broadcast_tensors(features1, features2)
    total_features_distance = F.mse_loss(expanded_first, expanded_second)

    # The two class-wise terms are weighted equally.
    average_distance = (normal_distance + abnormal_distance) / 2.0

    return average_distance, total_features_distance

In [None]:
def train_model_source(model, train_loader, num_epochs, learning_rate, patience, factor, froze = False):
    """Train ``model`` with cross-entropy on a single labelled loader.

    Args:
        model: Module whose forward returns ``(logits, features)`` and that
            exposes its classification head as ``model.fc``.
        train_loader: Iterable of ``(logs, labels)`` batches.
        num_epochs: Number of passes over ``train_loader``.
        learning_rate: Initial Adam learning rate.
        patience: Epochs without improvement before the rate is reduced.
        factor: Multiplier applied to the learning rate on each reduction.
        froze: When True, ``model.fc`` is frozen (and stays frozen afterwards).

    Side effects:
        Updates the model in place and prints progress.
    """
    criterion = nn.CrossEntropyLoss()

    # Freeze first so the optimizer is only handed parameters it may update.
    if froze:
        for param in model.fc.parameters():
            param.requires_grad = False

    trainable_params = [param for param in model.parameters() if param.requires_grad]
    optimizer = optim.Adam(trainable_params, lr=learning_rate)

    # `verbose=` is deprecated for schedulers in recent PyTorch releases, so
    # learning-rate reductions are reported by hand further down.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=factor, patience=patience)

    for epoch in range(num_epochs):
        model.train()

        epoch_loss = 0.0
        batches_seen = 0
        for i, (logs, labels) in enumerate(train_loader):
            optimizer.zero_grad()

            logs = logs.to(device)
            outputs_D1, features_D1 = model(logs)

            labels = labels.to(device)
            loss = criterion(outputs_D1, labels)

            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
            batches_seen += 1

            if (i+1) % 50 == 0:
                print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.4f}')

        if batches_seen == 0:
            # Nothing to average; avoid dividing by zero and stop.
            print('Training loader produced no batches; stopping early.')
            break

        # Average loss for the epoch
        epoch_loss /= batches_seen

        # Step the scheduler on the epoch average and report any reduction.
        previous_lr = optimizer.param_groups[0]['lr']
        scheduler.step(epoch_loss)
        current_lr = optimizer.param_groups[0]['lr']
        if current_lr < previous_lr:
            print(f'Epoch [{epoch+1}/{num_epochs}], reducing learning rate to {current_lr:.4e}')

    print('Finished Training')

In [None]:
def train_model(model, train_loader_D1, train_loader_D2, num_epochs, learning_rate, patience, factor, froze, alpha=0.5, beta = 0.5):
    """Train ``model`` on D1 labels while comparing its features with D2.

    The two loaders are iterated in lockstep (stopping at the shorter one).
    For every pair of batches the loss is::

        alpha * CE(D1) + (1 - alpha) * class_wise_distance - beta * overall_distance

    where both distances come from ``feature_distance``. The overall distance
    enters with a minus sign, so minimising the loss pushes it up.

    Args:
        model: Module whose forward returns ``(logits, features)`` and that
            exposes its classification head as ``model.fc``.
        train_loader_D1: Loader whose labels drive the classification loss.
        train_loader_D2: Loader used only for the feature-distance terms.
        num_epochs: Number of passes over the zipped loaders.
        learning_rate: Initial Adam learning rate.
        patience: Epochs without improvement before the rate is reduced.
        factor: Multiplier applied to the learning rate on each reduction.
        froze: When True, ``model.fc`` is frozen (and stays frozen afterwards).
        alpha: Weight of the classification loss; ``1 - alpha`` weights the
            class-wise distance.
        beta: Weight of the overall feature distance.

    Side effects:
        Updates the model in place and prints progress.
    """
    criterion = nn.CrossEntropyLoss()

    # Freeze first so the optimizer is only handed parameters it may update.
    if froze:
        for param in model.fc.parameters():
            param.requires_grad = False

    trainable_params = [param for param in model.parameters() if param.requires_grad]
    optimizer = optim.Adam(trainable_params, lr=learning_rate)

    # `verbose=` is deprecated for schedulers in recent PyTorch releases, so
    # learning-rate reductions are reported by hand further down.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=factor, patience=patience)

    for epoch in range(num_epochs):
        model.train()
        epoch_loss = 0.0
        batches_seen = 0
        batches_used = 0

        for (logs_D1, labels_D1), (logs_D2, labels_D2) in zip(train_loader_D1, train_loader_D2):
            batches_seen += 1
            optimizer.zero_grad()

            logs_D1, logs_D2 = logs_D1.to(device), logs_D2.to(device)
            outputs_D1, features_D1 = model(logs_D1)
            # Only the D2 features are needed; its logits never enter the loss.
            _, features_D2 = model(logs_D2)

            labels_D1, labels_D2 = labels_D1.to(device), labels_D2.to(device)
            loss_classification_D1 = criterion(outputs_D1, labels_D1)

            # Calculate the distance between features of D1 and D2
            loss_distance, entire_distance = feature_distance(features_D1, features_D2, labels_D1, labels_D2)

            # Total loss is a combination of classification loss and feature distance
            total_loss = alpha * loss_classification_D1 + (1 - alpha) * loss_distance - beta * entire_distance

            # A NaN/inf loss would overwrite every weight with NaN on the
            # optimizer step; drop such a batch instead.
            if not torch.isfinite(total_loss):
                continue

            total_loss.backward()
            optimizer.step()

            epoch_loss += total_loss.item()
            batches_used += 1

        if batches_seen == 0:
            print('Training loaders produced no batches; stopping early.')
            break
        if batches_used == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], every batch had a non-finite loss; nothing was updated.')
            continue
        if batches_used < batches_seen:
            print(f'Epoch [{epoch+1}/{num_epochs}], skipped {batches_seen - batches_used} batch(es) with a non-finite loss.')

        # zip() stops at the shorter loader, so average over the batches that
        # actually contributed rather than over len(train_loader_D1).
        epoch_loss /= batches_used

        previous_lr = optimizer.param_groups[0]['lr']
        scheduler.step(epoch_loss)
        current_lr = optimizer.param_groups[0]['lr']
        if current_lr < previous_lr:
            print(f'Epoch [{epoch+1}/{num_epochs}], reducing learning rate to {current_lr:.4e}')

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss:.4f}')

    print('Finished Training')

In [None]:
def test_model(model, test_loader, dataset_name):
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for logs, labels in test_loader:
            logs = logs.to(device)
            outputs, _ = model(logs)
            _, predicted = torch.max(outputs, 1)
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    precision, recall, f1, _ = precision_recall_fscore_support(all_labels, all_preds, average='binary')
    accuracy = accuracy_score(all_labels, all_preds)  # compute accuracy

    print(f"\nResults for {dataset_name}:")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"Accuracy: {accuracy:.4f}\n")  # display accuracy

    return precision, recall, f1, accuracy  # return accuracy as well

In [None]:
def generate_two_digit_decimal():
    """Return a random float drawn from [0, 1] and rounded to two decimals."""
    sample = random.uniform(0, 1)
    return round(sample, 2)

In [None]:
# Transfer direction, written as '<source>-><target>'.
# Other pairs that have been used with this notebook:
#   'Hadoop->HDFS'
#   'BGL->TB',     'TB->BGL'
#   'TB->Spirit',  'Spirit->TB'
#   'BGL->Spirit', 'Spirit->BGL'
direction = 'HDFS->Hadoop'

# Sample budget for the target-domain training set built further down.
train_size_B = 50

# Source (D1) and target (D2) dataset names taken from the direction string.
D1_name, D2_name = direction.split('->')[:2]

In [None]:
# ---- Source domain (D1): full dataset, random 80/20 train/test split ----
D1_normal_logs, D1_abnormal_logs = load_data(D1_name, portion = 1)

# Build the dataset
dataset_A = LogDataset(D1_normal_logs, D1_abnormal_logs)

# Split into training and test sets
train_size_A = int(0.8 * len(dataset_A))
test_size_A = len(dataset_A) - train_size_A
train_dataset_A, test_dataset_A = random_split(dataset_A, [train_size_A, test_size_A])

# Create the data loaders
batch_size = 32
train_loader_A = DataLoader(train_dataset_A, batch_size=batch_size, shuffle=True)
test_loader_A = DataLoader(test_dataset_A, batch_size=batch_size, shuffle=False)

# ---- Target domain (D2): small labelled training sample, rest is the test set ----
D2_normal_logs, D2_abnormal_logs = load_data(D2_name, portion = 1)

# Draw train_size_B abnormal windows but only a tenth as many normal ones;
# everything that is not drawn becomes test data.
D2_normal_train_logs, D2_normal_test_logs = random_sample_and_remove(D2_normal_logs, int(train_size_B / 10))
D2_abnormal_train_logs, D2_abnormal_test_logs = random_sample_and_remove(D2_abnormal_logs, train_size_B)

train_dataset_B = LogDataset(D2_normal_train_logs, D2_abnormal_train_logs)
test_dataset_B = LogDataset(D2_normal_test_logs, D2_abnormal_test_logs)

train_loader_B = DataLoader(train_dataset_B, batch_size=batch_size, shuffle=True)
test_loader_B = DataLoader(test_dataset_B, batch_size=batch_size, shuffle=False)

In [None]:
# Initialize the model
# These two settings are now actually handed to the constructor; before they
# were assigned but ignored, so editing them had no effect. The remaining
# hyper-parameters keep the class defaults.
input_size = 384  # must match the row width produced by concatenate_arrays
num_layers = 2
model = LSTMClassifier(input_size=input_size, num_layers=num_layers).to(device)

In [None]:
# Restart from the saved checkpoint. map_location moves the stored tensors to
# the active device, so a checkpoint written on a GPU also loads on a
# CPU-only machine.
model.load_state_dict(torch.load(f'{direction}_cdd.pth', map_location=device))

# Fine-tune with the head frozen: the target loader supplies the labels, the
# source loader supplies the features for the distance terms.
train_model(model, train_loader_B, train_loader_A, num_epochs=500, learning_rate=0.001, patience=50, factor=0.5, froze=True, alpha=0.3, beta = 0.3)
test_model(model, test_loader_B, dataset_name=D2_name)

In [None]:
# Restart from the same checkpoint so this run does not build on the previous
# one. map_location moves the stored tensors to the active device, so a
# checkpoint written on a GPU also loads on a CPU-only machine.
model.load_state_dict(torch.load(f'{direction}_cdd.pth', map_location=device))

# Comparison run: with alpha=1 and beta=0 both distance terms of the loss are
# multiplied by zero, leaving the classification loss on its own.
train_model(model, train_loader_B, train_loader_A, num_epochs=500, learning_rate=0.001, patience=50, factor=0.5, froze=True, alpha=1, beta = 0)
test_model(model, test_loader_B, dataset_name=D2_name)