In [5]:
import json
import os
import random
from collections import defaultdict

import pandas as pd
import numpy as np

# Seed the global RNG once so the random.shuffle calls in the later cells are repeatable.
SEED = 123
random.seed(SEED)

In [6]:
# Fraction of each fault type's samples that goes to the train / test split.
train_ratio, test_ratio = 0.6, 0.4
# Compare with a tolerance: exact float equality can reject a valid pair of
# ratios purely because of binary rounding error.
assert abs(train_ratio + test_ratio - 1) < 1e-9, 'train_ratio and test_ratio must sum to 1'

In [7]:
# aiops_2022
# Build a per-fault-type train/test split from the daily ground-truth CSVs and
# write it to aiops_2022_split.json in TARGET_DIR.
DATA_DIR = '/root/lqh/multimodal-RCA/datasets/aiops_2022'
TARGET_DIR = '/root/lqh/multimodal-RCA/datasets/aiops_2022'
DATES = [
    "2022-03-20", "2022-03-21", "2022-03-22", "2022-03-24", "2022-03-26", "2022-03-28", "2022-03-29", "2022-03-30",
    "2022-03-31", "2022-04-01"
]

# Load one ground-truth CSV per date, dropping node-level faults and the one
# excluded failure type.
dfs = []
for date in sorted(DATES):
    df = pd.read_csv(os.path.join(DATA_DIR, f'groundtruth_csv/groundtruth-k8s-1-{date}.csv'))
    df = df[(df['level'] != 'node') & (df['failure_type'] != 'k8s容器网络资源包重复发送')]
    dfs.append(df)
label_df = pd.concat(dfs)

# Group the remaining faults by failure type, preserving row order so the
# shuffles below consume the RNG in the same sequence on every run.
faults_per_type = defaultdict(list)
for timestamp, cmdb_id, fault_type in zip(
        label_df['timestamp'], label_df['cmdb_id'], label_df['failure_type']):
    faults_per_type[fault_type].append({
        # Floor to a multiple of 60 (presumably seconds -> start of the minute;
        # confirm the unit). int() guarantees a built-in int, because json.dump
        # cannot serialise numpy integer scalars.
        'timestamp': int(timestamp) // 60 * 60,
        'cmdb_id': cmdb_id,
        'fault_type': fault_type
    })

# Split every fault type separately so both splits contain each type in roughly
# the configured proportion. random.shuffle uses the global RNG seeded in the
# first cell, so the split is only reproducible when the notebook is run top to
# bottom on a fresh kernel.
train_data = []
test_data = []
for faults in faults_per_type.values():
    random.shuffle(faults)
    # int() truncates, so any remainder goes to the test split.
    train_num = int(len(faults) * train_ratio)
    train_data.extend(faults[:train_num])
    test_data.extend(faults[train_num:])
data = {'train': train_data, 'test': test_data}
num_faults = len(train_data) + len(test_data)
# Fail with a readable message instead of a ZeroDivisionError in the summary below.
assert num_faults > 0, 'no faults loaded; check DATA_DIR, DATES and the row filters'
print(f'train: {len(train_data)}, {len(train_data) / num_faults:.3f}. test: {len(test_data)}, {len(test_data) / num_faults:.3f}.')

with open(os.path.join(TARGET_DIR, 'aiops_2022_split.json'), 'w') as fp:
    json.dump(data, fp, indent=2)

train: 120, 0.588. test: 84, 0.412.


In [8]:
# TrainTicket_2024
def pod_to_service(pod):
    """Return the pod name with its last two '-'-separated segments removed.

    Presumably those two segments are the generated suffixes of a Kubernetes
    pod name (confirm against the fault lists). A name with fewer than three
    segments yields an empty string.
    """
    segments = pod.split('-')
    service_segments = segments[:-2]
    return '-'.join(service_segments)

# Build a per-fault-type train/test split from the daily fault lists and write
# it to trainticket_2024_split.json in TARGET_DIR.
DATA_DIR = '/root/lqh/multimodal-RCA/datasets/TrainTicket_2024'
TARGET_DIR = '/root/lqh/multimodal-RCA/datasets/TrainTicket_2024'
DATES = ['2024-04-05', '2024-04-06', '2024-04-25']

# Group the injected faults of every date by injection type, preserving file
# order so the shuffles below consume the RNG in the same sequence on every run.
faults_per_type = defaultdict(list)
for date in DATES:
    # Read the JSON as UTF-8 explicitly instead of relying on the platform's
    # default text encoding.
    with open(os.path.join(DATA_DIR, f'{date}-fault_list.json'), 'r', encoding='utf-8') as fp:
        fault_list = json.load(fp)
    for fault in fault_list:
        faults_per_type[fault['inject_type']].append({
            'timestamp': fault['inject_timestamp'],
            # Faults are recorded per pod; the split stores the name derived
            # from it by pod_to_service.
            'cmdb_id': pod_to_service(fault['inject_pod']),
            'fault_type': fault['inject_type']
        })

# Split every fault type separately so both splits contain each type in roughly
# the configured proportion. random.shuffle uses the global RNG seeded in the
# first cell, so the split is only reproducible when the notebook is run top to
# bottom on a fresh kernel.
train_data = []
test_data = []
for faults in faults_per_type.values():
    random.shuffle(faults)
    # int() truncates, so any remainder goes to the test split.
    train_num = int(len(faults) * train_ratio)
    train_data.extend(faults[:train_num])
    test_data.extend(faults[train_num:])
data = {'train': train_data, 'test': test_data}
num_faults = len(train_data) + len(test_data)
# Fail with a readable message instead of a ZeroDivisionError in the summary below.
assert num_faults > 0, 'no faults loaded; check DATA_DIR and DATES'
print(f'train: {len(train_data)}, {len(train_data) / num_faults:.3f}. test: {len(test_data)}, {len(test_data) / num_faults:.3f}.')

with open(os.path.join(TARGET_DIR, 'trainticket_2024_split.json'), 'w') as fp:
    json.dump(data, fp, indent=2)

train: 45, 0.577. test: 33, 0.423.
