In [25]:
import pandas as pd

# Build a smaller working dataset: keep every non-'normal' log and a
# reproducible random subset of the 'normal' logs, shuffle, and save to CSV.
N_NORMAL_SAMPLES = 15000  # upper bound on the number of 'normal' rows kept
RANDOM_SEED = 42          # shared by the sample and the shuffle for reproducibility

# Load your current cleaned dataset
df = pd.read_csv('../data/nova_logs_cleaned.csv')  # Your Stage 1 output

# Apply the sampling strategy
abnormal_logs = df[df['label'] != 'normal']
normal_pool = df[df['label'] == 'normal']
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at what is available.
normal_logs = normal_pool.sample(
    n=min(N_NORMAL_SAMPLES, len(normal_pool)),
    random_state=RANDOM_SEED,
)

# Combine for balanced, manageable dataset
df_manageable = pd.concat([abnormal_logs, normal_logs], ignore_index=True)
# frac=1 returns every row in random order, i.e. a full shuffle.
df_manageable = df_manageable.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

print(f"Final dataset: {len(df_manageable)} logs")
print(df_manageable['label'].value_counts())

# Save the manageable dataset for Stage 2 and beyond
df_manageable.to_csv('../data/nova_logs_manageable.csv', index=False)
print("Saved manageable dataset for clustering and downstream stages")


Final dataset: 54646 logs
label
mixed        17405
normal       15000
abnormal2    10473
abnormal1     8177
abnormal3     3591
Name: count, dtype: int64
Saved manageable dataset for clustering and downstream stages


In [26]:
# Read the manageable CSV back into `df` and preview its first five rows.
manageable_csv = '../data/nova_logs_manageable.csv'
df = pd.read_csv(manageable_csv)
df.head(n=5)

Unnamed: 0,log_id,raw_log_text,source_file,label
0,26204,WARNING oslo.service.loopingcall [-] Function ...,../data/openstack-nova-undefine-vm-after-creat...,abnormal2
1,20962,INFO nova.compute.manager [req-b9d6411c-b3ea-4...,../data/openstack-nova-sample.log,mixed
2,43989,ERROR nova.compute.manager [instance: c265f382...,../data/openstack-nova-undefine-vm-after-creat...,abnormal2
3,12752,INFO nova.compute.manager [req-ac9f5721-5c52-4...,../data/openstack-nova-sample.log,mixed
4,9605,INFO nova.compute.manager [None req-0f69838f-e...,../data/openstack-vm-destroy-immediately-after...,abnormal1


In [27]:
# Dimensions of `df` as a (rows, columns) tuple.
df.shape

(54646, 4)

In [28]:
# Column names of `df` as a plain Python list.
list(df.columns)


['log_id', 'raw_log_text', 'source_file', 'label']

In [29]:
# Data type pandas assigned to each column of `df`.
df.dtypes

log_id           int64
raw_log_text    object
source_file     object
label           object
dtype: object

In [30]:
# Number of missing values in each column of `df`.
df.isna().sum()

log_id          0
raw_log_text    0
source_file     0
label           0
dtype: int64

In [31]:
# Label distribution: absolute counts first, then the same breakdown in percent.
label_counts = df['label'].value_counts()
label_percentages = df['label'].value_counts(normalize=True) * 100

print(label_counts)
print(f"Label percentages:\n{label_percentages}")

label
mixed        17405
normal       15000
abnormal2    10473
abnormal1     8177
abnormal3     3591
Name: count, dtype: int64
Label percentages:
label
mixed        31.850456
normal       27.449402
abnormal2    19.165172
abnormal1    14.963584
abnormal3     6.571387
Name: proportion, dtype: float64


In [32]:
# Store the character length of every raw log line as a new column on `df`,
# then summarise that column.
df['text_length'] = df['raw_log_text'].str.len()

text_lengths = df['text_length']
print(f"Min length: {text_lengths.min()}")
print(f"Max length: {text_lengths.max()}")
print(f"Mean length: {text_lengths.mean():.2f}")
print(f"Median length: {text_lengths.median()}")


Min length: 4
Max length: 4658
Mean length: 184.81
Median length: 161.0


In [33]:
# For each label (in order of first appearance), print up to three example
# log lines, each cut to its first 100 characters.
for label in df['label'].unique():
    print(f"\n--- {label.upper()} LOGS (sample) ---")
    sample_logs = df.loc[df['label'] == label, 'raw_log_text'].iloc[:3]
    for position, log_text in enumerate(sample_logs, start=1):
        print(f"{position}. {log_text[:100]}...")


--- ABNORMAL2 LOGS (sample) ---
2. ERROR nova.compute.manager [instance: c265f382-e5d8-44fb-98c8-84abd4592037]     self.force_reraise()...
3. <entry name='serial'>f41265c7-0cc0-4212-8ab4-89626d362895</entry>...

--- MIXED LOGS (sample) ---
1. INFO nova.compute.manager [req-b9d6411c-b3ea-4307-a707-ec546b0192b3] [instance: 8192614e-4a86-47cc-a...
2. INFO nova.compute.manager [req-ac9f5721-5c52-4ec3-ba8a-e494d9780d53] [instance: fd750099-65bf-4119-b...
3. INFO nova.compute.manager [req-46123093-5f3c-4ff2-b9a3-b013b39f3f26] [instance: 26fa461c-54bc-4aaf-a...

--- ABNORMAL1 LOGS (sample) ---
1. INFO nova.compute.manager [None req-0f69838f-e506-4e79-9b37-85b2cba0ab35 None None] [instance: dc040...
2. INFO nova.compute.manager [None req-46123093-5f3c-4ff2-b9a3-b013b39f3f26 None None] [instance: 92de5...
3. INFO nova.compute.manager [None req-eb337492-dfed-408c-89f6-433726e82021 None None] [instance: 056fd...

--- NORMAL LOGS (sample) ---
1. INFO nova.compute.manager [None req-ac9f5721-5c52-4

In [34]:
# Compare the row count with the number of distinct log texts; the difference
# is reported as duplicates, both as a count and as a share of all rows.
log_texts = df['raw_log_text']
total_logs = len(log_texts)
unique_logs = log_texts.nunique()
duplicates = total_logs - unique_logs
duplicate_pct = duplicates / total_logs * 100

print(f"Total logs: {total_logs}")
print(f"Unique logs: {unique_logs}")
print(f"Duplicate logs: {duplicates} ({duplicate_pct:.2f}%)")


Total logs: 54646
Unique logs: 54646
Duplicate logs: 0 (0.00%)


In [35]:
# A log's prefix is its first three whitespace-separated tokens joined by
# single spaces; store it on `df` and list the ten most frequent prefixes.
leading_tokens = df['raw_log_text'].str.split().str[:3]
df['log_prefix'] = leading_tokens.str.join(' ')

common_prefixes = df['log_prefix'].value_counts().iloc[:10]
print("Most common log prefixes:")
print(common_prefixes)

Most common log prefixes:
log_prefix
INFO nova.compute.manager [None                                         10120
ERROR nova.compute.manager [instance:                                    3929
INFO nova.compute.manager [req-ac9f5721-5c52-4ec3-ba8a-e494d9780d53]     3886
INFO nova.virt.libvirt.driver [None                                      3058
INFO nova.compute.manager [-]                                            2473
INFO nova.virt.libvirt.driver [-]                                        2233
INFO os_vif [None                                                        2050
INFO nova.scheduler.client.report [None                                   961
INFO nova.compute.claims [None                                            954
INFO nova.compute.manager [req-d96524cb-6283-416d-95af-d47508fa2ec5]      596
Name: count, dtype: int64
