In [34]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the cleaned Nova log CSV into `df`, the frame the later cells operate on.
df = pd.read_csv(filepath_or_buffer='../data/nova_logs_cleaned.csv')

In [35]:
# Preview the first ten rows to sanity-check the loaded columns.
df.head(n=10)

Unnamed: 0,log_id,raw_log_text,source_file,label
0,0,INFO nova.compute.claims [None req-a4498d64-47...,../data/openstack-nova-normal-vm-create.log,normal
1,1,INFO nova.virt.libvirt.driver [None req-a4498d...,../data/openstack-nova-normal-vm-create.log,normal
2,2,INFO os_vif [None req-a4498d64-47bb-491f-adde-...,../data/openstack-nova-normal-vm-create.log,normal
3,3,INFO nova.compute.manager [None req-ac9f5721-5...,../data/openstack-nova-normal-vm-create.log,normal
4,4,INFO nova.compute.manager [None req-ac9f5721-5...,../data/openstack-nova-normal-vm-create.log,normal
5,5,INFO nova.compute.manager [None req-ac9f5721-5...,../data/openstack-nova-normal-vm-create.log,normal
6,6,INFO nova.compute.manager [None req-ac9f5721-5...,../data/openstack-nova-normal-vm-create.log,normal
7,7,INFO nova.virt.libvirt.driver [-] [instance: 1...,../data/openstack-nova-normal-vm-create.log,normal
8,8,INFO nova.compute.manager [None req-a4498d64-4...,../data/openstack-nova-normal-vm-create.log,normal
9,9,INFO nova.compute.claims [None req-e665dbf5-dc...,../data/openstack-nova-normal-vm-create.log,normal


In [36]:
# Report the (rows, columns) shape and the column names, one per output line.
print(
    f"Dataset shape: {df.shape}",
    f"Columns: {df.columns.tolist()}",
    sep="\n",
)


Dataset shape: (137540, 4)
Columns: ['log_id', 'raw_log_text', 'source_file', 'label']


In [37]:
# Show per-column dtypes, then the total footprint of the frame.
# deep=True asks pandas to measure the object (string) columns' contents too;
# dividing by 1024**2 converts the byte total to the "MB" figure printed.
print(
    f"Data types:\n{df.dtypes}",
    f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB",
    sep="\n",
)

Data types:
log_id           int64
raw_log_text    object
source_file     object
label           object
dtype: object
Memory usage: 51.82 MB


In [38]:
# Count missing values per column (isna is the same check as isnull).
print(df.isna().sum())

log_id          0
raw_log_text    0
source_file     0
label           0
dtype: int64


In [39]:
# Class balance: absolute row counts per label, then the same distribution
# expressed as a percentage of all rows.
label_counts = df['label'].value_counts()
print(
    label_counts,
    f"Label percentages:\n{df['label'].value_counts(normalize=True) * 100}",
    sep="\n",
)

label
normal       97894
mixed        17405
abnormal2    10473
abnormal1     8177
abnormal3     3591
Name: count, dtype: int64
Label percentages:
label
normal       71.174931
mixed        12.654501
abnormal2     7.614512
abnormal1     5.945180
abnormal3     2.610877
Name: proportion, dtype: float64


In [40]:
# Add a per-row character count of the raw log text, then summarise it.
# NOTE: this adds a 'text_length' column to `df` in place.
df['text_length'] = df['raw_log_text'].str.len()
print(
    f"Min length: {df['text_length'].min()}",
    f"Max length: {df['text_length'].max()}",
    f"Mean length: {df['text_length'].mean():.2f}",
    f"Median length: {df['text_length'].median()}",
    sep="\n",
)


Min length: 4
Max length: 4658
Mean length: 190.78
Median length: 162.0


In [41]:
# Print the first three log lines of every label as a quick qualitative check.
# Each line is previewed at up to 100 characters.
for label in df['label'].unique():
    print(f"\n--- {label.upper()} LOGS (sample) ---")
    # Single .loc lookup (row mask + column) instead of chained indexing.
    sample_logs = df.loc[df['label'] == label, 'raw_log_text'].head(3)
    for i, log in enumerate(sample_logs, 1):
        # Only mark the preview as truncated when text was actually cut off;
        # short messages are shown in full without a misleading "...".
        suffix = "..." if len(log) > 100 else ""
        print(f"{i}. {log[:100]}{suffix}")


--- NORMAL LOGS (sample) ---
1. INFO nova.compute.claims [None req-a4498d64-47bb-491f-adde-effccaba43f0 admin admin] [instance: 1176...
2. INFO nova.virt.libvirt.driver [None req-a4498d64-47bb-491f-adde-effccaba43f0 admin admin] [instance:...
3. INFO os_vif [None req-a4498d64-47bb-491f-adde-effccaba43f0 admin admin] Successfully plugged vif VIF...

--- ABNORMAL1 LOGS (sample) ---
1. INFO nova.compute.claims [None req-e47da3d5-a6e4-42de-a382-c9005209a73f admin admin] [instance: afcf...
2. INFO nova.virt.libvirt.driver [None req-e47da3d5-a6e4-42de-a382-c9005209a73f admin admin] [instance:...
3. ERROR nova.virt.libvirt.imagebackend [None req-e47da3d5-a6e4-42de-a382-c9005209a73f admin admin] /op...

--- ABNORMAL2 LOGS (sample) ---
1. INFO nova.compute.claims [req-569df606-8ce5-4227-a7a3-8a47e476aad8] [instance: 11e8dd0e-df7c-44bc-84...
2. INFO nova.virt.libvirt.driver [req-569df606-8ce5-4227-a7a3-8a47e476aad8] [instance: 11e8dd0e-df7c-44...
3. INFO os_vif [req-569df606-8ce5-4227-a7a3-8a47

In [42]:
# Compare the row count with the number of distinct messages; the gap is the
# number of repeated log lines. (nunique skips missing values by default.)
total_logs = df.shape[0]
unique_logs = df['raw_log_text'].nunique()
duplicates = total_logs - unique_logs

print(f"Total logs: {total_logs}")
print(f"Unique logs: {unique_logs}")
print(f"Duplicate logs: {duplicates} ({duplicates/total_logs*100:.2f}%)")


Total logs: 137540
Unique logs: 137540
Duplicate logs: 0 (0.00%)


In [43]:
# Derive a coarse "prefix" for each log line: its first three
# whitespace-separated tokens, re-joined with single spaces.
# NOTE: this adds a 'log_prefix' column to `df` in place.
# n=3 stops splitting after the third separator, so long messages are not
# tokenised in full; the first three tokens are the same as with a full split
# (any remainder lands in a fourth element that the [:3] slice discards).
df['log_prefix'] = df['raw_log_text'].str.split(n=3).str[:3].str.join(' ')

# Ten most frequent prefixes across the whole dataset.
common_prefixes = df['log_prefix'].value_counts().head(10)
print("Most common log prefixes:")
print(common_prefixes)

Most common log prefixes:
log_prefix
INFO nova.compute.manager [None                                         43361
INFO nova.virt.libvirt.driver [None                                     15581
INFO nova.compute.manager [-]                                           11254
INFO nova.virt.libvirt.driver [-]                                       10593
INFO os_vif [None                                                       10444
INFO nova.compute.claims [None                                           5141
INFO nova.scheduler.client.report [None                                  5134
ERROR nova.compute.manager [instance:                                    3929
INFO nova.compute.manager [req-ac9f5721-5c52-4ec3-ba8a-e494d9780d53]     3886
INFO nova.compute.manager [req-d96524cb-6283-416d-95af-d47508fa2ec5]      596
Name: count, dtype: int64


In [44]:
# Confirm the columns now present, including the derived 'text_length' and
# 'log_prefix' columns added by earlier cells.
df.columns

Index(['log_id', 'raw_log_text', 'source_file', 'label', 'text_length',
       'log_prefix'],
      dtype='object')

In [45]:
# Preview the first five rows, now including the derived columns.
df.head(n=5)

Unnamed: 0,log_id,raw_log_text,source_file,label,text_length,log_prefix
0,0,INFO nova.compute.claims [None req-a4498d64-47...,../data/openstack-nova-normal-vm-create.log,normal,171,INFO nova.compute.claims [None
1,1,INFO nova.virt.libvirt.driver [None req-a4498d...,../data/openstack-nova-normal-vm-create.log,normal,153,INFO nova.virt.libvirt.driver [None
2,2,INFO os_vif [None req-a4498d64-47bb-491f-adde-...,../data/openstack-nova-normal-vm-create.log,normal,396,INFO os_vif [None
3,3,INFO nova.compute.manager [None req-ac9f5721-5...,../data/openstack-nova-normal-vm-create.log,normal,161,INFO nova.compute.manager [None
4,4,INFO nova.compute.manager [None req-ac9f5721-5...,../data/openstack-nova-normal-vm-create.log,normal,160,INFO nova.compute.manager [None


In [46]:
# Display the derived prefix column (first three tokens of each log line).
df['log_prefix']

0                            INFO nova.compute.claims [None
1                       INFO nova.virt.libvirt.driver [None
2                                         INFO os_vif [None
3                           INFO nova.compute.manager [None
4                           INFO nova.compute.manager [None
                                ...                        
137537    INFO nova.compute.manager [req-1e7f48d9-ddf5-4...
137538    INFO nova.compute.manager [req-b4025695-7c8a-4...
Name: log_prefix, Length: 137540, dtype: object

In [47]:
# Persist the frame, including the derived 'text_length' and 'log_prefix'
# columns, without writing the row index as an extra column.
df.to_csv(path_or_buf='../data/nova_logs_refined.csv', index=False)