In [6]:
import pandas as pd
import numpy as np

# Read the prepared Nova log dataset (a CSV in the working directory).
df = pd.read_csv(filepath_or_buffer='nova_logs_manageable.csv')

In [7]:
# Column names of the loaded frame, shown as a plain Python list.
list(df.columns)

['log_id', 'raw_log_text', 'source_file', 'label']

In [8]:
from sentence_transformers import SentenceTransformer

# Instantiate the sentence-embedding model, announcing start and completion.
print("Loading sentence transformer model...")
model = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')
print("Model loaded successfully!")


Loading sentence transformer model...
Model loaded successfully!


In [9]:
# Pull the raw log messages out of the frame as a list of strings for the encoder.
log_texts = df.loc[:, 'raw_log_text'].astype(str).to_list()
print(f"Prepared {len(log_texts)} log texts for embedding")


Prepared 54646 log texts for embedding


In [10]:
# Encode every log line into a vector; on the full dataset this step is slow.
print("Computing embeddings... (this may take 3-5 minutes)")
embeddings = model.encode(
    sentences=log_texts,
    convert_to_numpy=True,
    show_progress_bar=True,
    batch_size=128,
)

print(f"Generated embeddings with shape: {embeddings.shape}")


Computing embeddings... (this may take 3-5 minutes)


Batches:   0%|          | 0/427 [00:00<?, ?it/s]

Generated embeddings with shape: (54646, 384)


In [15]:
# NOTE: this cell is disabled (every line is commented out). The output kept
# beneath it was produced by an earlier run in which the code was active.
# What it does when enabled: a grid search over DBSCAN's `eps` and
# `min_samples` (cosine metric) on `embeddings`, printing for each pair the
# number of clusters (label -1 is treated as noise and excluded from the
# count) and the number/percentage of noise points.
# Side effect when enabled: `cluster_labels` and `n_clusters` are overwritten
# on every iteration, so afterwards they hold the values of the last pair.
# # Try multiple parameter combinations to find optimal clustering
# from sklearn.cluster import DBSCAN
# import numpy as np

# eps_values = [0.1, 0.2, 0.3, 0.4]
# min_samples_values = [15, 20, 25, 30]

# print("Testing different DBSCAN parameters:")
# for eps in eps_values:
#     for min_samples in min_samples_values:
#         dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
#         cluster_labels = dbscan.fit_predict(embeddings)
        
#         n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
#         n_noise = list(cluster_labels).count(-1)
        
#         print(f"eps={eps}, min_samples={min_samples}: {n_clusters} clusters, {n_noise} noise ({n_noise/len(df)*100:.1f}%)")


Testing different DBSCAN parameters:
eps=0.1, min_samples=15: 61 clusters, 854 noise (1.6%)
eps=0.1, min_samples=20: 57 clusters, 929 noise (1.7%)
eps=0.1, min_samples=25: 56 clusters, 953 noise (1.7%)
eps=0.1, min_samples=30: 53 clusters, 1055 noise (1.9%)
eps=0.2, min_samples=15: 23 clusters, 525 noise (1.0%)
eps=0.2, min_samples=20: 23 clusters, 558 noise (1.0%)
eps=0.2, min_samples=25: 23 clusters, 560 noise (1.0%)
eps=0.2, min_samples=30: 20 clusters, 647 noise (1.2%)
eps=0.3, min_samples=15: 16 clusters, 309 noise (0.6%)
eps=0.3, min_samples=20: 15 clusters, 345 noise (0.6%)
eps=0.3, min_samples=25: 15 clusters, 347 noise (0.6%)
eps=0.3, min_samples=30: 14 clusters, 375 noise (0.7%)
eps=0.4, min_samples=15: 4 clusters, 222 noise (0.4%)
eps=0.4, min_samples=20: 4 clusters, 225 noise (0.4%)
eps=0.4, min_samples=25: 4 clusters, 226 noise (0.4%)
eps=0.4, min_samples=30: 4 clusters, 226 noise (0.4%)


In [16]:
# NOTE: this cell is disabled (every line is commented out); the output kept
# beneath it comes from an earlier run.
# When enabled it stores `cluster_labels` on the dataframe as `cluster_id` and
# prints the cluster count (excluding the noise label -1) and the noise count.
# NOTE(review): `cluster_labels` is not computed here; presumably it was left
# over from the final iteration of the DBSCAN sweep (eps=0.4, min_samples=30),
# which matches the retained "4 clusters / 226 noise" output — confirm before
# re-enabling.
# # Add the cluster assignments back to your dataframe
# df['cluster_id'] = cluster_labels

# # Basic cluster statistics
# n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
# n_noise = list(cluster_labels).count(-1)

# print(f"Found {n_clusters} clusters")
# print(f"Noise points: {n_noise}")


Found 4 clusters
Noise points: 226


In [17]:
# NOTE: this cell is disabled (every line is commented out); the output kept
# beneath it comes from an earlier run.
# When enabled it tallies how many logs carry each value in `cluster_labels`
# and prints the ten largest groups (the noise label -1 is tallied like any
# other label).
# import collections

# # Count logs per cluster
# cluster_counts = collections.Counter(cluster_labels)
# print("\nTop 10 clusters by size:")
# for cid, count in cluster_counts.most_common(10):
#     print(f"  Cluster {cid}: {count} logs")



Top 10 clusters by size:
  Cluster 0: 54203 logs
  Cluster -1: 226 logs
  Cluster 1: 93 logs
  Cluster 2: 93 logs
  Cluster 3: 31 logs


In [19]:
import collections

from sklearn.cluster import KMeans

# Cluster the embeddings with KMeans using a fixed, hand-picked cluster count.
n_clusters = 25

# Seeded so the assignment is repeatable across runs.
kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
kmeans.fit(embeddings)
cluster_labels = kmeans.labels_

# Record each log's cluster on the dataframe.
df['cluster_id'] = cluster_labels

# Report the ten most populous clusters.
cluster_counts = collections.Counter(cluster_labels)
print(f"KMeans created {n_clusters} clusters")
print("Top 10 clusters by size:")
top_clusters = cluster_counts.most_common(10)
for cid, count in top_clusters:
    print(f"  Cluster {cid}: {count} logs")


KMeans created 25 clusters
Top 10 clusters by size:
  Cluster 1: 5268 logs
  Cluster 8: 4507 logs
  Cluster 18: 4405 logs
  Cluster 4: 3643 logs
  Cluster 2: 3285 logs
  Cluster 3: 2967 logs
  Cluster 0: 2887 logs
  Cluster 11: 2693 logs
  Cluster 7: 2500 logs
  Cluster 5: 2467 logs


In [22]:
# List the size of every cluster, ordered by cluster id.
print(f"KMeans created {n_clusters} clusters")
print("Cluster sizes:")
for cid in sorted(cluster_counts):
    print(f"  Cluster {cid}: {cluster_counts[cid]} logs")


KMeans created 25 clusters
Cluster sizes:
  Cluster 0: 2887 logs
  Cluster 1: 5268 logs
  Cluster 2: 3285 logs
  Cluster 3: 2967 logs
  Cluster 4: 3643 logs
  Cluster 5: 2467 logs
  Cluster 6: 2462 logs
  Cluster 7: 2500 logs
  Cluster 8: 4507 logs
  Cluster 9: 1100 logs
  Cluster 10: 1658 logs
  Cluster 11: 2693 logs
  Cluster 12: 2241 logs
  Cluster 13: 937 logs
  Cluster 14: 1188 logs
  Cluster 15: 1941 logs
  Cluster 16: 807 logs
  Cluster 17: 1352 logs
  Cluster 18: 4405 logs
  Cluster 19: 1130 logs
  Cluster 20: 2021 logs
  Cluster 21: 1058 logs
  Cluster 22: 864 logs
  Cluster 23: 606 logs
  Cluster 24: 659 logs


In [23]:
# Inspect every KMeans cluster: print a few sample log lines and how the
# dataset's `label` values are distributed among the cluster's members.
import collections

print("=== CLUSTER CONTENT ANALYSIS ===")
# Iterate over all clusters via n_clusters (set in the KMeans cell) rather
# than a hard-coded 25, so this stays correct if the cluster count changes.
for cluster_id in range(n_clusters):
    cluster_logs = df[df['cluster_id'] == cluster_id]
    print(f"\n--- CLUSTER {cluster_id} (size: {len(cluster_logs)}) ---")

    # First three rows of the cluster, each truncated to 100 characters.
    # str() guards against non-string cells (e.g. NaN from an empty CSV
    # field), which cannot be sliced.
    sample_logs = cluster_logs['raw_log_text'].head(3)
    for i, log in enumerate(sample_logs, 1):
        print(f"{i}. {str(log)[:100]}...")

    # Label distribution in this cluster. to_dict() yields plain Python ints,
    # so the printout reads {'normal': 767, ...} instead of np.int64(767).
    label_dist = cluster_logs['label'].value_counts()
    print(f"Labels: {label_dist.to_dict()}")


=== CLUSTER CONTENT ANALYSIS ===

--- CLUSTER 0 (size: 2887) ---
1. INFO nova.compute.manager [req-b9d6411c-b3ea-4307-a707-ec546b0192b3] [instance: 8192614e-4a86-47cc-a...
2. INFO nova.compute.manager [None req-b98e753d-2fc9-45fa-b5ea-c9cb88685b4c admin admin] [instance: 6c3...
3. INFO nova.compute.manager [None req-c425b326-7176-44af-a1e2-3dcb26a94253 admin admin] [instance: b61...
Labels: {'mixed': np.int64(1139), 'normal': np.int64(767), 'abnormal1': np.int64(411), 'abnormal2': np.int64(371), 'abnormal3': np.int64(199)}

--- CLUSTER 1 (size: 5268) ---
1. INFO nova.virt.libvirt.driver [req-297e6cd3-84b7-43af-982e-af5be68422dd] [instance: 5897334a-6797-4c...
2. INFO nova.virt.libvirt.driver [req-65c799f8-e7f0-4c7f-ad2e-db39859caa15] [instance: 9f4a73a0-9f9c-44...
3. INFO nova.virt.libvirt.driver [req-b54a3755-1572-459d-82f9-4b7c57ae3e80] [instance: 8191edca-f070-40...
Labels: {'mixed': np.int64(2189), 'normal': np.int64(1558), 'abnormal1': np.int64(593), 'abnormal2': np.int64(528), 'a

In [24]:
# Persist the dataframe (now carrying the cluster_id column) without its index.
df.to_csv('../data/nova_logs_clustered.csv', index=False)

# Summarise the in-memory frame that was just written: shape, columns, and
# the number of logs per cluster ordered by cluster id.
print(f"Dataset shape: {df.shape}")
print(f"Columns: {list(df.columns)}")
print("Cluster distribution:")
logs_per_cluster = df['cluster_id'].value_counts().sort_index()
print(logs_per_cluster)


Dataset shape: (54646, 5)
Columns: ['log_id', 'raw_log_text', 'source_file', 'label', 'cluster_id']
Cluster distribution:
cluster_id
0     2887
1     5268
2     3285
3     2967
4     3643
5     2467
6     2462
7     2500
8     4507
9     1100
10    1658
11    2693
12    2241
13     937
14    1188
15    1941
16     807
17    1352
18    4405
19    1130
20    2021
21    1058
22     864
23     606
24     659
Name: count, dtype: int64
