In [19]:
import pandas as pd

In [20]:
df = pd.read_csv("dataset/synthetic_logs.csv")

In [21]:
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert


In [22]:
df.source.unique()

array(['ModernCRM', 'AnalyticsEngine', 'ModernHR', 'BillingSystem',
       'ThirdPartyAPI', 'LegacyCRM'], dtype=object)

In [23]:
df.target_label.unique()

array(['HTTP Status', 'Critical Error', 'Security Alert', 'Error',
       'System Notification', 'Resource Usage', 'User Action',

In [24]:
df.complexity.unique()

array(['bert', 'regex', 'llm'], dtype=object)

In [25]:
from sentence_transformers import SentenceTransformer
from sklearn.cluster import DBSCAN

In [26]:
# loading pre trained model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [27]:
# generate embeddings for log messages
embeddings = model.encode(df['log_message'].tolist())

In [28]:
# Cluster the message embeddings with DBSCAN, comparing vectors by cosine distance.
# eps=0.2 is the largest cosine distance at which two messages count as neighbours.
# min_samples=1 makes every message a core point, so nothing is labelled as noise (-1);
# a message with no close neighbour simply becomes its own single-member cluster.
dbscan = DBSCAN(eps=0.2,min_samples=1,metric='cosine')
clusters = dbscan.fit_predict(embeddings)

In [29]:
df['cluster'] = clusters

In [30]:
df.head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
0,2025-06-27 07:20:25,ModernCRM,nova.osapi_compute.wsgi.server [req-b9718cd8-f...,HTTP Status,bert,0
1,1/14/2025 23:07,ModernCRM,Email service experiencing issues with sending,Critical Error,bert,1
2,1/17/2025 1:29,AnalyticsEngine,Unauthorized access to data was attempted,Security Alert,bert,2
3,2025-07-12 00:24:16,ModernHR,nova.osapi_compute.wsgi.server [req-4895c258-b...,HTTP Status,bert,0
4,2025-06-02 18:25:23,BillingSystem,nova.osapi_compute.wsgi.server [req-ee8bc8ba-9...,HTTP Status,bert,0


In [31]:
df[df.cluster==97].head()

Unnamed: 0,timestamp,source,log_message,target_label,complexity,cluster
1114,5/12/2025 5:23,BillingSystem,Detected a data transfer attempt with insuffic...,Security Alert,bert,97
1175,4/21/2025 2:17,AnalyticsEngine,Unapproved data transfer attempt was detected,Security Alert,bert,97
1740,10/7/2025 8:13,ModernHR,Unauthenticated data transfer attempt was dete...,Security Alert,bert,97
1979,9/28/2025 14:51,BillingSystem,Detected an unauthorized attempt to transfer data,Security Alert,bert,97


In [32]:
# List the distinct cluster ids stored in the 'cluster' column.
# NOTE(review): despite its name, `cluster_counts` holds ids here, not counts;
# the following cell rebinds it to the per-cluster sizes from value_counts().
cluster_counts = df['cluster'].unique()
cluster_counts

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135])

In [33]:
cluster_counts = df['cluster'].value_counts()
cluster_counts

cluster
0      1017
5       147
11      100
13       86
7        60
       ... 
131       1
132       1
133       1
134       1
135       1
Name: count, Length: 136, dtype: int64

In [34]:
# Keep the ids of clusters that contain more than 10 messages.
large_clusters = cluster_counts[cluster_counts > 10].index
large_clusters

Index([ 0,  5, 11, 13,  7,  8, 21,  3,  4, 17,  6, 32, 16, 20,  9,  1, 10, 34,
       53, 14, 52, 18, 42, 25, 59, 26],
      dtype='int64', name='cluster')

In [35]:
# Print up to five example messages from each large cluster.
for cluster in large_clusters:
    in_cluster = df['cluster'] == cluster
    preview = df.loc[in_cluster, 'log_message'].head(5)
    print(f"Cluster {cluster}:")
    print(preview.to_string(index=False))
    print()

Cluster 0:
nova.osapi_compute.wsgi.server [req-b9718cd8-f6...
nova.osapi_compute.wsgi.server [req-4895c258-b2...
nova.osapi_compute.wsgi.server [req-ee8bc8ba-92...
nova.osapi_compute.wsgi.server [req-f0bffbc3-5a...
nova.osapi_compute.wsgi.server [req-2bf7cfee-a2...

Cluster 5:
nova.compute.claims [req-a07ac654-8e81-416d-bfb...
nova.compute.claims [req-d6986b54-3735-4a42-907...
nova.compute.claims [req-72b4858f-049e-49e1-b31...
nova.compute.claims [req-5c8f52bd-8e3c-41f0-95a...
nova.compute.claims [req-d38f479d-9bb9-4276-968...

Cluster 11:
User User685 logged out.
 User User395 logged in.
 User User225 logged in.
User User494 logged out.
 User User900 logged in.

Cluster 13:
Backup started at 2025-05-14 07:06:55.
Backup started at 2025-02-15 20:00:19.
  Backup ended at 2025-08-08 13:06:23.
Backup started at 2025-11-14 08:27:43.
Backup started at 2025-12-09 10:19:11.

Cluster 7:
Multiple bad login attempts detected on user 85...
Multiple login failures occurred on user 9052 a...
  User 

In [45]:
# identifying most commonly appearing patterns through regex
def classify_with_regex(Log_message):
    """Label a log message by matching it against known regex templates.

    Parameters
    ----------
    Log_message : str
        Raw log line to classify. Any non-string value (for example the NaN
        pandas uses for a missing cell) is treated as unmatched.

    Returns
    -------
    str
        The label of the first template found in the message, or "other"
        when no template matches.
    """
    # Local import so the function works even if this cell runs before any
    # cell that imports `re`.
    import re

    # Patterns are tried in insertion order. Literal full stops are escaped,
    # and there is deliberately no trailing whitespace inside any pattern:
    # a stray trailing space would require the message itself to end with a
    # space and silently stop every template from matching.
    regex_patterns = {
        r"User User\d+ logged (in|out)\.": "User Action",
        r"Backup (started|ended) at .*": "System Notification",
        r"Backup completed successfully\.": "System Notification",
        r"System updated to version .*": "System Notification",
        r"File .* uploaded successfully by user .*": "System Notification",
        r"Disk cleanup completed successfully\.": "System Notification",
        r"System reboot initiated by user .*": "System Notification",
        r"Account with ID .* created by .*": "User Action",
    }

    # re.search raises TypeError on non-string input; report it as unmatched.
    if not isinstance(Log_message, str):
        return "other"

    for pattern, label in regex_patterns.items():
        if re.search(pattern, Log_message):
            return label
    return "other"
