<a href="https://colab.research.google.com/github/julieannaqi/julieannaqi/blob/main/dbscan_predict_severity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install scikit-learn



In [3]:
!pip install memory_profiler



In [18]:
import pandas as pd

# Replace 'juile.csv' with the actual filename if it's different
df = pd.read_csv('/content/sampledata.csv')

df = df.head(500)

In [4]:
df.columns

Index(['ioc_match_id', 'event_time', 'src_host', 'indicator', 'confidence',
       'severity', 'indicator_type', 'originator', 'source_feed_id',
       'event.action', 'event.dest', 'event.dest_ip', 'event.src',
       'event.src_ip', 'count', 'event.src_port', 'event.dest_port',
       'timestamp', 'event.sourcetype', 'source', 'itype', 'hour', 'minute'],
      dtype='object')

In [5]:
df.corr

<bound method DataFrame.corr of                           ioc_match_id     event_time       src_host  \
0      1773_362000776412_0_60382090314  1701618689736  10.15.149.168   
1      1773_361424542980_0_59299513186  1701589169373  10.15.149.168   
2      1773_361424542980_0_59327455359  1701589169373  10.15.149.168   
3      1773_361424542980_0_59327639236  1701589169373  10.15.149.168   
4      1773_362000776412_0_60440942415  1701618689736  10.15.149.168   
...                                ...            ...            ...   
99995  1773_361569283028_0_60382090483  1701596405801  10.20.100.250   
99996  1773_361569283028_0_60724428162  1701596405801  10.20.100.250   
99997  1773_361569283121_0_60362960162  1701596405801   10.20.106.39   
99998  1773_361569283121_0_60374602795  1701596405801   10.20.106.39   
99999  1773_361569283121_0_60650052021  1701596405801   10.20.106.39   

            indicator  confidence severity indicator_type  originator  \
0        77.90.185.71         

In [19]:
# looks like the time related fields are heavily correlated. Let's only keep the event_Time.
df.drop(columns=['timestamp', 'hour'], inplace=True)

In [11]:
print(df.columns)

Index(['ioc_match_id', 'event_time', 'src_host', 'indicator', 'confidence',
       'severity', 'indicator_type', 'originator', 'source_feed_id',
       'event.action', 'event.dest', 'event.dest_ip', 'event.src',
       'event.src_ip', 'count', 'event.src_port', 'event.dest_port',
       'event.sourcetype', 'source', 'itype', 'minute'],
      dtype='object')


In [20]:
# I don't know much about the data, but let's pretend we want to build a model to predict the severity

y = df['severity']
predictive_columns = df.columns[df.columns != 'severity']
x = df[predictive_columns]

In [13]:
print(y)

0           low
1        medium
2           low
3           low
4        medium
          ...  
99995       low
99996    medium
99997       low
99998       low
99999    medium
Name: severity, Length: 100000, dtype: object


In [13]:
from sklearn.metrics import homogeneity_score, completeness_score, v_measure_score
from sklearn.metrics import adjusted_rand_score, adjusted_mutual_info_score, silhouette_score

def calculate_metrics(data, true_labels, predicted_labels):
         homogeneity = homogeneity_score(true_labels, predicted_labels)
         completeness = completeness_score(true_labels, predicted_labels)
         v_measure = v_measure_score(true_labels, predicted_labels)
         adj_rand_index = adjusted_rand_score(true_labels, predicted_labels)
         adj_mutual_info = adjusted_mutual_info_score(true_labels, predicted_labels)
         silhouette_coeff = silhouette_score(data, predicted_labels)

         return {
             "Homogeneity": homogeneity,
             "Completeness": completeness,
             "V-Measure": v_measure,
             "Adjusted Rand Index": adj_rand_index,
             "Adjusted Mutual Information": adj_mutual_info,
             "Silhouette Coefficient": silhouette_coeff
         }
        #return silhouette_coeff


In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import FunctionTransformer
from scipy.sparse import csr_matrix

# Assuming 'severity' is the target variable and the rest are predictors
y = df['severity']
predictive_columns = df.columns[df.columns != 'severity']
X = df[predictive_columns]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Identify non-numeric and numeric columns
non_numeric_cols = X.select_dtypes(exclude=['number']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

# ColumnTransformer to handle different transformations for different columns
preprocessor = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), non_numeric_cols),
        ('imputer', SimpleImputer(strategy='constant', fill_value=-1), numeric_cols),
        ('scaler', StandardScaler(), numeric_cols),
        ('to_sparse', FunctionTransformer(csr_matrix), numeric_cols)
    ]
)

# DBSCAN model
dbscan = DBSCAN(eps=0.1, min_samples=10)

# Create the pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('dbscan', dbscan)  # Add the DBSCAN model to the pipeline
])

# Fit the pipeline to your training data
pipeline.fit(X_train)

true_labels_train = y_train
true_labels_test = y_test

X_train_processed = pipeline.named_steps['preprocessor'].fit_transform(X_train)

# Transform the testing set using the fitted pipeline
X_test_processed = pipeline.named_steps['preprocessor'].transform(X_test)

# Fit and predict labels on the training set
labels_train = pipeline.named_steps['dbscan'].fit_predict(X_train_processed)

# Predict labels on the testing set using the trained pipeline
labels_test = pipeline.named_steps['dbscan'].fit_predict(X_test_processed)

In [15]:
# Calculate metrics for the training set
metrics_train = calculate_metrics(X_train_processed, true_labels_train, labels_train)

# Calculate metrics for the testing set
metrics_test = calculate_metrics(X_test_processed, true_labels_test, labels_test)

# Display or use the metrics as needed
print("Training Set Metrics:")
print(metrics_train)

print("\nTesting Set Metrics:")
print(metrics_test)

Training Set Metrics:
{'Homogeneity': 0.01360916975476778, 'Completeness': 0.009197286626300644, 'V-Measure': 0.010976491296077199, 'Adjusted Rand Index': -0.00016501499174680458, 'Adjusted Mutual Information': 0.0029869827525054204, 'Silhouette Coefficient': 0.9525522606789859}

Testing Set Metrics:
{'Homogeneity': 0.01536177620835911, 'Completeness': 0.009603705170100984, 'V-Measure': 0.011818716199194917, 'Adjusted Rand Index': -0.008877917481467162, 'Adjusted Mutual Information': -0.019985575453091604, 'Silhouette Coefficient': 0.7786547855366028}


In [16]:
!pip install prettytable



In [17]:
from prettytable import PrettyTable

# Display or use the metrics as needed
def display_metrics(metrics, set_name):
    table = PrettyTable()
    table.field_names = ["Metric", "Value"]

    for metric, value in metrics.items():
        table.add_row([metric, value])

    print(f"{set_name} Metrics:")
    print(table)
    print()

# Assuming you have metrics_train and metrics_test dictionaries
display_metrics(metrics_train, "Training Set")
display_metrics(metrics_test, "Testing Set")


Training Set Metrics:
+-----------------------------+-------------------------+
|            Metric           |          Value          |
+-----------------------------+-------------------------+
|         Homogeneity         |   0.01360916975476778   |
|         Completeness        |   0.009197286626300644  |
|          V-Measure          |   0.010976491296077199  |
|     Adjusted Rand Index     | -0.00016501499174680458 |
| Adjusted Mutual Information |  0.0029869827525054204  |
|    Silhouette Coefficient   |    0.9525522606789859   |
+-----------------------------+-------------------------+

Testing Set Metrics:
+-----------------------------+-----------------------+
|            Metric           |         Value         |
+-----------------------------+-----------------------+
|         Homogeneity         |  0.01536177620835911  |
|         Completeness        |  0.009603705170100984 |
|          V-Measure          |  0.011818716199194917 |
|     Adjusted Rand Index     | -0.00887

Upon first impression, the silhouette coefficient tells us that our model produces decently well-formed clusters in both the train and test set, and doesn't appear to be overfitting.

However, from both homogeneity and completeness, we cannot assume that each cluster contains members of a single class. This is ok. It just means that combinations of different columns is more descriptive than a single column.

The rand index and adjusted mutual information suggests that the true and predicted clusterings are largley dissimilar. We cannot infer much from one about the other.