In [None]:
!pip install gower
!pip install scikit-learn-extra



In [None]:
import warnings

# Silence FutureWarning messages only; other categories (e.g. DeprecationWarning) are not filtered here.
warnings.filterwarnings("ignore", category=FutureWarning)

In [None]:
import numpy as np
import pandas as pd

from sklearn_extra.cluster import KMedoids
from sklearn.metrics import DistanceMetric
from sklearn.preprocessing import MinMaxScaler

from gower import gower_matrix

In [None]:
def gower_distance(X, categorical_columns=None, numeric_scale=10):
    """
    Pairwise Gower-style distance between the rows of a pandas data frame.

    Every column contributes one distance matrix and the result is the plain
    (unweighted) mean of those matrices.

    Nominal columns: simple mismatch, 0 when two rows hold the same value and 1
        otherwise. This is what the Dice distance on one-hot encoded values
        (https://en.wikipedia.org/wiki/S%C3%B8rensen%E2%80%93Dice_coefficient)
        reduces to, without materialising the one-hot matrix. Two missing values
        count as equal (distance 0).
    Numeric columns: Manhattan distance (https://en.wikipedia.org/wiki/Taxicab_geometry)
        divided by the fixed constant ``numeric_scale``. It is deliberately NOT the
        observed range of the column, so the distance between two rows does not
        depend on which other rows are present in ``X``. Pairs involving a missing
        value contribute 0.

    Parameters
    ----------
    X : pandas.DataFrame
        Features along the columns, observations along the rows.
    categorical_columns : iterable of str, optional
        Names of the columns to treat as nominal. Defaults to
        ``("dport", "flags_tcp", "proto", "sport", "src", "version")``.
        Every other column is treated as numeric.
    numeric_scale : float, default 10
        Divisor applied to the absolute difference of numeric columns.

    Returns
    -------
    numpy.ndarray
        Symmetric ``(n_rows, n_rows)`` matrix of distances.

    Raises
    ------
    ValueError
        If ``X`` has no columns (there would be nothing to average over).
    """
    if categorical_columns is None:
        categorical_columns = ("dport", "flags_tcp", "proto", "sport", "src", "version")
    nominal_names = set(categorical_columns)

    n_rows, n_features = X.shape
    if n_features == 0:
        raise ValueError("gower_distance needs at least one feature column")

    # Accumulate the per-feature matrices instead of stacking them: the stack would
    # need n_features times the memory of the final result.
    total = np.zeros((n_rows, n_rows), dtype=float)

    for position in range(n_features):
        column = X.iloc[:, position]
        if X.columns[position] in nominal_names:
            # factorize maps equal values to equal integer codes and missing values to -1,
            # so comparing codes yields the 0/1 mismatch matrix directly.
            codes, _ = pd.factorize(column)
            total += codes[:, None] != codes[None, :]
        else:
            values = pd.to_numeric(column).to_numpy(dtype=float)
            feature_dist = np.abs(values[:, None] - values[None, :]) / numeric_scale
            # A missing value on either side gives NaN; treat it as "no information".
            feature_dist[np.isnan(feature_dist)] = 0
            total += feature_dist

    return total / n_features

class ScipyKMedoidsClustering:
    """
    K-medoids based novelty detector working on a precomputed Gower-style distance matrix.

    ``fit`` clusters the rows of a data frame with ``KMedoids`` and remembers the medoid
    rows; ``discriminate`` reports whether a new sample lies farther than ``threshold``
    from every one of those medoids.

    Parameters
    ----------
    num_clusters : int, default 10
        Number of medoids requested from ``KMedoids``.
    metric, method, init, max_iter, random_state
        Forwarded unchanged to ``KMedoids``. ``fit`` always hands ``KMedoids`` a
        pairwise distance matrix, so ``metric`` should stay ``'precomputed'``.
    threshold : float, default 0.45
        Distance below which a sample is considered to belong to a known cluster.
    categorical_columns : list of str, optional
        Names of the nominal columns. Stored on the instance only; ``fit`` and
        ``discriminate`` call ``gower_distance`` with its own defaults.
        Defaults to ``["dport", "flags_tcp", "proto", "sport", "src", "version"]``.
    """

    def __init__(self, num_clusters=10, metric='precomputed', method='pam', init='build', max_iter=300,
                 random_state=None, threshold=0.45, categorical_columns=None):
        self.num_clusters = num_clusters
        self.metric = metric
        self.method = method
        self.init = init
        self.max_iter = max_iter
        self.random_state = random_state
        self.kmedoids = None
        self.medoid_indices = None
        self.centroids = None
        self.threshold = threshold
        # Build a fresh list per instance: a list literal used as a default argument
        # would be shared (and mutable) across every instance of the class.
        if categorical_columns is None:
            categorical_columns = ["dport", "flags_tcp", "proto", "sport", "src", "version"]
        self.categorical_columns = list(categorical_columns)

    def fit(self, X):
        """
        Cluster the rows of ``X`` and store the medoid rows in ``self.centroids``.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data, one observation per row. No standardisation of the numeric
            columns is applied here.
        """
        # Pairwise distances between all rows; KMedoids consumes them as a precomputed matrix.
        distances = gower_distance(X)

        self.kmedoids = KMedoids(n_clusters=self.num_clusters,
                                 metric=self.metric,
                                 method=self.method,
                                 init=self.init,
                                 max_iter=self.max_iter,
                                 random_state=self.random_state)
        # The matrix is passed as-is: dropping rows would make it non-square and
        # shift the medoid indices away from the rows of X.
        self.kmedoids.fit(distances)
        self.medoid_indices = self.kmedoids.medoid_indices_
        # Look the medoids up in the frame that was actually fitted, not in a notebook global.
        self.centroids = X.iloc[self.medoid_indices]

    def discriminate(self, sample):
        """
        Tell whether ``sample`` is far from every medoid found by ``fit``.

        Parameters
        ----------
        sample : dict
            Mapping from column name to value describing a single observation.

        Returns
        -------
        bool
            ``False`` if the distance to at least one medoid is below ``self.threshold``,
            ``True`` otherwise.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.centroids is None:
            raise RuntimeError("discriminate() called before fit()")

        # Row 0 is the sample, rows 1..k are the medoids; a single distance matrix then
        # holds every sample-to-medoid distance in its first row.
        sample_df = pd.DataFrame([sample])
        combined_df = pd.concat([sample_df, self.centroids], ignore_index=True)
        distances_sample = np.asarray(gower_distance(combined_df))[0, 1:]

        is_below_threshold = bool((distances_sample < self.threshold).any())

        return not is_below_threshold

In [None]:
# Fixed seed and sample size so that re-running the notebook trains on the same subset.
RANDOM_STATE = 42
SAMPLE_SIZE = 5000

df = pd.read_csv('concatenated_output.csv')
# Never ask for more rows than the file holds (sample() raises in that case).
df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=RANDOM_STATE)

# Nominal features are compared as labels, so store them as strings.
categorical_columns = ["dport", "flags_tcp", "proto", "sport", "src", "version"]
df[categorical_columns] = df[categorical_columns].astype(str)

# Cast each numeric feature from its OWN column; the previous code read the leftover
# loop variable and overwrote both columns with the values of "version".
df = df.astype({'len': int, 'count': int})

my_kmedoids = ScipyKMedoidsClustering(threshold=0.45)
my_kmedoids.fit(df)

In [12]:
import pickle

# Persist the sampled frame and the fitted model together as one tuple.
# NOTE: only load this file back from a trusted location; unpickling can execute arbitrary code.
file_path = "data.pkl"
with open(file_path, mode='wb') as pickle_file:
    pickle.dump((df, my_kmedoids), pickle_file)

In [None]:
# A single new observation, given as a mapping from column name to value.
new_entry = {
    'dport': '62835',
    'flags_tcp': 'n',
    'len': 4,
    'proto': '17',
    'sport': '443',
    'src': '142.250.200.131',
    'version': '4',
    'count': 40,
}
# Prints True when no medoid lies within the model's threshold of the entry.
print(my_kmedoids.discriminate(new_entry))

True
