# Clustering Demo (Malicious)

**Note:** Make sure that you have `oliveira.csv` in the same directory as this notebook.

**Clustering Methods not Supported:** GaussianMixture & HDBScan

## Code Preparation

Prepare the functions that might be used as the notebook runs.

In [1]:
import random
import pandas as pd
from sklearn.cluster import KMeans,  BisectingKMeans, MiniBatchKMeans, Birch, DBSCAN
# from sklearn.cluster import SpectralClustering, AgglomerativeClustering, OPTICS, MeanShift # Not working
from sklearn.model_selection import ParameterGrid
from sklearn import metrics
import time
import os

#Load list of API calls
API_LIST = "../api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #Add the label for NaN values.
API_FILE.close()

#Random Seed
seed = 1

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).'''
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def load_df():
    '''Load the dataset file (CSV) as DataFrame'''
    print("Loading DF...")
    df = pd.read_csv("../oliveira_labelled.csv", low_memory=False, memory_map=True) # MAKE SURE THIS IS SET AS `oliveira.csv`
    df = df[df['malware'] == 1].copy()
    df = df.drop('malware', axis=1)
    df = df.drop('type', axis=1)
    print("")
    return df.reset_index().iloc[:,1:]

def get_x(df:pd.DataFrame):
    '''Get the feature columns of the DataFrame'''
    return df.iloc[:, 1:102-1]

#Inverse Label Encoding
def inverse_labeller(item):
    '''Low Level. Converts encoded API calls to string API calls'''
    global APIS
    return item.map(lambda x: APIS[int(x)])
def inverse_label(df:pd.DataFrame):
    '''High Level. Converts encoded API calls to string API calls'''
    df2 = df.copy(deep=True)
    print("Inverse Labelling...")
    df2.iloc[:, 1:101] = df2.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
    print("")
    return df2

def search_k(parameters, model, X):
    '''Search for the best parameter(s) for the model (usually cluster size or K value)'''
    paramGrid = ParameterGrid(parameters)
    best_score = -1
    best_grid = -1
    best_clusterer = None
    # evaluation based on silhouette_score
    print("Searching Best Clustering Parameters...")
    for p in paramGrid:
        model.set_params(**p)    # set current hyper parameter
        start_time = time.time()
        model.fit(X)          # fit model on wine dataset, this will find clusters based on parameter p
        ss = metrics.silhouette_score(X, model.labels_, random_state=seed)   # calculate silhouette_score
        # silhouette_scores.append([p, ss]) # store all the scores
        print('Parameter:', p, 'Score', f"{ss:.4f}", 'Unique_Labels', len(pd.Series(model.labels_).unique()), "Time", f"{time.time()-start_time:.4f}")
        # check p which has the best score
        if ss > best_score:
            best_score = ss
            best_grid = p
            best_clusterer = model
    print("")
    print("BEST PARAM SETUP: ", best_grid, best_score)
    print("")
    return best_grid, best_clusterer

def clustering(inner_df:pd.DataFrame, name:str, clusterer, parameters):
    '''Executes the data clustering on the dataset. Produces the same input dataset with the additional column for the cluster #.'''
    '''The input dataset must contain integer API calls (except the API Call Pattern strings)'''
    X = get_x(inner_df)
    bestCluster, bestClusterer = search_k(parameters, clusterer, X) #assumes bestCluster already fitted
    #bestClusterer.fit(X)
    inner_df['cluster'] = bestClusterer.labels_
    inner_df.to_csv(f"Clustering/Malicious/{name}_Encoded_Clustering.csv", index=False)
    inner_df = inverse_label(inner_df)
    inner_df.to_csv(f"Clustering/Malicious/{name}_Clustering.csv", index=False)
    print("")
    return bestCluster

def common_api_cluster(inner_df:pd.DataFrame, name:str):
    '''Determine the most common API call patterns for each cluster'''
    global df
    inner_df = df
    clusters = inner_df['cluster'].unique()
    clusters.sort()
    commonAPI = []
    print("Searching for Common API Patterns per Cluster...")
    print(clusters)
    for cluster in clusters:
        raw_commonC = inner_df[inner_df['cluster']==cluster]['pattern']#.value_counts()
        commonC = raw_commonC.value_counts().to_frame(name='counts').reset_index()
        commonAPI.append([cluster, commonC['counts'].iloc[0], round(commonC['counts'].iloc[0]/raw_commonC.shape[0],4), commonC['pattern'].iloc[0]])
    commonAPI = pd.DataFrame(commonAPI, columns=['cluster', 'count', 'match_ratio', 'pattern'])
    commonAPI.to_csv(f"Clustering/Malicious/{name}_Common_API_Cluster.csv", index=False)
    print("")
    print("Average Match Ratio:", commonAPI['match_ratio'].mean())
    return commonAPI

def get_samplehash_common(inner_df:pd.DataFrame, common_counts:pd.DataFrame, name:str, samplesize:int):
    '''Get sample hashes from each cluster that matches the common API call pattern of the cluster.'''
    hashes = []
    global seed
    random.seed(seed)
    matching_samples = 0
    print(f"Random (Seed @ {seed}) Sampling Hashes subset of  Most Common API Patterns...")
    for pattern in range(common_counts.shape[0]):
        sub_df = inner_df[inner_df['pattern']==common_counts.iloc[pattern,3]]
        subsamples = len(sub_df['hash'].to_list())
        matching_samples += subsamples
        if subsamples < samplesize:
            subsamples = random.sample(sub_df['hash'].to_list(), len(sub_df['hash'].to_list()))
        else:
            subsamples = random.sample(sub_df['hash'].to_list(), samplesize)
        for subsample in subsamples:
            hashes.append([common_counts.iloc[pattern,:]['cluster'], subsample, '_', '_', '_', common_counts.iloc[pattern,:]['pattern']])
    hashes = pd.DataFrame(hashes, columns=['cluster', 'hash', 'Type 1', 'Type 2', 'Type 3', 'pattern'])
    hashes.to_csv(f"Clustering/Malicious/{name}_SampleHash_Common.csv", index=False)
    print(f"Commonality Ratio: {(matching_samples/inner_df.shape[0])}")
    print("")
    return hashes
    
def inject_patterns(inner_df:pd.DataFrame, inverse_labelled_df:pd.DataFrame):
    '''Injects the API call patterns of each sample as its last column'''
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(list_to_str(inverse_labelled_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    print("")
    inverse_label(inner_df).to_csv(f"Clustering/Malicious/API_Patterns.csv", index=False)
    return inner_df

if not os.path.exists(os.path.abspath(os.getcwd())+"\Clustering"):
    os.makedirs(os.path.abspath(os.getcwd())+"\Clustering\Malicious")
    os.makedirs(os.path.abspath(os.getcwd())+"\Clustering\Benign")

  from pandas.core import (


## Load Dataset

In [2]:
df = load_df()
df = inject_patterns(df.copy(), inverse_label(df.copy()))
reserve_df = df.copy()
df

Loading DF...

Inverse Labelling...


1            GetSystemTimeAsFileTime
2        SetUnhandledExceptionFilter
3            GetSystemTimeAsFileTime
4            GetSystemTimeAsFileTime
                    ...             
41119        GetSystemTimeAsFileTime
41120        GetSystemTimeAsFileTime
41121        GetSystemTimeAsFileTime
41122        GetSystemTimeAsFileTime
41123                  RegOpenKeyExA
Name: t_0, Length: 41124, dtype: object' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df2.iloc[:, 1:101] = df2.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
1        NtAllocateVirtualMemory
2                  OleInitialize
3        NtAllocateVirtualMemory
4                     LdrLoadDll
                  ...           
41119                 LdrLoadDll
41120                 LdrLoadDll
41121                 LdrLoadDll
41122                 LdrLoadDll
41123                  NtOpenKey
Name: t_1, Length: 41124, dtype: object' has dtype incompatible with int64, 


Injecting API patterns...

Inverse Labelling...


1            GetSystemTimeAsFileTime
2        SetUnhandledExceptionFilter
3            GetSystemTimeAsFileTime
4            GetSystemTimeAsFileTime
                    ...             
41119        GetSystemTimeAsFileTime
41120        GetSystemTimeAsFileTime
41121        GetSystemTimeAsFileTime
41122        GetSystemTimeAsFileTime
41123                  RegOpenKeyExA
Name: t_0, Length: 41124, dtype: object' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df2.iloc[:, 1:101] = df2.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
1        NtAllocateVirtualMemory
2                  OleInitialize
3        NtAllocateVirtualMemory
4                     LdrLoadDll
                  ...           
41119                 LdrLoadDll
41120                 LdrLoadDll
41121                 LdrLoadDll
41122                 LdrLoadDll
41123                  NtOpenKey
Name: t_1, Length: 41124, dtype: object' has dtype incompatible with int64, 




Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,pattern
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,71,297,135,171,215,35,208,56,71,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,81,240,117,71,297,135,171,215,35,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,65,112,123,65,112,123,65,113,112,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,72049be7bd30ea61297ea624ae198067,82,208,187,208,172,117,172,117,172,...,208,302,208,302,187,208,302,228,302,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,209,260,40,209,260,141,260,141,260,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41119,e3d6d58faa040f0f9742c9d0eaf58be4,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41120,9b917bab7f32188ae40c744f2be9aaf8,82,240,117,240,117,240,117,240,117,...,159,224,82,159,224,82,159,224,82,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41121,35a18ee05f75f04912018d9f462cb990,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,260,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41122,654139d715abcf7ecdddbef5a84f224b,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


### Explanation

The process as the following:
1. Find the best cluster size (by means of the [Silhouette Score](https://tushar-joshi-89.medium.com/silhouette-score-a9f7d8d78f29))
2. Search for the most common API pattern for each cluster
3. Sample n hashes per cluster that match the most common API pattern for the same cluster.

**Silhouette Score:** How good is the quality of data clustering? *(Higher is better)*

**Match Ratio:** How common is the most common API call pattern in each cluster among the samples found in its cluster? *(Higher is better)*

**Commonality Ratio**: How many of the matching most common API patterns per cluster are there relative to the dataset size. *(Higher is better)*

In [3]:
'''UPDATE THESE VALUES AS NEEDED'''

clusters = [25,50,75,100,125,150,175,200]    # Place either single or multiple values as long as it is in list format. 
                    # For multiple values, the program will iterate through every cluster size and will choose the best (usually the biggest value) to be part of the best configuration.
                    # It influences the projected total number of samples to verify/analyze.
                    
samplesize = 50     # Max no. of samples to obtain from a cluster. 
                    # It influences the projected total number of samples to verify/analyze.

## K-Means

In [4]:
df = reserve_df.copy()
print(df.shape, "\n")

algorithm = ['lloyd', 'elkan']

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "KMeans", KMeans(n_init='auto', verbose=0, random_state=seed), 
                              {'algorithm':algorithm})

commonAPI = common_api_cluster(df, "KMeans")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "KMeans", samplesize)
display(commonHashes)

(41124, 102) 

Searching Best Clustering Parameters...


Parameter: {'algorithm': 'lloyd'} Score 0.2113 Unique_Labels 8 Time 33.0661
Parameter: {'algorithm': 'elkan'} Score 0.2113 Unique_Labels 8 Time 30.1932

BEST PARAM SETUP:  {'algorithm': 'lloyd'} 0.21131487580954622

Inverse Labelling...


1            GetSystemTimeAsFileTime
2        SetUnhandledExceptionFilter
3            GetSystemTimeAsFileTime
4            GetSystemTimeAsFileTime
                    ...             
41119        GetSystemTimeAsFileTime
41120        GetSystemTimeAsFileTime
41121        GetSystemTimeAsFileTime
41122        GetSystemTimeAsFileTime
41123                  RegOpenKeyExA
Name: t_0, Length: 41124, dtype: object' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df2.iloc[:, 1:101] = df2.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
1        NtAllocateVirtualMemory
2                  OleInitialize
3        NtAllocateVirtualMemory
4                     LdrLoadDll
                  ...           
41119                 LdrLoadDll
41120                 LdrLoadDll
41121                 LdrLoadDll
41122                 LdrLoadDll
41123                  NtOpenKey
Name: t_1, Length: 41124, dtype: object' has dtype incompatible with int64, 



Searching for Common API Patterns per Cluster...
[0 1 2 3 4 5 6 7]

Average Match Ratio: 0.2416


Unnamed: 0,cluster,count,match_ratio,pattern
0,0,51,0.0481,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,1,1116,0.0993,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,2,738,0.2097,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,3,1094,0.2518,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
4,4,1049,0.4173,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
5,5,344,0.0838,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
6,6,3307,0.7653,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
7,7,576,0.0575,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.20122069837564438



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,0226e311ed2648ff399c7902fc113421,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,0,a3f4c7952802b3cc4a1f49f737cfc0d3,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,0,92c439e7e5eb92e263dede331ffbb344,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,0,96e5548a9b3fb189c37d0b23784e7ecc,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,0,0bf8d0123c352d36c07f319c3c9bc994,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
395,7,be89f1fa032f904894c6456e6edb7f09,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
396,7,9f49d414e87bd9e2610802c5e90c7185,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
397,7,a72f6762dc8b239c8aeb9819ef535633,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
398,7,bef29f99956653cc62f9e197c90909f5,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."


## BisectingKMeans

In [5]:
df = reserve_df.copy()
print(df.shape, "\n")

algorithm = ['lloyd', 'elkan']

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "BisectingKMeans", 
                              BisectingKMeans(random_state=seed, verbose=0, copy_x=True), 
                              {'algorithm':algorithm})

commonAPI = common_api_cluster(df, "BisectingKMeans")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "BisectingKMeans", samplesize)
display(commonHashes)

(41124, 102) 

Searching Best Clustering Parameters...
Parameter: {'algorithm': 'lloyd'} Score 0.2132 Unique_Labels 8 Time 31.5448
Parameter: {'algorithm': 'elkan'} Score 0.2132 Unique_Labels 8 Time 29.7434

BEST PARAM SETUP:  {'algorithm': 'lloyd'} 0.2131609216659066

Inverse Labelling...


1            GetSystemTimeAsFileTime
2        SetUnhandledExceptionFilter
3            GetSystemTimeAsFileTime
4            GetSystemTimeAsFileTime
                    ...             
41119        GetSystemTimeAsFileTime
41120        GetSystemTimeAsFileTime
41121        GetSystemTimeAsFileTime
41122        GetSystemTimeAsFileTime
41123                  RegOpenKeyExA
Name: t_0, Length: 41124, dtype: object' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df2.iloc[:, 1:101] = df2.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
1        NtAllocateVirtualMemory
2                  OleInitialize
3        NtAllocateVirtualMemory
4                     LdrLoadDll
                  ...           
41119                 LdrLoadDll
41120                 LdrLoadDll
41121                 LdrLoadDll
41122                 LdrLoadDll
41123                  NtOpenKey
Name: t_1, Length: 41124, dtype: object' has dtype incompatible with int64, 



Searching for Common API Patterns per Cluster...
[0 1 2 3 4 5 6 7]

Average Match Ratio: 0.213375


Unnamed: 0,cluster,count,match_ratio,pattern
0,0,51,0.0479,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,1,3307,0.5222,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2,2,738,0.0931,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,3,1094,0.2452,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
4,4,1049,0.4173,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
5,5,576,0.0653,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
6,6,521,0.1253,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
7,7,1116,0.1907,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.20552475440132284



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,0226e311ed2648ff399c7902fc113421,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,0,a3f4c7952802b3cc4a1f49f737cfc0d3,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,0,92c439e7e5eb92e263dede331ffbb344,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,0,96e5548a9b3fb189c37d0b23784e7ecc,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,0,0bf8d0123c352d36c07f319c3c9bc994,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
395,7,26d6e82e0b8ed12d444f471194e79fe7,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
396,7,a35a4a856b3c2a7f7b6353b3b7ff7e69,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
397,7,ac561579119557783563ac72bd2e18cc,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
398,7,1f43e8111e6bf9b99518f057f968da9e,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."


## MiniBatchKMeans

In [6]:
df = reserve_df.copy()
print(df.shape, "\n")

reassignment_ratio = [0.01, 0.1, 0.3]

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "MiniBatchKMeans", 
                              MiniBatchKMeans(verbose=0, random_state=seed, n_init='auto', 
                                              batch_size=os.cpu_count()*256), 
                              {'reassignment_ratio':reassignment_ratio})

commonAPI = common_api_cluster(df, "MiniBatchKMeans")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "MiniBatchKMeans", samplesize)
display(commonHashes)

(41124, 102) 

Searching Best Clustering Parameters...
Parameter: {'reassignment_ratio': 0.01} Score 0.2049 Unique_Labels 8 Time 31.4088
Parameter: {'reassignment_ratio': 0.1} Score 0.1970 Unique_Labels 8 Time 30.2048
Parameter: {'reassignment_ratio': 0.3} Score 0.1407 Unique_Labels 8 Time 33.7446

BEST PARAM SETUP:  {'reassignment_ratio': 0.01} 0.20490279462185543

Inverse Labelling...


1            GetSystemTimeAsFileTime
2        SetUnhandledExceptionFilter
3            GetSystemTimeAsFileTime
4            GetSystemTimeAsFileTime
                    ...             
41119        GetSystemTimeAsFileTime
41120        GetSystemTimeAsFileTime
41121        GetSystemTimeAsFileTime
41122        GetSystemTimeAsFileTime
41123                  RegOpenKeyExA
Name: t_0, Length: 41124, dtype: object' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df2.iloc[:, 1:101] = df2.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
1        NtAllocateVirtualMemory
2                  OleInitialize
3        NtAllocateVirtualMemory
4                     LdrLoadDll
                  ...           
41119                 LdrLoadDll
41120                 LdrLoadDll
41121                 LdrLoadDll
41122                 LdrLoadDll
41123                  NtOpenKey
Name: t_1, Length: 41124, dtype: object' has dtype incompatible with int64, 



Searching for Common API Patterns per Cluster...
[0 1 2 3 4 5 6 7]

Average Match Ratio: 0.5452125


Unnamed: 0,cluster,count,match_ratio,pattern
0,0,576,0.6063,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
1,1,14,1.0,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,2,3307,0.7709,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
3,3,738,0.1073,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,4,1116,0.041,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
5,5,37,0.1623,"NtAllocateVirtualMemory,SetErrorMode,LoadStrin..."
6,6,1094,0.9991,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
7,7,305,0.6748,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.17476412800311253



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,10bb7a2d6b7340a4652b85b8d432241a,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
1,0,1160e0dd44c3cd213463338a19eedb76,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
2,0,0b823574177ac070d7e622ae7a35202c,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
3,0,03a154d5139f12b9b07b137d1be038c5,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
4,0,de729348cab5f133c6eb8db1ae7c5cac,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
...,...,...,...,...,...,...
346,7,d665673deccb288f351c345a11e1eb70,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
347,7,292dc45171f37bd3cd65aed6ecbf709c,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
348,7,79a877bc84e8f3200f601d5617b44191,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
349,7,73f287bac2a6b63265937622a4645791,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."


## DBSCAN

In [7]:
df = reserve_df.copy()
print(df.shape, "\n")

#eps = [0.2,0.5,0.8]
min_samples = [1,2,3,4,5]

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "DBSCAN", DBSCAN(algorithm='auto', n_jobs=-1), 
                              {'min_samples' : min_samples})

commonAPI = common_api_cluster(df, "DBSCAN")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "DBSCAN", 5)
display(commonHashes)

(41124, 102) 

Searching Best Clustering Parameters...
Parameter: {'min_samples': 1} Score 0.7587 Unique_Labels 11959 Time 96.1573
Parameter: {'min_samples': 2} Score 0.6312 Unique_Labels 2037 Time 82.6390
Parameter: {'min_samples': 3} Score 0.5698 Unique_Labels 1087 Time 82.9189
Parameter: {'min_samples': 4} Score 0.5392 Unique_Labels 759 Time 84.7858
Parameter: {'min_samples': 5} Score 0.5158 Unique_Labels 564 Time 87.9416

BEST PARAM SETUP:  {'min_samples': 1} 0.7587053788541971

Inverse Labelling...


1            GetSystemTimeAsFileTime
2        SetUnhandledExceptionFilter
3            GetSystemTimeAsFileTime
4            GetSystemTimeAsFileTime
                    ...             
41119        GetSystemTimeAsFileTime
41120        GetSystemTimeAsFileTime
41121        GetSystemTimeAsFileTime
41122        GetSystemTimeAsFileTime
41123                  RegOpenKeyExA
Name: t_0, Length: 41124, dtype: object' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df2.iloc[:, 1:101] = df2.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
1        NtAllocateVirtualMemory
2                  OleInitialize
3        NtAllocateVirtualMemory
4                     LdrLoadDll
                  ...           
41119                 LdrLoadDll
41120                 LdrLoadDll
41121                 LdrLoadDll
41122                 LdrLoadDll
41123                  NtOpenKey
Name: t_1, Length: 41124, dtype: object' has dtype incompatible with int64, 



Searching for Common API Patterns per Cluster...
[ -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70
  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88
  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106
 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160
 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178
 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196
 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214
 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232


Unnamed: 0,cluster,count,match_ratio,pattern
0,-1,4,0.0003,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
1,0,3307,1.0000,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2,1,128,1.0000,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
3,2,8,1.0000,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
4,3,172,1.0000,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
...,...,...,...,...
559,558,6,1.0000,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
560,559,16,1.0000,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
561,560,6,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
562,561,5,1.0000,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.6697062542554226



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,-1,a4afb0f8d4dbbfb2b642eb8ac6a64380,_,_,_,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
1,-1,892cadf55220447de45178ebd532d7e1,_,_,_,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
2,-1,1267b0557ebc56aa087689c3c74fbf42,_,_,_,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
3,-1,69d08a9797d580c6dcb88055ba752333,_,_,_,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
4,0,bc3e3b224c761579d8f5cfb4a379fdef,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
...,...,...,...,...,...,...
2814,562,e4ce1866d100a492138df030dd9c491f,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2815,562,cf1ba0765f0b617a98e6229697d31887,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2816,562,253dc03c58c6030fabb2206dbea710de,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2817,562,90049442be9ad824d4f7c64a972dc209,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."


## Birch

In [8]:
df = reserve_df.copy()
print(df.shape, "\n")

branching_factor = [25,50,75,100]

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "Birch", Birch(compute_labels=True), 
                              {'branching_factor':branching_factor})

commonAPI = common_api_cluster(df, "Birch")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "Birch", samplesize)
display(commonHashes)

(41124, 102) 

Searching Best Clustering Parameters...
Parameter: {'branching_factor': 25} Score 0.0767 Unique_Labels 3 Time 71.2202
Parameter: {'branching_factor': 50} Score 0.0767 Unique_Labels 3 Time 65.4478
Parameter: {'branching_factor': 75} Score 0.0767 Unique_Labels 3 Time 65.3036
Parameter: {'branching_factor': 100} Score 0.0767 Unique_Labels 3 Time 68.9335

BEST PARAM SETUP:  {'branching_factor': 50} 0.07673502220222284

Inverse Labelling...


1            GetSystemTimeAsFileTime
2        SetUnhandledExceptionFilter
3            GetSystemTimeAsFileTime
4            GetSystemTimeAsFileTime
                    ...             
41119        GetSystemTimeAsFileTime
41120        GetSystemTimeAsFileTime
41121        GetSystemTimeAsFileTime
41122        GetSystemTimeAsFileTime
41123                  RegOpenKeyExA
Name: t_0, Length: 41124, dtype: object' has dtype incompatible with int64, please explicitly cast to a compatible dtype first.
  df2.iloc[:, 1:101] = df2.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
1        NtAllocateVirtualMemory
2                  OleInitialize
3        NtAllocateVirtualMemory
4                     LdrLoadDll
                  ...           
41119                 LdrLoadDll
41120                 LdrLoadDll
41121                 LdrLoadDll
41122                 LdrLoadDll
41123                  NtOpenKey
Name: t_1, Length: 41124, dtype: object' has dtype incompatible with int64, 



Searching for Common API Patterns per Cluster...
[0 1 2]

Average Match Ratio: 0.08233333333333333


Unnamed: 0,cluster,count,match_ratio,pattern
0,0,3307,0.1019,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,1,738,0.0968,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,51,0.0483,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.09960120610835523



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,fccce06bbabd5398622081172b98ac09,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,0,ed1c628b31c5c4c62fa061520a3817ce,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2,0,ede0ea7318bfaecab5cc7c311e8a252a,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
3,0,a3a160e1f593d1b027042c01c943534a,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
4,0,a5b68fed471885355e5c028f963cce20,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
...,...,...,...,...,...,...
145,2,8e7a86108b484fc50a8744a3a913cd36,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
146,2,01e42e0ec55a58adcddf328e6757d664,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
147,2,2d060f3dac42e5f473947d66921a8ff2,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
148,2,7772b7886f4cbbd5e37943031e86d243,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


## SpectralClustering

*Due to some dataset- or hardware-related limitations, SpectralClustering cannot be executed using the full Oliveira dataset.*

In [9]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# n_neighbors=[10,20,30,40,50]
# assign_labels=['kmeans','discretize']

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "SpectralClustering", 
#                               SpectralClustering(random_state=seed, affinity='nearest_neighbors', assign_labels='kmeans', n_jobs=-1, verbose=0), 
#                               {'n_neighbors':n_neighbors, 'assign_labels':assign_labels})

# commonAPI = common_api_cluster(df, "SpectralClustering")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "SpectralClustering", samplesize)
# display(commonHashes)

## AgglomerativeClustering (Ward)

*Due to memory-related limitations, AgglomerativeClustering cannot be executed using the full Oliveira dataset.*

In [10]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# print("Computing Connectivity...")
# start_time = time.time()
# connectivity = []
# connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 2, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 4, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 6, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 8, mode='connectivity', include_self='auto', n_jobs=-1))
# print(f"Connectivity Computation: {time.time()-start_time:.4f}(s)\n")

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "AgglomerativeClustering", AgglomerativeClustering(memory=".memory/", 
#                                                                                      linkage='complete', 
#                                                                                      connectivity=connectivity), 
#                               {'connectivity':connectivity})

# commonAPI = common_api_cluster(df, "AgglomerativeClustering")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "AgglomerativeClustering", samplesize)
# display(commonHashes)

## Optics

*Due to some dataset- or hardware-related limitations, Optics cannot be executed using the full Oliveira dataset in a reasonable time.*

In [11]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# leaf_size = [10,20,30,40,50] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# min_samples = [5,10,15,20]

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "OPTICS", OPTICS(algorithm='auto', memory=".memory/", n_jobs=-1), 
#                               {'leaf_size': leaf_size, 'min_samples':min_samples}, 
#                               samplesize)

# commonAPI = common_api_cluster(df, "OPTICS")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "OPTICS", samplesize)
# display(commonHashes)

## MeanShift

*Due to some hardware-related limitations, MeanShift cannot be executed using the lite/full Oliveira dataset in a reasonable time.*

In [12]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# max_iter = [300,500] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "MeanShift", MeanShift(n_jobs=-1), {'max_iter': max_iter})

# commonAPI = common_api_cluster(df, "MeanShift")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "MeanShift", samplesize)
# display(commonHashes)