# Clustering Demo (Malicious)

**Clustering Methods not Supported:** GaussianMixture & HDBScan

# 1. Code Preparation

In [6]:
import random
import pandas as pd
from sklearn.cluster import KMeans,  BisectingKMeans, MiniBatchKMeans, Birch, DBSCAN
# from sklearn.cluster import SpectralClustering, AgglomerativeClustering, OPTICS, MeanShift # Not working
from sklearn.model_selection import ParameterGrid
from sklearn import metrics
import multiprocessing
import time

def cpu_count():
    '''Count the numeber of CPU cores available in the system.'''
    return multiprocessing.cpu_count()

def list_to_str(ls:list):
    '''Convert list of string to a single string with comma as a delimiter.'''
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def load_df():
    '''Load the dataset file (CSV) as DataFrame'''
    print("Loading DF...")
    df = pd.read_csv("oliveira_lite.csv", low_memory=False, memory_map=True)
    df = df[df['malware'] == 1].copy()
    df = df.drop('malware', axis=1)
    print("")
    return df.reset_index().iloc[:,1:]

def get_x(df:pd.DataFrame):
    '''Get the feature columns (aka X) of the DataFrame'''
    return df.iloc[:, 1:102-1]

#Load list of API calls
API_LIST = "api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

#Random Seed
seed = 1

#Inverse Label Encoding
def inverse_labeller(item):
    '''Low Level. Converts encoded API calls to string API calls'''
    global APIS
    return item.map(lambda x: APIS[int(x)])
def inverse_label(df):
    '''High Level. Converts encoded API calls to string API calls'''
    df2 = df.copy(deep=True)
    print("Inverse Labelling...")
    df2.iloc[:, 1:101] = df2.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
    print("")
    return df2

From: https://medium.com/swlh/k-means-clustering-on-high-dimensional-data-d2151e1a4240

In [7]:
def search_k(parameters, model, X):
    '''Search for the best parameter(s) for the model (usually cluster size or K value)'''
    paramGrid = ParameterGrid(parameters)
    best_score = -1
    best_grid = -1
    best_clusterer = None
    silhouette_scores = []
    # evaluation based on silhouette_score
    print("Searching Best Clustering Parameters...")
    for p in paramGrid:
        model.set_params(**p)    # set current hyper parameter
        start_time = time.time()
        model.fit(X)          # fit model on wine dataset, this will find clusters based on parameter p
        ss = metrics.silhouette_score(X, model.labels_, random_state=seed)   # calculate silhouette_score
        silhouette_scores += [ss]       # store all the scores
        print('Parameter:', p, 'Score', f"{ss:.4f}", 'Unique_Labels', len(pd.Series(model.labels_).unique()), "Time", f"{time.time()-start_time:.4f}")
        # check p which has the best score
        if ss > best_score:
            best_score = ss
            best_grid = p
            best_clusterer = model
    print("")
    print("BEST PARAM SETUP: ", best_grid, best_score)
    print("")
    return best_grid, best_clusterer

def clustering(inner_df, name, clusterer, parameters):
    '''Executes the data clustering on the dataset. Produces the same input dataset with the additional column for the cluster #.'''
    '''The input dataset must contain integer API calls (except the API Call Pattern strings)'''
    X = get_x(inner_df)
    bestCluster, bestClusterer = search_k(parameters, clusterer, X) #assumes bestCluster already fitted
    #bestClusterer.fit(X)
    inner_df['cluster'] = bestClusterer.labels_
    inner_df.to_csv(f"Clustering/Malicious/{name}_Encoded_Clustering.csv", index=False)
    inner_df = inverse_label(inner_df)
    inner_df.to_csv(f"Clustering/Malicious/{name}_Clustering.csv", index=False)
    print("")
    return bestCluster

def common_api_cluster(inner_df, name, size):
    '''Determine the most common API call patterns for each cluster'''
    global df
    inner_df = df
    clusters = inner_df['cluster'].unique()
    clusters.sort()
    columns = inner_df.columns[1:102]
    commonAPI = []
    print("Searching for Common API Patterns per Cluster...")
    print(clusters)
    for cluster in clusters:
        raw_commonC = inner_df[inner_df['cluster']==cluster]['pattern']#.value_counts()
        commonC = raw_commonC.value_counts().to_frame(name='counts').reset_index()
        commonAPI.append([cluster, commonC['counts'].iloc[0], round(commonC['counts'].iloc[0]/raw_commonC.shape[0],4), commonC['pattern'].iloc[0]])
    commonAPI = pd.DataFrame(commonAPI, columns=['cluster', 'count', 'match_ratio', 'pattern'])
    commonAPI.to_csv(f"Clustering/Malicious/{name}_Common_API_Cluster.csv", index=False)
    print("")
    return commonAPI

def get_samplehash_common(inner_df, common_counts, name, size, samplesize):
    hashes = []
    global seed
    random.seed(seed)
    matching_samples = 0
    print(f"Random (Seed @ {seed}) Sampling Hashes subset of  Most Common API Patterns...")
    for pattern in range(common_counts.shape[0]):
        sub_df = inner_df[inner_df['pattern']==common_counts.iloc[pattern,3]]
        subsamples = len(sub_df['hash'].to_list())
        matching_samples += subsamples
        if subsamples < samplesize:
            subsamples = random.sample(sub_df['hash'].to_list(), len(sub_df['hash'].to_list()))
        else:
            subsamples = random.sample(sub_df['hash'].to_list(), samplesize)
        for subsample in subsamples:
            hashes.append([common_counts.iloc[pattern,:]['cluster'], subsample, '_', '_', '_', common_counts.iloc[pattern,:]['pattern']])
    hashes = pd.DataFrame(hashes, columns=['cluster', 'hash', 'Type 1', 'Type 2', 'Type 3', 'pattern'])
    hashes.to_csv(f"Clustering/Malicious/{name}_SampleHash_Common.csv", index=False)
    print(f"Commonality Ratio: {(matching_samples/inner_df.shape[0])*100:.4f}%")
    print("")
    return hashes
    
def inject_patterns(inner_df, inverse_labelled_df):
    '''Injects the API call patterns of each sample as its last column'''
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(list_to_str(inverse_labelled_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    print("")
    inverse_label(inner_df).to_csv(f"Clustering/Malicious/API_Patterns.csv", index=False)
    return inner_df

# 2. Load Dataset

In [8]:
df = load_df()
df = inject_patterns(df.copy(), inverse_label(df.copy()))
df

Loading DF...

Inverse Labelling...



Injecting API patterns...

Inverse Labelling...



Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,pattern
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,71,297,135,171,215,35,208,56,71,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,81,240,117,71,297,135,171,215,35,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,65,112,123,65,112,123,65,113,112,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,72049be7bd30ea61297ea624ae198067,82,208,187,208,172,117,172,117,172,...,208,302,208,302,187,208,302,228,302,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,209,260,40,209,260,141,260,141,260,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2535,5ed3176ac1222d9b2e595e70edf0fb04,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2536,1742087405387837e460d705ecefebf1,112,274,158,215,274,158,215,298,76,...,71,297,135,171,215,35,208,56,71,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2537,aab9a65a870337e5583013c6218407ba,82,240,117,240,117,240,117,240,117,...,117,240,117,82,172,117,16,31,215,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2538,018c37d434cf8b8a3ac83af7a372683a,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


In [9]:
reserve_df = df.copy()

# Explanation

The process as the following:
1. Find the best cluster size (by means of the [Silhouette Score](https://tushar-joshi-89.medium.com/silhouette-score-a9f7d8d78f29))
2. Search for the most common API pattern for each cluster
3. Sample n hashes per cluster that match the most common API pattern for the same cluster.

**Commonality Ratio**: How many of the matching most common API patterns per cluster are there relative to the dataset size. *(Higher is better)*

# 3. K-Means

In [10]:
df = reserve_df.copy()
print(df.shape, "\n")

algorithm = ['lloyd', 'elkan']
clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
samplesize = 10 # Max no. of samples to obtain from a cluster # Max no. of samples to obtain from a cluster

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "KMeans", KMeans(n_init='auto', verbose=0, random_state=seed), 
                              {'n_clusters': clusters, 'algorithm':algorithm})

commonAPI = common_api_cluster(df, "KMeans", bestClusterParam)
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "KMeans", bestClusterParam, samplesize)
display(commonHashes)

(2540, 102) 

Searching Best Clustering Parameters...


Parameter: {'algorithm': 'lloyd', 'n_clusters': 20} Score 0.3103 Unique_Labels 20 Time 0.4086
Parameter: {'algorithm': 'lloyd', 'n_clusters': 40} Score 0.3884 Unique_Labels 40 Time 0.3125
Parameter: {'algorithm': 'lloyd', 'n_clusters': 60} Score 0.4223 Unique_Labels 60 Time 0.3097
Parameter: {'algorithm': 'lloyd', 'n_clusters': 80} Score 0.4510 Unique_Labels 80 Time 0.3509
Parameter: {'algorithm': 'lloyd', 'n_clusters': 100} Score 0.4802 Unique_Labels 100 Time 0.3519
Parameter: {'algorithm': 'elkan', 'n_clusters': 20} Score 0.3103 Unique_Labels 20 Time 0.2442
Parameter: {'algorithm': 'elkan', 'n_clusters': 40} Score 0.3884 Unique_Labels 40 Time 0.2797
Parameter: {'algorithm': 'elkan', 'n_clusters': 60} Score 0.4223 Unique_Labels 60 Time 0.2916
Parameter: {'algorithm': 'elkan', 'n_clusters': 80} Score 0.4510 Unique_Labels 80 Time 0.3427
Parameter: {'algorithm': 'elkan', 'n_clusters': 100} Score 0.4802 Unique_Labels 100 Time 0.3525

BEST PARAM SETUP:  {'algorithm': 'lloyd', 'n_clusters':

Unnamed: 0,cluster,count,match_ratio,pattern
0,0,9,0.4286,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,1,188,0.9895,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2,2,60,1.0000,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
3,3,8,0.2286,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,4,63,1.0000,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
...,...,...,...,...
95,95,11,0.3143,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
96,96,24,0.8571,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
97,97,48,1.0000,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
98,98,2,0.5000,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 50.7480%



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,23f38ca8b47007fcf23b209063fb46aa,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,0,0aea3100fb39ae129837e2f9deacd221,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,0,5b22364b8c4b287e11555bef11c50c1b,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,0,e468d8f768ef1f4901def392dd562c42,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,0,0fe241af6c68c3b8292b382251da65fb,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
560,97,158c84cc65140c2ee6327ad82e294af3,_,_,_,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
561,98,20f07afeb0f00beaafb4d1a1b11276d3,_,_,_,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
562,98,458116640e4cfa143ffbf6d573aaf4b8,_,_,_,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
563,99,1fd6f8278f271b5ffd599e77d9e5b890,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


# 4. BisectingKMeans

In [11]:
df = reserve_df.copy()
print(df.shape, "\n")

algorithm = ['lloyd', 'elkan']
clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
samplesize = 10 # Max no. of samples to obtain from a cluster

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "BisectingKMeans", 
                              BisectingKMeans(random_state=seed, verbose=0, copy_x=True), 
                              {'n_clusters': clusters, 'algorithm':algorithm})

commonAPI = common_api_cluster(df, "BisectingKMeans", bestClusterParam)
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "BisectingKMeans", bestClusterParam, samplesize)
display(commonHashes)

(2540, 102) 

Searching Best Clustering Parameters...
Parameter: {'algorithm': 'lloyd', 'n_clusters': 20} Score 0.2543 Unique_Labels 20 Time 0.4066


Parameter: {'algorithm': 'lloyd', 'n_clusters': 40} Score 0.2889 Unique_Labels 40 Time 0.5428
Parameter: {'algorithm': 'lloyd', 'n_clusters': 60} Score 0.3312 Unique_Labels 60 Time 0.6019
Parameter: {'algorithm': 'lloyd', 'n_clusters': 80} Score 0.3577 Unique_Labels 80 Time 0.6103
Parameter: {'algorithm': 'lloyd', 'n_clusters': 100} Score 0.3819 Unique_Labels 100 Time 0.6831
Parameter: {'algorithm': 'elkan', 'n_clusters': 20} Score 0.2543 Unique_Labels 20 Time 0.3712
Parameter: {'algorithm': 'elkan', 'n_clusters': 40} Score 0.2889 Unique_Labels 40 Time 0.5555
Parameter: {'algorithm': 'elkan', 'n_clusters': 60} Score 0.3312 Unique_Labels 60 Time 0.6620
Parameter: {'algorithm': 'elkan', 'n_clusters': 80} Score 0.3577 Unique_Labels 80 Time 0.7343
Parameter: {'algorithm': 'elkan', 'n_clusters': 100} Score 0.3819 Unique_Labels 100 Time 0.7554

BEST PARAM SETUP:  {'algorithm': 'lloyd', 'n_clusters': 100} 0.38193370363331586

Inverse Labelling...


Searching for Common API Patterns per Cluste

Unnamed: 0,cluster,count,match_ratio,pattern
0,0,48,0.6400,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
1,1,63,1.0000,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
2,2,22,0.9565,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
3,3,1,0.2000,"__exception__,NtAllocateVirtualMemory,GetSyste..."
4,4,2,0.2222,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...
95,95,1,0.0476,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
96,96,1,0.5000,"NtAllocateVirtualMemory,NtCreateFile,NtAllocat..."
97,97,3,0.2727,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
98,98,5,0.1250,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 45.1181%



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,a902be388099f50c193879d4c84e864f,_,_,_,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
1,0,ed56822fee075f183ce14304d169708c,_,_,_,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
2,0,521b3e7c459f045ebc2aa40c0c082ee7,_,_,_,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
3,0,71735e0f1b2e9777ac5c8e12d933825b,_,_,_,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
4,0,a7fa6f06f61fba7f034480f85b8cb2b9,_,_,_,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
...,...,...,...,...,...,...
462,98,457764ec0a63186344f745993a6057ef,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
463,98,223b22fc8d3193af5686fcacc7a7e8ce,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
464,98,bae80a7a84322ce7cfb8b5a121d06ed0,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
465,99,ca68644d418c565c5343cbabcfe81ca7,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


# 5. MiniBatchKMeans

In [12]:
df = reserve_df.copy()
print(df.shape, "\n")

reassignment_ratio = [0.01, 0.1, 0.3]
clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
samplesize = 10 # Max no. of samples to obtain from a cluster

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "MiniBatchKMeans", 
                              MiniBatchKMeans(verbose=0, random_state=seed, n_init='auto', 
                                              batch_size=cpu_count()*256), 
                              {'n_clusters': clusters, 'reassignment_ratio':reassignment_ratio})

commonAPI = common_api_cluster(df, "MiniBatchKMeans", bestClusterParam)
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "MiniBatchKMeans", bestClusterParam, samplesize)
display(commonHashes)

(2540, 102) 

Searching Best Clustering Parameters...
Parameter: {'n_clusters': 20, 'reassignment_ratio': 0.01} Score 0.2917 Unique_Labels 20 Time 0.2956
Parameter: {'n_clusters': 20, 'reassignment_ratio': 0.1} Score 0.2882 Unique_Labels 20 Time 0.3493
Parameter: {'n_clusters': 20, 'reassignment_ratio': 0.3} Score 0.1756 Unique_Labels 17 Time 0.2530
Parameter: {'n_clusters': 40, 'reassignment_ratio': 0.01} Score 0.3867 Unique_Labels 40 Time 0.2858
Parameter: {'n_clusters': 40, 'reassignment_ratio': 0.1} Score 0.3748 Unique_Labels 39 Time 0.2791
Parameter: {'n_clusters': 40, 'reassignment_ratio': 0.3} Score 0.2045 Unique_Labels 34 Time 0.3463
Parameter: {'n_clusters': 60, 'reassignment_ratio': 0.01} Score 0.4113 Unique_Labels 60 Time 0.3377
Parameter: {'n_clusters': 60, 'reassignment_ratio': 0.1} Score 0.3599 Unique_Labels 54 Time 0.4212
Parameter: {'n_clusters': 60, 'reassignment_ratio': 0.3} Score 0.3101 Unique_Labels 53 Time 0.3251
Parameter: {'n_clusters': 80, 'reassignment_ratio': 

Unnamed: 0,cluster,count,match_ratio,pattern
0,0,3,0.2500,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
1,1,4,0.1538,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,12,0.6316,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
3,3,4,0.0580,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
4,4,1,0.1667,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
...,...,...,...,...
75,94,22,1.0000,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
76,95,1,0.5000,"NtAllocateVirtualMemory,LdrLoadDll,LdrGetProce..."
77,96,14,1.0000,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
78,97,1,1.0000,"__exception__,NtAllocateVirtualMemory,GetSyste..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 43.5433%



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,c849749715750b8a993fe797e9556dcd,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
1,0,846b9e780388604c9859d1619706a842,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
2,0,c87cbe02cddbc16e3bbf034fcd0d253b,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
3,1,b4a561bdc28b9d5563b4fe17cbc818ce,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,1,60ebd453611c3645bb8eac996a34f3ec,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
404,97,9a2088e32599289f83d3e7fb6ca76e4b,_,_,_,"__exception__,NtAllocateVirtualMemory,GetSyste..."
405,99,46a6726e6297592328d1fffd445a8552,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
406,99,222c563cd8f69270c78493673bf7208b,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
407,99,424c769950a4c04d8fe8dc9f2dbd50fd,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


# 6. DBSCAN

In [13]:
df = reserve_df.copy()
print(df.shape, "\n")

eps = [0.2,0.5,0.8]
min_samples = [5,10]
samplesize = 10 # Max no. of samples to obtain from a cluster

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "DBSCAN", DBSCAN(algorithm='auto', n_jobs=-1), 
                              {'min_samples' : min_samples, 'eps':eps})

commonAPI = common_api_cluster(df, "DBSCAN", bestClusterParam)
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "DBSCAN", bestClusterParam, samplesize)
display(commonHashes)

(2540, 102) 

Searching Best Clustering Parameters...
Parameter: {'eps': 0.2, 'min_samples': 5} Score 0.3230 Unique_Labels 60 Time 0.4537
Parameter: {'eps': 0.2, 'min_samples': 10} Score 0.2676 Unique_Labels 37 Time 0.3855
Parameter: {'eps': 0.5, 'min_samples': 5} Score 0.3230 Unique_Labels 60 Time 0.3412
Parameter: {'eps': 0.5, 'min_samples': 10} Score 0.2676 Unique_Labels 37 Time 0.3878
Parameter: {'eps': 0.8, 'min_samples': 5} Score 0.3230 Unique_Labels 60 Time 0.3972
Parameter: {'eps': 0.8, 'min_samples': 10} Score 0.2676 Unique_Labels 37 Time 0.3678

BEST PARAM SETUP:  {'eps': 0.2, 'min_samples': 5} 0.322959524380881

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[-1  0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22
 23 24 25 26 27 28 29 30 31 32 33 34 35]



Unnamed: 0,cluster,count,match_ratio,pattern
0,-1,9,0.0062,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
1,0,188,1.0,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2,1,10,1.0,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
3,2,25,1.0,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,3,56,1.0,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
5,4,31,1.0,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
6,5,54,1.0,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
7,6,25,1.0,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
8,7,28,1.0,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
9,8,63,1.0,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 43.5827%



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,-1,55f1c3e0aa031a198bce930eb7606a0c,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
1,-1,6b106429d95d6532c62fd679f2b492a1,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
2,-1,82682719779f7681a59af19b4fb2650c,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
3,-1,a9b3f3f84ea5cfcf0fcf44d4fda5e86d,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
4,-1,c9d2cb122eb8f7f04e8983020acaaaf4,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
...,...,...,...,...,...,...
364,35,3b8e13ce2f0ebbb4a6581f207be89dcf,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
365,35,02fb7152f8346181b4f7f9720f0811fe,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
366,35,0edc67218bb692eaf280ad82b90929f8,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
367,35,002a4383e3ef41f65d9041bf3cf51bc5,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."


# 7. Birch

In [14]:
df = reserve_df.copy()
print(df.shape, "\n")

clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
branching_factor = [25,50,75,100]
samplesize = 10 # Max no. of samples to obtain from a cluster

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "Birch", Birch(compute_labels=True), 
                              {'n_clusters': clusters, 'branching_factor':branching_factor})

commonAPI = common_api_cluster(df, "Birch", bestClusterParam)
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "Birch", bestClusterParam, samplesize)
display(commonHashes)

(2540, 102) 

Searching Best Clustering Parameters...


Parameter: {'branching_factor': 25, 'n_clusters': 20} Score 0.2364 Unique_Labels 20 Time 0.7134
Parameter: {'branching_factor': 25, 'n_clusters': 40} Score 0.2894 Unique_Labels 40 Time 0.5334
Parameter: {'branching_factor': 25, 'n_clusters': 60} Score 0.3164 Unique_Labels 60 Time 0.5433
Parameter: {'branching_factor': 25, 'n_clusters': 80} Score 0.3369 Unique_Labels 80 Time 0.5692
Parameter: {'branching_factor': 25, 'n_clusters': 100} Score 0.3528 Unique_Labels 100 Time 0.5967
Parameter: {'branching_factor': 50, 'n_clusters': 20} Score 0.2372 Unique_Labels 20 Time 0.4661
Parameter: {'branching_factor': 50, 'n_clusters': 40} Score 0.2920 Unique_Labels 40 Time 0.5094
Parameter: {'branching_factor': 50, 'n_clusters': 60} Score 0.3153 Unique_Labels 60 Time 0.4607
Parameter: {'branching_factor': 50, 'n_clusters': 80} Score 0.3410 Unique_Labels 80 Time 0.4809
Parameter: {'branching_factor': 50, 'n_clusters': 100} Score 0.3594 Unique_Labels 100 Time 0.4369
Parameter: {'branching_factor': 75, 

Unnamed: 0,cluster,count,match_ratio,pattern
0,0,8,0.1538,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
1,1,9,0.6923,"NtAllocateVirtualMemory,SetErrorMode,LoadStrin..."
2,2,60,0.2429,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
3,3,7,0.4118,"LdrGetDllHandle,LdrGetProcedureAddress,GetSyst..."
4,4,6,0.3750,"NtAllocateVirtualMemory,LdrLoadDll,LdrGetProce..."
...,...,...,...,...
95,95,7,0.5000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
96,96,25,0.9615,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
97,97,1,0.3333,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
98,98,16,0.9412,"NtDelayExecution,NtAllocateVirtualMemory,RegOp..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 36.5354%



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,2e9caac20a59ed5bdc83a92edd63ac23,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
1,0,746e441618178f3cc7d5ef4de3a45273,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,0,96c77dffd0b78b42b39cc488af0b9ae9,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
3,0,dfe8fd75a9a2e788acf20640583e808f,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,0,95b949d7621d146ca44fba022edfe732,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
...,...,...,...,...,...,...
406,99,776877116f253512f246923d30447a95,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
407,99,00a157f6bcc8f4a280175cb8599570b9,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
408,99,56f0110ad40759c688e1ea7d76ae6039,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
409,99,5499bb1cd56965f09ff1c368c2abdb88,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."


# 8. SpectralClustering

*Due to some dataset- or hardware-related limitations, SpectralClustering cannot be executed using the full Oliveira dataset.*

In [15]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# n_neighbors=[10,20,30,40,50]
# assign_labels=['kmeans','discretize']
# samplesize = 10 # Max no. of samples to obtain from a cluster # Max no. of samples to obtain from a cluster # Max no. of samples to obtain from a cluster # Max no. of samples to obtain from a cluster

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "SpectralClustering", 
#                               SpectralClustering(random_state=seed, affinity='nearest_neighbors', 
#                                                  assign_labels='kmeans', n_jobs=-1, verbose=0), 
#                               {'n_clusters': clusters, 'n_neighbors':n_neighbors, 'assign_labels':assign_labels}, 
#                               samplesize)

# commonAPI = common_api_cluster(df, "SpectralClustering", bestClusterParam)
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "SpectralClustering", bestClusterParam, samplesize)
# display(commonHashes)

# 9. AgglomerativeClustering (Ward)

*Due to memory-related limitations, AgglomerativeClustering cannot be executed using the full Oliveira dataset.*

In [16]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# samplesize = 10 # Max no. of samples to obtain from a cluster # Max no. of samples to obtain from a cluster # Max no. of samples to obtain from a cluster # Max no. of samples to obtain from a cluster

# print("Computing Connectivity...")
# start_time = time.time()
# connectivity = []
# connectivity.append(kneighbors_graph(get_x(df).values, 2, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df).values, 4, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df).values, 6, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df).values, 8, mode='connectivity', include_self='auto', n_jobs=-1))
# print(f"Connectivity Computation: {time.time()-start_time:.4f}(s)\n")

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "AgglomerativeClustering", AgglomerativeClustering(memory=".memory/", 
#                                                                                      linkage='complete', 
#                                                                                      connectivity=connectivity), 
#                               {'n_clusters': clusters, 'connectivity':connectivity})

# commonAPI = common_api_cluster(df, "AgglomerativeClustering", bestClusterParam)
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "AgglomerativeClustering", bestClusterParam, samplesize)
# display(commonHashes)

# 10. Optics

*Due to some dataset- or hardware-related limitations, Optics cannot be executed using the full Oliveira dataset in a reasonable time.*

In [17]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# leaf_size = [10,20,30,40,50] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# min_samples = [5,10,15,20]
# samplesize = 10 # Max no. of samples to obtain from a cluster # Max no. of samples to obtain from a cluster # Max no. of samples to obtain from a cluster # Max no. of samples to obtain from a cluster

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "OPTICS", OPTICS(algorithm='auto', memory=".memory/", n_jobs=-1), 
#                               {'leaf_size': leaf_size, 'min_samples':min_samples}, 
#                               samplesize)

# commonAPI = common_api_cluster(df, "OPTICS", bestClusterParam)
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "OPTICS", bestClusterParam, samplesize)
# display(commonHashes)

# 11. MeanShift

*Due to some hardware-related limitations, MeanShift cannot be executed using the lite/full Oliveira dataset in a reasonable time.*

In [18]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# max_iter = [300,500] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# samplesize = 10 # Max no. of samples to obtain from a cluster # Max no. of samples to obtain from a cluster # Max no. of samples to obtain from a cluster # Max no. of samples to obtain from a cluster

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "MeanShift", MeanShift(n_jobs=-1), {'max_iter': max_iter})

# commonAPI = common_api_cluster(df, "MeanShift", bestClusterParam)
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "MeanShift", bestClusterParam, samplesize)
# display(commonHashes)