# Clustering Demo (Benign)

**Clustering Methods not Supported:** GaussianMixture & HDBScan

In [1]:
import random
import pandas as pd
from sklearn.cluster import KMeans, SpectralClustering, BisectingKMeans, MiniBatchKMeans, Birch
from sklearn.cluster import AgglomerativeClustering, OPTICS, MeanShift, DBSCAN
from sklearn.model_selection import ParameterGrid
from sklearn import metrics
import matplotlib.pyplot as plt
import multiprocessing
from sklearn.neighbors import kneighbors_graph
import time

def cpuCount():
    return multiprocessing.cpu_count()

def listToStr(ls):
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def loadDF():
    print("Loading DF...")
    df = pd.read_csv("oliveira.csv", low_memory=False, memory_map=True)
    df = df[df['malware'] == 0].copy()
    df = df.drop('malware', axis=1)
    print("")
    return df.reset_index().iloc[:,1:]

def getX(df):
    return df.iloc[:, 1:102-1]

#Load list of API calls
API_LIST = "api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

#Random Seed
seed = 1

#Inverse Label Encoding
def inverse_labeller(item):
    global APIS
    return item.map(lambda x: APIS[int(x)])
def inverse_label(df:pd.DataFrame):
    print("Inverse Labelling...")
    df.iloc[:, 1:101] = df.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
    print("")
    return df

From: https://medium.com/swlh/k-means-clustering-on-high-dimensional-data-d2151e1a4240

In [2]:
def searchK(parameters, model, X):
    paramGrid = ParameterGrid(parameters)
    best_score = -1
    best_grid = -1
    best_clusterer = None
    silhouette_scores = []
    # evaluation based on silhouette_score
    print("Searching Best Clustering Parameters...")
    for p in paramGrid:
        model.set_params(**p)    # set current hyper parameter
        start_time = time.time()
        model.fit(X)          # fit model on wine dataset, this will find clusters based on parameter p
        ss = metrics.silhouette_score(X, model.labels_, random_state=seed)   # calculate silhouette_score
        silhouette_scores += [ss]       # store all the scores
        print('Parameter:', p, 'Score', f"{ss:.4f}", 'Unique_Labels', len(pd.Series(model.labels_).unique()), "Time", f"{time.time()-start_time:.4f}")
        # check p which has the best score
        if ss > best_score:
            best_score = ss
            best_grid = p
            best_clusterer = model
    print("")
    print("BEST PARAM SETUP: ", best_grid, best_score)
    print("")
    return best_grid, best_clusterer

def clustering(inner_df, name, clusterer, parameters):
    X = getX(inner_df)
    bestCluster, bestClusterer = searchK(parameters, clusterer, X) #assumes bestCluster already fitted
    #bestClusterer.fit(X)
    inner_df['cluster'] = bestClusterer.labels_
    inner_df.to_csv(f"Clustering/Benign/{name}_Encoded_Clustering.csv", index=False)
    inner_df = inverse_label(inner_df)
    inner_df.to_csv(f"Clustering/Benign/{name}_Clustering.csv", index=False)
    print("")
    return bestCluster

def commonAPICluster(inner_df, name, size):
    inner_df = df
    clusters = inner_df['cluster'].unique()
    clusters.sort()
    columns = inner_df.columns[1:102]
    commonAPI = []
    print("Searching for Common API Patterns per Cluster...")
    print(clusters)
    for cluster in clusters:
        raw_commonC = inner_df[inner_df['cluster']==cluster]['pattern']#.value_counts()
        commonC = raw_commonC.value_counts().to_frame(name='counts').reset_index()
        commonAPI.append([cluster, commonC['counts'].iloc[0], round(commonC['counts'].iloc[0]/raw_commonC.shape[0],4), commonC['pattern'].iloc[0]])
    commonAPI = pd.DataFrame(commonAPI, columns=['cluster', 'count', 'match_ratio', 'pattern'])
    commonAPI.to_csv(f"Clustering/Benign/{name}_CommonAPICluster.csv", index=False)
    print("")
    return commonAPI

def getSampleHash_Common(inner_df, common_counts, name, size, samplesize):
    hashes = []
    global seed
    random.seed(seed)
    matching_samples = 0
    print(f"Random (Seed @ {seed}) Sampling Hashes subset of  Most Common API Patterns...")
    for pattern in range(common_counts.shape[0]):
        sub_df = inner_df[inner_df['pattern']==common_counts.iloc[pattern,3]]
        subsamples = len(sub_df['hash'].to_list())
        matching_samples += subsamples
        if subsamples < samplesize:
            subsamples = random.sample(sub_df['hash'].to_list(), len(sub_df['hash'].to_list()))
        else:
            subsamples = random.sample(sub_df['hash'].to_list(), samplesize)
        for subsample in subsamples:
            hashes.append([common_counts.iloc[pattern,:]['cluster'], subsample, common_counts.iloc[pattern,:]['pattern']])
    hashes = pd.DataFrame(hashes, columns=['cluster', 'hash', 'pattern'])
    hashes.to_csv(f"Clustering/Benign/{name}_SampleHash_Common.csv", index=True)
    print(f"Commonality Ratio: {(matching_samples/inner_df.shape[0])*100:.4f}%")
    print("")
    return hashes
    
def injectPatterns(inner_df, inverse_labelled_df):
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(listToStr(inverse_labelled_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    print("")
    return inner_df

# Load Dataset

In [3]:
df = loadDF()
df = injectPatterns(df.copy(), inverse_label(df.copy()))
df

Loading DF...

Inverse Labelling...

Injecting API patterns...



Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,pattern
0,5b51d65972a349f90a86984c26b12b30,286,110,172,240,117,240,117,240,117,...,215,114,215,117,261,106,144,297,117,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,ceb8cc125478fad641daa4e04e9b2f19,198,208,106,271,144,194,257,127,114,...,215,86,215,172,117,215,86,215,297,"GetSystemInfo,NtAllocateVirtualMemory,NtOpenSe..."
2,f108600edf46d7c20f6acc522aeba6df,82,228,16,29,82,29,82,29,82,...,286,73,286,208,286,73,286,257,114,"GetSystemTimeAsFileTime,NtProtectVirtualMemory..."
3,711be6337cb78a948f04759a0bd210ce,82,240,117,240,117,240,117,240,117,...,117,208,117,35,240,117,35,208,240,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,6de26f67ceb1e3303b889489010f4c3f,286,110,172,240,117,240,117,240,117,...,215,114,215,117,71,25,71,275,260,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1074,d282ef96a93986f89825508812958354,286,110,172,240,117,240,117,240,117,...,215,117,208,172,117,172,117,172,117,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1075,c0389d256f976044adf570f0df908953,82,16,243,286,2,142,31,89,208,...,208,240,117,208,35,260,208,35,208,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
1076,20316e717de5db169aecbb67377504ce,16,86,208,215,86,215,86,215,208,...,260,141,65,260,141,65,260,141,65,"SetUnhandledExceptionFilter,NtCreateMutant,NtA..."
1077,ce945d424b93ea73fbbedf0254f6bc07,215,274,158,215,274,158,215,172,117,...,240,117,240,117,240,117,172,60,81,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."


In [4]:
reserve_df = df.copy()

# Explanation

The process as the following:
1. Find the best cluster size (by means of the [Silhouette Score](https://tushar-joshi-89.medium.com/silhouette-score-a9f7d8d78f29))
2. Search for the most common API pattern for each cluster
3. Sample n hashes per cluster that match the most common API pattern for the same cluster.

**Commonality Ratio**: How many of the matching most common API patterns per cluster are there relative to the dataset size. *(Higher is better)*

# K-Means

In [5]:
df = reserve_df.copy()
print(df.shape, "\n")

clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
samplesize = 5

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "KMeans", KMeans(n_init='auto', algorithm='elkan', verbose=0, random_state=seed), 
                              {'n_clusters': clusters})

commonAPI = commonAPICluster(df, "KMeans")
display(commonAPI)

commonHashes = getSampleHash_Common(df, commonAPI, "KMeans", samplesize)
display(commonHashes)

(1079, 102) 

Searching Best Clustering Parameters...
Parameter: {'n_clusters': 20} Score 0.3072 Unique_Labels 20 Time 0.0990
Parameter: {'n_clusters': 40} Score 0.3530 Unique_Labels 40 Time 0.1005
Parameter: {'n_clusters': 60} Score 0.3611 Unique_Labels 60 Time 0.1355
Parameter: {'n_clusters': 80} Score 0.3756 Unique_Labels 80 Time 0.1491
Parameter: {'n_clusters': 100} Score 0.3773 Unique_Labels 100 Time 0.1563

BEST PARAM SETUP:  {'n_clusters': 100} 0.3773137789277438

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]



Unnamed: 0,cluster,count,match_ratio,pattern
0,0,94,0.9216,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,1,1,0.0769,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,2,13,0.7222,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
3,3,1,0.5000,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
4,4,11,0.7857,"SetErrorMode,LdrGetDllHandle,LdrGetProcedureAd..."
...,...,...,...,...
95,95,1,1.0000,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
96,96,2,0.1429,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
97,97,1,0.3333,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
98,98,1,1.0000,"GetSystemTimeAsFileTime,NtProtectVirtualMemory..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 44.4856%



Unnamed: 0,cluster,hash,pattern
0,0,305e3f21c4d60a51fd28929465aa8856,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,0,176f927cf8c3fae16bd339571a4d3f07,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
2,0,a8546eed414a7d9df90aa1f0bcc6db9e,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
3,0,603551be15c5ceca1020b55376de7815,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
4,0,54b46953e99d246408beb86e88917efe,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
...,...,...,...
195,96,bca0ce2634812df00370861b7af1dd70,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
196,96,57c91b12e7d4cec2c9ae38c7c517e09f,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
197,97,e9876296a05eb5e4d4bf49b319f7b2d9,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
198,98,4a02ac2afd70e112ae1825bcceab485a,"GetSystemTimeAsFileTime,NtProtectVirtualMemory..."


# BisectingKMeans

In [6]:
df = reserve_df.copy()
print(df.shape, "\n")

clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
samplesize = 5

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "BisectingKMeans", 
                              BisectingKMeans(init='k-means++', random_state=seed, verbose=0, copy_x=True, 
                                              algorithm='lloyd'), 
                              {'n_clusters': clusters})

commonAPI = commonAPICluster(df, "BisectingKMeans")
display(commonAPI)

commonHashes = getSampleHash_Common(df, commonAPI, "BisectingKMeans", samplesize)
display(commonHashes)

(1079, 102) 

Searching Best Clustering Parameters...
Parameter: {'n_clusters': 20} Score 0.2952 Unique_Labels 20 Time 0.2714
Parameter: {'n_clusters': 40} Score 0.2978 Unique_Labels 40 Time 0.2462
Parameter: {'n_clusters': 60} Score 0.2940 Unique_Labels 60 Time 0.3173
Parameter: {'n_clusters': 80} Score 0.3000 Unique_Labels 80 Time 0.4004
Parameter: {'n_clusters': 100} Score 0.3018 Unique_Labels 100 Time 0.4735

BEST PARAM SETUP:  {'n_clusters': 100} 0.3018313286164507

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]



Unnamed: 0,cluster,count,match_ratio,pattern
0,0,154,0.7264,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,1,94,0.9126,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
2,2,1,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,3,1,1.0000,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
4,4,1,0.5000,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
...,...,...,...,...
95,95,1,0.1000,"GetSystemTimeAsFileTime,NtProtectVirtualMemory..."
96,96,1,0.0769,"GetSystemTimeAsFileTime,NtProtectVirtualMemory..."
97,97,1,0.2500,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
98,98,2,0.2000,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 41.0565%



Unnamed: 0,cluster,hash,pattern
0,0,7cc7e4d98bcd444365e210db7181c753,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,0,85cc8c7d1e0575d4e7f439834756fe25,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
2,0,56b2bf994ce95ca4495a99f9d75c5462,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
3,0,3cedd98ea184c22ee3b024c72a96e075,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
4,0,79b78bb3d583748040c41ded09555fd3,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
...,...,...,...
169,99,3865ac777d267a8aa49299d10a076007,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
170,99,95de3cf54e0a360eed766dbddf152f0d,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
171,99,7f404ed2bad3365f1a6452dbe40024fd,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
172,99,967047584598b8ea09a742328872c06d,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."


# MiniBatchKMeans

In [7]:
df = reserve_df.copy()
print(df.shape, "\n")

clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
samplesize = 5

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "MiniBatchKMeans", 
                              MiniBatchKMeans(init='k-means++', verbose=0, random_state=seed, tol=0.001, n_init='auto', 
                                              batch_size=cpuCount()*256), 
                              {'n_clusters': clusters})

commonAPI = commonAPICluster(df, "MiniBatchKMeans")
display(commonAPI)

commonHashes = getSampleHash_Common(df, commonAPI, "MiniBatchKMeans", samplesize)
display(commonHashes)

(1079, 102) 

Searching Best Clustering Parameters...
Parameter: {'n_clusters': 20} Score 0.3274 Unique_Labels 20 Time 0.2026
Parameter: {'n_clusters': 40} Score 0.3750 Unique_Labels 38 Time 0.1991
Parameter: {'n_clusters': 60} Score 0.3646 Unique_Labels 57 Time 0.2971
Parameter: {'n_clusters': 80} Score 0.3719 Unique_Labels 76 Time 0.2346
Parameter: {'n_clusters': 100} Score 0.3815 Unique_Labels 94 Time 0.3581

BEST PARAM SETUP:  {'n_clusters': 100} 0.3815283717621825

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ 0  2  3  4  5  6  8  9 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26
 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74
 75 76 77 78 79 80 81 82 84 85 86 87 88 89 90 91 92 93 94 97 98 99]



Unnamed: 0,cluster,count,match_ratio,pattern
0,0,11,1.0000,"SetErrorMode,LdrGetDllHandle,LdrGetProcedureAd..."
1,2,2,0.6667,"SetUnhandledExceptionFilter,LdrGetDllHandle,Ge..."
2,3,1,0.0625,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
3,4,50,1.0000,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
4,5,154,1.0000,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
...,...,...,...,...
89,93,1,1.0000,"GetSystemTimeAsFileTime,NtProtectVirtualMemory..."
90,94,1,1.0000,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
91,97,1,0.2500,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
92,98,1,0.0667,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 49.8610%



Unnamed: 0,cluster,hash,pattern
0,0,e550d4d96d8b9d771c6e227aff2b9029,"SetErrorMode,LdrGetDllHandle,LdrGetProcedureAd..."
1,0,4dabfc6c5aeabd35fac449258e174856,"SetErrorMode,LdrGetDllHandle,LdrGetProcedureAd..."
2,0,6d8c0c930480a524ad1e0e18e345b7ce,"SetErrorMode,LdrGetDllHandle,LdrGetProcedureAd..."
3,0,ff8a61c9edbcbd84e23b069496e73db3,"SetErrorMode,LdrGetDllHandle,LdrGetProcedureAd..."
4,0,3506356c329758e4f703cd2103d7daab,"SetErrorMode,LdrGetDllHandle,LdrGetProcedureAd..."
...,...,...,...
205,99,0ca449c2a395b980571f628c3d178d8b,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
206,99,fb4259163f4258693cc85d5f06edeeea,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
207,99,d63a7d1a196c40cfec247b13e3c1c773,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
208,99,0fca2620b9f96936b7594fc650b1d8ca,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."


# DBSCAN

In [8]:
df = reserve_df.copy()
print(df.shape, "\n")
min_samples = [5,6,7,8,9,10]
samplesize = 5

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "DBSCAN", DBSCAN(algorithm='auto', n_jobs=-1), 
                              {'min_samples' : min_samples})

commonAPI = commonAPICluster(df, "DBSCAN")
display(commonAPI)

commonHashes = getSampleHash_Common(df, commonAPI, "DBSCAN", samplesize)
display(commonHashes)

(1079, 102) 

Searching Best Clustering Parameters...
Parameter: {'min_samples': 5} Score 0.2931 Unique_Labels 20 Time 0.0909
Parameter: {'min_samples': 6} Score 0.2752 Unique_Labels 14 Time 0.0791
Parameter: {'min_samples': 7} Score 0.2697 Unique_Labels 13 Time 0.1052
Parameter: {'min_samples': 8} Score 0.2771 Unique_Labels 10 Time 0.0980
Parameter: {'min_samples': 9} Score 0.2638 Unique_Labels 9 Time 0.0787
Parameter: {'min_samples': 10} Score 0.2638 Unique_Labels 9 Time 0.1106

BEST PARAM SETUP:  {'min_samples': 5} 0.2931169656126402

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[-1  0  1  2  3  4  5  6  7]



Unnamed: 0,cluster,count,match_ratio,pattern
0,-1,8,0.0111,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,0,154,1.0,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
2,1,94,1.0,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
3,2,50,1.0,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
4,3,12,1.0,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
5,4,13,1.0,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
6,5,12,1.0,"LdrGetDllHandle,LdrGetProcedureAddress,LdrGetD..."
7,6,11,1.0,"SetErrorMode,LdrGetDllHandle,LdrGetProcedureAd..."
8,7,12,1.0,"GetSystemTimeAsFileTime,NtOpenKey,NtQueryValue..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 33.9203%



Unnamed: 0,cluster,hash,pattern
0,-1,ca823c82789ba7a8506eb000ed2de167,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,-1,33d6bfb8d27bfbc6aaa27ebbb83d1e39,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
2,-1,657fa0774a9110e7a2b1df370ec1a927,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
3,-1,0b0564356cf019b3a2e94d7ba497c87c,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
4,-1,22cc743a96c926817719872e07c351cd,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
5,0,196c9158504207854cd4d7c11c1e3ba5,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
6,0,64a3017d0c6b3aadd64e02e8682e84c1,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
7,0,951edbb498184e7a4f2ce4b7a26a1fcf,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
8,0,ddc559ab995e489f7ad9c37d3203f0cf,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
9,0,08f2f8e28405bd9eb7cc306bf25264a1,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."


# Birch

In [9]:
df = reserve_df.copy()
print(df.shape, "\n")
clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
branching_factor = [25,50,75,100]
samplesize = 5

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "Birch", Birch(compute_labels=True), 
                              {'n_clusters': clusters, 'branching_factor':branching_factor})

commonAPI = commonAPICluster(df, "Birch")
display(commonAPI)

commonHashes = getSampleHash_Common(df, commonAPI, "Birch", samplesize)
display(commonHashes)

(1079, 102) 

Searching Best Clustering Parameters...
Parameter: {'branching_factor': 25, 'n_clusters': 20} Score 0.2973 Unique_Labels 20 Time 0.1901
Parameter: {'branching_factor': 25, 'n_clusters': 40} Score 0.3032 Unique_Labels 40 Time 0.1615
Parameter: {'branching_factor': 25, 'n_clusters': 60} Score 0.3245 Unique_Labels 60 Time 0.1575
Parameter: {'branching_factor': 25, 'n_clusters': 80} Score 0.3397 Unique_Labels 80 Time 0.1590
Parameter: {'branching_factor': 25, 'n_clusters': 100} Score 0.3511 Unique_Labels 100 Time 0.1570
Parameter: {'branching_factor': 50, 'n_clusters': 20} Score 0.2968 Unique_Labels 20 Time 0.1361
Parameter: {'branching_factor': 50, 'n_clusters': 40} Score 0.2972 Unique_Labels 40 Time 0.1417
Parameter: {'branching_factor': 50, 'n_clusters': 60} Score 0.3196 Unique_Labels 60 Time 0.1363
Parameter: {'branching_factor': 50, 'n_clusters': 80} Score 0.3360 Unique_Labels 80 Time 0.1273
Parameter: {'branching_factor': 50, 'n_clusters': 100} Score 0.3566 Unique_Label

Unnamed: 0,cluster,count,match_ratio,pattern
0,0,1,0.1429,"GetSystemTimeAsFileTime,GetSystemWindowsDirect..."
1,1,2,0.5000,"GetSystemTimeAsFileTime,GetSystemInfo,NtOpenKe..."
2,2,1,0.0833,"GetSystemInfo,NtAllocateVirtualMemory,NtOpenSe..."
3,3,1,0.1429,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
4,4,1,0.1250,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...
95,95,1,0.2500,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
96,96,1,1.0000,"FindWindowA,NtOpenFile,NtClose,NtOpenFile,NtCl..."
97,97,1,0.1000,"GetSystemTimeAsFileTime,EnumWindows,OleInitial..."
98,98,4,1.0000,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 33.3642%



Unnamed: 0,cluster,hash,pattern
0,0,7b4878ca9fe993fecc862023440f1cff,"GetSystemTimeAsFileTime,GetSystemWindowsDirect..."
1,1,7718f675bd31f6532021bcee54fac97b,"GetSystemTimeAsFileTime,GetSystemInfo,NtOpenKe..."
2,1,42ccd69a3be9618d329de0ea0fde3a81,"GetSystemTimeAsFileTime,GetSystemInfo,NtOpenKe..."
3,2,7b5d36a3009c11a9b70bda1ad98190b4,"GetSystemInfo,NtAllocateVirtualMemory,NtOpenSe..."
4,3,46578f39b7d29d70606db4e16a9f081c,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
...,...,...,...
174,98,37e31a84967f6e5135ff0cfd10bfe487,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
175,98,ccc607182821134a38d7a400ae063f73,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
176,98,337cc9e8ea6f7be53eb1b200365ce918,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."
177,99,b239106b9b5beb8e0fa4b6584af701e8,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."


# SpectralClustering

*Due to some dataset- or hardware-related limitations, SpectralClustering cannot be executed using the full Oliveira dataset.*

In [10]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# n_neighbors=[10,20,30,40,50]
# assign_labels=['kmeans','discretize']
# samplesize = 5

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "SpectralClustering", 
#                               SpectralClustering(random_state=seed, affinity='nearest_neighbors', 
#                                                  assign_labels='kmeans', n_jobs=-1, verbose=0), 
#                               {'n_clusters': clusters, 'n_neighbors':n_neighbors, 'assign_labels':assign_labels}, 
#                               samplesize)

# commonAPI = commonAPICluster(df, "SpectralClustering")
# display(commonAPI)

# commonHashes = getSampleHash_Common(df, commonAPI, "SpectralClustering", samplesize)
# display(commonHashes)

# AgglomerativeClustering (Ward)

*Due to memory-related limitations, AgglomerativeClustering cannot be executed using the full Oliveira dataset.*

In [11]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# samplesize = 5

# print("Computing Connectivity...")
# start_time = time.time()
# connectivity = []
# connectivity.append(kneighbors_graph(getX(df).values, 2, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(getX(df).values, 4, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(getX(df).values, 6, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(getX(df).values, 8, mode='connectivity', include_self='auto', n_jobs=-1))
# print(f"Connectivity Computation: {time.time()-start_time:.4f}(s)\n")

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "AgglomerativeClustering", AgglomerativeClustering(memory=".memory/", 
#                                                                                      linkage='complete', 
#                                                                                      connectivity=connectivity), 
#                               {'n_clusters': clusters, 'connectivity':connectivity})

# commonAPI = commonAPICluster(df, "AgglomerativeClustering")
# display(commonAPI)

# commonHashes = getSampleHash_Common(df, commonAPI, "AgglomerativeClustering", samplesize)
# display(commonHashes)

# Optics

*Due to some dataset- or hardware-related limitations, Optics cannot be executed using the full Oliveira dataset in a reasonable time.*

In [12]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# leaf_size = [10,20,30,40,50] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# min_samples = [5,10,15,20]
# samplesize = 5

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "OPTICS", OPTICS(algorithm='auto', memory=".memory/", n_jobs=-1), 
#                               {'leaf_size': leaf_size, 'min_samples':min_samples}, 
#                               samplesize)

# commonAPI = commonAPICluster(df, "OPTICS")
# display(commonAPI)

# commonHashes = getSampleHash_Common(df, commonAPI, "OPTICS", samplesize)
# display(commonHashes)

# MeanShift

*Due to some hardware-related limitations, MeanShift cannot be executed using the lite/full Oliveira dataset in a reasonable time.*

In [13]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# max_iter = [300,500] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# samplesize = 5

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "MeanShift", MeanShift(n_jobs=-1), {'max_iter': max_iter})

# commonAPI = commonAPICluster(df, "MeanShift")
# display(commonAPI)

# commonHashes = getSampleHash_Common(df, commonAPI, "MeanShift", samplesize)
# display(commonHashes)