# Clustering Demo (Malicious)

**Clustering Methods not Supported:** GaussianMixture & HDBScan

In [1]:
import random
import pandas as pd
from sklearn.cluster import KMeans, SpectralClustering, BisectingKMeans, MiniBatchKMeans, Birch
from sklearn.cluster import AgglomerativeClustering, OPTICS, MeanShift, DBSCAN
from sklearn.model_selection import ParameterGrid
from sklearn import metrics
import matplotlib.pyplot as plt
import multiprocessing
from sklearn.neighbors import kneighbors_graph
import time

def cpuCount():
    return multiprocessing.cpu_count()

def listToStr(ls):
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def loadDF():
    print("Loading DF...")
    df = pd.read_csv("oliveira.csv", low_memory=False, memory_map=True)
    df = df[df['malware'] == 1].copy()
    df = df.drop('malware', axis=1)
    print("")
    return df.reset_index().iloc[:,1:]

def getX(df):
    return df.iloc[:, 1:102-1]

#Load list of API calls
API_LIST = "api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

#Random Seed
seed = 1

#Inverse Label Encoding
def inverse_labeller(item):
    global APIS
    return item.map(lambda x: APIS[int(x)])
def inverse_label(df):
    print("Inverse Labelling...")
    df.iloc[:, 1:101] = df.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
    print("")
    return df

From: https://medium.com/swlh/k-means-clustering-on-high-dimensional-data-d2151e1a4240

In [2]:
def searchK(parameters, model, X):
    paramGrid = ParameterGrid(parameters)
    best_score = -1
    best_grid = -1
    best_clusterer = None
    silhouette_scores = []
    # evaluation based on silhouette_score
    print("Searching Best Clustering Parameters...")
    for p in paramGrid:
        model.set_params(**p)    # set current hyper parameter
        startTime = time.time()
        model.fit(X)          # fit model on wine dataset, this will find clusters based on parameter p
        ss = metrics.silhouette_score(X, model.labels_, random_state=seed)   # calculate silhouette_score
        silhouette_scores += [ss]       # store all the scores
        print('Parameter:', p, 'Score', f"{ss:.4f}", 'Unique_Labels', len(pd.Series(model.labels_).unique()), "Time", f"{time.time()-startTime:.4f}")
        # check p which has the best score
        if ss > best_score:
            best_score = ss
            best_grid = p
            best_clusterer = model
    print("")
    print("BEST PARAM SETUP: ", best_grid, best_score)
    print("")
    return best_grid, best_clusterer

def clustering(inner_df, name, clusterer, parameters):
    X = getX(inner_df)
    bestCluster, bestClusterer = searchK(parameters, clusterer, X) #assumes bestCluster already fitted
    #bestClusterer.fit(X)
    inner_df['cluster'] = bestClusterer.labels_
    inner_df = inverse_label(inner_df)
    inner_df.to_csv(f"Clustering/Malicious/{name}_Clustering.csv", index=False)
    print("")
    return bestCluster

def commonAPICluster(inner_df, name, size):
    inner_df = df
    clusters = inner_df['cluster'].unique()
    clusters.sort()
    columns = inner_df.columns[1:102]
    commonAPI = []
    print("Searching for Common API Patterns per Cluster...")
    print(clusters)
    for cluster in clusters:
        raw_commonC = inner_df[inner_df['cluster']==cluster]['pattern']#.value_counts()
        commonC = raw_commonC.value_counts().to_frame(name='counts').reset_index()
        commonAPI.append([cluster, commonC['counts'].iloc[0], round(commonC['counts'].iloc[0]/raw_commonC.shape[0],4), commonC['pattern'].iloc[0]])
    commonAPI = pd.DataFrame(commonAPI, columns=['cluster', 'count', 'match_ratio', 'pattern'])
    commonAPI.to_csv(f"Clustering/Malicious/{name}_CommonAPICluster.csv", index=False)
    print("")
    return commonAPI

def getSampleHash_Common(inner_df, common_counts, name, size, samplesize):
    hashes = []
    global seed
    random.seed(seed)
    matching_samples = 0
    print(f"Random (Seed @ {seed}) Sampling Hashes subset of  Most Common API Patterns...")
    for pattern in range(common_counts.shape[0]):
        sub_df = inner_df[inner_df['pattern']==common_counts.iloc[pattern,3]]
        subsamples = len(sub_df['hash'].to_list())
        matching_samples += subsamples
        if subsamples < samplesize:
            subsamples = random.sample(sub_df['hash'].to_list(), len(sub_df['hash'].to_list()))
        else:
            subsamples = random.sample(sub_df['hash'].to_list(), samplesize)
        for subsample in subsamples:
            hashes.append([common_counts.iloc[pattern,:]['cluster'], subsample, common_counts.iloc[pattern,:]['pattern']])
    hashes = pd.DataFrame(hashes, columns=['cluster', 'hash', 'pattern'])
    hashes.to_csv(f"Clustering/Malicious/{name}_SampleHash_Common.csv", index=True)
    print(f"Commonality Ratio: {(matching_samples/inner_df.shape[0])*100:.4f}%")
    print("")
    return hashes
    
def injectPatterns(inner_df, inverse_labelled_df):
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(listToStr(inverse_labelled_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    print("")
    return inner_df

# Load Dataset

In [3]:
df = loadDF()
df = injectPatterns(df.copy(), inverse_label(df.copy()))
df

Loading DF...

Inverse Labelling...

Injecting API patterns...



Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,pattern
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,71,297,135,171,215,35,208,56,71,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,81,240,117,71,297,135,171,215,35,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,65,112,123,65,112,123,65,113,112,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,72049be7bd30ea61297ea624ae198067,82,208,187,208,172,117,172,117,172,...,208,302,208,302,187,208,302,228,302,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,209,260,40,209,260,141,260,141,260,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42792,e3d6d58faa040f0f9742c9d0eaf58be4,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42793,9b917bab7f32188ae40c744f2be9aaf8,82,240,117,240,117,240,117,240,117,...,159,224,82,159,224,82,159,224,82,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42794,35a18ee05f75f04912018d9f462cb990,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,260,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42795,654139d715abcf7ecdddbef5a84f224b,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


In [4]:
reserve_df = df.copy()

# Explanation

The process as the following:
1. Find the best cluster size (by means of the [Silhouette Score](https://tushar-joshi-89.medium.com/silhouette-score-a9f7d8d78f29))
2. Search for the most common API pattern for each cluster
3. Sample n hashes per cluster that match the most common API pattern for the same cluster.

**Commonality Ratio**: How many of the matching most common API patterns per cluster are there relative to the dataset size. *(Higher is better)*

# K-Means

In [5]:
df = reserve_df.copy()
print(df.shape, "\n")

clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
samplesize = 5

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "KMeans", KMeans(n_init='auto', algorithm='elkan', verbose=0, random_state=seed), 
                              {'n_clusters': clusters})

commonAPI = commonAPICluster(df, "KMeans", bestClusterParam)
display(commonAPI)

commonHashes = getSampleHash_Common(df, commonAPI, "KMeans", bestClusterParam, samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'n_clusters': 20} Score 0.3004 Unique_Labels 20 Time 29.9716
Parameter: {'n_clusters': 40} Score 0.3771 Unique_Labels 40 Time 29.5898
Parameter: {'n_clusters': 60} Score 0.4337 Unique_Labels 60 Time 30.3644
Parameter: {'n_clusters': 80} Score 0.4681 Unique_Labels 80 Time 31.1324
Parameter: {'n_clusters': 100} Score 0.4979 Unique_Labels 100 Time 31.7785

BEST PARAM SETUP:  {'n_clusters': 100} 0.49788832505950503

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]



Unnamed: 0,cluster,count,match_ratio,pattern
0,0,306,0.9745,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,1,5,0.0333,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,344,0.4674,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,3,3308,0.9928,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
4,4,1116,0.9654,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
...,...,...,...,...
95,95,18,0.0634,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
96,96,17,0.0929,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
97,97,13,0.0637,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
98,98,39,0.4194,"SetErrorMode,LdrGetDllHandle,LdrGetProcedureAd..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 49.2464%



Unnamed: 0,cluster,hash,pattern
0,0,490d584c7d303ed35c673460b63f3ca8,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,0,9ab8ea1d2d68a0d4110df413e677976c,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
2,0,adbc74815ef2bd1ea4967abad812233d,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
3,0,f6eb4841bba3a4cee747700dc0ee1609,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
4,0,f5a0ad49337ebc87897698e70d03364e,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
...,...,...,...
490,99,38beaa14fdd861489b7c1e88161266f9,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
491,99,125e4dfc79fbfdadfeba0fea49533621,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
492,99,ce4823889c3c5f42ffd5654be87d8ff3,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
493,99,d7f05bb88c5547e567e0a4ee484feba4,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."


# BisectingKMeans

In [6]:
df = reserve_df.copy()
print(df.shape, "\n")

clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
samplesize = 5

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "BisectingKMeans", 
                              BisectingKMeans(init='k-means++', random_state=seed, verbose=0, copy_x=True, 
                                              algorithm='lloyd'), 
                              {'n_clusters': clusters})

commonAPI = commonAPICluster(df, "BisectingKMeans", bestClusterParam)
display(commonAPI)

commonHashes = getSampleHash_Common(df, commonAPI, "BisectingKMeans", bestClusterParam, samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'n_clusters': 20} Score 0.2778 Unique_Labels 20 Time 30.7842
Parameter: {'n_clusters': 40} Score 0.3334 Unique_Labels 40 Time 32.9465
Parameter: {'n_clusters': 60} Score 0.3545 Unique_Labels 60 Time 30.5018
Parameter: {'n_clusters': 80} Score 0.3776 Unique_Labels 80 Time 30.6965
Parameter: {'n_clusters': 100} Score 0.4136 Unique_Labels 100 Time 32.2547

BEST PARAM SETUP:  {'n_clusters': 100} 0.4136282940968569

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]



Unnamed: 0,cluster,count,match_ratio,pattern
0,0,6,0.0233,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
1,1,59,0.5514,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,1116,0.8598,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
3,3,486,0.5834,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,4,81,0.9643,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...
95,95,8,0.0316,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
96,96,27,0.0553,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
97,97,132,0.1362,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
98,98,411,0.4472,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 42.1478%



Unnamed: 0,cluster,hash,pattern
0,0,d29927b7c51c388ae5355f3c3b983aaa,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
1,0,fd56118f5880ef1f65310ded0eb627e8,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
2,0,d7780eeb410b27fbee7596435e85c5a8,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
3,0,57b9ce2e8bf740244912cc91c97fa55a,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
4,0,fd87749183c967f6d235083ef72b49a5,"NtAllocateVirtualMemory,NtFreeVirtualMemory,Nt..."
...,...,...,...
485,99,6d7b401188c1a18ecc5a1e18c2c379c2,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
486,99,ea1276abee5752ceea2a0fc06115d620,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
487,99,5e146fdae2ab66ebd1569fcaa31a2429,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
488,99,3070f07a2a0260a7c153eaba289eccf6,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."


# MiniBatchKMeans

In [7]:
df = reserve_df.copy()
print(df.shape, "\n")

clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
samplesize = 5

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "MiniBatchKMeans", 
                              MiniBatchKMeans(init='k-means++', verbose=0, random_state=seed, tol=0.001, n_init='auto', 
                                              batch_size=cpuCount()*256), 
                              {'n_clusters': clusters})

commonAPI = commonAPICluster(df, "MiniBatchKMeans", bestClusterParam)
display(commonAPI)

commonHashes = getSampleHash_Common(df, commonAPI, "MiniBatchKMeans", bestClusterParam, samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'n_clusters': 20} Score 0.2849 Unique_Labels 20 Time 31.1592
Parameter: {'n_clusters': 40} Score 0.3740 Unique_Labels 40 Time 30.4445
Parameter: {'n_clusters': 60} Score 0.4188 Unique_Labels 60 Time 30.4056
Parameter: {'n_clusters': 80} Score 0.4567 Unique_Labels 80 Time 30.3583
Parameter: {'n_clusters': 100} Score 0.4863 Unique_Labels 100 Time 30.3895

BEST PARAM SETUP:  {'n_clusters': 100} 0.48629204003638327

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]



Unnamed: 0,cluster,count,match_ratio,pattern
0,0,122,0.9839,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,1,18,0.0465,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2,2,738,0.6727,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,3,3308,0.9922,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
4,4,1094,0.9991,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
...,...,...,...,...
95,95,12,0.1008,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
96,96,12,0.1791,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
97,97,127,0.1478,"NtAllocateVirtualMemory,LdrLoadDll,LdrGetProce..."
98,98,75,0.1521,"LdrLoadDll,SetErrorMode,LdrLoadDll,WSAStartup,..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 49.0595%



Unnamed: 0,cluster,hash,pattern
0,0,f5358cff915679eddb4263bdcde18b8d,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,0,8bbdd06ff36d7462523f3b0602d7625a,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,0,ddc5a1c6939f3d5aa14cef588991087b,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,0,1ee754e2857d80b7d4050637f169373f,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,0,45d4e72f3836158b966d608a8a07b346,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...
491,99,5f960cfd0313ffaeb82195de3b5151ce,"LdrGetDllHandle,LdrGetProcedureAddress,OleInit..."
492,99,030b08f9c8fe5f0f2cb5ae2aac6142d6,"LdrGetDllHandle,LdrGetProcedureAddress,OleInit..."
493,99,871799ed904b806f9ab2c74fab71f301,"LdrGetDllHandle,LdrGetProcedureAddress,OleInit..."
494,99,c51c8c10de4c77a526304217950cf4db,"LdrGetDllHandle,LdrGetProcedureAddress,OleInit..."


# DBSCAN

In [8]:
df = reserve_df.copy()
print(df.shape, "\n")
min_samples = [5,6,7,8,9,10]
samplesize = 5

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "DBSCAN", DBSCAN(algorithm='auto', n_jobs=-1), 
                              {'min_samples' : min_samples})

commonAPI = commonAPICluster(df, "DBSCAN", bestClusterParam)
display(commonAPI)

commonHashes = getSampleHash_Common(df, commonAPI, "DBSCAN", bestClusterParam, samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'min_samples': 5} Score 0.5092 Unique_Labels 600 Time 81.6820
Parameter: {'min_samples': 6} Score 0.4973 Unique_Labels 501 Time 78.9457
Parameter: {'min_samples': 7} Score 0.4818 Unique_Labels 411 Time 79.5411
Parameter: {'min_samples': 8} Score 0.4715 Unique_Labels 358 Time 79.0739
Parameter: {'min_samples': 9} Score 0.4647 Unique_Labels 322 Time 87.2335
Parameter: {'min_samples': 10} Score 0.4586 Unique_Labels 299 Time 85.4762

BEST PARAM SETUP:  {'min_samples': 5} 0.5091870285529345

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70
  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88
  89

Unnamed: 0,cluster,count,match_ratio,pattern
0,-1,9,0.0006,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,0,3308,1.0000,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2,1,128,1.0000,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
3,2,173,1.0000,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
4,3,486,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...
294,293,15,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
295,294,11,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
296,295,17,1.0000,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
297,296,13,1.0000,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 62.0698%



Unnamed: 0,cluster,hash,pattern
0,-1,5e1f079fc9130cd508568da3cb0b219a,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,-1,2b05809d67062f0af9fec37f33d1b338,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
2,-1,e8a9d42e07c25d00fcc56170e66071fd,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
3,-1,01e2cd4d45e8bc2608f3519a653d3a63,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
4,-1,d93b214c093a9f1e07248962aeb74fc8,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
...,...,...,...
1490,297,05b379055a79c5e47bdabec418190ac7,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1491,297,d8c65468405b789c56754336c1f8911b,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1492,297,4b58a7c885df8e86be4769fd949d2c37,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1493,297,a4200ec0b146d8a0d37e90e32c674780,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


# Birch

In [9]:
df = reserve_df.copy()
print(df.shape, "\n")
clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
branching_factor = [25,50,75,100]
samplesize = 5

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "Birch", Birch(compute_labels=True), 
                              {'n_clusters': clusters, 'branching_factor':branching_factor})

commonAPI = commonAPICluster(df, "Birch", bestClusterParam)
display(commonAPI)

commonHashes = getSampleHash_Common(df, commonAPI, "Birch", bestClusterParam, samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'branching_factor': 25, 'n_clusters': 20} Score 0.1730 Unique_Labels 20 Time 67.4883
Parameter: {'branching_factor': 25, 'n_clusters': 40} Score 0.2349 Unique_Labels 40 Time 59.7633
Parameter: {'branching_factor': 25, 'n_clusters': 60} Score 0.2510 Unique_Labels 60 Time 60.1371
Parameter: {'branching_factor': 25, 'n_clusters': 80} Score 0.2628 Unique_Labels 80 Time 58.0592
Parameter: {'branching_factor': 25, 'n_clusters': 100} Score 0.2822 Unique_Labels 100 Time 59.8531
Parameter: {'branching_factor': 50, 'n_clusters': 20} Score 0.2069 Unique_Labels 20 Time 58.4900
Parameter: {'branching_factor': 50, 'n_clusters': 40} Score 0.2242 Unique_Labels 40 Time 57.5294
Parameter: {'branching_factor': 50, 'n_clusters': 60} Score 0.2314 Unique_Labels 60 Time 60.2260
Parameter: {'branching_factor': 50, 'n_clusters': 80} Score 0.2772 Unique_Labels 80 Time 58.1176
Parameter: {'branching_factor': 50, 'n_clusters': 100} Score 0.2855 Un

Unnamed: 0,cluster,count,match_ratio,pattern
0,0,114,0.1735,"CreateThread,NtClose,NtDelayExecution,NtAlloca..."
1,1,2,0.0081,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,1059,0.3998,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
3,3,12,0.0930,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
4,4,19,0.0609,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
...,...,...,...,...
95,95,2,0.0164,"SetUnhandledExceptionFilter,NtAllocateVirtualM..."
96,96,135,0.2292,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
97,97,50,0.6250,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
98,98,4,0.0444,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 29.2941%



Unnamed: 0,cluster,hash,pattern
0,0,28d48772cdb60f8aa046bfcc7fc4015c,"CreateThread,NtClose,NtDelayExecution,NtAlloca..."
1,0,ac904aaaedc9600a4027ed4001fa6bcf,"CreateThread,NtClose,NtDelayExecution,NtAlloca..."
2,0,38dfce406619531a931cedf85630892d,"CreateThread,NtClose,NtDelayExecution,NtAlloca..."
3,0,3ad802c3ee257fbc8c4a0f01616a80fb,"CreateThread,NtClose,NtDelayExecution,NtAlloca..."
4,0,05d1026342dfbe2d1a7153d3533c0573,"CreateThread,NtClose,NtDelayExecution,NtAlloca..."
...,...,...,...
445,99,55eceea9390c26610d904ba4ce22e7b0,"LdrGetDllHandle,LdrGetProcedureAddress,NtAlloc..."
446,99,c7f3a06002f34caeac850a2e263f2313,"LdrGetDllHandle,LdrGetProcedureAddress,NtAlloc..."
447,99,c5ef90b279200885db652a8668684d18,"LdrGetDllHandle,LdrGetProcedureAddress,NtAlloc..."
448,99,17131d4ca788c14ad8bf00dcc29890f3,"LdrGetDllHandle,LdrGetProcedureAddress,NtAlloc..."


# SpectralClustering

*Due to some dataset- or hardware-related limitations, SpectralClustering cannot be executed using the full Oliveira dataset.*

In [10]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# n_neighbors=[10,20,30,40,50]
# assign_labels=['kmeans','discretize']
# samplesize = 5

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "SpectralClustering", 
#                               SpectralClustering(random_state=seed, affinity='nearest_neighbors', 
#                                                  assign_labels='kmeans', n_jobs=-1, verbose=0), 
#                               {'n_clusters': clusters, 'n_neighbors':n_neighbors, 'assign_labels':assign_labels}, 
#                               samplesize)

# commonAPI = commonAPICluster(df, "SpectralClustering", bestClusterParam)
# display(commonAPI)

# commonHashes = getSampleHash_Common(df, commonAPI, "SpectralClustering", bestClusterParam, samplesize)
# display(commonHashes)

# AgglomerativeClustering (Ward)

*Due to memory-related limitations, AgglomerativeClustering cannot be executed using the full Oliveira dataset.*

In [11]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# clusters = [20,40,60,80,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# samplesize = 5

# print("Computing Connectivity...")
# startTime = time.time()
# connectivity = []
# connectivity.append(kneighbors_graph(getX(df).values, 2, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(getX(df).values, 4, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(getX(df).values, 6, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(getX(df).values, 8, mode='connectivity', include_self='auto', n_jobs=-1))
# print(f"Connectivity Computation: {time.time()-startTime:.4f}(s)\n")

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "AgglomerativeClustering", AgglomerativeClustering(memory=".memory/", 
#                                                                                      linkage='complete', 
#                                                                                      connectivity=connectivity), 
#                               {'n_clusters': clusters, 'connectivity':connectivity})

# commonAPI = commonAPICluster(df, "AgglomerativeClustering", bestClusterParam)
# display(commonAPI)

# commonHashes = getSampleHash_Common(df, commonAPI, "AgglomerativeClustering", bestClusterParam, samplesize)
# display(commonHashes)

# Optics

*Due to some dataset- or hardware-related limitations, Optics cannot be executed using the full Oliveira dataset in a reasonable time.*

In [12]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# leaf_size = [10,20,30,40,50] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# min_samples = [5,10,15,20]
# samplesize = 5

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "OPTICS", OPTICS(algorithm='auto', memory=".memory/", n_jobs=-1), 
#                               {'leaf_size': leaf_size, 'min_samples':min_samples}, 
#                               samplesize)

# commonAPI = commonAPICluster(df, "OPTICS", bestClusterParam)
# display(commonAPI)

# commonHashes = getSampleHash_Common(df, commonAPI, "OPTICS", bestClusterParam, samplesize)
# display(commonHashes)

# MeanShift

*Due to some hardware-related limitations, MeanShift cannot be executed using the lite/full Oliveira dataset in a reasonable time.*

In [13]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# max_iter = [300,500] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# samplesize = 5

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "MeanShift", MeanShift(n_jobs=-1), {'max_iter': max_iter})

# commonAPI = commonAPICluster(df, "MeanShift", bestClusterParam)
# display(commonAPI)

# commonHashes = getSampleHash_Common(df, commonAPI, "MeanShift", bestClusterParam, samplesize)
# display(commonHashes)