# Clustering Demo (Malicious)

**Note:** Make sure that you have `oliveira.csv` in the same directory as this notebook.

**Clustering Methods not Supported:** GaussianMixture & HDBScan

## Code Preparation

Prepare the functions that might be used as the notebook runs.

In [19]:
import random
import pandas as pd
from sklearn.cluster import KMeans,  BisectingKMeans, MiniBatchKMeans, Birch, DBSCAN
# from sklearn.cluster import SpectralClustering, AgglomerativeClustering, OPTICS, MeanShift # Not working
from sklearn.model_selection import ParameterGrid
from sklearn import metrics
import time
import os

#Load list of API calls
API_LIST = "../api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #Add the label for NaN values.
API_FILE.close()

#Random Seed
seed = 1

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).'''
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def load_df():
    '''Load the dataset file (CSV) as DataFrame'''
    print("Loading DF...")
    df = pd.read_csv("../oliveira.csv", low_memory=False, memory_map=True) # MAKE SURE THIS IS SET AS `oliveira.csv`
    df = df[df['malware'] == 1].copy()
    df = df.drop('malware', axis=1)
    print("")
    return df.reset_index().iloc[:,1:]

def get_x(df:pd.DataFrame):
    '''Get the feature columns of the DataFrame'''
    return df.iloc[:, 1:102-1]

#Inverse Label Encoding
def inverse_labeller(item):
    '''Low Level. Converts encoded API calls to string API calls'''
    global APIS
    return item.map(lambda x: APIS[int(x)])
def inverse_label(df:pd.DataFrame):
    '''High Level. Converts encoded API calls to string API calls'''
    df2 = df.copy(deep=True)
    print("Inverse Labelling...")
    df2.iloc[:, 1:101] = df2.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
    print("")
    return df2

def search_k(parameters, model, X):
    '''Search for the best parameter(s) for the model (usually cluster size or K value)'''
    paramGrid = ParameterGrid(parameters)
    best_score = -1
    best_grid = -1
    best_clusterer = None
    # evaluation based on silhouette_score
    print("Searching Best Clustering Parameters...")
    for p in paramGrid:
        model.set_params(**p)    # set current hyper parameter
        start_time = time.time()
        model.fit(X)          # fit model on wine dataset, this will find clusters based on parameter p
        ss = metrics.silhouette_score(X, model.labels_, random_state=seed)   # calculate silhouette_score
        # silhouette_scores.append([p, ss]) # store all the scores
        print('Parameter:', p, 'Score', f"{ss:.4f}", 'Unique_Labels', len(pd.Series(model.labels_).unique()), "Time", f"{time.time()-start_time:.4f}")
        # check p which has the best score
        if ss > best_score:
            best_score = ss
            best_grid = p
            best_clusterer = model
    print("")
    print("BEST PARAM SETUP: ", best_grid, best_score)
    print("")
    return best_grid, best_clusterer

def clustering(inner_df:pd.DataFrame, name:str, clusterer, parameters):
    '''Executes the data clustering on the dataset. Produces the same input dataset with the additional column for the cluster #.'''
    '''The input dataset must contain integer API calls (except the API Call Pattern strings)'''
    X = get_x(inner_df)
    bestCluster, bestClusterer = search_k(parameters, clusterer, X) #assumes bestCluster already fitted
    #bestClusterer.fit(X)
    inner_df['cluster'] = bestClusterer.labels_
    inner_df.to_csv(f"Clustering/Malicious/{name}_Encoded_Clustering.csv", index=False)
    inner_df = inverse_label(inner_df)
    inner_df.to_csv(f"Clustering/Malicious/{name}_Clustering.csv", index=False)
    print("")
    return bestCluster

def common_api_cluster(inner_df:pd.DataFrame, name:str):
    '''Determine the most common API call patterns for each cluster'''
    global df
    inner_df = df
    clusters = inner_df['cluster'].unique()
    clusters.sort()
    commonAPI = []
    print("Searching for Common API Patterns per Cluster...")
    print(clusters)
    for cluster in clusters:
        raw_commonC = inner_df[inner_df['cluster']==cluster]['pattern']#.value_counts()
        commonC = raw_commonC.value_counts().to_frame(name='counts').reset_index()
        commonAPI.append([cluster, commonC['counts'].iloc[0], round(commonC['counts'].iloc[0]/raw_commonC.shape[0],4), commonC['pattern'].iloc[0]])
    commonAPI = pd.DataFrame(commonAPI, columns=['cluster', 'count', 'match_ratio', 'pattern'])
    commonAPI.to_csv(f"Clustering/Malicious/{name}_Common_API_Cluster.csv", index=False)
    print("")
    print("Average Match Ratio:", commonAPI['match_ratio'].mean())
    return commonAPI

def get_samplehash_common(inner_df:pd.DataFrame, common_counts:pd.DataFrame, name:str, samplesize:int):
    '''Get sample hashes from each cluster that matches the common API call pattern of the cluster.'''
    hashes = []
    global seed
    random.seed(seed)
    matching_samples = 0
    print(f"Random (Seed @ {seed}) Sampling Hashes subset of  Most Common API Patterns...")
    for pattern in range(common_counts.shape[0]):
        sub_df = inner_df[inner_df['pattern']==common_counts.iloc[pattern,3]]
        subsamples = len(sub_df['hash'].to_list())
        matching_samples += subsamples
        if subsamples < samplesize:
            subsamples = random.sample(sub_df['hash'].to_list(), len(sub_df['hash'].to_list()))
        else:
            subsamples = random.sample(sub_df['hash'].to_list(), samplesize)
        for subsample in subsamples:
            hashes.append([common_counts.iloc[pattern,:]['cluster'], subsample, '_', '_', '_', common_counts.iloc[pattern,:]['pattern']])
    hashes = pd.DataFrame(hashes, columns=['cluster', 'hash', 'Type 1', 'Type 2', 'Type 3', 'pattern'])
    hashes.to_csv(f"Clustering/Malicious/{name}_SampleHash_Common.csv", index=False)
    print(f"Commonality Ratio: {(matching_samples/inner_df.shape[0])}")
    print("")
    return hashes
    
def inject_patterns(inner_df:pd.DataFrame, inverse_labelled_df:pd.DataFrame):
    '''Injects the API call patterns of each sample as its last column'''
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(list_to_str(inverse_labelled_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    print("")
    inverse_label(inner_df).to_csv(f"Clustering/Malicious/API_Patterns.csv", index=False)
    return inner_df

if not os.path.exists(os.path.abspath(os.getcwd())+"\Clustering"):
    os.makedirs(os.path.abspath(os.getcwd())+"\Clustering\Malicious")
    os.makedirs(os.path.abspath(os.getcwd())+"\Clustering\Benign")

## Load Dataset

In [20]:
df = load_df()
df = inject_patterns(df.copy(), inverse_label(df.copy()))
reserve_df = df.copy()
df

Loading DF...

Inverse Labelling...

Injecting API patterns...

Inverse Labelling...



Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,pattern
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,71,297,135,171,215,35,208,56,71,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,81,240,117,71,297,135,171,215,35,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,65,112,123,65,112,123,65,113,112,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,72049be7bd30ea61297ea624ae198067,82,208,187,208,172,117,172,117,172,...,208,302,208,302,187,208,302,228,302,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,209,260,40,209,260,141,260,141,260,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42792,e3d6d58faa040f0f9742c9d0eaf58be4,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42793,9b917bab7f32188ae40c744f2be9aaf8,82,240,117,240,117,240,117,240,117,...,159,224,82,159,224,82,159,224,82,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42794,35a18ee05f75f04912018d9f462cb990,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,260,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42795,654139d715abcf7ecdddbef5a84f224b,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


### Explanation

The process as the following:
1. Find the best cluster size (by means of the [Silhouette Score](https://tushar-joshi-89.medium.com/silhouette-score-a9f7d8d78f29))
2. Search for the most common API pattern for each cluster
3. Sample n hashes per cluster that match the most common API pattern for the same cluster.

**Silhouette Score:** How good is the quality of data clustering? *(Higher is better)*

**Match Ratio:** How common is the most common API call pattern in each cluster among the samples found in its cluster? *(Higher is better)*

**Commonality Ratio**: How many of the matching most common API patterns per cluster are there relative to the dataset size. *(Higher is better)*

In [21]:
'''UPDATE THESE VALUES AS NEEDED'''

clusters = [25,50,75,100,125,150,175,200]    # Place either single or multiple values as long as it is in list format. 
                    # For multiple values, the program will iterate through every cluster size and will choose the best (usually the biggest value) to be part of the best configuration.
                    # It influences the projected total number of samples to verify/analyze.
                    
samplesize = 1000000      # Max no. of samples to obtain from a cluster. 
                    # It influences the projected total number of samples to verify/analyze.

## K-Means

In [22]:
df = reserve_df.copy()
print(df.shape, "\n")

algorithm = ['lloyd', 'elkan']

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "KMeans", KMeans(n_init='auto', verbose=0, random_state=seed), 
                              {'algorithm':algorithm})

commonAPI = common_api_cluster(df, "KMeans")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "KMeans", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...


Parameter: {'algorithm': 'lloyd'} Score 0.2365 Unique_Labels 8 Time 39.6733
Parameter: {'algorithm': 'elkan'} Score 0.2365 Unique_Labels 8 Time 40.1440

BEST PARAM SETUP:  {'algorithm': 'lloyd'} 0.2365118856589945

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[0 1 2 3 4 5 6 7]

Average Match Ratio: 0.3024375


Unnamed: 0,cluster,count,match_ratio,pattern
0,0,1059,0.4129,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
1,1,738,0.0968,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,3308,0.7645,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
3,3,539,0.1065,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
4,4,576,0.4586,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
5,5,1094,0.251,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
6,6,486,0.2582,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
7,7,1116,0.071,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.20833235974484193



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,fa1b05b3f205f89c5a4e67fac1ec16ef,_,_,_,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
1,0,1e559182a1dbb153a193d48b3eda36a8,_,_,_,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
2,0,12457923ecb62e66b302ce66672a9411,_,_,_,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
3,0,6d834f1e2e622aa124a130465a303ed9,_,_,_,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
4,0,8e7e32092c7cb50f1821d00f13048d9e,_,_,_,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
...,...,...,...,...,...,...
8911,7,1a28bab638b0882ee472609df654c4a2,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
8912,7,7608a77ce3234e06d20673a60bc58023,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
8913,7,98cebf92b3dda69516573f9b6b0cec86,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
8914,7,0aac478e0f3c25052d9aa05714b9b0ff,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."


## BisectingKMeans

In [23]:
df = reserve_df.copy()
print(df.shape, "\n")

algorithm = ['lloyd', 'elkan']

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "BisectingKMeans", 
                              BisectingKMeans(random_state=seed, verbose=0, copy_x=True), 
                              {'algorithm':algorithm})

commonAPI = common_api_cluster(df, "BisectingKMeans")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "BisectingKMeans", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'algorithm': 'lloyd'} Score 0.2144 Unique_Labels 8 Time 33.0899
Parameter: {'algorithm': 'elkan'} Score 0.2144 Unique_Labels 8 Time 33.8742

BEST PARAM SETUP:  {'algorithm': 'lloyd'} 0.21438695385080042

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[0 1 2 3 4 5 6 7]

Average Match Ratio: 0.22943750000000002


Unnamed: 0,cluster,count,match_ratio,pattern
0,0,738,0.0967,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,1,3308,0.7617,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2,2,1094,0.1965,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
3,3,1059,0.2349,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
4,4,576,0.1035,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
5,5,1116,0.2601,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
6,6,181,0.0201,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
7,7,305,0.162,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.19573801902002477



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,799a0ba351f1556d05da1c350a82fffb,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,0,34cb49bcc96115511eaa57d76d47f432,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,0,573d6e43e0091c689eb17750973d76ec,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,0,d2177332030d79b15f5f3bedb2dec39f,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,0,735867868ee9010f3a38a609945b2a56,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
8372,7,e1b3146380d4f25d29c519651b71651c,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
8373,7,89571a6709e8330498a28ecad92426d1,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
8374,7,9ce9f21ba30195305c070bfd962a4902,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
8375,7,b42c6496b50a54b7ef362c960726a390,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."


## MiniBatchKMeans

In [24]:
df = reserve_df.copy()
print(df.shape, "\n")

reassignment_ratio = [0.01, 0.1, 0.3]

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "MiniBatchKMeans", 
                              MiniBatchKMeans(verbose=0, random_state=seed, n_init='auto', 
                                              batch_size=os.cpu_count()*256), 
                              {'reassignment_ratio':reassignment_ratio})

commonAPI = common_api_cluster(df, "MiniBatchKMeans")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "MiniBatchKMeans", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'reassignment_ratio': 0.01} Score 0.1879 Unique_Labels 8 Time 31.6704
Parameter: {'reassignment_ratio': 0.1} Score 0.1993 Unique_Labels 8 Time 39.6330
Parameter: {'reassignment_ratio': 0.3} Score 0.0153 Unique_Labels 8 Time 45.0989

BEST PARAM SETUP:  {'reassignment_ratio': 0.1} 0.1992703807683651

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[0 1 2 3 4 5 6 7]

Average Match Ratio: 0.48562500000000003


Unnamed: 0,cluster,count,match_ratio,pattern
0,0,486,0.9701,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,1,344,0.0761,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,13,0.0458,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
3,3,12,0.3243,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
4,4,242,0.55,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
5,5,3308,0.0912,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
6,6,122,0.976,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
7,7,539,0.8515,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.11837278313900507



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,0efbad47c5453667e22dcd6627e9f563,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,0,6e32b002b511f21dbe574950185dec3c,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,0,cc91bf723527983cbf52a0ecb05aa6ff,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,0,7b69234bf6e2165a3efa97adb120f91a,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,0,769503fbebd7c7573544140451600879,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
5061,7,9d6d893cd9974916bdffc8a4a80a735c,_,_,_,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
5062,7,9bbbc431da5de913107fd909fa43438f,_,_,_,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
5063,7,8e55ec77ffb82c4984d6bbbe499dc38c,_,_,_,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
5064,7,3120d0f8f7e53d55f1fb72905d0480ad,_,_,_,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."


## DBSCAN

In [25]:
df = reserve_df.copy()
print(df.shape, "\n")

eps = [0.2,0.5,0.8]
min_samples = [5,10]

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "DBSCAN", DBSCAN(algorithm='auto', n_jobs=-1), 
                              {'min_samples' : min_samples, 'eps':eps})

commonAPI = common_api_cluster(df, "DBSCAN")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "DBSCAN", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'eps': 0.2, 'min_samples': 5} Score 0.5092 Unique_Labels 600 Time 133.5213
Parameter: {'eps': 0.2, 'min_samples': 10} Score 0.4586 Unique_Labels 299 Time 109.9407
Parameter: {'eps': 0.5, 'min_samples': 5} Score 0.5092 Unique_Labels 600 Time 96.1744
Parameter: {'eps': 0.5, 'min_samples': 10} Score 0.4586 Unique_Labels 299 Time 103.3366
Parameter: {'eps': 0.8, 'min_samples': 5} Score 0.5092 Unique_Labels 600 Time 96.8249
Parameter: {'eps': 0.8, 'min_samples': 10} Score 0.4586 Unique_Labels 299 Time 92.3960

BEST PARAM SETUP:  {'eps': 0.2, 'min_samples': 5} 0.5091870285529345

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67 

Unnamed: 0,cluster,count,match_ratio,pattern
0,-1,9,0.0006,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,0,3308,1.0000,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2,1,128,1.0000,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
3,2,173,1.0000,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
4,3,486,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...
294,293,15,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
295,294,11,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
296,295,17,1.0000,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
297,296,13,1.0000,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.6206977124564806



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,-1,5e1f079fc9130cd508568da3cb0b219a,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,-1,2b05809d67062f0af9fec37f33d1b338,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
2,-1,e8a9d42e07c25d00fcc56170e66071fd,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
3,-1,01e2cd4d45e8bc2608f3519a653d3a63,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
4,-1,d93b214c093a9f1e07248962aeb74fc8,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
...,...,...,...,...,...,...
26559,297,a4200ec0b146d8a0d37e90e32c674780,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
26560,297,05b379055a79c5e47bdabec418190ac7,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
26561,297,832cdd79c5deaabf3170380f17118c3b,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
26562,297,4b58a7c885df8e86be4769fd949d2c37,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


## Birch

In [26]:
df = reserve_df.copy()
print(df.shape, "\n")

branching_factor = [25,50,75,100]

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "Birch", Birch(compute_labels=True), 
                              {'branching_factor':branching_factor})

commonAPI = common_api_cluster(df, "Birch")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "Birch", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'branching_factor': 25} Score 0.0717 Unique_Labels 3 Time 76.4099
Parameter: {'branching_factor': 50} Score 0.0717 Unique_Labels 3 Time 72.8889
Parameter: {'branching_factor': 75} Score 0.0717 Unique_Labels 3 Time 64.7182
Parameter: {'branching_factor': 100} Score 0.0717 Unique_Labels 3 Time 78.4506

BEST PARAM SETUP:  {'branching_factor': 75} 0.0716853006300032

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[0 1 2]

Average Match Ratio: 0.0808


Unnamed: 0,cluster,count,match_ratio,pattern
0,0,3308,0.0969,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,1,738,0.0968,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,51,0.0487,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.09573100918288666



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,fccce06bbabd5398622081172b98ac09,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,0,ed1c628b31c5c4c62fa061520a3817ce,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2,0,bae18a5c39fd2c3a5d48b59e1f059494,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
3,0,a3a160e1f593d1b027042c01c943534a,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
4,0,a5b68fed471885355e5c028f963cce20,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
...,...,...,...,...,...,...
4092,2,a145135d2b1fe0794f4dfcf492ddb12b,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4093,2,05ffaeda05a4cf9e7c3e671251f116e1,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4094,2,01e42e0ec55a58adcddf328e6757d664,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4095,2,2d060f3dac42e5f473947d66921a8ff2,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


## SpectralClustering

*Due to some dataset- or hardware-related limitations, SpectralClustering cannot be executed using the full Oliveira dataset.*

In [27]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# n_neighbors=[10,20,30,40,50]
# assign_labels=['kmeans','discretize']

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "SpectralClustering", 
#                               SpectralClustering(random_state=seed, affinity='nearest_neighbors', assign_labels='kmeans', n_jobs=-1, verbose=0), 
#                               {'n_neighbors':n_neighbors, 'assign_labels':assign_labels})

# commonAPI = common_api_cluster(df, "SpectralClustering")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "SpectralClustering", samplesize)
# display(commonHashes)

## AgglomerativeClustering (Ward)

*Due to memory-related limitations, AgglomerativeClustering cannot be executed using the full Oliveira dataset.*

In [28]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# print("Computing Connectivity...")
# start_time = time.time()
# connectivity = []
# connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 2, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 4, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 6, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 8, mode='connectivity', include_self='auto', n_jobs=-1))
# print(f"Connectivity Computation: {time.time()-start_time:.4f}(s)\n")

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "AgglomerativeClustering", AgglomerativeClustering(memory=".memory/", 
#                                                                                      linkage='complete', 
#                                                                                      connectivity=connectivity), 
#                               {'connectivity':connectivity})

# commonAPI = common_api_cluster(df, "AgglomerativeClustering")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "AgglomerativeClustering", samplesize)
# display(commonHashes)

## Optics

*Due to some dataset- or hardware-related limitations, Optics cannot be executed using the full Oliveira dataset in a reasonable time.*

In [29]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# leaf_size = [10,20,30,40,50] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# min_samples = [5,10,15,20]

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "OPTICS", OPTICS(algorithm='auto', memory=".memory/", n_jobs=-1), 
#                               {'leaf_size': leaf_size, 'min_samples':min_samples}, 
#                               samplesize)

# commonAPI = common_api_cluster(df, "OPTICS")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "OPTICS", samplesize)
# display(commonHashes)

## MeanShift

*Due to some hardware-related limitations, MeanShift cannot be executed using the lite/full Oliveira dataset in a reasonable time.*

In [30]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# max_iter = [300,500] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "MeanShift", MeanShift(n_jobs=-1), {'max_iter': max_iter})

# commonAPI = common_api_cluster(df, "MeanShift")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "MeanShift", samplesize)
# display(commonHashes)