# Clustering Demo (Malicious)

**Note:** Make sure that you have `oliveira.csv` in the same directory as this notebook.

**Clustering Methods not Supported:** GaussianMixture & HDBScan

# 1. Code Preparation

Prepare the functions that might be used as the notebook runs.

In [1]:
import random
import pandas as pd
from sklearn.cluster import KMeans,  BisectingKMeans, MiniBatchKMeans, Birch, DBSCAN
# from sklearn.cluster import SpectralClustering, AgglomerativeClustering, OPTICS, MeanShift # Not working
from sklearn.model_selection import ParameterGrid
from sklearn import metrics
from yellowbrick.cluster import SilhouetteVisualizer
import time
import os

#Load list of API calls
API_LIST = "api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #Add the label for NaN values.
API_FILE.close()

#Random Seed
seed = 1

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).'''
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def load_df():
    '''Load the dataset file (CSV) as DataFrame'''
    print("Loading DF...")
    df = pd.read_csv("oliveira.csv", low_memory=False, memory_map=True) # MAKE SURE THIS IS SET AS `oliveira.csv`
    df = df[df['malware'] == 1].copy()
    df = df.drop('malware', axis=1)
    print("")
    return df.reset_index().iloc[:,1:]

def get_x(df:pd.DataFrame):
    '''Get the feature columns of the DataFrame'''
    return df.iloc[:, 1:102-1]

#Inverse Label Encoding
def inverse_labeller(item):
    '''Low Level. Converts encoded API calls to string API calls'''
    global APIS
    return item.map(lambda x: APIS[int(x)])
def inverse_label(df:pd.DataFrame):
    '''High Level. Converts encoded API calls to string API calls'''
    df2 = df.copy(deep=True)
    print("Inverse Labelling...")
    df2.iloc[:, 1:101] = df2.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
    print("")
    return df2

def search_k(parameters, model, X):
    '''Search for the best parameter(s) for the model (usually cluster size or K value)'''
    paramGrid = ParameterGrid(parameters)
    best_score = -1
    best_grid = -1
    best_clusterer = None
    # evaluation based on silhouette_score
    print("Searching Best Clustering Parameters...")
    for p in paramGrid:
        model.set_params(**p)    # set current hyper parameter
        start_time = time.time()
        model.fit(X)          # fit model on wine dataset, this will find clusters based on parameter p
        ss = metrics.silhouette_score(X, model.labels_, random_state=seed)   # calculate silhouette_score
        # silhouette_scores.append([p, ss]) # store all the scores
        print('Parameter:', p, 'Score', f"{ss:.4f}", 'Unique_Labels', len(pd.Series(model.labels_).unique()), "Time", f"{time.time()-start_time:.4f}")
        # check p which has the best score
        if ss > best_score:
            best_score = ss
            best_grid = p
            best_clusterer = model
    print("")
    print("BEST PARAM SETUP: ", best_grid, best_score)
    print("")
    return best_grid, best_clusterer

def clustering(inner_df:pd.DataFrame, name:str, clusterer, parameters):
    '''Executes the data clustering on the dataset. Produces the same input dataset with the additional column for the cluster #.'''
    '''The input dataset must contain integer API calls (except the API Call Pattern strings)'''
    X = get_x(inner_df)
    bestCluster, bestClusterer = search_k(parameters, clusterer, X) #assumes bestCluster already fitted
    #bestClusterer.fit(X)
    inner_df['cluster'] = bestClusterer.labels_
    inner_df.to_csv(f"Clustering/Malicious/{name}_Encoded_Clustering.csv", index=False)
    inner_df = inverse_label(inner_df)
    inner_df.to_csv(f"Clustering/Malicious/{name}_Clustering.csv", index=False)
    print("")
    return bestCluster

def common_api_cluster(inner_df:pd.DataFrame, name:str):
    '''Determine the most common API call patterns for each cluster'''
    global df
    inner_df = df
    clusters = inner_df['cluster'].unique()
    clusters.sort()
    commonAPI = []
    print("Searching for Common API Patterns per Cluster...")
    print(clusters)
    for cluster in clusters:
        raw_commonC = inner_df[inner_df['cluster']==cluster]['pattern']#.value_counts()
        commonC = raw_commonC.value_counts().to_frame(name='counts').reset_index()
        commonAPI.append([cluster, commonC['counts'].iloc[0], round(commonC['counts'].iloc[0]/raw_commonC.shape[0],4), commonC['pattern'].iloc[0]])
    commonAPI = pd.DataFrame(commonAPI, columns=['cluster', 'count', 'match_ratio', 'pattern'])
    commonAPI.to_csv(f"Clustering/Malicious/{name}_Common_API_Cluster.csv", index=False)
    print("")
    print("Average Match Ratio:", commonAPI['match_ratio'].mean())
    return commonAPI

def get_samplehash_common(inner_df:pd.DataFrame, common_counts:pd.DataFrame, name:str, samplesize:int):
    '''Get sample hashes from each cluster that matches the common API call pattern of the cluster.'''
    hashes = []
    global seed
    random.seed(seed)
    matching_samples = 0
    print(f"Random (Seed @ {seed}) Sampling Hashes subset of  Most Common API Patterns...")
    for pattern in range(common_counts.shape[0]):
        sub_df = inner_df[inner_df['pattern']==common_counts.iloc[pattern,3]]
        subsamples = len(sub_df['hash'].to_list())
        matching_samples += subsamples
        if subsamples < samplesize:
            subsamples = random.sample(sub_df['hash'].to_list(), len(sub_df['hash'].to_list()))
        else:
            subsamples = random.sample(sub_df['hash'].to_list(), samplesize)
        for subsample in subsamples:
            hashes.append([common_counts.iloc[pattern,:]['cluster'], subsample, '_', '_', '_', common_counts.iloc[pattern,:]['pattern']])
    hashes = pd.DataFrame(hashes, columns=['cluster', 'hash', 'Type 1', 'Type 2', 'Type 3', 'pattern'])
    hashes.to_csv(f"Clustering/Malicious/{name}_SampleHash_Common.csv", index=False)
    print(f"Commonality Ratio: {(matching_samples/inner_df.shape[0])}")
    print("")
    return hashes
    
def inject_patterns(inner_df:pd.DataFrame, inverse_labelled_df:pd.DataFrame):
    '''Injects the API call patterns of each sample as its last column'''
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(list_to_str(inverse_labelled_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    print("")
    inverse_label(inner_df).to_csv(f"Clustering/Malicious/API_Patterns.csv", index=False)
    return inner_df

if not os.path.exists(os.path.abspath(os.getcwd())+"\Clustering"):
    os.makedirs(os.path.abspath(os.getcwd())+"\Clustering\Malicious")
    os.makedirs(os.path.abspath(os.getcwd())+"\Clustering\Benign")

# 2. Load Dataset

In [4]:
df = load_df()
df = inject_patterns(df.copy(), inverse_label(df.copy()))
reserve_df = df.copy()
df

Loading DF...

Inverse Labelling...

Injecting API patterns...

Inverse Labelling...



Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,pattern
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,71,297,135,171,215,35,208,56,71,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,81,240,117,71,297,135,171,215,35,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,65,112,123,65,112,123,65,113,112,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,72049be7bd30ea61297ea624ae198067,82,208,187,208,172,117,172,117,172,...,208,302,208,302,187,208,302,228,302,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,209,260,40,209,260,141,260,141,260,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42792,e3d6d58faa040f0f9742c9d0eaf58be4,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42793,9b917bab7f32188ae40c744f2be9aaf8,82,240,117,240,117,240,117,240,117,...,159,224,82,159,224,82,159,224,82,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42794,35a18ee05f75f04912018d9f462cb990,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,260,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42795,654139d715abcf7ecdddbef5a84f224b,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


# 2.1 Explanation

The process as the following:
1. Find the best cluster size (by means of the [Silhouette Score](https://tushar-joshi-89.medium.com/silhouette-score-a9f7d8d78f29))
2. Search for the most common API pattern for each cluster
3. Sample n hashes per cluster that match the most common API pattern for the same cluster.

**Silhouette Score:** How good is the quality of data clustering? *(Higher is better)*

**Match Ratio:** How common is the most common API call pattern in each cluster among the samples found in its cluster? *(Higher is better)*

**Commonality Ratio**: How many of the matching most common API patterns per cluster are there relative to the dataset size. *(Higher is better)*

In [6]:
'''UPDATE THESE VALUES AS NEEDED'''

clusters = [25,50,75,100,125,150,175,200]    # Place either single or multiple values as long as it is in list format. 
                    # For multiple values, the program will iterate through every cluster size and will choose the best (usually the biggest value) to be part of the best configuration.
                    # It influences the projected total number of samples to verify/analyze.
                    
samplesize = 10      # Max no. of samples to obtain from a cluster. 
                    # It influences the projected total number of samples to verify/analyze.

# 3. K-Means

In [7]:
df = reserve_df.copy()
print(df.shape, "\n")

algorithm = ['lloyd', 'elkan']

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "KMeans", KMeans(n_init='auto', verbose=0, random_state=seed), 
                              {'n_clusters': clusters, 'algorithm':algorithm})

commonAPI = common_api_cluster(df, "KMeans")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "KMeans", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'algorithm': 'lloyd', 'n_clusters': 25} Score 0.3238 Unique_Labels 25 Time 27.0917
Parameter: {'algorithm': 'lloyd', 'n_clusters': 50} Score 0.3934 Unique_Labels 50 Time 28.8441
Parameter: {'algorithm': 'lloyd', 'n_clusters': 75} Score 0.4686 Unique_Labels 75 Time 27.9730
Parameter: {'algorithm': 'lloyd', 'n_clusters': 100} Score 0.4979 Unique_Labels 100 Time 28.1561
Parameter: {'algorithm': 'lloyd', 'n_clusters': 125} Score 0.5161 Unique_Labels 125 Time 28.4876
Parameter: {'algorithm': 'lloyd', 'n_clusters': 150} Score 0.5226 Unique_Labels 150 Time 29.1506
Parameter: {'algorithm': 'lloyd', 'n_clusters': 175} Score 0.5360 Unique_Labels 175 Time 29.7302
Parameter: {'algorithm': 'lloyd', 'n_clusters': 200} Score 0.5407 Unique_Labels 200 Time 31.7314
Parameter: {'algorithm': 'elkan', 'n_clusters': 25} Score 0.3238 Unique_Labels 25 Time 27.7143
Parameter: {'algorithm': 'elkan', 'n_clusters': 50} Score 0.3934 Unique_Labels 5

Unnamed: 0,cluster,count,match_ratio,pattern
0,0,306,0.9745,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,1,5,0.0149,"LdrGetDllHandle,LdrGetProcedureAddress,NtAlloc..."
2,2,12,0.1237,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,3,411,0.6683,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
4,4,1059,0.9991,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
...,...,...,...,...
195,195,30,0.2158,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
196,196,12,0.0502,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
197,197,16,0.0625,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
198,198,51,0.2000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.5531696146926186



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,490d584c7d303ed35c673460b63f3ca8,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,0,9ab8ea1d2d68a0d4110df413e677976c,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
2,0,adbc74815ef2bd1ea4967abad812233d,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
3,0,f6eb4841bba3a4cee747700dc0ee1609,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
4,0,f5a0ad49337ebc87897698e70d03364e,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
...,...,...,...,...,...,...
1756,198,d24b78bd73f17379ed62e4c776b4f66e,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1757,198,f666dd4b3a53b7fe71f8976fa09bfdfb,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1758,199,b6d6520b608875282d831b1e983cd5e5,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1759,199,18bce1a594550daf8b3f318de48c1674,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."


# 4. BisectingKMeans

In [8]:
df = reserve_df.copy()
print(df.shape, "\n")

algorithm = ['lloyd', 'elkan']

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "BisectingKMeans", 
                              BisectingKMeans(random_state=seed, verbose=0, copy_x=True), 
                              {'n_clusters': clusters, 'algorithm':algorithm})

commonAPI = common_api_cluster(df, "BisectingKMeans")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "BisectingKMeans", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'algorithm': 'lloyd', 'n_clusters': 25} Score 0.2614 Unique_Labels 25 Time 26.2854
Parameter: {'algorithm': 'lloyd', 'n_clusters': 50} Score 0.3009 Unique_Labels 50 Time 26.8244
Parameter: {'algorithm': 'lloyd', 'n_clusters': 75} Score 0.3564 Unique_Labels 75 Time 27.3909
Parameter: {'algorithm': 'lloyd', 'n_clusters': 100} Score 0.4030 Unique_Labels 100 Time 29.9498
Parameter: {'algorithm': 'lloyd', 'n_clusters': 125} Score 0.4324 Unique_Labels 125 Time 29.0144
Parameter: {'algorithm': 'lloyd', 'n_clusters': 150} Score 0.4527 Unique_Labels 150 Time 31.2624
Parameter: {'algorithm': 'lloyd', 'n_clusters': 175} Score 0.4550 Unique_Labels 175 Time 32.0186
Parameter: {'algorithm': 'lloyd', 'n_clusters': 200} Score 0.4724 Unique_Labels 200 Time 30.3749
Parameter: {'algorithm': 'elkan', 'n_clusters': 25} Score 0.2614 Unique_Labels 25 Time 25.8006
Parameter: {'algorithm': 'elkan', 'n_clusters': 50} Score 0.3009 Unique_Labels 5

Unnamed: 0,cluster,count,match_ratio,pattern
0,0,14,0.0562,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,1,13,0.0456,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,1,0.0115,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,3,7,0.0173,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,4,3,0.0095,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...
195,195,305,0.6733,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
196,196,121,0.7378,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
197,197,124,0.7607,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
198,198,60,0.6000,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.48503399771011985



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,ebd84144138b198f6e70f9e5b885f3ff,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,0,a1b47c27107e78e8c21a8a783617abca,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,0,9c9711b02aedb8864e5ea0377cded141,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,0,64344851895b44d1735bad2c664b8f7f,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,0,10453619503a4aa3e3c5c35530281633,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
1535,199,7e1a68ebe8aecf877e26cdca477c206e,_,_,_,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
1536,199,6f94f8a66e679f79ea2c6cbc375a69ee,_,_,_,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
1537,199,650f685a95aac0ec0a62f5ca4436a3ea,_,_,_,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
1538,199,ae67d2f38b20ea53d3688c8fd77baf3a,_,_,_,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."


# 5. MiniBatchKMeans

In [9]:
df = reserve_df.copy()
print(df.shape, "\n")

reassignment_ratio = [0.01, 0.1, 0.3]

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "MiniBatchKMeans", 
                              MiniBatchKMeans(verbose=0, random_state=seed, n_init='auto', 
                                              batch_size=os.cpu_count()*256), 
                              {'n_clusters': clusters, 'reassignment_ratio':reassignment_ratio})

commonAPI = common_api_cluster(df, "MiniBatchKMeans")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "MiniBatchKMeans", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'n_clusters': 25, 'reassignment_ratio': 0.01} Score 0.3176 Unique_Labels 25 Time 25.5338
Parameter: {'n_clusters': 25, 'reassignment_ratio': 0.1} Score 0.3078 Unique_Labels 25 Time 25.5827
Parameter: {'n_clusters': 25, 'reassignment_ratio': 0.3} Score 0.1639 Unique_Labels 22 Time 25.7370
Parameter: {'n_clusters': 50, 'reassignment_ratio': 0.01} Score 0.4045 Unique_Labels 50 Time 25.4437
Parameter: {'n_clusters': 50, 'reassignment_ratio': 0.1} Score 0.4005 Unique_Labels 50 Time 25.5211
Parameter: {'n_clusters': 50, 'reassignment_ratio': 0.3} Score 0.1706 Unique_Labels 48 Time 25.4956
Parameter: {'n_clusters': 75, 'reassignment_ratio': 0.01} Score 0.4508 Unique_Labels 75 Time 25.6024
Parameter: {'n_clusters': 75, 'reassignment_ratio': 0.1} Score 0.4178 Unique_Labels 70 Time 25.5966
Parameter: {'n_clusters': 75, 'reassignment_ratio': 0.3} Score 0.3118 Unique_Labels 64 Time 25.7238
Parameter: {'n_clusters': 100, 'reassignme

Unnamed: 0,cluster,count,match_ratio,pattern
0,0,3308,1.0000,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,1,539,0.8983,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
2,2,5,0.0694,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,3,10,0.0459,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,4,1,0.0119,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...
142,193,305,0.6763,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
143,195,236,0.6611,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
144,197,1,0.2000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
145,198,2,0.0083,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.47482300161226254



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,fccce06bbabd5398622081172b98ac09,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,0,ed1c628b31c5c4c62fa061520a3817ce,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2,0,bae18a5c39fd2c3a5d48b59e1f059494,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
3,0,a3a160e1f593d1b027042c01c943534a,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
4,0,a5b68fed471885355e5c028f963cce20,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
...,...,...,...,...,...,...
1109,199,bf3bcd09550f95ccaad48ca28a13a337,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1110,199,f6a901ec5d7e500e30fc08a6e692a093,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1111,199,4e2537f056aa6d2e90344b5002830e50,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1112,199,701a144745371eb70907f042b40375b5,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


# 6. DBSCAN

In [10]:
df = reserve_df.copy()
print(df.shape, "\n")

eps = [0.2,0.5,0.8]
min_samples = [5,10]

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "DBSCAN", DBSCAN(algorithm='auto', n_jobs=-1), 
                              {'min_samples' : min_samples, 'eps':eps})

commonAPI = common_api_cluster(df, "DBSCAN")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "DBSCAN", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'eps': 0.2, 'min_samples': 5} Score 0.5092 Unique_Labels 600 Time 64.9204
Parameter: {'eps': 0.2, 'min_samples': 10} Score 0.4586 Unique_Labels 299 Time 65.2979
Parameter: {'eps': 0.5, 'min_samples': 5} Score 0.5092 Unique_Labels 600 Time 65.1114
Parameter: {'eps': 0.5, 'min_samples': 10} Score 0.4586 Unique_Labels 299 Time 66.1169
Parameter: {'eps': 0.8, 'min_samples': 5} Score 0.5092 Unique_Labels 600 Time 65.1855
Parameter: {'eps': 0.8, 'min_samples': 10} Score 0.4586 Unique_Labels 299 Time 65.4238

BEST PARAM SETUP:  {'eps': 0.2, 'min_samples': 5} 0.5091870285529345

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68

Unnamed: 0,cluster,count,match_ratio,pattern
0,-1,9,0.0006,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,0,3308,1.0000,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2,1,128,1.0000,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
3,2,173,1.0000,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
4,3,486,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...
294,293,15,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
295,294,11,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
296,295,17,1.0000,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
297,296,13,1.0000,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.6206977124564806



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,-1,5e1f079fc9130cd508568da3cb0b219a,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,-1,2b05809d67062f0af9fec37f33d1b338,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
2,-1,e8a9d42e07c25d00fcc56170e66071fd,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
3,-1,01e2cd4d45e8bc2608f3519a653d3a63,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
4,-1,d93b214c093a9f1e07248962aeb74fc8,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
...,...,...,...,...,...,...
2984,297,a4200ec0b146d8a0d37e90e32c674780,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2985,297,87e9967ee4246dabb78854ed2e0402f2,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2986,297,05b379055a79c5e47bdabec418190ac7,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2987,297,d8c65468405b789c56754336c1f8911b,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


# 7. Birch

In [11]:
df = reserve_df.copy()
print(df.shape, "\n")

branching_factor = [25,50,75,100]

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "Birch", Birch(compute_labels=True), 
                              {'n_clusters': clusters, 'branching_factor':branching_factor})

commonAPI = common_api_cluster(df, "Birch")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "Birch", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'branching_factor': 25, 'n_clusters': 25} Score 0.1908 Unique_Labels 25 Time 58.6440
Parameter: {'branching_factor': 25, 'n_clusters': 50} Score 0.2433 Unique_Labels 50 Time 56.3083
Parameter: {'branching_factor': 25, 'n_clusters': 75} Score 0.2657 Unique_Labels 75 Time 58.6679
Parameter: {'branching_factor': 25, 'n_clusters': 100} Score 0.2822 Unique_Labels 100 Time 56.5953
Parameter: {'branching_factor': 25, 'n_clusters': 125} Score 0.2986 Unique_Labels 125 Time 56.4709
Parameter: {'branching_factor': 25, 'n_clusters': 150} Score 0.3167 Unique_Labels 150 Time 56.6069
Parameter: {'branching_factor': 25, 'n_clusters': 175} Score 0.3282 Unique_Labels 175 Time 56.8309
Parameter: {'branching_factor': 25, 'n_clusters': 200} Score 0.3449 Unique_Labels 200 Time 56.6360
Parameter: {'branching_factor': 50, 'n_clusters': 25} Score 0.2228 Unique_Labels 25 Time 57.3799
Parameter: {'branching_factor': 50, 'n_clusters': 50} Score 0.

Unnamed: 0,cluster,count,match_ratio,pattern
0,0,19,0.0974,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
1,1,36,0.1885,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,17,0.1164,"NtAllocateVirtualMemory,SetErrorMode,LoadStrin..."
3,3,305,0.5505,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
4,4,12,0.1818,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
...,...,...,...,...
195,195,50,0.6250,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
196,196,8,0.6154,"LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyEx..."
197,197,4,0.1379,"GetSystemTimeAsFileTime,NtOpenKey,NtQueryValue..."
198,198,12,0.5455,"GetSystemTimeAsFileTime,SetUnhandledExceptionF..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.36808654812253194



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,18b95eee7970e8cb4dcb3218a2c07fbb,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
1,0,fad8b0d20017b823c33888994837dc57,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
2,0,818d85bae450ee7659b28dc3aecef197,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
3,0,0bc30f64f5af186cb7c7c2c27bd1d942,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
4,0,632c342a6f10e2cd1839dc15d131fcd8,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
...,...,...,...,...,...,...
1386,199,431508b9bfdc534018039bc3ccde1366,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
1387,199,06a390acad92db92ebf8283444eb5c29,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
1388,199,3d66b302bc2731e088ca222c4a9bfa1a,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
1389,199,380368cf9f6036ae4b7bdb11815041f4,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."


# 8. SpectralClustering

*Due to some dataset- or hardware-related limitations, SpectralClustering cannot be executed using the full Oliveira dataset.*

In [12]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# n_neighbors=[10,20,30,40,50]
# assign_labels=['kmeans','discretize']

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "SpectralClustering", 
#                               SpectralClustering(random_state=seed, affinity='nearest_neighbors', assign_labels='kmeans', n_jobs=-1, verbose=0), 
#                               {'n_clusters': clusters, 'n_neighbors':n_neighbors, 'assign_labels':assign_labels})

# commonAPI = common_api_cluster(df, "SpectralClustering")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "SpectralClustering", samplesize)
# display(commonHashes)

# 9. AgglomerativeClustering (Ward)

*Due to memory-related limitations, AgglomerativeClustering cannot be executed using the full Oliveira dataset.*

In [13]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# print("Computing Connectivity...")
# start_time = time.time()
# connectivity = []
# connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 2, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 4, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 6, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 8, mode='connectivity', include_self='auto', n_jobs=-1))
# print(f"Connectivity Computation: {time.time()-start_time:.4f}(s)\n")

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "AgglomerativeClustering", AgglomerativeClustering(memory=".memory/", 
#                                                                                      linkage='complete', 
#                                                                                      connectivity=connectivity), 
#                               {'n_clusters': clusters, 'connectivity':connectivity})

# commonAPI = common_api_cluster(df, "AgglomerativeClustering")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "AgglomerativeClustering", samplesize)
# display(commonHashes)

# 10. Optics

*Due to some dataset- or hardware-related limitations, Optics cannot be executed using the full Oliveira dataset in a reasonable time.*

In [14]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# leaf_size = [10,20,30,40,50] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# min_samples = [5,10,15,20]

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "OPTICS", OPTICS(algorithm='auto', memory=".memory/", n_jobs=-1), 
#                               {'leaf_size': leaf_size, 'min_samples':min_samples}, 
#                               samplesize)

# commonAPI = common_api_cluster(df, "OPTICS")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "OPTICS", samplesize)
# display(commonHashes)

# 11. MeanShift

*Due to some hardware-related limitations, MeanShift cannot be executed using the lite/full Oliveira dataset in a reasonable time.*

In [15]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# max_iter = [300,500] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "MeanShift", MeanShift(n_jobs=-1), {'max_iter': max_iter})

# commonAPI = common_api_cluster(df, "MeanShift")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "MeanShift", samplesize)
# display(commonHashes)