# Clustering Demo (Malicious)

**Note:** Make sure that you have `oliveira.csv` in the same directory as this notebook.

**Clustering Methods not Supported:** GaussianMixture & HDBScan

# 1. Code Preparation

In [1]:
import random
import pandas as pd
from sklearn.cluster import KMeans,  BisectingKMeans, MiniBatchKMeans, Birch, DBSCAN
# from sklearn.cluster import SpectralClustering, AgglomerativeClustering, OPTICS, MeanShift # Not working due to HW limitations
from sklearn.model_selection import ParameterGrid
from sklearn import metrics
import multiprocessing
import time
import os

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).'''
    output = ""
    for l in ls:
        output += str(l) + ","
    return output[0:len(output)-1]

def load_df():
    '''Load the dataset file (CSV) as DataFrame'''
    print("Loading DF...")
    df = pd.read_csv("oliveira.csv", low_memory=False, memory_map=True) # MAKE SURE THIS IS SET AS `oliveira.csv`
    df = df[df['malware'] == 1].copy()
    df = df.drop('malware', axis=1)
    print("")
    return df.reset_index().iloc[:,1:]

def get_x(df:pd.DataFrame):
    '''Get the feature columns of the DataFrame'''
    return df.iloc[:, 1:102-1]

#Load list of API calls
API_LIST = "api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #Add the label for NaN values.
API_FILE.close()

#Random Seed
seed = 1

#Inverse Label Encoding
def inverse_labeller(item):
    '''Low Level. Converts encoded API calls to string API calls'''
    global APIS
    return item.map(lambda x: APIS[int(x)])
def inverse_label(df:pd.DataFrame):
    '''High Level. Converts encoded API calls to string API calls'''
    df2 = df.copy(deep=True)
    print("Inverse Labelling...")
    df2.iloc[:, 1:101] = df2.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
    print("")
    return df2

In [2]:
def search_k(parameters, model, X):
    '''Search for the best parameter(s) for the model (usually cluster size or K value)'''
    paramGrid = ParameterGrid(parameters)
    best_score = -1
    best_grid = -1
    best_clusterer = None
    # evaluation based on silhouette_score
    print("Searching Best Clustering Parameters...")
    for p in paramGrid:
        model.set_params(**p)    # set current hyper parameter
        start_time = time.time()
        model.fit(X)          # fit model on wine dataset, this will find clusters based on parameter p
        ss = metrics.silhouette_score(X, model.labels_, random_state=seed)   # calculate silhouette_score
        # silhouette_scores.append([p, ss]) # store all the scores
        print('Parameter:', p, 'Score', f"{ss:.4f}", 'Unique_Labels', len(pd.Series(model.labels_).unique()), "Time", f"{time.time()-start_time:.4f}")
        # check p which has the best score
        if ss > best_score:
            best_score = ss
            best_grid = p
            best_clusterer = model
    print("")
    print("BEST PARAM SETUP: ", best_grid, best_score)
    print("")
    return best_grid, best_clusterer

def clustering(inner_df:pd.DataFrame, name:str, clusterer, parameters):
    '''Executes the data clustering on the dataset. Produces the same input dataset with the additional column for the cluster #.'''
    '''The input dataset must contain integer API calls (except the API Call Pattern strings)'''
    X = get_x(inner_df)
    bestCluster, bestClusterer = search_k(parameters, clusterer, X) #assumes bestCluster already fitted
    #bestClusterer.fit(X)
    inner_df['cluster'] = bestClusterer.labels_
    inner_df.to_csv(f"Clustering/Malicious/{name}_Encoded_Clustering.csv", index=False)
    inner_df = inverse_label(inner_df)
    inner_df.to_csv(f"Clustering/Malicious/{name}_Clustering.csv", index=False)
    print("")
    return bestCluster

def common_api_cluster(inner_df:pd.DataFrame, name:str):
    '''Determine the most common API call patterns for each cluster'''
    global df
    inner_df = df
    clusters = inner_df['cluster'].unique()
    clusters.sort()
    commonAPI = []
    print("Searching for Common API Patterns per Cluster...")
    print(clusters)
    for cluster in clusters:
        raw_commonC = inner_df[inner_df['cluster']==cluster]['pattern']#.value_counts()
        commonC = raw_commonC.value_counts().to_frame(name='counts').reset_index()
        commonAPI.append([cluster, commonC['counts'].iloc[0], round(commonC['counts'].iloc[0]/raw_commonC.shape[0],4), commonC['pattern'].iloc[0]])
    commonAPI = pd.DataFrame(commonAPI, columns=['cluster', 'count', 'match_ratio', 'pattern'])
    commonAPI.to_csv(f"Clustering/Malicious/{name}_Common_API_Cluster.csv", index=False)
    print("")
    print("Average Match Ratio:", commonAPI['match_ratio'].mean())
    return commonAPI

def get_samplehash_common(inner_df:pd.DataFrame, common_counts:pd.DataFrame, name:str, samplesize:int):
    '''Get sample hashes from each cluster that matches the common API call pattern of the cluster.'''
    hashes = []
    global seed
    random.seed(seed)
    matching_samples = 0
    print(f"Random (Seed @ {seed}) Sampling Hashes subset of  Most Common API Patterns...")
    for pattern in range(common_counts.shape[0]):
        sub_df = inner_df[inner_df['pattern']==common_counts.iloc[pattern,3]]
        subsamples = len(sub_df['hash'].to_list())
        matching_samples += subsamples
        if subsamples < samplesize:
            subsamples = random.sample(sub_df['hash'].to_list(), len(sub_df['hash'].to_list()))
        else:
            subsamples = random.sample(sub_df['hash'].to_list(), samplesize)
        for subsample in subsamples:
            hashes.append([common_counts.iloc[pattern,:]['cluster'], subsample, '_', '_', '_', common_counts.iloc[pattern,:]['pattern']])
    hashes = pd.DataFrame(hashes, columns=['cluster', 'hash', 'Type 1', 'Type 2', 'Type 3', 'pattern'])
    hashes.to_csv(f"Clustering/Malicious/{name}_SampleHash_Common.csv", index=False)
    print(f"Commonality Ratio: {(matching_samples/inner_df.shape[0])}")
    print("")
    return hashes
    
def inject_patterns(inner_df:pd.DataFrame, inverse_labelled_df:pd.DataFrame):
    '''Injects the API call patterns of each sample as its last column'''
    patterns = []
    print("Injecting API patterns...")
    for row in range(inner_df.shape[0]):
        patterns.append(list_to_str(inverse_labelled_df.iloc[row,1:101].transpose().to_list()))
    inner_df['pattern'] = patterns
    print("")
    inverse_label(inner_df).to_csv(f"Clustering/Malicious/API_Patterns.csv", index=False)
    return inner_df

# 2. Load Dataset

In [3]:
df = load_df()
df = inject_patterns(df.copy(), inverse_label(df.copy()))
df

Loading DF...

Inverse Labelling...

Injecting API patterns...

Inverse Labelling...



Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_91,t_92,t_93,t_94,t_95,t_96,t_97,t_98,t_99,pattern
0,071e8c3f8922e186e57548cd4c703a5d,112,274,158,215,274,158,215,298,76,...,71,297,135,171,215,35,208,56,71,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,33f8e6d08a6aae939f25a8e0d63dd523,82,208,187,208,172,117,172,117,172,...,81,240,117,71,297,135,171,215,35,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,b68abd064e975e1c6d5f25e748663076,16,110,240,117,240,117,240,117,240,...,65,112,123,65,112,123,65,113,112,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,72049be7bd30ea61297ea624ae198067,82,208,187,208,172,117,172,117,172,...,208,302,208,302,187,208,302,228,302,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,c9b3700a77facf29172f32df6bc77f48,82,240,117,240,117,240,117,240,117,...,209,260,40,209,260,141,260,141,260,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42792,e3d6d58faa040f0f9742c9d0eaf58be4,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42793,9b917bab7f32188ae40c744f2be9aaf8,82,240,117,240,117,240,117,240,117,...,159,224,82,159,224,82,159,224,82,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42794,35a18ee05f75f04912018d9f462cb990,82,240,117,240,117,240,117,240,117,...,260,141,260,141,260,141,260,141,260,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42795,654139d715abcf7ecdddbef5a84f224b,82,240,117,240,117,240,117,240,117,...,141,260,141,260,141,260,141,260,141,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


In [4]:
reserve_df = df.copy()

# 2.1 Explanation

The process as the following:
1. Find the best cluster size (by means of the [Silhouette Score](https://tushar-joshi-89.medium.com/silhouette-score-a9f7d8d78f29))
2. Search for the most common API pattern for each cluster
3. Sample n hashes per cluster that match the most common API pattern for the same cluster.

**Silhouette Score:** How good is the quality of data clustering? *(Higher is better)*

**Match Ratio:** How common is the most common API call pattern in each cluster among the samples found in its cluster? *(Higher is better)*

**Commonality Ratio**: How many of the matching most common API patterns per cluster are there relative to the dataset size. *(Higher is better)*

In [5]:
'''UPDATE THESE VALUES AS NEEDED'''

clusters = [100]    # Place either single or multiple values as long as it is in list format. 
                    # For multiple values, the program will iterate through every cluster size and will choose the best (usually the biggest value) to be part of the best configuration.
                    # It influences the projected total number of samples to verify/analyze.
                    
samplesize = 10      # Max no. of samples to obtain from a cluster. 
                    # It influences the projected total number of samples to verify/analyze.

# 3. K-Means

In [6]:
df = reserve_df.copy()
print(df.shape, "\n")

algorithm = ['lloyd', 'elkan']

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "KMeans", KMeans(n_init='auto', verbose=0, random_state=seed), 
                              {'n_clusters': clusters, 'algorithm':algorithm})

commonAPI = common_api_cluster(df, "KMeans")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "KMeans", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'algorithm': 'lloyd', 'n_clusters': 100} Score 0.4979 Unique_Labels 100 Time 48.4675
Parameter: {'algorithm': 'elkan', 'n_clusters': 100} Score 0.4979 Unique_Labels 100 Time 37.5406

BEST PARAM SETUP:  {'algorithm': 'lloyd', 'n_clusters': 100} 0.49788832505950503

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]

Average Match Ratio: 0.483996


Unnamed: 0,cluster,count,match_ratio,pattern
0,0,306,0.9745,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,1,5,0.0333,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,344,0.4674,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,3,3308,0.9928,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
4,4,1116,0.9654,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
...,...,...,...,...
95,95,18,0.0634,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
96,96,17,0.0929,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
97,97,13,0.0637,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
98,98,39,0.4194,"SetErrorMode,LdrGetDllHandle,LdrGetProcedureAd..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.49246442507652405



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,490d584c7d303ed35c673460b63f3ca8,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,0,9ab8ea1d2d68a0d4110df413e677976c,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
2,0,adbc74815ef2bd1ea4967abad812233d,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
3,0,f6eb4841bba3a4cee747700dc0ee1609,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
4,0,f5a0ad49337ebc87897698e70d03364e,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
...,...,...,...,...,...,...
948,99,eee58f6bd3fb1bd13b88870f0e50e12b,_,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
949,99,125e4dfc79fbfdadfeba0fea49533621,_,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
950,99,656c949b54d6e6b7983a53e542ff261f,_,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
951,99,a5be5336f78ac2105fba8dc6a7c792c9,_,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."


# 4. BisectingKMeans

In [7]:
df = reserve_df.copy()
print(df.shape, "\n")

algorithm = ['lloyd', 'elkan']

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "BisectingKMeans", 
                              BisectingKMeans(random_state=seed, verbose=0, copy_x=True), 
                              {'n_clusters': clusters, 'algorithm':algorithm})

commonAPI = common_api_cluster(df, "BisectingKMeans")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "BisectingKMeans", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'algorithm': 'lloyd', 'n_clusters': 100} Score 0.4030 Unique_Labels 100 Time 38.7910
Parameter: {'algorithm': 'elkan', 'n_clusters': 100} Score 0.4030 Unique_Labels 100 Time 45.9657

BEST PARAM SETUP:  {'algorithm': 'lloyd', 'n_clusters': 100} 0.40296115819557765

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]

Average Match Ratio: 0.4034229999999999


Unnamed: 0,cluster,count,match_ratio,pattern
0,0,14,0.0136,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,1,8,0.0092,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,344,0.2531,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,3,132,0.1530,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,4,68,0.0798,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...
95,95,17,0.0530,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
96,96,236,1.0000,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
97,97,153,0.2611,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
98,98,305,0.6733,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.4199359768208052



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,ebd84144138b198f6e70f9e5b885f3ff,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,0,a1b47c27107e78e8c21a8a783617abca,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,0,9c9711b02aedb8864e5ea0377cded141,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,0,64344851895b44d1735bad2c664b8f7f,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,0,10453619503a4aa3e3c5c35530281633,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
816,99,f7da6e72d81951ed52547836fc6a712b,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
817,99,59dd59042be566ec4ca3a7294b2b1c22,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
818,99,8dcbb793fe65a98e3471ffadfb5b40f2,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
819,99,e02171cec6d49a8aaf9f862dfde3b2b8,_,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."


# 5. MiniBatchKMeans

In [8]:
df = reserve_df.copy()
print(df.shape, "\n")

reassignment_ratio = [0.01, 0.1, 0.3]

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "MiniBatchKMeans", 
                              MiniBatchKMeans(verbose=0, random_state=seed, n_init='auto', 
                                              batch_size=os.cpu_count()*256), 
                              {'n_clusters': clusters, 'reassignment_ratio':reassignment_ratio})

commonAPI = common_api_cluster(df, "MiniBatchKMeans")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "MiniBatchKMeans", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'n_clusters': 100, 'reassignment_ratio': 0.01} Score 0.4863 Unique_Labels 100 Time 48.1413
Parameter: {'n_clusters': 100, 'reassignment_ratio': 0.1} Score 0.3977 Unique_Labels 84 Time 42.1584
Parameter: {'n_clusters': 100, 'reassignment_ratio': 0.3} Score 0.3069 Unique_Labels 83 Time 40.2401

BEST PARAM SETUP:  {'n_clusters': 100, 'reassignment_ratio': 0.01} 0.48629204003638327

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ 0  1  2  3  4  5  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24
 25 26 27 28 29 30 31 32 33 34 35 37 39 40 42 43 45 46 48 50 51 53 54 55
 56 57 58 59 60 61 62 63 64 65 66 67 68 70 72 73 75 77 78 79 80 82 83 84
 88 89 90 91 92 93 94 95 96 97 98]

Average Match Ratio: 0.4069927710843373


Unnamed: 0,cluster,count,match_ratio,pattern
0,0,10,0.4000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,1,5,0.0794,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,703,0.5913,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
3,3,29,0.9062,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,4,12,0.4138,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
...,...,...,...,...
78,94,41,0.5616,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
79,95,30,0.0860,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
80,96,10,0.1786,"NtAllocateVirtualMemory,SetErrorMode,LoadStrin..."
81,97,13,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.39348552468630976



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,9823f2ac70286e0d5d86fa43704056ba,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1,0,4a1ee7e7b083fd9a11e1182b07668bcf,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,0,b6670ec8c82d5bc1d00f0f03d6db8268,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
3,0,c9f5c11e38438766915e09e548b41708,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
4,0,94677dca5ee54e922da773d7fceeea56,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
695,98,34a46a62023edea6fb34a6ed71f55147,_,_,_,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
696,98,8e55ec77ffb82c4984d6bbbe499dc38c,_,_,_,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
697,98,a2612c913e86d048cf2ca216eb151e77,_,_,_,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."
698,98,517fb05bbba6c52bfae735cefaa4e258,_,_,_,"GetSystemInfo,LdrGetDllHandle,LdrGetProcedureA..."


# 6. DBSCAN

In [9]:
df = reserve_df.copy()
print(df.shape, "\n")

eps = [0.2,0.5,0.8]
min_samples = [5,10]

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "DBSCAN", DBSCAN(algorithm='auto', n_jobs=-1), 
                              {'min_samples' : min_samples, 'eps':eps})

commonAPI = common_api_cluster(df, "DBSCAN")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "DBSCAN", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'eps': 0.2, 'min_samples': 5} Score 0.5092 Unique_Labels 600 Time 123.3411
Parameter: {'eps': 0.2, 'min_samples': 10} Score 0.4586 Unique_Labels 299 Time 115.0535
Parameter: {'eps': 0.5, 'min_samples': 5} Score 0.5092 Unique_Labels 600 Time 124.1300
Parameter: {'eps': 0.5, 'min_samples': 10} Score 0.4586 Unique_Labels 299 Time 116.3758
Parameter: {'eps': 0.8, 'min_samples': 5} Score 0.5092 Unique_Labels 600 Time 105.8182
Parameter: {'eps': 0.8, 'min_samples': 10} Score 0.4586 Unique_Labels 299 Time 114.1951

BEST PARAM SETUP:  {'eps': 0.2, 'min_samples': 5} 0.5091870285529345

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ -1   0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16
  17  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52
  53  54  55  56  57  58  59  60  61  62  63  64  65  66  

Unnamed: 0,cluster,count,match_ratio,pattern
0,-1,9,0.0006,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,0,3308,1.0000,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
2,1,128,1.0000,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
3,2,173,1.0000,"NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtOp..."
4,3,486,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...
294,293,15,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
295,294,11,1.0000,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
296,295,17,1.0000,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
297,296,13,1.0000,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.6206977124564806



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,-1,5e1f079fc9130cd508568da3cb0b219a,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,-1,2b05809d67062f0af9fec37f33d1b338,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
2,-1,e8a9d42e07c25d00fcc56170e66071fd,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
3,-1,01e2cd4d45e8bc2608f3519a653d3a63,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
4,-1,d93b214c093a9f1e07248962aeb74fc8,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
...,...,...,...,...,...,...
2984,297,a4200ec0b146d8a0d37e90e32c674780,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2985,297,87e9967ee4246dabb78854ed2e0402f2,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2986,297,05b379055a79c5e47bdabec418190ac7,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2987,297,d8c65468405b789c56754336c1f8911b,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


# 7. Birch

In [10]:
df = reserve_df.copy()
print(df.shape, "\n")

branching_factor = [25,50,75,100]

#Search for best clusters size and run clustering
bestClusterParam = clustering(df, "Birch", Birch(compute_labels=True), 
                              {'n_clusters': clusters, 'branching_factor':branching_factor})

commonAPI = common_api_cluster(df, "Birch")
display(commonAPI)

commonHashes = get_samplehash_common(df, commonAPI, "Birch", samplesize)
display(commonHashes)

(42797, 102) 

Searching Best Clustering Parameters...
Parameter: {'branching_factor': 25, 'n_clusters': 100} Score 0.2822 Unique_Labels 100 Time 86.1486
Parameter: {'branching_factor': 50, 'n_clusters': 100} Score 0.2855 Unique_Labels 100 Time 75.6930
Parameter: {'branching_factor': 75, 'n_clusters': 100} Score 0.3002 Unique_Labels 100 Time 64.7036
Parameter: {'branching_factor': 100, 'n_clusters': 100} Score 0.2792 Unique_Labels 100 Time 72.6685

BEST PARAM SETUP:  {'branching_factor': 75, 'n_clusters': 100} 0.30021573234810095

Inverse Labelling...


Searching for Common API Patterns per Cluster...
[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71
 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95
 96 97 98 99]

Average Match Ratio: 0.152612


Unnamed: 0,cluster,count,match_ratio,pattern
0,0,114,0.1735,"CreateThread,NtClose,NtDelayExecution,NtAlloca..."
1,1,2,0.0081,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
2,2,1059,0.3998,"GetSystemTimeAsFileTime,GetSystemInfo,NtCreate..."
3,3,12,0.0930,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
4,4,19,0.0609,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
...,...,...,...,...
95,95,2,0.0164,"SetUnhandledExceptionFilter,NtAllocateVirtualM..."
96,96,135,0.2292,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
97,97,50,0.6250,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
98,98,4,0.0444,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


Random (Seed @ 1) Sampling Hashes subset of  Most Common API Patterns...
Commonality Ratio: 0.29294109400191604



Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,28d48772cdb60f8aa046bfcc7fc4015c,_,_,_,"CreateThread,NtClose,NtDelayExecution,NtAlloca..."
1,0,ac904aaaedc9600a4027ed4001fa6bcf,_,_,_,"CreateThread,NtClose,NtDelayExecution,NtAlloca..."
2,0,38dfce406619531a931cedf85630892d,_,_,_,"CreateThread,NtClose,NtDelayExecution,NtAlloca..."
3,0,3ad802c3ee257fbc8c4a0f01616a80fb,_,_,_,"CreateThread,NtClose,NtDelayExecution,NtAlloca..."
4,0,05d1026342dfbe2d1a7153d3533c0573,_,_,_,"CreateThread,NtClose,NtDelayExecution,NtAlloca..."
...,...,...,...,...,...,...
768,99,0ca024cb01b07698554e75bf87e77e41,_,_,_,"LdrGetDllHandle,LdrGetProcedureAddress,NtAlloc..."
769,99,c3fca6b1bd8b67c9c305198aeb5f71c3,_,_,_,"LdrGetDllHandle,LdrGetProcedureAddress,NtAlloc..."
770,99,4baf8ec477e31d4e5636bd4eaa1bc5eb,_,_,_,"LdrGetDllHandle,LdrGetProcedureAddress,NtAlloc..."
771,99,55eceea9390c26610d904ba4ce22e7b0,_,_,_,"LdrGetDllHandle,LdrGetProcedureAddress,NtAlloc..."


# 8. SpectralClustering

*Due to some dataset- or hardware-related limitations, SpectralClustering cannot be executed using the full Oliveira dataset.*

In [11]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# n_neighbors=[10,20,30,40,50]
# assign_labels=['kmeans','discretize']

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "SpectralClustering", 
#                               SpectralClustering(random_state=seed, affinity='nearest_neighbors', 
#                                                  assign_labels='kmeans', n_jobs=-1, verbose=0), 
#                               {'n_clusters': clusters, 'n_neighbors':n_neighbors, 'assign_labels':assign_labels}, 
#                               samplesize)

# commonAPI = common_api_cluster(df, "SpectralClustering")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "SpectralClustering", samplesize)
# display(commonHashes)

# 9. AgglomerativeClustering (Ward)

*Due to memory-related limitations, AgglomerativeClustering cannot be executed using the full Oliveira dataset.*

In [12]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# print("Computing Connectivity...")
# start_time = time.time()
# connectivity = []
# connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 2, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 4, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 6, mode='connectivity', include_self='auto', n_jobs=-1))
# #connectivity.append(kneighbors_graph(get_x(df:pd.DataFrame).values, 8, mode='connectivity', include_self='auto', n_jobs=-1))
# print(f"Connectivity Computation: {time.time()-start_time:.4f}(s)\n")

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "AgglomerativeClustering", AgglomerativeClustering(memory=".memory/", 
#                                                                                      linkage='complete', 
#                                                                                      connectivity=connectivity), 
#                               {'n_clusters': clusters, 'connectivity':connectivity})

# commonAPI = common_api_cluster(df, "AgglomerativeClustering")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "AgglomerativeClustering", samplesize)
# display(commonHashes)

# 10. Optics

*Due to some dataset- or hardware-related limitations, Optics cannot be executed using the full Oliveira dataset in a reasonable time.*

In [13]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# leaf_size = [10,20,30,40,50] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much
# min_samples = [5,10,15,20]

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "OPTICS", OPTICS(algorithm='auto', memory=".memory/", n_jobs=-1), 
#                               {'leaf_size': leaf_size, 'min_samples':min_samples}, 
#                               samplesize)

# commonAPI = common_api_cluster(df, "OPTICS")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "OPTICS", samplesize)
# display(commonHashes)

# 11. MeanShift

*Due to some hardware-related limitations, MeanShift cannot be executed using the lite/full Oliveira dataset in a reasonable time.*

In [14]:
# df = reserve_df.copy()
# print(df.shape, "\n")

# max_iter = [300,500] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset. Going beyond that is too much

# #Search for best clusters size and run clustering
# bestClusterParam = clustering(df, "MeanShift", MeanShift(n_jobs=-1), {'max_iter': max_iter})

# commonAPI = common_api_cluster(df, "MeanShift")
# display(commonAPI)

# commonHashes = get_samplehash_common(df, commonAPI, "MeanShift", samplesize)
# display(commonHashes)