# Clustering Demo (Benign)

**Clustering methods not supported:** GaussianMixture and HDBSCAN

In [1]:
import random
import pandas as pd
import multiprocessing
import time

def cpu_count():
    """Return the CPU count reported by ``multiprocessing.cpu_count()``."""
    available_cpus = multiprocessing.cpu_count()
    return available_cpus

def list_to_str(ls):
    """Join the items of *ls* into a single comma-separated string.

    Every item is converted with ``str()``. An empty iterable yields an
    empty string and no trailing comma is ever produced.

    Args:
        ls: Iterable of values to join.

    Returns:
        The comma-separated string, e.g. ``[1, 2, 3]`` -> ``"1,2,3"``.
    """
    # str.join builds the result in one pass instead of growing a string
    # with += and then slicing off the final comma.
    return ",".join(str(item) for item in ls)

def load_df():
    """Load the benign rows of ``oliveira_lite.csv``.

    Reads the CSV from the current working directory, keeps only the rows
    whose 'malware' column equals 0, removes the 'malware' column and
    renumbers the rows from 0.

    Returns:
        DataFrame holding the remaining columns of the benign rows, with a
        fresh 0..n-1 index.
    """
    print("Loading DF...")
    df = pd.read_csv("oliveira_lite.csv", low_memory=False, memory_map=True)
    # drop() already returns a new frame, so no explicit copy is required.
    benign = df.loc[df['malware'] == 0].drop(columns='malware')
    print("")
    # drop=True discards the old index directly instead of materialising it
    # as a column and slicing that column away again.
    return benign.reset_index(drop=True)

def get_x(df):
    """Return the columns of *df* at positions 1 through 100.

    Column 0 and every column from position 101 onwards are left out.
    """
    first_col = 1
    stop_col = 102 - 1  # exclusive upper bound of the positional slice
    return df.iloc[:, first_col:stop_col]

# Load the list of API call names: the first line of API_LIST, comma-separated.
API_LIST = "api_calls.txt"
DELIMITER = "NaN"
# The context manager closes the handle even if reading fails. API_FILE stays
# bound to the (closed) handle afterwards, as it was with the explicit close().
with open(API_LIST, "r") as API_FILE:
    # Strip the line terminator first so the last API name does not end in "\n".
    APIS = API_FILE.readline().rstrip("\n").split(',')
APIS.append(DELIMITER)  # serves as a label for NaN values for Instance-based datasets

# Random seed; get_samplehash_common() passes it to random.seed() on every call.
seed = 1

#Inverse Label Encoding
def inverse_labeller(item):
    """Map every value of *item* to the API name stored at that index.

    Each value is converted with ``int()`` and used as a position in the
    module-level ``APIS`` list; the mapped Series is returned.
    """
    def decode(code):
        return APIS[int(code)]

    return item.map(decode)
def inverse_label(df):
    """Replace the values in columns 1..100 of *df* with API names.

    The columns are decoded row by row through ``inverse_labeller`` and the
    result is written back into *df* itself; the same (mutated) frame is
    returned.
    """
    print("Inverse Labelling...")
    encoded = df.iloc[:, 1:101]
    decoded = encoded.apply(inverse_labeller, axis=1, result_type='reduce')
    df.iloc[:, 1:101] = decoded
    print("")
    return df

From: https://medium.com/swlh/k-means-clustering-on-high-dimensional-data-d2151e1a4240

In [2]:
def common_api_cluster(inner_df, name, size):
    """Find the most frequent API pattern of every cluster and save a summary.

    Args:
        inner_df: DataFrame with a 'cluster' column and a 'pattern' column.
        name: Prefix of the output file, written to
            ``Clustering/Benign/{name}_Common_API_Cluster.csv``.
        size: Not used by this function; kept so existing calls keep working.

    Returns:
        DataFrame with one row per cluster, in ascending cluster order, and
        the columns 'cluster', 'count' (occurrences of the top pattern),
        'match_ratio' (count divided by the cluster's row count, rounded to
        4 decimals) and 'pattern'.
    """
    # Operate on the frame the caller passed in rather than on the
    # module-level df, so the argument is actually honoured.
    clusters = inner_df['cluster'].unique()
    clusters.sort()
    print("Searching for Common API Patterns per Cluster...")
    print(clusters)
    summary = []
    for cluster in clusters:
        cluster_patterns = inner_df.loc[inner_df['cluster'] == cluster, 'pattern']
        # value_counts() orders by descending frequency, so position 0 is the
        # most common pattern. Reading index/value directly avoids relying on
        # the column names reset_index() would generate.
        pattern_counts = cluster_patterns.value_counts()
        top_count = pattern_counts.iloc[0]
        summary.append([
            cluster,
            top_count,
            round(top_count / cluster_patterns.shape[0], 4),
            pattern_counts.index[0],
        ])
    commonAPI = pd.DataFrame(summary, columns=['cluster', 'count', 'match_ratio', 'pattern'])
    commonAPI.to_csv(f"Clustering/Benign/{name}_Common_API_Cluster.csv", index=False)
    print("")
    return commonAPI

def get_samplehash_common(inner_df, common_counts, name, size, samplesize):
    """Randomly sample hashes that exhibit each cluster's most common pattern.

    Args:
        inner_df: DataFrame with a 'hash' column and a 'pattern' column.
        common_counts: DataFrame with a 'cluster' and a 'pattern' column, one
            row per pattern to sample for.
        name: Prefix of the output file, written to
            ``Clustering/Benign/{name}_SampleHash_Common.csv``.
        size: Not used by this function; kept so existing calls keep working.
        samplesize: Maximum number of hashes drawn per pattern; when fewer
            rows match, all of them are taken (in shuffled order).

    Returns:
        DataFrame with the columns 'cluster', 'hash', 'Type 1', 'Type 2',
        'Type 3' (each filled with '_') and 'pattern'.
    """
    # Re-seed on every call so repeated runs draw the same samples.
    random.seed(seed)
    print(f"Random (Seed @ {seed}) Sampling Hashes subset of  Most Common API Patterns...")
    rows = []
    matching_samples = 0
    for cluster, pattern in zip(common_counts['cluster'], common_counts['pattern']):
        # Build the candidate list once per pattern.
        candidates = inner_df.loc[inner_df['pattern'] == pattern, 'hash'].to_list()
        matching_samples += len(candidates)
        picked = random.sample(candidates, min(samplesize, len(candidates)))
        rows.extend([cluster, sample_hash, '_', '_', '_', pattern] for sample_hash in picked)
    hashes = pd.DataFrame(rows, columns=['cluster', 'hash', 'Type 1', 'Type 2', 'Type 3', 'pattern'])
    hashes.to_csv(f"Clustering/Benign/{name}_SampleHash_Common.csv", index=False)
    total_rows = inner_df.shape[0]
    # An empty frame has nothing in common; report 0 instead of dividing by zero.
    ratio = (matching_samples / total_rows) * 100 if total_rows else 0.0
    print(f"Commonality Ratio: {ratio:.4f}%")
    print("")
    return hashes
    
def inject_patterns(inner_df, inverse_labelled_df):
    """Attach a comma-joined API pattern string to every row of *inner_df*.

    The pattern of a row is built from columns 1..100 of the row at the same
    position in *inverse_labelled_df*. After the 'pattern' column is added,
    *inner_df* is passed through ``inverse_label`` (which rewrites its
    columns 1..100 in place) and written to
    ``Clustering/Benign/API_Patterns.csv``.

    Args:
        inner_df: Frame that receives the 'pattern' column; mutated in place.
        inverse_labelled_df: Frame whose columns 1..100 supply the values
            joined into each pattern.

    Returns:
        The mutated *inner_df*.
    """
    print("Injecting API patterns...")
    # Slice the API columns once rather than positional-indexing the whole
    # frame once per row; only as many rows as inner_df has are consulted.
    api_columns = inverse_labelled_df.iloc[:inner_df.shape[0], 1:101]
    inner_df['pattern'] = [list_to_str(row.to_list()) for _, row in api_columns.iterrows()]
    print("")
    inverse_label(inner_df).to_csv("Clustering/Benign/API_Patterns.csv", index=False)
    return inner_df

# Load Dataset

In [3]:
# Load the benign samples, then attach a comma-joined 'pattern' string per row.
# inject_patterns() builds the patterns from the decoded copy (second argument),
# then decodes columns 1..100 of the copy it returns and writes that frame to
# Clustering/Benign/API_Patterns.csv. `df` therefore ends up holding API names
# plus the 'pattern' column.
df = load_df()
df = inject_patterns(df.copy(), inverse_label(df.copy()))

Loading DF...

Inverse Labelling...

Injecting API patterns...



In [4]:
# Independent snapshot of the prepared frame.
# NOTE(review): presumably used to restore `df` between clustering runs — confirm against later cells.
reserve_df = df.copy()