In [1]:
import random
import pandas as pd
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.model_selection import ParameterGrid
from sklearn import metrics
import matplotlib.pyplot as plt

def loadDF(path="oliveira.csv"):
    """Read the dataset CSV and keep only the rows flagged as malware.

    Parameters
    ----------
    path : str, default "oliveira.csv"
        CSV file to read. It must contain a ``malware`` column.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose ``malware`` value equals 1.
        The original row index is preserved.
    """
    df = pd.read_csv(path)
    # .copy() detaches the result from the full frame, so callers can add or
    # overwrite columns without writing into a filtered slice
    # (which triggers SettingWithCopyWarning on pandas versions that track it).
    return df[df['malware'] == 1].copy()

def getX(df):
    """Return the feature columns of ``df`` used as clustering input.

    The selection is purely positional: every row, columns at positions
    1 through 100 (column 0 and anything from position 101 onward are
    left out).
    """
    first_col = 1
    stop_col = 102 - 1  # exclusive upper bound of the slice
    return df.iloc[:, first_col:stop_col]

#Load list of API calls
API_LIST = "api_calls.txt"
DELIMITER = "NaN"
# The context manager closes the file even if reading fails. API_FILE stays
# bound afterwards (to the closed handle), as it was before.
with open(API_LIST, "r") as API_FILE:
    # readline() keeps the line terminator; strip it so the last API name
    # does not end in "\n".
    APIS = API_FILE.readline().rstrip("\r\n").split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets

#Inverse Label Encoding
def inverse_labeller(item):
    """Translate a Series of encoded API indices into API names.

    Every value is cast to ``int`` and used as a position in the
    module-level ``APIS`` list. The returned Series keeps the index of
    ``item``.
    """
    def to_name(code):
        # APIS is only read here, so no ``global`` declaration is needed
        return APIS[int(code)]

    return item.map(to_name)
def inverse_label(df):
    """Replace the encoded API-call columns of ``df`` with API names, in place.

    The affected columns are selected by position: from column 1 up to, but
    not including, the last two columns. ``df`` itself is modified and also
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose middle columns hold integer API indices understood by
        ``inverse_labeller``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the selected columns now holding names.
    """
    encoded_cols = df.columns[1:df.shape[1]-2]
    # Assigning by column label replaces each column outright, so the dtype can
    # change from integer to object. Writing strings into integer columns
    # through .iloc is an in-place upcast that newer pandas versions warn about.
    # The mapping is element-wise, so applying it per column gives the same
    # values as per row while making far fewer Python-level calls.
    df[encoded_cols] = df[encoded_cols].apply(inverse_labeller)
    return df

The silhouette-score search for the number of clusters in the next cell is adapted from: https://medium.com/swlh/k-means-clustering-on-high-dimensional-data-d2151e1a4240

In [2]:
def searchK(parameters, model, X):
    """Choose the number of clusters that gives the highest silhouette score.

    For every candidate in ``parameters`` the given ``model`` is reconfigured
    with that ``n_clusters``, fitted on ``X`` and scored with
    ``metrics.silhouette_score`` on its ``labels_``. Each score is printed,
    and all scores are shown as a bar chart once the search is done.

    Parameters
    ----------
    parameters : iterable of int
        Candidate cluster counts.
    model : estimator
        Object offering ``set_params``, ``fit`` and ``labels_``. It is
        modified by the search and ends up fitted with the last candidate.
    X : array-like
        Data to cluster.

    Returns
    -------
    int
        The ``n_clusters`` value with the best score (the earliest one wins
        a tie).
    """
    # Evaluate every candidate, remembering (params, score) in evaluation order
    trials = []
    for params in ParameterGrid({'n_clusters': parameters}):
        model.set_params(**params)
        model.fit(X)
        score = metrics.silhouette_score(X, model.labels_)
        trials.append((params, score))
        print('Parameter:', params, 'Score', score)

    # Strict ">" keeps the first of several equal scores. -1 is the starting
    # score to beat.
    best_grid, best_score = -1, -1
    for params, score in trials:
        if score > best_score:
            best_grid, best_score = params, score

    # Bar chart of all scores, one bar per candidate
    silhouette_scores = [score for _, score in trials]
    positions = range(len(silhouette_scores))
    plt.bar(positions, silhouette_scores, align='center', color='#722f59', width=0.5)
    plt.xticks(positions, list(parameters))
    plt.title('Silhouette Score', fontweight='bold')
    plt.xlabel('Number of Clusters')
    plt.show()
    return best_grid['n_clusters']

In [3]:
# Get sample hashes for VirusTotal Analysis
# For each cluster, it will pick a random sample
def getSampleHashes(df, name, clusters, size, seed=None):
    """Pick one random sample hash per cluster and save the picks to a CSV.

    Parameters
    ----------
    df : pandas.DataFrame
        Clustered samples. The first column is read as the sample hash and
        the ``cluster`` column holds each row's cluster id.
    name : str
        Label inserted into the output file name.
    clusters : object
        Unused; kept so existing calls keep working.
    size : int
        Number of clusters; ids ``0 .. size-1`` are visited.
    seed : int, optional
        Seed for a private random generator, for reproducible picks. When
        omitted, the shared ``random`` module state is used as before.

    Returns
    -------
    list
        One ``[cluster_id, hash]`` pair per non-empty cluster. The same pairs
        are written to ``sampleHashes_{name}_{size}.csv``.
    """
    rng = random if seed is None else random.Random(seed)
    sampleHashes = []
    for cluster_id in range(size):
        # Filter once per cluster and reuse the result
        members = df[df['cluster'] == cluster_id]
        if members.empty:
            # Nothing to sample; randint(0, -1) would raise ValueError
            continue
        pick = rng.randint(0, len(members) - 1)
        # .iat is strictly positional (row `pick`, column 0). Indexing a
        # label-indexed row with [0] relies on a positional fallback that
        # pandas has deprecated.
        sampleHashes.append([cluster_id, members.iat[pick, 0]])
    pd.DataFrame(sampleHashes, columns =['cluster', 'hash']).to_csv(f"sampleHashes_{name}_{size}.csv", index=False)
    return sampleHashes

Microsoft recognizes [15](https://www.microsoft.com/en-us/security/business/security-101/what-is-malware#types-of-malware) common types of malware. This can be used as a reference for the maximum number of clusters to create.

In [4]:
# One seed for both KMeans and the hash sampling, so re-running the cell
# reproduces the same output files.
SEED = 1

# loadDF() returns the malware-only rows of the CSV. Taking an explicit copy
# means the 'cluster' column added below goes onto an independent frame
# rather than onto a filtered slice.
df = loadDF().copy()
X = getX(df)

clusters = [10,20,30,40,50,60,70,80,90,100] #Let's assume that there are up to 100 clusters that can be derived from the Oliveira dataset.
bestClusterSize = searchK(clusters, KMeans(init='k-means++', n_init='auto', verbose=0, random_state=SEED), X)

# Refit with the selected cluster count and label every sample
kmeans = KMeans(n_clusters=bestClusterSize, init='k-means++', n_init='auto', verbose=0, random_state=SEED)
kmeans.fit(X)
df['cluster'] = kmeans.predict(X)

# Turn encoded API calls back into names before saving
df = inverse_label(df)
df.to_csv(f"Cluster_KMeans_{bestClusterSize:.0f}.csv", index=False)

# getSampleHashes draws from the `random` module; seed it so the picked
# hashes are the same on every run.
random.seed(SEED)
getSampleHashes(df, "KMeans", clusters, bestClusterSize)

In [None]:
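# AgglomerativeClustering is highly unstable and requires a lot of memory for the input dataset. Hence, while ideal, it is not feasible on the current hardware, and this cell is left disabled.
# NOTE: before re-enabling, make `clusters` a list of candidate counts (e.g. [13]); searchK iterates over it and cannot take a bare integer.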
# df = loadDF()
# X = getX(df)

# clusters = 13
# bestClusterSize = searchK(clusters, AgglomerativeClustering(compute_full_tree='auto', linkage='ward'), X)
# ac = AgglomerativeClustering(n_clusters=bestClusterSize, compute_full_tree='auto', linkage='ward')
# df['cluster'] = ac.fit_predict(X)
# df = inverse_label(df)
# df.to_csv(f"Cluster_AgglomerativeClustering_{bestClusterSize:.0f}.csv", index=False)
# getSampleHashes(df, "AgglomerativeClustering", clusters, bestClusterSize)