# Clustering Demo (Benign)

**Note:** Make sure that `oliveira.csv` and `api_calls.txt` are in the same directory as this notebook, and that the `Clustering/Benign/` output directory already exists.

**Note:** No clustering is performed here because this notebook only handles the benign samples. It still needs to be executed, because it creates `Clustering/Benign/API_Patterns.csv`, which is used later as part of Dataset Analysis.

In [2]:
import pandas as pd

def list_to_str(ls:list):
    '''Convert list to a stringified version (comma delimited).

    Every element is converted with ``str()`` and the pieces are joined by a
    single comma, with no trailing comma. An empty input yields ``""``.
    '''
    # str.join assembles the result in one pass instead of growing a string
    # element by element and trimming the trailing comma afterwards.
    return ",".join(str(item) for item in ls)

def load_df(path="oliveira.csv"):
    '''Load the dataset file (CSV) as DataFrame

    Reads the CSV at `path` (default: "oliveira.csv"), keeps only the rows
    whose 'malware' column equals 0, drops that column, and returns the
    remaining rows with a fresh 0..n-1 index.
    '''
    print("Loading DF...")
    df = pd.read_csv(path, low_memory=False, memory_map=True)
    benign = df.loc[df['malware'] == 0].drop(columns='malware')
    print("")
    # drop=True discards the old index directly instead of materialising it as
    # a column and slicing it away again.
    return benign.reset_index(drop=True)

def get_x(df:pd.DataFrame):
    '''Return the feature columns of the DataFrame (column positions 1 through 100).'''
    first_feature = 1
    stop = 101  # exclusive upper bound of the positional slice
    return df.iloc[:, first_feature:stop]

#Load list of API calls
API_LIST = "api_calls.txt"
DELIMITER = "NaN"
# The context manager closes the file even if reading or splitting raises.
with open(API_LIST, "r") as API_FILE:
    # Only the first line of the file is read and split on commas.
    # NOTE(review): readline() keeps a trailing newline, so the last API name may
    # end in "\n" if the file's first line is newline-terminated — confirm.
    APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets

#Random Seed
seed = 1

#Inverse Label Encoding
def inverse_labeller(item):
    '''Map every value in `item` to the entry of the module-level APIS list at that integer index.'''
    def to_api_name(code):
        return APIS[int(code)]

    return item.map(to_api_name)
def inverse_label(df:pd.DataFrame):
    '''Replace the encoded API-call columns of `df` with their API names.

    The columns at positions 1..100 are decoded row by row through
    `inverse_labeller`. `df` itself is modified and is also returned. The frame
    must still hold integer codes: running this twice on the same frame fails,
    because `inverse_labeller` calls int() on every value.
    '''
    print("Inverse Labelling...")
    # Assign whole columns by label rather than writing strings through .iloc
    # into the existing numeric columns; newer pandas versions deprecate that
    # kind of incompatible-dtype in-place assignment.
    api_cols = df.columns[1:101]
    df[api_cols] = df.iloc[:, 1:101].apply(inverse_labeller, axis=1, result_type='reduce')
    print("")
    return df

From: https://medium.com/swlh/k-means-clustering-on-high-dimensional-data-d2151e1a4240

# Load Dataset

In [None]:
# Load the rows with malware == 0, then decode their API-call columns to API names.
# inverse_label() modifies the frame it is given and returns that same object,
# so `df` and `inverse_labelled_df` both refer to the same, already-decoded DataFrame.
df = load_df()
inverse_labelled_df = inverse_label(df)

# Create `Benign/API_Patterns.csv`

In [4]:
# Build one comma-delimited API-call pattern string per sample and export the result.
print("Injecting API patterns...")
# The API-call columns were already decoded to names when the dataset was loaded;
# pull them out in one go instead of slicing the frame row by row.
api_rows = inverse_labelled_df.iloc[:, 1:101].values.tolist()
patterns = [list_to_str(api_row) for api_row in api_rows]
inverse_labelled_df['pattern'] = patterns
# Write the decoded frame directly: calling inverse_label() again here would
# apply int() to API names that are no longer numeric codes and raise ValueError.
inverse_labelled_df.to_csv("Clustering/Benign/API_Patterns.csv", index=False)

Loading DF...

Inverse Labelling...

Injecting API patterns...

Inverse Labelling...

