# Malicious False Labels

Describes/summarizes the or set of clusters. Useful for when you aim to enlist those samples (and its API calls) that were falsely labelled as malcious as per the third-party verification tool, VirusTotal.

Note that this notebook only makes use of the verified xxxx_SampleHash_Common.csv file which represents a significant majority of the entire Oliveira dataset.  

## Import Libraries/Datasets

In [1]:
import pandas as pd

malicious_df = pd.read_csv('./Manual_DBSCAN_Encoded_Clustering_w_FalseLabels.csv', low_memory=False) #This should point to a verified <DataClustering>_SampleHash_Common.csv file
benign_df = pd.read_csv('./API_Patterns_Benign.csv', low_memory=False) #This should point to the API_Patterns.csv file

#DROP ROWS WITH NA
import numpy as nan
malicious_df.dropna(inplace=True, subset=['type'])
display(malicious_df['type'].unique())

#Load list of API calls
API_LIST = "../api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

def get_unique_clusters(df:pd.DataFrame):
    return list(df['cluster'].unique())

array(['trojan', 'pua', 'downloader', 'adware', 'hacktool', '_', 'miner',
       'virus', 'spyware', 'ransomware', 'dropper', 'worm'], dtype=object)

## DataFrame Preview

In [2]:
#Replace '-' empty malware type delimiter with '_' for consistency
malicious_df.replace(to_replace='-',value='_', inplace=True)
# display(malicious_df['cluster'].value_counts())
display(malicious_df)

Unnamed: 0,hash,t_0,t_1,t_2,t_3,t_4,t_5,t_6,t_7,t_8,...,t_94,t_95,t_96,t_97,t_98,t_99,malware,type,pattern,cluster
0,071e8c3f8922e186e57548cd4c703a5d,RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,NtQueryAttributesFile,LoadStringA,...,NtMapViewOfSection,NtClose,GetSystemMetrics,NtAllocateVirtualMemory,CreateActCtxW,GetSystemWindowsDirectoryW,1,trojan,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos...",0
1,33f8e6d08a6aae939f25a8e0d63dd523,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,...,GetSystemWindowsDirectoryW,NtCreateFile,NtCreateSection,NtMapViewOfSection,NtClose,GetSystemMetrics,1,pua,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor...",1
2,b68abd064e975e1c6d5f25e748663076,SetUnhandledExceptionFilter,OleInitialize,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,...,RegCloseKey,RegOpenKeyExA,RegQueryValueExA,RegCloseKey,RegEnumKeyExA,RegOpenKeyExA,1,trojan,"SetUnhandledExceptionFilter,OleInitialize,LdrL...",2
3,72049be7bd30ea61297ea624ae198067,GetSystemTimeAsFileTime,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrGetDllHandle,...,NtWriteVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,NtWriteVirtualMemory,NtProtectVirtualMemory,NtWriteVirtualMemory,1,trojan,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor...",3
4,c9b3700a77facf29172f32df6bc77f48,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegEnumKeyExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce...",4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42792,e3d6d58faa040f0f9742c9d0eaf58be4,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce...",19
42793,9b917bab7f32188ae40c744f2be9aaf8,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,NtDelayExecution,EnumWindows,GetSystemTimeAsFileTime,NtDelayExecution,EnumWindows,GetSystemTimeAsFileTime,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce...",12595
42794,35a18ee05f75f04912018d9f462cb990,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce...",12596
42795,654139d715abcf7ecdddbef5a84f224b,GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,LdrGetProcedureAddress,...,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,RegOpenKeyExW,RegQueryValueExW,1,trojan,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce...",19


## How many are falsely labelled samples from the verified samples?

In [3]:
def count_false_samples():
    false_labelled = malicious_df[(malicious_df['type']=='_')].copy(deep=True)
    combined = []
    for c in false_labelled['cluster'].unique():
        combined.append([c,false_labelled[false_labelled['cluster'] == c]['hash'].shape[0], malicious_df[malicious_df['cluster'] == c]['hash'].shape[0], f"{false_labelled[false_labelled['cluster'] == c]['hash'].shape[0]/malicious_df[malicious_df['cluster'] == c]['hash'].shape[0]:.4f}"])
    false_labelled = pd.DataFrame(data=combined, columns=['cluster', 'false_labelled', 'total_samples', 'percentage'])
    false_labelled.sort_values('percentage', inplace=True, ascending=False)
    false_labelled['percentage'] = false_labelled['percentage'].astype('float')
    print(f"No. of falsely labelled samples verified samples: {false_labelled.sum().iloc[1]} ({false_labelled.sum().iloc[1]/malicious_df.shape[0]*100:.4f}%)\n")
    print(f"No. of clusters with purely falsely labelled verified samples: {false_labelled[false_labelled['percentage'] == 1]['cluster'].count()} ({false_labelled[false_labelled['percentage'] == 1]['cluster'].count()/600*100:.4f}%)\n")
    print("")
    display(false_labelled)
    false_labelled.to_csv("./Output/2 Falsely_Labelled.csv", index=False)
    return false_labelled

false_labelled = count_false_samples()

No. of falsely labelled samples verified samples: 2638.0 (6.1640%)

No. of clusters with purely falsely labelled verified samples: 1105 (184.1667%)




Unnamed: 0,cluster,false_labelled,total_samples,percentage
0,14,2,2,1.0000
846,8487,1,1,1.0000
854,8584,1,1,1.0000
853,8564,1,1,1.0000
852,8541,1,1,1.0000
...,...,...,...,...
991,9,2,1116,0.0018
996,21,1,576,0.0017
986,39,1,802,0.0012
974,105,1,1094,0.0009


## Does the presented API Call Patterns match those from the API Call Patterns of those Benign samples?

In [4]:
def matching_api_in_benign():
    unique_false_patterns = list(malicious_df[malicious_df['type'] == '_']['pattern'])
    ctr = 1
    same = []
    str_output = "Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples" + "\n"
    for f in unique_false_patterns:
        if benign_df[benign_df['pattern']==f].shape[0]>0 and f not in same:
            str_output += f"\nPATTERN: {ctr}\n"
            str_output += "API Call Pattern: "
            str_output += f"{f}\n"
            # str_output += "API Calls: \n"
            # str_output += f"\t {list(pd.Series(list(f.split(","))).unique())}\n"
            # str_output += "Clusters & Hashes of Matching Falsely Labelled Samples: \n"
    #         for p in range(false_labelled[false_labelled['pattern']==f].shape[0]):
    #             str_output += f"\tCluster: {false_labelled['cluster'].iloc[p]:3d} - Hash: {false_labelled['hash'].iloc[p]} ({false_labelled['type'].iloc[p]}, {false_labelled['Type 2'].iloc[p]}, {false_labelled['Type 3'].iloc[p]})\n"
            str_output += "\nHashes of Benign Samples with Matching API Call Patterns:\n"
            for p in range(benign_df[benign_df['pattern']==f].shape[0]):
                str_output += f"\t{benign_df[benign_df['pattern']==f]['hash'].iloc[p]}\n"
            same.append(f)
            ctr+=1
    str_output += "\n"
    str_output += f"No. of API Call Patterns of Falsely-Labelled Malicious Samples that match the API Call Patterns of Benign Samples: {len(same)} ({len(same)/benign_df.shape[0]*100:.4f}%)\n\n"
    str_output += "===================================================================================================================\n\n"
    str_output += "In terms of unique API Calls:\n"
    for i, s in enumerate(same):
        str_output += f"PATTERN: {i+1}\n{list(pd.Series(s.split(',')).unique())}\n\n"

    with open("./Output/2 Falsely_Labelled_Matching_Benign.txt", 'w') as f:
        f.write(str_output)
        f.flush()
        f.close()
    print(str_output)

matching_api_in_benign()

Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples

PATTERN: 1
API Call Pattern: GetSystemTimeAsFileTime,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKeyEx,NtQueryKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumValueW,LdrGetProcedureAddress,RegCloseKey,GetFileAttributesW,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,NtOpenFile,NtQueryDirectoryFile,NtClose,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumValueW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtOpenFile,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtOpenFile,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,GetSystemTimeAsFileTime,NtQuerySystemInformation,NtProtec