# Malicious False Labels

Describes/summarizes a cluster or a set of clusters. Useful when you want to list the samples (and their API calls) that were falsely labelled as malicious according to the third-party verification tool, VirusTotal.

Note that this notebook only makes use of the verified xxxx_SampleHash_Common.csv file which represents a significant majority of the entire Oliveira dataset.  

## Import Libraries/Datasets

In [13]:
import pandas as pd

# Verified, cluster-labelled malicious samples.
# This should point to a verified <DataClustering>_SampleHash_Common.csv file.
malicious_df = pd.read_csv('./Labelled_Manual_DBSCAN_SampleHash_Common.csv', low_memory=False)
# API call patterns of the benign samples.
# This should point to the API_Patterns.csv file.
benign_df = pd.read_csv('./API_Patterns_Benign.csv', low_memory=False)

# NOTE(review): this binds the numpy module to the name `nan`, and nothing in this cell
# uses it. Kept only in case a later cell relies on the alias -- confirm and remove.
import numpy as nan

# Drop rows that have no 'Type 1' value. Rebinding (rather than inplace=True) keeps the
# step a plain expression; the row index is left as-is, exactly as before.
malicious_df = malicious_df.dropna(subset=['Type 1'])

#Load list of API calls
API_LIST = "../api_calls.txt"
DELIMITER = "NaN"
# The context manager closes the file even if reading fails part-way.
with open(API_LIST,"r") as API_FILE:
    # Only the first line is read and split on commas. The line terminator is stripped
    # first so that the last API name does not end in a newline character.
    APIS = API_FILE.readline().rstrip("\r\n").split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets

def get_unique_clusters(df:pd.DataFrame):
    """Return the distinct values of ``df['cluster']`` as a list.

    Values are listed in the order pandas' ``Series.unique`` yields them
    (order of first appearance).
    """
    cluster_ids = df['cluster'].unique()
    return [cluster_id for cluster_id in cluster_ids]

## DataFrame Preview

In [14]:
# Normalise the empty-malware-type placeholder for consistency:
# every cell that is exactly '-' becomes '_'.
malicious_df = malicious_df.replace('-', '_')
# Show the number of samples per cluster, followed by the frame itself.
display(malicious_df['cluster'].value_counts(), malicious_df)

cluster
-1      2230
 0       543
 14      183
 5       178
 59      162
        ... 
 290       1
 284       1
 269       1
 267       1
 560       1
Name: count, Length: 504, dtype: int64

Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,071e8c3f8922e186e57548cd4c703a5d,trojan,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,1,33f8e6d08a6aae939f25a8e0d63dd523,pua,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,2,b68abd064e975e1c6d5f25e748663076,trojan,_,_,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,-1,72049be7bd30ea61297ea624ae198067,trojan,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,-1,c9b3700a77facf29172f32df6bc77f48,trojan,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
42756,-1,75a7f99cabf09e7dcceddf9f054659ec,trojan,_,_,"GetSystemTimeAsFileTime,LdrGetDllHandle,LdrGet..."
42767,346,40f6cb4e43907e13b3edfc8aaa3bf991,trojan,_,_,"LdrLoadDll,LdrGetProcedureAddress,LdrLoadDll,L..."
42769,400,25ff42eb71b242466bad24632a1ca325,trojan,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
42771,0,ec39e45574d1cb70e9a545486743a77d,trojan,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."


## How many of the verified samples are falsely labelled?

In [15]:
# Samples whose primary malware type ('Type 1') is the '_' placeholder are treated as
# falsely labelled.
false_samples = malicious_df[malicious_df['Type 1'] == '_']

# Row counts per cluster, computed with one grouped pass per frame instead of
# re-filtering both frames for every cluster. sort=False keeps the clusters in order of
# first appearance among the false samples, so the summary keeps its original row labels.
false_per_cluster = false_samples.groupby('cluster', sort=False).size()
total_per_cluster = malicious_df.groupby('cluster').size()

false_labelled = pd.DataFrame({
    'cluster': false_per_cluster.index.to_numpy(),
    'false_labelled': false_per_cluster.to_numpy(),
    'total_samples': total_per_cluster.loc[false_per_cluster.index].to_numpy(),
})

# Fraction of each cluster that is falsely labelled. Despite the column name, this is a
# 0-1 ratio stored as a 4-decimal string, which keeps the displayed table and the CSV
# in the same format as before.
false_ratio = false_labelled['false_labelled'] / false_labelled['total_samples']
false_labelled['percentage'] = false_ratio.map("{:.4f}".format)

# Order by the numeric ratio (highest first) rather than by its text representation.
false_labelled = false_labelled.loc[false_ratio.sort_values(ascending=False, kind='stable').index]

# Sum the named column directly instead of summing every column and picking by position.
n_false = int(false_labelled['false_labelled'].sum())
print(f"No. of falsely labelled samples from verified samples: {n_false} ({n_false/malicious_df.shape[0]*100:.4f}%)\n")

print("")
display(false_labelled)
false_labelled.to_csv("./Output/2 Falsely_Labelled.csv", index=False)

No. of falsely labelled samples from verified samples: 503 (6.2091%)




Unnamed: 0,cluster,false_labelled,total_samples,percentage
44,342,1,1,1.0000
39,321,2,2,1.0000
73,537,1,1,1.0000
28,254,1,1,1.0000
30,270,2,2,1.0000
...,...,...,...,...
36,31,5,96,0.0521
80,74,1,28,0.0357
81,3,1,31,0.0323
15,27,4,127,0.0315


## Do the presented API Call Patterns match the API Call Patterns of the Benign samples?

**Note:** The samples labelled as benign in the Oliveira dataset came from Win7 executables, which means they are guaranteed to be truly benign and are therefore safe to use for comparison.

In [16]:
# Hashes of the benign samples keyed by their exact API call pattern. Building this
# lookup once replaces a full scan of benign_df for every falsely labelled sample (and
# for every hash line written). Rows with a missing pattern are left out by groupby,
# matching the original equality test, which never matched missing values.
benign_hashes_by_pattern = (
    benign_df.groupby('pattern', sort=False)['hash'].agg(list).to_dict()
)

# Patterns of every sample whose 'Type 1' is the '_' placeholder.
# Despite the name, this list may contain duplicates.
unique_false_patterns = list(malicious_df[malicious_df['Type 1'] == '_']['pattern'])

# Distinct falsely labelled patterns that also occur among the benign samples, in order
# of first appearance. dict.fromkeys removes duplicates while keeping that order.
same = [
    pattern
    for pattern in dict.fromkeys(unique_false_patterns)
    if pattern in benign_hashes_by_pattern
]

# Collect the report pieces in a list and join once at the end.
report_parts = ["Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples" + "\n"]
for pattern_no, pattern in enumerate(same, start=1):
    report_parts.append(f"\nPATTERN: {pattern_no}\n")
    report_parts.append("API Call Pattern: ")
    report_parts.append(f"{pattern}\n")
    report_parts.append("Hashes of Benign Samples with Matching API Call Patterns:\n")
    report_parts.extend(f"\t{benign_hash}\n" for benign_hash in benign_hashes_by_pattern[pattern])
report_parts.append("\n")
# NOTE(review): the percentage is taken relative to the number of rows in benign_df,
# not the number of falsely labelled patterns -- confirm this is the intended denominator.
report_parts.append(f"No. of API Call Patterns of Falsely-Labelled Malicious Samples that match the API Call Patterns of Benign Samples: {len(same)} ({len(same)/benign_df.shape[0]*100:.4f}%)\n\n")
report_parts.append("===================================================================================================================\n\n")
report_parts.append("In terms of unique API Calls:\n")
for pattern_no, pattern in enumerate(same, start=1):
    # Distinct API calls of the pattern, in order of first occurrence.
    report_parts.append(f"PATTERN: {pattern_no}\n{list(dict.fromkeys(pattern.split(',')))}\n\n")

str_output = "".join(report_parts)

# The with-block flushes and closes the file on exit; no explicit flush()/close() needed.
with open("./Output/2 Falsely_Labelled_Matching_Benign.txt", 'w') as report_file:
    report_file.write(str_output)
print(str_output)

Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples

PATTERN: 1
API Call Pattern: GetSystemTimeAsFileTime,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKeyEx,NtQueryKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumValueW,LdrGetProcedureAddress,RegCloseKey,GetFileAttributesW,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,NtOpenFile,NtQueryDirectoryFile,NtClose,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumValueW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtOpenFile,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtOpenFile,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,GetSystemTimeAsFileTime,NtQuerySystemInformation,NtProtec