# Malicious False Labels

Describes/summarizes a cluster or set of clusters. Useful when you want to list the samples (and their API calls) that were falsely labelled as malicious according to the third-party verification tool, VirusTotal.

Note that this notebook only makes use of the verified xxxx_SampleHash_Common.csv file which represents a significant majority of the entire Oliveira dataset.  

## Import Libraries/Datasets

In [13]:
import pandas as pd

# Verified cluster assignments; this should point to a verified <DataClustering>_SampleHash_Common.csv file.
malicious_df = pd.read_csv('./Converted_(EDITED) DBSCAN_SampleHash_Common.csv', low_memory=False)
# Benign API call patterns; this should point to the API_Patterns.csv file.
benign_df = pd.read_csv('./API_Patterns.csv', low_memory=False)

# Load list of API calls (first line of the file, comma-separated).
API_LIST = "../api_calls.txt"
DELIMITER = "NaN"
# The context manager closes the file even if reading raises.
with open(API_LIST, "r") as API_FILE:
    # Drop the line terminator so the last API name does not carry a trailing newline.
    APIS = API_FILE.readline().rstrip("\r\n").split(',')
APIS.append(DELIMITER)  # serves as a label for NaN values for Instance-based datasets

def get_unique_clusters(df: pd.DataFrame) -> list:
    """Return the distinct values of the ``cluster`` column of ``df``.

    Values are listed in the order in which they first appear in the column.
    """
    cluster_labels = df['cluster'].unique()
    return [label for label in cluster_labels]

## DataFrame Preview

In [14]:
# Use '_' instead of '-' as the empty malware-type marker, for consistency.
# Only cells whose whole value is '-' are rewritten; the frame is modified in place.
malicious_df.replace('-', '_', inplace=True)
malicious_df

Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,-1,5e1f079fc9130cd508568da3cb0b219a,adware,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1,-1,2b05809d67062f0af9fec37f33d1b338,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
2,-1,e8a9d42e07c25d00fcc56170e66071fd,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
3,-1,01e2cd4d45e8bc2608f3519a653d3a63,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
4,-1,d93b214c093a9f1e07248962aeb74fc8,trojan,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
...,...,...,...,...,...,...
1490,297,05b379055a79c5e47bdabec418190ac7,trojan,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1491,297,d8c65468405b789c56754336c1f8911b,trojan,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1492,297,4b58a7c885df8e86be4769fd949d2c37,trojan,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1493,297,a4200ec0b146d8a0d37e90e32c674780,trojan,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


In [15]:
# Number of verified samples in each cluster.
malicious_df.loc[:, 'cluster'].value_counts()

cluster
-1      5
 204    5
 202    5
 201    5
 200    5
       ..
 98     5
 97     5
 96     5
 95     5
 297    5
Name: count, Length: 299, dtype: int64

## How many of the verified samples are falsely labelled?

In [16]:
# Rows whose primary malware type is the empty marker '_' are treated as falsely labelled.
false_rows = malicious_df[malicious_df['Type 1'] == '_']

# Count rows per cluster once, instead of re-filtering both frames for every cluster.
false_counts = false_rows['cluster'].value_counts(sort=False)
total_counts = malicious_df['cluster'].value_counts(sort=False)

# One record per cluster that contains at least one falsely labelled sample:
# [cluster, falsely labelled count, cluster size, ratio formatted to 4 decimals].
combined = []
for c in false_rows['cluster'].unique():
    n_false = int(false_counts.loc[c])
    n_total = int(total_counts.loc[c])
    combined.append([c, n_false, n_total, f"{n_false / n_total:.4f}"])

# 'percentage' is kept as a fixed-width string (as written to the CSV); for ratios in [0, 1]
# the fixed width makes string order agree with numeric order. A stable sort keeps ties in
# first-seen cluster order so repeated runs give the same table.
false_labelled = pd.DataFrame(
    data=combined,
    columns=['cluster', 'false_labelled', 'total_samples', 'percentage'],
).sort_values('percentage', ascending=False, kind='stable')

# Sum the named column rather than summing every column and picking one by position.
total_false = int(false_labelled['false_labelled'].sum())
total_rows = malicious_df.shape[0]
overall_pct = total_false / total_rows * 100 if total_rows else 0.0

print(f"No. of falsely labelled samples from verified samples: {total_false} ({overall_pct:.4f}%)\n\n")
display(false_labelled)
false_labelled.to_csv("./Output/2 Falsely_Labelled.csv", index=False)

No. of falsely labelled samples from verified samples: 120 (8.0268%)




Unnamed: 0,cluster,false_labelled,total_samples,percentage
34,242,5,5,1.0
27,202,5,5,1.0
41,271,5,5,1.0
40,267,5,5,1.0
30,215,5,5,1.0
36,248,5,5,1.0
15,136,5,5,1.0
11,107,5,5,1.0
12,109,4,5,0.8
22,172,4,5,0.8


## Do the presented API call patterns match the API call patterns of the benign samples?

**Note:** The samples labelled as benign in Oliveira came from Win7 executables, so they are guaranteed to be truly benign and are therefore safe to use as the basis for comparison.

In [17]:
# Patterns of all falsely labelled samples (empty 'Type 1'). Despite the name, this list may
# contain repeats; de-duplication happens in the loop below.
unique_false_patterns = list(malicious_df[malicious_df['Type 1'] == '_']['pattern'])

# Index benign hashes by pattern once, so each lookup below does not re-filter benign_df.
# groupby drops missing patterns, so a NaN pattern never matches.
benign_hashes_by_pattern = (
    benign_df.groupby('pattern', sort=False)['hash'].agg(list).to_dict()
)

str_output = "Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples" + "\n"
same = []      # matched patterns, in first-seen order
seen = set()   # constant-time membership check for `same`
ctr = 1
for pattern in unique_false_patterns:
    if pattern in seen or pattern not in benign_hashes_by_pattern:
        continue
    str_output += f"\nPATTERN: {ctr}\n"
    str_output += "API Call Pattern: "
    str_output += f"{pattern}\n"
    str_output += "Hashes of Benign Samples with Matching API Call Patterns:\n"
    for benign_hash in benign_hashes_by_pattern[pattern]:
        str_output += f"\t{benign_hash}\n"
    seen.add(pattern)
    same.append(pattern)
    ctr += 1

# NOTE(review): the percentage is relative to the number of rows in benign_df, not to the
# number of falsely labelled patterns - confirm this is the intended denominator.
benign_rows = benign_df.shape[0]
match_pct = len(same) / benign_rows * 100 if benign_rows else 0.0

str_output += "\n"
str_output += f"No. of API Call Patterns of Falsely-Labelled Malicious Samples that match the API Call Patterns of Benign Samples: {len(same)} ({match_pct:.4f}%)\n\n"
str_output += "===================================================================================================================\n\n"
str_output += "In terms of unique API Calls:\n"
for i, s in enumerate(same):
    # dict.fromkeys removes repeated API names while keeping first-seen order.
    str_output += f"PATTERN: {i+1}\n{list(dict.fromkeys(s.split(',')))}\n\n"

# The with-block flushes and closes the file on exit.
with open("./Output/2 Falsely_Labelled_Matching_Benign.txt", 'w') as report_file:
    report_file.write(str_output)
print(str_output)

Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples

PATTERN: 1
API Call Pattern: SetErrorMode,LdrGetDllHandle,LdrGetProcedureAddress,GetSystemDirectoryW,NtAllocateVirtualMemory,NtFreeVirtualMemory,NtAllocateVirtualMemory,LdrLoadDll,GetSystemDirectoryW,LdrLoadDll,GetSystemDirectoryW,GetSystemWindowsDirectoryW,GetSystemDirectoryW,RegOpenKeyExA,GetSystemWindowsDirectoryW,LoadStringW,GetSystemWindowsDirectoryW,GetSystemDirectoryW,RegOpenKeyExW,LdrLoadDll,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,NtOpenKey,NtQueryValueKey,NtClose,NtOpenKey,NtQueryValueKey,NtClose,RegCloseKey,NtCreateMutant,GetNativeSystemInfo,GetSystemWindowsDirectoryW,NtClose,LdrLoadDll,GetSystemDirectoryW,LdrLoadDll,GetSystemDirectoryW,LdrLoadDll,LdrGetProcedureAddress,LdrGetDllHandle,LdrGetProcedureAddress,LdrLoadDll,GetSystemDirectoryW,LdrLoadDll,GetSystemDirectoryW,LdrLoadDll,GetSystemDirectoryW,NtAllocateVirtualMemory,LdrGetDllHandle,LdrGetProcedu