# Malicious False Labels

Describes/summarizes a cluster or a set of clusters. Useful when you want to list the samples (and their API calls) that were falsely labelled as malicious according to the third-party verification tool, VirusTotal.

Note that this notebook only makes use of the verified `xxxx_SampleHash_Common.csv` file, which represents a significant majority of the entire Oliveira dataset.  

## Import Libraries/Datasets

In [32]:
import pandas as pd

# NOTE(review): this binds the numpy module to the name `nan`; nothing in the
# cells shown here reads that name.
import numpy as nan

# Labelled malicious samples. This should point to a verified
# <DataClustering>_SampleHash_Common.csv file. Rows without a primary label
# ('Type 1') are discarded right after loading.
malicious_df = pd.read_csv(
    './Labelled_Manual_DBSCAN_SampleHash_Common.csv',
    low_memory=False,
).dropna(subset=['Type 1'])

# API call patterns of the benign samples. This should point to the
# API_Patterns.csv file.
benign_df = pd.read_csv('./API_Patterns_Benign.csv', low_memory=False)

# Evaluated for inspection only: the result is neither assigned nor passed on.
malicious_df['Type 1'].unique()

# Load the list of API calls from the first line of a comma-separated text file.
API_LIST = "../api_calls.txt"
DELIMITER = "NaN"
# `with` guarantees the handle is closed even if reading or splitting fails.
with open(API_LIST, "r") as API_FILE:
    # readline() keeps the line terminator; strip it so the last API name
    # does not end in "\n".
    APIS = API_FILE.readline().rstrip("\r\n").split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets

def get_unique_clusters(df:pd.DataFrame):
    """Return the distinct values of ``df['cluster']`` as a Python list.

    Parameters
    ----------
    df : pd.DataFrame
        Frame that has a ``cluster`` column.

    Returns
    -------
    list
        One entry per distinct cluster value, in the order produced by
        ``Series.unique()``.
    """
    return [cluster_id for cluster_id in df['cluster'].unique()]

## DataFrame Preview

In [33]:
# Normalise the empty malware-type marker for consistency: every cell whose
# value is exactly '-' becomes '_'.
malicious_df = malicious_df.replace('-', '_')

# Show the number of samples per cluster, then the frame itself.
display(malicious_df['cluster'].value_counts(), malicious_df)

cluster
-1      9269
 0      2152
 59      729
 5       709
 14      700
        ... 
 267       1
 110       1
 532       1
 470       1
 597       1
Name: count, Length: 599, dtype: int64

Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,071e8c3f8922e186e57548cd4c703a5d,trojan,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,1,33f8e6d08a6aae939f25a8e0d63dd523,pua,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,2,b68abd064e975e1c6d5f25e748663076,trojan,_,_,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,-1,72049be7bd30ea61297ea624ae198067,trojan,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,-1,c9b3700a77facf29172f32df6bc77f48,trojan,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
42792,11,e3d6d58faa040f0f9742c9d0eaf58be4,trojan,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42793,-1,9b917bab7f32188ae40c744f2be9aaf8,trojan,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42794,-1,35a18ee05f75f04912018d9f462cb990,trojan,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
42795,11,654139d715abcf7ecdddbef5a84f224b,trojan,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


## How many of the verified samples are falsely labelled?

In [34]:
def count_false_samples(df=None, total_clusters=600, output_path="./Output/2 Falsely_Labelled.csv"):
    """Summarise, per cluster, how many samples have the empty label ``'_'`` in ``Type 1``.

    A sample is treated as falsely labelled when its ``Type 1`` value is ``'_'``.
    Only clusters containing at least one such sample appear in the result.

    Parameters
    ----------
    df : pd.DataFrame, optional
        Frame with ``cluster``, ``hash`` and ``Type 1`` columns. Defaults to
        the notebook-level ``malicious_df``.
    total_clusters : int, optional
        Denominator for the share of clusters that are purely falsely
        labelled. Defaults to 600, the value previously hard-coded.
    output_path : str, optional
        Where the summary is written as CSV (without the index). The parent
        directory is created if it does not exist.

    Returns
    -------
    pd.DataFrame
        Columns ``cluster``, ``false_labelled``, ``total_samples`` and
        ``percentage`` (ratio rounded to 4 decimals), sorted by ``percentage``
        in descending order.

    Side effects: prints two summary lines, displays the frame and writes the CSV.
    """
    import os  # local import: keeps this cell independent of the import cell

    samples = malicious_df if df is None else df

    # Count once per cluster instead of re-filtering the whole frame per cluster.
    false_rows = samples[samples['Type 1'] == '_']
    clusters = false_rows['cluster'].unique()
    false_counts = false_rows['cluster'].value_counts().reindex(clusters)
    total_counts = samples['cluster'].value_counts().reindex(clusters)

    summary = pd.DataFrame({
        'cluster': clusters,
        'false_labelled': false_counts.to_numpy(),
        'total_samples': total_counts.to_numpy(),
    })
    # Round through a 4-decimal string so the stored values are unchanged
    # from the previous formatting-based implementation.
    ratios = summary['false_labelled'] / summary['total_samples']
    summary['percentage'] = ratios.map(lambda ratio: float(f"{ratio:.4f}"))
    # Sort on the numeric column; a stable sort keeps ties in first-seen order.
    summary = summary.sort_values('percentage', ascending=False, kind='stable')

    # Address the column by name rather than by position in a sum over all columns.
    n_false = int(summary['false_labelled'].sum())
    n_samples = len(samples)
    false_share = n_false / n_samples * 100 if n_samples else 0.0

    # Compare raw counts: a rounded ratio could reach 1.0000 without being exactly 1.
    n_pure = int((summary['false_labelled'] == summary['total_samples']).sum())
    pure_share = n_pure / total_clusters * 100 if total_clusters else 0.0

    print(f"No. of falsely labelled samples verified samples: {n_false} ({false_share:.4f}%)\n")
    print(f"No. of clusters with purely falsely labelled verified samples: {n_pure} ({pure_share:.4f}%)\n")
    print("")
    display(summary)

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    summary.to_csv(output_path, index=False)
    return summary

# Summarise the falsely labelled samples of the current malicious_df.
false_labelled = count_false_samples()

No. of falsely labelled samples verified samples: 1673.0 (5.8965%)

No. of clusters with purely falsely labelled verified samples: 15 (2.5000%)




Unnamed: 0,cluster,false_labelled,total_samples,percentage
32,270,4,4,1.0000
54,376,3,3,1.0000
80,517,5,5,1.0000
40,297,11,11,1.0000
78,508,4,4,1.0000
...,...,...,...,...
36,14,10,700,0.0143
107,74,1,87,0.0115
108,3,1,113,0.0088
53,81,2,560,0.0036


## Do the API call patterns of the falsely labelled samples match those of the benign samples?

In [35]:
def matching_api_in_benign(malicious=None, benign=None, output_path="./Output/2 Falsely_Labelled_Matching_Benign.txt"):
    """Report falsely labelled patterns that also occur among the benign samples.

    A malicious sample is treated as falsely labelled when its ``Type 1``
    value is ``'_'``. Each distinct ``pattern`` of such samples that is equal
    to the ``pattern`` of at least one benign sample is listed once, with the
    hashes of the matching benign samples. A second section lists the unique
    API calls of every matched pattern.

    Parameters
    ----------
    malicious : pd.DataFrame, optional
        Frame with ``Type 1`` and ``pattern`` columns. Defaults to the
        notebook-level ``malicious_df``.
    benign : pd.DataFrame, optional
        Frame with ``pattern`` and ``hash`` columns. Defaults to the
        notebook-level ``benign_df``.
    output_path : str, optional
        Text file the report is written to. The parent directory is created
        if it does not exist.

    Returns
    -------
    None
        The report is written to ``output_path`` and printed.
    """
    import os  # local import: keeps this cell independent of the import cell

    malicious = malicious_df if malicious is None else malicious
    benign = benign_df if benign is None else benign

    # Index the benign hashes by pattern once, in row order, instead of
    # re-filtering the benign frame for every candidate pattern. Rows without
    # a pattern can never match and are skipped.
    benign_hashes = {}
    known_benign = benign.dropna(subset=['pattern'])
    for pattern, sample_hash in zip(known_benign['pattern'], known_benign['hash']):
        benign_hashes.setdefault(pattern, []).append(sample_hash)

    # Distinct matching patterns, in order of first appearance.
    false_patterns = malicious.loc[malicious['Type 1'] == '_', 'pattern']
    matched = list(dict.fromkeys(p for p in false_patterns if p in benign_hashes))

    parts = ["Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples" + "\n"]
    for number, pattern in enumerate(matched, start=1):
        parts.append(f"\nPATTERN: {number}\n")
        parts.append("API Call Pattern: ")
        parts.append(f"{pattern}\n")
        parts.append("Hashes of Benign Samples with Matching API Call Patterns:\n")
        parts.extend(f"\t{sample_hash}\n" for sample_hash in benign_hashes[pattern])

    # The share is relative to the number of benign samples; guard the empty frame.
    benign_total = benign.shape[0]
    share = len(matched) / benign_total * 100 if benign_total else 0.0
    parts.append("\n")
    parts.append(f"No. of API Call Patterns of Falsely-Labelled Malicious Samples that match the API Call Patterns of Benign Samples: {len(matched)} ({share:.4f}%)\n\n")
    parts.append("===================================================================================================================\n\n")
    parts.append("In terms of unique API Calls:\n")
    for number, pattern in enumerate(matched, start=1):
        # dict.fromkeys de-duplicates while keeping first-appearance order.
        parts.append(f"PATTERN: {number}\n{list(dict.fromkeys(pattern.split(',')))}\n\n")
    str_output = "".join(parts)

    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # The context manager flushes and closes the file on exit.
    with open(output_path, 'w') as report_file:
        report_file.write(str_output)
    print(str_output)

# Print and save the report of falsely labelled patterns that also occur in the benign samples.
matching_api_in_benign()

Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples

PATTERN: 1
API Call Pattern: GetSystemTimeAsFileTime,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKeyEx,NtQueryKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumValueW,LdrGetProcedureAddress,RegCloseKey,GetFileAttributesW,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,NtOpenFile,NtQueryDirectoryFile,NtClose,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumValueW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtOpenFile,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtOpenFile,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,GetSystemTimeAsFileTime,NtQuerySystemInformation,NtProtec

# What if we removed samples that fall under NOISY CLUSTERS?

In [36]:
# A cluster counts as NOISE when its samples carry more than one distinct
# 'Type 1' value. Collect those cluster ids first, then remove all of their
# rows from malicious_df in a single drop.
unique_clusters = get_unique_clusters(malicious_df)
noisy_clusters = [
    cluster_id
    for cluster_id in unique_clusters
    if malicious_df.loc[malicious_df['cluster'] == cluster_id, 'Type 1'].nunique(dropna=False) > 1
]
noisy_rows = malicious_df.index[malicious_df['cluster'].isin(noisy_clusters)]
malicious_df = malicious_df.drop(noisy_rows)
malicious_df.head()

Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
2,2,b68abd064e975e1c6d5f25e748663076,trojan,_,_,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
11,7,e7ac6a2de45506164777941faf953094,trojan,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
18,9,8220417d83184f62f5734a0c1d140e89,trojan,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
19,10,a36c063345128d22bf1dd003af2eb36d,trojan,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
27,15,d6fa2fa31148f6c4a2a1d9837dcda3b4,trojan,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."


In [37]:
# Recompute the summary on the filtered malicious_df; this writes to the same CSV path as the earlier call.
false_labelled = count_false_samples()

No. of falsely labelled samples verified samples: 97.0 (2.6124%)

No. of clusters with purely falsely labelled verified samples: 15 (2.5000%)




Unnamed: 0,cluster,false_labelled,total_samples,percentage
0,162,31,31,1.0
1,224,2,2,1.0
2,270,4,4,1.0
3,281,4,4,1.0
4,297,11,11,1.0
5,376,3,3,1.0
6,377,5,5,1.0
7,422,7,7,1.0
8,435,8,8,1.0
9,491,4,4,1.0


In [38]:
# Repeat the benign-pattern comparison on the filtered malicious_df; this writes to the same report path as the earlier call.
matching_api_in_benign()

Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples

No. of API Call Patterns of Falsely-Labelled Malicious Samples that match the API Call Patterns of Benign Samples: 0 (0.0000%)


In terms of unique API Calls:

