# Malicious False Labels

Describes/summarizes the or set of clusters. Useful for when you aim to enlist those samples (and its API calls) that were falsely labelled as malcious as per the third-party verification tool, VirusTotal.

Note that this notebook only makes use of the verified xxxx_SampleHash_Common.csv file which represents a significant majority of the entire Oliveira dataset.  

## Import Libraries/Datasets

In [1]:
import pandas as pd

malicious_df = pd.read_csv('./Clean_Manual_DBSCAN_SampleHash_Common.csv', low_memory=False) #This should point to a verified <DataClustering>_SampleHash_Common.csv file
benign_df = pd.read_csv('./API_Patterns_Benign.csv', low_memory=False) #This should point to the API_Patterns.csv file

#DROP ROWS WITH NA
import numpy as nan
malicious_df.dropna(inplace=True, subset=['Type 1'])
malicious_df['Type 1'].unique()

#Load list of API calls
API_LIST = "../api_calls.txt"
DELIMITER = "NaN"
API_FILE = open(API_LIST,"r")
APIS = API_FILE.readline().split(',')
APIS.append(DELIMITER) #serves as a label for NaN values for Instance-based datasets
API_FILE.close()

def get_unique_clusters(df:pd.DataFrame):
    return list(df['cluster'].unique())

  from pandas.core import (


## DataFrame Preview

In [2]:
#Replace '-' empty malware type delimiter with '_' for consistency
malicious_df.replace(to_replace='-',value='_', inplace=True)
display(malicious_df['cluster'].value_counts())
display(malicious_df)

cluster
0        3307
9        1116
99       1094
21       1049
146       861
         ... 
4851        1
4852        1
4853        1
4854        1
11958       1
Name: count, Length: 11959, dtype: int64

Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,071e8c3f8922e186e57548cd4c703a5d,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,1,33f8e6d08a6aae939f25a8e0d63dd523,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,2,b68abd064e975e1c6d5f25e748663076,_,_,_,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,3,72049be7bd30ea61297ea624ae198067,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,4,c9b3700a77facf29172f32df6bc77f48,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
...,...,...,...,...,...,...
41119,17,e3d6d58faa040f0f9742c9d0eaf58be4,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41120,11957,9b917bab7f32188ae40c744f2be9aaf8,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41121,11958,35a18ee05f75f04912018d9f462cb990,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
41122,17,654139d715abcf7ecdddbef5a84f224b,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


## How many are falsely labelled samples from the verified samples?

In [3]:
def count_false_samples():
    false_labelled = malicious_df[(malicious_df['Type 1']=='_')].copy(deep=True)
    combined = []
    for c in false_labelled['cluster'].unique():
        combined.append([c,false_labelled[false_labelled['cluster'] == c]['hash'].shape[0], malicious_df[malicious_df['cluster'] == c]['hash'].shape[0], f"{false_labelled[false_labelled['cluster'] == c]['hash'].shape[0]/malicious_df[malicious_df['cluster'] == c]['hash'].shape[0]:.4f}"])
    false_labelled = pd.DataFrame(data=combined, columns=['cluster', 'false_labelled', 'total_samples', 'percentage'])
    false_labelled.sort_values('percentage', inplace=True, ascending=False)
    false_labelled['percentage'] = false_labelled['percentage'].astype('float')
    print(f"No. of falsely labelled samples verified samples: {false_labelled.sum().iloc[1]} ({false_labelled.sum().iloc[1]/malicious_df.shape[0]*100:.4f}%)\n")
    print(f"No. of clusters with purely falsely labelled verified samples: {false_labelled[false_labelled['percentage'] == 1]['cluster'].count()} ({false_labelled[false_labelled['percentage'] == 1]['cluster'].count()/600*100:.4f}%)\n")
    print("")
    display(false_labelled)
    false_labelled.to_csv("./Output/2 Falsely_Labelled.csv", index=False)
    return false_labelled

false_labelled = count_false_samples()

No. of falsely labelled samples verified samples: 41124.0 (100.0000%)

No. of clusters with purely falsely labelled verified samples: 11959 (1993.1667%)




Unnamed: 0,cluster,false_labelled,total_samples,percentage
0,0,3307,3307,1.0
7976,7976,1,1,1.0
7967,7967,1,1,1.0
7968,7968,1,1,1.0
7969,7969,1,1,1.0
...,...,...,...,...
3989,3989,2,2,1.0
3990,3990,1,1,1.0
3991,3991,1,1,1.0
3992,3992,1,1,1.0


## Does the presented API Call Patterns match those from the API Call Patterns of those Benign samples?

In [4]:
def matching_api_in_benign():
    unique_false_patterns = list(malicious_df[malicious_df['Type 1'] == '_']['pattern'])
    ctr = 1
    same = []
    str_output = "Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples" + "\n"
    for f in unique_false_patterns:
        if benign_df[benign_df['pattern']==f].shape[0]>0 and f not in same:
            str_output += f"\nPATTERN: {ctr}\n"
            str_output += "API Call Pattern: "
            str_output += f"{f}\n"
            # str_output += "API Calls: \n"
            # str_output += f"\t {list(pd.Series(list(f.split(","))).unique())}\n"
            # str_output += "Clusters & Hashes of Matching Falsely Labelled Samples: \n"
    #         for p in range(false_labelled[false_labelled['pattern']==f].shape[0]):
    #             str_output += f"\tCluster: {false_labelled['cluster'].iloc[p]:3d} - Hash: {false_labelled['hash'].iloc[p]} ({false_labelled['Type 1'].iloc[p]}, {false_labelled['Type 2'].iloc[p]}, {false_labelled['Type 3'].iloc[p]})\n"
            str_output += "Hashes of Benign Samples with Matching API Call Patterns:\n"
            for p in range(benign_df[benign_df['pattern']==f].shape[0]):
                str_output += f"\t{benign_df[benign_df['pattern']==f]['hash'].iloc[p]}\n"
            same.append(f)
            ctr+=1
    str_output += "\n"
    str_output += f"No. of API Call Patterns of Falsely-Labelled Malicious Samples that match the API Call Patterns of Benign Samples: {len(same)} ({len(same)/benign_df.shape[0]*100:.4f}%)\n\n"
    str_output += "===================================================================================================================\n\n"
    str_output += "In terms of unique API Calls:\n"
    for i, s in enumerate(same):
        str_output += f"PATTERN: {i+1}\n{list(pd.Series(s.split(',')).unique())}\n\n"

    with open("./Output/2 Falsely_Labelled_Matching_Benign.txt", 'w') as f:
        f.write(str_output)
        f.flush()
        f.close()
    print(str_output)

matching_api_in_benign()

Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples

PATTERN: 1
API Call Pattern: GetSystemTimeAsFileTime,GetSystemInfo,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumValueW,LdrGetProcedureAddress,RegCloseKey,GetFileAttributesW,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryValueExW,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumValueW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,LdrGetDllHandle,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtQuerySystemInformation,NtProt

# What if we removed samples that fall under NOISY CLUSTERS

In [5]:
# DROP CLUSTERS WITH MORE THAN 1 MALWARE TYPE UNDER IT (i.e. NOISE)
unique_clusters = get_unique_clusters(malicious_df)
for u in unique_clusters:
    if len(malicious_df[malicious_df['cluster'] == u]['Type 1'].unique()) > 1:
        malicious_df = malicious_df.drop(malicious_df[malicious_df['cluster'] == u].index)
malicious_df.head()

Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,071e8c3f8922e186e57548cd4c703a5d,_,_,_,"RegOpenKeyExA,NtOpenKey,NtQueryValueKey,NtClos..."
1,1,33f8e6d08a6aae939f25a8e0d63dd523,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
2,2,b68abd064e975e1c6d5f25e748663076,_,_,_,"SetUnhandledExceptionFilter,OleInitialize,LdrL..."
3,3,72049be7bd30ea61297ea624ae198067,_,_,_,"GetSystemTimeAsFileTime,NtAllocateVirtualMemor..."
4,4,c9b3700a77facf29172f32df6bc77f48,_,_,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."


In [6]:
false_labelled = count_false_samples()

No. of falsely labelled samples verified samples: 41124.0 (100.0000%)

No. of clusters with purely falsely labelled verified samples: 11959 (1993.1667%)




Unnamed: 0,cluster,false_labelled,total_samples,percentage
0,0,3307,3307,1.0
7976,7976,1,1,1.0
7967,7967,1,1,1.0
7968,7968,1,1,1.0
7969,7969,1,1,1.0
...,...,...,...,...
3989,3989,2,2,1.0
3990,3990,1,1,1.0
3991,3991,1,1,1.0
3992,3992,1,1,1.0


In [7]:
matching_api_in_benign()

Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples

PATTERN: 1
API Call Pattern: GetSystemTimeAsFileTime,GetSystemInfo,NtCreateMutant,GetSystemTimeAsFileTime,NtOpenKey,NtOpenKeyEx,LdrLoadDll,LdrGetProcedureAddress,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumKeyExW,RegOpenKeyExW,RegQueryInfoKeyW,LdrGetProcedureAddress,RegEnumValueW,LdrGetProcedureAddress,RegCloseKey,GetFileAttributesW,RegOpenKeyExW,LdrGetProcedureAddress,RegQueryValueExW,LdrGetProcedureAddress,RegQueryValueExW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryInfoKeyW,RegEnumValueW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,RegOpenKeyExW,RegQueryInfoKeyW,RegCloseKey,FindFirstFileExW,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,LdrGetDllHandle,LdrGetProcedureAddress,RegOpenKeyExW,RegQueryValueExW,RegCloseKey,NtQuerySystemInformation,NtProt