# Validated Malicious Samples Cleaner

This Python notebook cleans the validated *xxxx_SampleHash_Common.csv* file.

This assumes that you have already prepared the *{Data Clustering}_SampleHash_Common.csv* file and that its samples have been validated through VirusTotal.

It will attempt to remove any invalid values in the dataset such as:
1. Remove whitespaces.
2. Replace nan values with '_'
3. Replace any other potential delimiters with '_'

Note that the data is processed in place: the input file itself is overwritten with the cleaned data at the end of the notebook.

In [1]:
import pandas as pd
import numpy as np

# Location of the validated CSV; the save cell at the end writes the cleaned
# data back to this same path.
file_path = './Clustering/(EDITED)KMeans_SampleHash_Common.csv'

# Load the validated samples into the frame that the following cells clean.
df = pd.read_csv(file_path)

def display_each_cluster():
    """Display the distinct values of the `Type 1`, `Type 2` and `Type 3` columns.

    Reads the notebook-level `df` and emits one array of unique values per
    column, in that column order.
    """
    for type_column in ('Type 1', 'Type 2', 'Type 3'):
        display(df[type_column].unique())

# Bare last expression: the notebook renders the loaded frame as this cell's output.
df

Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,490d584c7d303ed35c673460b63f3ca8,trojan,dropper,pua,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,0,9ab8ea1d2d68a0d4110df413e677976c,trojan,hacktool,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
2,0,adbc74815ef2bd1ea4967abad812233d,trojan,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
3,0,f6eb4841bba3a4cee747700dc0ee1609,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
4,0,f5a0ad49337ebc87897698e70d03364e,trojan,dropper,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
...,...,...,...,...,...,...
1756,198,d24b78bd73f17379ed62e4c776b4f66e,trojan,adware,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1757,198,f666dd4b3a53b7fe71f8976fa09bfdfb,trojan,adware,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1758,199,b6d6520b608875282d831b1e983cd5e5,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1759,199,18bce1a594550daf8b3f318de48c1674,trojan,dropper,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."


## Check for invalid inputs to columns `Type 1`, `Type 2`, `Type 3`

In [2]:
# List the distinct labels in each type column so stray whitespace or nan entries stand out.
display_each_cluster()

array(['trojan ', '_', 'adware', 'downloader', 'miner', 'pua', 'virus',
       'hacktool', 'ransomware', 'trojan', 'dropper', 'avaquest',
       'opencandy', 'syncopate', 'techjoydown', 'msil', 'nsismod',
       'fugrafa '], dtype=object)

array(['dropper', 'hacktool', '_', 'downloader', 'pua', 'adware',
       'trojan', 'worm', 'miner', 'virus', 'spyware', 'ransomware',
       'banker', 'candyopen', 'nsismod'], dtype=object)

array(['pua', '_', 'dropper', 'adware', 'downloader', 'virus', 'trojan',
       'hacktool', 'ransomware', nan, 'spyware', 'banker', 'worm'],
      dtype=object)

## Replace `nan` values with `_`

In [3]:
# Swap every missing (NaN) entry in the frame for the '_' placeholder, then
# list the distinct labels again to check the result.
df.replace(to_replace=np.nan, value='_', inplace=True)
display_each_cluster()

array(['trojan ', '_', 'adware', 'downloader', 'miner', 'pua', 'virus',
       'hacktool', 'ransomware', 'trojan', 'dropper', 'avaquest',
       'opencandy', 'syncopate', 'techjoydown', 'msil', 'nsismod',
       'fugrafa '], dtype=object)

array(['dropper', 'hacktool', '_', 'downloader', 'pua', 'adware',
       'trojan', 'worm', 'miner', 'virus', 'spyware', 'ransomware',
       'banker', 'candyopen', 'nsismod'], dtype=object)

array(['pua', '_', 'dropper', 'adware', 'downloader', 'virus', 'trojan',
       'hacktool', 'ransomware', 'spyware', 'banker', 'worm'],
      dtype=object)

## Remove leading and trailing whitespace from values

In [4]:
# Strip leading/trailing whitespace from the label columns (e.g. 'trojan ' -> 'trojan').
# Columns are selected by name rather than by position, so the cleanup still targets
# the right columns if the CSV gains or loses a leading column. The vectorized
# `.str.strip()` replaces the per-row Python loop and leaves missing values as
# missing instead of raising AttributeError on a float NaN.
for type_column in ('Type 1', 'Type 2', 'Type 3'):
    df[type_column] = df[type_column].str.strip()

## View Completed Data

In [5]:
# Show the cleaned frame, then the distinct labels remaining in each type column.
display(df)
display_each_cluster()

Unnamed: 0,cluster,hash,Type 1,Type 2,Type 3,pattern
0,0,490d584c7d303ed35c673460b63f3ca8,trojan,dropper,pua,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
1,0,9ab8ea1d2d68a0d4110df413e677976c,trojan,hacktool,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
2,0,adbc74815ef2bd1ea4967abad812233d,trojan,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
3,0,f6eb4841bba3a4cee747700dc0ee1609,_,_,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
4,0,f5a0ad49337ebc87897698e70d03364e,trojan,dropper,_,"GetSystemTimeAsFileTime,NtCreateMutant,GetSyst..."
...,...,...,...,...,...,...
1756,198,d24b78bd73f17379ed62e4c776b4f66e,trojan,adware,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1757,198,f666dd4b3a53b7fe71f8976fa09bfdfb,trojan,adware,_,"GetSystemTimeAsFileTime,LdrLoadDll,LdrGetProce..."
1758,199,b6d6520b608875282d831b1e983cd5e5,_,_,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."
1759,199,18bce1a594550daf8b3f318de48c1674,trojan,dropper,_,"SetErrorMode,OleInitialize,LdrGetDllHandle,Ldr..."


array(['trojan', '_', 'adware', 'downloader', 'miner', 'pua', 'virus',
       'hacktool', 'ransomware', 'dropper', 'avaquest', 'opencandy',
       'syncopate', 'techjoydown', 'msil', 'nsismod', 'fugrafa'],
      dtype=object)

array(['dropper', 'hacktool', '_', 'downloader', 'pua', 'adware',
       'trojan', 'worm', 'miner', 'virus', 'spyware', 'ransomware',
       'banker', 'candyopen', 'nsismod'], dtype=object)

array(['pua', '_', 'dropper', 'adware', 'downloader', 'virus', 'trojan',
       'hacktool', 'ransomware', 'spyware', 'banker', 'worm'],
      dtype=object)

## Save Cleaned Validated Data

In [6]:
# Overwrite the input CSV with the cleaned frame; index=False keeps the row
# index out of the written file.
df.to_csv(path_or_buf=file_path, index=False)