In [None]:
import pandas as pd

def inject_patterns(df=None):
    """Placeholder for the pattern-injection step.

    The notebook calls this as ``inject_patterns(malicious_df)`` and then keeps
    working with the return value, so the stub accepts that argument and hands
    it back untouched instead of raising ``TypeError`` or returning ``None``.

    NOTE(review): the actual pattern-injection logic is not part of this
    notebook -- swap this stub for the real implementation (or import it)
    before relying on any injected column.

    Args:
        df: The DataFrame to annotate. Optional, so a legacy zero-argument
            call keeps working.

    Returns:
        ``df`` itself, unchanged (``None`` when called without an argument).
    """
    return df

def get_unique_clusters(df=None, column='cluster'):
    """Return the distinct cluster labels found in ``df``.

    The notebook calls this as ``get_unique_clusters(malicious_df)`` and
    iterates over the result, so the function has to accept the DataFrame and
    return an iterable of labels.

    NOTE(review): the original implementation is not part of this notebook.
    Labels are returned in order of first appearance; confirm whether a
    sorted order was intended.

    Args:
        df: DataFrame holding the cluster assignments. Optional, so a legacy
            zero-argument call keeps working.
        column: Name of the column that stores the cluster label.

    Returns:
        A list with each distinct value of ``df[column]`` once, or ``None``
        when no DataFrame is given.
    """
    if df is None:
        return None
    return list(df[column].unique())

In [None]:
# Load the verified (malicious) samples. index_col=False stops pandas from
# using the first CSV column as the index.
malicious_df = pd.read_csv('./Verified_Samples.csv', index_col=False, low_memory=False)

# Attach the pattern (i.e., summarized malware type pattern) to the DataFrame.
malicious_df = inject_patterns(malicious_df)

# Remove falsely labelled rows, i.e. rows whose 'Type 1' is '_'
# ('_' on all popularity levels of VirusTotal).
falsely_labelled_rows = malicious_df.loc[malicious_df['Type 1'] == '_'].index
malicious_df.drop(falsely_labelled_rows, inplace=True)

# Number of samples per malware type across the cleaned dataset.
malware_type_count = malicious_df['Type 1'].value_counts()

# For every cluster, list the malware types VirusTotal assigned to its samples
# together with their counts. Resulting layout:
#   [[cluster, [[type, count], [type, count], ...]], ...]
unique_clusters = get_unique_clusters(malicious_df)
summary = []
for cluster in unique_clusters:
    in_cluster = malicious_df['cluster'] == cluster
    type_counts = malicious_df.loc[in_cluster, 'Type 1'].value_counts()
    summary.append([cluster, [[label, n] for label, n in type_counts.items()]])

# Print the malware types (with sample counts) of each cluster, one line per
# cluster.
str_output = ""
for s in summary:
    # Rebuild the text for every cluster: a single accumulator shared across
    # iterations would repeat the types of all earlier clusters on each line.
    str_output = "".join(f"{t[0]} ({t[1]}); " for t in s[1])
    print(f"CLUSTER {s[0]}: {str_output}")

# For each malware type reported by VirusTotal, list the clusters whose
# samples carry that type.
malware_types = list(malicious_df['Type 1'].unique())
print(f"# of Unique Malware Type: {len(malware_types)}\n")

# One zero per distinct cluster; not read anywhere in this cell.
cluster_instance_summary = [0] * len(malicious_df['cluster'].unique())

count_summary = []
for position, malware_type in enumerate(malware_types, start=1):
    matching = malicious_df.loc[malicious_df['Type 1'] == malware_type, 'cluster']
    matching_clusters = list(matching.unique())
    count_summary.append([malware_type,
                          len(matching),
                          len(matching_clusters),
                          str(matching_clusters)])
    # Only the first header line is upper-cased.
    header = f"Unique Malware Type: {position}\n".upper()
    print(header +
          f"Malware Type: {malware_type}\n" +
          f"Matching Clusters Count: {len(matching_clusters)}\n" +
          f"Matching Clusters: {matching_clusters}\n")

# Order the rows by sample count first, then build the table and re-order it
# by the number of matching clusters (largest first).
# NOTE(review): the first sort presumably serves as a tie-break, but
# sort_values() is not stable by default -- confirm whether it is needed.
count_summary = sorted(count_summary, key=lambda row: row[1])
summary_columns = ['Malware Type',
                   'No. of Matching Verified Samples',
                   'No. of Matching Clusters',
                   "Matching Clusters"]
count_summary = pd.DataFrame(count_summary, columns=summary_columns)
count_summary = count_summary.sort_values(by='No. of Matching Clusters', ascending=False)
# Keep only the columns needed downstream; the sample count is dropped.
count_summary = count_summary[['Malware Type', 'No. of Matching Clusters', 'Matching Clusters']]

In [None]:
# Re-read the raw verified samples: this cell analyses the rows labelled '_',
# so it needs the unfiltered file contents.
malicious_df = pd.read_csv('./Verified_Samples.csv', low_memory=False)
benign_df = pd.read_csv('./API_Patterns.csv', low_memory=False)

# How many are falsely labelled samples from the verified samples?
false_labelled = malicious_df[(malicious_df['Type 1']=='_')].copy(deep=True)
total_verified = malicious_df.shape[0]
# Guard the percentage so an empty input file does not raise ZeroDivisionError.
false_labelled_pct = false_labelled.shape[0] / total_verified * 100 if total_verified else 0.0
# The message is split into two adjacent f-strings; a single-quoted f-string
# cannot span physical lines.
print(f"No. of falsely labelled samples from verified samples: "
      f"{false_labelled.shape[0]} ({false_labelled_pct:.4f}%)\n")
print("Counts of Falsely Labelled Samples in each Cluster")
display(false_labelled['cluster'].value_counts())

# Do the API call patterns of the falsely labelled samples also occur among
# the API call patterns of the benign samples?
unique_false_patterns = list(false_labelled['pattern'])
ctr = 1
same = []  # distinct falsely-labelled patterns also present in benign_df, in first-seen order
print("Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples")
for f in unique_false_patterns:
    if f in same:
        continue  # this pattern was already reported
    # Filter the benign samples once per pattern instead of once per printed hash.
    matching_hashes = benign_df.loc[benign_df['pattern'] == f, 'hash']
    if matching_hashes.empty:
        continue
    print(f"\nPATTERN: {ctr}\nAPI Call Pattern: {f}\n")
    print("Hashes of Benign Samples with Matching API Call Patterns:\n")
    for benign_hash in matching_hashes:
        print(f"\t{benign_hash}\n")
    same.append(f)
    ctr += 1

# No. of API call patterns of falsely-labelled malicious samples that equal a
# benign sample's pattern. Stored as plain numbers, not one-element sets.
same_api_calls = len(same)
# The same count as a percentage of the benign rows; 0.0 when benign_df is
# empty, to avoid a ZeroDivisionError.
same_api_calls_per = len(same) / benign_df.shape[0] * 100 if benign_df.shape[0] else 0.0
for i, s in enumerate(same):
    # Show each API call of the pattern once, in order of first appearance.
    print(f"PATTERN: {i+1}\n{list(dict.fromkeys(s.split(',')))}\n")