# Playground

## This notebook is used to produce screenshots of code for the documents using `carbon.now.sh`.

The code shown in this notebook is a simplified version of the code used in the actual notebooks.

In [None]:
import pandas as pd

def inject_patterns(df=None):
    """Placeholder for the pattern-injection step.

    This simplified stand-in performs no injection. It returns its argument
    unchanged so that `malicious_df = inject_patterns(malicious_df)` keeps a
    usable DataFrame instead of failing on an unexpected positional argument.

    Args:
        df: The DataFrame to annotate (optional, so that a zero-argument
            call still works).

    Returns:
        The object passed in, or None when called without arguments.
    """
    return df

def get_unique_clusters(df=None):
    """Return the distinct cluster ids found in `df`.

    Simplified stand-in: the ids are read from the 'cluster' column, the same
    column the cluster-scan code filters on, in order of first appearance.

    Args:
        df: DataFrame with a 'cluster' column (optional, so that a
            zero-argument call still works).

    Returns:
        A list of unique cluster ids, or None when called without arguments.
    """
    if df is None:
        return None
    return list(df['cluster'].unique())

# Cluster Scan

In [None]:
# Load the verified samples and attach the summarized malware-type pattern.
malicious_df = inject_patterns(
    pd.read_csv('./Verified_Samples.csv', low_memory=False, index_col=False)
)

# Drop rows that are falsely labelled (i.e. '_' on all popularity levels of VirusTotal).
falsely_labelled_rows = malicious_df.index[malicious_df['Type 1'] == '_']
malicious_df.drop(falsely_labelled_rows, inplace=True)

# Malware types present in the dataset, with their counts.
malware_type_count = malicious_df['Type 1'].value_counts()

# For every cluster, list the malware types VirusTotal assigned to its members.
# Each entry has the shape [cluster, [[type, count], ...]].
unique_clusters = get_unique_clusters(malicious_df)
summary = []
for cluster_id in unique_clusters:
    members = malicious_df[malicious_df['cluster'] == cluster_id]
    type_counts = members['Type 1'].value_counts()
    summary.append(
        [cluster_id, [[name, total] for name, total in type_counts.items()]]
    )

#Identify the counts of Malware Types per Cluster
for cluster_id, type_counts in summary:
    # Build the listing for this cluster only. The accumulator used to be
    # shared across clusters, so every line repeated all previous clusters.
    str_output = "".join(f"{name} ({total}); " for name, total in type_counts)
    print(f"CLUSTER {cluster_id}: {str_output}")

# Summarize Clusters that have the same malware types as per VirusTotal.
malware_types = list(malicious_df['Type 1'].unique())
print(f"# of Unique Malware Type: {len(malware_types)}\n")
cluster_instance_summary = [0] * len(list(malicious_df['cluster'].unique()))

# One row per malware type: [type, sample count, cluster count, cluster list].
count_summary = []
for position, malware_type in enumerate(malware_types, start=1):
    matching = malicious_df.loc[malicious_df['Type 1'] == malware_type, 'cluster']
    matched_clusters = list(matching.unique())
    count_summary.append([malware_type, len(matching),
                          len(matched_clusters), str(matched_clusters)])
    heading = f"Unique Malware Type: {position}\n".upper()
    print(heading +
          f"Malware Type: {malware_type}\n" +
          f"Matching Clusters Count: {len(matched_clusters)}\n" +
          f"Matching Clusters: {matched_clusters}\n")

# Order by sample count first, then rank the table by number of matching clusters.
rows_by_sample_count = sorted(count_summary, key=lambda row: row[1])
summary_columns = ['Malware Type',
                   'No. of Matching Verified Samples',
                   'No. of Matching Clusters',
                   "Matching Clusters"]
count_summary = pd.DataFrame(rows_by_sample_count, columns=summary_columns)
count_summary = count_summary.sort_values(by='No. of Matching Clusters', ascending=False)
count_summary = count_summary[['Malware Type', 'No. of Matching Clusters', 'Matching Clusters']]

# FalseLabels

In [None]:
malicious_df = pd.read_csv('./Verified_Samples.csv', low_memory=False)
benign_df = pd.read_csv('./API_Patterns.csv', low_memory=False)

# How many are falsely labelled samples from the verified samples?
false_labelled = malicious_df[(malicious_df['Type 1']=='_')].copy(deep=True)
false_count = false_labelled.shape[0]
false_share = false_count / malicious_df.shape[0] * 100
# The message is split across two adjacent literals: a single-quoted f-string
# cannot contain a raw line break.
print("No. of falsely labelled samples from verified samples: "
      f"{false_count} ({false_share:.4f}%)\n")
print("Counts of Falsely Labelled Samples in each Cluster")
display(false_labelled['cluster'].value_counts())

# Does the presented API Call Patterns match those
# from the API Call Patterns of those Benign samples?
unique_false_patterns = list(false_labelled['pattern'])
ctr = 1
same = []
print("Falsely Labelled Malicious Samples that Match API Call Patterns of Benign Samples")
for f in unique_false_patterns:
    if f in same:
        continue  # pattern already reported
    # Look the pattern up once and reuse the result for the check and the listing.
    benign_matches = benign_df[benign_df['pattern']==f]
    if benign_matches.empty:
        continue
    print(f"\nPATTERN: {ctr}\nAPI Call Pattern: {f}\n")
    print("Hashes of Benign Samples with Matching API Call Patterns:\n")
    for benign_hash in benign_matches['hash']:
        print(f"\t{benign_hash}\n")
    same.append(f)
    ctr+=1
# No. of API Call Patterns of Falsely-Labelled Malicious Samples == Benign Samples
# (plain numbers; the stray braces used to wrap each value in a one-element set).
same_api_calls = len(same)
same_api_calls_per = len(same)/benign_df.shape[0]*100
for i, s in enumerate(same, start=1):
    print(f"PATTERN: {i}\n{list(pd.Series(s.split(',')).unique())}\n")

# PatternCompare

In [None]:
#Compare API Patterns
def print_comparison(types:str, ratios:list, max:int):
    """Print the first `max` entries of `ratios` and tally their malware types.

    Args:
        types: Label for the comparison; not used by the function body.
        ratios: List of dicts with the keys 'ratio', 'malicious_hash',
            'benign_hash', 'Type 1', 'malicious_pattern' and 'benign_pattern'.
        max: Number of leading entries to print; clamped to `len(ratios)`
            with a notice when it is larger.
    """
    if max > len(ratios):
        print(f"The specified `max` value ({max}) exceeds available ratios to select.")
        max = len(ratios)
    states = []
    for rank in range(1, max + 1):
        match = ratios[rank - 1]
        print(f"MATCH {rank}")
        # All malicious hashes that have the same API Call Pattern
        mal_hashes = match['malicious_hash']
        print(f"Malicious Hashes ({len(mal_hashes)}):")
        for pos, mal_hash in enumerate(mal_hashes):
            print(f"\t{mal_hash} - {match['Type 1'][pos]}")
        # All benign hashes that have the same API Call Pattern
        benign_hashes = match['benign_hash']
        print(f"Benign Hashes ({len(benign_hashes)}):")
        for benign_hash in benign_hashes:
            print(f"\t{benign_hash}")
        print(f"Score: {match['ratio']:.4f}") # Similarity Ratio
        print(f"Malicious API Call Pattern: {match['malicious_pattern']}")
        print(f"Benign API Call Pattern: {match['benign_pattern']}\n")
        # Malware Types of matching Malicious Samples
        states.extend(match['Type 1'][pos] for pos in range(len(mal_hashes)))
        print("================================================\n")
    common_states = pd.Series(states).sort_values()
    print(f"\nTop {max} Most Matching API Call Patterns to Benign Samples:")
    print(str(common_states.value_counts()))
    # Trailing blank line, kept from the original output.
    print("")

from difflib import SequenceMatcher

malicious_df = pd.read_csv('./Verified_Samples.csv', low_memory=False)
benign_df = pd.read_csv('./API_Patterns.csv')

#Remove falsely labelled samples
malicious_df.drop(malicious_df[(malicious_df['Type 1']=='_')].index, inplace=True)

#Extract API Patterns (malicious & benign)
malicious_patterns = malicious_df['pattern'].to_list()
benign_patterns = benign_df['pattern'].to_list()

#Compare API Call Patterns
print("Comparing API Call Patterns...")
unique_malicious = list(malicious_df['pattern'].unique())
unique_benign = list(benign_df['pattern'].unique())

# The benign hashes depend only on the benign pattern, so look them up once
# per pattern instead of once per (malicious, benign) pair.
benign_hashes_by_pattern = {
    be: benign_df[benign_df['pattern'] == be]['hash'].to_list()
    for be in unique_benign
}

# One entry per (malicious pattern, benign pattern) pair. Entries for the same
# pattern share their hash/type lists; they are only read afterwards.
ratios = []
for ma in unique_malicious:
    mal_df = malicious_df[malicious_df['pattern'] == ma]
    mal_types = mal_df['Type 1'].to_list()
    mal_hashes = mal_df['hash'].to_list()
    for be in unique_benign:
        ratios.append({'ratio': SequenceMatcher(None, ma, be).ratio(),
                       'benign_pattern': be, 'malicious_pattern': ma,
                       'Type 1': mal_types,
                       'malicious_hash': mal_hashes,
                       'benign_hash': benign_hashes_by_pattern[be]})

# MOST SIMILAR/DIFFERENT API CALL PATTERNS TO BOTH MALICIOUS AND BENIGN SAMPLES
top = 20
ratios.sort(reverse=True, key=lambda entry: entry['ratio'])
print_comparison("HighMatching_Similar", ratios, top)
ratios.sort(reverse=False, key=lambda entry: entry['ratio'])
print_comparison("LowMatching_Different", ratios, top)

# InstanceCompare

In [None]:
malicious_df = pd.read_csv('./Verified_Samples.csv', low_memory=False)
benign_df = pd.read_csv('./API_Patterns.csv')

#Extract Unique API Calls
# Only rows whose 'Type 1' is not '_' (i.e. not falsely labelled) contribute.
verified_patterns = malicious_df.loc[malicious_df['Type 1'] != '_', 'pattern']
malicious_apis = []
for pattern in verified_patterns:
    malicious_apis += pattern.split(',')
malicious_apis = list(pd.Series(malicious_apis).unique())
print(f"# of Unique API Calls in Verified Malicious Samples: {len(malicious_apis)}")
print(str(malicious_apis) + "\n")

# Every benign row contributes; there is no label filter on this side.
benign_apis = []
for pattern in benign_df['pattern']:
    benign_apis += pattern.split(',')
benign_apis = list(pd.Series(benign_apis).unique())
print(f"# of Unique API Calls in Benign Samples: {len(benign_apis)}")
# The newline belongs inside the call: print() returns None, so adding "\n"
# to its result raised a TypeError.
print(str(benign_apis) + "\n")

# Set for O(1) membership tests; the lists above keep first-appearance order.
benign_api_set = set(benign_apis)

## Identify the Unique API Calls only found in Malicious API Calls.
unique = [m for m in malicious_apis if m not in benign_api_set]
print(f"No. of truly unique API Calls only found in Malicious Samples: {len(unique)} ({len(unique)/len(benign_apis)*100:.2f}% Matches API Calls of Benign Samples)")
print(f"Coverage of 'Malicious-only' API Calls to Official API Calls Oliveira.csv: {(len(unique)/len(APIS))*100:.4f}%")
print("Unique API Calls to Verified Malicious Samples only: "+ str(unique))

# Identify the Same API Calls found in both Malicious and Benign Samples.
same = [m for m in malicious_apis if m in benign_api_set]
print(f"No. of API Calls in Malicious Samples that is found in API Calls in Benign Samples: {len(same)} ({len(same)/len(benign_apis)*100:.2f}% Matches API Calls of Benign Samples)")
print(f"Coverage of 'Same-to-Malicious-Benign-Samples' API Calls to Official API Calls Oliveira.csv: {(len(same)/len(APIS))*100:.4f}%")
print("Unique API Calls to both Verified Malicious and Benign Samples: "+ str(same) + "\n")

# Dataset Preparation

In [None]:
oli = pd.read_csv('oliviera.csv')

# Dataset Cleaning & Reformatting
# Move the label ('malware') to the first column and 'hash' to the last one.
hash_col = oli.pop('hash')
label_col = oli.pop('malware')
oli = pd.concat([label_col, oli, hash_col], axis=1)

# Inverse Label Encoding
def inverse_label(item):
    """Map each encoded value in `item` to the matching entry of `APIS`.

    Args:
        item: Object with a `.map` method (a pandas Series in this file)
            whose values can be converted with `int()`.

    Returns:
        The result of `item.map`, with every value replaced by
        `APIS[int(value)]`.
    """
    def decode(code):
        return APIS[int(code)]

    return item.map(decode)
# NOTE(review): result_type='reduce' with a Series-returning function is
# unusual; confirm the assignment below yields the decoded 100 columns.
oli.iloc[:, 1:101] = oli.iloc[:, 1:101].apply(inverse_label, axis=1, result_type='reduce')

# Feature Duplicate Processing
TB = oli.copy(deep=True) #Time-based behavior (same as original)
IB = oli.copy(deep=True) #Instance-based behavior (to be created)
# The former `IB.transpose()` calls were dropped: their result was discarded,
# so they changed nothing and only copied the whole frame.
for r in range(oli.shape[0]):
    # Keep the first occurrence of every API call in the row, then pad the
    # row back to 100 columns with the 'NaN' placeholder string.
    row = IB.iloc[r, 1:101].drop_duplicates().to_list()
    IB.iloc[r, 1:101] = row + ['NaN'] * (100 - len(row))
    if r % 100 == 0:
        print(r, end=" ") # progress indicator every 100 rows
print("\nDuplicates removed!")

# Divide to Train and Holdout/Test (i.e., 90:10)
def firstSplit(dataset):
    """Split `dataset` into train and holdout parts with a 10% test share.

    Column 0 is used as the label; columns 1 up to (not including) 102 are
    used as the features.

    Returns:
        Whatever `train_test_split` returns for (features, labels), called
        with test_size=0.1, random_state=1 and shuffle=True.
    """
    labels = dataset.iloc[:, 0]
    features = dataset.iloc[:, 1:102]
    return train_test_split(features, labels,
                            test_size=0.1, random_state=1, shuffle=True)
# Split both behaviour views; each call returns
# (train features, holdout features, train labels, holdout labels).
TB_Features, TB_Reserve_Features, TB_Labels, TB_Reserve_Labels = firstSplit(TB)
IB_Features, IB_Reserve_Features, IB_Labels, IB_Reserve_Labels = firstSplit(IB)

# Training frames: label first, features after, 'hash' column removed.
TB = pd.concat([TB_Labels, TB_Features], axis=1).drop(columns='hash')
IB = pd.concat([IB_Labels, IB_Features], axis=1).drop(columns='hash')

# Holdout frames keep every column returned by the split.
TB_Reserve = pd.concat([TB_Reserve_Labels, TB_Reserve_Features], axis=1)
IB_Reserve = pd.concat([IB_Reserve_Labels, IB_Reserve_Features], axis=1)

# SMOTE
# k_neighbors is the rounded-up square root of the row count of the frame
# being balanced.
balancer = SMOTEN(sampling_strategy='minority', random_state=1,
                  k_neighbors=math.ceil(math.sqrt(TB.shape[0])))
X,y = TB.iloc[:,1:101],TB.iloc[:,0]
X,y = balancer.fit_resample(X, y)
print("TB Rebalancing Finished!")
TB = pd.concat([y, X], axis=1)
# Size the IB balancer from IB itself. TB has just been oversampled, so
# TB.shape[0] no longer matches the number of rows being balanced here.
balancer = SMOTEN(sampling_strategy='minority', random_state=1,
                  k_neighbors=math.ceil(math.sqrt(IB.shape[0])))
print("IB Rebalance...")
X,y = IB.iloc[:,1:101],IB.iloc[:,0]
X,y = balancer.fit_resample(X, y)
print("IB Rebalancing Finished!")
IB = pd.concat([y, X], axis=1)

# LabelEncoding
ENCODED = [frame.copy(deep=True) for frame in (TB, IB, TB_Reserve, IB_Reserve)]

# One encoder fitted on the API list serves all four frames: fitting on the
# same list always gives the same mapping, and transform does not alter it.
le = LabelEncoder()
le.fit(APIS) # List of API Calls; Including 'NaN'.

# Encode the 100 feature columns (positions 1..100) of each frame column-wise.
LGBM_TB_Train, LGBM_IB_Train, LGBM_TB_Holdout, LGBM_IB_Holdout = (
    frame.iloc[:, 1:101].apply(le.transform)
    for frame in (TB, IB, TB_Reserve, IB_Reserve)
)

# 2nd Tranche Test

In [None]:
# Number of folds passed to StratifiedKFold (n_splits=K) by the helpers below;
# with 5 folds each iteration holds out one fifth of the rows.
K = 5 # 80:20

# K-folds sample visualization
def kfolds_vis(dataset):
    """Plot how the stratified K-fold splitter partitions `dataset`.

    Column 0 is used as the label and every remaining column as a feature.
    The figure is (10, K+1) inches at 300 dpi and is shown immediately.
    """
    labels = dataset.iloc[:, 0]      # all rows, first column only
    features = dataset.iloc[:, 1:]   # all rows, 2nd to last column
    _, axis = plt.subplots(figsize=(10, K + 1), dpi=300)
    plot_cv_indices(get_strat_kfold(), features, labels, axis, K)
    plt.show()

def get_strat_kfold():
    """Build the shuffled StratifiedKFold splitter with K splits and seed 1."""
    splitter = StratifiedKFold(n_splits=K, shuffle=True, random_state=1)
    return splitter

# Render K-folds sample visualization (inner workings)
def plot_cv_indices(cv, X, y, ax, n_splits, lw=25):
    """Draw one row of markers per CV iteration on `ax`.

    Args:
        cv: Splitter exposing `split(X=..., y=...)`, yielding
            (train indices, test indices) pairs.
        X: Features; `len(X)` and `X.shape[0]` give the sample count.
        y: Labels passed through to `cv.split`.
        ax: Axes-like object to draw on.
        n_splits: Number of tick rows on the y axis.
        lw: Line width of the markers.

    Returns:
        The same `ax`, after drawing.
    """
    sample_count = len(X)
    positions = range(sample_count)
    for fold, (train_idx, test_idx) in enumerate(cv.split(X=X, y=y)):
        # 1 marks test samples, 0 marks train samples, NaN marks anything
        # belonging to neither set.
        membership = np.full(sample_count, np.nan)
        membership[test_idx] = 1
        membership[train_idx] = 0
        ax.scatter(positions, [fold] * sample_count, c=membership, marker="_",
                   lw=lw, cmap=plt.cm.Paired, vmin=0, vmax=1)
    ax.set(yticks=np.arange(n_splits), yticklabels=list(range(n_splits)),
           xlabel="Dataset Subsample", ylabel="CV iteration",
           ylim=[n_splits,-1], xlim=[0, X.shape[0]])
    ax.set_title(type(cv).__name__)
    return ax

# Render K-Folds Visualization for LGBM, then for CATB
for tb_path, ib_path, tb_title, ib_title in (
    ("../Dataset/TB/LGBM_TB.csv", "../Dataset/IB/LGBM_IB.csv",
     "LGBM TB/TB_Encoded Dataset", "LGBM IB/IB_Encoded Dataset"),
    ("../Dataset/TB/CATB_TB.csv", "../Dataset/IB/CATB_IB.csv",
     "CATB TB/TB_Encoded Dataset", "CATB IB/IB_Encoded Dataset"),
):
    # Missing cells are replaced by the 'NaN' placeholder string before plotting.
    tb_train = pd.read_csv(tb_path, low_memory=False).fillna("NaN")
    ib_train = pd.read_csv(ib_path, low_memory=False).fillna("NaN")
    print("Stratified K-Folds Split at",K,"splits.")
    print(tb_title)
    kfolds_vis(tb_train)
    print(ib_title)
    kfolds_vis(ib_train)

# Model Training

Generic code

In [None]:
def get_indexes():
    """Return the 100 feature column names 't_0' through 't_99'."""
    return [f"t_{position}" for position in range(100)]

def get_hyperparams():
    """Placeholder for the hyperparameter lookup; always yields None."""

In [None]:
#Setting filenames of files
TB_Train, IB_Train = "../Dataset/TB/LGBM_TB.csv", "../Dataset/IB/LGBM_IB.csv"

#Load Dataframe (missing cells become the 'NaN' placeholder string)
tb_train = pd.read_csv(TB_Train, low_memory=False).fillna("NaN")
ib_train = pd.read_csv(IB_Train, low_memory=False).fillna("NaN")

#Static splitting of Train Split of Time-based (70:30)
# Column 0 is the label; columns at positions 1..100 are the features.
X_tb, y_tb = tb_train.iloc[:, 1:101], tb_train.iloc[:, 0]
X_tb_train, X_tb_vali, y_tb_train, y_tb_vali = train_test_split(
    X_tb, y_tb, test_size=0.3, shuffle=True, random_state=1)

#Static splitting of Train Split of Instance-based (70:30)
X_ib, y_ib = ib_train.iloc[:, 1:101], ib_train.iloc[:, 0]
X_ib_train, X_ib_vali, y_ib_train, y_ib_vali = train_test_split(
    X_ib, y_ib, test_size=0.3, shuffle=True, random_state=1)

#Get hyperparams if tuned model to be trained
HYPERPARAMS = None

#Training Model (Time-based, default hyperparameters)
# The categorical columns are passed to fit() as `categorical_feature`.
# The former `categorical_data=` constructor keyword is not a LightGBM
# parameter, so the features were not being treated as categorical.
# Assumes the feature columns are named t_0..t_99 - TODO confirm.
tb_lgbm = lightgbm.LGBMClassifier(random_state=1, n_jobs=-1, verbose=1)
tb_lgbm.fit(X_tb_train, y_tb_train,
            eval_set=[(X_tb_vali, y_tb_vali), (X_tb_train, y_tb_train)],
            eval_metric=['binary_logloss', 'average_precision', 'auc'],
            categorical_feature=get_indexes())
#Training Model (Instance-based, default hyperparameters)
ib_lgbm = lightgbm.LGBMClassifier(random_state=1, n_jobs=-1, verbose=1)
ib_lgbm.fit(X_ib_train, y_ib_train,
            eval_set=[(X_ib_vali, y_ib_vali), (X_ib_train, y_ib_train)],
            eval_metric=['binary_logloss', 'average_precision', 'auc'],
            categorical_feature=get_indexes())
#Saving Model as file
dump(tb_lgbm, "TB.model")
dump(ib_lgbm, "IB.model")

#Training Model (Time-based, tuned hyperparameters)
# NOTE(review): TB_HYPERPARAMS / IB_HYPERPARAMS are not defined in this part
# of the notebook - confirm where they come from.
# The categorical columns are passed to fit() as `categorical_feature`.
# The former `categorical_data=` constructor keyword is not a LightGBM
# parameter, so the features were not being treated as categorical.
# Assumes the feature columns are named t_0..t_99 - TODO confirm.
tb_lgbm = lightgbm.LGBMClassifier(**TB_HYPERPARAMS, random_state=1, n_jobs=-1,
                                  verbose=1)
tb_lgbm.fit(X_tb_train, y_tb_train,
            eval_set=[(X_tb_vali, y_tb_vali), (X_tb_train, y_tb_train)],
            eval_metric=['binary_logloss', 'average_precision', 'auc'],
            categorical_feature=get_indexes())
#Training Model (Instance-based, tuned hyperparameters)
ib_lgbm = lightgbm.LGBMClassifier(**IB_HYPERPARAMS, random_state=1, n_jobs=-1,
                                  verbose=1)
ib_lgbm.fit(X_ib_train, y_ib_train,
            eval_set=[(X_ib_vali, y_ib_vali), (X_ib_train, y_ib_train)],
            eval_metric=['binary_logloss', 'average_precision', 'auc'],
            categorical_feature=get_indexes())
#Saving Model as file
# NOTE(review): same file names as the default-model dump earlier in this
# cell, so those files are overwritten - confirm this is intended.
dump(tb_lgbm, "TB.model")
dump(ib_lgbm, "IB.model")

# Model Tuning

Generic code

In [None]:
def get_threshold(cv_results, target:str):
    """Return the cut-off for `target`: its maximum minus its standard deviation.

    With a single candidate the standard deviation is NaN; it is treated as 0
    so the lone candidate still meets its own threshold.

    Args:
        cv_results: DataFrame-like object indexable by column name.
        target: Name of the score column.

    Returns:
        `max - std` of the column (`max` when `std` is undefined).
    """
    scores = cv_results[target]
    spread = scores.std()
    if pd.isna(spread):
        spread = 0.0
    return scores.max() - spread


def _filter_unless_empty(candidates, mask):
    """Apply `mask` to `candidates`, keeping them as-is if nothing would survive."""
    kept = candidates[mask]
    return candidates if kept.empty else kept


def refit_strategy(cv_results):
    """Pick the index of the fastest-fitting candidate among the top scorers.

    Candidates are narrowed in two passes over accuracy, precision and recall
    (in that order): first a fixed 0.80 floor, then the max-minus-std
    threshold recomputed on the remaining candidates. A filter that would
    remove every remaining candidate is skipped, so the function still
    returns a choice when nothing reaches the floor or only one candidate
    is left.

    Args:
        cv_results: Mapping or DataFrame holding 'mean_test_accuracy',
            'mean_test_precision', 'mean_test_recall' and 'mean_fit_time'.

    Returns:
        Index label of the selected row (minimum 'mean_fit_time').

    Raises:
        ValueError: If `cv_results` holds no candidates at all.
    """
    metrics = ("mean_test_accuracy", "mean_test_precision", "mean_test_recall")
    cv_results_ = pd.DataFrame(cv_results)
    # Filter-out all results below 80% score on acc, prec, & recall
    for metric in metrics:
        cv_results_ = _filter_unless_empty(cv_results_, cv_results_[metric] >= .80)
    # Filter-out all results below max-std threshold score on acc, prec, & recall
    for metric in metrics:
        cv_results_ = _filter_unless_empty(
            cv_results_, cv_results_[metric] >= get_threshold(cv_results_, metric))
    return cv_results_['mean_fit_time'].idxmin()

def get_tuner(model, params, scoring=('accuracy', 'precision', 'recall', 'roc_auc')):
    """Build the GridSearchCV used for hyperparameter tuning.

    Args:
        model: Estimator to tune.
        params: Parameter grid handed to GridSearchCV.
        scoring: Metric names to evaluate. The default is an immutable tuple
            so it cannot be altered between calls.

    Returns:
        An unfitted GridSearchCV with 5-fold shuffled stratified CV (seed 1),
        `refit_strategy` as the refit callable, and failed fits scored as 0.
    """
    return GridSearchCV(model, params, scoring=scoring, n_jobs=1, refit=refit_strategy,
                        cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=1),
                        verbose=2, pre_dispatch='2*n_jobs', error_score=0,
                        return_train_score=False)

#Specify features (X) and labels (y) for Time-based data
X_tb = tb_train.iloc[:,1:101] #All rows, columns at positions 1..100
y_tb = tb_train.iloc[:,0] #All rows, first column only

#Specify features (X) and labels (y) for Instance-based data
X_ib = ib_train.iloc[:,1:101] #All rows, columns at positions 1..100
y_ib = ib_train.iloc[:,0] #All rows, first column only

# os.cpu_count() may return None when the count cannot be determined;
# fall back to a single job instead of failing on int(None).
lgbm_classifier = lightgbm.LGBMClassifier(random_state=1,
                                          n_jobs=int(os.cpu_count() or 1), verbose=-1)
tb_tuner = get_tuner(lgbm_classifier, lgbm_params)
tb_tuner.fit(X_tb, y_tb)

# Persist the chosen parameters and the full cross-validation table.
print_to_file("LGBM_TB", tb_tuner.best_params_)
cv_results = pd.DataFrame.from_dict(tb_tuner.cv_results_)
cv_results.to_csv(f"./Outputs/LGBM/{OUTPUT_FILENAME}_LGBM_TB_Tune_CVRes.csv")

display(cv_results)
plot_search_results(tb_tuner, cv_results, 'TB')