In [None]:
import numpy as np
import pandas as pd
import math
import networkx as nx
import os
import copy
from collections import Counter
from nltk.util import ngrams
import time
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn import ensemble,metrics
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
import pandas as pd
import pickle
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import joblib


def compute_metrices_detector(y_true, y_pred):
    """Score detector predictions against the ground truth.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1', each
        produced by the matching scikit-learn scorer called with its default
        averaging mode.
    """
    scorers = (('accuracy', accuracy_score),
               ('precision', precision_score),
               ('recall', recall_score),
               ('f1', f1_score))
    return {name: scorer(y_true=y_true, y_pred=y_pred) for name, scorer in scorers}
    
def compute_metrices_classifier(y_true, y_pred):
    """Score multi-class predictions against the ground truth.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1';
        precision, recall and f1 are requested with average='weighted'.
    """
    scores = {'accuracy': accuracy_score(y_true=y_true, y_pred=y_pred)}
    for name, scorer in (('precision', precision_score),
                         ('recall', recall_score),
                         ('f1', f1_score)):
        scores[name] = scorer(y_true=y_true, y_pred=y_pred, average='weighted')
    return scores

In [None]:
# Read the benign and malicious feature tables; neither CSV has a header row,
# so columns are addressed by position from here on.
ben_df, mal_df = (
    pd.read_csv(csv_path, header=None)
    for csv_path in ('./feature/Ben_feature_sym.csv', './feature/Mal_feature_sym.csv')
)

# How often each value of column 1 occurs in the malicious table.
mal_df.iloc[:, 1].value_counts()

In [None]:
# Remove rows that contain missing values from both tables.
ben_df, mal_df = ben_df.dropna(), mal_df.dropna()

# Discard the malicious rows whose column-1 value equals 9.
mask = mal_df.iloc[:, 1].eq(9)
mal_df = mal_df.loc[~mask]

# Column-1 value counts after the filtering above.
mal_df.iloc[:, 1].value_counts()

In [None]:
# Stack the benign and malicious rows into one array. The conversion is done
# inline: a bare ``frame.to_numpy()`` statement returns a new array and leaves
# the frame untouched, so calling it without using the result does nothing.
data = np.concatenate((ben_df.to_numpy(), mal_df.to_numpy()), axis=0)
X = data[:, 2:]   # every column from position 2 onward is a feature
y = data[:, 1]    # column 1 carries the class label

# y = np.where(y==0,0,1)

# Encode the labels as consecutive integers (label encoding, not one-hot).
# from sklearn.preprocessing import OneHotEncoder
labelencoder = LabelEncoder()
# Keep y one-dimensional, shape (n_samples,): that is what the scikit-learn
# estimators below expect; a (n_samples, 1) column vector makes them warn.
y = labelencoder.fit_transform(y)

# onehotencoder = OneHotEncoder()
# data_str_ohe =onehotencoder.fit_transform(label).toarray()
# label = pd.DataFrame(data_str_ohe)


In [None]:
# Hold out 20% of the rows for evaluation. The fixed seed makes the split, and
# therefore every score derived from it, identical after a kernel restart.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape

### RF

In [None]:
# RF
# Fit on the training split only: fitting on the full (X, y) shows the model
# the held-out rows too, which inflates any score measured on X_test.
forest = ensemble.RandomForestClassifier(n_estimators=110)
# np.ravel hands the estimator 1-D targets whatever shape Y_train has.
forest_fit = forest.fit(X_train, np.ravel(Y_train))

In [None]:
# Predictions on the training split (kept under the name y_predict).
y_predict = forest.predict(X_train)
print('RF :',accuracy_score(Y_train,y_predict))
# Training accuracy alone says nothing about generalisation, so also report
# the accuracy on the rows held out by train_test_split.
print('RF (held-out) :', accuracy_score(np.ravel(Y_test), forest.predict(X_test)))

In [None]:
# Weighted metrics on the held-out split. The predictions are computed here
# so the cell does not depend on which split y_predict currently holds.
compute_metrices_classifier(np.ravel(Y_test), forest.predict(X_test))

In [None]:
# Persist the fitted random forest next to the notebook.
joblib.dump(value=forest, filename="RF_model.joblib")

### KNN

In [None]:
# KNN
# Fit on the training split only. With weights='distance', a test row that is
# also stored in the index is its own zero-distance neighbour, so fitting on
# the full (X, y) would make the X_test score meaningless.
knn = KNeighborsClassifier(weights='distance', n_neighbors=5)
# np.ravel hands the estimator 1-D targets whatever shape Y_train has.
knn_fit = knn.fit(X_train, np.ravel(Y_train))

In [None]:
# Predict the held-out rows and report the plain accuracy.
y_predict = knn.predict(X_test)
knn_accuracy = accuracy_score(Y_test, y_predict)
print('KNN', knn_accuracy)

In [None]:
# Weighted accuracy / precision / recall / f1 for the predictions above.
compute_metrices_classifier(y_true=Y_test, y_pred=y_predict)

In [None]:
# Persist the fitted k-nearest-neighbours model next to the notebook.
joblib.dump(value=knn, filename="KNN_model.joblib")

### SVM

In [None]:
# SVM
# Fit on the training split only: fitting on the full (X, y) shows the model
# the held-out rows too, which inflates any score measured on X_test.
svm_model = SVC()
# np.ravel hands the estimator 1-D targets whatever shape Y_train has.
svm_model.fit(X_train, np.ravel(Y_train))


In [None]:
# Predictions on the training split (kept under the name y_predict).
y_predict = svm_model.predict(X_train)
print('SVM',accuracy_score(Y_train,y_predict))
# Training accuracy alone says nothing about generalisation, so also report
# the accuracy on the rows held out by train_test_split.
print('SVM (held-out)', accuracy_score(np.ravel(Y_test), svm_model.predict(X_test)))

In [None]:
# Weighted metrics on the held-out split. The predictions are computed here
# so the cell does not depend on which split y_predict currently holds.
compute_metrices_classifier(np.ravel(Y_test), svm_model.predict(X_test))

In [None]:
# Persist the fitted support-vector classifier next to the notebook.
joblib.dump(value=svm_model, filename="SVM_model.joblib")

### MLP1


In [None]:
import numpy as np
from time import time

import torch
from torch.nn import Linear
import torch.nn.functional as F

class MLP(torch.nn.Module):
    """Perceptron with two hidden ReLU layers that returns raw class scores.

    The forward pass returns logits, not probabilities. The notebook trains
    this model with ``torch.nn.CrossEntropyLoss``, which applies log-softmax
    itself; an extra softmax inside the model squashes the scores into [0, 1]
    before the loss sees them and flattens the gradients. ``argmax`` over the
    logits selects the same class as ``argmax`` over the softmax output, so
    callers that only take the arg-max are unaffected.

    Args:
        num_features: width of the input vectors.
        hidden_channels: width of both hidden layers.
        num_classes: number of output scores per sample.
    """

    def __init__(self, num_features, hidden_channels, num_classes):
        super().__init__()
        self.lin1 = Linear(num_features, hidden_channels)
        self.lin2 = Linear(hidden_channels, hidden_channels)
        # Registered last on purpose: TorchTrainer reads the class count from
        # the out_features of the final sub-module.
        self.lin = Linear(hidden_channels, num_classes)

    def forward(self, x):
        """Map a (batch, num_features) tensor to (batch, num_classes) logits."""
        x = F.relu(self.lin1(x))
        x = F.relu(self.lin2(x))

        # Dropout is active only in training mode (self.training is True).
        x = F.dropout(x, p=0.5, training=self.training)

        # No softmax here: the loss expects unnormalised scores.
        return self.lin(x)


class TorchTrainer:
    """Small train / evaluate / predict loop around a torch classifier.

    Args:
        model: the module to train; it is moved to ``device``. Its last
            registered sub-module must expose ``out_features`` (the number of
            classes).
        optimizer: optimizer over ``model.parameters()``; needed for training.
        criterion: loss called as ``criterion(output, one_hot_target)``;
            needed for training and for ``test``.
        device: torch device; defaults to CUDA when available, else CPU.

    Attributes:
        arguments: dict of the constructor arguments, extended by ``train``
            with 'epochs' and 'save_path'.
        output_dim: number of classes, read from the model's last sub-module.
        history: set by ``train``; per-epoch numpy arrays under 'train_acc',
            'train_loss', 'val_acc', 'val_loss' and 'time'.
    """

    def __init__(self, model, optimizer=None, criterion=None, device=None):
        self.device    = torch.device('cuda' if torch.cuda.is_available() else 'cpu') if device is None else device
        # Batches are moved to self.device below, so the model must live there too.
        self.model     = model.to(self.device)
        self.optimizer = optimizer
        self.criterion = criterion

        # Explicit dict instead of locals(): same entries without a 'self'
        # key that would make the trainer reference itself.
        self.arguments = {'model': model,
                          'optimizer': optimizer,
                          'criterion': criterion,
                          'device': self.device}

        self.output_dim = list(self.model.modules())[-1].out_features

    def _one_hot(self, label):
        """Return ``label`` as a float one-hot matrix with output_dim columns."""
        return F.one_hot(label, num_classes=self.output_dim).to(torch.float)

    def train(self, train_loader, valid_loader, epochs=20, save_path='model_saved/mlp.pt', verbose=True):
        """Train for ``epochs`` epochs and checkpoint the best model.

        After every epoch the model is scored on both loaders; whenever the
        validation accuracy improves, the whole model is saved to
        ``save_path``. Per-epoch statistics end up in ``self.history``.
        """
        self.arguments['epochs'] = epochs
        self.arguments['save_path'] = save_path

        # torch.save does not create missing directories.
        save_dir = os.path.dirname(save_path)
        if save_dir:
            os.makedirs(save_dir, exist_ok=True)

        train_acc  = np.zeros(epochs)
        train_loss = np.zeros(epochs)
        val_acc    = np.zeros(epochs)
        val_loss   = np.zeros(epochs)
        train_time = np.zeros(epochs)

        # Start below any reachable accuracy so the first epoch always writes
        # a checkpoint, even if the validation accuracy is 0.
        best_val_acc = -np.inf
        for epoch in range(epochs):
            if verbose:
                epoch_start = f'Epoch ({epoch + 1}/{epochs})'
                print(epoch_start, end=' ')

            train_time[epoch] = self.train_epoch(train_loader)

            # evaluate the training accuracy and validation accuracy after each epoch
            train_acc[epoch], train_loss[epoch] = self.test(train_loader)
            val_acc[epoch], val_loss[epoch] = self.test(valid_loader)

            if val_acc[epoch] > best_val_acc:
                # save the best model according to validation accuracy
                best_val_acc = val_acc[epoch]
                torch.save(self.model, save_path)

            if verbose:
                print(f'Train Acc: {train_acc[epoch]:.4f}, Train Loss: {train_loss[epoch]:>7.6f}', end=', ')
                print(f'Val Acc: {val_acc[epoch]:.4f}, Val Loss: {val_loss[epoch]:>7.6f}', end=' -- ')
                print(f'Training Time: {train_time[epoch]:.2f}s')

        self.history = {'train_acc':  train_acc,
                        'train_loss': train_loss,
                        'val_acc':    val_acc,
                        'val_loss':   val_loss,
                        'time':       train_time}

    def train_epoch(self, train_loader):
        """Run one optimisation pass over ``train_loader``; return the seconds it took."""
        start = time()

        self.model.train()
        for data, label in train_loader:
            # Tensor.to returns a new tensor; the result has to be kept.
            data = data.to(self.device)
            label = label.to(self.device)

            self.optimizer.zero_grad()          # Start every step from clean gradients.
            out = self.model(data)              # Forward pass.
            loss = self.criterion(out, self._one_hot(label))
            loss.backward()                     # Derive gradients.
            self.optimizer.step()               # Update parameters.

        return time() - start

    def test(self, loader):
        """Return ``(accuracy, average loss)`` of the model over ``loader``.

        Both values are averaged over the samples actually iterated, so a
        loader built with ``drop_last=True`` is scored on what it yields.
        An empty loader gives ``(0.0, 0.0)``.
        """
        self.model.eval()

        total_loss = 0.0
        correct = 0
        seen = 0
        # A criterion with reduction='sum' already returns a per-batch total;
        # the default 'mean' has to be scaled back by the batch size.
        sum_reduced = getattr(self.criterion, 'reduction', 'mean') == 'sum'
        with torch.no_grad():                   # Evaluation needs no autograd graph.
            for data, label in loader:
                data = data.to(self.device)
                label = label.to(self.device)
                out = self.model(data)

                batch_size = label.size(0)
                batch_loss = self.criterion(out, self._one_hot(label)).item()
                total_loss += batch_loss if sum_reduced else batch_loss * batch_size

                pred = out.argmax(dim=1)        # Highest-scoring class.
                correct += int((pred == label).sum())
                seen += batch_size

        if seen == 0:
            return (0.0, 0.0)
        return (correct / seen, total_loss / seen)

    def load(self, path):
        """Replace the model with the one stored at ``path`` and set eval mode.

        SECURITY: the checkpoint is a pickled module (see ``train``), so
        loading it executes pickle code. Only load files you created.
        """
        try:
            # Recent torch versions default to weights_only=True, which
            # rejects whole-model checkpoints like the ones train() writes.
            self.model = torch.load(path, map_location=self.device, weights_only=False)
        except TypeError:
            # Older torch versions do not know the weights_only argument.
            self.model = torch.load(path, map_location=self.device)
        self.model.to(self.device)
        self.model.eval()

    def predict(self, loader):
        """Return the stacked model outputs (on CPU) for every batch of ``loader``.

        ``loader`` yields feature batches only (no labels).
        """
        self.model.eval()                       # Make sure dropout is disabled.
        preds = []
        with torch.no_grad():
            for data in loader:
                data = data.to(self.device)
                preds.append(self.model(data).cpu().detach())
        return torch.vstack(preds)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn import Linear
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

class MalwareDataset(Dataset):
    """Pairs a feature tensor with a label tensor, one sample per first-axis row."""

    def __init__(self, data, labels):
        self.data, self.labels = data, labels

    def __len__(self):
        # The number of samples is the size of the feature tensor's first axis.
        n_samples = self.data.size(0)
        return n_samples

    def __getitem__(self, idx):
        # Return the (features, label) pair stored at position idx.
        features = self.data[idx]
        target = self.labels[idx]
        return features, target

In [None]:
def split_df(df, n_or_frac, column='family', shuffle=True, allow_lower_n=False):
    """Split ``df`` into a per-group sample and the remaining rows.

    Rows are selected by position, not by index label, so the two parts
    always partition ``df`` even when its index contains duplicate labels
    (for example after ``pd.concat`` without ``ignore_index``).

    Args:
        df: frame to split.
        n_or_frac: an int samples that many rows from every group; any other
            number is treated as the fraction of each group to sample.
        column: name of the column whose values define the groups.
        shuffle: shuffle the rows of both returned frames.
        allow_lower_n: with an int ``n_or_frac``, take a whole group when it
            has fewer rows than requested instead of raising. It has no
            effect when ``n_or_frac`` is a fraction.

    Returns:
        ``(train_df, valid_df)``: the sampled rows and all other rows of
        ``df``, both keeping their original index labels.

    Raises:
        ValueError: from pandas, when an int ``n_or_frac`` exceeds a group's
            size and ``allow_lower_n`` is False.
    """
    # Positional view of the grouping column: its index is 0..len(df)-1.
    keys = df[column].reset_index(drop=True)
    groups = keys.groupby(keys)

    # bool is excluded explicitly because it is a subclass of int.
    wants_count = isinstance(n_or_frac, (int, np.integer)) and not isinstance(n_or_frac, bool)
    if wants_count:
        per_group = int(n_or_frac)
        if allow_lower_n:
            picks = [grp.sample(n=min(per_group, len(grp))).index.to_numpy()
                     for _, grp in groups]
            train_pos = np.concatenate(picks) if picks else np.empty(0)
        else:
            train_pos = groups.sample(n=per_group).index.to_numpy()
    else:
        train_pos = groups.sample(frac=n_or_frac).index.to_numpy()
    train_pos = train_pos.astype(np.intp)

    in_train = np.zeros(len(df), dtype=bool)
    in_train[train_pos] = True
    train_df = df.iloc[train_pos]
    valid_df = df.iloc[~in_train]

    if shuffle:
        train_df = train_df.sample(frac=1)
        valid_df = valid_df.sample(frac=1)
    return train_df, valid_df

In [None]:
# ignore_index=True gives every row a unique label. Both inputs come from
# separate CSV reads, so without it their index labels overlap and any
# label-based row selection would match a benign and a malicious row at once.
df = pd.concat([ben_df, mal_df], ignore_index=True)
# label_map = {label: i for i, label in enumerate(df.iloc[:, 1].value_counts().index)}
label_map = {i: i for i in range(9)}   # identity map over the labels 0..8
# 'family' is the mapped column-1 label; an unmapped label raises KeyError.
df = df.assign(family=df.iloc[:, 1].apply(lambda x: label_map[x]))
# 'malicious' is 1 for every family other than 0, else 0.
df = df.assign(malicious=(df.family!=0).astype(np.int64))
df

In [None]:
# Display the label -> family-id mapping used to fill the 'family' column.
label_map


In [None]:
## Classifier

# train_df, valid_df = split_df(df, n_or_frac=0.8 ,column='family', shuffle=True, allow_lower_n=True) # family
# train_df, test_df  = split_df(train_df, n_or_frac=10000, column='family', shuffle=True, allow_lower_n=True) # family
# print('Family: ')
# print('Train: ')
# print(train_df.family.value_counts())
# print()
# print('Valid: ')
# print(valid_df.family.value_counts())

# X_train = train_df.iloc[:, 2:9].to_numpy()
# Y_train = train_df.family.to_numpy()
# X_test  = valid_df.iloc[:, 2:9].to_numpy()
# Y_test  = valid_df.family.to_numpy()

In [None]:
## Detector

# Sample 80% of each 'malicious' group for training; the remaining rows are
# used for validation.
train_df, valid_df = split_df(df, n_or_frac=0.8, column='malicious', shuffle=True, allow_lower_n=True)  # detector

# Class balance of both parts.
print('Malicious: ', 'Train: ', sep='\n')
print(train_df.malicious.value_counts())
print()
print('Valid: ', valid_df.malicious.value_counts(), sep='\n')

# Features are the columns at positions 2..8; the target is the binary
# 'malicious' flag.
feature_cols = slice(2, 9)
X_train, Y_train = train_df.iloc[:, feature_cols].to_numpy(), train_df.malicious.to_numpy()
X_test, Y_test = valid_df.iloc[:, feature_cols].to_numpy(), valid_df.malicious.to_numpy()

In [None]:
# device     = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device     = torch.device('cpu')
batch_size = 64

# Features become float32 tensors, labels int64 tensors.
X_train_tensor, X_test_tensor = (torch.tensor(arr.astype(np.float32)) for arr in (X_train, X_test))
Y_train_tensor, Y_test_tensor = (torch.tensor(arr, dtype=torch.long) for arr in (Y_train, Y_test))   # detector

train_ds = MalwareDataset(X_train_tensor, Y_train_tensor)
valid_ds = MalwareDataset(X_test_tensor, Y_test_tensor)

# Training batches are shuffled and the last incomplete batch is dropped;
# validation keeps every sample in its stored order.
train_loader = DataLoader(
    train_ds,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=0,
    pin_memory=True,
)
valid_loader = DataLoader(
    valid_ds,
    batch_size=128,
    drop_last=False,
    num_workers=0,
)

In [None]:
# Size the network from the data: one input per feature column, one output
# per distinct training label.
n_features = X_train_tensor.size(1)
n_classes = Y_train_tensor.unique().size(0)
model = MLP(num_features=n_features, hidden_channels=64, num_classes=n_classes).to(device)
# model = MLPCls(num_features=X_train_tensor.size(1), hidden_channels=64, num_classes=Y_train_tensor.unique().size(0)).to(device)

# The model is placed on its device before its parameters are handed to the
# optimizer.
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001) # detector lr = 0.0008

for report in (f'Device: {device}\n',
               f'Model: \n{model}\n',
               f'Optimizer: \n{optimizer}\n',
               f'Criterion: {criterion}\n'):
    print(report)

In [None]:
# Train for 30 epochs; the trainer writes its checkpoint to madel_path.
madel_path = './mlp_md.pt'

trainer = TorchTrainer(model, optimizer=optimizer, criterion=criterion, device=device)
trainer.train(train_loader, valid_loader, epochs=30, save_path=madel_path, verbose=True)

In [None]:
# Two hand-made 7-value rows (all ones, all zeros) stacked into a 2x7 array.
fake, fake2 = np.ones(7), np.zeros(7)
np.vstack((fake, fake2))

#### MLP prediction

In [None]:
# Use the notebook's device so the trainer does not silently pick CUDA while
# everything else was set up for `device`.
predict_trainer = TorchTrainer(model, device=device)
# Load the checkpoint the training cell wrote (madel_path). The previous
# hard-coded './detection_model/mlp_md.pt' is a different file, so a fresh
# "Run All" would fail or score a stale model.
predict_trainer.load(madel_path)

# Label-free loaders in stored order, so predictions line up with
# Y_train_tensor / Y_test_tensor.
train_data_loader = DataLoader(X_train_tensor, batch_size=batch_size, num_workers=0, drop_last=False, shuffle=False)
print(train_data_loader.dataset.shape)
valid_data_loader = DataLoader(X_test_tensor, batch_size=batch_size, num_workers=0, drop_last=False, shuffle=False)

# The predicted class is the index of the highest model output.
train_pred = predict_trainer.predict(train_data_loader).argmax(dim=1)
valid_pred = predict_trainer.predict(valid_data_loader).argmax(dim=1)

In [None]:
def compute_metrices(y_true, y_pred, average='binary'):
    """Score predictions against the ground truth.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted labels.
        average: averaging mode forwarded to the precision, recall and f1
            scorers.

    Returns:
        dict with the keys 'accuracy', 'precision', 'recall' and 'f1'.
    """
    scores = {'accuracy': accuracy_score(y_true=y_true, y_pred=y_pred)}
    for name, scorer in (('precision', precision_score),
                         ('recall', recall_score),
                         ('f1', f1_score)):
        scores[name] = scorer(y_true=y_true, y_pred=y_pred, average=average)
    return scores

In [None]:
# Binary metrics of the MLP on the training split.
print('Train metrics')
compute_metrices(y_true=Y_train_tensor, y_pred=train_pred)

In [None]:
# Binary metrics of the MLP on the validation split.
print('Valid metrics')
compute_metrices(y_true=Y_test_tensor, y_pred=valid_pred)