# Imports

In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
import config
import builder
import transformer

# Fetch Dataset

The dataset is loaded into memory via _Reader_ (`builder.Reader`).
It reads the features from each malware class's _dump.json_, located in that class's folder.

In [3]:
# Build the feature list X and the integer label list Y used for training.
X = []
Y = []
dataset = builder.Reader()
# Label every class by its position in config.Classes. Prediction() decodes the
# model output with config.Classes[index] and num_class is len(config.Classes),
# so encoding with the same list keeps labels and decoded class names aligned
# regardless of the key order of the mapping returned by the reader.
for label, typeClass in enumerate(config.Classes):
    # Look the class up once and reuse it for both the features and the labels.
    samples = dataset[typeClass]
    X += samples
    # One label per sample fetched for this class.
    Y += [label] * len(samples)

[36m[*] Initiated dataset read.[00m
[34m[*] Reading dataset for backdoor.[00m
[91m[+] Dataset fetch for backdoor complete.[00m
[34m[*] Reading dataset for worm.[00m
[91m[+] Dataset fetch for worm complete.[00m
[34m[*] Reading dataset for trojan.[00m
[91m[+] Dataset fetch for trojan complete.[00m
[34m[*] Reading dataset for rootkit.[00m
[91m[+] Dataset fetch for rootkit complete.[00m
[34m[*] Reading dataset for virus.[00m
[91m[+] Dataset fetch for virus complete.[00m
[34m[*] Reading dataset for bot.[00m
[91m[+] Dataset fetch for bot complete.[00m
[34m[*] Reading dataset for ransomware.[00m
[91m[+] Dataset fetch for ransomware complete.[00m
[34m[*] Reading dataset for adware.[00m
[91m[+] Dataset fetch for adware complete.[00m
[34m[*] Reading dataset for downloader.[00m
[91m[+] Dataset fetch for downloader complete.[00m
[91m[+] Dataset loading complete.[00m


Convert the collected features and labels into NumPy arrays, from which the training and testing sets are formed.

In [4]:
# Materialise the collected samples and labels as float32 arrays.
Data = np.array(X, dtype=np.float32)
Labels = np.asarray(Y, dtype=np.float32)
# Report the dimensions of both arrays.
for caption, arr in (("Shape of data: ", Data), ("Shape of labels: ", Labels)):
    print(caption, arr.shape)

Shape of data:  (422, 2152)
Shape of labels:  (422,)


Load the data into dataframes.

In [5]:
# Wrap the feature and label arrays in dataframes.
X = pd.DataFrame(Data)
Y = pd.DataFrame(Labels)

# Sanity check: list the distinct label values that are present.
print(Y[0].unique())

# Join features and labels column-wise into one frame, then slice them back
# out of that single frame: the last column holds the labels, the rest the features.
frames = [X, Y]
DATA = pd.concat(frames, axis=1)
X, Y = DATA.iloc[:, :-1], DATA.iloc[:, -1]
print(Y.shape)
print(np.unique(Y))

[0. 1. 2. 3. 4. 5. 6. 7. 8.]
(422,)
[0. 1. 2. 3. 4. 5. 6. 7. 8.]


Split the dataset for training and testing.

In [6]:
# Hold out 20% of the samples for testing (fixed seed), converting every
# part of the split to a NumPy array.
x_train, x_test, y_train, y_test = (
    np.array(part)
    for part in train_test_split(X, Y, test_size=0.2, random_state=42)
)

# Report the shape of each part.
print("Training data:", np.shape(x_train))
print("Training labels:", np.shape(y_train))
print("Testing data:", np.shape(x_test))
print("Testing labels:", np.shape(y_test))

Training data: (337, 2152)
Training labels: (337,)
Testing data: (85, 2152)
Testing labels: (85,)


In [7]:
# Summarise the split: rows are samples, columns are features.
print("Number of samples for training:", x_train.shape[0])
print("Number of samples for testing:", x_test.shape[0])
print("Number of features for each sample:", x_train.shape[1])

Number of samples for training: 337
Number of samples for testing: 85
Number of features for each sample: 2152


# Create Model

Set the parameters for the model.

In [8]:
# LightGBM training parameters, collected in a single literal.
params = {
    "learning_rate": 0.05,
    "boosting_type": "gbdt",
    "objective": "multiclass",
    # One output per configured malware class.
    "num_class": len(config.Classes),
    "metric": "multi_logloss",
    "sub_feature": 0.3,
    "num_leaves": 15,
    "min_data": 95,
    "max_depth": 15,
    "device": "cpu",
}

Train the model.

In [9]:
# Setup the dataset for training.
d_train = lgb.Dataset(x_train, label = y_train)

# Train the model based on the aforementioned dataset.
clf = lgb.train(params, d_train, 100)

Check accuracy on test set.

In [10]:
# Per-class scores for every test sample (one row per sample).
y_pred = clf.predict(x_test)
# The predicted label of a sample is the index of its highest score.
best_preds = list(np.argmax(y_pred, axis=1))
# Fraction of test samples whose predicted label matches the true label.
accuracy_score(y_test, best_preds)

0.7376470588235295

Save the model

In [11]:
# Persist the trained booster to disk so it can be reloaded for prediction.
clf.save_model(filename="model.mdl")

<lightgbm.basic.Booster at 0x7fe94a227d00>

# Prediction

Define a function to make a prediction for a given input.

In [12]:
def Prediction(pe_file, mdlFile):
    """Predict the malware class of a single PE sample.

    Args:
        pe_file: The PE sample, forwarded unchanged to
            transformer.PETransformer. The sample call in this notebook
            passes the raw bytes of the file.
            NOTE(review): confirm this is the input PETransformer expects.
        mdlFile: Path of a saved LightGBM model file.

    Returns:
        The entry of config.Classes whose predicted score is the highest.
    """
    predictor = lgb.Booster(model_file=mdlFile)
    # Build the feature vector from the argument. This previously read the
    # notebook-level variable `peFile`, so the argument was ignored and the
    # function only worked if that variable happened to exist in the kernel.
    transformed = transformer.PETransformer(pe_file).vector
    # Present the vector as a single row; the column count is taken from the
    # vector itself instead of a hard-coded feature count.
    preds = predictor.predict(transformed.reshape(1, -1))
    # Map the index of the highest class score back to its class name.
    return config.Classes[np.argmax(preds)]

Sample prediction.

In [13]:
peFile = "dataset/trojan/04eacd2031de21c56ccec496e1b5ed68"

# Read the raw bytes of the sample. The context manager closes the file
# handle as soon as the read finishes instead of leaving it open.
with open(peFile, "rb") as pe_handle:
    data = pe_handle.read()
# Predict the class of the file.
print(Prediction(data, "model.mdl"))

trojan
