In [69]:
import torch
from torch import nn
from torchmetrics import F1Score
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Android Malware Detection
## A Binary Classification Problem

The dataset can be found [here](https://www.kaggle.com/datasets/joebeachcapital/tuandromd/)

**Variables**:

1-214: Permission-based features
215-241: API based features

**Class Labels**

Class: 1) Malware 2) Goodware

### Importing Data

In [3]:
raw_data = pd.read_csv("./data/TUANDROMD.csv")

In [59]:
raw_data.dropna().describe()

Unnamed: 0,ACCESS_ALL_DOWNLOADS,ACCESS_CACHE_FILESYSTEM,ACCESS_CHECKIN_PROPERTIES,ACCESS_COARSE_LOCATION,ACCESS_COARSE_UPDATES,ACCESS_FINE_LOCATION,ACCESS_LOCATION_EXTRA_COMMANDS,ACCESS_MOCK_LOCATION,ACCESS_MTK_MMHW,ACCESS_NETWORK_STATE,...,Landroid/content/pm/PackageManager;->getInstalledPackages,Landroid/telephony/TelephonyManager;->getLine1Number,Landroid/telephony/TelephonyManager;->getNetworkOperator,Landroid/telephony/TelephonyManager;->getNetworkOperatorName,Landroid/telephony/TelephonyManager;->getNetworkCountryIso,Landroid/telephony/TelephonyManager;->getSimOperator,Landroid/telephony/TelephonyManager;->getSimOperatorName,Landroid/telephony/TelephonyManager;->getSimCountryIso,Landroid/telephony/TelephonyManager;->getSimSerialNumber,Lorg/apache/http/impl/client/DefaultHttpClient;->execute
count,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,...,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0,4464.0
mean,0.000896,0.000896,0.004928,0.083333,0.006944,0.087814,0.022849,0.023746,0.000224,0.596326,...,0.051971,0.173163,0.24552,0.21595,0.091622,0.08871,0.056228,0.133065,0.013665,0.15009
std,0.029924,0.029924,0.070037,0.276416,0.083053,0.283055,0.14944,0.152272,0.014967,0.490689,...,0.221994,0.378431,0.430443,0.411526,0.288524,0.284356,0.230387,0.339682,0.116108,0.357199
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


Not only the values are just 0 and 1, this matrix is very sparse.

In [19]:
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4465 entries, 0 to 4464
Columns: 242 entries, ACCESS_ALL_DOWNLOADS to Label
dtypes: float64(241), object(1)
memory usage: 8.2+ MB


In [46]:
raw_data[raw_data.isna().any(axis=1)] 
# there's only 1 row whose values have at leas an NaN

Unnamed: 0,ACCESS_ALL_DOWNLOADS,ACCESS_CACHE_FILESYSTEM,ACCESS_CHECKIN_PROPERTIES,ACCESS_COARSE_LOCATION,ACCESS_COARSE_UPDATES,ACCESS_FINE_LOCATION,ACCESS_LOCATION_EXTRA_COMMANDS,ACCESS_MOCK_LOCATION,ACCESS_MTK_MMHW,ACCESS_NETWORK_STATE,...,Landroid/telephony/TelephonyManager;->getLine1Number,Landroid/telephony/TelephonyManager;->getNetworkOperator,Landroid/telephony/TelephonyManager;->getNetworkOperatorName,Landroid/telephony/TelephonyManager;->getNetworkCountryIso,Landroid/telephony/TelephonyManager;->getSimOperator,Landroid/telephony/TelephonyManager;->getSimOperatorName,Landroid/telephony/TelephonyManager;->getSimCountryIso,Landroid/telephony/TelephonyManager;->getSimSerialNumber,Lorg/apache/http/impl/client/DefaultHttpClient;->execute,Label
2533,,,,,,,,,,,...,,,,,,,,,,


In [47]:
preprocessed_data = raw_data.dropna()
preprocessed_data.info() # 4464 entries, one less than before

<class 'pandas.core.frame.DataFrame'>
Index: 4464 entries, 0 to 4464
Columns: 242 entries, ACCESS_ALL_DOWNLOADS to Label
dtypes: float64(241), object(1)
memory usage: 8.3+ MB


In [48]:
preprocessed_data.head()
# label needs to be encoded.

Unnamed: 0,ACCESS_ALL_DOWNLOADS,ACCESS_CACHE_FILESYSTEM,ACCESS_CHECKIN_PROPERTIES,ACCESS_COARSE_LOCATION,ACCESS_COARSE_UPDATES,ACCESS_FINE_LOCATION,ACCESS_LOCATION_EXTRA_COMMANDS,ACCESS_MOCK_LOCATION,ACCESS_MTK_MMHW,ACCESS_NETWORK_STATE,...,Landroid/telephony/TelephonyManager;->getLine1Number,Landroid/telephony/TelephonyManager;->getNetworkOperator,Landroid/telephony/TelephonyManager;->getNetworkOperatorName,Landroid/telephony/TelephonyManager;->getNetworkCountryIso,Landroid/telephony/TelephonyManager;->getSimOperator,Landroid/telephony/TelephonyManager;->getSimOperatorName,Landroid/telephony/TelephonyManager;->getSimCountryIso,Landroid/telephony/TelephonyManager;->getSimSerialNumber,Lorg/apache/http/impl/client/DefaultHttpClient;->execute,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,malware
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,malware
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,malware
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,malware
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,malware


In [61]:
label_enc = LabelEncoder()
preprocessed_data["Label"] = label_enc.fit_transform(preprocessed_data["Label"])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  preprocessed_data["Label"] = label_enc.fit_transform(preprocessed_data["Label"])


In [62]:
preprocessed_data["Label"].value_counts()

Label
1    3565
0     899
Name: count, dtype: int64

This dataset only seems to need minimal preprocessing.

### Splitting The Data

In [88]:
# Separating features and label
X = torch.tensor(preprocessed_data.iloc[:,:-1].values, dtype=torch.float16)
# float16 uses less memory
y = torch.tensor(preprocessed_data[["Label"]].values, dtype=torch.float16)
# Create train/test split
train_split = int(0.8 * len(X)) # 80% of data used for training set, 20% for testing 
X_train, y_train = X[:train_split], y[:train_split]

X_test, y_test = X[train_split:], y[train_split:]

X_train.shape, y_train.shape, X_test.shape, y_test.shape

(torch.Size([3571, 241]),
 torch.Size([3571, 1]),
 torch.Size([893, 241]),
 torch.Size([893, 1]))

### Creating The Model

In [80]:
# Make device agnostic code
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [100]:
# 1. Construct a model class that subclasses nn.Module
class MalwareLinearClassifier(nn.Module):
    def __init__(self, bce_loss_with_logits:bool = True):
        super().__init__()
        self.train_loss_values = []
        self.test_loss_values = []
        self.bce_loss_with_logits = bce_loss_with_logits
        self.f1 = F1Score(task="binary", num_classes=2)
        # 2. Create 2 nn.Linear layers capable of handling X and y input and output shapes
        self.layer_1 = nn.Linear(in_features=241, out_features=5, dtype=torch.float16) 
        # takes in 241 features (X), produces 5 features
        # since the matrix is very sparse, I wonder if we can compress the information
        self.layer_2 = nn.Linear(in_features=5, out_features=1, dtype=torch.float16)
            # produces 1 feature, since we're in a binary classification problem
        if not bce_loss_with_logits:
            self.layer_3 = nn.Sigmoid(in_features=5, out_features=1)
        

    def forward(self, x):
        return  self.layer_2(self.layer_1(x))
    
    def train_loop(self, input: torch.Tensor, labels: torch.Tensor, 
                   loss_fn:nn.modules.loss._Loss, optimizer: torch.optim.Optimizer):
        self.train()
        # Forward pass on train data using the forward() method
        if self.bce_loss_with_logits:
            # bce_loss_with_logits works directly with logits
            pred = self(input)
        else:
            pred = torch.round(self(input)) 
        loss = loss_fn(pred, labels)
        f1_value = self.f1(pred, labels)
        # Backpropagation
        loss.backward()
        # optimize parameters
        optimizer.step()
        # restart cycle
        optimizer.zero_grad()
        return loss, f1_value
    
    def test_loop(self, input: torch.Tensor, labels: torch.Tensor, 
                  loss_fn:nn.modules.loss._Loss):
        self.eval()
        with torch.inference_mode():
            pred = self(input)
            loss = loss_fn(pred, labels)
            f1_value = self.f1(pred, labels)
        return loss, f1_value
    
    def train_model(self, input: torch.Tensor, labels: torch.Tensor,
                    loss_fn:nn.modules.loss._Loss, optimizer: torch.optim.Optimizer,
                    n_epochs:int = 100):
        for epoch in range(1,n_epochs+1):
            print(f"Epoch {epoch}\n-------------------------------")
            train_loss, train_f1 = self.train_loop(input = input, labels = labels, 
                                         loss_fn = loss_fn, optimizer = optimizer)
            self.train_loss_values.append(train_loss.detach().cpu().numpy())
            test_loss, test_f1 = self.test_loop(input = input, labels = labels, 
                                       loss_fn = loss_fn)
            self.test_loss_values.append(test_loss.detach().cpu().numpy())
            print(
                f"{loss_fn._get_name()} Train Loss: {train_loss} |"
                f"{loss_fn._get_name()} Test Loss: {test_loss}"
            )
            print(
                f"F1 Train Score: {train_f1}  | F1 Test Score: {test_f1}"
            )
        print("Done!")

In [101]:
# 4. Create an instance of the model and send it to target device
linear_cl = MalwareLinearClassifier().to(device)
linear_cl

MalwareLinearClassifier(
  (f1): BinaryF1Score()
  (layer_1): Linear(in_features=241, out_features=5, bias=True)
  (layer_2): Linear(in_features=5, out_features=1, bias=True)
)

In [102]:
linear_cl(X_train.to(device)) # simple litmus test to check whether it's working

tensor([[-0.2734],
        [-0.2812],
        [-0.2986],
        ...,
        [-0.3066],
        [-0.3188],
        [-0.3511]], device='cuda:0', dtype=torch.float16,
       grad_fn=<AddmmBackward0>)

In [103]:
# Create a loss function
# loss_fn = nn.BCELoss() # BCELoss = no sigmoid built-in
loss_fn = nn.BCEWithLogitsLoss() # BCEWithLogitsLoss = sigmoid built-in

# Create an optimizer
optimizer = torch.optim.SGD(params=linear_cl.parameters(), 
                            lr=0.1)


In [104]:
linear_cl.train_model(input= X_train.to(device), labels=y_train.to(device), 
                      loss_fn=loss_fn, optimizer=optimizer)

Epoch 1
-------------------------------
BCEWithLogitsLoss Train Loss: 0.8291015625 |BCEWithLogitsLoss Test Loss: 0.71533203125
F1 Train Score: 0.0  | F1 Test Score: 0.3769735097885132
Epoch 2
-------------------------------
BCEWithLogitsLoss Train Loss: 0.71533203125 |BCEWithLogitsLoss Test Loss: 0.62548828125
F1 Train Score: 0.3769735097885132  | F1 Test Score: 0.9489138722419739
Epoch 3
-------------------------------
BCEWithLogitsLoss Train Loss: 0.62548828125 |BCEWithLogitsLoss Test Loss: 0.552734375
F1 Train Score: 0.9489138722419739  | F1 Test Score: 0.9493876099586487
Epoch 4
-------------------------------
BCEWithLogitsLoss Train Loss: 0.552734375 |BCEWithLogitsLoss Test Loss: 0.49365234375
F1 Train Score: 0.9493876099586487  | F1 Test Score: 0.9497058987617493
Epoch 5
-------------------------------
BCEWithLogitsLoss Train Loss: 0.49365234375 |BCEWithLogitsLoss Test Loss: 0.4462890625
F1 Train Score: 0.9497058987617493  | F1 Test Score: 0.9497058987617493
Epoch 6
-------------

An F1 of +99%, with such a simple model seems too good to be true...
There could be some sort of label data leakage in the dataset, specially if we look at the test and train F1 score and notice that they are equal. This is highly suspicious.

See part 2, for a better explanation of what's happening here (spoiler: imbalanced classes)