In this exercise, we use a labeled dataset of run-time API calls by Android apps to train a dynamic malware classifier.

In [1]:
import numpy as np
import os
import json 

def getAPICallsFromLog(logPath):
    apiCallsSequence = []
    with open(logPath) as f:
        data = json.load(f)
        api_calls_array = "[" + data['api_calls'] + "]"
        api_calls = json.loads(api_calls_array)
        for api_call in api_calls:
            call = api_call['class']+":"+api_call['method']
            apiCallsSequence.append(call)
    return apiCallsSequence
    
directoriesWithLabels = [("DA Logs/Benign",0), ("DA Logs/Malware",1)]
corpus = []
y = []
for directory, label in directoriesWithLabels:
    files = os.listdir(directory)
    for file in files:
        filePath = directory+"/"+file
        corpus.append(getAPICallsFromLog(filePath))
        y.append(label)

In [2]:
print(corpus[0])

['android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'libcore.io.IoBridge:open', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'libcore.io.IoBridge:open', 'android.os.SystemProperties:get', 'libcore.io.IoBridge:open', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.app.Activity:startActivity', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'android.os.SystemProperties:get', 'a

In [3]:
from sklearn.model_selection import train_test_split
corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.33, random_state=42)

In [4]:
import collections
from nltk import ngrams
import numpy as np

def textToNgrams(text, n):
    Ngrams = ngrams(text, n)
    return list(Ngrams)
    
def extractNgramCounts(text, N):
    Ngrams = textToNgrams(text, N)
    return collections.Counter(Ngrams)

def getNGramFeaturesFromSample(file, K1_most_common_Ngrams_list):
    K1 = len(K1_most_common_Ngrams_list)
    fv = K1*[0]
    fileNgrams = extractNgramCounts(file, N)
    for i in range(K1):
        fv[i]=fileNgrams[K1_most_common_Ngrams_list[i]]
    return fv

In [5]:
N=3
K1 = 2000
totalNgramCount = collections.Counter([])
for file in corpus_train:
    totalNgramCount += extractNgramCounts(file, N)
K1_most_common_Ngrams = totalNgramCount.most_common(K1)
K1_most_common_Ngrams_list = [x[0] for x in K1_most_common_Ngrams]

In [6]:
K1_most_common_Ngrams_list

[('java.lang.reflect.Method:invoke',
  'java.lang.reflect.Method:invoke',
  'java.lang.reflect.Method:invoke'),
 ('android.content.ContentValues:put',
  'android.content.ContentValues:put',
  'android.content.ContentValues:put'),
 ('java.io.FileInputStream:read',
  'java.io.FileInputStream:read',
  'java.io.FileInputStream:read'),
 ('libcore.io.IoBridge:open',
  'libcore.io.IoBridge:open',
  'libcore.io.IoBridge:open'),
 ('android.os.SystemProperties:get',
  'android.os.SystemProperties:get',
  'android.os.SystemProperties:get'),
 ('android.util.Base64:decode',
  'android.util.Base64:decode',
  'android.util.Base64:decode'),
 ('java.net.URL:openConnection',
  'java.net.URL:openConnection',
  'java.net.URL:openConnection'),
 ('java.lang.reflect.Method:invoke',
  'libcore.io.IoBridge:open',
  'java.lang.reflect.Method:invoke'),
 ('libcore.io.IoBridge:open',
  'java.lang.reflect.Method:invoke',
  'java.lang.reflect.Method:invoke'),
 ('java.lang.reflect.Method:invoke',
  'java.lang.reflect

In [7]:
def featurizeSample(file, Ngrams_list):
    K1 = len(Ngrams_list)
    fv = K1*[0]
    fileNgrams = extractNgramCounts(file, N)
    for i in range(K1):
        fv[i]=fileNgrams[Ngrams_list[i]]
    return fv

In [8]:
X_train = []
for sample in corpus_train:
    X_train.append(featurizeSample(sample, K1_most_common_Ngrams_list))
X_train = np.asarray(X_train)
X_test = []
for sample in corpus_test:
    X_test.append(featurizeSample(sample, K1_most_common_Ngrams_list))
X_test = np.asarray(X_test)

In [9]:
print(X_train.shape)
print(X_test.shape)

(1577, 2000)
(778, 2000)


In [10]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
K2 = 1000
mi_rf_pipeline = Pipeline([('mi_selector', SelectKBest(mutual_info_classif, k=K2)),('rf', RandomForestClassifier(n_estimators=100)),])

In [16]:
mi_rf_pipeline.fit(X_train,y_train)
print("Training accuracy:")
print(mi_rf_pipeline.score(X_train, y_train))
print("Testing accuracy:")
print(mi_rf_pipeline.score(X_test, y_test))

Training accuracy:
0.9733671528218135
Testing accuracy:
0.8817480719794345


**Exercise 1:** Analyze the dataset and comment on whether there is imbalance between the classes. If so, use the BalancedBaggingClassifier method of imblearn to train the model, and report the results.

In [None]:
#Uncomment the following line if install is necessary.
#!pip install imbalanced-learn

In [13]:
'''
There is an imbalance between the classes as X_train has further features identified than does X_test,
which is shown by the shape of the numpy arrays above.

Below the BalancedBaggingClassifier is used to show that the accuracy of this method is lower than
the training accuracy or testing accuracy above would suggest.
'''
from sklearn.metrics import balanced_accuracy_score
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
BBC = BalancedBaggingClassifier(base_estimator=DecisionTreeClassifier(),sampling_strategy='auto',replacement=False)
BBC.fit(X_train, y_train) 
BBCPred = BBC.predict(X_test)
print(collections.Counter(BBCPred))
print(balanced_accuracy_score(y_test, BBCPred))  

Counter({0: 565, 1: 213})
0.797787140213411


**Exercise 2:** Generate the confusion matrix of the original classifier. What is the FPR? Train a logistic regression model on the same dataset to satisfy an FPR constraint of 3%.

In [14]:
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

desiredFPR = 0.03

def FPR(y_true, y_pred):
    CM = confusion_matrix(y_true, y_pred)
    TN = CM[0][0]
    FP = CM[0][1]
    FPR = FP/(FP+TN)
    return FPR

def TPR(y_true, y_pred):
    CM = confusion_matrix(y_true, y_pred)
    TP = CM[1][1]
    FN = CM[1][0]
    TPR = TP/(TP+FN)
    return TPR

def thresholdVector(vector, threshold):
    return [0 if x>=threshold else 1 for x in vector]

LR=LogisticRegression()
LR.fit(X_train,y_train)
LRPredProb = LR.predict_proba(X_train)
print("Probabilities look like so:", LRPredProb[0:5])
print()

M = 100
print("Testing thresholds:")
for threshold in reversed(range(M)):
    thresholdScaled = float(threshold)/M
    thresholdedPrediction = thresholdVector(LRPredProb[:,0], thresholdScaled)    
    print(threshold, FPR(y_train, thresholdedPrediction), TPR(y_train, thresholdedPrediction))
    if FPR(y_train,thresholdedPrediction)<desiredFPR:
        print("Selected threshold: ", thresholdScaled)
        break

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Probabilities look like so: [[0.61001031 0.38998969]
 [0.99430912 0.00569088]
 [0.00566236 0.99433764]
 [0.99180439 0.00819561]
 [0.28959057 0.71040943]]

Testing thresholds:
99 0.7918552036199095 0.9872881355932204
98 0.7475113122171946 0.9851694915254238
97 0.7203619909502262 0.9851694915254238
96 0.6995475113122172 0.9830508474576272
95 0.6787330316742082 0.9809322033898306
94 0.6615384615384615 0.9788135593220338
93 0.6470588235294118 0.9788135593220338
92 0.6316742081447964 0.9788135593220338
91 0.6144796380090498 0.9766949152542372
90 0.6081447963800904 0.9766949152542372
89 0.5963800904977375 0.9766949152542372
88 0.5882352941176471 0.9745762711864406
87 0.5782805429864254 0.9703389830508474
86 0.5656108597285068 0.9661016949152542
85 0.5529411764705883 0.9639830508474576
84 0.5457013574660633 0.961864406779661
83 0.5339366515837104 0.961864406779661
82 0.5257918552036199 0.9597457627118644
81 0.5158371040723982 0.9533898305084746
80 0.5076923076923077 0.951271186440678
79 0.497