In this exercise, we perform feature extraction, selection and vectorization for binary files using the frequency, mutual information, and Chi test methods available in the SKLearn library.

In [1]:
#To install sklearn, uncomment and run the following line:
!pip install sklearn

Collecting sklearn
  Downloading sklearn-0.0.tar.gz (1.1 kB)
Collecting scikit-learn
  Downloading scikit_learn-0.22.2.post1-cp36-cp36m-manylinux1_x86_64.whl (7.1 MB)
[K     |████████████████████████████████| 7.1 MB 2.0 MB/s eta 0:00:01     |████████▉                       | 1.9 MB 2.0 MB/s eta 0:00:03     |████████████▉                   | 2.8 MB 2.0 MB/s eta 0:00:03     |███████████████████████████▉    | 6.1 MB 2.0 MB/s eta 0:00:01
[?25hCollecting scipy>=0.17.0
  Downloading scipy-1.4.1-cp36-cp36m-manylinux1_x86_64.whl (26.1 MB)
[K     |████████████████████████████████| 26.1 MB 46.5 MB/s eta 0:00:01   |▋                               | 512 kB 912 kB/s eta 0:00:28     |█                               | 860 kB 912 kB/s eta 0:00:28     |███                             | 2.4 MB 912 kB/s eta 0:00:26     |█████                           | 4.1 MB 912 kB/s eta 0:00:25     |█████████                       | 7.3 MB 912 kB/s eta 0:00:21     |█████████▊                      | 7.9 MB 912 kB/s 

In [3]:
import collections
from nltk import ngrams
import numpy as np

def readFile(filePath):
    with open(filePath, "rb") as binary_file:
        data = binary_file.read()
    return data

def byteSequenceToNgrams(byteSequence, n):
    Ngrams = ngrams(byteSequence, n)
    return list(Ngrams)
    
def extractNgramCounts(file, N):
    fileByteSequence = readFile(file)
    fileNgrams = byteSequenceToNgrams(fileByteSequence, N)
    return collections.Counter(fileNgrams)

In [4]:
from os import listdir
from os.path import isfile, join
directories = ["Samples/Benign", "Samples/Malware"]
#Assign the value of the N in N-grams
N=2

In [5]:
#This may take a few minutes to run
totalNgramCount = collections.Counter([])
for datasetPath in directories:
    samples = [f for f in listdir(datasetPath) if isfile(join(datasetPath,f))]
    for file in samples:
        filePath = join(datasetPath, file)
        totalNgramCount += extractNgramCounts(filePath, N)

In [6]:
#Extract the list of top 1000 most common N-Grams
K1 = 1000
K1_most_common_Ngrams = totalNgramCount.most_common(K1)
K1_most_common_Ngrams_list = [x[0] for x in K1_most_common_Ngrams]

In [7]:
K1_most_common_Ngrams_list

[(0, 0),
 (255, 255),
 (204, 204),
 (2, 100),
 (1, 0),
 (0, 139),
 (131, 196),
 (2, 0),
 (68, 36),
 (139, 69),
 (0, 131),
 (255, 117),
 (133, 192),
 (255, 139),
 (254, 255),
 (141, 77),
 (139, 77),
 (255, 21),
 (7, 0),
 (69, 252),
 (76, 36),
 (8, 139),
 (0, 1),
 (4, 0),
 (46, 46),
 (137, 69),
 (4, 139),
 (141, 69),
 (255, 131),
 (0, 137),
 (0, 255),
 (51, 192),
 (80, 232),
 (255, 141),
 (85, 139),
 (8, 0),
 (0, 232),
 (3, 100),
 (0, 116),
 (15, 182),
 (80, 141),
 (139, 236),
 (100, 0),
 (15, 132),
 (12, 139),
 (255, 0),
 (84, 36),
 (65, 68),
 (64, 0),
 (73, 78),
 (80, 65),
 (68, 68),
 (253, 255),
 (78, 71),
 (68, 73),
 (0, 204),
 (16, 0),
 (198, 69),
 (192, 116),
 (199, 69),
 (3, 0),
 (80, 255),
 (4, 137),
 (204, 139),
 (139, 68),
 (116, 36),
 (101, 0),
 (139, 76),
 (100, 139),
 (106, 0),
 (64, 2),
 (196, 12),
 (0, 8),
 (139, 70),
 (2, 101),
 (36, 8),
 (0, 89),
 (196, 4),
 (117, 8),
 (32, 0),
 (69, 8),
 (86, 139),
 (95, 94),
 (0, 16),
 (131, 192),
 (0, 80),
 (139, 255),
 (0, 117),
 (10

In [8]:
# Create a vector fv[] of the most frequent n-grams
def featurizeSample(file, K1_most_common_Ngrams_list):
    K1 = len(K1_most_common_Ngrams_list)
    fv = K1*[0]
    fileNgrams = extractNgramCounts(file, N)
    for i in range(K1):
        fv[i]=fileNgrams[K1_most_common_Ngrams_list[i]]
    return fv

In [9]:
# Create a labeled dataset of frequency-based feature vectors 
# for benign and malware samples.
# This may take a few minutes to run.
directoriesWithLabels = [("Samples/Benign",0), ("Samples/Malware",1)]
X = []
y = []
fileNum = 0
for datasetPath, label in directoriesWithLabels:
    samples = [f for f in listdir(datasetPath) if isfile(join(datasetPath,f))]
    for file in samples:
        fileNum +=1
        filePath = join(datasetPath, file)
        X.append(featurizeSample(filePath, K1_most_common_Ngrams_list))
        y.append(label)

In [10]:
X = np.asarray(X)

In [11]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif, chi2
K2 = 100

In [12]:
#Select the 100 most frequent features in each sample
X_top_K2_freq = X[:,:K2]

In [13]:
#Feature selection based on the mutual information method
mi_selector = SelectKBest(mutual_info_classif, k=K2)
X_top_K2_mi = mi_selector.fit_transform(X, y)

In [14]:
#Feature selection based on the Chi test method
chi2_selector = SelectKBest(chi2, k=K2)
X_top_K2_ch2 = chi2_selector.fit_transform(X, y)

**Exercise:** How many feature vectors in X_top_K2_ch2 are the same as their corresponding feature vectors in X_top_K2_freq? (i.e., X_top_K2_ch2[i] == X_top_K2_freq[i]). Calculate the same between X_top_K2_mi and X_top_K2_ch2, as well as between X_top_K2_mi and X_top_K2_freq.

In [15]:
type(X_top_K2_ch2)

numpy.ndarray

In [27]:
chifreq = np.in1d(X_top_K2_ch2, X_top_K2_freq)
chifreq2 = np.intersect1d(X_top_K2_ch2, X_top_K2_freq)

In [29]:
chimi = np.in1d(X_top_K2_ch2, X_top_K2_mi)
chimi2 = np.intersect1d(X_top_K2_ch2, X_top_K2_mi)

In [30]:
freqmi = np.in1d(X_top_K2_freq, X_top_K2_mi)
freqmi2 = np.intersect1d(X_top_K2_freq, X_top_K2_mi)

In [31]:
print("Between the Frequency and the Chi test, the following represents how many correspond: ", chifreq2.size)
print("Between the Frequency and Mutual Information, the following represents how many correspond: ", freqmi2.size)
print("Between the Chi test and Mutual Information, the following represents how many correspond: ", chimi2.size)

Between the Frequency and the Chi test, the following represents how many correspond:  1519
Between the Frequency and Mutual Information, the following represents how many correspond:  356
Between the Chi test and Mutual Information, the following represents how many correspond:  352
