In this exercise, we will use the ngrams toolset of the Natural Language ToolKit (NLTK) to extract N-gram features from "mkdir.exe". We also find the 50 most frequently repeated N-grams in the file.

In [39]:
#to install NLTK, uncomment and run the following line:
!pip install nltk



In [40]:
import collections
from nltk import ngrams
# Path of the binary analysed in the cells below (relative to the working directory).
file = "mkdir.exe"

In [41]:
def readFile(filePath):
    """Return the complete contents of the file at filePath as bytes.

    The file is opened in binary mode, so no text decoding is applied.
    """
    with open(filePath, mode="rb") as handle:
        return handle.read()

def byteSequenceToNgrams(byteSequence, n):
    """Return all n-grams of byteSequence as a list.

    The n-grams come from nltk's ``ngrams`` generator, which is fully
    consumed here so the caller receives a concrete list.
    """
    return list(ngrams(byteSequence, n))
    
def extractNgramCounts(file, N):
    """Count how often each N-gram occurs in the bytes of a binary file.

    file -- path of the file to read (passed to readFile)
    N    -- n-gram length (passed to byteSequenceToNgrams)

    Returns a collections.Counter mapping each N-gram to its frequency.
    """
    rawBytes = readFile(file)
    ngramCounts = collections.Counter()
    ngramCounts.update(byteSequenceToNgrams(rawBytes, N))
    return ngramCounts

In [42]:
# Count every 3-gram (trigram) of bytes in the target file.
extractedNgrams = extractNgramCounts(file, 3)

In [43]:
# Display the whole Counter: keys are 3-byte tuples, values are occurrence counts.
extractedNgrams

Counter({(77, 90, 144): 1,
         (90, 144, 0): 1,
         (144, 0, 3): 1,
         (0, 3, 0): 3,
         (3, 0, 0): 39,
         (0, 0, 0): 3232,
         (0, 0, 4): 10,
         (0, 4, 0): 8,
         (4, 0, 0): 42,
         (0, 0, 255): 9,
         (0, 255, 255): 5,
         (255, 255, 0): 33,
         (255, 0, 0): 7,
         (0, 0, 184): 12,
         (0, 184, 0): 1,
         (184, 0, 0): 1,
         (0, 0, 64): 7,
         (0, 64, 0): 10,
         (64, 0, 0): 91,
         (0, 0, 128): 26,
         (0, 128, 0): 4,
         (128, 0, 0): 7,
         (0, 0, 14): 3,
         (0, 14, 31): 1,
         (14, 31, 186): 1,
         (31, 186, 14): 1,
         (186, 14, 0): 1,
         (14, 0, 180): 1,
         (0, 180, 9): 1,
         (180, 9, 205): 1,
         (9, 205, 33): 1,
         (205, 33, 184): 1,
         (33, 184, 1): 1,
         (184, 1, 76): 1,
         (1, 76, 205): 1,
         (76, 205, 33): 1,
         (205, 33, 84): 1,
         (33, 84, 104): 1,
         (84, 104, 105): 2,

In [44]:
# Print the 50 most common 3-grams as (n-gram, count) pairs, most frequent first.
print(extractedNgrams.most_common(50))

[((0, 0, 0), 3232), ((255, 255, 255), 297), ((144, 144, 144), 261), ((0, 0, 139), 176), ((0, 0, 137), 156), ((199, 4, 36), 114), ((1, 0, 0), 112), ((255, 255, 139), 112), ((137, 68, 36), 104), ((137, 84, 36), 102), ((255, 255, 137), 102), ((66, 64, 0), 99), ((137, 76, 36), 98), ((64, 0, 0), 91), ((64, 0, 137), 87), ((36, 4, 137), 79), ((137, 4, 36), 78), ((36, 8, 137), 78), ((0, 0, 133), 78), ((0, 133, 192), 75), ((145, 64, 0), 75), ((254, 255, 255), 73), ((0, 0, 66), 73), ((0, 66, 64), 71), ((4, 36, 232), 67), ((0, 144, 144), 64), ((160, 0, 0), 63), ((32, 37, 115), 62), ((64, 0, 144), 59), ((144, 255, 37), 58), ((144, 144, 255), 58), ((2, 0, 0), 57), ((85, 137, 229), 57), ((0, 0, 199), 57), ((137, 52, 36), 57), ((0, 0, 160), 56), ((163, 0, 0), 55), ((137, 92, 36), 55), ((253, 255, 255), 53), ((36, 4, 232), 52), ((0, 160, 0), 50), ((0, 0, 141), 50), ((137, 116, 36), 48), ((137, 124, 36), 47), ((52, 36, 232), 47), ((0, 0, 131), 46), ((84, 36, 4), 45), ((64, 0, 139), 44), ((0, 0, 232), 4

**Exercise:** Find the largest value of N for which there exists at least one N-gram with a frequency of 3 or more. 

In [45]:
def checkHighN(file, N):
    """Return the N-gram frequency Counter for file at the given N.

    This does exactly what extractNgramCounts does, so it delegates to it
    instead of repeating the read / n-gram / count steps. The name is kept
    because the exercise cells call checkHighN.

    file -- path of the binary file to analyse
    N    -- n-gram length to test
    """
    return extractNgramCounts(file, N)

In [46]:
# N was located by hand: checkHighN(file, N) was run for
# N = 10, 20, 50, 100, 150, 175, 180, 185, 190 and 200 before settling on 186,
# described by the author as the point at which only one N-gram has a frequency above 3.
# NOTE(review): the exercise asks for the largest N with at least one N-gram of
# frequency 3 or more - TODO confirm 186 meets that criterion and that no larger N does.
checkN = checkHighN(file, 186)

In [47]:
# Display the full Counter of 186-grams and their counts (the output is very long).
checkN

Counter({(77,
          90,
          144,
          0,
          3,
          0,
          0,
          0,
          4,
          0,
          0,
          0,
          255,
          255,
          0,
          0,
          184,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          64,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          0,
          128,
          0,
          0,
          0,
          14,
          31,
          186,
          14,
          0,
          180,
          9,
          205,
          33,
          184,
          1,


In [48]:
# Print up to 50 of the most common 186-grams as (n-gram, count) pairs, most frequent first.
print(checkN.most_common(50))

[((0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0), 1411), ((0, 0, 0, 0, 0, 0, 80, 162, 0, 0, 92, 162, 0, 0, 102, 162, 0, 0, 112, 162, 0, 0, 122, 162, 0, 0, 130, 162, 0, 0, 140, 162, 0, 0, 150, 162, 0, 0, 160, 162, 0, 0, 168, 162, 0, 0, 176, 162, 0, 0, 184, 162, 0, 0, 212, 162, 0, 0, 230, 162, 0, 0, 2, 163, 0, 0, 10, 163, 0, 0, 20, 163, 0, 0, 30, 163, 0, 0, 40, 163, 0, 0, 50, 163, 0, 0, 58, 163, 0, 0, 66, 163, 0, 0, 76, 163, 0, 0, 86, 163, 0, 0, 94, 163, 0, 0, 104, 163, 0, 0, 114, 163,