# Hyperparameters

In [None]:
# Directory containing the pickled input files; it is handed to the
# sentence iterators defined further down.
path = './benignData/'

# Settings forwarded to the gensim model constructors in the cells below.
# NOTE: the training loops use `numFeatures` as their loop variable, so this
# initial value is rebound as soon as one of those cells runs.
numFeatures = 256   # embedding dimensionality (size / vector_size)
minWordCount = 50   # passed as min_count
windowSize = 10     # passed as window
numWorkers = 7      # passed as workers

epochs = 10         # passed as iter / epochs
lr = 0.0002         # NOTE(review): not referenced by any visible cell

# Read the Assembly Files

In [None]:
import os
import pickle
import numpy as np
import tqdm

In [None]:
# data = []
# for fileName in os.listdir(path):
#     filePath = path + fileName
#     if os.path.isfile(filePath):
#         d = pickle.load(open(filePath, 'rb'))
#         for i in d:
#             for j in i:
#                 data.append(j)
#                 
# print('The number of data is ' + str(len(data)))

# Embedding

In [None]:
from sklearn.manifold import TSNE
import matplotlib.pyplot as plt
import gensim 
import pandas as pd
import time

In [None]:
def drawResult(model, fileName=None):
    """Plot a 2-D t-SNE projection of every word vector held by ``model``.

    Each vocabulary entry is drawn as a scatter point annotated with the
    word itself.

    Args:
        model: Trained gensim model exposing its word vectors as ``model.wv``.
        fileName: Optional output path *without* extension. When given it is
            used as the plot title and the figure is written to
            ``fileName + '.png'``. When ``None`` the figure is only displayed.
    """
    wv = model.wv
    # gensim >= 4 removed the ``vocab`` dict in favour of ``index_to_key``;
    # fall back to ``vocab`` for the 3.x API.
    if hasattr(wv, 'index_to_key'):
        vocab = list(wv.index_to_key)
    else:
        vocab = list(wv.vocab)

    X = wv[vocab]
    tsneX = TSNE(n_components=2).fit_transform(X)
    df = pd.DataFrame(tsneX, index=vocab, columns=['x', 'y'])

    fig = plt.figure(figsize=(60, 60))
    ax = fig.add_subplot(1, 1, 1)
    ax.scatter(df['x'], df['y'])

    for word, pos in df.iterrows():
        ax.annotate(word, pos, fontsize=30)

    if fileName is not None:
        ax.set_title(fileName)
        # Save BEFORE show(): several backends tear the figure down once it
        # has been shown, which would leave an empty image on disk.
        fig.savefig(fileName + '.png', dpi=300)

    plt.show()
    # This function is called once per trained model; release the very large
    # canvas explicitly so figures do not pile up in memory.
    plt.close(fig)

## Word2Vec

In [None]:
class Word2VecSentense(object):
    """Re-iterable stream of sentences read from pickled files in a directory.

    Every regular file directly inside ``dirname`` is unpickled and treated
    as a two-level nested iterable; each innermost element is yielded as one
    sentence. Sub-directories are skipped. Because the files are re-read on
    every ``__iter__`` call, the object can be iterated multiple times.

    Args:
        dirname: Directory that contains the pickled files.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fileName in os.listdir(self.dirname):
            # Build the path from the directory this instance was given, not
            # from an unrelated module-level variable.
            filePath = os.path.join(self.dirname, fileName)
            if not os.path.isfile(filePath):
                continue
            # NOTE(security): pickle can execute arbitrary code while loading;
            # only point this at trusted data.
            with open(filePath, 'rb') as handle:
                d = pickle.load(handle)
            for i in d:
                for j in i:
                    yield j
                        
class EpochLogger(gensim.models.callbacks.CallbackAny2Vec):
    """Training callback that prints each epoch's index and its duration.

    ``on_epoch_begin`` prints ``Epoch <n> `` without a newline and records the
    start time; ``on_epoch_end`` completes the line with the elapsed whole
    seconds.
    """

    def __init__(self):
        self.beginTime = 0  # time.time() stamp taken when the epoch started
        self.epoch = 0      # index that will be printed for the next epoch

    def on_epoch_begin(self, model):
        label = 'Epoch {} '.format(self.epoch)
        print(label, end='')
        self.epoch = self.epoch + 1
        self.beginTime = time.time()

    def on_epoch_end(self, model):
        elapsedSeconds = int(time.time() - self.beginTime)
        print('- {} sec'.format(elapsedSeconds))

### Skip-Gram (SG)

In [None]:
# Train one skip-gram (sg=1) Word2Vec model per embedding size, save it,
# plot it, and print the nearest neighbours of three probe tokens.
for numFeatures in [8, 16, 32, 64, 128, 256]:
    print('----------------------------------------')
    print('----------------- [{}] -----------------'.format(numFeatures))
    print('----------------------------------------')

    epochLogger = EpochLogger()
    fileName = './result/SG_{}feature_{}window_{}minword_{}epochs'.format(numFeatures, windowSize, minWordCount, epochs)
    # Create the output directory up front: otherwise save() only fails
    # after the whole training run has already been paid for.
    os.makedirs(os.path.dirname(fileName), exist_ok=True)

    SG = gensim.models.Word2Vec(
        Word2VecSentense(path),
        size=numFeatures,
        window=windowSize,
        min_count=minWordCount,
        workers=numWorkers,
        iter=epochs,
        sg=1,
        callbacks=[epochLogger],
    )
    SG.save(fileName)
    drawResult(SG, fileName)

    for probe in ('add', 'mov', 'jmp'):
        print('\n---------- {} ----------'.format(probe))
        print(SG.wv.most_similar(probe))

### Continuous Bag Of Words (CBOW)

In [None]:
# Train one CBOW (sg=0) Word2Vec model per embedding size, save it, plot it,
# and print the nearest neighbours of three probe tokens.
for numFeatures in [8, 16, 32, 64, 128, 256]:
    print('----------------------------------------')
    print('----------------- [{}] -----------------'.format(numFeatures))
    print('----------------------------------------')

    epochLogger = EpochLogger()
    fileName = './result/CBOW_{}feature_{}window_{}minword_{}epochs'.format(numFeatures, windowSize, minWordCount, epochs)
    # Create the output directory up front: otherwise save() only fails
    # after the whole training run has already been paid for.
    os.makedirs(os.path.dirname(fileName), exist_ok=True)

    CBOW = gensim.models.Word2Vec(
        Word2VecSentense(path),
        size=numFeatures,
        window=windowSize,
        min_count=minWordCount,
        workers=numWorkers,
        iter=epochs,
        sg=0,
        callbacks=[epochLogger],
    )
    CBOW.save(fileName)
    drawResult(CBOW, fileName)

    for probe in ('add', 'mov', 'jmp'):
        print('\n---------- {} ----------'.format(probe))
        print(CBOW.wv.most_similar(probe))

## Doc2Vec

In [None]:
class Doc2VecSentense(object):
    """Re-iterable stream of tagged documents read from pickled files.

    Every regular file directly inside ``dirname`` is unpickled and treated
    as a two-level nested iterable; each innermost element is yielded as a
    ``TaggedDocument`` whose single tag is the name of the file it came from.
    Sub-directories are skipped.

    Args:
        dirname: Directory that contains the pickled files.
    """

    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fileName in os.listdir(self.dirname):
            # Build the path from the directory this instance was given, not
            # from an unrelated module-level variable.
            filePath = os.path.join(self.dirname, fileName)
            if not os.path.isfile(filePath):
                continue
            # NOTE(security): pickle can execute arbitrary code while loading;
            # only point this at trusted data.
            with open(filePath, 'rb') as handle:
                d = pickle.load(handle)
            for i in d:
                for j in i:
                    # ``tags`` must be a list: a bare string is iterated
                    # character by character, turning every letter of the
                    # file name into its own document tag.
                    yield gensim.models.doc2vec.TaggedDocument(j, [fileName])

### Distributed-Memory (DM)

In [None]:
# Train one distributed-memory (dm=1) Doc2Vec model per embedding size, save
# it, plot it, and print the nearest neighbours of three probe tokens.
for numFeatures in [8, 16, 32, 64, 128, 256]:
    print('----------------------------------------')
    print('----------------- [{}] -----------------'.format(numFeatures))
    print('----------------------------------------')

    epochLogger = EpochLogger()
    fileName = './result/DM_{}feature_{}window_{}minword_{}epochs'.format(numFeatures, windowSize, minWordCount, epochs)
    # Create the output directory up front: otherwise save() only fails
    # after the whole training run has already been paid for.
    os.makedirs(os.path.dirname(fileName), exist_ok=True)

    DM = gensim.models.doc2vec.Doc2Vec(
        Doc2VecSentense(path),
        vector_size=numFeatures,
        window=windowSize,
        min_count=minWordCount,
        workers=numWorkers,
        epochs=epochs,
        dm=1,
        callbacks=[epochLogger],
    )
    DM.save(fileName)
    drawResult(DM, fileName)

    for probe in ('add', 'mov', 'jmp'):
        print('\n---------- {} ----------'.format(probe))
        print(DM.wv.most_similar(probe))

### Distributed Bag Of Words (DBOW)

In [None]:
# Train one distributed-bag-of-words (dm=0) Doc2Vec model per embedding size,
# save it, plot it, and print the nearest neighbours of three probe tokens.
for numFeatures in [8, 16, 32, 64, 128, 256]:
    print('----------------------------------------')
    print('----------------- [{}] -----------------'.format(numFeatures))
    print('----------------------------------------')

    epochLogger = EpochLogger()
    fileName = './result/DBOW_{}feature_{}window_{}minword_{}epochs'.format(numFeatures, windowSize, minWordCount, epochs)
    # Create the output directory up front: otherwise save() only fails
    # after the whole training run has already been paid for.
    os.makedirs(os.path.dirname(fileName), exist_ok=True)

    DBOW = gensim.models.doc2vec.Doc2Vec(
        Doc2VecSentense(path),
        vector_size=numFeatures,
        window=windowSize,
        min_count=minWordCount,
        workers=numWorkers,
        epochs=epochs,
        dm=0,
        callbacks=[epochLogger],
    )
    DBOW.save(fileName)
    drawResult(DBOW, fileName)

    for probe in ('add', 'mov', 'jmp'):
        print('\n---------- {} ----------'.format(probe))
        print(DBOW.wv.most_similar(probe))

## FastText

In [None]:
# Train one skip-gram (sg=1) FastText model per embedding size, save it,
# plot it, and print the nearest neighbours of three probe tokens.
for numFeatures in [8, 16, 32, 64, 128, 256]:
    print('----------------------------------------')
    print('----------------- [{}] -----------------'.format(numFeatures))
    print('----------------------------------------')

    epochLogger = EpochLogger()
    fileName = './result/FT_{}feature_{}window_{}minword_{}epochs'.format(numFeatures, windowSize, minWordCount, epochs)
    # Create the output directory up front: otherwise save() only fails
    # after the whole training run has already been paid for.
    os.makedirs(os.path.dirname(fileName), exist_ok=True)

    FT = gensim.models.FastText(
        Word2VecSentense(path),
        size=numFeatures,
        window=windowSize,
        min_count=minWordCount,
        workers=numWorkers,
        iter=epochs,
        sg=1,
        callbacks=[epochLogger],
    )
    FT.save(fileName)
    drawResult(FT, fileName)

    for probe in ('add', 'mov', 'jmp'):
        print('\n---------- {} ----------'.format(probe))
        print(FT.wv.most_similar(probe))

In [None]:
# Train one CBOW (sg=0) FastText model per embedding size, save it, plot it,
# and print the nearest neighbours of three probe tokens.
for numFeatures in [8, 16, 32, 64, 128, 256]:
    print('----------------------------------------')
    print('----------------- [{}] -----------------'.format(numFeatures))
    print('----------------------------------------')

    epochLogger = EpochLogger()
    fileName = './result/FT_CBOW_{}feature_{}window_{}minword_{}epochs'.format(numFeatures, windowSize, minWordCount, epochs)
    # Create the output directory up front: otherwise save() only fails
    # after the whole training run has already been paid for.
    os.makedirs(os.path.dirname(fileName), exist_ok=True)

    FT = gensim.models.FastText(
        Word2VecSentense(path),
        size=numFeatures,
        window=windowSize,
        min_count=minWordCount,
        workers=numWorkers,
        iter=epochs,
        sg=0,
        callbacks=[epochLogger],
    )
    FT.save(fileName)
    drawResult(FT, fileName)

    for probe in ('add', 'mov', 'jmp'):
        print('\n---------- {} ----------'.format(probe))
        print(FT.wv.most_similar(probe))