# Praetorian ML Challenge - explore data

Notes:
On https://p16.praetorian.com/blog/machine-learning-tutorial the link "Machine Learning Binaries" is broken

## Discussion
This is a supervised learning task: even when we guess wrong, the server returns the correct answer for that challenge, so we can build a labelled training set.

Inspecting a few ISAs for the supported architectures, it appears that each typically has hundreds of instructions, not tens of thousands, so a non-sparse (dense) matrix approach might work fine for a first-principles text embedding.

In [1]:
from __future__ import division

In [1]:
# %load etl.py

from __future__ import print_function
import base64
import binascii
import datetime
import json
import logging
import numpy as np
import os
import random
import requests
import sys
import time
import uuid

SUPPORTED_ARCHITECTURES = ["avr", "alphaev56", "arm", "m68k", "mips", 
                           "mipsel", "powerpc", "s390", "sh4", "sparc", "x86_64", "xtensa"]

logging.basicConfig(level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')


class Server(object):
    """
    Client for the Praetorian machine-learning challenge API.

    Wraps a ``requests`` session and remembers the most recent challenge
    (``binary`` / ``targets``), the answer the server reported for the last
    guess (``ans``), the ``correct`` counter from the last /solve reply
    (``wins``), any collected hashes and simple failure statistics.
    """
    url = 'https://mlb.praetorian.com'
    log = logging.getLogger(__name__)
    # Upper bound on the extra attempts get_data() makes per sample when a
    # request returns with a status code other than 200.
    MAX_RETRIES = 5

    def __init__(self):
        self.session = requests.session()
        # in a sample of a few dozen, self.binary is either 32 or 36 bytes so pad X to 36 byte columns
        self.binary = None
        self.hashes = []
        self.wins = 0
        self.targets = []
        self.rate_limit_count = 0
        self.unknown_server_exception_count = 0
        # NOTE(review): retry_wait is never read; _request() sleeps a fixed 60 s.
        self.retry_wait = 10
        self.count = 0
        self.failure_record = []
        self.start_time = datetime.datetime.now()
        self.response = None
        self.email = None
        # Set by _request() / post(); initialised here so they can be read
        # safely before the first request has been made.
        self.status_code = None
        self.ans = None

    def _record_failure(self, kind):
        """Append a failure entry: its type, the retry count and seconds since start."""
        elapsed = (datetime.datetime.now() - self.start_time).total_seconds()
        self.failure_record.append({'type': kind,
                                    'count': self.count,
                                    'time': elapsed})

    def _request(self, route, method='get', data=None):
        """
        Send a GET or POST to ``self.url + route`` and return the decoded JSON.

        Loops until a request succeeds: a 429 or 500 reply, or any exception
        raised while sending or decoding, is logged and followed by a 60 second
        sleep before the next attempt. Other status codes are returned to the
        caller, which can inspect ``self.status_code``.

        @param route: path appended to ``self.url``
        @param method: 'get' for GET, anything else sends a POST
        @param data: form data for POST requests
        @returns the JSON body of the first reply that is neither 429 nor 500
        """
        while True:
            # NOTE(review): get_data() resets self.count for every sample, so
            # this threshold is only reached if a caller drives count itself.
            if self.count > 597:
                print('Resetting session at count {}'.format(self.count))
                self.session = requests.session()
            try:
                if method == 'get':
                    r = self.session.get(self.url + route)
                else:
                    r = self.session.post(self.url + route, data=data)
                self.status_code = r.status_code
                if r.status_code == 429:
                    self.rate_limit_count += 1
                    self._record_failure('rate_limit')
                    raise Exception('Rate Limit Exception')
                if r.status_code == 500:
                    self.unknown_server_exception_count += 1
                    self._record_failure('unknown_server_exception')
                    raise Exception('Unknown Server Exception')
                self.response = r
                return r.json()
            except Exception as e:
                self.log.error(e)
                self.log.info('Waiting 60 seconds before next request')
                time.sleep(60)
                self.status_code = None

    def get(self):
        """Fetch a new challenge and store its ``targets`` and raw ``binary``."""
        r = self._request("/challenge")
        self.targets = r.get('target', [])
        # removed base64.base64decode(r.get('binary', '')) to keep raw data raw
        self.binary = r.get('binary', '')
        return r

    def post(self, target):
        """
        Submit a guess for the current challenge.

        Stores the reply's ``correct`` value in ``self.wins`` and its
        ``target`` value (the answer reported by the server) in ``self.ans``.
        If the reply carries a ``hash``, it is claimed via collect_hash().

        @param target: the guessed architecture name
        @returns the decoded JSON reply
        """
        r = self._request("/solve", method="post", data={"target": target})
        self.wins = r.get('correct', 0)
        hash_value = r.get('hash', None)
        if hash_value:
            self.log.info("You win! {}".format(hash_value))
            self.collect_hash()
        self.ans = r.get('target', 'unknown')
        return r

    def collect_hash(self):
        """Claim a hash for ``self.email``, remember it and start a fresh session."""
        r = self._request("/hash", method="post", data={"email": self.email})
        hash_value = r.get('hash', None)
        print('hash_response is {}'.format(r))
        self.hashes.append(hash_value)
        # reset the session to earn more hashes
        self.session.close()
        self.session = requests.session()

    def get_data(self, number=10000, model=None, email=None):
        """
        Download ``number`` challenges, guessing each one to learn its answer.

        @param number: number of samples to return
        @param model: optional trained model; when given, its prediction
                      (restricted to the offered targets) is used as the guess,
                      otherwise the first offered target is guessed
        @param email: optional address used when claiming a hash
        @returns {binary_data: values, targets: values, answers: values}
        where binary_data = [ "<base64 encoded string>", ... ]
        targets = [ [ "avr", "x86_64", ... ], ... ]
        answers = [ one item from the matching targets entry, ... ]
        """
        if email:
            self.email = email

        data = {}
        # If we grab a large data set make it a little more efficient by pre-allocating
        data['binary_data'] = [None] * number
        data['targets'] = [None] * number
        data['answers'] = [None] * number
        for i in range(number):
            # count is the per-sample retry budget shared by the get and the post
            self.count = 0
            self.get()
            while (self.status_code != 200) and (self.count < self.MAX_RETRIES):
                print('Status code != 200. Retrying')
                self.count = self.count + 1
                self.get()
            data['binary_data'][i] = self.binary
            data['targets'][i] = self.targets
            if model:
                X = hex_data([self.binary])
                targets = class_to_ones_hot_targets([self.targets], model.classes_.tolist())
                guess_arch = guess_arch_name(model, X, targets)
            else:
                guess_arch = self.targets[0]
            self.post(guess_arch)
            while (self.status_code != 200) and (self.count < self.MAX_RETRIES):
                print('post status code {}'.format(self.status_code))
                self.count = self.count + 1
                self.post(guess_arch)
            data['answers'][i] = self.ans

        return data

    def get_data_sets(self, num_train=1024, num_test=128, num_dev=128, model=None, email=None):
        """
        Download separate train, dev and test sets (in that order).

        @param num_train: number of training samples
        @param num_test: number of test samples
        @param num_dev: number of dev samples
        @param model: If we have a trained model, make trained guesses while downloading data
        @param email: optional address used when claiming a hash

        @returns {'train': {'binary_data': [num_train values],
                            'targets': [num_train [6 values]],
                            'answers': [num_train values]},
                  'dev': format as with 'train',
                  'test': format as with 'train'
        }
        """
        data = {}
        data['train'] = self.get_data(num_train, model=model, email=email)
        data['dev'] = self.get_data(num_dev, model=model, email=email)
        data['test'] = self.get_data(num_test, model=model, email=email)
        return data
        

# We only use the server class for interacting with the api
# The methods below can be used with stored data    

        
def store(data, root_dir=None):
    """
    Store data to root_dir in a generated filename

    The name is "<num_train>_<num_dev>_<num_test>_<8 uuid chars>.json".

    @param data: dict in the format returned by get_data_sets
    @param root_dir: target directory; defaults to the current working
                     directory and is created if it does not exist yet
    @returns the full path of the file that was written
    """
    # extract some info from data to generate friendly part of filename
    split_sizes = [str(len(data[split]['answers']))
                   for split in ('train', 'dev', 'test')]

    if not root_dir:
        root_dir = os.getcwd()
    else:
        root_dir = os.path.realpath(root_dir)
    if not os.path.isdir(root_dir):
        os.makedirs(root_dir)

    # the uuid fragment keeps repeated downloads of equal size from colliding
    unique_suffix = str(uuid.uuid1())[0:8]
    file_name = "_".join(split_sizes + [unique_suffix]) + ".json"
    file_path = os.path.join(root_dir, file_name)

    with open(file_path, 'w') as f:
        json.dump(data, f)
    # the name is generated, so hand it back to the caller
    return file_path
            
def load(path):
    """
    Read one JSON file and return its parsed content.

    The file is expected to hold data in the format specified in get_data_sets.
    """
    resolved = os.path.realpath(path)
    with open(resolved, 'r') as source:
        return json.load(source)

def load_dir(path):
    """
    load data from an entire directory.  Data must have format specified in get_data_sets

    Every file ending in '.json' below ``path`` (sub-directories included) is
    loaded and merged. Directories and files are visited in sorted order so
    that the merged sample order is reproducible across runs and file systems.
    If ``path`` is not a directory it is loaded as a single file instead.

    @param path: directory to walk, or a single data file
    @returns dict with 'train', 'dev' and 'test' entries, each holding the
             'answers', 'targets' and 'binary_data' lists
    """
    if not os.path.isdir(path):
        return load(os.path.abspath(path))

    merged = {split: {'answers': [], 'targets': [], 'binary_data': []}
              for split in ('train', 'dev', 'test')}
    for root, dirs, filenames in os.walk(path, topdown=True):
        # with topdown=True, sorting dirs in place fixes the traversal order
        dirs.sort()
        for filename in sorted(filenames):
            if filename.endswith('.json'):
                merged = merge_data(merged, load(os.path.join(root, filename)))
    return merged

def merge_data(dict1, dict2):
    """
    Perform list additions for two identically formatted dicts with lists at depth 2

    dict1 is updated in place (each inner list is replaced by a new,
    concatenated list) and returned; dict2 and the original list objects are
    left untouched.

    @param dict1: accumulator, e.g. {'train': {'answers': [...], ...}, ...}
    @param dict2: dict with the same keys whose lists are appended to dict1's
    @returns dict1
    @raises KeyError: if dict2 has an outer or inner key that dict1 lacks
    """
    for key, inner in dict2.items():
        # explicit checks instead of asserts: they survive `python -O`, and the
        # inner check now really compares against dict1 rather than dict2 itself
        if key not in dict1:
            raise KeyError("merge_data: key {!r} is missing from dict1".format(key))
        for key2, values in inner.items():
            if key2 not in dict1[key]:
                raise KeyError("merge_data: key {!r} is missing from dict1[{!r}]".format(key2, key))
            dict1[key][key2] = dict1[key][key2] + values
    return dict1

def hex_data(base64_binary_data, stride=1, expected_len=None):
    """
    Decode a list of base64 strings and convert each one to hex text.

    @params base64_binary_data: iterable of base64 encoded strings, one per sample
                                (per the original notes, presumably 32 or 36 bytes each)
    @params stride: number of bytes to consider a word (Python 2 branch only)
    @params expected_len: expected number of words per sample, used for the
                          misfit check / padding (Python 2 branch only)
    @returns: np.array built from one entry per sample.
              On Python 3 each entry is a single lowercase hex string of the
              whole decoded sample; stride and expected_len are ignored.
              On Python 2 each entry is a list of hexlified stride-sized chunks.
    """
    # Should check that we aren't discarding https://docs.python.org/2/library/base64.html
    # Characters that are neither in the normal base-64 alphabet nor the
    # alternative alphabet are discarded prior to the padding check.
    # Only python2 version does checking and uses stride, expected_len as it was found
    # that data was uniformly 0 misfits.
    misfits = []  # NOTE(review): never read or written below
    hex_X = []

    for data in base64_binary_data:
        data = base64.b64decode(data)

        if sys.version_info > (3,0):
            # Python 3: one hex string per sample, e.g. b'\x00\xed' -> '00ed'
            hex_X.append(binascii.hexlify(data).decode('utf-8'))
        else:
            # Python 2: split the decoded bytes into stride-sized words first
            byte_strings = []
            for i in range(0, len(data) , stride):
                byte_strings.append(data[i:i+stride])
            # probably not needed for most text_embeddings, but since most seem 32 or 36 it might help
            if expected_len:
                delta = len(byte_strings) - expected_len
                div = delta/stride
                remainder = delta % stride
                # NOTE(review): samples shorter than expected_len (delta < 0) are
                # only reported, while samples that are already long enough get
                # `div` zero words appended - this looks inverted; confirm intent.
                if remainder or (delta < 0):
                    print('misfit data', byte_strings)
                else:
                    byte_strings.extend(['\x00'*stride]*div)
            hex_X.append([ binascii.hexlify(e) for e in byte_strings ])

    return np.array(hex_X)

def class_to_ones_hot_answers(answers, classes):
    """
    Take a list of answers as strings and convert to array of ones hot

    @param answers: m class names (a flat list, or an (m, 1) array)
    @param classes: list of all class names; the position of a name is its column
    @returns: (m, len(classes)) array of 0s and 1s with exactly one 1 per row;
              shape (0, len(classes)) for empty input
    @raises ValueError: if an answer is not in classes
    """
    # flatten first so that a column vector of names indexes row by row
    names = np.asarray(answers).ravel().tolist()
    # the explicit dtype keeps the index array usable when there are no answers
    answer_indices = np.array([classes.index(name) for name in names], dtype=int)
    hots = np.zeros(shape=(len(names), len(classes)))
    hots[np.arange(len(names)), answer_indices] = 1
    return hots

def class_to_ones_hot_targets(targets, classes):
    """
    Take a list of target lists as strings and convert to array of ones hot

    @param targets: m rows, each a list of the class names offered for that sample
                    (rows may differ in length)
    @param classes: list of all class names; the position of a name is its column
    @returns: (m, len(classes)) array of 0s and 1s with a 1 for every offered
              class; shape (0, len(classes)) for empty input
    @raises ValueError: if a target is not in classes
    """
    hots = np.zeros(shape=(len(targets), len(classes)))
    # fill row by row so that empty and ragged inputs need no rectangular array
    for row, offered in enumerate(targets):
        columns = [classes.index(name) for name in offered]
        if columns:
            hots[row, columns] = 1
    return hots

def class_to_ones_hot(answers, targets, classes):
    """
    Build the ones-hot encodings of both the answers and the targets.

    param answers: length m list of strings
    param targets: length m list of [6 strings]
    param classes: list of all class names
    @returns tuple (answers, targets), each an (m, len(classes)) ones-hot array
    """
    answers_hot = class_to_ones_hot_answers(answers, classes)
    targets_hot = class_to_ones_hot_targets(targets, classes)
    return answers_hot, targets_hot


def guess_arch_name(model, X, allowed_Y, use_targets=True):
    """
    Get the arch name prediction from the model's class probabilities

    model: trained model exposing predict_proba(X) and classes_
    X: m samples in the form the model was trained on
    allowed_Y: ones-hot array of shape (m, n_classes) marking the classes
               offered for each sample, ignored if use_targets = False
    use_targets: if True, improve our chances by taking the max over the
                 offered targets only (6 instead of 12)

    returns: array of length m with the most likely ISA arch name per sample
    """
    probs = np.asarray(model.predict_proba(X))
    if use_targets:
        # Mask with -inf rather than multiplying by 0: when every offered class
        # has probability 0 (underflow), a product of zeros would let argmax
        # fall back to column 0 even if that class was not offered.
        allowed = np.asarray(allowed_Y) > 0
        probs = np.where(allowed, probs, -np.inf)
    best = np.argmax(probs, axis=1)
    return np.asarray(model.classes_)[best]
    

Overwriting etl.py


In [2]:
!pwd

/home/jovyan/isa-classifier/sklearn


In [2]:
!pip install -r ../requirements.txt



### Fetch data while guessing with the model trained below

In [3]:
# handy when developing etl in this notebook:
# re-running this cell picks up edits to etl.py without restarting the kernel
from importlib import reload
if 'etl' in globals():
    # etl was imported earlier in this session, so refresh it
    reload(etl)
else:
    # first run: a plain import is enough
    import etl

In [None]:
# Download fresh train/dev/test sets, guessing with the trained model while
# downloading, and store them next to the previously collected files.
s = etl.Server()
data_set_new = s.get_data_sets(num_train=2048,
                               num_test=256,
                               num_dev=256,
                               model=mnb_model,
                               email='kesten.broughton@gmail.com')
etl.store(data_set_new, '../ml_challenge/')

# If this fails because mnb_model is not defined, you must run the training below first

### Fetch previously collected data on first run of this notebook
Fetch some data in the format supplied by etl.py (train, dev, test sets)

In [7]:
### Only run this cell the first time
%cd ../
!wget https://s3.us-east-2.amazonaws.com/isa-classifier/isa_classifier_data.tar.gz
!tar xvf isa_classifier_data.tar.gz


In [3]:
!pwd

/notebooks/projects/isa-classifier/sklearn


In [1]:
!ls ../ml_challenge




1024_128_128_eba414fa.json  2048_256_256_9e7a4848.json
2048_256_256_075b60e8.json  2048_256_256_b951fec6.json
2048_256_256_0b966df8.json  2048_256_256_cb0707f0.json
2048_256_256_0baba5f4.json  2048_256_256_dd02089e.json
2048_256_256_2847388a.json  2048_256_256_dd67c64e.json
2048_256_256_666e0b7e.json  4096_512_512_6f9ef7d8.json
2048_256_256_8f6606b2.json  4096_512_512_94537036.json
2048_256_256_98813c7c.json  512_128_128_76fe753c.json


### Load data for training

In [4]:
import etl

# For more accurate results, load every stored file instead of a single one
# (the original note says this needs a > 10 GB server, presumably of memory):
# data_set = etl.load_dir('../ml_challenge/')
data_set = etl.load('../ml_challenge/2048_256_256_dd67c64e.json')

In [5]:
len(data_set['train']['binary_data'])

2048

In [6]:
orig_X_train, orig_Y_train, orig_train_targets = data_set['train']['binary_data'], data_set['train']['answers'], data_set['train']['targets']
orig_X_dev, orig_Y_dev, orig_dev_targets = data_set['dev']['binary_data'], data_set['dev']['answers'], data_set['dev']['targets']
orig_X_test, orig_Y_test, orig_test_targets = data_set['test']['binary_data'], data_set['test']['answers'], data_set['test']['targets']

print('orig_X_train[0:4]', '\n', orig_X_train[0:4])
print('orig_Y_train[0:4]', '\n', orig_Y_train[0:4])
print('orig_train_targets[0:4]', '\n', orig_train_targets[0:4])


orig_X_train[0:4] 
 ['AADtjAAywX8ADD0gAADACQAA7AsAMuwMACj9gAIQPSAAAMAJAAD/jAAAQJ0ADMAfAAhIAAAQwZ8ACMAfAAzsDA==', 'AAb/////AAAAAAADbGx4CgAgAJgLAKkRwCAAmBuQkPWgmSDAIACZCMAgAJgrwCAAmQjBAADRAAChAACBAADgCA==', 'jIkAGI1LAACNaQAAAShIJK1pAACMSQAgJSkAARAA//GsSQAgjUIAAK+iABiMwgAIAAAwIYxCAACsogAMPAIAAA==', 'WDAwAFAwEACnSAAAWDDR4lBAEABYQNHeUEAQAFhAMABQQBAAWEAwBFBAEABYQNHaWDAwCFAwEABYMEAAUDAQAA==']
orig_Y_train[0:4] 
 ['powerpc', 'xtensa', 'mips', 's390']
orig_train_targets[0:4] 
 [['alphaev56', 'arm', 'powerpc', 's390', 'sh4', 'xtensa'], ['alphaev56', 'avr', 'mipsel', 'powerpc', 'sh4', 'xtensa'], ['avr', 'm68k', 'mips', 'powerpc', 'sh4', 'sparc'], ['arm', 'avr', 'm68k', 'mips', 'mipsel', 's390']]


In [7]:
hex_X_train = etl.hex_data(orig_X_train)
hex_X_dev = etl.hex_data(orig_X_dev)
hex_X_test = etl.hex_data(orig_X_test)

print('hex_X_train[0:4]', hex_X_train[0:4], '\n')

hex_X_train[0:4] [ '0000ed8c0032c17f000c3d200000c0090000ec0b0032ec0c0028fd8002103d200000c0090000ff8c0000409d000cc01f000848000010c19f0008c01f000cec0c'
 '0006ffffffff0000000000036c6c780a002000980b00a911c02000981b9090f5a09920c020009908c02000982bc020009908c10000d10000a10000810000e008'
 '8c8900188d4b00008d69000001284824ad6900008c490020252900011000fff1ac4900208d420000afa200188cc20008000030218c420000aca2000c3c020000'
 '5830300050301000a74800005830d1e2504010005840d1de50401000584030005040100058403004504010005840d1da58303008503010005830400050301000'] 



In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Each sample is one hex string; every pair of hex characters (one byte) is
# treated as a "word" and n-grams of those words become the tf-idf features.
idf_opts = {
    "ngram_range": (1, 6),  # allow n-grams of 1-6 words in length (1-6 bytes, i.e. up to 48 bits)
    "analyzer": "word",     # analyze hex words
    "token_pattern": "..",  # treat two characters as a word (e.g. 4b)
    "min_df": 2,          # drop n-grams that occur in fewer than 2 samples
    "max_df": .7          # drop n-grams that occur in more than 70% of the samples
}

# tf-idf features feed a multinomial naive Bayes classifier
pipeline = Pipeline(steps=[
    ('idf',  TfidfVectorizer(**idf_opts)),
    ('mnb_classifier', MultinomialNB(alpha=1e-4))
])

# fit on the hex strings with the architecture names as labels
mnb_model = pipeline.fit(hex_X_train, orig_Y_train)


### express correct arch and targets in ones-hot

In [9]:
Y_train, allowed_Y_train = etl.class_to_ones_hot(orig_Y_train, orig_train_targets, mnb_model.classes_.tolist())
Y_dev, allowed_Y_dev = etl.class_to_ones_hot(orig_Y_dev, orig_dev_targets, mnb_model.classes_.tolist())
Y_test, allowed_Y_test = etl.class_to_ones_hot(orig_Y_test, orig_test_targets, mnb_model.classes_.tolist())

print('Y_train[0:4]', '\n', Y_train[0:4], '\n')
print('allowed_Y_train[0:4]', '\n', allowed_Y_train[0:4], '\n')

Y_train[0:4] 
 [[ 0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.]
 [ 0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.]] 

allowed_Y_train[0:4] 
 [[ 1.  1.  0.  0.  0.  0.  1.  1.  1.  0.  0.  1.]
 [ 1.  0.  1.  0.  0.  1.  1.  0.  1.  0.  0.  1.]
 [ 0.  0.  1.  1.  1.  0.  1.  0.  1.  1.  0.  0.]
 [ 0.  1.  1.  1.  1.  1.  0.  1.  0.  0.  0.  0.]] 



In [10]:
mnb_model.named_steps

{'idf': TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
         dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
         lowercase=True, max_df=0.7, max_features=None, min_df=2,
         ngram_range=(1, 6), norm='l2', preprocessor=None, smooth_idf=True,
         stop_words=None, strip_accents=None, sublinear_tf=False,
         token_pattern='..', tokenizer=None, use_idf=True, vocabulary=None),
 'mnb_classifier': MultinomialNB(alpha=0.0001, class_prior=None, fit_prior=True)}

In [11]:
print(list(mnb_model.named_steps['idf'].vocabulary_.items())[0:10])
print(len(mnb_model.named_steps['idf'].vocabulary_))

[('ed', 52205), ('8c', 40562), ('32', 27544), ('c1', 47209), ('7f', 37009), ('0c', 16876), ('3d', 28921), ('20', 21609), ('c0', 46629), ('09', 16144)]
55389


In [12]:
print(mnb_model.classes_.tolist())

['alphaev56', 'arm', 'avr', 'm68k', 'mips', 'mipsel', 'powerpc', 's390', 'sh4', 'sparc', 'x86_64', 'xtensa']


In [13]:
import numpy as np
# Inspect how masking the class probabilities with the allowed targets works.
probs_train = mnb_model.predict_proba(hex_X_train)
print("raw probs")
print(probs_train[0:2])
# multiplying by the ones-hot mask zeroes the classes that were not offered
print("allowed probs")
print(probs_train[0:2]*allowed_Y_train[0:2])
print("allowed")
print(allowed_Y_train[0:2])
# index of the most probable allowed class for the first two samples
print(np.argmax(probs_train[0:2]*allowed_Y_train[0:2], axis=1))
# the same for the first ten samples, mapped back to class names
print(list(map(mnb_model.classes_.tolist().__getitem__, np.argmax(probs_train[0:10]*allowed_Y_train[0:10], axis=1))))
print("correct")
print(orig_Y_train[0:10])
print(Y_train[0:10])

raw probs
[[  3.47346486e-46   1.85174299e-44   3.84898312e-45   1.18398949e-44
    8.79953096e-45   7.36921467e-43   1.00000000e+00   3.48942071e-44
    3.42281558e-44   2.10079016e-44   2.90184092e-45   1.10689350e-42]
 [  1.22862989e-38   1.43118953e-36   5.02068075e-42   8.39340188e-35
    1.11558564e-38   5.36644684e-41   6.53898977e-35   8.10018136e-38
    5.58671351e-37   1.69716228e-40   3.25690908e-38   1.00000000e+00]]
allowed probs
[[  3.47346486e-46   1.85174299e-44   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   1.00000000e+00   3.48942071e-44
    3.42281558e-44   0.00000000e+00   0.00000000e+00   1.10689350e-42]
 [  1.22862989e-38   0.00000000e+00   5.02068075e-42   0.00000000e+00
    0.00000000e+00   5.36644684e-41   6.53898977e-35   0.00000000e+00
    5.58671351e-37   0.00000000e+00   0.00000000e+00   1.00000000e+00]]
allowed
[[ 1.  1.  0.  0.  0.  0.  1.  1.  1.  0.  0.  1.]
 [ 1.  0.  1.  0.  0.  1.  1.  0.  1.  0.  0.  1.]]
[ 6 11]
['powerpc',

In [14]:
guess_train = etl.guess_arch_name(mnb_model, hex_X_train, allowed_Y_train, use_targets=True)

In [15]:
guess_train[0], orig_Y_train[0]

('powerpc', 'powerpc')

### Error analysis

In [18]:
import pandas as pd

#### Get the class distributions

In [22]:
D = pd.DataFrame(guess_train)
D.describe()

Unnamed: 0,0
count,2048
unique,12
top,m68k
freq,192


In [23]:
C = pd.Series(orig_Y_train)


In [24]:
C.value_counts()

powerpc      192
m68k         191
x86_64       188
xtensa       184
sh4          182
mips         177
arm          168
alphaev56    167
sparc        151
s390         150
mipsel       150
avr          148
dtype: int64

In [25]:
# We have a pretty good distribution of classes (< 10% variance in counts): they range from 148 for avr to 192 for powerpc

In [26]:
def describe_results(predictions, orig_Y, label='training'):
    """
    Print the error rate and the first mistakes, and return all mistakes.

    predictions: sequence of predicted class names
    orig_Y: sequence of the correct class names, same length as predictions
    label: name of the data set used in the printed summary; defaults to
           'training', pass e.g. 'dev' when describing another split

    returns: list of [predicted, correct] pairs for every wrong prediction
    """
    wrong = [[guess, orig_Y[i]]
             for i, guess in enumerate(predictions)
             if guess != orig_Y[i]]
    total = len(predictions)
    # an empty prediction set has no errors; avoid dividing by zero
    error_pct = 100.0 * len(wrong) / total if total else 0.0
    print('{} error: {}%'.format(label, error_pct))
    print('some mistakes')
    print(wrong[0:15])
    return wrong

In [27]:
wrong = describe_results(guess_train, orig_Y_train)

training error: 0.048828125%
some mistakes
[['m68k', 'xtensa']]


In [28]:
pdw = pd.DataFrame(wrong)
pdw.columns = ['incorrect', 'correct']
pdw.describe()

Unnamed: 0,incorrect,correct
count,1,1
unique,1,1
top,m68k,xtensa
freq,1,1


In [29]:
pdw.groupby(by=pdw.correct).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,incorrect
correct,Unnamed: 1_level_1,Unnamed: 2_level_1
xtensa,count,1
xtensa,unique,1
xtensa,top,m68k
xtensa,freq,1


In [30]:
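# model is worst at predicting powerpc, then sh4 and sparc
# (presumably noted on a different run: in the output shown here the only training miss is an xtensa sample predicted as m68k)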


In [31]:
pdw.groupby(by=pdw.incorrect).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,correct
incorrect,Unnamed: 1_level_1,Unnamed: 2_level_1
m68k,count,1
m68k,unique,1
m68k,top,xtensa
m68k,freq,1


In [32]:
# most false positives come from x86_64 and m68k; we are slightly over-fitting to them
# (presumably noted on a different run: the output below contains a single m68k false positive)
pdw[pdw.incorrect == 'm68k'][0:5]

Unnamed: 0,incorrect,correct
0,m68k,xtensa


In [33]:
pdw.head()

Unnamed: 0,incorrect,correct
0,m68k,xtensa


In [34]:
pdw[pdw.correct == 'sparc']

Unnamed: 0,incorrect,correct


# repeat for dev dataset

In [35]:
probs_dev = mnb_model.predict_proba(hex_X_dev)

print(len(probs_dev), len(allowed_Y_dev))
print(probs_dev[0], allowed_Y_dev[0], orig_dev_targets[0])

256 256
[  2.06932784e-13   3.90070797e-15   6.27651073e-17   3.38994007e-11
   2.58364985e-13   1.22955623e-15   1.00000000e+00   1.05945217e-15
   2.33171986e-12   2.71414387e-18   1.29291917e-13   1.32116605e-14] [ 1.  0.  0.  1.  0.  0.  1.  1.  1.  0.  1.  0.] ['alphaev56', 'm68k', 'powerpc', 's390', 'sh4', 'x86_64']


In [36]:
predictions_dev = etl.guess_arch_name(mnb_model, hex_X_dev, allowed_Y_dev, use_targets=True)

In [37]:
wrong_dev = describe_results(predictions_dev, orig_Y_dev)

training error: 0.390625%
some mistakes
[['m68k', 'xtensa']]


In [38]:
mnb_model.score(hex_X_dev, orig_Y_dev)

0.99609375

In [39]:
pdw = pd.DataFrame(wrong_dev)
pdw.columns = ['incorrect', 'correct']
pdw.describe()

Unnamed: 0,incorrect,correct
count,1,1
unique,1,1
top,m68k,xtensa
freq,1,1


In [40]:
pdw.groupby(by=pdw.correct).describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,incorrect
correct,Unnamed: 1_level_1,Unnamed: 2_level_1
xtensa,count,1
xtensa,unique,1
xtensa,top,m68k
xtensa,freq,1


### Analysis
With the complete training set (~24k samples) we see a 0.01% error rate on train and 0.3% on dev.
The gap between train and dev shows we are substantially over-fitting, but the accuracy is sufficient to easily get 500 correct guesses in a row.<br>

*TODO*: reduce overfitting in the model


# end dev