# Praetorian ML Challenge - explore data

Notes:
On https://p16.praetorian.com/blog/machine-learning-tutorial the link "Machine Learning Binaries" is broken

## Discussion
At first glance, this may seem like an unsupervised task, but since we can guess wrong and get the correct answer for a given challenge, we can build a supervised training set

Inspecting a few ISA's for the supported architectures, it appears there are typically hundreds of instructions, not tens of thousands, so a non-sparse matrix approach might work fine for a first principles text embedding.

In [1]:
from __future__ import division

In [2]:
!pwd

/notebooks/projects/isa-classifier


In [1]:
!pip install -r ../requirements.txt



In [4]:
#%%writefile etl.py

from __future__ import print_function
import base64
import binascii
import datetime
import json
import logging
import numpy as np
import os
import random
import requests
import time
import uuid

SUPPORTED_ARCHITECTURES = ["avr", "alphaev56", "arm", "m68k", "mips", 
                           "mipsel", "powerpc", "s390", "sh4", "sparc", "x86_64", "xtensa"]

logging.basicConfig(level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

class Server(object):
    url = 'https://mlb.praetorian.com'
    log = logging.getLogger(__name__)

    def __init__(self):
        self.session = requests.session()
        # in a sample of a few dozen, self.binary is either 32 or 36 bytes so pad X to 36 byte columns
        self.binary  = None
        self.hash    = None
        self.wins    = 0
        self.targets = []
        self.rate_limit_count = 0
        self.unknown_server_exception_count = 0
        self.retry_wait = 10
        self.count = 0
        self.failure_record = []
        self.start_time = datetime.datetime.now()
        self.response = None

    def _request(self, route, method='get', data=None):
        while True:
            if self.count > 597:
                print('Resetting session at count {}'.format(self.count))
                self.session = requests.session()
            try:
                if method == 'get':
                    r = self.session.get(self.url + route)
                else:
                    r = self.session.post(self.url + route, data=data)
                self.status_code = r.status_code
                if r.status_code == 429:
                    self.rate_limit_count += 1
                    self.failure_record.append({'type': 'rate_limit',
                                                'count': self.count,
                                                'time': (datetime.datetime.now() -
                                                         self.start_time).total_seconds()})
                    raise Exception('Rate Limit Exception')
                if r.status_code == 500:
                    self.unknown_server_exception_count += 1
                    self.failure_record.append({'type': 'unknown_server_exception',
                                                'count': self.count,
                                                'time': (datetime.datetime.now() -
                                                         self.start_time).total_seconds()})
                    raise Exception('Unknown Server Exception')
                self.response = r
                return r.json()
            except Exception as e:
                self.log.error(e)
                self.log.info('Waiting 60 seconds before next request')
                time.sleep(60)
                self.status_code = None

    def get(self):
        r = self._request("/challenge")
        self.targets = r.get('target', [])
        # removed base64.base64decode(r.get('binary', '')) to allow writes to disk without re-encoding
        self.binary  = r.get('binary', '')
        return r

    def post(self, target):
        r = self._request("/solve", method="post", data={"target": target})
        self.wins = r.get('correct', 0)
        self.hash = r.get('hash', self.hash)
        self.ans  = r.get('target', 'unknown')
        return r
    
    def get_data(self, number=10000):
        """
        Retrieves data in format
        @param number: nubmer of samples to return
        @returns {binary_data: values, targets: values, answers: values}
        where binary_data = [ "<base64 encoded string>", ... ]
        targets =  [ "avr", "x86_64", ... ]
        answers = [one item from targets,]
        """
        data = {}
        # If we grab a large data set make it a little more efficient by pre-allocating
        data['binary_data'] = [None]*number
        data['targets'] = [None]*number
        data['answers'] = [None]*number
        for i in range(number):
            count = 0
            self.get()
            while((self.status_code != 200) and (count < MAX_RETRIES) ):
                print('Status code != 200. Retrying')
                count = count + 1
                self.get()
            data['binary_data'][i] = self.binary
            data['targets'][i] = self.targets
            self.post(self.targets[0])
            while((self.status_code != 200) and (count < MAX_RETRIES) ):
                print('post status code {}'.format(self.status_code))
                count = count + 1
                self.post(self.targets[0])
            data['answers'][i] = self.ans
        return data
    
    def get_data_sets(self, num_train=1024, num_test=128, num_dev=128):
        """
        @returns {'train': {'binary_data': [num_train values],
                            'targets': [num_train [6 values]],
                            'answers': [num_train values],
                  'dev': format as with 'train',
                  'test': format as with 'train'
        }
        @param name_prefix: Write data to a file of this name. If None, do not write
        """
        data = {}
        data['train'] = self.get_data(num_train)
        data['dev'] = self.get_data(num_dev)
        data['test'] = self.get_data(num_test)
        return data
        
        
def submit(number):
    s = Server()

    for _ in range(number):
        # query the /challenge endpoint
        s.get()

        # choose a random target and /solve
        target = random.choice(s.targets)
        s.post(target)

        s.log.info("Guess:[{: >9}]   Answer:[{: >9}]   Wins:[{: >3}]".format(target, s.ans, s.wins))

        # 500 consecutive correct answers are required to win
        # very very unlikely with current code
        if s.hash:
            s.log.info("You win! {}".format(s.hash))    




        
def store(data, root_dir=None):
    """
    Store data to root_dir in a generated filename
    """
    # extract some info from data to generate friendly part of filename
    num_train = str(len(data['train']['answers']))
    num_dev = str(len(data['dev']['answers']))
    num_test = str(len(data['test']['answers']))
    
    if not root_dir:
        root_dir = os.getcwd()
    else:
        root_dir = os.path.realpath(root_dir)
    file_name = "_".join([num_train, num_dev, num_test, str(uuid.uuid1())[0:8]]) + ".json"
    file_path = os.path.join(root_dir, file_name)
    
    with open(file_path, 'w+') as f:
        json.dump(data, f)
            
def load(path):
    """
    load data from a single file.  Data must have format specified in get_data_sets        
    """
    path = os.path.realpath(path)
    with open(path, 'r') as f:
        data = json.load(f)
    return data

def load_dir(path):
    """
    load data from an entire directory.  Data must have format specified in get_data_sets
    """
    merged = {'train': { 'answers': [],
                         'targets': [],
                         'binary_data': [] },
              'dev': { 'answers': [],
                         'targets': [],
                         'binary_data': [] },
              'test': { 'answers': [],
                         'targets': [],
                         'binary_data': [] }
             }
    if os.path.isdir(path):
        for root, dirs, filenames in os.walk(path, topdown=True):
            for filename in filenames:
                if filename.endswith('.json'):
                    merged = merge_data(merged, load(os.path.join(root, filename)))
    else:
        merged = load(os.path.abspath(path)
)
    return merged

def merge_data(dict1, dict2):
    """
    Perform list additions for two identically formatted dicts with lists at depth 2
    """
    #_dict1 = deepcopy(dict1)
    for key in dict2:
        assert(key in dict1)
        for key2 in dict2[key]:
            assert(key2 in dict2[key])
            dict1[key][key2] = dict1[key][key2] + dict2[key][key2]
    return dict1

def hex_data(base64_binary_data, stride=1, expected_len=None):
    """
    Take an list of base64 encoded data and convert to lists of hex characters
    @params base64_binary_data: a base64 encoded string of (presumeably) 32 or 36 hex numbers
                                shape = (m, 1)
    @params stride: number of bytes to consider a word
    @params expected_len: max length of hex array, used if we need to pad shorter arrays
    @returns: np.array([map(hexlify,base64string),])
              shape = (m, len(base64.b64decode(base64_binary_data))/stride)
    """
    # Should check that we aren't discarding https://docs.python.org/2/library/base64.html
    # Characters that are neither in the normal base-64 alphabet nor the
    # alternative alphabet are discarded prior to the padding check.
    misfits = []
    hex_X = []

    for data in base64_binary_data:
        byte_strings = []
        data = base64.b64decode(data)
        for i in range(0, len(data) , stride):
            byte_strings.append(data[i:i+stride])
        # probably not needed for most text_embeddings, but since most seem 32 or 36 it might help
        if expected_len:
            delta = len(byte_strings) - expected_len
            div = delta/stride
            remainder = delta % stride
            if remainder or (delta < 0):
                misfits.append(byte_strings)
            else:
                byte_strings.extend(['\x00'*stride]*div)
        hex_X.append([ binascii.hexlify(e) for e in byte_strings ])

    return np.array(hex_X), misfits

def class_to_ones_hot(answers, targets, supported_architectures):
    Y = []
    allowed_Y = []
    for answer, target in zip(answers, targets):
        y = [0]*len(supported_architectures)
        index = supported_architectures.index(answer)
        y[index] = 1
        Y.append(y)
        target_hot = [0]*len(supported_architectures)
        for j, arch in enumerate(target):
            index = supported_architectures.index(arch)
            target_hot[index] = 1
        allowed_Y.append(target_hot)
    return np.array(Y), np.array(allowed_Y) 

        

## Fetch data on first run of this notebook
Fetch some data in the format supplied by etl.py (train, dev, test sets)

In [7]:
### Only run this cell the first time
%cd ../
!wget https://s3.us-east-2.amazonaws.com/isa-classifier/isa_classifier_data.tar.gz
!tar xvf isa_classifier_data.tar.gz


In [None]:
!ls ../ml_challenge

### Get and store new data
We will store the data in as close to the original form as possible.
We will do a small sample walkthrough of loading data and transforming it here as far as 
binary_data = [ [ 32 or 36 hex numbers ], ...].
We also transform the ISA class into one-hot vectors.
The word embedding transformations will be done in text_embeddings/I-TFID.ipynb, word2vec.ipynb 

In [286]:
s = Server()
data_set1 = s.get_data_sets(num_train=2048, num_test=256, num_dev=256)
store(data_set1, '../ml_challenge/')

In [9]:
print('binary_data', data_set1['train']['binary_data'][0:4])

binary_data [u'AAAAIk8Q0AtACQAP0RFiLWIP0SIhBuIiIRSSIiEN0iJiIiEI4iIhC9IiYiIhC9IiYiIhCtQA5QrQC0AA5gDgJg==', u'D5/CD7bSjQwSAdEBySHPifoxwiEVAAAAAEiLFQAAAADHBgAAAABIixKLEoXSD4VqAQAAxwUAAAAADQAAAI0UAA==', u'AAAAPAIAAIxDABQ8AgAArEMAABAAAAgAABAhPAMAACRjAAAAgxghhGQAADwDAACsZAAAJEIAAShDAAMUYP/3AA==', u'II3l/v//6/8QAOIEAJ3l/v//6xAgneVhHILjlBCB4wEAEOEHAKABAAAACgEAoOMIMJ3lAhCg4QAAg+H/AADi/g==']


In [3]:
!pwd

/notebooks/projects/isa-classifier/sklearn


In [2]:
!ls ../ml_challenge/


1024_128_128_eba414fa.json  2048_256_256_cb0707f0.json
2048_256_256_0b966df8.json  2048_256_256_dd02089e.json
2048_256_256_0baba5f4.json  2048_256_256_dd67c64e.json
2048_256_256_2847388a.json  4096_512_512_6f9ef7d8.json
2048_256_256_666e0b7e.json  512_128_128_76fe753c.json
2048_256_256_8f6606b2.json  isa_classifier_data.tar.gz


In [2]:
s2 = Server()
data_set = load_dir('../ml_challenge/2048_256_256_dd67c64e.json')

In [3]:
len(data_set['train']['binary_data'])

2048

In [4]:
orig_X_train, orig_Y_train, orig_train_targets = data_set['train']['binary_data'], data_set['train']['answers'], data_set['train']['targets']
orig_X_dev, orig_Y_dev, orig_dev_targets = data_set['dev']['binary_data'], data_set['dev']['answers'], data_set['dev']['targets']
orig_X_test, orig_Y_test, orig_test_targets = data_set['test']['binary_data'], data_set['test']['answers'], data_set['test']['targets']

print('orig_X_train[0:4]', '\n', orig_X_train[0:4])
print('orig_Y_train[0:4]', '\n', orig_Y_train[0:4])
print('orig_train_targets[0:4]', '\n', orig_train_targets[0:4])


orig_X_train[0:4] 
 [u'AADtjAAywX8ADD0gAADACQAA7AsAMuwMACj9gAIQPSAAAMAJAAD/jAAAQJ0ADMAfAAhIAAAQwZ8ACMAfAAzsDA==', u'AAb/////AAAAAAADbGx4CgAgAJgLAKkRwCAAmBuQkPWgmSDAIACZCMAgAJgrwCAAmQjBAADRAAChAACBAADgCA==', u'jIkAGI1LAACNaQAAAShIJK1pAACMSQAgJSkAARAA//GsSQAgjUIAAK+iABiMwgAIAAAwIYxCAACsogAMPAIAAA==', u'WDAwAFAwEACnSAAAWDDR4lBAEABYQNHeUEAQAFhAMABQQBAAWEAwBFBAEABYQNHaWDAwCFAwEABYMEAAUDAQAA==']
orig_Y_train[0:4] 
 [u'powerpc', u'xtensa', u'mips', u's390']
orig_train_targets[0:4] 
 [[u'alphaev56', u'arm', u'powerpc', u's390', u'sh4', u'xtensa'], [u'alphaev56', u'avr', u'mipsel', u'powerpc', u'sh4', u'xtensa'], [u'avr', u'm68k', u'mips', u'powerpc', u'sh4', u'sparc'], [u'arm', u'avr', u'm68k', u'mips', u'mipsel', u's390']]


In [5]:
#reload(etl)
import etl

In [6]:
import binascii
a = b'00011000'
print(int(a))
b = hex(int(a))
print(b)
binascii.hexlify(a)

11000
0x2af8


'3030303131303030'

In [7]:
base64_encoded = u'GAAAABgAAAAAAAAAqAAAAABIDhBHmgJa2g4AAAAAQaAAAD0kAABetxEE/0cCAOJDAAAQIgAAQbAAAF0kAABCoA=='

In [11]:
base64.b64decode(orig_X_train[0])

'\x00H\xe7>\x00B\xb9\x00\x00\x00\x00&9\x00\x00\x00\x00\x18\x03 9\x00\x00\x00\x00 9\x00\x00\x00\x00t\x0cr\x01S\x02g0 9\x00\x00\x00\x00p\x01B\x86LF\x08\x00\n@\x00\x01\x02\x00\x00\x01\x80\x04F'

In [12]:
print([ binascii.hexlify(x) for x in base64.b64decode(orig_X_train[0])])


['00', '48', 'e7', '3e', '00', '42', 'b9', '00', '00', '00', '00', '26', '39', '00', '00', '00', '00', '18', '03', '20', '39', '00', '00', '00', '00', '20', '39', '00', '00', '00', '00', '74', '0c', '72', '01', '53', '02', '67', '30', '20', '39', '00', '00', '00', '00', '70', '01', '42', '86', '4c', '46', '08', '00', '0a', '40', '00', '01', '02', '00', '00', '01', '80', '04', '46']


In [8]:

hex_X_train, train_misfits = etl.hex_data(orig_X_train)
hex_X_dev, dev_misfits = etl.hex_data(orig_X_dev)
hex_X_test, test_misfits = etl.hex_data(orig_X_test)

print('hex_X_train[0:4]', hex_X_train[0:4], '\n')
print('train_misfits[0:4]', train_misfits[0:4], '\n')



hex_X_train[0:4] [['00' '00' 'ed' '8c' '00' '32' 'c1' '7f' '00' '0c' '3d' '20' '00' '00'
  'c0' '09' '00' '00' 'ec' '0b' '00' '32' 'ec' '0c' '00' '28' 'fd' '80'
  '02' '10' '3d' '20' '00' '00' 'c0' '09' '00' '00' 'ff' '8c' '00' '00'
  '40' '9d' '00' '0c' 'c0' '1f' '00' '08' '48' '00' '00' '10' 'c1' '9f'
  '00' '08' 'c0' '1f' '00' '0c' 'ec' '0c']
 ['00' '06' 'ff' 'ff' 'ff' 'ff' '00' '00' '00' '00' '00' '03' '6c' '6c'
  '78' '0a' '00' '20' '00' '98' '0b' '00' 'a9' '11' 'c0' '20' '00' '98'
  '1b' '90' '90' 'f5' 'a0' '99' '20' 'c0' '20' '00' '99' '08' 'c0' '20'
  '00' '98' '2b' 'c0' '20' '00' '99' '08' 'c1' '00' '00' 'd1' '00' '00'
  'a1' '00' '00' '81' '00' '00' 'e0' '08']
 ['8c' '89' '00' '18' '8d' '4b' '00' '00' '8d' '69' '00' '00' '01' '28'
  '48' '24' 'ad' '69' '00' '00' '8c' '49' '00' '20' '25' '29' '00' '01'
  '10' '00' 'ff' 'f1' 'ac' '49' '00' '20' '8d' '42' '00' '00' 'af' 'a2'
  '00' '18' '8c' 'c2' '00' '08' '00' '00' '30' '21' '8c' '42' '00' '00'
  'ac' 'a2' '00' '0c' '3c' '02' '

In [14]:
hex_X_test_stride2, test_misfits_stride_2 = etl.hex_data(orig_X_test, stride=2)

In [15]:
hex_X_test_stride2[0]

array(['9109', '0000', '9169', '0000', '810a', '0038', '9109', '0000',
       'a10a', '003d', 'a14a', '003f', '9089', '0000', '9109', '0000',
       '9149', '0000', '4cc6', '3182', '4800', '0001', '8001', '0014',
       '3860', '0000', '3821', '0010', '7c08', '03a6', '4e80', '0020'],
      dtype='|S4')

In [9]:
hexdoc_X_train = map("".join, hex_X_train)
hexdoc_X_test =  map("".join, hex_X_test)
hexdoc_X_dev = map("".join, hex_X_dev)


In [34]:
def compare_vecs(X, num_indices):
    """
    sample the representation of two vecs over first num_indices of vector
    """
    for feature_0, freq_0, feature_1, freq_1 in zip(v.inverse_transform(X)[0][0:num_indices], 
                                                    X.A[0][0:num_indices],
                                                    v.inverse_transform(X)[1][0:num_indices], 
                                                    X.A[1][0:num_indices],
                                                   ):
        print(feature_0, freq_0, "   ", feature_1, freq_1)

        

In [35]:
compare_vecs(X_cv, 50)

00 0c ec 0c 0     00 00 e0 08 3
1f 00 0c ec 0     81 00 00 e0 2
c0 1f 00 0c 0     00 81 00 00 0
08 c0 1f 00 0     00 00 81 00 0
00 08 c0 1f 0     a1 00 00 81 1
9f 00 08 c0 0     00 a1 00 00 0
c1 9f 00 08 0     00 00 a1 00 0
48 00 00 10 0     d1 00 00 a1 0
08 48 00 00 0     00 d1 00 00 0
00 08 48 00 0     00 00 d1 00 0
1f 00 08 48 0     c1 00 00 d1 0
c0 1f 00 08 0     08 c1 00 00 0
0c c0 1f 00 0     99 08 c1 00 0
00 0c c0 1f 0     00 99 08 c1 0
9d 00 0c c0 0     08 c0 20 00 0
40 9d 00 0c 0     99 08 c0 20 0
00 40 9d 00 0     00 99 08 c0 0
00 00 40 9d 0     20 00 99 08 0
8c 00 00 40 0     c0 20 00 99 0
ff 8c 00 00 0     20 c0 20 00 0
00 ff 8c 00 0     c0 20 00 98 0
00 00 ff 8c 0     11 c0 20 00 0
09 00 00 ff 0     6c 78 0a 00 0
10 3d 20 00 0     6c 6c 78 0a 0
02 10 3d 20 0     00 00 03 6c 0
80 02 10 3d 0     00 00 00 03 0
fd 80 02 10 0     00 00 00 00 0
32 ec 0c 00 0     ff 00 00 00 0
00 32 ec 0c 0     ff ff 00 00 0
0b 00 32 ec 0     ff ff ff 00 0
ec 0b 00 32 0     ff ff ff ff 0
00 ec 0b

In [18]:
bow_transform2 = v.transform(hexdoc_X_train[0:2])
print(bow_transform2)
print(bow_transform2[0:2])

  (0, 0)	26
  (0, 1)	16
  (0, 2)	10
  (0, 3)	5
  (0, 27)	1
  (0, 35)	1
  (0, 41)	1
  (0, 115)	1
  (0, 119)	1
  (0, 253)	1
  (0, 274)	1
  (0, 538)	1
  (0, 589)	1
  (0, 601)	1
  (0, 739)	1
  (0, 745)	1
  (0, 1379)	1
  (0, 1381)	1
  (0, 1408)	1
  (0, 2600)	2
  (0, 2617)	1
  (0, 2752)	1
  (0, 3983)	1
  (0, 4016)	1
  (0, 4017)	1
  :	:
  (1, 71260)	1
  (1, 71261)	1
  (1, 71334)	1
  (1, 71335)	1
  (1, 72874)	2
  (1, 72875)	2
  (1, 72876)	2
  (1, 73663)	1
  (1, 73672)	1
  (1, 73825)	1
  (1, 74036)	1
  (1, 74062)	1
  (1, 74674)	4
  (1, 74896)	3
  (1, 74897)	3
  (1, 74904)	1
  (1, 74906)	1
  (1, 74920)	1
  (1, 84968)	1
  (1, 85326)	1
  (1, 85493)	1
  (1, 85516)	1
  (1, 85785)	1
  (1, 86074)	1
  (1, 87628)	1
  (0, 0)	26
  (0, 1)	16
  (0, 2)	10
  (0, 3)	5
  (0, 27)	1
  (0, 35)	1
  (0, 41)	1
  (0, 115)	1
  (0, 119)	1
  (0, 253)	1
  (0, 274)	1
  (0, 538)	1
  (0, 589)	1
  (0, 601)	1
  (0, 739)	1
  (0, 745)	1
  (0, 1379)	1
  (0, 1381)	1
  (0, 1408)	1
  (0, 2600)	2
  (0, 2617)	1
  (0, 2752)	1
  (0, 398

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
#from IPython.core.debugger import set_trace

vec_opts = {
    "ngram_range": (1, 4),  # allow n-grams of 1-4 words in length (32-bits)
    "analyzer": "word",     # analyze hex words
    "token_pattern": "..",  # treat two characters as a word (e.g. 4b)
    "min_df": 2,          # for demo purposes, be very selective about features
    "max_df": .7
}
v = CountVectorizer(**vec_opts)
X_cv = v.fit_transform(hexdoc_X_train)

In [34]:
print(v.vocabulary_.items()[0:10])
print(len(v.vocabulary_))

[(u'42 00 20', 16814), (u'30 8d', 15044), (u'30 8c', 15039), (u'30 8a', 15038), (u'00 a7 2a', 4425), (u'08 d0 4b', 8089), (u'08 d0 4d', 8091), (u'bd 27 00 00', 26929), (u'3b 2c 4c', 15859), (u'fd 24 00 00', 31702)]
32618


In [35]:
print(len(X_cv.indices))
print(type(X_cv))

268515
<class 'scipy.sparse.csr.csr_matrix'>


In [36]:
Xd = X_cv.todense()
print(Xd.shape)


(2048, 32618)


In [44]:
print(Xd.A[1][0:280])
print(Xd.A[2][0:280])
print(len(Xd.A[1]), len(Xd.A[2]))

[3 2 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0

In [40]:
X_cv.shape

(2048, 32618)

In [42]:
print(X_cv.A[1][1])


2


In [45]:
from sklearn.feature_extraction.text import TfidfTransformer

idf_opts = {"use_idf": True}
idf = TfidfTransformer(**idf_opts)

# perform the idf transform
X_idf = idf.fit_transform(X_cv)

In [46]:
X_idf

<2048x32618 sparse matrix of type '<type 'numpy.float64'>'
	with 268515 stored elements in Compressed Sparse Row format>

In [16]:
from sklearn.naive_bayes import MultinomialNB
mnbClassifier = MultinomialNB(alpha=.01)


In [17]:
mnb_model = mnbClassifier.fit(X_idf, np.array(orig_Y_train))

In [57]:
mnb_class2 = MultinomialNB(alpha=.9)
mnb_m2 = mnb_class2.fit(X_idf, np.array(orig_Y_train))
mnb_m2

MultinomialNB(alpha=0.9, class_prior=None, fit_prior=True)

In [79]:
mnb_m2.coef_.shape

(12, 32618)

In [77]:
diff = mnb_m2.coef_.argmax(axis=0) - mnb_model.coef_.argmax(axis=0)
print(diff, len(diff))

[0 0 0 ..., 0 0 0] 32618


In [76]:
print(diff.nonzero(), len(diff.nonzero()[0]))

(array([  119,   122,   145,   348,   508,   638,   678,   706,   732,
         735,   811,   980,  1057,  1140,  1257,  1259,  1365,  1663,
        1829,  1908,  2036,  2083,  2112,  2170,  2185,  2225,  2228,
        2270,  2288,  2306,  2485,  2543,  2611,  2947,  3047,  3227,
        3382,  3447,  3580,  3648,  3679,  3881,  4532,  4840,  5220,
        5231,  5376,  5377,  5443,  5444,  5500,  5723,  5819,  6005,
        6143,  6391,  6440,  6635,  6648,  6676,  6822,  6823,  7318,
        7635,  7639,  7672,  7915,  8403,  8550,  8587,  8915,  8943,
        8993,  9445,  9547,  9599,  9985, 10065, 10411, 10412, 10432,
       10508, 10780, 10848, 11003, 11091, 11156, 11912, 12988, 13343,
       13385, 13395, 13396, 13523, 13541, 13575, 13644, 13652, 13836,
       13895, 14227, 14254, 14268, 14283, 14375, 14389, 14400, 14590,
       15026, 15070, 15196, 15650, 15653, 15687, 15688, 15840, 15882,
       15937, 15938, 15982, 16146, 16231, 16466, 16922, 17016, 17290,
       17486, 17501

In [85]:
np.array([[1,2,3], [3,4,5]]).argmax(axis=0)

array([1, 1, 1])

In [86]:
### fraction of 


array([[ 0.78816702,  0.15874927],
       [ 0.99573485,  0.67326485],
       [ 0.30202746,  0.08254132],
       [ 0.17074652,  0.23769081],
       [ 0.58169284,  0.89623799]])

In [55]:
mnb_model.coef_ - mnb_m2.coef_

array([[ -1.43609427e-01,  -1.43091363e-01,  -1.23641583e-01, ...,
          2.15780009e+00,   2.15780009e+00,  -7.42260677e-02],
       [ -1.52737588e-01,  -1.51384156e-01,   3.88255060e-05, ...,
          2.14736399e+00,   2.14736399e+00,  -1.19542527e-01],
       [ -1.20021201e-01,   2.11932672e+00,   2.11932672e+00, ...,
          2.11932672e+00,   2.11932672e+00,   2.11932672e+00],
       ..., 
       [ -1.65670445e-01,  -1.44586141e-01,  -1.45224101e-01, ...,
          2.13577858e+00,   2.13577858e+00,   2.13577858e+00],
       [ -1.46388702e-01,  -1.46248894e-01,  -1.38590454e-01, ...,
          2.15596414e+00,   2.15596414e+00,  -1.40172263e-01],
       [ -1.50320225e-01,  -1.45531084e-01,   2.14856712e+00, ...,
          2.14856712e+00,   2.14856712e+00,  -1.43953078e-01]])

In [62]:
mnb_model.coef_

array([[ -5.65005987,  -6.01492261,  -8.52966459, ..., -12.29128362,
        -12.29128362,  -9.71026516],
       [ -6.33280976,  -6.7671353 , -10.39285075, ..., -12.2267706 ,
        -12.2267706 ,  -8.98114002],
       [ -9.3873931 , -12.07434756, -12.07434756, ..., -12.07434756,
        -12.07434756, -12.07434756],
       ..., 
       [ -5.48503144,  -8.44794896,  -8.41913596, ..., -12.16042398,
        -12.16042398, -12.16042398],
       [ -4.01714529,  -4.48830593,  -7.55641829, ..., -12.2795785 ,
        -12.2795785 ,  -7.33783876],
       [ -6.73744773,  -7.56585362, -12.23396578, ..., -12.23396578,
        -12.23396578,  -7.73559688]])

In [67]:
delta = (mnb_model.coef_ - mnb_m2.coef_)*100.0 /mnb_model.coef_
print(delta)

[[ -4.51747173e+01  -4.16894052e+01  -1.80662927e+01 ...,   1.49495869e+01
    1.49495869e+01  -6.34685816e+00]
 [ -3.95257515e+01  -3.54849225e+01   2.07833167e-03 ...,   1.45365664e+01
    1.45365664e+01  -1.36503864e+01]
 [ -9.70584006e+00   1.35350022e+01   1.35350022e+01 ...,   1.35350022e+01
    1.35350022e+01   1.35350022e+01]
 ..., 
 [ -4.88267772e+01  -1.92930002e+01  -1.95923378e+01 ...,   1.41050451e+01
    1.41050451e+01   1.41050451e+01]
 [ -6.59797211e+01  -5.87544761e+01  -2.76647139e+01 ...,   1.48751313e+01
    1.48751313e+01  -2.97248897e+01]
 [ -3.56898976e+01  -2.78660683e+01   1.45829523e+01 ...,   1.45829523e+01
    1.45829523e+01  -2.62092706e+01]]


In [68]:
delta.mean()

11.194846000324119

In [18]:
mnb_model

MultinomialNB(alpha=0.01, class_prior=None, fit_prior=True)

In [19]:
raw_predictions = mnb_model.predict(X_idf)

In [20]:
print(etl.SUPPORTED_ARCHITECTURES)

['avr', 'alphaev56', 'arm', 'm68k', 'mips', 'mipsel', 'powerpc', 's390', 'sh4', 'sparc', 'x86_64', 'xtensa']


In [21]:
raw_predictions[0:10]

array([u'powerpc', u'xtensa', u'mips', u's390', u's390', u'alphaev56',
       u'alphaev56', u'x86_64', u'powerpc', u'mips'],
      dtype='<U9')

In [22]:
print(mnb_model.classes_.tolist())


[u'alphaev56', u'arm', u'avr', u'm68k', u'mips', u'mipsel', u'powerpc', u's390', u'sh4', u'sparc', u'x86_64', u'xtensa']


In [23]:

Y_train, allowed_Y_train = class_to_ones_hot(orig_Y_train, orig_train_targets, mnb_model.classes_.tolist())
Y_dev, allowed_Y_dev = class_to_ones_hot(orig_Y_dev, orig_dev_targets, mnb_model.classes_.tolist())
Y_test, allowed_Y_test = class_to_ones_hot(orig_Y_test, orig_test_targets, mnb_model.classes_.tolist())

print('Y_train[0:4]', '\n', Y_train[0:4], '\n')
print('allowed_Y_train[0:4]', '\n', allowed_Y_train[0:4], '\n')
print(hex_X_train[0])
print(map("".join, hex_X_train[0:2]))
print(map(" ".join, hex_X_train[0:2]))
print(len(map("".join, [hex_X_train[0]])[0]))


Y_train[0:4] 
 [[0 0 0 0 0 0 1 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0 0 1]
 [0 0 0 0 1 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 1 0 0 0 0]] 

allowed_Y_train[0:4] 
 [[1 1 0 0 0 0 1 1 1 0 0 1]
 [1 0 1 0 0 1 1 0 1 0 0 1]
 [0 0 1 1 1 0 1 0 1 1 0 0]
 [0 1 1 1 1 1 0 1 0 0 0 0]] 

['00' '00' 'ed' '8c' '00' '32' 'c1' '7f' '00' '0c' '3d' '20' '00' '00' 'c0'
 '09' '00' '00' 'ec' '0b' '00' '32' 'ec' '0c' '00' '28' 'fd' '80' '02' '10'
 '3d' '20' '00' '00' 'c0' '09' '00' '00' 'ff' '8c' '00' '00' '40' '9d' '00'
 '0c' 'c0' '1f' '00' '08' '48' '00' '00' '10' 'c1' '9f' '00' '08' 'c0' '1f'
 '00' '0c' 'ec' '0c']
['0000ed8c0032c17f000c3d200000c0090000ec0b0032ec0c0028fd8002103d200000c0090000ff8c0000409d000cc01f000848000010c19f0008c01f000cec0c', '0006ffffffff0000000000036c6c780a002000980b00a911c02000981b9090f5a09920c020009908c02000982bc020009908c10000d10000a10000810000e008']
['00 00 ed 8c 00 32 c1 7f 00 0c 3d 20 00 00 c0 09 00 00 ec 0b 00 32 ec 0c 00 28 fd 80 02 10 3d 20 00 00 c0 09 00 00 ff 8c 00 00 40 9d 00 0c c0 1f 00 

In [24]:
probs_train = mnb_model.predict_proba(X_idf)
print("raw probs")
print(probs_train[0:4])
print("allowed probs")
print(probs_train[0:2]*allowed_Y_train[0:2])
print("allowed")
print(allowed_Y_train[0:2])
print(np.argmax(probs_train[0:2]*allowed_Y_train[0:2], axis=1))
print(map(mnb_model.classes_.tolist().__getitem__, np.argmax(probs_train[0:10]*allowed_Y_train[0:10], axis=1)))
print("correct")
print(orig_Y_train[0:10])
print(Y_train[0:10])

raw probs
[[  3.95378015e-16   2.37632598e-15   2.39473240e-15   4.89321789e-15
    2.13349722e-15   2.55725879e-14   1.00000000e+00   5.49013143e-15
    4.80712870e-15   4.08227072e-15   1.47239899e-15   7.91660446e-14]
 [  1.72830322e-17   3.02696647e-16   8.74247472e-18   2.32475725e-15
    7.59035168e-17   1.15949590e-17   3.20994269e-15   3.71650269e-16
    1.78774921e-16   5.77322918e-17   1.49534581e-16   1.00000000e+00]
 [  2.22521023e-15   4.51549638e-15   2.47002444e-15   2.00395398e-14
    1.00000000e+00   9.55206016e-13   1.65454600e-13   2.76210222e-14
    4.78126814e-14   6.84727854e-16   8.60855130e-15   3.57770739e-14]
 [  3.26558414e-15   7.67673832e-14   1.46144655e-15   3.60330698e-14
    1.82243572e-15   1.56065544e-14   3.89855363e-16   1.00000000e+00
    1.37067205e-15   5.60524901e-16   3.39215386e-16   3.69157015e-15]]
allowed probs
[[  3.95378015e-16   2.37632598e-15   0.00000000e+00   0.00000000e+00
    0.00000000e+00   0.00000000e+00   1.00000000e+00   5.4901

In [25]:
def guess_from_target(probs, allowed_Y, supported_architectures, ones_hot=True):
    """
    Improve our chances by taking the max over the possible targets (6 instead of 12)
    probs: numerical array of shape (m, n_classes)
    targets: ones-hot array of shape (m, n_classes)
    
    returns: (m, 1) of the most likely ISA arch names after discards or
             (m, n_classes) one-hot representation of best guess
    """
    if ones_hot:
        result = np.zeros(probs.shape)
        result[np.argmax(probs*allowed_Y, axis=1)] = 1
        return 
    return map(supported_architectures.__getitem__, np.argmax(probs*allowed_Y, axis=1))
    

In [26]:
print(len(probs_train), len(allowed_Y_train))
print(probs_train[0], allowed_Y_train[0], orig_train_targets[0])

2048 2048
[  3.95378015e-16   2.37632598e-15   2.39473240e-15   4.89321789e-15
   2.13349722e-15   2.55725879e-14   1.00000000e+00   5.49013143e-15
   4.80712870e-15   4.08227072e-15   1.47239899e-15   7.91660446e-14] [1 1 0 0 0 0 1 1 1 0 0 1] [u'alphaev56', u'arm', u'powerpc', u's390', u'sh4', u'xtensa']


In [27]:
np.argmax(probs_train*allowed_Y_train, axis=1)

array([ 6, 11,  4, ...,  6,  4,  6])

In [28]:
predictions = guess_from_target(probs_train, allowed_Y_train, mnb_model.classes_.tolist(), ones_hot=False)

In [29]:
predictions[0], orig_Y_train[0]

(u'powerpc', u'powerpc')

In [30]:
import pandas as pd

In [31]:
D = pd.DataFrame(predictions)
D.describe()

Unnamed: 0,0
count,2048
unique,12
top,m68k
freq,192


In [61]:
C = pd.Series(orig_Y_train)


In [62]:
C.value_counts()

powerpc      192
m68k         191
x86_64       188
xtensa       184
sh4          182
mips         177
arm          168
alphaev56    167
sparc        151
mipsel       150
s390         150
avr          148
dtype: int64

In [63]:
# We have a pretty good distribution of classes < 10% variance in counts

In [32]:
def describe_results(predictions, orig_Y):
    wrong = []
    for i in range(len(predictions)):
        if predictions[i] != orig_Y[i]:
            wrong.append([predictions[i], orig_Y[i]])
    print('training error: {}%'.format(100.0*len(wrong)/len(predictions)))
    print('some mistakes')
    print(wrong[0:15])
    return wrong

In [33]:
wrong = describe_results(predictions,orig_Y_train)

training error: 0.048828125%
some mistakes
[[u'm68k', u'xtensa']]


In [66]:
X_cv

<2048x32618 sparse matrix of type '<type 'numpy.int64'>'
	with 268515 stored elements in Compressed Sparse Row format>

In [67]:
pdw = pd.DataFrame(wrong)

In [68]:
pdw.columns = ['incorrect', 'correct']

In [69]:
pdw.describe()

Unnamed: 0,incorrect,correct
count,1,1
unique,1,1
top,m68k,xtensa
freq,1,1


In [70]:
pdw.groupby(by=pdw.correct).describe()

Unnamed: 0_level_0,incorrect,incorrect,incorrect,incorrect
Unnamed: 0_level_1,count,unique,top,freq
correct,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
xtensa,1,1,m68k,1


In [71]:
# model is worst at predicting sh4, then arm, m68k, xtensa


In [72]:
pdw.groupby(by=pdw.incorrect).describe()

Unnamed: 0_level_0,correct,correct,correct,correct
Unnamed: 0_level_1,count,unique,top,freq
incorrect,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
m68k,1,1,xtensa,1


In [73]:
# most false positives come from x86_64 and m68k, we are slightly over-fitting to them
pdw[pdw.incorrect == 'm68k'][0:5]

Unnamed: 0,incorrect,correct
0,m68k,xtensa


In [74]:
pdw.head()

Unnamed: 0,incorrect,correct
0,m68k,xtensa


In [75]:
pdw[pdw.correct == 'sparc']

Unnamed: 0,incorrect,correct


# repeat for dev dataset

In [87]:
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('vec',   CountVectorizer(**vec_opts)),
    ('idf',  TfidfTransformer(**idf_opts))
    #('mnb_classifier',MultinomialNB())
])

X_dev_idf = pipeline.fit_transform(hexdoc_X_dev)

In [89]:
#X_cv_dev = v.transform(hex_X_dev, lowercase=False)

In [90]:
vec_opts = {
    "ngram_range": (1, 4),  # allow n-grams of 1-4 words in length (32-bits)
    "analyzer": "word",     # analyze hex words
    "token_pattern": "..",  # treat two characters as a word (e.g. 4b)
    "min_df": 3,          # 
    "vocabulary": v.vocabulary_
}
v_dev = CountVectorizer(**vec_opts)
X_cv_dev = v_dev.fit_transform(hexdoc_X_dev)

In [51]:
idf_dev = TfidfTransformer(**idf_opts)

# perform the idf transform
X_idf_dev = idf.fit_transform(X_cv_dev)

In [91]:
X_idf_dev = idf.transform(X_cv_dev)

In [276]:
X_cv_dev

<1792x456942 sparse matrix of type '<type 'numpy.int64'>'
	with 267835 stored elements in Compressed Sparse Row format>

In [92]:
X_idf_dev

<2816x197698 sparse matrix of type '<type 'numpy.float64'>'
	with 411843 stored elements in Compressed Sparse Row format>

NameError: name 'X_cv_dev' is not defined

In [93]:
probs_dev = mnb_model.predict_proba(X_idf_dev)

print(len(probs_dev), len(allowed_Y_dev))
print(probs_dev[0], allowed_Y_dev[0], orig_dev_targets[0])

2816 2816
[  6.64878586e-27   5.54827445e-27   6.76160444e-27   7.68025984e-26
   3.44199010e-25   8.20187998e-27   1.00000000e+00   1.27719061e-28
   2.06622779e-26   1.05862086e-25   6.68052332e-26   5.57266354e-23] [1 0 1 1 1 0 1 0 0 0 0 1] [u'alphaev56', u'avr', u'm68k', u'mips', u'powerpc', u'xtensa']


In [94]:
predictions_dev = guess_from_target(probs_dev, allowed_Y_dev, mnb_model.classes_.tolist(), ones_hot=False)

In [95]:
wrong = describe_results(predictions_dev, orig_Y_dev)

training error: 0.390625%
some mistakes
[[u'x86_64', u'sh4'], [u'x86_64', u'arm'], [u'm68k', u'powerpc'], [u'x86_64', u'xtensa'], [u'xtensa', u's390'], [u'x86_64', u'm68k'], [u'm68k', u'xtensa'], [u'alphaev56', u'mipsel'], [u'mips', u'm68k'], [u'sh4', u'mipsel'], [u'm68k', u'xtensa']]


# end dev

In [159]:
a = np.array([[7,2,4],[2,0,4]])
b = np.array([[1,0,1], [0,0,1]])
c = a*b
print(c)
d = np.zeros(c.shape)
print(np.argmax(c, axis=1))
print(d)
d[np.argmax(c, axis=1)] = 1
d

[[7 0 4]
 [0 0 4]]
[0 2]
[[ 0.  0.  0.]
 [ 0.  0.  0.]]


IndexError: index 2 is out of bounds for axis 0 with size 2

In [152]:
np.max(c, axis=1)

array([7, 4])

In [147]:
c[ c >= np.max(c, axis=1, keepdims=1)]

array([7, 4])

In [163]:
a = np.array([[1,2], [2, 1]])
b = np.array([[1,-1], [-1, 1]])
print(a)
print(b)
print(a*b)
print(np.multiply(a,b))

[[1 2]
 [2 1]]
[[ 1 -1]
 [-1  1]]
[[ 1 -2]
 [-2  1]]
[[ 1 -2]
 [-2  1]]


In [49]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_multilabel_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelBinarizer
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA


def plot_hyperplane(clf, min_x, max_x, linestyle, label):
    # get the separating hyperplane
    w = clf.coef_[0]
    a = -w[0] / w[1]
    xx = np.linspace(min_x - 5, max_x + 5)  # make sure the line is long enough
    yy = a * xx - (clf.intercept_[0]) / w[1]
    plt.plot(xx, yy, linestyle, label=label)


def plot_subfigure(X, Y, subplot, title, transform):
    if transform == "pca":
        X = PCA(n_components=12).fit_transform(X)
    elif transform == "cca":
        X = CCA(n_components=12).fit(X, Y_train).transform(X)
    else:
        raise ValueError

    min_x = np.min(X[:, 0])
    max_x = np.max(X[:, 0])

    min_y = np.min(X[:, 1])
    max_y = np.max(X[:, 1])

    classif = SVC(kernel='linear')
    classif.fit(X, Y_train.transpose())

    plt.subplot(2, 2, subplot)
    plt.title(title)

    zero_class = np.where(Y_train[:, 0])
    one_class = np.where(Y_train[:, 1])
    plt.scatter(X[:, 0], X[:, 1], s=40, c='gray', edgecolors=(0, 0, 0))
    plt.scatter(X[zero_class, 0], X[zero_class, 1], s=160, edgecolors='b',
                facecolors='none', linewidths=2, label='Class 1')
    plt.scatter(X[one_class, 0], X[one_class, 1], s=80, edgecolors='orange',
                facecolors='none', linewidths=2, label='Class 2')

    plot_hyperplane(classif.estimators_[0], min_x, max_x, 'k--',
                    'Boundary\nfor class 1')
    plot_hyperplane(classif.estimators_[1], min_x, max_x, 'k-.',
                    'Boundary\nfor class 2')
    plt.xticks(())
    plt.yticks(())

    plt.xlim(min_x - .5 * max_x, max_x + .5 * max_x)
    plt.ylim(min_y - .5 * max_y, max_y + .5 * max_y)
    if subplot == 2:
        plt.xlabel('First principal component')
        plt.ylabel('Second principal component')
        plt.legend(loc="upper left")


plt.figure(figsize=(8, 6))

# X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
#                                       allow_unlabeled=True,
#                                       random_state=1)
X_arr = X.toarray()

plot_subfigure(X_arr, Y_train, 1, "With unlabeled samples + CCA", "cca")
plot_subfigure(X_arr, Y, 2, "With unlabeled samples + PCA", "pca")

# X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
#                                       allow_unlabeled=False,
#                                       random_state=1)

# plot_subfigure(X, Y, 3, "Without unlabeled samples + CCA", "cca")
# plot_subfigure(X, Y, 4, "Without unlabeled samples + PCA", "pca")

plt.subplots_adjust(.04, .02, .97, .94, .09, .2)
plt.show()



ValueError: bad input shape (12, 4096)

In [46]:
Xt, Yt = make_multilabel_classification(n_classes=12, n_labels=1,
                                      allow_unlabeled=True,
                                      random_state=1)

In [48]:
Yt[0:2]

array([[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])

In [54]:
print(Xt.shape, Yt.shape)

(100, 20) (100, 12)


In [79]:
print Y.shape
Y

(100, 2)


array([[0, 1],
       [1, 0],
       [1, 0],
       [1, 1],
       [0, 1],
       [1, 1],
       [0, 1],
       [0, 1],
       [1, 1],
       [1, 1],
       [0, 1],
       [0, 1],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 1],
       [1, 1],
       [1, 0],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [0, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [0, 1],
       [0, 1],
       [1, 1],
       [0, 1],
       [1, 1],
       [0, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1, 1],
       [1, 0],
       [1, 1],
       [0, 1],
       [1, 1],
       [0, 1],
       [1, 1],
       [0, 1],
       [0, 1],
       [0, 1],
       [1, 1],
       [0, 1],
       [0, 1],
       [1, 1],
       [1, 0],
       [1, 1],
       [1, 1],
       [1, 1],
       [1, 0],
       [0, 1],
       [0, 1],
       [1, 0],
       [1,