# Praetorian ML Challenge - explore data

### This is the same as praet_ml_challenge notebook with a little more rough work included

Notes:
On https://p16.praetorian.com/blog/machine-learning-tutorial the link "Machine Learning Binaries" is broken

## Discussion
At first glance this may seem like an unsupervised task, but the server returns the correct answer for each challenge even when our guess is wrong, so we can build a labelled (supervised) training set.

Inspecting a few ISAs for the supported architectures, it appears there are typically hundreds of instructions, not tens of thousands, so a dense (non-sparse) matrix approach might work fine for a first-principles text embedding.

In [0]:
from __future__ import division

In [0]:
!pwd

In [0]:
!pip install -r ../requirements.txt

In [0]:
#%load etl.py

from __future__ import print_function
import base64
import binascii
import datetime
import json
import logging
import numpy as np
import os
import random
import requests
import sys
import time
import uuid

SUPPORTED_ARCHITECTURES = ["avr", "alphaev56", "arm", "m68k", "mips", 
                           "mipsel", "powerpc", "s390", "sh4", "sparc", "x86_64", "xtensa"]

logging.basicConfig(level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')

class Server(object):
    """
    Client for the challenge server at `url`.

    Holds one `requests` session plus the state of the most recent exchange
    (challenge binary, candidate targets, revealed answer, win count) and
    simple bookkeeping about failed requests.
    """
    url = 'https://mlb.praetorian.com'
    log = logging.getLogger(__name__)
    # Upper bound on the extra attempts get_data() makes per sample when a
    # reply comes back with a status other than 200.
    MAX_RETRIES = 5
    # Seconds to pause after a failed request; used for both the log message
    # and the actual sleep so the two cannot drift apart.
    RETRY_SLEEP_SECONDS = 60

    def __init__(self):
        self.session = requests.session()
        # in a sample of a few dozen, self.binary is either 32 or 36 bytes so pad X to 36 byte columns
        self.binary  = None
        self.hash    = None
        self.wins    = 0
        self.targets = []
        self.rate_limit_count = 0
        self.unknown_server_exception_count = 0
        # kept for backward compatibility; _request() does not read it
        self.retry_wait = 10
        self.count = 0
        self.failure_record = []
        self.start_time = datetime.datetime.now()
        self.response = None
        # populated by _request() / post(); defined up front so they can be
        # read safely before the first exchange
        self.status_code = None
        self.ans = 'unknown'

    def _record_failure(self, kind):
        """Append a failure entry (type, request count, seconds since start)."""
        elapsed = (datetime.datetime.now() - self.start_time).total_seconds()
        self.failure_record.append({'type': kind,
                                    'count': self.count,
                                    'time': elapsed})

    def _request(self, route, method='get', data=None):
        """
        Issue one request and return the decoded JSON body.

        Any failure (HTTP 429, HTTP 500, or an exception raised while sending
        the request or decoding the body) is logged, followed by a pause of
        RETRY_SLEEP_SECONDS, and the request is retried indefinitely.

        @param route: path appended to `url`
        @param method: 'get' for GET, anything else is sent as POST
        @param data: form data for POST requests
        @returns: parsed JSON reply
        """
        while True:
            # NOTE(review): nothing in this class increments self.count, so this
            # reset only triggers if external code updates the counter.
            if self.count > 597:
                print('Resetting session at count {}'.format(self.count))
                self.session = requests.session()
            try:
                if method == 'get':
                    r = self.session.get(self.url + route)
                else:
                    r = self.session.post(self.url + route, data=data)
                self.status_code = r.status_code
                if r.status_code == 429:
                    self.rate_limit_count += 1
                    self._record_failure('rate_limit')
                    raise Exception('Rate Limit Exception')
                if r.status_code == 500:
                    self.unknown_server_exception_count += 1
                    self._record_failure('unknown_server_exception')
                    raise Exception('Unknown Server Exception')
                self.response = r
                return r.json()
            except Exception as e:
                self.log.error(e)
                self.log.info('Waiting {} seconds before next request'.format(
                    self.RETRY_SLEEP_SECONDS))
                time.sleep(self.RETRY_SLEEP_SECONDS)
                self.status_code = None

    def get(self):
        """Fetch a challenge; stores its candidate targets and base64 binary."""
        r = self._request("/challenge")
        self.targets = r.get('target', [])
        # removed base64.base64decode(r.get('binary', '')) to allow writes to disk without re-encoding
        self.binary  = r.get('binary', '')
        return r

    def post(self, target):
        """Submit `target` as the guess; stores win count, hash and revealed answer."""
        r = self._request("/solve", method="post", data={"target": target})
        self.wins = r.get('correct', 0)
        self.hash = r.get('hash', self.hash)
        self.ans  = r.get('target', 'unknown')
        return r

    def get_data(self, number=10000):
        """
        Collect `number` labelled samples by answering each challenge with its
        first candidate target and recording the answer the server reveals.

        @param number: number of samples to return
        @returns {binary_data: values, targets: values, answers: values}
        where binary_data = [ "<base64 encoded string>", ... ]
        targets =  [ [ "avr", "x86_64", ... ], ... ]
        answers = [one item from the matching targets entry, ...]
        @raises IndexError: if a challenge reply carried no targets
        """
        data = {}
        # If we grab a large data set make it a little more efficient by pre-allocating
        data['binary_data'] = [None]*number
        data['targets'] = [None]*number
        data['answers'] = [None]*number
        for i in range(number):
            # one retry budget shared by the get and the post of this sample
            count = 0
            self.get()
            while (self.status_code != 200) and (count < self.MAX_RETRIES):
                print('Status code != 200. Retrying')
                count = count + 1
                self.get()
            data['binary_data'][i] = self.binary
            data['targets'][i] = self.targets
            self.post(self.targets[0])
            while (self.status_code != 200) and (count < self.MAX_RETRIES):
                print('post status code {}'.format(self.status_code))
                count = count + 1
                self.post(self.targets[0])
            data['answers'][i] = self.ans
        return data

    def get_data_sets(self, num_train=1024, num_test=128, num_dev=128):
        """
        Fetch three independent sample sets from the server.

        @param num_train: number of samples in the 'train' set
        @param num_test: number of samples in the 'test' set
        @param num_dev: number of samples in the 'dev' set
        @returns {'train': {'binary_data': [num_train values],
                            'targets': [num_train lists of candidates],
                            'answers': [num_train values]},
                  'dev': format as with 'train',
                  'test': format as with 'train'
        }
        """
        data = {}
        data['train'] = self.get_data(num_train)
        data['dev'] = self.get_data(num_dev)
        data['test'] = self.get_data(num_test)
        return data
        
        
def submit(number):
    """
    Play `number` rounds against the server, guessing a random candidate each time.

    Each round fetches a challenge, posts a uniformly random pick from its
    targets, and logs the guess, the revealed answer and the running win count.
    """
    client = Server()
    report = "Guess:[{: >9}]   Answer:[{: >9}]   Wins:[{: >3}]"

    for _round in range(number):
        # query the /challenge endpoint, then /solve it with a random candidate
        client.get()
        guess = random.choice(client.targets)
        client.post(guess)

        client.log.info(report.format(guess, client.ans, client.wins))

        # a hash is only handed out after 500 consecutive correct answers,
        # which is very unlikely with random guessing
        if not client.hash:
            continue
        client.log.info("You win! {}".format(client.hash))




        
def store(data, root_dir=None):
    """
    Store data to root_dir as JSON in a generated filename.

    The name is "<num_train>_<num_dev>_<num_test>_<8 chars of a uuid1>.json",
    where the counts are the lengths of each split's 'answers' list.

    @param data: dict with 'train', 'dev' and 'test' entries, each holding an
                 'answers' list (format specified in get_data_sets)
    @param root_dir: directory to write into; the current working directory
                     is used when it is None or empty
    @returns: path of the file that was written, so the caller can find it
    """
    # extract some info from data to generate friendly part of filename
    sizes = [str(len(data[split]['answers'])) for split in ('train', 'dev', 'test')]

    root_dir = os.path.realpath(root_dir) if root_dir else os.getcwd()
    file_name = "_".join(sizes + [str(uuid.uuid1())[0:8]]) + ".json"
    file_path = os.path.join(root_dir, file_name)

    # write-only access is enough; the file is never read back here
    with open(file_path, 'w') as f:
        json.dump(data, f)
    return file_path
            
def load(path):
    """
    Read one JSON data file and return its parsed contents.

    The file is expected to hold data in the format specified in get_data_sets.
    """
    with open(os.path.realpath(path), 'r') as handle:
        return json.load(handle)

def load_dir(path):
    """
    Load data from an entire directory tree, or from a single file.

    When `path` is a directory, every '*.json' file beneath it is loaded and
    merged split by split. Directories and files are visited in sorted order
    so the merged sample order is the same on every run and machine.
    When `path` is not a directory it is loaded as one data file.

    Data must have the format specified in get_data_sets.

    @param path: directory containing .json data files, or one data file
    @returns: dict with 'train', 'dev' and 'test' entries, each holding
              'answers', 'targets' and 'binary_data' lists
    """
    if not os.path.isdir(path):
        return load(os.path.abspath(path))

    merged = {split: {'answers': [], 'targets': [], 'binary_data': []}
              for split in ('train', 'dev', 'test')}
    for root, dirs, filenames in os.walk(path, topdown=True):
        # with topdown=True, sorting `dirs` in place fixes the traversal order
        dirs.sort()
        for filename in sorted(filenames):
            if filename.endswith('.json'):
                merged = merge_data(merged, load(os.path.join(root, filename)))
    return merged

def merge_data(dict1, dict2):
    """
    Perform list additions for two identically formatted dicts with lists at depth 2.

    For every dict2[key][key2], the list is appended to dict1[key][key2].
    dict1 is updated in place (its inner lists are replaced by new,
    concatenated lists) and returned; dict2 is left untouched.

    @param dict1: destination dict, e.g. {'train': {'answers': [...], ...}, ...}
    @param dict2: dict whose lists are appended; every key and inner key must
                  already exist in dict1
    @returns: dict1
    """
    for key in dict2:
        assert key in dict1, 'unexpected top-level key: {!r}'.format(key)
        for key2 in dict2[key]:
            # check the destination: the inner key always exists in dict2[key]
            assert key2 in dict1[key], 'unexpected key {!r} under {!r}'.format(key2, key)
            dict1[key][key2] = dict1[key][key2] + dict2[key][key2]
    return dict1

def hex_data(base64_binary_data, stride=1, expected_len=None):
    """
    Decode base64 samples and render each one as hexadecimal text.

    @params base64_binary_data: iterable of base64 encoded strings, one per sample
    @params stride: number of bytes to consider a word; only consulted on
                    the python2 path
    @params expected_len: length used by the python2 padding / misfit check;
                          only consulted on the python2 path
    @returns: np.array built from the per-sample results. On python3 each
              sample is a single hex string; on python2 each sample is a list
              of hexlified `stride`-byte words.
    """
    # base64.b64decode discards characters outside the base64 alphabet before
    # its padding check, so malformed input is not reported here.
    on_python3 = sys.version_info > (3,0)
    rendered = []

    for encoded in base64_binary_data:
        raw = base64.b64decode(encoded)

        if on_python3:
            # one contiguous, zero-padded hex document per sample
            rendered.append(binascii.hexlify(raw).decode('utf-8'))
            continue

        # python2 only: cut the sample into fixed-width words first
        words = [raw[start:start+stride] for start in range(0, len(raw), stride)]
        if expected_len:
            # NOTE(review): this pads when the sample is at least expected_len
            # words long and reports shorter samples as misfits, which looks
            # inverted relative to the stated purpose; confirm before relying on it.
            delta = len(words) - expected_len
            div = delta/stride
            if (delta % stride) or (delta < 0):
                print('misfit data', words)
            else:
                words.extend(['\x00'*stride]*div)
        rendered.append([binascii.hexlify(word) for word in words])

    return np.array(rendered)

def class_to_ones_hot(answers, targets, supported_architectures):
    """
    Encode answers and candidate targets as 0/1 rows over supported_architectures.

    @param answers: one architecture name per sample
    @param targets: one list of candidate architecture names per sample
    @param supported_architectures: list fixing the column order
    @returns: (Y, allowed_Y) np.arrays; Y[i] has a single 1 at the answer's
              column, allowed_Y[i] has a 1 at every candidate's column
    """
    truth_rows, candidate_rows = [], []
    for answer, candidates in zip(answers, targets):
        truth = [0]*len(supported_architectures)
        truth[supported_architectures.index(answer)] = 1
        truth_rows.append(truth)

        mask = [0]*len(supported_architectures)
        for arch in candidates:
            mask[supported_architectures.index(arch)] = 1
        candidate_rows.append(mask)
    return np.array(truth_rows), np.array(candidate_rows)

        

## Fetch data on first run of this notebook
Fetch some data in the format supplied by etl.py (train, dev, test sets)

In [0]:
### Only run this cell the first time
%cd ../
!wget https://s3.us-east-2.amazonaws.com/isa-classifier/isa_classifier_data.tar.gz
!tar xvf isa_classifier_data.tar.gz


In [0]:
!ls ../ml_challenge




### (skip?) Get and store new data
We will store the data in as close to the original form as possible.
We will do a small sample walkthrough of loading data and transforming it here as far as 
binary_data = [ [ 32 or 36 hex numbers ], ...].
We also transform the ISA class into one-hot vectors.
The word embedding transformations will be done in text_embeddings/I-TFID.ipynb, word2vec.ipynb 

In [0]:
import etl
s = etl.Server()
data_set1 = s.get_data_sets(num_train=4096, num_test=512, num_dev=512)
etl.store(data_set1, '../ml_challenge/')

In [0]:
print('binary_data', data_set1['train']['binary_data'][0:4])

In [0]:
!pwd

In [0]:
!ls ../ml_challenge/


In [0]:
import etl

data_set = etl.load('../ml_challenge/2048_256_256_dd67c64e.json')

In [0]:
len(data_set['train']['binary_data'])

In [0]:
orig_X_train, orig_Y_train, orig_train_targets = data_set['train']['binary_data'], data_set['train']['answers'], data_set['train']['targets']
orig_X_dev, orig_Y_dev, orig_dev_targets = data_set['dev']['binary_data'], data_set['dev']['answers'], data_set['dev']['targets']
orig_X_test, orig_Y_test, orig_test_targets = data_set['test']['binary_data'], data_set['test']['answers'], data_set['test']['targets']

print('orig_X_train[0:4]', '\n', orig_X_train[0:4])
print('orig_Y_train[0:4]', '\n', orig_Y_train[0:4])
print('orig_train_targets[0:4]', '\n', orig_train_targets[0:4])


In [0]:
import binascii
a = b'00011000'
print(int(a))
b = hex(int(a))
print(b)
binascii.hexlify(a)

In [0]:
base64_encoded = u'GAAAABgAAAAAAAAAqAAAAABIDhBHmgJa2g4AAAAAQaAAAD0kAABetxEE/0cCAOJDAAAQIgAAQbAAAF0kAABCoA=='

In [0]:
len(orig_X_train[0])

In [0]:
decoded = base64.b64decode(orig_X_train[0])
print(decoded)

In [0]:
int_bytes = [x for x in decoded]
print(int_bytes, len(int_bytes))

In [0]:
hex_bytes = [ hex(x) for x in int_bytes]
print(hex_bytes, len(hex_bytes))

In [0]:
# we want the values left 0-padded in a single string (doc)
hex_bytes = binascii.hexlify(decoded).decode('utf-8')
print(hex_bytes, len(hex_bytes))

In [0]:
# handy when developing etl in this notebook
from importlib import reload
if 'etl' in globals():
    reload(etl)
else:
    import etl

In [0]:

hex_X_train = etl.hex_data(orig_X_train)
hex_X_dev = etl.hex_data(orig_X_dev)
hex_X_test = etl.hex_data(orig_X_test)

print('hex_X_train[0:4]', hex_X_train[0:4], '\n')



In [0]:
def compare_vecs(X, num_indices, vectorizer=None):
    """
    sample the representation of two vecs over first num_indices of vector

    Prints, side by side for rows 0 and 1 of X, the first num_indices entries
    of the vectorizer's inverse transform and the first num_indices column
    values.

    X: document-term matrix with at least two rows
    num_indices: how many leading entries to print per row
    vectorizer: fitted vectorizer providing inverse_transform; defaults to
                the notebook-level `v`
    """
    if vectorizer is None:
        vectorizer = v
    # Only rows 0 and 1 are shown, so convert just those rows, and do it once.
    first_two = X[0:2]
    features = vectorizer.inverse_transform(first_two)
    # .toarray() replaces the `.A` shorthand, which newer SciPy sparse matrices lack
    dense = first_two.toarray() if hasattr(first_two, 'toarray') else np.asarray(first_two)
    # NOTE(review): inverse_transform lists only the terms present in a row,
    # while `dense` is indexed by column, so the printed term and value on one
    # line do not necessarily belong together - confirm this is intended.
    rows = zip(features[0][0:num_indices], dense[0][0:num_indices],
               features[1][0:num_indices], dense[1][0:num_indices])
    for feature_0, freq_0, feature_1, freq_1 in rows:
        print(feature_0, freq_0, "   ", feature_1, freq_1)

        

In [0]:
compare_vecs(X_cv, 50)

In [0]:
# `hex_X_train` is the array the vectorizer `v` is fitted on; no `hexdoc_X_train`
# is assigned anywhere in this notebook.
bow_transform2 = v.transform(hex_X_train[0:2])
print(bow_transform2)
print(bow_transform2[0:2])

In [0]:
from sklearn.feature_extraction.text import CountVectorizer
#from IPython.core.debugger import set_trace

vec_opts = {
    "ngram_range": (1, 6),  # allow n-grams of 1-6 words in length (32-bits)
    "analyzer": "word",     # analyze hex words
    "token_pattern": "..",  # treat two characters as a word (e.g. 4b)
    "min_df": 2,          # for demo purposes, be very selective about features
    "max_df": .7
}
v = CountVectorizer(**vec_opts)
X_cv = v.fit_transform(hex_X_train)

In [0]:
print(list(v.vocabulary_.items())[0:10])
print(len(v.vocabulary_))

In [0]:
print(len(X_cv.indices))
print(type(X_cv))

In [0]:
Xd = X_cv.todense()
print(Xd.shape)


In [0]:
print(Xd.A[1][0:280])
print(Xd.A[2][0:280])
print(len(Xd.A[1]), len(Xd.A[2]))

In [0]:
X_cv.shape

In [0]:
print(X_cv.A[1][1])


In [0]:
from sklearn.feature_extraction.text import TfidfTransformer

idf_opts = {"use_idf": True}
idf = TfidfTransformer(**idf_opts)

# perform the idf transform
X_idf = idf.fit_transform(X_cv)

In [0]:
X_idf

In [0]:
from sklearn.naive_bayes import MultinomialNB
mnbClassifier = MultinomialNB(alpha=.01)


In [0]:
mnb_model = mnbClassifier.fit(X_idf, np.array(orig_Y_train))

In [0]:
mnb_class2 = MultinomialNB(alpha=.9)
mnb_m2 = mnb_class2.fit(X_idf, np.array(orig_Y_train))
mnb_m2

In [0]:
mnb_m2.coef_.shape

In [0]:
diff = mnb_m2.coef_.argmax(axis=0) - mnb_model.coef_.argmax(axis=0)
print(diff, len(diff))

In [0]:
print(diff.nonzero(), len(diff.nonzero()[0]))

In [0]:
np.array([[1,2,3], [3,4,5]]).argmax(axis=0)

In [0]:
### fraction of 


In [0]:
mnb_model.coef_ - mnb_m2.coef_

In [0]:
mnb_model.coef_

In [0]:
delta = (mnb_model.coef_ - mnb_m2.coef_)*100.0 /mnb_model.coef_
print(delta)

In [0]:
delta.mean()

In [0]:
mnb_model

In [0]:
raw_predictions = mnb_model.predict(X_idf)

In [0]:
print(etl.SUPPORTED_ARCHITECTURES)

In [0]:
raw_predictions[0:10]

In [0]:
print(mnb_model.classes_.tolist())


In [0]:

Y_train, allowed_Y_train = class_to_ones_hot(orig_Y_train, orig_train_targets, mnb_model.classes_.tolist())
Y_dev, allowed_Y_dev = class_to_ones_hot(orig_Y_dev, orig_dev_targets, mnb_model.classes_.tolist())
Y_test, allowed_Y_test = class_to_ones_hot(orig_Y_test, orig_test_targets, mnb_model.classes_.tolist())

print('Y_train[0:4]', '\n', Y_train[0:4], '\n')
print('allowed_Y_train[0:4]', '\n', allowed_Y_train[0:4], '\n')
print(list(map("".join, hex_X_train[0:2])))


In [0]:
probs_train = mnb_model.predict_proba(X_idf)
print("raw probs")
print(probs_train[0:4])
print("allowed probs")
print(probs_train[0:2]*allowed_Y_train[0:2])
print("allowed")
print(allowed_Y_train[0:2])
print(np.argmax(probs_train[0:2]*allowed_Y_train[0:2], axis=1))
print(list(map(mnb_model.classes_.tolist().__getitem__, np.argmax(probs_train[0:10]*allowed_Y_train[0:10], axis=1))))
print("correct")
print(orig_Y_train[0:10])
print(Y_train[0:10])

In [0]:
def guess_from_target(probs, allowed_Y, supported_architectures, ones_hot=True):
    """
    Improve our chances by taking the max over the possible targets (6 instead of 12)

    probs: numerical array of shape (m, n_classes)
    allowed_Y: 0/1 array of shape (m, n_classes), 1 marking an allowed target
    supported_architectures: list of class names in column order
    ones_hot: choose the return representation

    returns: list of m most likely ISA arch names among the allowed targets
             (ones_hot=False), or an (m, n_classes) one-hot array of the best
             guess (ones_hot=True)
    """
    probs = np.asarray(probs)
    allowed = np.asarray(allowed_Y).astype(bool)
    # Mask disallowed classes with -inf instead of multiplying by 0, so a
    # disallowed class can never win a tie when the allowed ones score 0.
    best = np.argmax(np.where(allowed, probs, -np.inf), axis=1)
    if ones_hot:
        result = np.zeros(probs.shape)
        # pair each row with its own winning column
        result[np.arange(len(best)), best] = 1
        return result
    return [supported_architectures[i] for i in best]
    

In [0]:
print(len(probs_train), len(allowed_Y_train))
print(probs_train[0], allowed_Y_train[0], orig_train_targets[0])

In [0]:
np.argmax(probs_train*allowed_Y_train, axis=1)

In [0]:
predictions = guess_from_target(probs_train, allowed_Y_train, mnb_model.classes_.tolist(), ones_hot=False)

In [0]:
predictions[0], orig_Y_train[0]

In [0]:
import pandas as pd

In [0]:
D = pd.DataFrame(predictions)
D.describe()

In [0]:
C = pd.Series(orig_Y_train)


In [0]:
C.value_counts()

In [0]:
# We have a pretty good distribution of classes < 10% variance in counts

In [0]:
def describe_results(predictions, orig_Y, label='training'):
    """
    Print the error rate and a sample of mistakes, and return all mistakes.

    predictions: sequence of predicted class names
    orig_Y: sequence of true class names, indexed like predictions
    label: name of the data set shown in the printed error line

    returns: list of [predicted, correct] pairs, one per wrong prediction
    """
    wrong = [[predicted, orig_Y[i]]
             for i, predicted in enumerate(predictions)
             if predicted != orig_Y[i]]
    total = len(predictions)
    # report 0.0 rather than dividing by zero when there is nothing to score
    error_pct = 100.0*len(wrong)/total if total else 0.0
    print('{} error: {}%'.format(label, error_pct))
    print('some mistakes')
    print(wrong[0:15])
    return wrong

In [0]:
wrong = describe_results(predictions,orig_Y_train)

In [0]:
X_cv

In [0]:
pdw = pd.DataFrame(wrong)

In [0]:
pdw.columns = ['incorrect', 'correct']

In [0]:
pdw.describe()

In [0]:
pdw.groupby(by=pdw.correct).describe()

In [0]:
# model is worst at predicting sh4, then arm, m68k, xtensa


In [0]:
pdw.groupby(by=pdw.incorrect).describe()

In [0]:
# most false positives come from x86_64 and m68k, we are slightly over-fitting to them
pdw[pdw.incorrect == 'm68k'][0:5]

In [0]:
pdw.head()

In [0]:
pdw[pdw.correct == 'sparc']

# repeat for dev dataset

In [0]:
# Chain the count vectorizer and the tf-idf transformer into one pipeline.
from sklearn.pipeline import Pipeline
pipeline = Pipeline([
    ('vec',   CountVectorizer(**vec_opts)),
    ('idf',  TfidfTransformer(**idf_opts))
    #('mnb_classifier',MultinomialNB())
])

# NOTE(review): fit_transform fits both steps on the dev documents themselves.
# NOTE(review): `hexdoc_X_dev` is not assigned anywhere in the visible notebook
# (`hex_X_dev` is) - confirm which input was intended.
X_dev_idf = pipeline.fit_transform(hexdoc_X_dev)

In [0]:
#X_cv_dev = v.transform(hex_X_dev, lowercase=False)

In [0]:
# Count dev-set n-grams over exactly the columns learned from the training set.
# `min_df` is left out: it is ignored when a fixed `vocabulary` is supplied.
vec_opts = {
    "ngram_range": (1, 6),  # must match the range the training vocabulary was built with
    "analyzer": "word",     # analyze hex words
    "token_pattern": "..",  # treat two characters as a word (e.g. 4b)
    "vocabulary": v.vocabulary_
}
v_dev = CountVectorizer(**vec_opts)
# `hex_X_dev` is the dev array built by etl.hex_data; no `hexdoc_X_dev` is assigned in this notebook
X_cv_dev = v_dev.fit_transform(hex_X_dev)

In [0]:
idf_dev = TfidfTransformer(**idf_opts)

# perform the idf transform with the dev-only transformer; calling
# idf.fit_transform here would overwrite the IDF weights learned from the
# training set that the classifier was fitted against
X_idf_dev = idf_dev.fit_transform(X_cv_dev)

In [0]:
X_idf_dev = idf.transform(X_cv_dev)

In [0]:
X_cv_dev

In [0]:
X_idf_dev

In [0]:
probs_dev = mnb_model.predict_proba(X_idf_dev)

print(len(probs_dev), len(allowed_Y_dev))
print(probs_dev[0], allowed_Y_dev[0], orig_dev_targets[0])

In [0]:
predictions_dev = guess_from_target(probs_dev, allowed_Y_dev, mnb_model.classes_.tolist(), ones_hot=False)

In [0]:
wrong = describe_results(predictions_dev, orig_Y_dev)

# end dev

In [0]:
a = np.array([[7,2,4],[2,0,4]])
b = np.array([[1,0,1], [0,0,1]])
c = a*b
print(c)
d = np.zeros(c.shape)
print(np.argmax(c, axis=1))
print(d)
d[np.argmax(c, axis=1)] = 1
d

In [0]:
np.max(c, axis=1)

In [0]:
c[ c >= np.max(c, axis=1, keepdims=1)]

In [0]:
a = np.array([[1,2], [2, 1]])
b = np.array([[1,-1], [-1, 1]])
print(a)
print(b)
print(a*b)
print(np.multiply(a,b))

In [0]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import make_multilabel_classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
from sklearn.preprocessing import LabelBinarizer
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import CCA


def plot_hyperplane(clf, min_x, max_x, linestyle, label):
    """
    Draw the line given by the first two coefficients and the intercept of
    `clf` across the x-range [min_x - 5, max_x + 5] on the current plot.
    """
    weights = clf.coef_[0]
    slope = -weights[0] / weights[1]
    # extend the range so the line is long enough
    xs = np.linspace(min_x - 5, max_x + 5)
    ys = slope * xs - (clf.intercept_[0]) / weights[1]
    plt.plot(xs, ys, linestyle, label=label)


def plot_subfigure(X, Y, subplot, title, transform):
    """
    Project X with PCA or CCA, fit one linear SVM per label column of Y, and
    scatter the first two projected components in one cell of a 2x2 grid.

    X: dense feature array of shape (m, n_features)
    Y: 0/1 indicator array of shape (m, n_labels); columns 0 and 1 are the
       two classes highlighted in the plot
    subplot: index of the subplot (1-4) to draw into
    title: subplot title
    transform: "pca" or "cca"

    raises ValueError for any other transform
    """
    if transform == "pca":
        X = PCA(n_components=12).fit_transform(X)
    elif transform == "cca":
        # use the labels passed in, not a notebook-level global
        X = CCA(n_components=12).fit(X, Y).transform(X)
    else:
        raise ValueError('transform must be "pca" or "cca", got {!r}'.format(transform))

    min_x = np.min(X[:, 0])
    max_x = np.max(X[:, 0])

    min_y = np.min(X[:, 1])
    max_y = np.max(X[:, 1])

    # One-vs-rest wrapper: it accepts the indicator matrix directly and exposes
    # the per-label classifiers as estimators_, which a bare SVC does not have.
    classif = OneVsRestClassifier(SVC(kernel='linear'))
    classif.fit(X, Y)

    plt.subplot(2, 2, subplot)
    plt.title(title)

    zero_class = np.where(Y[:, 0])
    one_class = np.where(Y[:, 1])
    plt.scatter(X[:, 0], X[:, 1], s=40, c='gray', edgecolors=(0, 0, 0))
    plt.scatter(X[zero_class, 0], X[zero_class, 1], s=160, edgecolors='b',
                facecolors='none', linewidths=2, label='Class 1')
    plt.scatter(X[one_class, 0], X[one_class, 1], s=80, edgecolors='orange',
                facecolors='none', linewidths=2, label='Class 2')

    # NOTE(review): the classifiers are fitted on all 12 components while
    # plot_hyperplane draws from the first two coefficients only, so these
    # lines are not the exact boundaries in the plotted plane - confirm.
    plot_hyperplane(classif.estimators_[0], min_x, max_x, 'k--',
                    'Boundary\nfor class 1')
    plot_hyperplane(classif.estimators_[1], min_x, max_x, 'k-.',
                    'Boundary\nfor class 2')
    plt.xticks(())
    plt.yticks(())

    plt.xlim(min_x - .5 * max_x, max_x + .5 * max_x)
    plt.ylim(min_y - .5 * max_y, max_y + .5 * max_y)
    if subplot == 2:
        plt.xlabel('First principal component')
        plt.ylabel('Second principal component')
        plt.legend(loc="upper left")


# Draw the CCA and PCA projections side by side in one figure.
plt.figure(figsize=(8, 6))

# X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
#                                       allow_unlabeled=True,
#                                       random_state=1)
# NOTE(review): with the line above commented out, `X` (and `Y`, used in the
# second call below) are not assigned anywhere in the visible notebook -
# presumably a tf-idf matrix and its one-hot labels; confirm before running.
X_arr = X.toarray()

plot_subfigure(X_arr, Y_train, 1, "With unlabeled samples + CCA", "cca")
plot_subfigure(X_arr, Y, 2, "With unlabeled samples + PCA", "pca")

# X, Y = make_multilabel_classification(n_classes=2, n_labels=1,
#                                       allow_unlabeled=False,
#                                       random_state=1)

# plot_subfigure(X, Y, 3, "Without unlabeled samples + CCA", "cca")
# plot_subfigure(X, Y, 4, "Without unlabeled samples + PCA", "pca")

plt.subplots_adjust(.04, .02, .97, .94, .09, .2)
plt.show()

In [0]:
Xt, Yt = make_multilabel_classification(n_classes=12, n_labels=1,
                                      allow_unlabeled=True,
                                      random_state=1)

In [0]:
Yt[0:2]

In [0]:
print(Xt.shape, Yt.shape)

In [0]:
# print() call form: the rest of the notebook is Python 3, where the
# statement form is a SyntaxError
print(Y.shape)
Y