# mnb parameter tuning (Multinomial Naieve Bayes)

In [6]:
from __future__ import division
import etl


In [7]:
!free -h

              total        used        free      shared  buff/cache   available
Mem:            29G        504M         20G        884K        8.2G         28G
Swap:            0B          0B          0B


In [8]:
!pwd

/home/jovyan/isa-classifier/sklearn


In [9]:
!ls ../ml_challenge/

1024_128_128_eba414fa.json  2048_256_256_cb0707f0.json
2048_256_256_0b966df8.json  2048_256_256_dd02089e.json
2048_256_256_0baba5f4.json  2048_256_256_dd67c64e.json
2048_256_256_2847388a.json  4096_512_512_6f9ef7d8.json
2048_256_256_666e0b7e.json  512_128_128_76fe753c.json
2048_256_256_8f6606b2.json


In [12]:
#data_set = etl.load_dir('../ml_challenge')
data_set = etl.load('../ml_challenge/4096_512_512_6f9ef7d8.json')


In [13]:
orig_X_train, orig_Y_train, orig_train_targets = data_set['train']['binary_data'], data_set['train']['answers'], data_set['train']['targets']
orig_X_dev, orig_Y_dev, orig_dev_targets = data_set['dev']['binary_data'], data_set['dev']['answers'], data_set['dev']['targets']
orig_X_test, orig_Y_test, orig_test_targets = data_set['test']['binary_data'], data_set['test']['answers'], data_set['test']['targets']

print('orig_X_train[0:4]', '\n', orig_X_train[0:4])
print('orig_Y_train[0:4]', '\n', orig_Y_train[0:4])
print('orig_train_targets[0:4]', '\n', orig_train_targets[0:4])

orig_X_train[0:4] 
 ['GAAAABgAAAAAAAAAqAAAAABIDhBHmgJa2g4AAAAAQaAAAD0kAABetxEE/0cCAOJDAAAQIgAAQbAAAF0kAABCoA==', 'AAAAAAABelIABHgaARsNHgAYAAAAGAAAAAAAAACMAAAAAEMOEEGaAl7aDgAAAwDjQwAAPSQAAGGwAABCIAQAQg==', 'AOUBIRjRGdMSYRnUEB9iYRjRMiEyITIhCtMCITJjFtAyISIhYmMA5jIhIiFxUiIhclIiIQtACQAA4AR/Jk/2aQ==', 'g/gCfui4AAAAAOsTSGPQixSVAAAAAIkVAAAAAIPAAYXAfum4AAAAAOsTSGPQD7aSAAAAAIkVAAAAAIPAAYP4AQ==']
orig_Y_train[0:4] 
 ['alphaev56', 'alphaev56', 'sh4', 'x86_64']
orig_train_targets[0:4] 
 [['alphaev56', 'arm', 'm68k', 'mips', 'x86_64', 'xtensa'], ['alphaev56', 'arm', 'avr', 'm68k', 'mipsel', 'powerpc'], ['avr', 'm68k', 'mips', 'mipsel', 'sh4', 'sparc'], ['arm', 'm68k', 'mips', 'mipsel', 'sh4', 'x86_64']]


In [15]:
hex_X_train = etl.hex_data(orig_X_train)
hex_X_dev = etl.hex_data(orig_X_dev)
hex_X_test = etl.hex_data(orig_X_test)


In [20]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import pandas as pd

In [25]:
def guess_from_target(probs, allowed_Y, supported_architectures):
    """
    Improve our chances by taking the max over the possible targets (6 instead of 12)
    @probs: numerical array of shape (m, n_classes)
    @allowed_Y: ones-hot array of shape (m, n_classes)
    @supported_architectures: Use CountVectorizer.classes_ not etl.SUPPORTED_ARCHITECTURES for ordering
    
    @returns: (m, 1) of the most likely ISA arch names after discards or
             (m, n_classes) one-hot representation of best guess
    """
    return list(map(supported_architectures.__getitem__, np.argmax(probs*allowed_Y, axis=1)))

In [131]:
def describe_results(predictions, probs, orig_Y, param_str, error_type='train'):
    """
    @returns tuple(index, prediction, actual_value)
    """
    wrong = []
    for i in range(len(predictions)):
        if predictions[i] != orig_Y[i]:
            wrong.append([i, predictions[i], orig_Y[i], probs[i]])
    if error_type == 'train':
        print(param_str)
    print('{} error: {}'.format(error_type, len(wrong)/len(predictions)))
    return wrong, len(wrong)/len(predictions)

In [27]:
params = {}
params['alpha'] = [0.0001, 0.001, 0.1]
params['max_ngram_range'] = [4, 6, 8]
params['smooth_idf'] = [True, False]
params['norm'] = ['l2', None]
params['sublinear_tf'] = [True, False]
params['min_df'] = [1, 2]
params['max_df'] = [.5, .9]


In [130]:
def search_hyperparams(params):
    df = pd.DataFrame()
    wrong_train = {}
    wrong_dev = {}
    for alpha in params['alpha']:
        for max_ngram_range in params['max_ngram_range']:
            for smooth_idf in params['smooth_idf']:
                for norm in params['norm']:
                    for sublinear_tf in params['sublinear_tf']:
                        for min_df in params['min_df']:
                            for max_df in params['max_df']:
                                vec_opts = {
                                    "ngram_range": (1, max_ngram_range),  # allow n-grams of 1-4 words in length (32-bits)
                                    "analyzer": "word",     # analyze hex words
                                    "token_pattern": "..",  # treat two characters as a word (e.g. 4b)
                                    "min_df": min_df,          # for demo purposes, be very selective about features
                                    "max_df": max_df
                                }
                                v = CountVectorizer(**vec_opts)
                                X_cv = v.fit_transform(hex_X_train)

                                idf_opts = {"use_idf": True}
                                idf = TfidfTransformer(**idf_opts)

                                # perform the idf transform
                                X_idf = idf.fit_transform(X_cv)

                                mnbClassifier = MultinomialNB(alpha=alpha)

                                mnb_model = mnbClassifier.fit(X_idf, np.array(orig_Y_train))

                                Y_train, allowed_Y_train = etl.class_to_ones_hot(orig_Y_train, orig_train_targets, mnb_model.classes_.tolist())
                                Y_dev, allowed_Y_dev = etl.class_to_ones_hot(orig_Y_dev, orig_dev_targets, mnb_model.classes_.tolist())
                                Y_test, allowed_Y_test = etl.class_to_ones_hot(orig_Y_test, orig_test_targets, mnb_model.classes_.tolist())

                                probs_train = mnb_model.predict_proba(X_idf)

                                predictions = guess_from_target(probs_train, allowed_Y_train, mnb_model.classes_.tolist())

                                param_str = "alpha={}, max_ngram_range={}, smooth_idf={}, norm={}, sublinear_tf={}, min_df={}, max_df={}".format(
                                alpha, max_ngram_range, smooth_idf, norm, sublinear_tf, min_df, max_df)
                                mismatches, train_error = describe_results(predictions, probs_train, orig_Y_train, param_str)
                                
                                wrong_train[param_str + '-train-' + str(train_error)] = mismatches
                                
                                vec_opts.update({'vocabulary': v.vocabulary_})
                                v_dev = CountVectorizer(**vec_opts)
                                X_cv_dev = v_dev.transform(hex_X_dev)
                                X_idf_dev = idf.transform(X_cv_dev)
                                probs_dev = mnb_model.predict_proba(X_idf_dev)

                                predictions_dev = guess_from_target(probs_dev, allowed_Y_dev, mnb_model.classes_.tolist())
                                mismatches, dev_error = describe_results(predictions_dev, probs_dev, orig_Y_dev, param_str, error_type='dev')
                                wrong_dev[param_str + '-dev-' + str(dev_error)] = mismatches
                                
                                params_dict = {'alpha': alpha, 'max_ngram_range': max_ngram_range, 
                                                'smooth_idf': smooth_idf, 'norm': norm, 
                                          'sublinear_tf': sublinear_tf, 'min_df': min_df, 
                                          'max_df': max_df, 'train_error': train_error, 
                                          'dev_error': dev_error}
                                df = df.append(params_dict, ignore_index=True)

    return wrong_train, wrong_dev, df

In [46]:
def param_space_size(params):
    prod = 1
    for k,v in params.items():
        prod *= len(v)
    return prod
param_space_size(params)

288

In [34]:
wrong_train, wrong_dev, df_mnb = search_hyperparams(params)

alpha=0.0001, max_ngram_range=4, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=1, max_df=0.5
train error: 0.000244140625
dev error: 0.00390625
alpha=0.0001, max_ngram_range=4, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=1, max_df=0.9
train error: 0.000244140625
dev error: 0.00390625
alpha=0.0001, max_ngram_range=4, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=2, max_df=0.5
train error: 0.000244140625
dev error: 0.00390625
alpha=0.0001, max_ngram_range=4, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=2, max_df=0.9
train error: 0.00048828125
dev error: 0.00390625
alpha=0.0001, max_ngram_range=4, smooth_idf=True, norm=l2, sublinear_tf=False, min_df=1, max_df=0.5
train error: 0.000244140625
dev error: 0.00390625
alpha=0.0001, max_ngram_range=4, smooth_idf=True, norm=l2, sublinear_tf=False, min_df=1, max_df=0.9
train error: 0.000244140625
dev error: 0.00390625
alpha=0.0001, max_ngram_range=4, smooth_idf=True, norm=l2, sublinear_tf=False, min_df=2, max_df=0.5
train

dev error: 0.001953125
alpha=0.0001, max_ngram_range=6, smooth_idf=False, norm=l2, sublinear_tf=False, min_df=2, max_df=0.9
train error: 0.000244140625
dev error: 0.001953125
alpha=0.0001, max_ngram_range=6, smooth_idf=False, norm=None, sublinear_tf=True, min_df=1, max_df=0.5
train error: 0.000244140625
dev error: 0.005859375
alpha=0.0001, max_ngram_range=6, smooth_idf=False, norm=None, sublinear_tf=True, min_df=1, max_df=0.9
train error: 0.000244140625
dev error: 0.005859375
alpha=0.0001, max_ngram_range=6, smooth_idf=False, norm=None, sublinear_tf=True, min_df=2, max_df=0.5
train error: 0.000244140625
dev error: 0.001953125
alpha=0.0001, max_ngram_range=6, smooth_idf=False, norm=None, sublinear_tf=True, min_df=2, max_df=0.9
train error: 0.000244140625
dev error: 0.001953125
alpha=0.0001, max_ngram_range=6, smooth_idf=False, norm=None, sublinear_tf=False, min_df=1, max_df=0.5
train error: 0.000244140625
dev error: 0.005859375
alpha=0.0001, max_ngram_range=6, smooth_idf=False, norm=Non

alpha=0.001, max_ngram_range=4, smooth_idf=True, norm=None, sublinear_tf=False, min_df=1, max_df=0.9
train error: 0.000244140625
dev error: 0.00390625
alpha=0.001, max_ngram_range=4, smooth_idf=True, norm=None, sublinear_tf=False, min_df=2, max_df=0.5
train error: 0.000732421875
dev error: 0.00390625
alpha=0.001, max_ngram_range=4, smooth_idf=True, norm=None, sublinear_tf=False, min_df=2, max_df=0.9
train error: 0.0009765625
dev error: 0.00390625
alpha=0.001, max_ngram_range=4, smooth_idf=False, norm=l2, sublinear_tf=True, min_df=1, max_df=0.5
train error: 0.000244140625
dev error: 0.00390625
alpha=0.001, max_ngram_range=4, smooth_idf=False, norm=l2, sublinear_tf=True, min_df=1, max_df=0.9
train error: 0.000244140625
dev error: 0.00390625
alpha=0.001, max_ngram_range=4, smooth_idf=False, norm=l2, sublinear_tf=True, min_df=2, max_df=0.5
train error: 0.000732421875
dev error: 0.00390625
alpha=0.001, max_ngram_range=4, smooth_idf=False, norm=l2, sublinear_tf=True, min_df=2, max_df=0.9
tra

dev error: 0.00390625
alpha=0.001, max_ngram_range=8, smooth_idf=True, norm=l2, sublinear_tf=False, min_df=1, max_df=0.5
train error: 0.000244140625
dev error: 0.005859375
alpha=0.001, max_ngram_range=8, smooth_idf=True, norm=l2, sublinear_tf=False, min_df=1, max_df=0.9
train error: 0.000244140625
dev error: 0.005859375
alpha=0.001, max_ngram_range=8, smooth_idf=True, norm=l2, sublinear_tf=False, min_df=2, max_df=0.5
train error: 0.000244140625
dev error: 0.00390625
alpha=0.001, max_ngram_range=8, smooth_idf=True, norm=l2, sublinear_tf=False, min_df=2, max_df=0.9
train error: 0.000244140625
dev error: 0.00390625
alpha=0.001, max_ngram_range=8, smooth_idf=True, norm=None, sublinear_tf=True, min_df=1, max_df=0.5
train error: 0.000244140625
dev error: 0.005859375
alpha=0.001, max_ngram_range=8, smooth_idf=True, norm=None, sublinear_tf=True, min_df=1, max_df=0.9
train error: 0.000244140625
dev error: 0.005859375
alpha=0.001, max_ngram_range=8, smooth_idf=True, norm=None, sublinear_tf=True,

alpha=0.1, max_ngram_range=4, smooth_idf=False, norm=None, sublinear_tf=True, min_df=2, max_df=0.9
train error: 0.002685546875
dev error: 0.005859375
alpha=0.1, max_ngram_range=4, smooth_idf=False, norm=None, sublinear_tf=False, min_df=1, max_df=0.5
train error: 0.000732421875
dev error: 0.005859375
alpha=0.1, max_ngram_range=4, smooth_idf=False, norm=None, sublinear_tf=False, min_df=1, max_df=0.9
train error: 0.0009765625
dev error: 0.005859375
alpha=0.1, max_ngram_range=4, smooth_idf=False, norm=None, sublinear_tf=False, min_df=2, max_df=0.5
train error: 0.002685546875
dev error: 0.005859375
alpha=0.1, max_ngram_range=4, smooth_idf=False, norm=None, sublinear_tf=False, min_df=2, max_df=0.9
train error: 0.002685546875
dev error: 0.005859375
alpha=0.1, max_ngram_range=6, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=1, max_df=0.5
train error: 0.000244140625
dev error: 0.005859375
alpha=0.1, max_ngram_range=6, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=1, max_df=0.9
train

dev error: 0.0078125
alpha=0.1, max_ngram_range=8, smooth_idf=False, norm=l2, sublinear_tf=True, min_df=2, max_df=0.9
train error: 0.001708984375
dev error: 0.0078125
alpha=0.1, max_ngram_range=8, smooth_idf=False, norm=l2, sublinear_tf=False, min_df=1, max_df=0.5
train error: 0.000244140625
dev error: 0.005859375
alpha=0.1, max_ngram_range=8, smooth_idf=False, norm=l2, sublinear_tf=False, min_df=1, max_df=0.9
train error: 0.000244140625
dev error: 0.005859375
alpha=0.1, max_ngram_range=8, smooth_idf=False, norm=l2, sublinear_tf=False, min_df=2, max_df=0.5
train error: 0.001708984375
dev error: 0.0078125
alpha=0.1, max_ngram_range=8, smooth_idf=False, norm=l2, sublinear_tf=False, min_df=2, max_df=0.9
train error: 0.001708984375
dev error: 0.0078125
alpha=0.1, max_ngram_range=8, smooth_idf=False, norm=None, sublinear_tf=True, min_df=1, max_df=0.5
train error: 0.000244140625
dev error: 0.005859375
alpha=0.1, max_ngram_range=8, smooth_idf=False, norm=None, sublinear_tf=True, min_df=1, max

In [42]:
df_mnb.describe()

Unnamed: 0,alpha,dev_error,max_df,max_ngram_range,min_df,smooth_idf,sublinear_tf,train_error
count,288.0,288.0,288.0,288.0,288.0,288.0,288.0,288.0
mean,0.0337,0.005208,0.7,6.0,1.5,0.5,0.5,0.000665
std,0.046964,0.001458,0.200348,1.635836,0.50087,0.50087,0.50087,0.000751
min,0.0001,0.001953,0.5,4.0,1.0,0.0,0.0,0.000244
25%,0.0001,0.003906,0.5,4.0,1.0,0.0,0.0,0.000244
50%,0.001,0.005859,0.7,6.0,1.5,0.5,0.5,0.000244
75%,0.1,0.005859,0.9,8.0,2.0,1.0,1.0,0.000732
max,0.1,0.007812,0.9,8.0,2.0,1.0,1.0,0.002686


In [47]:
df_mnb.groupby(by='train_error').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,alpha,dev_error,max_df,max_ngram_range,min_df,smooth_idf,sublinear_tf
train_error,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.000244,count,184.000000,184.000000,184.000000,184.000000,184.000000,184.000000,184.000000
0.000244,mean,0.017787,0.004925,0.691304,6.434783,1.304348,0.500000,0.500000
0.000244,std,0.037827,0.001274,0.200356,1.559770,0.461386,0.501364,0.501364
0.000244,min,0.000100,0.001953,0.500000,4.000000,1.000000,0.000000,0.000000
0.000244,25%,0.000100,0.003906,0.500000,6.000000,1.000000,0.000000,0.000000
0.000244,50%,0.001000,0.005859,0.500000,6.000000,1.000000,0.500000,0.500000
0.000244,75%,0.001000,0.005859,0.900000,8.000000,2.000000,1.000000,1.000000
0.000244,max,0.100000,0.005859,0.900000,8.000000,2.000000,1.000000,1.000000
0.000488,count,24.000000,24.000000,24.000000,24.000000,24.000000,24.000000,24.000000
0.000488,mean,0.000700,0.003906,0.766667,5.333333,2.000000,0.500000,0.500000


In [48]:
df_mnb.groupby(by='dev_error').describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,alpha,max_df,max_ngram_range,min_df,smooth_idf,sublinear_tf,train_error
dev_error,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0.001953,count,16.0,16.0,16.0,16.0,16.0,16.0,16.0
0.001953,mean,0.0001,0.7,6.0,2.0,0.5,0.5,0.000244
0.001953,std,0.0,0.206559,0.0,0.0,0.516398,0.516398,0.0
0.001953,min,0.0001,0.5,6.0,2.0,0.0,0.0,0.000244
0.001953,25%,0.0001,0.5,6.0,2.0,0.0,0.0,0.000244
0.001953,50%,0.0001,0.7,6.0,2.0,0.5,0.5,0.000244
0.001953,75%,0.0001,0.9,6.0,2.0,1.0,1.0,0.000244
0.001953,max,0.0001,0.9,6.0,2.0,1.0,1.0,0.000244
0.003906,count,96.0,96.0,96.0,96.0,96.0,96.0,96.0
0.003906,mean,0.0007,0.7,5.0,1.666667,0.5,0.5,0.000407


In [52]:
train_min = df_mnb['train_error'].min()
dev_min = df_mnb['dev_error'].min()
print("train_min", train_min, train_min*500)
print("dev_min", dev_min, dev_min*500)


train_min 0.000244140625 0.1220703125
dev_min 0.001953125 0.9765625


In [54]:
df_mnb.loc[df_mnb['dev_error'] == dev_min]

Unnamed: 0,alpha,dev_error,max_df,max_ngram_range,min_df,norm,smooth_idf,sublinear_tf,train_error
34,0.0001,0.001953,0.5,6.0,2.0,l2,1.0,1.0,0.000244
35,0.0001,0.001953,0.9,6.0,2.0,l2,1.0,1.0,0.000244
38,0.0001,0.001953,0.5,6.0,2.0,l2,1.0,0.0,0.000244
39,0.0001,0.001953,0.9,6.0,2.0,l2,1.0,0.0,0.000244
42,0.0001,0.001953,0.5,6.0,2.0,,1.0,1.0,0.000244
43,0.0001,0.001953,0.9,6.0,2.0,,1.0,1.0,0.000244
46,0.0001,0.001953,0.5,6.0,2.0,,1.0,0.0,0.000244
47,0.0001,0.001953,0.9,6.0,2.0,,1.0,0.0,0.000244
50,0.0001,0.001953,0.5,6.0,2.0,l2,0.0,1.0,0.000244
51,0.0001,0.001953,0.9,6.0,2.0,l2,0.0,1.0,0.000244


This shows that the hyperparams mnb is sensitive to for min dev_error are
alpha (<= 0.0001), ngram_range (=6), min_df (=2.0)
max_df, norm, smoothin_idf, sublinear_tf are not selecting


### Let's run the grid search again honing in on alpha

In [97]:
params = {}
params['alpha'] = [1e-6, 1e-5, 1e-4]
params['max_ngram_range'] = [6,]
params['smooth_idf'] = [True]
params['norm'] = ['l2']
params['sublinear_tf'] = [True]
params['min_df'] = [2]
params['max_df'] = [.7]

param_space_size(params)

3

In [98]:
data_set = etl.load_dir('../ml_challenge/')
orig_X_train, orig_Y_train, orig_train_targets = data_set['train']['binary_data'], data_set['train']['answers'], data_set['train']['targets']
orig_X_dev, orig_Y_dev, orig_dev_targets = data_set['dev']['binary_data'], data_set['dev']['answers'], data_set['dev']['targets']
orig_X_test, orig_Y_test, orig_test_targets = data_set['test']['binary_data'], data_set['test']['answers'], data_set['test']['targets']

print('orig_X_train[0:4]', '\n', orig_X_train[0:4])
print('orig_Y_train[0:4]', '\n', orig_Y_train[0:4])
print('orig_train_targets[0:4]', '\n', orig_train_targets[0:4])

hex_X_train = etl.hex_data(orig_X_train)
hex_X_dev = etl.hex_data(orig_X_dev)
hex_X_test = etl.hex_data(orig_X_test)


orig_X_train[0:4] 
 ['AADtjAAywX8ADD0gAADACQAA7AsAMuwMACj9gAIQPSAAAMAJAAD/jAAAQJ0ADMAfAAhIAAAQwZ8ACMAfAAzsDA==', 'AAb/////AAAAAAADbGx4CgAgAJgLAKkRwCAAmBuQkPWgmSDAIACZCMAgAJgrwCAAmQjBAADRAAChAACBAADgCA==', 'jIkAGI1LAACNaQAAAShIJK1pAACMSQAgJSkAARAA//GsSQAgjUIAAK+iABiMwgAIAAAwIYxCAACsogAMPAIAAA==', 'WDAwAFAwEACnSAAAWDDR4lBAEABYQNHeUEAQAFhAMABQQBAAWEAwBFBAEABYQNHaWDAwCFAwEABYMEAAUDAQAA==']
orig_Y_train[0:4] 
 ['powerpc', 'xtensa', 'mips', 's390']
orig_train_targets[0:4] 
 [['alphaev56', 'arm', 'powerpc', 's390', 'sh4', 'xtensa'], ['alphaev56', 'avr', 'mipsel', 'powerpc', 'sh4', 'xtensa'], ['avr', 'm68k', 'mips', 'powerpc', 'sh4', 'sparc'], ['arm', 'avr', 'm68k', 'mips', 'mipsel', 's390']]


In [132]:
wrong_train, wrong_dev, df_mnb = search_hyperparams(params)

alpha=1e-06, max_ngram_range=6, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=2, max_df=0.7
train error: 0.00013626453488372093
dev error: 0.002840909090909091
alpha=1e-05, max_ngram_range=6, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=2, max_df=0.7
train error: 0.0001816860465116279
dev error: 0.002840909090909091
alpha=0.0001, max_ngram_range=6, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=2, max_df=0.7
train error: 0.00022710755813953488
dev error: 0.002840909090909091


In [133]:
for k,v in wrong_dev.items():
    print(k)
    print(len(wrong_dev[k]))

for k,v in wrong_train.items():
    print(k)
    print(len(wrong_train[k]))

alpha=1e-06, max_ngram_range=6, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=2, max_df=0.7-dev-0.002840909090909091
8
alpha=1e-05, max_ngram_range=6, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=2, max_df=0.7-dev-0.002840909090909091
8
alpha=0.0001, max_ngram_range=6, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=2, max_df=0.7-dev-0.002840909090909091
8
alpha=1e-06, max_ngram_range=6, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=2, max_df=0.7-train-0.00013626453488372093
3
alpha=1e-05, max_ngram_range=6, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=2, max_df=0.7-train-0.0001816860465116279
4
alpha=0.0001, max_ngram_range=6, smooth_idf=True, norm=l2, sublinear_tf=True, min_df=2, max_df=0.7-train-0.00022710755813953488
5


In [135]:
wd = pd.DataFrame(list(wrong_dev.values())[0])
wd.columns = ['index', 'wrong_prediction', 'correct', 'probs']
wd

Unnamed: 0,index,wrong_prediction,correct,probs
0,691,x86_64,sh4,"[0.867497807263, 1.52554769582e-10, 4.71626628..."
1,896,x86_64,m68k,"[1.07783274832e-08, 1.72560516678e-05, 3.78045..."
2,1225,alphaev56,mipsel,"[0.00370932347734, 2.07536478601e-12, 6.016259..."
3,2116,xtensa,powerpc,"[5.48622880055e-08, 6.71195903755e-11, 5.14493..."
4,2238,m68k,xtensa,"[1.26833556741e-18, 4.5636306472e-10, 7.326958..."
5,2681,x86_64,arm,"[1.22116621662e-13, 7.79023795144e-13, 1.23761..."
6,2731,x86_64,xtensa,"[2.2987590055e-20, 1.43681079991e-12, 1.850596..."
7,2793,xtensa,s390,"[2.24438545745e-14, 0.000249031672812, 3.34003..."


In [144]:
f = lambda x: x > 1e-4
for col in wd['probs']:
    print(list(filter(f, col)))

[0.86749780726294434, 0.0071149905157935258, 0.12528967721035653]
[0.0037579102787192614, 0.0024714089403507439, 0.99372315119151933]
[0.0037093234773423757, 0.99626420934528825]
[0.0043282820102806567, 0.00017659294462057208, 0.12929650990622041, 0.86166572733725133, 0.004532351056729873]
[0.95530005199615975, 0.00066700395747745409, 0.044032906217756865]
[0.99999999993129052]
[0.0068345251855331914, 0.00069035637934964365, 0.99238452752047768]
[0.00024903167281177096, 0.84065216732841319, 0.00014851096156915666, 0.15894904681197841]


It's interesting that the model has such high confidence for item 5 (index 2681) and yet is wrong.

1.42