# mnb parameter tuning (Multinomial Naieve Bayes)

In [0]:
from __future__ import division
import etl


In [0]:
!free -h

In [0]:
!pwd

In [0]:
!ls ../ml_challenge/

In [0]:
#data_set = etl.load_dir('../ml_challenge')
data_set = etl.load('../ml_challenge/4096_512_512_6f9ef7d8.json')


In [0]:
orig_X_train, orig_Y_train, orig_train_targets = data_set['train']['binary_data'], data_set['train']['answers'], data_set['train']['targets']
orig_X_dev, orig_Y_dev, orig_dev_targets = data_set['dev']['binary_data'], data_set['dev']['answers'], data_set['dev']['targets']
orig_X_test, orig_Y_test, orig_test_targets = data_set['test']['binary_data'], data_set['test']['answers'], data_set['test']['targets']

print('orig_X_train[0:4]', '\n', orig_X_train[0:4])
print('orig_Y_train[0:4]', '\n', orig_Y_train[0:4])
print('orig_train_targets[0:4]', '\n', orig_train_targets[0:4])

In [0]:
hex_X_train = etl.hex_data(orig_X_train)
hex_X_dev = etl.hex_data(orig_X_dev)
hex_X_test = etl.hex_data(orig_X_test)


In [0]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
import numpy as np
import pandas as pd

In [0]:
def guess_from_target(probs, allowed_Y, supported_architectures):
    """
    Improve our chances by taking the max over the possible targets (6 instead of 12)
    @probs: numerical array of shape (m, n_classes)
    @allowed_Y: ones-hot array of shape (m, n_classes)
    @supported_architectures: Use CountVectorizer.classes_ not etl.SUPPORTED_ARCHITECTURES for ordering
    
    @returns: (m, 1) of the most likely ISA arch names after discards or
             (m, n_classes) one-hot representation of best guess
    """
    return list(map(supported_architectures.__getitem__, np.argmax(probs*allowed_Y, axis=1)))

In [0]:
def describe_results(predictions, probs, orig_Y, param_str, error_type='train'):
    """
    @returns tuple(index, prediction, actual_value)
    """
    wrong = []
    for i in range(len(predictions)):
        if predictions[i] != orig_Y[i]:
            wrong.append([i, predictions[i], orig_Y[i], probs[i]])
    if error_type == 'train':
        print(param_str)
    print('{} error: {}'.format(error_type, len(wrong)/len(predictions)))
    return wrong, len(wrong)/len(predictions)

In [0]:
params = {}
params['alpha'] = [0.0001, 0.001, 0.1]
params['max_ngram_range'] = [4, 6, 8]
params['smooth_idf'] = [True, False]
params['norm'] = ['l2', None]
params['sublinear_tf'] = [True, False]
params['min_df'] = [1, 2]
params['max_df'] = [.5, .9]


In [0]:
def search_hyperparams(params):
    df = pd.DataFrame()
    wrong_train = {}
    wrong_dev = {}
    for alpha in params['alpha']:
        for max_ngram_range in params['max_ngram_range']:
            for smooth_idf in params['smooth_idf']:
                for norm in params['norm']:
                    for sublinear_tf in params['sublinear_tf']:
                        for min_df in params['min_df']:
                            for max_df in params['max_df']:
                                vec_opts = {
                                    "ngram_range": (1, max_ngram_range),  # allow n-grams of 1-4 words in length (32-bits)
                                    "analyzer": "word",     # analyze hex words
                                    "token_pattern": "..",  # treat two characters as a word (e.g. 4b)
                                    "min_df": min_df,          # for demo purposes, be very selective about features
                                    "max_df": max_df
                                }
                                v = CountVectorizer(**vec_opts)
                                X_cv = v.fit_transform(hex_X_train)

                                idf_opts = {"use_idf": True}
                                idf = TfidfTransformer(**idf_opts)

                                # perform the idf transform
                                X_idf = idf.fit_transform(X_cv)

                                mnbClassifier = MultinomialNB(alpha=alpha)

                                mnb_model = mnbClassifier.fit(X_idf, np.array(orig_Y_train))

                                Y_train, allowed_Y_train = etl.class_to_ones_hot(orig_Y_train, orig_train_targets, mnb_model.classes_.tolist())
                                Y_dev, allowed_Y_dev = etl.class_to_ones_hot(orig_Y_dev, orig_dev_targets, mnb_model.classes_.tolist())
                                Y_test, allowed_Y_test = etl.class_to_ones_hot(orig_Y_test, orig_test_targets, mnb_model.classes_.tolist())

                                probs_train = mnb_model.predict_proba(X_idf)

                                predictions = guess_from_target(probs_train, allowed_Y_train, mnb_model.classes_.tolist())

                                param_str = "alpha={}, max_ngram_range={}, smooth_idf={}, norm={}, sublinear_tf={}, min_df={}, max_df={}".format(
                                alpha, max_ngram_range, smooth_idf, norm, sublinear_tf, min_df, max_df)
                                mismatches, train_error = describe_results(predictions, probs_train, orig_Y_train, param_str)
                                
                                wrong_train[param_str + '-train-' + str(train_error)] = mismatches
                                
                                vec_opts.update({'vocabulary': v.vocabulary_})
                                v_dev = CountVectorizer(**vec_opts)
                                X_cv_dev = v_dev.transform(hex_X_dev)
                                X_idf_dev = idf.transform(X_cv_dev)
                                probs_dev = mnb_model.predict_proba(X_idf_dev)

                                predictions_dev = guess_from_target(probs_dev, allowed_Y_dev, mnb_model.classes_.tolist())
                                mismatches, dev_error = describe_results(predictions_dev, probs_dev, orig_Y_dev, param_str, error_type='dev')
                                wrong_dev[param_str + '-dev-' + str(dev_error)] = mismatches
                                
                                params_dict = {'alpha': alpha, 'max_ngram_range': max_ngram_range, 
                                                'smooth_idf': smooth_idf, 'norm': norm, 
                                          'sublinear_tf': sublinear_tf, 'min_df': min_df, 
                                          'max_df': max_df, 'train_error': train_error, 
                                          'dev_error': dev_error}
                                df = df.append(params_dict, ignore_index=True)

    return wrong_train, wrong_dev, df

In [0]:
def param_space_size(params):
    prod = 1
    for k,v in params.items():
        prod *= len(v)
    return prod
param_space_size(params)

In [0]:
wrong_train, wrong_dev, df_mnb = search_hyperparams(params)

In [0]:
df_mnb.describe()

In [0]:
df_mnb.groupby(by='train_error').describe()

In [0]:
df_mnb.groupby(by='dev_error').describe()

In [0]:
train_min = df_mnb['train_error'].min()
dev_min = df_mnb['dev_error'].min()
print("train_min", train_min, train_min*500)
print("dev_min", dev_min, dev_min*500)


In [0]:
df_mnb.loc[df_mnb['dev_error'] == dev_min]

This shows that the hyperparams mnb is sensitive to for min dev_error are
alpha (<= 0.0001), ngram_range (=6), min_df (=2.0)
max_df, norm, smoothin_idf, sublinear_tf are not selecting


### Let's run the grid search again honing in on alpha

In [0]:
params = {}
params['alpha'] = [1e-6, 1e-5, 1e-4]
params['max_ngram_range'] = [6,]
params['smooth_idf'] = [True]
params['norm'] = ['l2']
params['sublinear_tf'] = [True]
params['min_df'] = [2]
params['max_df'] = [.7]

param_space_size(params)

In [0]:
data_set = etl.load_dir('../ml_challenge/')
orig_X_train, orig_Y_train, orig_train_targets = data_set['train']['binary_data'], data_set['train']['answers'], data_set['train']['targets']
orig_X_dev, orig_Y_dev, orig_dev_targets = data_set['dev']['binary_data'], data_set['dev']['answers'], data_set['dev']['targets']
orig_X_test, orig_Y_test, orig_test_targets = data_set['test']['binary_data'], data_set['test']['answers'], data_set['test']['targets']

print('orig_X_train[0:4]', '\n', orig_X_train[0:4])
print('orig_Y_train[0:4]', '\n', orig_Y_train[0:4])
print('orig_train_targets[0:4]', '\n', orig_train_targets[0:4])

hex_X_train = etl.hex_data(orig_X_train)
hex_X_dev = etl.hex_data(orig_X_dev)
hex_X_test = etl.hex_data(orig_X_test)


In [0]:
wrong_train, wrong_dev, df_mnb = search_hyperparams(params)

In [0]:
for k,v in wrong_dev.items():
    print(k)
    print(len(wrong_dev[k]))

for k,v in wrong_train.items():
    print(k)
    print(len(wrong_train[k]))

In [0]:
wd = pd.DataFrame(list(wrong_dev.values())[0])
wd.columns = ['index', 'wrong_prediction', 'correct', 'probs']
wd

In [0]:
f = lambda x: x > 1e-4
for col in wd['probs']:
    print(list(filter(f, col)))

It's interesting that the model has such high confidence for item 5 (index 2681) and yet is wrong.