In [1]:
import os
import pprint
import re
import subprocess

First, we tune the model parameter (here, C) by cross-validation on the training sets, before testing the models

**Note:** 
* Models for both the scaled and the unscaled datasets are saved in the relevant folder
* 5-fold cross-validation is used

In [2]:
# Coarse tuning of C: run 5-fold cross-validation (`-v 5`) on every training
# set for each C on a power-of-two grid, 2**-8 .. 2**4 in steps of 2**2.

poss_c_cor = [2 ** -8, 2 ** -6, 2 ** -4, 2 ** -2, 2 ** 0, 2 ** 2, 2 ** 4]
datasets = ["../../../data/train/leu",
            "../../../data/train/leu.scale",
            "../../../data/train/rcv1_train.binary",
            "../../../data/train/rcv1_train.binary.scale",
            "../../../data/train/covtype_train.scale"]

# coarse_acc[dataset][c] holds the raw text printed by the run for that
# dataset / C pair; it is reduced to a number in a later cell.
coarse_acc = {}
for dataset in datasets:
    loc_acc_c = {}
    for c in poss_c_cor:
        cmd_cross_valid = ["../train", "-v", "5", "-c", str(c), "-q", dataset]
        # universal_newlines=True makes check_output return text instead of
        # bytes on Python 3 (output is unchanged on Python 2), so the result
        # can be searched with an ordinary str regex afterwards.
        pred_acc = subprocess.check_output(cmd_cross_valid,
                                           universal_newlines=True)
        loc_acc_c[c] = pred_acc
    coarse_acc[dataset] = loc_acc_c

In [3]:
# Replace each captured cross-validation output in coarse_acc with the first
# number found in it (presumably the reported accuracy percentage), kept as a
# string, then pretty-print the resulting {dataset: {C: accuracy}} table.
number_pattern = re.compile(r"[-+]?\d+[\.]?\d*")

# .items() works on both Python 2 and 3 (.iteritems() is Python-2-only).
# Only values of existing keys are reassigned, so iterating while updating
# is safe.
for dataset_path, acc_by_c in coarse_acc.items():
    for c_value, raw_output in acc_by_c.items():
        # On Python 3 subprocess output may arrive as bytes; a str pattern
        # cannot search bytes, so decode first. Python 2 str passes through.
        if not isinstance(raw_output, str):
            raw_output = raw_output.decode()
        acc_by_c[c_value] = number_pattern.findall(raw_output)[0]

# `pp` is reused by later cells for printing their results.
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(coarse_acc)

{   '../../../data/train/covtype_train.scale': {   0.00390625: '78.0678',
                                                   0.015625: '78.0744',
                                                   0.0625: '78.0695',
                                                   0.25: '78.0686',
                                                   1: '78.0688',
                                                   4: '78.0811',
                                                   16: '78.0751'},
    '../../../data/train/leu': {   0.00390625: '94.7368',
                                   0.015625: '94.7368',
                                   0.0625: '94.7368',
                                   0.25: '94.7368',
                                   1: '94.7368',
                                   4: '94.7368',
                                   16: '94.7368'},
    '../../../data/train/leu.scale': {   0.00390625: '86.8421',
                                         0.015625: '92.1053',
                        

In [4]:
# Fine tuning of C: for each dataset listed here, search the powers of two
# lying strictly between the two given C values (lower bound, upper bound)
# using the same 5-fold cross-validation command as the coarse search.
fine_tuning_c = {"../../../data/train/leu.scale":[0.25, 1],
                 "../../../data/train/rcv1_train.binary":[0.25,1],
                 "../../../data/train/rcv1_train.binary.scale": [0.015625, 0.0625]
                 }

# fine_acc[dataset][c] holds the raw text printed by the run for that
# dataset / C pair; it is reduced to a number in a later cell.
fine_acc = {}
# .items() works on both Python 2 and 3 (.iteritems() is Python-2-only).
for dataset, val in fine_tuning_c.items():
    lower_c, upper_c = val

    # Doubling from the lower bound yields the intermediate powers of two;
    # both bounds themselves are excluded.
    req_ran = []
    a = lower_c * 2
    while a < upper_c:
        req_ran.append(a)
        a *= 2

    loc_acc_c = {}
    for c in req_ran:
        cmd_cross_valid = ["../train", "-v", "5", "-c", str(c), "-q", dataset]
        # universal_newlines=True makes check_output return text instead of
        # bytes on Python 3 (output is unchanged on Python 2).
        pred_acc = subprocess.check_output(cmd_cross_valid,
                                           universal_newlines=True)
        loc_acc_c[c] = pred_acc
    fine_acc[dataset] = loc_acc_c

In [5]:
# Replace each captured cross-validation output in fine_acc with the first
# number found in it (presumably the reported accuracy percentage), kept as a
# string, then pretty-print the resulting {dataset: {C: accuracy}} table.
fine_number_pattern = re.compile(r"[-+]?\d+[\.]?\d*")

# .items() works on both Python 2 and 3 (.iteritems() is Python-2-only).
# Only values of existing keys are reassigned, so iterating while updating
# is safe.
for dataset_path, acc_by_c in fine_acc.items():
    for c_value, raw_output in acc_by_c.items():
        # On Python 3 subprocess output may arrive as bytes; a str pattern
        # cannot search bytes, so decode first. Python 2 str passes through.
        if not isinstance(raw_output, str):
            raw_output = raw_output.decode()
        acc_by_c[c_value] = fine_number_pattern.findall(raw_output)[0]

# `pp` is the PrettyPrinter created in an earlier cell.
pp.pprint(fine_acc)

{   '../../../data/train/leu.scale': {   0.5: '94.7368'},
    '../../../data/train/rcv1_train.binary': {   0.5: '97.0457'},
    '../../../data/train/rcv1_train.binary.scale': {   0.03125: '96.8481'}}


With the coarse and fine tuning done, we have chosen C for each dataset; now we test.

In [6]:
# Final value of C for each training set, hard-coded after the coarse and
# fine cross-validation searches. Maps training-file path -> C.
chosen_c = {
    "../../../data/train/leu": 0.25,
    "../../../data/train/leu.scale": 0.5,
    "../../../data/train/rcv1_train.binary": 1,
    "../../../data/train/rcv1_train.binary.scale": 0.0625,
    "../../../data/train/covtype_train.scale": 0.015625,
}

With these parameters, we build models

In [7]:
# Train one model per dataset with its chosen C. Each model is written to
# "<training file name>.model" in the current working directory.
#   results -- text printed by each training run, in iteration order
#   models  -- the model file names, in the same order as `results`
models = []
results = []
# .items() works on both Python 2 and 3 (.iteritems() is Python-2-only).
for dataset, val in chosen_c.items():
    # Last path component of the training file, plus a ".model" suffix.
    model_name = dataset.split("/")[-1] + ".model"
    cmd_train = ["../train", "-c", str(val), dataset, model_name]
    # universal_newlines=True makes check_output return text instead of
    # bytes on Python 3 (output is unchanged on Python 2).
    res = subprocess.check_output(cmd_train, universal_newlines=True)
    results.append(res)
    models.append(model_name)

# `pp` is the PrettyPrinter created in an earlier cell.
pp.pprint(results)
pp.pprint(models)

[   '.*\noptimization finished, #iter = 12\nObjective value = -1412.358201\nnSV = 7019\n',
    '*\noptimization finished, #iter = 9\nObjective value = -0.002431\nnSV = 32\n',
    '....**.*\noptimization finished, #iter = 51\nObjective value = -0.031492\nnSV = 30\n',
    '*.\noptimization finished, #iter = 10\nObjective value = -108.791377\nnSV = 7565\n',
    '*\noptimization finished, #iter = 6\nObjective value = -4706.274963\nnSV = 414579\n']
[   'rcv1_train.binary.model',
    'leu.model',
    'leu.scale.model',
    'rcv1_train.binary.scale.model',
    'covtype_train.scale.model']


Now we run the tests with these models

In [8]:
# Evaluate each trained model on its matching test set.
# Maps test-file path -> model file written by the training cell.
model_test_map = {  "../../../data/test/leu.t": 'leu.model',
                    "../../../data/test/leu.t.scale": 'leu.scale.model',
                    "../../../data/test/rcv1_test.binary": 'rcv1_train.binary.model',
                    "../../../data/test/rcv1_test.binary.scale": 'rcv1_train.binary.scale.model',
                    "../../../data/test/covtype_test.scale": 'covtype_train.scale.model'
                    }

# Output files go under "outs/"; create the directory up front so a run
# from a fresh checkout does not fail when the output file cannot be opened.
if not os.path.isdir("outs"):
    os.makedirs("outs")

# predictions_acc[testfile] holds the text printed by the prediction run.
predictions_acc = {}
# .items() works on both Python 2 and 3 (.iteritems() is Python-2-only).
for testfile, modelfile in model_test_map.items():
    # "outs/<test file name>.out" is passed as the output-file argument.
    out_name = "outs/" + testfile.split("/")[-1] + ".out"
    cmd_pred = ["../predict", testfile, modelfile, out_name]
    # universal_newlines=True makes check_output return text instead of
    # bytes on Python 3 (output is unchanged on Python 2).
    res = subprocess.check_output(cmd_pred, universal_newlines=True)
    predictions_acc[testfile] = res

# `pp` is the PrettyPrinter created in an earlier cell.
pp.pprint(predictions_acc)

{   '../../../data/test/covtype_test.scale': 'Accuracy = 63.3655% (73632/116202)\n',
    '../../../data/test/leu.t': 'Accuracy = 79.4118% (27/34)\n',
    '../../../data/test/leu.t.scale': 'Accuracy = 88.2353% (30/34)\n',
    '../../../data/test/rcv1_test.binary': 'Accuracy = 96.1581% (651374/677399)\n',
    '../../../data/test/rcv1_test.binary.scale': 'Accuracy = 96.0518% (650654/677399)\n'}


**These are the final model results**

The discrepancy for the scaled rcv1 dataset is probably due to columns missing from the dataset during scaling