In [1]:
import subprocess
import pprint
import re
import time

First, we will learn the parameters (here, C) for testing the models

**Note:** 
* Scaled as well as unscaled models are saved in the relevant folder 
* 5 fold cross validation used

First we train the linear kernels

In [2]:
#coarse tuning param

poss_c_cor = [2 ** -8, 2 ** -6, 2 ** -4, 2 ** -2, 2 ** 0, 2 ** 2, 2 ** 4]
datasets = ["../../../data/train/leu",
            "../../../data/train/leu.scale",
            "../../../data/train/rcv1_train.red",
            "../../../data/train/rcv1_train.red.scale",
            "../../../data/train/covtype_train.red.scale"]

coarse_acc = {}
times = {}
for dataset in datasets:
    loc_acc_c = {}
    start = time.time()
    for c in poss_c_cor:
            cmd_cross_valid = ["../svm-train","-t", "0", "-h", "0", "-v", str(5),  "-c", str(c), "-q", dataset]
            pred_acc = subprocess.check_output(cmd_cross_valid)
            loc_acc_c[c] = pred_acc
    coarse_acc[dataset] = (loc_acc_c)
    times[dataset] = time.time() - start

In [3]:
for key, value in coarse_acc.iteritems():
    temp = value
    for k, acc in temp.iteritems():
        temp[k] = re.findall("[-+]?\d+[\.]?\d*", acc)[0]
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(coarse_acc)
pp.pprint(times)

{   '../../../data/train/covtype_train.red.scale': {   0.00390625: '58.5',
                                                       0.015625: '58.5',
                                                       0.0625: '65.1',
                                                       0.25: '66.3',
                                                       1: '67.5',
                                                       4: '70.7',
                                                       16: '70.6'},
    '../../../data/train/leu': {   0.00390625: '94.7368',
                                   0.015625: '94.7368',
                                   0.0625: '94.7368',
                                   0.25: '94.7368',
                                   1: '94.7368',
                                   4: '94.7368',
                                   16: '94.7368'},
    '../../../data/train/leu.scale': {   0.00390625: '94.7368',
                                         0.015625: '94.7368',
                 

In [4]:
fine_tuning_c = {"../../../data/train/rcv1_train.red":[0.25,1],
                 "../../../data/train/rcv1_train.red.scale": [0.25,1],
                 "../../../data/train/covtype_train.red.scale": [1, 4]
                 }
fine_acc = {}
for dataset , val in fine_tuning_c.iteritems():
    a = val[0]*2
    req_ran = []
    while a < val[1]:
        req_ran.append(a)
        a*=2
    loc_acc_c = {}
    for c in req_ran:
            cmd_cross_valid = ["../svm-train","-t", "0", "-h", "0", "-v", str(5),  "-c", str(c), "-q", dataset]
            pred_acc = subprocess.check_output(cmd_cross_valid)
            loc_acc_c[c] = pred_acc
    fine_acc[dataset] = (loc_acc_c)         

In [5]:
for key, value in fine_acc.iteritems():
    temp = value
    for k, acc in temp.iteritems():
        temp[k] = re.findall("[-+]?\d+[\.]?\d*", acc)[0]

pp.pprint(fine_acc)

{   '../../../data/train/covtype_train.red.scale': {   2: '70.4'},
    '../../../data/train/rcv1_train.red': {   0.5: '86'},
    '../../../data/train/rcv1_train.red.scale': {   0.5: '90.75'}}


With the coarse and fine tuning, we have learnt c, now we test

In [6]:
chosen_c = {"../../../data/train/leu": 0.25,
            "../../../data/train/leu.scale": 0.25,
            "../../../data/train/rcv1_train.red": 1,
            "../../../data/train/rcv1_train.red.scale": 0.625 , #average of values at max
            "../../../data/train/covtype_train.red.scale": 1
            }

With these parameters, we build models

In [7]:
models = []
results = []
for dataset , val in chosen_c.iteritems():
    model_name = dataset.split("/")
    model_name = "linear/" + model_name[len(model_name)-1] + ".lin.model"
    cmd_cross_valid = ["../svm-train","-t", "0", "-h", "0", "-c", str(val), "-q", dataset, model_name]
    res = subprocess.check_output(cmd_cross_valid)
    results.append(res)
    models.append(model_name)
    
pp.pprint(results)
pp.pprint(models)

['', '', '', '', '']
[   'linear/covtype_train.red.scale.lin.model',
    'linear/rcv1_train.red.lin.model',
    'linear/leu.lin.model',
    'linear/rcv1_train.red.scale.lin.model',
    'linear/leu.scale.lin.model']


Now we run test with these models

In [8]:
model_test_map = {  "../../../data/test/leu.t": 'linear/leu.lin.model',
                    "../../../data/test/leu.t.scale": 'linear/leu.scale.lin.model',
                    "../../../data/test/rcv1_test.red": 'linear/rcv1_train.red.lin.model',
                    "../../../data/test/rcv1_test.red.scale": 'linear/rcv1_train.red.scale.lin.model',
                    "../../../data/test/covtype_test.red.scaled": 'linear/covtype_train.red.scaled.lin.model'
                    }

predictions_acc = {}
for testfile, modelfile in model_test_map.iteritems():
    out_name = testfile.split("/")
    out_name = "outs/linear/" + out_name[len(out_name)-1] + ".out"
    cmd_pred= ["../svm-predict", testfile, modelfile, out_name]
    res = subprocess.check_output(cmd_pred)
    predictions_acc[testfile] = res
    
pp.pprint(predictions_acc)

{   '../../../data/test/covtype_test.red.scaled': 'Accuracy = 19.7991% (690/3485) (classification)\n',
    '../../../data/test/leu.t': 'Accuracy = 82.3529% (28/34) (classification)\n',
    '../../../data/test/leu.t.scale': 'Accuracy = 88.2353% (30/34) (classification)\n',
    '../../../data/test/rcv1_test.red': 'Accuracy = 77.5646% (930/1199) (classification)\n',
    '../../../data/test/rcv1_test.red.scale': 'Accuracy = 77.648% (931/1199) (classification)\n'}


**These are the final model results for linear svm**

---
Now we train polynomial kernel, degree 2

In [11]:
#coarse tuning param

poss_c_cor = [2 ** -8, 2 ** -4, 2 ** 0, 2 ** 2]
poss_gamma_cor = [2 ** -8, 2 ** -4, 2 ** 0, 2 ** 2]
poss_coeff = [2 ** -8, 2 ** -4, 2 ** 0, 2 ** 2]
datasets = ["../../../data/train/leu",
            "../../../data/train/leu.scale",
            "../../../data/train/rcv1_train.red",
            "../../../data/train/rcv1_train.red.scale",
            "../../../data/train/covtype_train.red.scale"]

coarse_acc = {}
times = {}
for dataset in datasets:
    loc_acc_c_g = {}
    start = time.time()
    for c in poss_c_cor:
        for g in poss_gamma_cor:
            for coeff in poss_coeff:
                cmd_cross_valid = ["../svm-train","-t", "1", "-h", "0", "-v", str(5),"-d", str(2),  "-c", str(c), "-g", str(g), "-r", str(coeff), "-q", dataset]
                pred_acc = subprocess.check_output(cmd_cross_valid)
                loc_acc_c_g[(c,g,coeff)] = re.findall("[-+]?\d+[\.]?\d*", pred_acc)[0]
    print "currently done ", dataset
    coarse_acc[dataset] = (loc_acc_c_g)
    times[dataset] = time.time() - start

currently done  ../../../data/train/leu
currently done  ../../../data/train/leu.scale
currently done  ../../../data/train/rcv1_train.red
currently done  ../../../data/train/rcv1_train.red.scale
currently done  ../../../data/train/covtype_train.red.scale


In [24]:
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(times)
for key,value in coarse_acc.iteritems():
    print key
    for j in sorted(value.items(), key=lambda x:x[1], reverse=True):
        print j

{   '../../../data/train/covtype_train.red.scale': 290.08423614501953,
    '../../../data/train/leu': 16.886574029922485,
    '../../../data/train/leu.scale': 17.03380799293518,
    '../../../data/train/rcv1_train.red': 41.516451835632324,
    '../../../data/train/rcv1_train.red.scale': 45.22681212425232}
../../../data/train/covtype_train.red.scale
((1, 4, 0.0625), '79.3')
((1, 4, 0.00390625), '79.3')
((1, 4, 1), '79.1')
((1, 4, 4), '78.9')
((4, 4, 0.0625), '77.9')
((4, 4, 0.00390625), '77.9')
((4, 4, 1), '77.8')
((4, 4, 4), '77.7')
((0.0625, 4, 4), '77.5')
((4, 1, 1), '77.5')
((4, 1, 4), '77.3')
((0.0625, 4, 1), '76.9')
((0.0625, 4, 0.0625), '76.8')
((4, 1, 0.00390625), '76.8')
((4, 1, 0.0625), '76.8')
((0.0625, 4, 0.00390625), '76.8')
((1, 1, 4), '75.9')
((1, 1, 1), '75.6')
((1, 1, 0.00390625), '74.9')
((1, 1, 0.0625), '74.8')
((0.00390625, 4, 4), '73.3')
((0.00390625, 4, 1), '72.9')
((0.00390625, 4, 0.00390625), '72.9')
((0.00390625, 4, 0.0625), '72.8')
((0.0625, 1, 1), '72.7')
((0.

In [13]:
chosen_c_g = {"../../../data/train/leu": (1, 0.00390625, 1),
            "../../../data/train/leu.scale": (1, 0.00390625, 1),
            "../../../data/train/rcv1_train.red": (4, 0.0625, 4),
            "../../../data/train/rcv1_train.red.scale": (1, 0.0625, 4), 
            "../../../data/train/covtype_train.red.scale": (4, 4, 4)
            }


In [15]:
models = []
results = []
for dataset , val in chosen_c_g.iteritems():
    model_name = dataset.split("/")
    model_name = "pol2/" + model_name[len(model_name)-1] + ".pol2.model"
    cmd_cross_valid = ["../svm-train","-t", "1", "-h", "0", "-c", str(val[0]), "-g",str(val[1]), "-d" , str(2), "-r",str(val[2]), "-q", dataset, model_name]
    res = subprocess.check_output(cmd_cross_valid)
    results.append(res)
    models.append(model_name)
    
pp.pprint(results)
pp.pprint(models)

['', '', '', '', '']
[   'pol2/covtype_train.red.scale.pol2.model',
    'pol2/rcv1_train.red.pol2.model',
    'pol2/leu.pol2.model',
    'pol2/rcv1_train.red.scale.pol2.model',
    'pol2/leu.scale.pol2.model']


In [16]:
model_test_map = {  "../../../data/test/leu.t": 'pol2/leu.pol2.model',
                    "../../../data/test/leu.t.scale": 'pol2/leu.scale.pol2.model',
                    "../../../data/test/rcv1_test.red": 'pol2/rcv1_train.red.pol2.model',
                    "../../../data/test/rcv1_test.red.scale": 'pol2/rcv1_train.red.pol2.model',
                    "../../../data/test/covtype_test.red.scaled": 'pol2/covtype_train.red.scaled.pol2.model'
                    }

predictions_acc = {}
for testfile, modelfile in model_test_map.iteritems():
    out_name = testfile.split("/")
    out_name = "outs/pol2/" + out_name[len(out_name)-1] + ".out"
    cmd_pred= ["../svm-predict", testfile, modelfile, out_name]
    res = subprocess.check_output(cmd_pred)
    predictions_acc[testfile] = res
    
pp.pprint(predictions_acc)

{   '../../../data/test/covtype_test.red.scaled': 'Accuracy = 23.6442% (824/3485) (classification)\n',
    '../../../data/test/leu.t': 'Accuracy = 70.5882% (24/34) (classification)\n',
    '../../../data/test/leu.t.scale': 'Accuracy = 88.2353% (30/34) (classification)\n',
    '../../../data/test/rcv1_test.red': 'Accuracy = 79.8999% (958/1199) (classification)\n',
    '../../../data/test/rcv1_test.red.scale': 'Accuracy = 85.4045% (1024/1199) (classification)\n'}


***Results of polynomial kernel, degree 2***

---
Now we train polynomial kernel, degree 3

In [17]:
#coarse tuning param

poss_c_cor = [2 ** -8, 2 ** -4, 2 ** 0, 2 ** 2]
poss_gamma_cor = [2 ** -8, 2 ** -4, 2 ** 0, 2 ** 2]
poss_coeff = [2 ** -8, 2 ** -4, 2 ** 0, 2 ** 2]
datasets = ["../../../data/train/leu",
            "../../../data/train/leu.scale",
            "../../../data/train/rcv1_train.red",
            "../../../data/train/rcv1_train.red.scale",
            "../../../data/train/covtype_train.red.scale"]

coarse_acc = {}
times = {}
for dataset in datasets:
    loc_acc_c_g = {}
    start = time.time()
    for c in poss_c_cor:
        for g in poss_gamma_cor:
            for coeff in poss_coeff:
                cmd_cross_valid = ["../svm-train","-t", "1", "-h", "0", "-v", str(5),"-d", str(3),  "-c", str(c), "-g", str(g), "-r", str(coeff), "-q", dataset]
                pred_acc = subprocess.check_output(cmd_cross_valid)
                loc_acc_c_g[(c,g,coeff)] = re.findall("[-+]?\d+[\.]?\d*", pred_acc)[0]
    print "currently done ", dataset
    coarse_acc[dataset] = (loc_acc_c_g)
    times[dataset] = time.time() - start

currently done  ../../../data/train/leu
currently done  ../../../data/train/leu.scale
currently done  ../../../data/train/rcv1_train.red
currently done  ../../../data/train/rcv1_train.red.scale
currently done  ../../../data/train/covtype_train.red.scale


In [26]:
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(times)
for key,value in coarse_acc.iteritems():
    print key
    print '\t', sorted(value.items(), key=lambda x:x[1], reverse=True)[0]
    #for j in sorted(value.items(), key=lambda x:x[1], reverse=True):
    #    print '\t',j

{   '../../../data/train/covtype_train.red.scale': 290.08423614501953,
    '../../../data/train/leu': 16.886574029922485,
    '../../../data/train/leu.scale': 17.03380799293518,
    '../../../data/train/rcv1_train.red': 41.516451835632324,
    '../../../data/train/rcv1_train.red.scale': 45.22681212425232}
../../../data/train/covtype_train.red.scale
	((1, 4, 0.0625), '79.3')
../../../data/train/rcv1_train.red
	((1, 0.0625, 4), '91.75')
../../../data/train/leu
	((4, 0.00390625, 4), '76.3158')
../../../data/train/rcv1_train.red.scale
	((4, 0.00390625, 4), '90.75')
../../../data/train/leu.scale
	((4, 4, 0.0625), '94.7368')


In [27]:
chosen_c_g = {"../../../data/train/leu": (4, 0.00390625, 4),
            "../../../data/train/leu.scale": (4, 4, 0.0625),
            "../../../data/train/rcv1_train.red": (1, 0.0625, 4),
            "../../../data/train/rcv1_train.red.scale": (4, 0.00390625, 4), 
            "../../../data/train/covtype_train.red.scale": (1, 4, 0.0625)
            }


In [28]:
models = []
results = []
for dataset , val in chosen_c_g.iteritems():
    model_name = dataset.split("/")
    model_name = "pol2/" + model_name[len(model_name)-1] + ".pol2.model"
    cmd_cross_valid = ["../svm-train","-t", "1", "-h", "0", "-c", str(val[0]), "-g",str(val[1]), "-d" , str(3), "-r",str(val[2]), "-q", dataset, model_name]
    res = subprocess.check_output(cmd_cross_valid)
    results.append(res)
    models.append(model_name)
    
pp.pprint(results)
pp.pprint(models)

['', '', '', '', '']
[   'pol2/covtype_train.red.scale.pol2.model',
    'pol2/rcv1_train.red.pol2.model',
    'pol2/leu.pol2.model',
    'pol2/rcv1_train.red.scale.pol2.model',
    'pol2/leu.scale.pol2.model']


In [29]:
model_test_map = {  "../../../data/test/leu.t": 'pol3/leu.pol3.model',
                    "../../../data/test/leu.t.scale": 'pol3/leu.scale.pol3.model',
                    "../../../data/test/rcv1_test.red": 'pol3/rcv1_train.red.pol3.model',
                    "../../../data/test/rcv1_test.red.scale": 'pol3/rcv1_train.red.pol3.model',
                    "../../../data/test/covtype_test.red.scale": 'pol3/covtype_train.red.scaled.pol3.model'
                    }

predictions_acc = {}
for testfile, modelfile in model_test_map.iteritems():
    out_name = testfile.split("/")
    out_name = "outs/pol3/" + out_name[len(out_name)-1] + ".out"
    cmd_pred= ["../svm-predict", testfile, modelfile, out_name]
    res = subprocess.check_output(cmd_pred)
    predictions_acc[testfile] = res
    
pp.pprint(predictions_acc)

{   '../../../data/test/covtype_test.red.scale': 'Accuracy = 29.8433% (895/2999) (classification)\n',
    '../../../data/test/leu.t': 'Accuracy = 61.7647% (21/34) (classification)\n',
    '../../../data/test/leu.t.scale': 'Accuracy = 91.1765% (31/34) (classification)\n',
    '../../../data/test/rcv1_test.red': 'Accuracy = 56.7139% (680/1199) (classification)\n',
    '../../../data/test/rcv1_test.red.scale': 'Accuracy = 81.6514% (979/1199) (classification)\n'}


***These are the final results of degree 3 model***

---
We now do it for rbf kernel

In [33]:
#coarse tuning param

poss_c_cor = [2 ** -8, 2 ** -6, 2 ** -4, 2 ** -2, 2 ** 0, 2 ** 2, 2 ** 4]
poss_gamma_cor = [2 ** -8, 2 ** -6, 2 ** -4, 2 ** -2, 2 ** 0, 2 ** 2]
datasets = ["../../../data/train/leu",
            "../../../data/train/leu.scale",
            "../../../data/train/rcv1_train.red",
            "../../../data/train/rcv1_train.red.scale",
            "../../../data/train/covtype_train.red.scale"]

coarse_acc = {}
times = {}
for dataset in datasets:
    loc_acc_c_g = {}
    start = time.time()
    for c in poss_c_cor:
        for g in poss_gamma_cor:
            cmd_cross_valid = ["../svm-train","-t", "2", "-h", "0", "-v", str(5), "-c", str(c), "-g", str(g), "-q", dataset]
            pred_acc = subprocess.check_output(cmd_cross_valid)
            loc_acc_c_g[(c,g)] = re.findall("[-+]?\d+[\.]?\d*", pred_acc)[0]
    print "currently done ", dataset
    coarse_acc[dataset] = (loc_acc_c_g)
    times[dataset] = time.time() - start

currently done  ../../../data/train/leu
currently done  ../../../data/train/leu.scale
currently done  ../../../data/train/rcv1_train.red
currently done  ../../../data/train/rcv1_train.red.scale
currently done  ../../../data/train/covtype_train.red.scale


In [34]:
pp = pprint.PrettyPrinter(indent=4)
pp.pprint(times)
for key,value in coarse_acc.iteritems():
    print key
    print '\t', sorted(value.items(), key=lambda x:x[1], reverse=True)[0]
    #for j in sorted(value.items(), key=lambda x:x[1], reverse=True):
    #    print '\t',j

{   '../../../data/train/covtype_train.red.scale': 11.40504002571106,
    '../../../data/train/leu': 7.796416997909546,
    '../../../data/train/leu.scale': 9.774142026901245,
    '../../../data/train/rcv1_train.red': 22.406476974487305,
    '../../../data/train/rcv1_train.red.scale': 22.945279836654663}
../../../data/train/covtype_train.red.scale
	((16, 4), '79.4')
../../../data/train/rcv1_train.red
	((16, 0.0625), '91.5')
../../../data/train/leu
	((4, 0.25), '71.0526')
../../../data/train/rcv1_train.red.scale
	((16, 0.015625), '92.25')
../../../data/train/leu.scale
	((4, 0.00390625), '84.2105')


In [35]:
chosen_c_g = {"../../../data/train/leu": (4, 0.00390625),
            "../../../data/train/leu.scale": (0.25, 0.25),
            "../../../data/train/rcv1_train.red": (16,0.0625),
            "../../../data/train/rcv1_train.red.scale": (16, 0.015625), 
            "../../../data/train/covtype_train.red.scale": (16,4)
            }

In [36]:
models = []
results = []
for dataset , val in chosen_c_g.iteritems():
    model_name = dataset.split("/")
    model_name = "rbf/" + model_name[len(model_name)-1] + ".rbf.model"
    cmd_cross_valid = ["../svm-train","-t", "2", "-h", "0", "-c", str(val[0]), "-g",str(val[1]), "-q", dataset, model_name]
    res = subprocess.check_output(cmd_cross_valid)
    results.append(res)
    models.append(model_name)
    
pp.pprint(results)
pp.pprint(models)

['', '', '', '', '']
[   'rbf/covtype_train.red.scale.rbf.model',
    'rbf/rcv1_train.red.rbf.model',
    'rbf/leu.rbf.model',
    'rbf/rcv1_train.red.scale.rbf.model',
    'rbf/leu.scale.rbf.model']


In [37]:
model_test_map = {  "../../../data/test/leu.t": 'rbf/leu.rbf.model',
                    "../../../data/test/leu.t.scale": 'rbf/leu.scale.rbf.model',
                    "../../../data/test/rcv1_test.red": 'rbf/rcv1_train.red.rbf.model',
                    "../../../data/test/rcv1_test.red.scale": 'rbf/rcv1_train.red.rbf.model',
                    "../../../data/test/covtype_test.red.scale": 'rbf/covtype_train.red.scaled.rbf.model'
                    }

predictions_acc = {}
for testfile, modelfile in model_test_map.iteritems():
    out_name = testfile.split("/")
    out_name = "outs/rbf/" + out_name[len(out_name)-1] + ".out"
    cmd_pred= ["../svm-predict", testfile, modelfile, out_name]
    res = subprocess.check_output(cmd_pred)
    predictions_acc[testfile] = res
    
pp.pprint(predictions_acc)

{   '../../../data/test/covtype_test.red.scale': 'Accuracy = 31.5438% (946/2999) (classification)\n',
    '../../../data/test/leu.t': 'Accuracy = 58.8235% (20/34) (classification)\n',
    '../../../data/test/leu.t.scale': 'Accuracy = 58.8235% (20/34) (classification)\n',
    '../../../data/test/rcv1_test.red': 'Accuracy = 79.2327% (950/1199) (classification)\n',
    '../../../data/test/rcv1_test.red.scale': 'Accuracy = 82.2352% (986/1199) (classification)\n'}


*** These are the final results of the rbf kernel ***