-
Notifications
You must be signed in to change notification settings - Fork 2
/
runShogunSVMDNAWeightedCommonWordKernel.py
executable file
·151 lines (122 loc) · 4.68 KB
/
runShogunSVMDNAWeightedCommonWordKernel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# Original code from: ../examples/documented/python_modular/regression_libsvr_modular.py
# and: ../examples/documented/python_modular/serialization_string_kernels_modular.py
import numpy as np
from modshogun import StringCharFeatures, StringWordFeatures, SortWordString
from modshogun import DNA, Labels
from modshogun import MSG_DEBUG
from modshogun import SVMLight
from modshogun import WeightedCommWordStringKernel
from modshogun import BinaryLabels, LibSVM
import sys
# Command-line configuration. Nine positional arguments are read at module
# load time; a missing argument raises IndexError and a non-numeric value for
# the last three raises ValueError before any function below runs.

# Inputs read in the __main__ block: sequence files go through
# makeStringList and label files through makeIntList.
TRAININGDATAFILENAME = sys.argv[1]
TRAININGLABELSFILENAME = sys.argv[2]
VALIDATIONDATAFILENAME = sys.argv[3]
VALIDATIONLABELSFILENAME = sys.argv[4]
# Output files written by outputResultsClassification (training-set and
# validation-set predictions respectively).
TRAINPREDICTIONSEPSILONFILENAME = sys.argv[5]
VALIDATIONPREDICTIONSEPSILONFILENAME = sys.argv[6]
# Passed to StringWordFeatures.obtain_from_char as (K-1, K) -- presumably the
# word (k-mer) length; confirm against the Shogun documentation.
K = int(sys.argv[7])
# First argument of the LibSVM constructor -- presumably the SVM cost
# parameter C; confirm against the Shogun documentation.
SVMC = float(sys.argv[8]) # Initially 1
# Passed to StringWordFeatures.obtain_from_char after K.
GAP = int(sys.argv[9]) # Initially 0
def makeStringList(stringFileName):
    """Read a text file into a list holding one stripped string per line.

    Args:
        stringFileName: Path of the text file to read.

    Returns:
        A list with one entry per line of the file, in file order, with
        leading and trailing whitespace removed. Blank lines are kept as
        empty strings, so list positions match line positions.

    Raises:
        IOError/OSError: If the file cannot be opened.
    """
    # The context manager closes the handle even if reading fails part-way,
    # which the manual open()/close() pairing did not guarantee.
    with open(stringFileName) as stringFile:
        return [line.strip() for line in stringFile]
def makeIntList(intFileName):
    """Read integer class labels from a file, one label per line.

    A label of 0 is rewritten to -1, so a file of 1/0 labels is returned as
    1/-1. Every other integer is returned unchanged.

    Args:
        intFileName: Path of the label file to read.

    Returns:
        A numpy array with one integer per line of the file, in file order.

    Raises:
        ValueError: If a line, after stripping whitespace, is not an integer
            literal (this includes blank lines).
        IOError/OSError: If the file cannot be opened.
    """
    intList = []
    # The context manager closes the handle even when int() raises on a
    # malformed line, which the manual open()/close() pairing did not.
    with open(intFileName) as intFile:
        for line in intFile:
            label = int(line.strip())
            # Labels are stored as 1 and 0 instead of 1 and -1
            intList.append(-1 if label == 0 else label)
    return np.array(intList)
def runShogunSVMDNAWeightedCommonWordKernel(train_xt, train_lt, test_xt):
    """
    Train a LibSVM classifier with a WeightedCommWordStringKernel on DNA
    strings and return its outputs for the training and test strings.

    train_xt -- training sequences, handed to StringCharFeatures with the
                DNA alphabet
    train_lt -- training labels, wrapped in BinaryLabels
    test_xt  -- test sequences, converted the same way as train_xt

    Reads the module-level settings K, GAP and SVMC.

    Returns a tuple (training outputs, test outputs), each the result of
    get_labels() on the corresponding svm.apply(...) call.
    """
    ##################################################
    # Training features: characters -> words, then sorted word strings
    trainChars = StringCharFeatures(train_xt, DNA)
    trainWords = StringWordFeatures(DNA)
    trainWords.obtain_from_char(trainChars, K-1, K, GAP, False)
    wordSorter = SortWordString()
    wordSorter.init(trainWords)
    trainWords.add_preprocessor(wordSorter)
    trainWords.apply_preprocessor()

    # Test features reuse the preprocessor initialised on the training data
    testChars = StringCharFeatures(test_xt, DNA)
    testWords = StringWordFeatures(DNA)
    testWords.obtain_from_char(testChars, K-1, K, GAP, False)
    testWords.add_preprocessor(wordSorter)
    testWords.apply_preprocessor()

    # Kernel is first set up on train-vs-train for fitting
    wordKernel = WeightedCommWordStringKernel(trainWords, trainWords, False)
    wordKernel.io.set_loglevel(MSG_DEBUG)

    trainLabels = BinaryLabels(train_lt)

    # Fit the classifier
    print("Ready to train!")
    classifier = LibSVM(SVMC, wordKernel, trainLabels)
    classifier.io.set_loglevel(MSG_DEBUG)
    classifier.train()

    # Score the training set first, then re-initialise the kernel on
    # train-vs-test before scoring the test set
    print("Making predictions!")
    trainOutputs = classifier.apply(trainWords).get_labels()
    wordKernel.init(trainWords, testWords)
    testOutputs = classifier.apply(testWords).get_labels()
    return trainOutputs, testOutputs
def writeFloatList(floatList, floatListFileName):
    """Write the values of an iterable to a file, one per line.

    Each value is converted with str() and followed by a newline. Any
    existing content of the file is replaced; an empty iterable produces an
    empty file.

    Args:
        floatList: Iterable of values to record (floats in this script).
        floatListFileName: Path of the file to create or overwrite.

    Raises:
        IOError/OSError: If the file cannot be opened for writing.
    """
    # The context manager flushes and closes the handle even if a write or a
    # str() conversion fails, which the manual open()/close() did not.
    with open(floatListFileName, 'w+') as floatListFile:
        floatListFile.writelines(str(f) + "\n" for f in floatList)
def outputResultsClassification(out1, out2, train_lt, test_lt):
    """Save the predictions to disk and print accuracy summaries.

    out1, out2        -- predictions for the training and validation sets
    train_lt, test_lt -- the matching true labels

    The predictions are written with writeFloatList to the files named by
    TRAINPREDICTIONSEPSILONFILENAME and VALIDATIONPREDICTIONSEPSILONFILENAME.
    Then the training accuracy, the validation accuracy and the number of
    correctly predicted positive (label 1) and other correctly predicted
    validation examples are printed.
    """
    # Persist both prediction vectors before any statistics are computed
    writeFloatList(out1, TRAINPREDICTIONSEPSILONFILENAME)
    writeFloatList(out2, VALIDATIONPREDICTIONSEPSILONFILENAME)

    # Training set: fraction of predictions equal to the true label
    trainHits = sum(1 for idx, truth in enumerate(train_lt) if out1[idx] == truth)
    trainAccuracy = float(trainHits)/float(len(train_lt))
    print("Training accuracy:")
    print(trainAccuracy)

    # Validation set: overall hits, split into positive hits and the rest
    validHits = 0
    positiveHits = 0
    negativeHits = 0
    for idx, truth in enumerate(test_lt):
        predicted = out2[idx]
        if predicted != truth:
            # Wrong predictions do not contribute to any counter
            continue
        validHits += 1
        if (predicted == 1) and (truth == 1):
            positiveHits += 1
        else:
            negativeHits += 1
    validAccuracy = float(validHits)/float(len(test_lt))
    print("Validation accuracy:")
    print(validAccuracy)
    print("Number of correct positive examples:")
    print(positiveHits)
    print("Number of correct negative examples:")
    print(negativeHits)
if __name__ == '__main__':
    # Script entry point: load the data, train and predict, then report.
    # NOTE(review): the banner says 'LibSVR' although the classifier built in
    # runShogunSVMDNAWeightedCommonWordKernel is LibSVM; text kept unchanged.
    print('LibSVR')

    # Sequences and labels for the training and validation sets
    trainSequences = makeStringList(TRAININGDATAFILENAME)
    trainLabels = makeIntList(TRAININGLABELSFILENAME)
    validSequences = makeStringList(VALIDATIONDATAFILENAME)
    validLabels = makeIntList(VALIDATIONLABELSFILENAME)

    trainPredictions, validPredictions = runShogunSVMDNAWeightedCommonWordKernel(
        trainSequences, trainLabels, validSequences)
    outputResultsClassification(
        trainPredictions, validPredictions, trainLabels, validLabels)