-
Notifications
You must be signed in to change notification settings - Fork 26
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
restructure into a single package and add documentation.
- Loading branch information
Showing
17 changed files
with
532 additions
and
275 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,21 @@ | ||
# We compile all existing packages | ||
FILE(GLOB _all_files *) | ||
FOREACH(_file ${_all_files}) | ||
IF(EXISTS "${_file}/CMakeLists.txt") | ||
ADD_SUBDIRECTORY(${_file}) | ||
ENDIF(EXISTS "${_file}/CMakeLists.txt") | ||
ENDFOREACH(_file ${_all_files}) | ||
# Search this package's own cmake/ directory for modules before the existing module path. | ||
SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH}) | ||
|
||
CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR) | ||
CMAKE_POLICY(VERSION 2.6) | ||
|
||
|
||
# Torch is mandatory: configuration fails if the Torch package cannot be found. | ||
FIND_PACKAGE(Torch REQUIRED) | ||
|
||
# The two solver back-ends are built from their own subdirectories. | ||
ADD_SUBDIRECTORY(sgd) | ||
ADD_SUBDIRECTORY(liblinear) | ||
|
||
# Shared C utility library (svmutil), linked against luaT and TH and installed | ||
# into the directory named by Torch_INSTALL_LUA_CPATH_SUBDIR. | ||
SET(utilsrc init.c data.c util.c) | ||
ADD_LIBRARY(svmutil SHARED ${utilsrc}) | ||
TARGET_LINK_LIBRARIES(svmutil luaT TH) | ||
INSTALL(TARGETS svmutil LIBRARY DESTINATION ${Torch_INSTALL_LUA_CPATH_SUBDIR}) | ||
|
||
# The svm package itself has no C sources of its own (src is empty), only Lua files. | ||
SET(src) | ||
SET(luasrc init.lua data.lua) | ||
|
||
# NOTE(review): ADD_TORCH_PACKAGE / ADD_TORCH_DOK are presumably macros provided by | ||
# the Torch package found above; their exact argument semantics are not visible here. | ||
ADD_TORCH_PACKAGE(svm "${src}" "${luasrc}") | ||
ADD_TORCH_DOK(dok svm "Machine Learning" "Support Vector Machines" 3.99) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
|
||
-- Write labelled sparse samples to a text file, one sample per line, as
-- "<label> <index>:<value> <index>:<value> ...".
-- fname : name of the file to create
-- data  : sequence of samples; data[n] is {label, {indices, values}} where
--         label is a number, indices is a tensor of feature indices and
--         values is a tensor holding the matching feature values
--         (the same layout that svm.ascread builds)
function svm.ascwrite(fname,data)
   print('Writing ' .. fname)

   -- Render one sparse vector as space separated "index:value" pairs.
   local function sparse2str(indices,values)
      local pieces = {}
      local pos = 0
      values:apply(function(value)
         pos = pos + 1
         pieces[#pieces+1] = string.format('%d:%g', indices[pos], value)
         -- return the element itself, as the original did; presumably apply
         -- stores the returned value back into the tensor -- TODO confirm
         return value
      end)
      return table.concat(pieces, ' ')
   end

   local out = torch.DiskFile(fname,'w')
   for n=1,#data do
      local sample = data[n]
      local features = sample[2]
      local line = string.format('%+d %s\n', sample[1], sparse2str(features[1],features[2]))
      out:writeString(line)
   end
   out:close()
end
|
||
-- Read a libsvm/svmlight formatted text file ("<label> <index>:<value> ...",
-- one sample per line) and print a few statistics about it.
-- fname : name of the file to read
-- returns two values:
--   data   : sequence of samples; data[n] is {label, {indices, values}} with
--            label equal to +1 or -1, indices a torch.IntTensor and values a
--            torch.FloatTensor of the same length
--   maxdim : largest feature index seen in the file (0 if there is none)
-- Raises an error when a label is missing or not +1/-1, when a value is not
-- a number, or when the indices of a sample are not strictly increasing.
-- Lines containing only whitespace are skipped.
function svm.ascread(fname)
   print('Reading ' .. fname)

   -- Parse a single line into label, {indices, values}.
   local function readline(line)
      local label = tonumber(string.match(line,'^([%+%-]?%s?%d+)'))
      if not label then
         error('could not read label')
      end
      if label ~= 1 and label ~= -1 then
         error('label has to be +1 or -1')
      end
      local vals = {}
      local inds = {}
      local prev = 0
      -- Capture the whole non-blank token after the colon and let tonumber
      -- validate it. The previous hand written number pattern cut "12.5"
      -- down to "12" and could not read exponent notation such as "1e-05".
      for ind,val in string.gmatch(line,'(%d+):(%S+)') do
         ind = tonumber(ind)
         val = tonumber(val)
         if not ind or not val then
            error('reading failed')
         end
         -- Compare against the previous index (not the running count):
         -- maxdim below relies on the last index being the largest one.
         if ind <= prev then
            error('indices are not in increasing order')
         end
         prev = ind
         inds[#inds+1] = ind
         vals[#vals+1] = val
      end
      if #inds == 0 then
         -- sample without any feature: use explicitly empty tensors
         return label,{torch.IntTensor(),torch.FloatTensor()}
      end
      return label,{torch.IntTensor(inds),torch.FloatTensor(vals)}
   end

   local data = {}
   local maxdim = 0
   local npos = 0
   local nneg = 0
   local minsparse = math.huge
   local maxsparse = 0
   for line in io.lines(fname) do
      if string.find(line,'%S') then
         local lbl,vals = readline(line)
         data[#data+1] = {lbl,vals}
         -- stats
         local nnz = vals[1]:nElement()
         if nnz > 0 then
            -- indices are strictly increasing, so the last one is the largest
            maxdim = math.max(maxdim,vals[1][nnz])
         end
         if lbl == 1 then npos = npos + 1 else nneg = nneg + 1 end
         minsparse = math.min(minsparse,nnz)
         maxsparse = math.max(maxsparse,nnz)
      end
   end
   if #data == 0 then
      -- math.huge cannot be formatted with %d
      minsparse = 0
   end
   io.write(string.format("# of positive samples = %d\n",npos))
   io.write(string.format("# of negative samples = %d\n",nneg))
   io.write(string.format("# of total samples = %d\n",#data))
   io.write(string.format("# of max dimensions = %d\n",maxdim))
   io.write(string.format("Min # of dims = %d\n",minsparse))
   io.write(string.format("Max # of dims = %d\n",maxsparse))
   return data,maxdim
end
|
||
|
||
--[[
A simple dataset table.
If the file name ends with the extension .bin the file is read with
svm.binread, otherwise it is read as an ascii file with svm.ascread.
The ascii format is the svmlight format; the binary format is the one
suggested by Leon Bottou.

fname : name of an existing data file (an error is raised if it is missing)

The returned table d provides
  d:size()     number of samples
  d:nfeature() maxdim value reported by the reader
  d:data()     the raw sample list returned by the reader
  d:dense()    switch d[i] to dense output
  d[i]         sample i; numeric indices wrap around the number of samples,
               so d[size+1] is d[1]. Non-numeric keys that are not methods
               yield nil.
]]--

function svm.dataset(fname)
   if not paths.filep(fname) then
      error('File does not exist ' .. fname)
   end

   local data,maxdim
   -- anchored with $ so that only a trailing ".bin" selects the binary reader
   if fname:match('%.bin$') then
      data,maxdim = svm.binread(fname,true)
   else
      data,maxdim = svm.ascread(fname)
   end
   local nsamples = #data
   local dataset = {}
   function dataset:size() return nsamples end
   function dataset:nfeature() return maxdim end
   function dataset:data() return data end

   -- be careful, this is just for experimentation, it will be very slow.
   local dense = false
   function dataset:dense()
      dense = true
   end

   -- Single scratch tensor shared by all dense lookups: the tensor handed out
   -- by d[i] is overwritten by the next d[j].
   local dx
   local function todense(ind,x)
      dx = dx or torch.FloatTensor(maxdim)
      dx:zero()
      -- nElement() instead of size(1) so a sample without features is fine
      for i=1,ind:nElement() do
         dx[ind[i]] = x[i]
      end
      return {nil,dx}
   end

   setmetatable(dataset,{__index = function(self,i)
      -- __index is also reached for missing string keys (e.g. d.foo);
      -- answer nil instead of failing on arithmetic with a non-number.
      if type(i) ~= 'number' or nsamples == 0 then
         return nil
      end
      -- the % operator replaces math.mod, which only exists as a deprecated
      -- compatibility alias of math.fmod; results are identical for i >= 1
      local ind = (i-1) % nsamples + 1
      if dense then
         local ex = data[ind]
         return {ex[1],todense(ex[2][1],ex[2][2])}
      else
         return data[ind]
      end
   end})

   return dataset
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,60 @@ | ||
====== Support Vector Machines ======= | ||
{{anchor:svm.dok}} | ||
|
||
This package provides popular SVM implementations. | ||
|
||
===== What is implemented? ===== | ||
|
||
* [[sgd#svm.sgd|svmsgd]] : Reimplementation of [[http://leon.bottou.org/projects/sgd|Leon Bottou's svmsgd and svmasgd]]. | ||
* [[liblinear#svm.liblinear|liblinear]] : A wrapper around the well known [[http://www.csie.ntu.edu.tw/~cjlin/liblinear/|LIBLINEAR library]]. | ||
|
||
===== I/O Utilities ===== | ||
|
||
It is very common for SVMs to use sparse data as input. For that reason, the ''svm'' package provides sparse data reading and writing in ''SVMLight'' format. | ||
|
||
==== d = svm.ascread(filename) ==== | ||
{{anchor:svm.ascread}} | ||
|
||
This function returns a table of tables representing the data given in file ''filename''. | ||
|
||
- ''#d'' is equal to the number of rows in ''filename'', which is equal to the number of samples. | ||
- ''d[i]'' is a table with ''2'' entries. | ||
- ''d[i][1]'' is a number that contains the label value for ''ith'' sample. | ||
- ''d[i][2]'' is a table with ''2'' entries representing sparse input features. | ||
- ''d[i][2][1]'' is a ''torch.IntTensor'' containing ''1-based'' indices of the non-zero features. | ||
- ''d[i][2][2]'' is a ''torch.FloatTensor'' containing the values of non-zero features. | ||
|
||
==== svm.ascwrite(filename,data) ==== | ||
{{anchor:svm.ascwrite}} | ||
|
||
This function writes the data object (which should be in the format returned by [[#svm.ascread|svm.ascread]]) into file ''filename''. | ||
|
||
==== d = svm.binread(filename) ==== | ||
{{anchor:svm.binread}} | ||
|
||
This function is equivalent to [[#svm.ascread|svm.ascread]], except it operates on binary data. This format is especially useful for reading large data files. The specification of the format is the same as the one used in [[http://leon.bottou.org/projects/sgd|Leon Bottou's sgd project]]. | ||
|
||
==== d = svm.bininfo(filename) ==== | ||
{{anchor:svm.bininfo}} | ||
|
||
This function goes over a binary input file and prints out the following information. | ||
- number of positive samples in the dataset. | ||
- number of negative samples in the dataset. | ||
- total number of samples. | ||
- maximum number of feature dimensions. | ||
|
||
==== d = svm.dataset(filename) ==== | ||
{{anchor:svm.dataset}} | ||
|
||
This function returns a dataset object that can be used with [[sgd#svmsgd|stochastic gradient SVMs]]. The returned object provides useful functions to query the size and dimension of the whole data. | ||
|
||
- ''d[i]'' is a table with ''2'' entries. | ||
- ''d[i][1]'' is a number that contains the label value for ''ith'' sample. | ||
- ''d[i][2]'' is a table with ''2'' entries representing sparse input features. | ||
- ''d[i][2][1]'' is a ''torch.IntTensor'' containing ''1-based'' indices of the non-zero features. | ||
- ''d[i][2][2]'' is a ''torch.FloatTensor'' containing the values of non-zero features. | ||
* ''d:size()'' : number of samples. | ||
* ''d:nfeature()'' : maximum number of features in the dataset. | ||
* ''d:data()'' : original data structure returned from [[#svm.ascread|svm.ascread]] or [[#svm.binread|svm.binread]]. If the extension of ''filename'' is ''.bin'', then [[#svm.binread|svm.binread]] is used, otherwise [[#svm.ascread|svm.ascread]] is used. | ||
* ''d:dense()'' : sets a flag so that ''d[i]'' returns dense data. ''d[i][2][1]'' is ''nil'' and ''d[i][2][2]'' is a ''torch.FloatTensor'' of size ''d:nfeature()''. [[sgd#svmsgd|svm.SvmSgd and svm.SvmAsgd]] accept dense input too. | ||
|
Oops, something went wrong.