Commit
restructure into a single package and add documentation.
koraykv committed Oct 7, 2012
1 parent 32e99d9 commit 8994292
Showing 17 changed files with 532 additions and 275 deletions.
28 changes: 21 additions & 7 deletions CMakeLists.txt
@@ -1,7 +1,21 @@
# We compile all existing packages
FILE(GLOB _all_files *)
FOREACH(_file ${_all_files})
IF(EXISTS "${_file}/CMakeLists.txt")
ADD_SUBDIRECTORY(${_file})
ENDIF(EXISTS "${_file}/CMakeLists.txt")
ENDFOREACH(_file ${_all_files})
SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})

CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
CMAKE_POLICY(VERSION 2.6)


FIND_PACKAGE(Torch REQUIRED)

ADD_SUBDIRECTORY(sgd)
ADD_SUBDIRECTORY(liblinear)

SET(utilsrc init.c data.c util.c)
ADD_LIBRARY(svmutil SHARED ${utilsrc})
TARGET_LINK_LIBRARIES(svmutil luaT TH)
INSTALL(TARGETS svmutil LIBRARY DESTINATION ${Torch_INSTALL_LUA_CPATH_SUBDIR})

SET(src)
SET(luasrc init.lua data.lua)

ADD_TORCH_PACKAGE(svm "${src}" "${luasrc}")
ADD_TORCH_DOK(dok svm "Machine Learning" "Support Vector Machines" 3.99)
19 changes: 12 additions & 7 deletions README.md
@@ -3,10 +3,16 @@ torch-svm

SVM packages for Torch7.

So far, there are two packages available. The first one is Leon Bottou's sgd and asgd algorithms.
These algorithms are reimplemented in Torch7. The second package is a wrapper around the LIBLINEAR
library. This package just provides a wrapper around LIBLINEAR code that is already included in
source form.
torch-svm/sgd
-------------

Reimplementation of Leon Bottou's svmsgd and svmasgd (http://leon.bottou.org/projects/sgd).
This implementation is 2-10 times slower than the original, depending on the sparsity of the input.

torch-svm/liblinear
-------------------

This is a wrapper around the well-known LIBLINEAR library (http://www.csie.ntu.edu.tw/~cjlin/liblinear/).

Requirements
------------
@@ -19,16 +25,15 @@ Building
```
git clone git://github.com/koraykv/torch-svm.git
cd torch-svm
torch-pkg deploy sgd
torch-pkg deploy liblinear
torch-pkg deploy
```

Using
----

```
require 'svm'
require 'liblinear'
d = svm.ascread('liblinear/liblinear/heart_scale')
model = liblinear.train(d)
82 changes: 1 addition & 81 deletions sgd/init.c → data.c
@@ -157,93 +157,13 @@ static int svm_infobinary(lua_State *L)
return 2;
}

static int svm_spdot(lua_State *L)
{
THFloatTensor *tdense = luaT_checkudata(L,1,"torch.FloatTensor");
THIntTensor *indices;
if (lua_isnil(L,2))
{
indices = NULL;
}
else
{
indices = luaT_checkudata(L,2,"torch.IntTensor");
}
THFloatTensor *tsparse = luaT_checkudata(L,3,"torch.FloatTensor");

luaL_argcheck(L,tdense->nDimension == 1, 1, "Dense tensor is expected to be 1D");
luaL_argcheck(L,!indices || indices->nDimension == 1, 2, "Index tensor is expected to be 1D");
luaL_argcheck(L,tsparse->nDimension == 1, 3, "Sparse value tensor is expected to be 1D");

if (!indices)
{
lua_pushnumber(L,(double)THFloatTensor_dot(tdense,tsparse));
return 1;
}

float *dense_data = THFloatTensor_data(tdense);
float *sparse_data = THFloatTensor_data(tsparse);
int *indices_data = THIntTensor_data(indices);

long i;
float res = 0;

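/* accumulate the sparse-dense dot product; indices are 1-based (Torch convention), hence the -1 */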
for (i=0; i< indices->size[0]; i++)
{
res += sparse_data[i]*dense_data[indices_data[i]-1];
}
lua_pushnumber(L,(double)res);
return 1;
}

static int svm_spadd(lua_State *L)
{
THFloatTensor *tdense = luaT_checkudata(L,1,"torch.FloatTensor");
float c = (float)lua_tonumber(L,2);
THIntTensor *indices;
if (lua_isnil(L,3))
{
indices = NULL;
}
else
{
indices = luaT_checkudata(L,3,"torch.IntTensor");
}
THFloatTensor *tsparse = luaT_checkudata(L,4,"torch.FloatTensor");

luaL_argcheck(L,tdense->nDimension == 1, 1, "Dense tensor is expected to be 1D");
luaL_argcheck(L,!indices||indices->nDimension == 1, 3, "Index tensor is expected to be 1D");
luaL_argcheck(L,tsparse->nDimension == 1, 4, "Sparse value tensor is expected to be 1D");

if(!indices)
{
THFloatTensor_cadd(tdense,tdense,c,tsparse);
return 0;
}

float *dense_data = THFloatTensor_data(tdense);
float *sparse_data = THFloatTensor_data(tsparse);
int *indices_data = THIntTensor_data(indices);

long i;

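/* scatter-add: dense[idx-1] += c * sparse, again with 1-based indices */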
for (i=0; i< indices->size[0]; i++)
{
dense_data[indices_data[i]-1] += c*sparse_data[i];
}
return 0;
}

static const struct luaL_Reg svm_util__ [] = {
{"spdot", svm_spdot},
{"spadd", svm_spadd},
{"binread", svm_readbinary},
{"bininfo", svm_infobinary},
{NULL, NULL}
};


DLL_EXPORT int luaopen_libsvm(lua_State *L)
int libsvm_data_init(lua_State *L)
{
luaL_register(L, "svm", svm_util__);
return 1;
137 changes: 137 additions & 0 deletions data.lua
@@ -0,0 +1,137 @@

-- write a data/label table into a libsvm (SVMLight) formatted file.
-- fname : libsvm formatted file name
-- data  : a table of samples, where each entry is {label, {indices, values}};
--         indices is a 1D torch.IntTensor, values is a 1D torch.FloatTensor
--         (one row per sample), and label is a number.
function svm.ascwrite(fname,data)
print('Writing ' .. fname)
local function vectostr(i,x)
local str = {}
local cntr = 1
x:apply(function(v)
table.insert(str,string.format('%d:%g', i[cntr], v))
cntr = cntr + 1
return v
end)
return table.concat(str, ' ')
end

local of = torch.DiskFile(fname,'w')
for i=1,#data do
local ex = data[i]
of:writeString(string.format('%+d %s\n', ex[1], vectostr(ex[2][1],ex[2][2])))
end
of:close()
end
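
-- A minimal usage sketch (hypothetical toy data; each sample is {label, {indices, values}}):
--   local d = {{ 1, {torch.IntTensor{1,3}, torch.FloatTensor{0.5,-1.2}}},
--              {-1, {torch.IntTensor{2},   torch.FloatTensor{2.0}}}}
--   svm.ascwrite('/tmp/toy.svm', d)
-- which writes lines like: +1 1:0.5 3:-1.2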

-- read a libsvm (SVMLight) formatted data file.
-- returns two outputs: the data table (same {label, {indices, values}}
-- structure as above) and the maximum feature dimension.
function svm.ascread(fname)
print('Reading ' .. fname)
local function readline(line)
local label = tonumber(string.match(line,'^([%+%-]?%s?%d+)'))
if not label then
error('could not read label')
end
if label ~= 1 and label ~=-1 then
error('label has to be +1 or -1')
end
local vals = {}
local inds = {}
local indcntr = 0
for ind,val in string.gmatch(line,'(%d+):([%+%-]?%d?%.?%d+)') do
indcntr = indcntr + 1
ind = tonumber(ind)
val = tonumber(val)
if not ind or not val then
error('reading failed')
end
if ind < indcntr then
error('indices are not in increasing order')
end
table.insert(inds,ind)
table.insert(vals,val)
end
return label,{torch.IntTensor(inds),torch.FloatTensor(vals)}
end
local data = {}
local maxdim = 0
local npos = 0
local nneg = 0
local minsparse = math.huge
local maxsparse = 0
for line in io.lines(fname) do
local lbl,vals = readline(line)
table.insert(data,{lbl,vals})
-- stats
maxdim = math.max(maxdim,vals[1][-1])
if lbl == 1 then npos = npos + 1 else nneg = nneg + 1 end
minsparse = math.min(minsparse,vals[1]:size(1))
maxsparse = math.max(maxsparse,vals[1]:size(1))
end
io.write(string.format("# of positive samples = %d\n",npos))
io.write(string.format("# of negative samples = %d\n",nneg))
io.write(string.format("# of total samples = %d\n",#data))
io.write(string.format("# of max dimensions = %d\n",maxdim))
io.write(string.format("Min # of dims = %d\n",minsparse))
io.write(string.format("Max # of dims = %d\n",maxsparse))
return data,maxdim
end
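
-- Example: a file containing the single line "+1 3:0.5 7:1.25" yields
--   d[1][1]    == 1
--   d[1][2][1] == torch.IntTensor{3,7}        -- indices
--   d[1][2][2] == torch.FloatTensor{0.5,1.25} -- values
-- and the second return value (the max dimension) is 7.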


--[[
A simple dataset table.
If the filename extension is .bin, the file is assumed to be binary;
otherwise it is assumed to be an ASCII formatted file.
ASCII files use the svmlight format; binary files use the
format suggested by Leon Bottou.
]]--

function svm.dataset(fname)
if not paths.filep(fname) then
error('File does not exist ' .. fname)
end

local data,maxdim
if fname:match('%.bin') then
data,maxdim = svm.binread(fname,true)
else
data,maxdim = svm.ascread(fname)
end
local nsamples = #data
local dataset = {}
function dataset:size() return nsamples end
function dataset:nfeature() return maxdim end
function dataset:data() return data end


-- be careful: this is just for experimentation, it will be very slow.
local dense = false
function dataset:dense()
dense = true
end

local dx
local function todense(ind,x)
dx = dx or torch.FloatTensor(maxdim)
dx:zero()
for i=1,ind:size(1) do
dx[ind[i]] = x[i]
end
return {nil,dx}
end

setmetatable(dataset,{__index = function(self,i)
local ind = math.mod(i-1,nsamples)+1
if dense then
local ex = data[ind]
return {ex[1],todense(ex[2][1],ex[2][2])}
else
return data[ind]
end
end})

return dataset
end
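
-- A minimal usage sketch (assuming 'heart_scale' is an SVMLight formatted file):
--   local ds = svm.dataset('heart_scale')
--   print(ds:size(), ds:nfeature())
--   local label, x = ds[1][1], ds[1][2]  -- x = {indices, values} while sparse
--   ds:dense()  -- from now on, ds[i][2] is {nil, dense torch.FloatTensor}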
60 changes: 60 additions & 0 deletions dok/index.dok
@@ -0,0 +1,60 @@
====== Support Vector Machines ======
{{anchor:svm.dok}}

This package provides popular SVM implementations.

===== What is implemented? =====

* [[sgd#svm.sgd|svmsgd]] : Reimplementation of [[http://leon.bottou.org/projects/sgd|Leon Bottou's svmsgd and svmasgd]].
* [[liblinear#svm.liblinear|liblinear]] : A wrapper around the well-known [[http://www.csie.ntu.edu.tw/~cjlin/liblinear/|LIBLINEAR library]].

===== I/O Utilities =====

It is very common for SVMs to use sparse data as input. For that reason, the ''svm'' package provides sparse data reading and writing in ''SVMLight'' format.
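
For reference, each line of an ''SVMLight'' formatted file is a label followed by ''index:value'' pairs, where indices are ''1-based'' and increasing:

<file>
+1 1:0.708 3:1 4:-0.320
-1 2:0.5 8:-0.141
</file>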

==== d = svm.ascread(filename) ====
{{anchor:svm.ascread}}

This function returns a table of tables representing the data given in file ''filename''.

- ''#d'' is equal to the number of rows in ''filename'', which is equal to the number of samples.
- ''d[i]'' is a table with ''2'' entries.
- ''d[i][1]'' is a number that contains the label value for the ''i''th sample.
- ''d[i][2]'' is a table with ''2'' entries representing sparse input features.
- ''d[i][2][1]'' is a ''torch.IntTensor'' containing ''1-based'' indices of the non-zero features.
- ''d[i][2][2]'' is a ''torch.FloatTensor'' containing the values of non-zero features.
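
A minimal usage sketch (using the ''heart_scale'' file shipped with LIBLINEAR):

<file lua>
d = svm.ascread('liblinear/liblinear/heart_scale')
print(#d)          -- number of samples
print(d[1][1])     -- label of the first sample
print(d[1][2][1])  -- torch.IntTensor of non-zero feature indices
</file>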

==== svm.ascwrite(filename,data) ====
{{anchor:svm.ascwrite}}

This function writes the data object (which should be in the format returned by [[#svmascread|svm.ascread]]) into the file ''filename''.

==== d = svm.binread(filename) ====
{{anchor:svm.binread}}

This function is equivalent to [[#svmascread|svm.ascread]], except it operates on binary data. This format is especially useful for reading large data files. The specification of the format is the same as the one used in [[http://leon.bottou.org/projects/sgd|Leon Bottou's sgd project]].

==== d = svm.bininfo(filename) ====
{{anchor:svm.bininfo}}

This function goes over a binary input file and prints out the following information:
- number of positive samples in the dataset.
- number of negative samples in the dataset.
- total number of samples.
- maximum number of feature dimensions.

==== d = svm.dataset(filename) ====
{{anchor:svm.dataset}}

This function returns a dataset object that can be used with [[sgd#svmsgd|stochastic gradient SVMs]]. The returned object provides useful functions to query the size and dimension of the whole dataset.

- ''d[i]'' is a table with ''2'' entries.
- ''d[i][1]'' is a number that contains the label value for the ''i''th sample.
- ''d[i][2]'' is a table with ''2'' entries representing sparse input features.
- ''d[i][2][1]'' is a ''torch.IntTensor'' containing ''1-based'' indices of the non-zero features.
- ''d[i][2][2]'' is a ''torch.FloatTensor'' containing the values of non-zero features.
* ''d:size()'' : number of samples.
* ''d:nfeature()'' : maximum number of features in the dataset.
* ''d:data()'' : original data structure returned from [[#svmascread|svm.ascread]] or [[#svmbinread|svm.binread]]. If the extension of ''filename'' is ''.bin'', then [[#svmbinread|svm.binread]] is used, otherwise [[#svmascread|svm.ascread]] is used.
* ''d:dense()'' : sets a flag so that ''d[i]'' returns dense data. ''d[i][2][1]'' is ''nil'' and ''d[i][2][2]'' is a ''torch.FloatTensor'' of size ''data:nfeature()''. [[sgd#svmsgd|svm.SvmSgd and svm.SvmAsgd]] accept dense input type too.
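
A minimal sketch of iterating over a dataset (''mydata.bin'' is a hypothetical file name):

<file lua>
d = svm.dataset('mydata.bin')   -- .bin extension, so svm.binread is used
for i = 1, d:size() do
   local label  = d[i][1]
   local sparse = d[i][2]       -- {torch.IntTensor indices, torch.FloatTensor values}
end
</file>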
