Commit
restructure into a single package and add documentation.
koraykv committed Oct 7, 2012
1 parent 32e99d9 commit 8994292
Showing 17 changed files with 532 additions and 275 deletions.
28 changes: 21 additions & 7 deletions CMakeLists.txt
@@ -1,7 +1,21 @@
# We compile all existing packages
FILE(GLOB _all_files *)
FOREACH(_file ${_all_files})
IF(EXISTS "${_file}/CMakeLists.txt")
ADD_SUBDIRECTORY(${_file})
ENDIF(EXISTS "${_file}/CMakeLists.txt")
ENDFOREACH(_file ${_all_files})
SET(CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${CMAKE_MODULE_PATH})

CMAKE_MINIMUM_REQUIRED(VERSION 2.6 FATAL_ERROR)
CMAKE_POLICY(VERSION 2.6)


FIND_PACKAGE(Torch REQUIRED)

ADD_SUBDIRECTORY(sgd)
ADD_SUBDIRECTORY(liblinear)

SET(utilsrc init.c data.c util.c)
ADD_LIBRARY(svmutil SHARED ${utilsrc})
TARGET_LINK_LIBRARIES(svmutil luaT TH)
INSTALL(TARGETS svmutil LIBRARY DESTINATION ${Torch_INSTALL_LUA_CPATH_SUBDIR})

SET(src)
SET(luasrc init.lua data.lua)

ADD_TORCH_PACKAGE(svm "${src}" "${luasrc}")
ADD_TORCH_DOK(dok svm "Machine Learning" "Support Vector Machines" 3.99)
19 changes: 12 additions & 7 deletions README.md
@@ -3,10 +3,16 @@ torch-svm

SVM packages for Torch7.

So far, there are two packages available. The first one is Leon Bottou's sgd and asgd algorithms.
These algorithms are reimplemented in Torch7. The second package is a wrapper around the LIBLINEAR
library. This package just provides a wrapper around LIBLINEAR code that is already included in
source form.
torch-svm/sgd
-------------

Reimplementation of Leon Bottou's svmsgd and svmasgd (http://leon.bottou.org/projects/sgd).
This implementation is 2-10 times slower than the original, depending on the sparsity of the input.

torch-svm/liblinear
-------------------

This is a wrapper around the well-known LIBLINEAR library (http://www.csie.ntu.edu.tw/~cjlin/liblinear/).

Requirements
------------
@@ -19,16 +25,15 @@ Building
```
git clone git://github.com/koraykv/torch-svm.git
cd torch-svm
torch-pkg deploy sgd
torch-pkg deploy liblinear
torch-pkg deploy
```

Using
----

```
require 'svm'
require 'liblinear'
d = svm.ascread('liblinear/liblinear/heart_scale')
model = liblinear.train(d)
82 changes: 1 addition & 81 deletions sgd/init.c → data.c
@@ -157,93 +157,13 @@ static int svm_infobinary(lua_State *L)
return 2;
}

static int svm_spdot(lua_State *L)
{
THFloatTensor *tdense = luaT_checkudata(L,1,"torch.FloatTensor");
THIntTensor *indices;
if (lua_isnil(L,2))
{
indices = NULL;
}
else
{
indices = luaT_checkudata(L,2,"torch.IntTensor");
}
THFloatTensor *tsparse = luaT_checkudata(L,3,"torch.FloatTensor");

luaL_argcheck(L,tdense->nDimension == 1, 1, "Dense tensor is expected to be 1D");
luaL_argcheck(L,!indices || indices->nDimension == 1, 2, "Index tensor is expected to be 1D");
luaL_argcheck(L,tsparse->nDimension == 1, 3, "Sparse value tensor is expected to be 1D");

if (!indices)
{
lua_pushnumber(L,(double)THFloatTensor_dot(tdense,tsparse));
return 1;
}

float *dense_data = THFloatTensor_data(tdense);
float *sparse_data = THFloatTensor_data(tsparse);
int *indices_data = THIntTensor_data(indices);

long i;
float res = 0;

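/* accumulate the sparse-dense dot product; indices are 1-based (Torch convention), hence the -1 */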
for (i=0; i< indices->size[0]; i++)
{
res += sparse_data[i]*dense_data[indices_data[i]-1];
}
lua_pushnumber(L,(double)res);
return 1;
}

static int svm_spadd(lua_State *L)
{
THFloatTensor *tdense = luaT_checkudata(L,1,"torch.FloatTensor");
float c = (float)lua_tonumber(L,2);
THIntTensor *indices;
if (lua_isnil(L,3))
{
indices = NULL;
}
else
{
indices = luaT_checkudata(L,3,"torch.IntTensor");
}
THFloatTensor *tsparse = luaT_checkudata(L,4,"torch.FloatTensor");

luaL_argcheck(L,tdense->nDimension == 1, 1, "Dense tensor is expected to be 1D");
luaL_argcheck(L,!indices||indices->nDimension == 1, 3, "Index tensor is expected to be 1D");
luaL_argcheck(L,tsparse->nDimension == 1, 4, "Sparse value tensor is expected to be 1D");

if(!indices)
{
THFloatTensor_cadd(tdense,tdense,c,tsparse);
return 0;
}

float *dense_data = THFloatTensor_data(tdense);
float *sparse_data = THFloatTensor_data(tsparse);
int *indices_data = THIntTensor_data(indices);

long i;

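/* scatter-add: dense[idx-1] += c * sparse, again with 1-based indices */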
for (i=0; i< indices->size[0]; i++)
{
dense_data[indices_data[i]-1] += c*sparse_data[i];
}
return 0;
}

static const struct luaL_Reg svm_util__ [] = {
{"spdot", svm_spdot},
{"spadd", svm_spadd},
{"binread", svm_readbinary},
{"bininfo", svm_infobinary},
{NULL, NULL}
};


DLL_EXPORT int luaopen_libsvm(lua_State *L)
int libsvm_data_init(lua_State *L)
{
luaL_register(L, "svm", svm_util__);
return 1;
137 changes: 137 additions & 0 deletions data.lua
@@ -0,0 +1,137 @@

-- write a data/label table into a libsvm (SVMLight) formatted file.
-- fname : libsvm formatted file name
-- data  : a table of samples, where each entry is {label, {indices, values}};
--         indices is a 1D torch.IntTensor, values is a 1D torch.FloatTensor
--         (one row per sample), and label is a number.
function svm.ascwrite(fname,data)
print('Writing ' .. fname)
local function vectostr(i,x)
local str = {}
local cntr = 1
x:apply(function(v)
table.insert(str,string.format('%d:%g', i[cntr], v))
cntr = cntr + 1
return v
end)
return table.concat(str, ' ')
end

local of = torch.DiskFile(fname,'w')
for i=1,#data do
local ex = data[i]
of:writeString(string.format('%+d %s\n', ex[1], vectostr(ex[2][1],ex[2][2])))
end
of:close()
end
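
-- A minimal usage sketch (hypothetical toy data; each sample is {label, {indices, values}}):
--   local d = {{ 1, {torch.IntTensor{1,3}, torch.FloatTensor{0.5,-1.2}}},
--              {-1, {torch.IntTensor{2},   torch.FloatTensor{2.0}}}}
--   svm.ascwrite('/tmp/toy.svm', d)
-- which writes lines like: +1 1:0.5 3:-1.2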

-- read a libsvm (SVMLight) formatted data file.
-- returns two outputs: the data table (same {label, {indices, values}}
-- structure as above) and the maximum feature dimension.
function svm.ascread(fname)
print('Reading ' .. fname)
local function readline(line)
local label = tonumber(string.match(line,'^([%+%-]?%s?%d+)'))
if not label then
error('could not read label')
end
if label ~= 1 and label ~=-1 then
error('label has to be +1 or -1')
end
local vals = {}
local inds = {}
local indcntr = 0
for ind,val in string.gmatch(line,'(%d+):([%+%-]?%d?%.?%d+)') do
indcntr = indcntr + 1
ind = tonumber(ind)
val = tonumber(val)
if not ind or not val then
error('reading failed')
end
if ind < indcntr then
error('indices are not in increasing order')
end
table.insert(inds,ind)
table.insert(vals,val)
end
return label,{torch.IntTensor(inds),torch.FloatTensor(vals)}
end
local data = {}
local maxdim = 0
local npos = 0
local nneg = 0
local minsparse = math.huge
local maxsparse = 0
for line in io.lines(fname) do
local lbl,vals = readline(line)
table.insert(data,{lbl,vals})
-- stats
maxdim = math.max(maxdim,vals[1][-1])
if lbl == 1 then npos = npos + 1 else nneg = nneg + 1 end
minsparse = math.min(minsparse,vals[1]:size(1))
maxsparse = math.max(maxsparse,vals[1]:size(1))
end
io.write(string.format("# of positive samples = %d\n",npos))
io.write(string.format("# of negative samples = %d\n",nneg))
io.write(string.format("# of total samples = %d\n",#data))
io.write(string.format("# of max dimensions = %d\n",maxdim))
io.write(string.format("Min # of dims = %d\n",minsparse))
io.write(string.format("Max # of dims = %d\n",maxsparse))
return data,maxdim
end
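
-- Example: a file containing the single line "+1 3:0.5 7:1.25" yields
--   d[1][1]    == 1
--   d[1][2][1] == torch.IntTensor{3,7}        -- indices
--   d[1][2][2] == torch.FloatTensor{0.5,1.25} -- values
-- and the second return value (the max dimension) is 7.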


--[[
A simple dataset table.
If the filename extension is .bin, the file is assumed to be binary;
otherwise it is assumed to be an ASCII formatted file.
ASCII files use the svmlight format; binary files use the
format suggested by Leon Bottou.
]]--

function svm.dataset(fname)
if not paths.filep(fname) then
error('File does not exist ' .. fname)
end

local data,maxdim
if fname:match('%.bin') then
data,maxdim = svm.binread(fname,true)
else
data,maxdim = svm.ascread(fname)
end
local nsamples = #data
local dataset = {}
function dataset:size() return nsamples end
function dataset:nfeature() return maxdim end
function dataset:data() return data end


-- be careful: this is just for experimentation, it will be very slow.
local dense = false
function dataset:dense()
dense = true
end

local dx
local function todense(ind,x)
dx = dx or torch.FloatTensor(maxdim)
dx:zero()
for i=1,ind:size(1) do
dx[ind[i]] = x[i]
end
return {nil,dx}
end

setmetatable(dataset,{__index = function(self,i)
local ind = math.mod(i-1,nsamples)+1
if dense then
local ex = data[ind]
return {ex[1],todense(ex[2][1],ex[2][2])}
else
return data[ind]
end
end})

return dataset
end
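
-- A minimal usage sketch (assuming 'heart_scale' is an SVMLight formatted file):
--   local ds = svm.dataset('heart_scale')
--   print(ds:size(), ds:nfeature())
--   local label, x = ds[1][1], ds[1][2]  -- x = {indices, values} while sparse
--   ds:dense()  -- from now on, ds[i][2] is {nil, dense torch.FloatTensor}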
60 changes: 60 additions & 0 deletions dok/index.dok
@@ -0,0 +1,60 @@
====== Support Vector Machines ======
{{anchor:svm.dok}}

This package provides popular SVM implementations.

===== What is implemented? =====

* [[sgd#svm.sgd|svmsgd]] : Reimplementation of [[http://leon.bottou.org/projects/sgd|Leon Bottou's svmsgd and svmasgd]].
* [[liblinear#svm.liblinear|liblinear]] : A wrapper around the well-known [[http://www.csie.ntu.edu.tw/~cjlin/liblinear/|LIBLINEAR library]].

===== I/O Utilities =====

It is very common for SVMs to use sparse data as input. For that reason, the ''svm'' package provides sparse data reading and writing in ''SVMLight'' format.
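
For reference, each line of an ''SVMLight'' formatted file is a label followed by ''index:value'' pairs, where indices are ''1-based'' and increasing:

<file>
+1 1:0.708 3:1 4:-0.320
-1 2:0.5 8:-0.141
</file>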

==== d = svm.ascread(filename) ====
{{anchor:svm.ascread}}

This function returns a table of tables representing the data given in file ''filename''.

- ''#d'' is equal to the number of rows in ''filename'', which is equal to the number of samples.
- ''d[i]'' is a table with ''2'' entries.
- ''d[i][1]'' is a number that contains the label value for the ''i''th sample.
- ''d[i][2]'' is a table with ''2'' entries representing sparse input features.
- ''d[i][2][1]'' is a ''torch.IntTensor'' containing ''1-based'' indices of the non-zero features.
- ''d[i][2][2]'' is a ''torch.FloatTensor'' containing the values of non-zero features.
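
A minimal usage sketch (using the ''heart_scale'' file shipped with LIBLINEAR):

<file lua>
d = svm.ascread('liblinear/liblinear/heart_scale')
print(#d)          -- number of samples
print(d[1][1])     -- label of the first sample
print(d[1][2][1])  -- torch.IntTensor of non-zero feature indices
</file>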

==== svm.ascwrite(filename,data) ====
{{anchor:svm.ascwrite}}

This function writes the data object (which should be in the format returned by [[#svmascread|svm.ascread]]) into the file ''filename''.

==== d = svm.binread(filename) ====
{{anchor:svm.binread}}

This function is equivalent to [[#svmascread|svm.ascread]], except it operates on binary data. This format is especially useful for reading large data files. The specification of the format is the same as the one used in [[http://leon.bottou.org/projects/sgd|Leon Bottou's sgd project]].

==== d = svm.bininfo(filename) ====
{{anchor:svm.bininfo}}

This function goes over a binary input file and prints out the following information:
- number of positive samples in the dataset.
- number of negative samples in the dataset.
- total number of samples.
- maximum number of feature dimensions.

==== d = svm.dataset(filename) ====
{{anchor:svm.dataset}}

This function returns a dataset object that can be used with [[sgd#svmsgd|stochastic gradient SVMs]]. The returned object provides useful functions to query the size and dimension of the whole dataset.

- ''d[i]'' is a table with ''2'' entries.
- ''d[i][1]'' is a number that contains the label value for the ''i''th sample.
- ''d[i][2]'' is a table with ''2'' entries representing sparse input features.
- ''d[i][2][1]'' is a ''torch.IntTensor'' containing ''1-based'' indices of the non-zero features.
- ''d[i][2][2]'' is a ''torch.FloatTensor'' containing the values of non-zero features.
* ''d:size()'' : number of samples.
* ''d:nfeature()'' : maximum number of features in the dataset.
* ''d:data()'' : original data structure returned from [[#svmascread|svm.ascread]] or [[#svmbinread|svm.binread]]. If the extension of ''filename'' is ''.bin'', then [[#svmbinread|svm.binread]] is used, otherwise [[#svmascread|svm.ascread]] is used.
* ''d:dense()'' : sets a flag so that ''d[i]'' returns dense data. ''d[i][2][1]'' is ''nil'' and ''d[i][2][2]'' is a ''torch.FloatTensor'' of size ''data:nfeature()''. [[sgd#svmsgd|svm.SvmSgd and svm.SvmAsgd]] accept dense input type too.
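
A minimal sketch of iterating over a dataset (''mydata.bin'' is a hypothetical file name):

<file lua>
d = svm.dataset('mydata.bin')   -- .bin extension, so svm.binread is used
for i = 1, d:size() do
   local label  = d[i][1]
   local sparse = d[i][2]       -- {torch.IntTensor indices, torch.FloatTensor values}
end
</file>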
