I forgot to implement ReLU in the fully connected layer

Another 10-day delay for the 0.6 release :(
liuliu · Mar 17, 2014 · c985ff1 · c985ff1
1 parent cb10a22
commit c985ff1
Show file tree

Hide file tree

Showing 6 changed files with 90 additions and 48 deletions.
diff --git a/bin/image-net.c b/bin/image-net.c
@@ -144,7 +144,7 @@ int main(int argc, char** argv)
 			},
 		},
 		{
-			.type = CCV_CONVNET_MAX_POOL,
+			.type = CCV_CONVNET_LOCAL_RESPONSE_NORM,
 			.input = {
 				.matrix = {
 					.rows = 55,
@@ -154,29 +154,29 @@ int main(int argc, char** argv)
 				},
 			},
 			.output = {
-				.pool = {
-					.strides = 2,
-					.size = 3,
-					.border = 0,
+				.rnorm = {
+					.size = 5,
+					.kappa = 2,
+					.alpha = 1e-4,
+					.beta = 0.75,
 				},
 			},
 		},
 		{
-			.type = CCV_CONVNET_LOCAL_RESPONSE_NORM,
+			.type = CCV_CONVNET_MAX_POOL,
 			.input = {
 				.matrix = {
-					.rows = 27,
-					.cols = 27,
+					.rows = 55,
+					.cols = 55,
 					.channels = 96,
 					.partition = 2,
 				},
 			},
 			.output = {
-				.rnorm = {
-					.size = 5,
-					.kappa = 2,
-					.alpha = 1e-4,
-					.beta = 0.75,
+				.pool = {
+					.strides = 2,
+					.size = 3,
+					.border = 0,
 				},
 			},
 		},
@@ -206,7 +206,7 @@ int main(int argc, char** argv)
 			},
 		},
 		{
-			.type = CCV_CONVNET_MAX_POOL,
+			.type = CCV_CONVNET_LOCAL_RESPONSE_NORM,
 			.input = {
 				.matrix = {
 					.rows = 27,
@@ -216,29 +216,29 @@ int main(int argc, char** argv)
 				},
 			},
 			.output = {
-				.pool = {
-					.strides = 2,
-					.size = 3,
-					.border = 0,
+				.rnorm = {
+					.size = 5,
+					.kappa = 2,
+					.alpha = 1e-4,
+					.beta = 0.75,
 				},
 			},
 		},
 		{
-			.type = CCV_CONVNET_LOCAL_RESPONSE_NORM,
+			.type = CCV_CONVNET_MAX_POOL,
 			.input = {
 				.matrix = {
-					.rows = 13,
-					.cols = 13,
+					.rows = 27,
+					.cols = 27,
 					.channels = 256,
 					.partition = 2,
 				},
 			},
 			.output = {
-				.rnorm = {
-					.size = 5,
-					.kappa = 2,
-					.alpha = 1e-4,
-					.beta = 0.75,
+				.pool = {
+					.strides = 2,
+					.size = 3,
+					.border = 0,
 				},
 			},
 		},
@@ -353,6 +353,7 @@ int main(int argc, char** argv)
 			},
 			.output = {
 				.full_connect = {
+					.relu = 1,
 					.count = 4096,
 				},
 			},
@@ -375,6 +376,7 @@ int main(int argc, char** argv)
 			},
 			.output = {
 				.full_connect = {
+					.relu = 1,
 					.count = 4096,
 				},
 			},
@@ -397,6 +399,7 @@ int main(int argc, char** argv)
 			},
 			.output = {
 				.full_connect = {
+					.relu = 0,
 					.count = 1000,
 				},
 			},
@@ -409,10 +412,10 @@ int main(int argc, char** argv)
 	for (i = 0; i < 13; i++)
 	{
 		layer_params[i].w.decay = 0.0005;
-		layer_params[i].w.learn_rate = 0.0001;
+		layer_params[i].w.learn_rate = 0.01;
 		layer_params[i].w.momentum = 0.9;
 		layer_params[i].bias.decay = 0;
-		layer_params[i].bias.learn_rate = 0.0001;
+		layer_params[i].bias.learn_rate = 0.01;
 		layer_params[i].bias.momentum = 0.9;
 	}
 	layer_params[10].dor = 0.5;

diff --git a/lib/ccv.h b/lib/ccv.h
@@ -1119,6 +1119,7 @@ typedef union {
 		float beta;
 	} rnorm;
 	struct {
+		int relu; // apply relu or not
 		int count;
 	} full_connect;
 } ccv_convnet_type_t;

diff --git a/lib/ccv_convnet.c b/lib/ccv_convnet.c
@@ -226,6 +226,9 @@ static void _ccv_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* lay
 		bptr[i] = layer->bias[i];
 	ccv_dense_matrix_t dw = ccv_dense_matrix(db->rows, a->rows, CCV_32F | CCV_C1, layer->w, 0);
 	ccv_gemm(&dw, a, 1, db, 1, 0, (ccv_matrix_t**)&db, 0); // supply db as matrix C is allowed
+	if (layer->net.full_connect.relu)
+		for (i = 0; i < db->rows; i++)
+			bptr[i] = ccv_max(0, bptr[i]); // relu
 	a->rows = rows, a->cols = cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | ch;
 	a->step = a->cols * CCV_GET_DATA_TYPE_SIZE(a->type) * CCV_GET_CHANNEL(a->type);
 }
@@ -421,6 +424,14 @@ static void _ccv_convnet_full_connect_forward_propagate_parallel(ccv_convnet_lay
 	}
 	ccv_dense_matrix_t dw = ccv_dense_matrix(db->cols, a->cols, CCV_32F | CCV_C1, layer->w, 0);
 	ccv_gemm(a, &dw, 1, db, 1, CCV_B_TRANSPOSE, (ccv_matrix_t**)&db, 0); // supply db as matrix C is allowed
+	bptr = db->data.f32;
+	if (layer->net.full_connect.relu)
+		for (i = 0; i < db->rows; i++)
+		{
+			for (j = 0; j < db->cols; j++)
+				bptr[j] = ccv_max(0, bptr[j]); // relu
+			bptr += db->cols;
+		}
 }
 
 static void _ccv_convnet_compute_softmax_parallel(ccv_dense_matrix_t* a, ccv_dense_matrix_t** b, int type)
@@ -752,17 +763,21 @@ static void _ccv_convnet_convolutional_backward_propagate(ccv_convnet_layer_t* l
 	a->rows = a_rows, a->cols = a_cols, a->type = (a->type - CCV_GET_CHANNEL(a->type)) | a_ch;
 }
 
-static void _ccv_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* x, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params)
+static void _ccv_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* layer, ccv_dense_matrix_t* a, ccv_dense_matrix_t* y, ccv_dense_matrix_t* x, ccv_dense_matrix_t** b, ccv_convnet_layer_t* update_params)
 {
-	// a is the input gradient (for back prop), d is the dropout,
+	// a is the input gradient (for back prop), y is the output (for forward prop)
 	// x is the input (for forward prop), b is the output gradient (gradient, or known as propagated error)
-	// note that y (the output from forward prop) is not included because the full connect net is simple enough that we don't need it
 	ccv_dense_matrix_t* db = 0;
 	if (b)
 		db = *b = ccv_dense_matrix_renew(*b, x->rows, x->cols, CCV_32F | CCV_GET_CHANNEL(x->type), CCV_32F | CCV_GET_CHANNEL(x->type), 0);
 	int x_rows = x->rows, x_cols = x->cols, x_ch = CCV_GET_CHANNEL(x->type);
 	x->rows = x_rows * x_cols * x_ch, x->cols = 1, x->type = (x->type - x_ch) | CCV_C1;
 	x->step = x->cols * CCV_GET_DATA_TYPE_SIZE(x->type);
+	int i;
+	if (layer->net.full_connect.relu)
+		for (i = 0; i < y->rows; i++)
+			if (y->data.f32[i] <= 0)
+				a->data.f32[i] = 0;
 	ccv_dense_matrix_t w = ccv_dense_matrix(a->rows, x->rows, CCV_32F | CCV_C1, update_params->w, 0);
 	ccv_dense_matrix_t* dw = &w;
 	// compute bias gradient
@@ -915,7 +930,7 @@ static void _ccv_convnet_propagate_loss(ccv_convnet_t* convnet, ccv_dense_matrix
 	int i;
 	ccv_convnet_layer_t* layer = convnet->layers + convnet->count - 1;
 	assert(layer->type == CCV_CONVNET_FULL_CONNECT); // the last layer has too be a full connect one to generate softmax result
-	_ccv_convnet_full_connect_backward_propagate(layer, dloss, convnet->acts[convnet->count - 2], convnet->count - 1 > 0 ? update_params->acts + convnet->count - 2 : 0, update_params->layers + convnet->count - 1);
+	_ccv_convnet_full_connect_backward_propagate(layer, dloss, convnet->acts[convnet->count - 1], convnet->acts[convnet->count - 2], convnet->count - 1 > 0 ? update_params->acts + convnet->count - 2 : 0, update_params->layers + convnet->count - 1);
 	for (i = convnet->count - 2; i >= 0; i--)
 	{
 		layer = convnet->layers + i;
@@ -925,7 +940,7 @@ static void _ccv_convnet_propagate_loss(ccv_convnet_t* convnet, ccv_dense_matrix
 				_ccv_convnet_convolutional_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i);
 				break;
 			case CCV_CONVNET_FULL_CONNECT:
-				_ccv_convnet_full_connect_backward_propagate(layer, update_params->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i);
+				_ccv_convnet_full_connect_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, i > 0 ? update_params->acts + i - 1 : 0, update_params->layers + i);
 				break;
 			case CCV_CONVNET_LOCAL_RESPONSE_NORM:
 				_ccv_convnet_rnorm_backward_propagate(layer, update_params->acts[i], convnet->acts[i], i > 0 ? convnet->acts[i - 1] : a, convnet->denoms[i], i > 0 ? update_params->acts + i - 1 : 0);
@@ -1174,7 +1189,7 @@ void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet
 			"(layer INTEGER PRIMARY KEY ASC, type INTEGER, "
 			"input_matrix_rows INTEGER, input_matrix_cols INTEGER, input_matrix_channels INTEGER, input_matrix_partition INTEGER, input_node_count INTEGER, "
 			"output_rows INTEGER, output_cols INTEGER, output_channels INTEGER, output_partition INTEGER, output_count INTEGER, output_strides INTEGER, output_border INTEGER, "
-			"output_size INTEGER, output_kappa REAL, output_alpha REAL, output_beta REAL);"
+			"output_size INTEGER, output_kappa REAL, output_alpha REAL, output_beta REAL, output_relu INTEGER);"
 			"CREATE TABLE IF NOT EXISTS convnet_params "
 			"(convnet INTEGER PRIMARY KEY ASC, input_height INTEGER, input_width INTEGER, mean_activity BLOB);"
 			"CREATE TABLE IF NOT EXISTS layer_data "
@@ -1185,11 +1200,11 @@ void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet
 			"(layer, type, "
 			"input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, "
 			"output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, "
-			"output_size, output_kappa, output_alpha, output_beta) VALUES "
+			"output_size, output_kappa, output_alpha, output_beta, output_relu) VALUES "
 			"($layer, $type, " // 1
 			"$input_matrix_rows, $input_matrix_cols, $input_matrix_channels, $input_matrix_partition, $input_node_count, " // 6
 			"$output_rows, $output_cols, $output_channels, $output_partition, $output_count, $output_strides, $output_border, " // 13
-			"$output_size, $output_kappa, $output_alpha, $output_beta);"; // 17
+			"$output_size, $output_kappa, $output_alpha, $output_beta, $output_relu);"; // 18
 		sqlite3_stmt* layer_params_insert_stmt = 0;
 		assert(SQLITE_OK == sqlite3_prepare_v2(db, layer_params_insert_qs, sizeof(layer_params_insert_qs), &layer_params_insert_stmt, 0));
 		const char layer_data_insert_qs[] =
@@ -1222,6 +1237,7 @@ void ccv_convnet_write(ccv_convnet_t* convnet, const char* filename, ccv_convnet
 					break;
 				case CCV_CONVNET_FULL_CONNECT:
 					sqlite3_bind_int(layer_params_insert_stmt, 12, layer->net.full_connect.count);
+					sqlite3_bind_int(layer_params_insert_stmt, 19, layer->net.full_connect.relu);
 					break;
 				case CCV_CONVNET_MAX_POOL:
 				case CCV_CONVNET_AVERAGE_POOL:
@@ -1297,7 +1313,7 @@ ccv_convnet_t* ccv_convnet_read(int use_cwc_accel, const char* filename)
 			"SELECT type, " // 1
 			"input_matrix_rows, input_matrix_cols, input_matrix_channels, input_matrix_partition, input_node_count, " // 6
 			"output_rows, output_cols, output_channels, output_partition, output_count, output_strides, output_border, " // 13
-			"output_size, output_kappa, output_alpha, output_beta FROM layer_params ORDER BY layer ASC;"; // 17
+			"output_size, output_kappa, output_alpha, output_beta, output_relu FROM layer_params ORDER BY layer ASC;"; // 18
 		if (SQLITE_OK == sqlite3_prepare_v2(db, layer_params_qs, sizeof(layer_params_qs), &layer_params_stmt, 0))
 		{
 			ccv_array_t* layer_params = ccv_array_new(sizeof(ccv_convnet_layer_param_t), 3, 0);
@@ -1324,6 +1340,7 @@ ccv_convnet_t* ccv_convnet_read(int use_cwc_accel, const char* filename)
 						break;
 					case CCV_CONVNET_FULL_CONNECT:
 						layer_param.output.full_connect.count = sqlite3_column_int(layer_params_stmt, 10);
+						layer_param.output.full_connect.relu = sqlite3_column_int(layer_params_stmt, 17);
 						break;
 					case CCV_CONVNET_MAX_POOL:
 					case CCV_CONVNET_AVERAGE_POOL:

diff --git a/lib/cuda/cwc_convnet.cu b/lib/cuda/cwc_convnet.cu
@@ -705,7 +705,14 @@ static void _cwc_convnet_average_pool_forward_propagate(ccv_convnet_layer_t* lay
 	 b, out_rows, out_cols);
 }
 
-static void _cwc_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* b, float* batch_unit /* this is just 1's in device */, const cublasHandle_t& handle)
+__global__ static void _cwc_kern_relu_forward_propagate(float* a) // in-place relu: each thread clamps a[blockIdx.x * blockDim.x + threadIdx.x] to >= 0
+{
+	a += blockIdx.x * blockDim.x; // advance to the slice handled by this block
+	const int thidx = threadIdx.x;
+	a[thidx] = fmaxf(0.0f, a[thidx]); // float literal + fmaxf keep the clamp in single precision (0.0 would promote to double)
+}
+
+static void _cwc_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* b, float* batch_unit /* this is just 1's in device */, const cudaStream_t& stream, const cublasHandle_t& handle)
 {
 	int rows, out_rows, out_cols, out_partition;
 	_cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols, &out_partition);
@@ -718,6 +725,11 @@ static void _cwc_convnet_full_connect_forward_propagate(ccv_convnet_layer_t* lay
 	beta = 1;
 	// and then do the GEMM by adding bias
 	cublasSgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N, batch, out_rows, rows, &alpha, a, batch, layer->w, rows, &beta, b, batch);
+	if (layer->net.full_connect.relu)
+		_cwc_kern_relu_forward_propagate
+		<<<layer->net.full_connect.count, batch, 0, stream>>>
+		(b);
+
 }
 
 __global__ static void _cwc_kern_mute_neuron(float* a, float* d)
@@ -751,7 +763,7 @@ static void _cwc_convnet_encode_impl(ccv_convnet_t* convnet, float* a, int batch
 				break;
 			case CCV_CONVNET_FULL_CONNECT:
 				assert(i > 0);
-				_cwc_convnet_full_connect_forward_propagate(layer, batch, GPU(convnet)->forwards[i - 1], GPU(convnet)->forwards[i], GPU(convnet)->unit, context->device.cublas);
+				_cwc_convnet_full_connect_forward_propagate(layer, batch, GPU(convnet)->forwards[i - 1], GPU(convnet)->forwards[i], GPU(convnet)->unit, context->device.stream, context->device.cublas);
 				if (dor && context->device.dor[i])
 					_cwc_kern_mute_neuron
 					<<<layer->net.full_connect.count, batch, 0, context->device.stream>>>
@@ -775,7 +787,7 @@ static void _cwc_convnet_encode_impl(ccv_convnet_t* convnet, float* a, int batch
 
 #ifdef HAVE_GSL
 
-__global__ static void _cwc_kern_convolutional_relu_backward_propagate(const int batch,
+__global__ static void _cwc_kern_relu_backward_propagate(const int batch,
 		float* out, float* out_grad, const int out_rows, const int out_cols,
 		const int count)
 {
@@ -1231,7 +1243,7 @@ static void _cwc_convnet_convolutional_backward_propagate(ccv_convnet_layer_t* l
 	int out_rows, out_cols, out_partition, shared_memory_size;
 	_cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols, &out_partition);
 	// it turns out that first apply relu would save us a lot of computation because no need to low both out and out_grad any more
-	_cwc_kern_convolutional_relu_backward_propagate
+	_cwc_kern_relu_backward_propagate
 	<<<dim3(out_cols, out_rows, layer->net.convolutional.count), batch, 0, stream>>>
 	(batch, n, a, out_rows, out_cols, layer->net.convolutional.count);
 	assert(cudaGetLastError() == cudaSuccess);
@@ -1492,12 +1504,17 @@ static void _cwc_convnet_average_pool_backward_propagate(ccv_convnet_layer_t* la
 	 a, out_rows, out_cols);
 }
 
-static void _cwc_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* m, float* b, float* batch_unit, ccv_convnet_layer_t* configuration, const cublasHandle_t& handle)
+static void _cwc_convnet_full_connect_backward_propagate(ccv_convnet_layer_t* layer, int batch, float* a, float* n, float* m, float* b, float* batch_unit, ccv_convnet_layer_t* configuration, const cudaStream_t& stream, const cublasHandle_t& handle)
 {
 	int rows, out_rows, out_cols, out_partition;
 	_cwc_convnet_layer_deduce_output_format(layer, &out_rows, &out_cols, &out_partition);
 	out_cols = batch;
 	rows = layer->input.matrix.rows * layer->input.matrix.cols * layer->input.matrix.channels;
+	// apply relu for full connect layer
+	if (layer->net.full_connect.relu)
+		_cwc_kern_relu_backward_propagate
+		<<<dim3(1, out_rows, 1), batch, 0, stream>>>
+		(batch, n, a, out_rows, 1, 1);
 	float alpha = 1;
 	float beta = 0;
 	// propagate bias
@@ -1701,8 +1718,8 @@ static void _cwc_convnet_batch_formation(gsl_rng* rng, ccv_array_t* categorizeds
 		ccv_dense_matrix_t* input = 0;
 		if (image->cols != dim.width || image->rows != dim.height)
 		{
-			int x = (image->cols - dim.width + 1) / 2;
-			int y = (image->rows - dim.height + 1) / 2;
+			int x = rng ? gsl_rng_uniform_int(rng, image->cols - dim.width + 1) : (image->cols - dim.width + 1) / 2;
+			int y = rng ? gsl_rng_uniform_int(rng, image->rows - dim.height + 1) : (image->rows - dim.height + 1) / 2;
 			assert(x == 0 || y == 0);
 			ccv_slice(image, (ccv_matrix_t**)&input, CCV_32F, y, x, dim.height, dim.width);
 		} else
@@ -1964,7 +1981,7 @@ static void _cwc_convnet_backwards_propagate_error(ccv_convnet_t* convnet, float
 					_cwc_kern_mute_neuron
 					<<<layer->net.full_connect.count, batch, 0, context->device.stream>>>
 					(i == convnet->count - 1 ? a : GPU(convnet)->backwards[i + 1], context->device.dor[i]);
-				_cwc_convnet_full_connect_backward_propagate(layer, batch,  i == convnet->count - 1 ? a : GPU(convnet)->backwards[i + 1], i > 0 ? GPU(convnet)->forwards[i - 1] : m, GPU(convnet)->backwards[i], GPU(convnet)->unit, configuration, context->device.cublas);
+				_cwc_convnet_full_connect_backward_propagate(layer, batch,  i == convnet->count - 1 ? a : GPU(convnet)->backwards[i + 1], GPU(convnet)->forwards[i], i > 0 ? GPU(convnet)->forwards[i - 1] : m, GPU(convnet)->backwards[i], GPU(convnet)->unit, configuration, context->device.stream, context->device.cublas);
 				assert(cudaGetLastError() == cudaSuccess);
 				break;
 			case CCV_CONVNET_LOCAL_RESPONSE_NORM:

diff --git a/samples/image-net.sqlite3 b/samples/image-net.sqlite3