@@ -0,0 +1,291 @@
#include <unistd.h>
#include <sys/time.h>
#include <assert.h>

#include "utils.h"
#include "parser.h"
#include "data.h"
#include "option_list.h"
#include "network.h"

/*
 * Train a detection network.
 *
 * datacfg    - data config file; the "backup", "train" and "train_num" keys are read here.
 * cfgfile    - network config handed to load_network(); its base name (basecfg) prefixes
 *              every weights file written by this function.
 * weightfile - forwarded unchanged to load_network().
 *
 * Weights are written to the backup directory each time net->epoch changes and once more
 * as "<base>_final.weights" after the loop. The process exits with -1 when the training
 * list is unusable or when the loss is NaN/Inf/out of range.
 */
void train_detector(char *datacfg, char *cfgfile, char *weightfile)
{
    char *base = basecfg(cfgfile);
    srand(time(0));
    network *net = load_network(cfgfile, weightfile);
    net->output_layer = net->n - 1;

    struct list *options = read_data_cfg(datacfg);
    char *backup_directory = option_find_str(options, "backup", "/backup/");
    char *train_list = option_find_str(options, "train", "data/train.list");

    struct list *plist = get_paths(train_list);
    if(!plist){
        fprintf(stderr, "\n\ncannot read train list: %s, exit\n", train_list);
        exit(-1);
    }
    char **paths = (char **)list_to_array(plist);
    int listed_images = plist->size;
    int train_set_size = option_find_int(options, "train_num", listed_images);
    /* paths[] holds exactly listed_images entries; a larger train_num would make the
     * random sampler index past the end of the array. */
    if(train_set_size > listed_images){
        fprintf(stderr, "train_num %d exceeds the %d listed images, using %d\n",
                train_set_size, listed_images, listed_images);
        train_set_size = listed_images;
    }
    /* train_set_size is used as a divisor below and as a modulus when sampling paths. */
    if(train_set_size <= 0){
        fprintf(stderr, "\n\nempty training set: %s, exit\n", train_list);
        exit(-1);
    }

    printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
    int max_epoch = (int)net->max_batches * net->batch / train_set_size;
    printf("image net has seen: %lu, train_set_size: %d, max_batches of net: %d, net->classes: %d,"
           "net->batch: %d, max_epoch: %d\n\n",
           net->seen, train_set_size, net->max_batches, net->classes, net->batch, max_epoch);

    net->batch_train = net->seen / net->batch;
    net->epoch = net->seen / train_set_size;
    float avg_loss = -1;          /* negative means "no loss recorded yet" */
    float max_accuracy = -1;
    int max_accuracy_batch = 0;
    char buff[4096];              /* output path for saved weights */
    while(net->batch_train < net->max_batches){
        double batch_start = what_time_is_it_now();
        update_current_learning_rate(net);
        batch_detect train = load_data_detection(net->batch, paths, train_set_size, net->w, net->h, 30, net->classes,
                                                 net->jitter, net->hue, net->saturation, net->exposure, net->test);
        /* NOTE(review): the training step is disabled. train_network_detect() is what
         * advances net->seen and net->batch_train, so with it commented out nothing
         * visible in this loop moves the loop condition forward - confirm this is
         * intentional before running. */
        //train_network_detect(net, train);
        free_batch_detect(train);

        int epoch_old = net->epoch;
        net->epoch = net->seen / train_set_size;
        float loss = net->loss;
        if(loss > 999999 || loss < -999999 || loss != loss || (loss + 1.0 == loss)) {  // NaN != NaN, Inf + 1 = Inf
            fprintf(stderr, "\n\nloss too large: %f, exit\n", loss);
            exit(-1);
        }
        /* Exponential moving average of the loss, seeded with the first sample. */
        if(avg_loss < 0){
            avg_loss = loss;
        } else {
            avg_loss = avg_loss*.9 + loss*.1;
        }
        /* The small epsilon keeps the ratio finite while accuracy_count is still 0. */
        float accuracy = net->correct_num / (net->accuracy_count + 0.00001F);
        if(accuracy > max_accuracy){
            max_accuracy = accuracy;
            max_accuracy_batch = net->batch_train;
        }
        printf("epoch: %d, batch: %d, accuracy: %.4f, loss: %f, avg_loss: %.2f, learning_rate: %.8f, %.4f s, "
               "seen %lu images, max_accuracy: %.4f\n", net->epoch+1, net->batch_train,
               accuracy,
               loss, avg_loss, net->learning_rate, what_time_is_it_now()-batch_start, net->seen, max_accuracy);
        if(epoch_old != net->epoch){
            /* Checkpoint once per epoch boundary. */
            snprintf(buff, sizeof buff, "%s/%s_%06d.weights", backup_directory, base, net->epoch);
            save_weights(net, buff);
        }
    }
    printf("max_accuracy_batch: %d\n", max_accuracy_batch);
    snprintf(buff, sizeof buff, "%s/%s_final.weights", backup_directory, base);
    save_weights(net, buff);
    free_network(net);
    printf("max_accuracy_batch: %d\n", max_accuracy_batch);
    if(paths) free_ptr(paths);
    free_list_contents(plist);
    free_list(plist);
    free(base);
}
/*
void print_detector_detections(FILE **fps, char *id, detection *dets, int total, int classes, int w, int h)
{
int i, j;
for(i = 0; i < total; ++i){
float xmin = dets[i].bbox.x - dets[i].bbox.w/2. + 1;
float xmax = dets[i].bbox.x + dets[i].bbox.w/2. + 1;
float ymin = dets[i].bbox.y - dets[i].bbox.h/2. + 1;
float ymax = dets[i].bbox.y + dets[i].bbox.h/2. + 1;
if (xmin < 1) xmin = 1;
if (ymin < 1) ymin = 1;
if (xmax > w) xmax = w;
if (ymax > h) ymax = h;
for(j = 0; j < classes; ++j){
if (dets[i].prob[j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, dets[i].prob[j],
xmin/w, ymin/h, xmax/w, ymax/h);
}
}
}
void validate_detector(char *datacfg, char *cfgfile, char *weightfile)
{
int j;
list *options = read_data_cfg(datacfg);
char *valid_images = option_find_str(options, "valid", "data/train.list");
char *name_list = option_find_str(options, "names", "data/names.list");
char *prefix = option_find_str(options, "results", "results");
char **names = get_labels(name_list);
char *mapf = option_find_str(options, "map", 0);
int *map = 0;
if (mapf) map = read_map(mapf);
network *net = load_network(cfgfile, weightfile, 0);
set_batch_network(net, 1);
fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay);
srand(time(0));
list *plist = get_paths(valid_images);
char **paths = (char **)list_to_array(plist);
layer l = net->layers[net->n-1];
int classes = l.classes;
char buff[1024];
char *type = option_find_str(options, "eval", "voc");
FILE *fp = 0;
FILE **fps = 0;
int coco = 0;
int imagenet = 0;
char *outfile = 0;
if(0==strcmp(type, "coco")){
if(!outfile) outfile = "coco_results";
snprintf(buff, 1024, "%s/%s.json", prefix, outfile);
fp = fopen(buff, "w");
fprintf(fp, "[\n");
coco = 1;
} else if(0==strcmp(type, "imagenet")){
if(!outfile) outfile = "imagenet-detection";
snprintf(buff, 1024, "%s/%s.txt", prefix, outfile);
fp = fopen(buff, "w");
imagenet = 1;
classes = 200;
} else {
if(!outfile) outfile = "comp4_det_test_";
fps = calloc(classes, sizeof(FILE *));
for(j = 0; j < classes; ++j){
snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, names[j]);
fps[j] = fopen(buff, "w");
}
}
int m = plist->size;
int i=0;
int t;
float thresh = .3;
float nms = .45;
int nthreads = 4;
image *val = calloc(nthreads, sizeof(image));
image *val_resized = calloc(nthreads, sizeof(image));
image *buf = calloc(nthreads, sizeof(image));
image *buf_resized = calloc(nthreads, sizeof(image));
pthread_t *thr = calloc(nthreads, sizeof(pthread_t));
load_args args = {0};
args.w = net->w;
args.h = net->h;
//args.type = IMAGE_DATA;
args.type = LETTERBOX_DATA;
for(t = 0; t < nthreads; ++t){
args.path = paths[i+t];
args.im = &buf[t];
args.resized = &buf_resized[t];
thr[t] = load_data_in_thread(args);
}
double start = what_time_is_it_now();
for(i = nthreads; i < m+nthreads; i += nthreads){
fprintf(stderr, "%d\n", i);
for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
pthread_join(thr[t], 0);
val[t] = buf[t];
val_resized[t] = buf_resized[t];
}
for(t = 0; t < nthreads && i+t < m; ++t){
args.path = paths[i+t];
args.im = &buf[t];
args.resized = &buf_resized[t];
thr[t] = load_data_in_thread(args);
}
for(t = 0; t < nthreads && i+t-nthreads < m; ++t){
char *path = paths[i+t-nthreads];
char *id = basecfg(path);
float *X = val_resized[t].data;
network_predict(net, X);
int w = val[t].w;
int h = val[t].h;
int nboxes = 0;
detection *dets = get_network_boxes(net, w, h, thresh, .5, map, 0, &nboxes);
if (nms) do_nms_sort(dets, nboxes, classes, nms);
print_detector_detections(fps, path, dets, nboxes, classes, w, h);
free_detections(dets, nboxes);
free(id);
free_image(val[t]);
free_image(val_resized[t]);
}
}
for(j = 0; j < classes; ++j){
if(fps) fclose(fps[j]);
}
if(coco){
fseek(fp, -2, SEEK_CUR);
fprintf(fp, "\n]\n");
fclose(fp);
}
fprintf(stderr, "Total Detection Time: %f Seconds\n", what_time_is_it_now() - start);
}
*/
/*
 * Command-line entry point for the detector sub-command.
 *
 * Expected arguments: argv[2] = mode ("train", "valid" or "recall"), argv[3] = data cfg,
 * argv[4] = network cfg, argv[5] = optional weights file. Only "train" currently does any
 * work; "valid" and "recall" are accepted but their handlers are commented out.
 * Prints the total elapsed time to stderr once a mode has been dispatched.
 */
void run_detector(int argc, char **argv)
{
    double time_start = what_time_is_it_now();
    /* argv[4] is read unconditionally below, so at least five arguments are required. */
    if(argc < 5){
        fprintf(stderr, "usage: %s %s [train/valid] [data cfg] [cfg] [weights (optional)]\n", argv[0], argv[1]);
        return;
    }

    char *datacfg = argv[3];
    char *cfg = argv[4];
    char *weights = (argc > 5) ? argv[5] : 0;
    if(0==strcmp(argv[2], "train")){
        train_detector(datacfg, cfg, weights);
    } else if(0==strcmp(argv[2], "valid")){
        //validate_detector(datacfg, cfg, weights);
    } else if(0==strcmp(argv[2], "recall")){
        //validate_detector_recall(cfg, weights);
    } else {
        fprintf(stderr, "usage: %s %s [train/valid] [data cfg] [cfg] [weights (optional)]\n", argv[0], argv[1]);
    }
    fprintf(stderr, "\n\ntotal %.2lf seconds\n\n\n", what_time_is_it_now() - time_start);
}
@@ -274,3 +274,198 @@ char **get_labels(char *filename)
free_list(plist);
return labels;
}

/* Release a matrix: each of its m.rows row buffers first, then the row-pointer table. */
void free_matrix(matrix m)
{
    int row = 0;
    while(row < m.rows){
        free(m.vals[row]);
        ++row;
    }
    free(m.vals);
}

/*
 * Build a rows x cols matrix of floats. The row-pointer table and every row are
 * obtained from calloc, so all elements start at zero. Release with free_matrix().
 */
matrix make_matrix(int rows, int cols)
{
    matrix out;
    out.rows = rows;
    out.cols = cols;
    out.vals = calloc(out.rows, sizeof(float *));
    int r = 0;
    while(r < out.rows){
        out.vals[r] = calloc(out.cols, sizeof(float));
        ++r;
    }
    return out;
}

/*
 * Draw n entries (with replacement) from the first train_set_size elements of paths.
 * Returns a newly calloc'ed array of n pointers; the strings themselves are shared
 * with `paths`, so the caller frees only the returned array.
 */
char **get_random_paths(char **paths, int n, int train_set_size)
{
    char **picked = calloc(n, sizeof(char*));
    int slot = 0;
    while(slot < n){
        picked[slot] = paths[rand()%train_set_size];
        ++slot;
    }
    return picked;
}

box_label *read_boxes(char *filename, int *n)
{
FILE *file = fopen(filename, "r");
if(!file) file_error(filename);
float x, y, h, w;
int id;
int count = 0;
int size = 64;
box_label *boxes = calloc(size, sizeof(box_label));
while(fscanf(file, "%d %f %f %f %f", &id, &x, &y, &w, &h) == 5){
if(count == size) {
size = size * 2;
boxes = realloc(boxes, size*sizeof(box_label));
}
boxes[count].id = id;
boxes[count].x = x;
boxes[count].y = y;
boxes[count].h = h;
boxes[count].w = w;
boxes[count].left = x - w/2;
boxes[count].right = x + w/2;
boxes[count].top = y - h/2;
boxes[count].bottom = y + h/2;
++count;
}
fclose(file);
*n = count;
return boxes;
}

/* Shuffle b[0..n) in place: each position is swapped with a rand()-chosen position. */
void randomize_boxes(box_label *b, int n)
{
    for(int pos = 0; pos < n; ++pos){
        box_label held = b[pos];
        int other = rand()%n;
        b[pos] = b[other];
        b[other] = held;
    }
}

/*
 * Map label boxes into the coordinate frame of the augmented image.
 *
 * For every box the four edges are scaled by (sx, sy) and shifted by (-dx, -dy),
 * mirrored horizontally when flip is non-zero, clamped to [0, 1] via constrain(), and
 * the centre/size fields are then recomputed from the clamped edges.
 * A box whose x and y are both exactly 0 is instead given x = y = w = h = 999999 and
 * its edge fields are left untouched.
 */
void correct_boxes(box_label *boxes, int n, float dx, float dy, float sx, float sy, int flip)
{
    int k;
    for(k = 0; k < n; ++k){
        box_label *b = boxes + k;

        if(b->x == 0 && b->y == 0) {
            b->x = 999999;
            b->y = 999999;
            b->w = 999999;
            b->h = 999999;
            continue;
        }

        /* Scale, then translate each edge. */
        b->left   = b->left   * sx - dx;
        b->right  = b->right  * sx - dx;
        b->top    = b->top    * sy - dy;
        b->bottom = b->bottom * sy - dy;

        if(flip){
            /* Horizontal mirror: the old right edge becomes the new left one. */
            float old_left = b->left;
            b->left  = 1. - b->right;
            b->right = 1. - old_left;
        }

        b->left   = constrain(0, 1, b->left);
        b->right  = constrain(0, 1, b->right);
        b->top    = constrain(0, 1, b->top);
        b->bottom = constrain(0, 1, b->bottom);

        b->x = (b->left+b->right)/2;
        b->y = (b->top+b->bottom)/2;
        b->w = (b->right - b->left);
        b->h = (b->bottom - b->top);

        b->w = constrain(0, 1, b->w);
        b->h = constrain(0, 1, b->h);
    }
}

/*
 * Load the label file that belongs to an image and write its boxes into `truth`.
 *
 * The label path is derived from `path` by replacing "train_image" with "labels" and
 * the image extension (.jpg/.png/.JPG/.JPEG) with ".txt". Boxes are shuffled, mapped
 * through correct_boxes() with the given offsets/scales/flip, and at most num_boxes
 * of them are considered. Boxes narrower or shorter than .001 are dropped; the rest
 * are packed contiguously as 5 floats each: x, y, w, h, id.
 * `classes` is accepted but not used here.
 */
void fill_truth_detection(char *path, int num_boxes, float *truth, int classes, int flip, float dx, float dy, float sx, float sy)
{
    char labelpath[4096];
    char *image_exts[] = {".jpg", ".png", ".JPG", ".JPEG"};
    int e;

    find_replace(path, "train_image", "labels", labelpath);
    for(e = 0; e < 4; ++e){
        find_replace(labelpath, image_exts[e], ".txt", labelpath);
    }

    int count = 0;
    box_label *boxes = read_boxes(labelpath, &count);
    randomize_boxes(boxes, count);
    correct_boxes(boxes, count, dx, dy, sx, sy, flip);
    if(count > num_boxes) count = num_boxes;

    /* `slot` always points at the next free 5-float record in truth. */
    float *slot = truth;
    for(int i = 0; i < count; ++i){
        float x = boxes[i].x;
        float y = boxes[i].y;
        float w = boxes[i].w;
        float h = boxes[i].h;
        int id = boxes[i].id;

        if(w < .001 || h < .001) continue;

        slot[0] = x;
        slot[1] = y;
        slot[2] = w;
        slot[3] = h;
        slot[4] = id;
        slot += 5;
    }
    free(boxes);
}

/* Release both matrices of a detection batch: the image rows (X), then the truth rows (y). */
void free_batch_detect(batch_detect d)
{
    matrix images = d.X;
    matrix truths = d.y;
    free_matrix(images);
    free_matrix(truths);
}

/*
 * Build one detection batch of n randomly chosen, augmented images.
 *
 * n              - number of samples to load
 * paths          - pool of image paths; entries [0, train_set_size) are sampled
 * train_set_size - number of usable entries in paths
 * w, h           - size of the canvas every image is placed on
 * boxes          - number of truth boxes reserved per image (5 floats each)
 * classes        - forwarded to fill_truth_detection()
 * jitter         - fraction of the original size by which the aspect ratio is perturbed
 * hue, saturation, exposure - forwarded to random_distort_image()
 * test           - 0: train (random horizontal flip enabled), 1: valid (no flip)
 *
 * Returns a batch_detect whose X has n rows of h*w*3 columns and whose y is an
 * n x (5*boxes) zero-initialised matrix. Release it with free_batch_detect().
 */
batch_detect load_data_detection(int n, char **paths, int train_set_size, int w, int h, int boxes, int classes,
float jitter, float hue, float saturation, float exposure, int test)
{
char **random_paths = get_random_paths(paths, n, train_set_size);
batch_detect d = {0};
d.X.rows = n;
// Only the row-pointer table is allocated here; each row is set further down to the
// data buffer of the augmented image.
d.X.vals = calloc(d.X.rows, sizeof(float*));
d.X.cols = h*w*3;
d.y = make_matrix(n, 5*boxes);
for(int i = 0; i < n; ++i){
image orig = load_image_color(random_paths[i], 0, 0);
image sized = make_image(w, h, orig.c);
// Start from a uniform .5 canvas; the image is placed onto it below.
fill_image(sized, .5);

// Random aspect ratio: width and height are each perturbed by up to +/- jitter of
// the original dimension.
// NOTE(review): the two rand_uniform() calls sit in one expression, so the C
// language does not fix which of them runs first.
float dw = jitter * orig.w;
float dh = jitter * orig.h;
float new_ar = (orig.w + rand_uniform(-dw, dw)) / (orig.h + rand_uniform(-dh, dh));
float scale = 1;
float nw, nh;
// Fit the longer side to the canvas and derive the other side from the aspect ratio.
if(new_ar < 1){
nh = scale * h;
nw = nh * new_ar;
} else {
nw = scale * w;
nh = nw / new_ar;
}
// Random placement offset within the slack left on the canvas.
float dx = rand_uniform(0, w - nw);
float dy = rand_uniform(0, h - nh);
place_image(orig, nw, nh, dx, dy, sized);
random_distort_image(sized, hue, saturation, exposure);

int flip = 0;
if(test == 0) { // 0: train, 1: valid
flip = rand()%2;
if(flip) flip_image(sized);
}
// The batch row takes over sized.data; sized itself is not freed here.
d.X.vals[i] = sized.data;
// Offsets and scales are passed relative to the canvas size.
fill_truth_detection(random_paths[i], boxes, d.y.vals[i], classes, flip, -dx/w, -dy/h, nw/w, nh/h);
free_image(orig);
}
// Frees only the pointer array; the path strings are the ones in `paths`.
free(random_paths);
return d;
}
@@ -3,29 +3,30 @@

#include "image.h"

/*
 * Argument bundle for image loading.
 * NOTE(review): no live code in this view reads these fields; the per-field notes
 * below are inferred from the names and from the matching parameters of
 * random_batch() - confirm against the actual users.
 */
typedef struct {
    char **paths;               /* presumably the pool of image paths */
    int train_set_size;         /* presumably the number of usable entries in paths */
    int h, w, c, n;             /* presumably image height, width, channels and sample count */
    char **labels;
    int classes;
    float saturation, exposure, hue;
} load_args;

/*
 * One batch of images together with their label indices.
 * h, w and c are declared exactly once; declaring the same member twice in a struct
 * is a compile error.
 */
typedef struct{
    int n; // number of images
    int h, w, c; // presumably image height, width and channel count - confirm against users
    float *data;
    int *truth_label_index;
} batch;

/* Table of floats stored as an array of row pointers (see make_matrix/free_matrix). */
typedef struct matrix{
    int rows;       /* number of row buffers in vals */
    int cols;       /* number of floats in each row */
    float **vals;   /* vals[r] is row r */
} matrix;

/*
 * One ground-truth box as read from a label file: a class id, the box centre and
 * size, and the four edges derived from them (x -/+ w/2, y -/+ h/2).
 */
typedef struct{
    int id;
    float x;
    float y;
    float w;
    float h;
    float left;
    float right;
    float top;
    float bottom;
} box_label;

/*
 * A detection batch: X holds one row of image data per sample, y holds one row of
 * packed truth boxes (5 floats per box) per sample.
 */
typedef struct{
    int w;      /* not written by load_data_detection() in this view */
    int h;      /* not written by load_data_detection() in this view */
    matrix X;
    matrix y;
} batch_detect;

batch random_batch(char **paths, int batch_size, char **labels, int classes, int train_set_size, int w, int h, int c,
float hue, float saturation, float exposure, int flip, float mean_value, float scale, int test);
void free_batch(batch *b);
@@ -36,4 +37,10 @@ batch *load_csv_image_to_memory(char *filename, int batch_size, char **labels, i
batch *load_image_to_memory(char **paths, int batch_size, char **labels, int classes, int train_set_size,
int *batch_num_return, int w, int h, int c, float hue, float saturation, float exposure,
int flip, float mean_value, float scale, int test);

void free_matrix(matrix m);
matrix make_matrix(int rows, int cols);
batch_detect load_data_detection(int n, char **paths, int train_set_size, int w, int h, int boxes, int classes,
float jitter, float hue, float saturation, float exposure, int test);
void free_batch_detect(batch_detect d);
#endif
@@ -77,6 +77,6 @@ void copy_image_into(image src, image dest);

image get_image_layer(image m, int l);
void flip_image(image a);

void fill_image(image m, float s);
#endif

@@ -622,6 +622,32 @@ void update_network_gpu(network *net)

#endif

/*
 * Run one training step (forward, backward, update) on a detection batch.
 *
 * net - network to train; net->input and net->truth are allocated on first use with
 *       h*w*c*batch floats each.
 * d   - batch whose first net->batch rows of X and y are copied into net->input and
 *       net->truth, row j landing at offset j*cols.
 *
 * Afterwards net->seen grows by batch*time_steps and net->batch_train by one.
 * NOTE(review): the copies assume d.X.cols*batch and d.y.cols*batch both fit in
 * h*w*c*batch floats and that d has at least net->batch rows - confirm at the caller.
 */
void train_network_detect(network *net, batch_detect d)
{
    size_t buffer_len = (size_t)net->h * net->w * net->c * net->batch;
    /* Allocate each staging buffer independently so a missing truth buffer is
     * created even when input already exists. */
    if(net->input == 0){
        net->input = (float *)malloc(buffer_len * sizeof(float));
    }
    if(net->truth == 0){
        net->truth = (float *)malloc(buffer_len * sizeof(float));
    }
    /* Clear the whole truth buffer (sizeof(net->truth) would only be the size of the
     * pointer), so no stale boxes remain after the rows copied below. */
    memset(net->truth, 0, buffer_len * sizeof(float));
    for(int j = 0; j < net->batch; ++j){
        memcpy(net->input + j * d.X.cols, d.X.vals[j], d.X.cols * sizeof(float));
        memcpy(net->truth + j * d.y.cols, d.y.vals[j], d.y.cols * sizeof(float));
    }
#ifdef GPU
    cuda_push_array(net->input_gpu, net->input, net->h * net->w * net->c * net->batch);
    /* NOTE(review): this pushes truth_label_index, not net->truth; confirm how the
     * detection truth is meant to reach the device. */
    cuda_push_array_int(net->truth_label_index_gpu, net->truth_label_index, net->batch);
    forward_network_gpu(net, net->input_gpu);
    backward_network_gpu(net, net->input_gpu);
    update_network_gpu(net);
#else
    forward_network(net, net->input);
    backward_network(net, net->input);
    update_network(net);
#endif
    net->seen += net->batch * net->time_steps;
    net->batch_train += 1;
}

void train_network(network *net, float *input, int *truth_label_index)
{
if(net->accuracy_count > net->accuracy_count_max){
@@ -105,14 +105,15 @@ typedef struct {
int test; // 0: train, 1: valid
int classes; // train data classes
int *truth_label_index;
float *input, *truth;

int correct_num; // train correct number
int accuracy_count, accuracy_count_max; // all trained data size, train accuracy = correct_num / accuracy_count
float *workspace; // for convolutional_layer image reorder
float *workspace_gpu; // for convolutional_layer image reorder
size_t workspace_size;
float loss;
float hue, saturation, exposure; // random_distort_image
float hue, saturation, exposure, jitter; // random_distort_image
float mean_value, scale; // use when load image
int flip;

@@ -166,6 +167,7 @@ network *make_network(int n);
network *load_network(char *cfg, char *weights);
void free_network(network *net);
void train_network(network *net, float *input, int *truth_label_index);
void train_network_detect(network *net, batch_detect d);
void valid_network(network *net, float *input, int *truth_label_index);
float *forward_network_test(network *net, float *input);
int get_network_output_size_layer(network *net, int i);
@@ -43,8 +43,8 @@ convolutional_layer *parse_convolutional(struct list *options, network *net, int
int batch_normalize = option_find_int(options, "batch_normalize", 0);
int pad = option_find_int(options, "pad", 0);
float lr_mult = option_find_float(options, "lr_mult", 1);
float lr_decay_mult = option_find_float(options, "lr_decay_mult", 0);
float bias_mult = option_find_float(options, "bias_mult", 2);
float lr_decay_mult = option_find_float(options, "lr_decay_mult", 1);
float bias_mult = option_find_float(options, "bias_mult", 1);
float bias_decay_mult = option_find_float(options, "bias_decay_mult", 0);
char *weight_filler_str = option_find_str(options, "weight_filler", "xavier");
int weight_filler = 1;
@@ -391,6 +391,7 @@ void parse_net_options(struct list *options, network *net)
net->saturation = option_find_float(options, "saturation", 1);
net->exposure = option_find_float(options, "exposure", 1);
net->hue = option_find_float(options, "hue", 0);
net->jitter = option_find_float(options, "jitter", 0);
net->flip = option_find_float(options, "flip", 0);
net->mean_value = option_find_float(options, "mean_value", 0);
//net->mean_value /= 255.0F; // scale image to [0, 1] when load image
@@ -0,0 +1,106 @@
#include "upsample_layer.h"
#include "cuda.h"
#include "blas.h"

#include <stdio.h>

/*
 * Create an upsample layer for a batch x w x h x c input.
 *
 * A positive stride multiplies width and height by stride. A negative stride marks
 * the layer as reversed (l.reverse = 1) and divides width and height by |stride|.
 * The channel count is unchanged. Output and delta buffers are zero-initialised and
 * a one-line description is printed to stderr.
 */
layer make_upsample_layer(int batch, int w, int h, int c, int stride)
{
    layer l = {0};
    l.type = UPSAMPLE;
    l.batch = batch;
    l.w = w;
    l.h = h;
    l.c = c;
    l.out_c = c;
    if(stride < 0){
        stride = -stride;
        l.reverse = 1;
        l.out_w = w/stride;
        l.out_h = h/stride;
    } else {
        l.out_w = w*stride;
        l.out_h = h*stride;
    }
    l.stride = stride;
    l.inputs = l.w*l.h*l.c;
    l.outputs = l.out_w*l.out_h*l.out_c;
    l.delta = calloc(l.outputs*batch, sizeof(float));
    l.output = calloc(l.outputs*batch, sizeof(float));

    l.forward = forward_upsample_layer;
    l.backward = backward_upsample_layer;
#ifdef GPU
    l.forward_gpu = forward_upsample_layer_gpu;
    l.backward_gpu = backward_upsample_layer_gpu;

    l.delta_gpu = cuda_make_array(l.delta, l.outputs*batch);
    l.output_gpu = cuda_make_array(l.output, l.outputs*batch);
#endif
    if(l.reverse){
        fprintf(stderr, "downsample %2dx %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c);
    } else {
        fprintf(stderr, "upsample %2dx %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c);
    }
    return l;
}

/*
 * Change the input size of an upsample layer to w x h and resize its buffers.
 * Output dimensions are w,h multiplied by the stride, or divided by it when the
 * layer is reversed. On GPU builds the device buffers are freed and re-created.
 */
void resize_upsample_layer(layer *l, int w, int h)
{
    l->w = w;
    l->h = h;
    if(l->reverse){
        l->out_w = w/l->stride;
        l->out_h = h/l->stride;
    } else {
        l->out_w = w*l->stride;
        l->out_h = h*l->stride;
    }
    l->inputs = l->h*l->w*l->c;
    l->outputs = l->out_w*l->out_h*l->out_c;
    l->delta = realloc(l->delta, l->outputs*l->batch*sizeof(float));
    l->output = realloc(l->output, l->outputs*l->batch*sizeof(float));

#ifdef GPU
    cuda_free(l->output_gpu);
    cuda_free(l->delta_gpu);
    l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch);
    l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch);
#endif
}

/*
 * CPU forward pass: zero l.output, then run upsample_cpu between net.input and
 * l.output. For a reversed layer the buffer roles and the direction flag are swapped.
 */
void forward_upsample_layer(const layer l, network net)
{
    fill_cpu(l.outputs*l.batch, 0, l.output, 1);
    if(!l.reverse){
        upsample_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output);
        return;
    }
    upsample_cpu(l.output, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, net.input);
}

/*
 * CPU backward pass: run upsample_cpu between l.delta and net.delta, with the buffer
 * roles and direction flag mirrored relative to the forward pass.
 */
void backward_upsample_layer(const layer l, network net)
{
    if(!l.reverse){
        upsample_cpu(net.delta, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta);
        return;
    }
    upsample_cpu(l.delta, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, net.delta);
}

#ifdef GPU
/*
 * GPU forward pass: zero l.output_gpu, then run upsample_gpu between net.input_gpu
 * and l.output_gpu, swapping roles and direction flag for a reversed layer.
 */
void forward_upsample_layer_gpu(const layer l, network net)
{
    fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1);
    if(!l.reverse){
        upsample_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output_gpu);
        return;
    }
    upsample_gpu(l.output_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, net.input_gpu);
}

/*
 * GPU backward pass: run upsample_gpu between l.delta_gpu and net.delta_gpu, with
 * roles and direction flag mirrored relative to the GPU forward pass.
 */
void backward_upsample_layer_gpu(const layer l, network net)
{
    if(!l.reverse){
        upsample_gpu(net.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta_gpu);
        return;
    }
    upsample_gpu(l.delta_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, net.delta_gpu);
}
#endif
@@ -0,0 +1,15 @@
#ifndef UPSAMPLE_LAYER_H
#define UPSAMPLE_LAYER_H
#include "darknet.h"

layer make_upsample_layer(int batch, int w, int h, int c, int stride);
void forward_upsample_layer(const layer l, network net);
void backward_upsample_layer(const layer l, network net);
void resize_upsample_layer(layer *l, int w, int h);

#ifdef GPU
void forward_upsample_layer_gpu(const layer l, network net);
void backward_upsample_layer_gpu(const layer l, network net);
#endif

#endif
@@ -0,0 +1,374 @@
#include "yolo_layer.h"
#include "activations.h"
#include "blas.h"
#include "box.h"
#include "cuda.h"
#include "utils.h"

#include <stdio.h>
#include <assert.h>
#include <string.h>
#include <stdlib.h>

/*
 * Create a YOLO detection layer.
 *
 * batch   - batch size
 * w, h    - grid width and height
 * n       - number of boxes predicted per grid cell by this layer
 * total   - total number of bias pairs (two floats each), all initialised to .5
 * mask    - indices into the bias pairs used by this layer; when NULL, 0..n-1 is used
 * classes - number of classes
 *
 * Each predicted box occupies classes + 4 + 1 channels. Note that the function calls
 * srand(0) before returning, which resets the C random generator.
 */
layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes)
{
    int k;
    layer l = {0};
    l.type = YOLO;

    l.n = n;
    l.total = total;
    l.batch = batch;
    l.h = h;
    l.w = w;
    l.c = n*(classes + 4 + 1);
    l.out_w = l.w;
    l.out_h = l.h;
    l.out_c = l.c;
    l.classes = classes;
    l.cost = calloc(1, sizeof(float));
    l.biases = calloc(total*2, sizeof(float));
    if(mask){
        l.mask = mask;
    } else {
        /* No mask supplied: use the identity mapping. */
        l.mask = calloc(n, sizeof(int));
        for(k = 0; k < n; ++k) l.mask[k] = k;
    }
    l.bias_updates = calloc(n*2, sizeof(float));
    l.outputs = h*w*n*(classes + 4 + 1);
    l.inputs = l.outputs;
    l.truths = 90*(4 + 1);
    l.delta = calloc(batch*l.outputs, sizeof(float));
    l.output = calloc(batch*l.outputs, sizeof(float));
    k = 0;
    while(k < total*2){
        l.biases[k] = .5;
        ++k;
    }

    l.forward = forward_yolo_layer;
    l.backward = backward_yolo_layer;
#ifdef GPU
    l.forward_gpu = forward_yolo_layer_gpu;
    l.backward_gpu = backward_yolo_layer_gpu;
    l.output_gpu = cuda_make_array(l.output, batch*l.outputs);
    l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs);
#endif

    fprintf(stderr, "yolo\n");
    srand(0);

    return l;
}

/*
 * Change the grid size of a YOLO layer to w x h, recompute outputs/inputs and
 * reallocate the output and delta buffers (host, and device on GPU builds).
 */
void resize_yolo_layer(layer *l, int w, int h)
{
    l->w = w;
    l->h = h;

    l->outputs = h*w*l->n*(l->classes + 4 + 1);
    l->inputs = l->outputs;

    size_t bytes = l->batch*l->outputs*sizeof(float);
    l->output = realloc(l->output, bytes);
    l->delta = realloc(l->delta, bytes);

#ifdef GPU
    cuda_free(l->delta_gpu);
    cuda_free(l->output_gpu);

    l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs);
    l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs);
#endif
}

/*
 * Decode one predicted box from the raw values in x.
 *
 * The four box values are read at index, index+stride, index+2*stride and
 * index+3*stride. x/y are offsets added to the cell position (i, j) and divided by
 * the grid size (lw, lh); w/h are exp() of the raw value times bias pair n, divided
 * by the network size (w, h).
 */
box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride)
{
    float raw_x = x[index + 0*stride];
    float raw_y = x[index + 1*stride];
    float raw_w = x[index + 2*stride];
    float raw_h = x[index + 3*stride];

    box b;
    b.x = (i + raw_x) / lw;
    b.y = (j + raw_y) / lh;
    b.w = exp(raw_w) * biases[2*n] / w;
    b.h = exp(raw_h) * biases[2*n+1] / h;
    return b;
}

/*
 * Write the box-regression gradient for one prediction and return its IoU with truth.
 *
 * The targets are the truth box expressed in the same raw space the prediction uses:
 * cell-relative x/y and log-ratio w/h against bias pair n. Each of the four delta
 * entries (at index + k*stride) becomes scale * (target - raw prediction).
 */
float delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride)
{
    box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride);
    float iou = box_iou(pred, truth);

    float target[4];
    target[0] = (truth.x*lw - i);
    target[1] = (truth.y*lh - j);
    target[2] = log(truth.w*w / biases[2*n]);
    target[3] = log(truth.h*h / biases[2*n + 1]);

    int k;
    for(k = 0; k < 4; ++k){
        delta[index + k*stride] = scale * (target[k] - x[index + k*stride]);
    }
    return iou;
}


/*
 * Write the class gradient for one prediction.
 *
 * Class k lives at index + stride*k. If delta[index] is already non-zero, only the
 * entry of the true class is set (to 1 - output). Otherwise every class entry is set
 * to (1 for the true class, 0 for the others) minus the output.
 * When avg_cat is non-NULL the output of the true class is added to it.
 */
void delta_yolo_class(float *output, float *delta, int index, int class, int classes, int stride, float *avg_cat)
{
    if (delta[index]){
        const int hit = index + stride*class;
        delta[hit] = 1 - output[hit];
        if(avg_cat) *avg_cat += output[hit];
        return;
    }

    int k = 0;
    while(k < classes){
        const int at = index + stride*k;
        const int is_truth = (k == class);
        delta[at] = (is_truth ? 1 : 0) - output[at];
        if(is_truth && avg_cat) *avg_cat += output[at];
        ++k;
    }
}

/*
 * Flat offset of one value in the layer's output/delta buffers.
 *
 * location encodes box number and cell as n*(l.w*l.h) + cell; entry selects which of
 * the 4 + 1 + classes values of that box is wanted.
 */
static int entry_index(layer l, int batch, int location, int entry)
{
    const int cells = l.w*l.h;
    const int which_box = location / cells;
    const int cell = location % cells;
    return batch*l.outputs + which_box*cells*(4+l.classes+1) + entry*cells + cell;
}

/*
 * Forward pass of the YOLO layer.
 *
 * Copies net.input into l.output, applies the LOGISTIC activation to the x/y entries
 * and to the objectness + class entries (non-GPU builds only), and zeroes l.delta.
 * When net.train is set it then fills l.delta with the training gradients, stores the
 * squared magnitude of l.delta in *l.cost and prints summary statistics.
 */
void forward_yolo_layer(const layer l, network net)
{
int i,j,b,t,n;
memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float));

// Compiled out when GPU is defined: forward_yolo_layer_gpu() activates the values on
// the device and pulls them into net.input before calling this function.
#ifndef GPU
for (b = 0; b < l.batch; ++b){
for(n = 0; n < l.n; ++n){
// Entries 0 and 1 of each box (x and y offsets).
int index = entry_index(l, b, n*l.w*l.h, 0);
activate_array(l.output + index, 2*l.w*l.h, LOGISTIC);
// Entry 4 (objectness) followed by the l.classes class entries.
index = entry_index(l, b, n*l.w*l.h, 4);
activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC);
}
}
#endif

memset(l.delta, 0, l.outputs * l.batch * sizeof(float));
// Inference stops here; everything below only builds training gradients.
if(!net.train) return;
float avg_iou = 0;
float recall = 0;
float recall75 = 0;
float avg_cat = 0;
float avg_obj = 0;
float avg_anyobj = 0;
int count = 0;
int class_count = 0;
*(l.cost) = 0;
for (b = 0; b < l.batch; ++b) {
// Pass 1: for every predicted box, find the truth box with the highest IoU and
// decide the objectness gradient from it.
for (j = 0; j < l.h; ++j) {
for (i = 0; i < l.w; ++i) {
for (n = 0; n < l.n; ++n) {
int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0);
box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.w*l.h);
float best_iou = 0;
int best_t = 0;
// NOTE(review): l.max_boxes is not assigned by make_yolo_layer() in this view;
// it has to be set elsewhere or this loop never runs - confirm.
for(t = 0; t < l.max_boxes; ++t){
box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);
// A truth box with x == 0 ends the list for this image.
if(!truth.x) break;
float iou = box_iou(pred, truth);
if (iou > best_iou) {
best_iou = iou;
best_t = t;
}
}
int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4);
avg_anyobj += l.output[obj_index];
// Default: push objectness toward 0.
l.delta[obj_index] = 0 - l.output[obj_index];
// Overlap above ignore_thresh: no objectness gradient at all.
if (best_iou > l.ignore_thresh) {
l.delta[obj_index] = 0;
}
// Overlap above truth_thresh: treat as a positive and also train class and box.
if (best_iou > l.truth_thresh) {
l.delta[obj_index] = 1 - l.output[obj_index];

int class = net.truth[best_t*(4 + 1) + b*l.truths + 4];
if (l.map) class = l.map[class];
int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1);
delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0);
box truth = float_to_box(net.truth + best_t*(4 + 1) + b*l.truths, 1);
delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);
}
}
}
}
// Pass 2: for every truth box, pick the bias pair whose width/height matches it
// best and train the prediction in the truth box's own grid cell.
for(t = 0; t < l.max_boxes; ++t){
box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1);

if(!truth.x) break;
float best_iou = 0;
int best_n = 0;
// Grid cell that contains the truth centre.
i = (truth.x * l.w);
j = (truth.y * l.h);
// Compare shapes only: both boxes are placed at the origin.
box truth_shift = truth;
truth_shift.x = truth_shift.y = 0;
for(n = 0; n < l.total; ++n){
box pred = {0};
pred.w = l.biases[2*n]/net.w;
pred.h = l.biases[2*n+1]/net.h;
float iou = box_iou(pred, truth_shift);
if (iou > best_iou){
best_iou = iou;
best_n = n;
}
}

// presumably int_index() returns the position of best_n within l.mask, or a
// negative value when this layer does not own that bias pair - verify in utils.
int mask_n = int_index(l.mask, best_n, l.n);
if(mask_n >= 0){
int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0);
float iou = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h);

int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4);
avg_obj += l.output[obj_index];
l.delta[obj_index] = 1 - l.output[obj_index];

int class = net.truth[t*(4 + 1) + b*l.truths + 4];
if (l.map) class = l.map[class];
int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1);
delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat);

++count;
++class_count;
if(iou > .5) recall += 1;
if(iou > .75) recall75 += 1;
avg_iou += iou;
}
}
}
*(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2);
// NOTE(review): count and class_count stay 0 when no truth box was assigned to this
// layer, so the averages below are then computed with a zero divisor.
printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", net.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count);
}

/* Backward pass: hand l.delta and net.delta to axpy_cpu with a factor of 1 over all batch*inputs values. */
void backward_yolo_layer(const layer l, network net)
{
    const int total = l.batch*l.inputs;
    axpy_cpu(total, 1, l.delta, 1, net.delta, 1);
}

/*
 * Map detections from network-input coordinates back to the original image.
 *
 * The image (w x h) is assumed to have been fitted into the network input
 * (netw x neth) with its aspect ratio kept; new_w x new_h is the size it occupied.
 * Each box is shifted and rescaled to undo that placement, and, unless `relative`
 * is set, multiplied by the image size.
 */
void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative)
{
    int new_w = 0;
    int new_h = 0;
    if (((float)netw/w) < ((float)neth/h)) {
        new_w = netw;
        new_h = (h * netw)/w;
    } else {
        new_h = neth;
        new_w = (w * neth)/h;
    }

    int k = 0;
    while(k < n){
        box b = dets[k].bbox;
        b.x = (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw);
        b.y = (b.y - (neth - new_h)/2./neth) / ((float)new_h/neth);
        b.w *= (float)netw/new_w;
        b.h *= (float)neth/new_h;
        if(!relative){
            b.x *= w;
            b.w *= w;
            b.y *= h;
            b.h *= h;
        }
        dets[k].bbox = b;
        ++k;
    }
}

/* Count the predictions of batch item 0 whose objectness is strictly above thresh. */
int yolo_num_detections(layer l, float thresh)
{
    const int cells = l.w*l.h;
    int found = 0;
    for(int cell = 0; cell < cells; ++cell){
        for(int a = 0; a < l.n; ++a){
            const int obj_index = entry_index(l, 0, a*l.w*l.h + cell, 4);
            if(l.output[obj_index] > thresh) ++found;
        }
    }
    return found;
}

/*
 * Average the outputs of batch item 0 with those of batch item 1.
 *
 * The second item's outputs (starting at l.output + l.outputs) are first mirrored
 * left-to-right in place, negating the values of entry 0 on swapped columns; then
 * each value of item 0 becomes the mean of the two.
 */
void avg_flipped_yolo(layer l)
{
    float *flip = l.output + l.outputs;
    int row, col, a, z;
    for (row = 0; row < l.h; ++row) {
        for (col = 0; col < l.w/2; ++col) {
            for (a = 0; a < l.n; ++a) {
                for(z = 0; z < l.classes + 4 + 1; ++z){
                    const int line = z*l.w*l.h*l.n + a*l.w*l.h + row*l.w;
                    float *lhs = flip + line + col;
                    float *rhs = flip + line + (l.w - col - 1);
                    const float held = *lhs;
                    *lhs = *rhs;
                    *rhs = held;
                    if(z == 0){
                        *lhs = -*lhs;
                        *rhs = -*rhs;
                    }
                }
            }
        }
    }
    int k = 0;
    while(k < l.outputs){
        l.output[k] = (l.output[k] + flip[k])/2.;
        ++k;
    }
}

/*
 * Collect the detections of batch item 0 into dets and return how many were written.
 *
 * A prediction is kept when its objectness is strictly above thresh. For each kept
 * prediction the box is decoded, objectness and class count are stored, and each
 * class probability is objectness * class score, or 0 when that does not exceed
 * thresh. When l.batch == 2 the outputs are first averaged with the flipped copy.
 * Boxes are finally mapped back to the w x h image by correct_yolo_boxes().
 * `map` is accepted but not used here.
 */
int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets)
{
    float *predictions = l.output;
    if (l.batch == 2) avg_flipped_yolo(l);

    int kept = 0;
    for (int cell = 0; cell < l.w*l.h; ++cell){
        const int row = cell / l.w;
        const int col = cell % l.w;
        for(int a = 0; a < l.n; ++a){
            const int obj_index = entry_index(l, 0, a*l.w*l.h + cell, 4);
            const float objectness = predictions[obj_index];
            if(objectness <= thresh) continue;

            const int box_index = entry_index(l, 0, a*l.w*l.h + cell, 0);
            dets[kept].bbox = get_yolo_box(predictions, l.biases, l.mask[a], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h);
            dets[kept].objectness = objectness;
            dets[kept].classes = l.classes;
            for(int c = 0; c < l.classes; ++c){
                const int class_index = entry_index(l, 0, a*l.w*l.h + cell, 4 + 1 + c);
                const float prob = objectness*predictions[class_index];
                dets[kept].prob[c] = (prob > thresh) ? prob : 0;
            }
            ++kept;
        }
    }
    correct_yolo_boxes(dets, kept, w, h, netw, neth, relative);
    return kept;
}

#ifdef GPU

/*
 * GPU forward pass of the YOLO layer.
 *
 * Copies the input on the device, applies LOGISTIC to the x/y entries and to the
 * objectness + class entries, and pulls the result to the host. When training, the
 * activated values are pulled into net.input, the CPU forward pass computes the
 * gradients, and l.delta is pushed back to the device.
 */
void forward_yolo_layer_gpu(const layer l, network net)
{
    copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1);
    for (int b = 0; b < l.batch; ++b){
        for(int a = 0; a < l.n; ++a){
            const int xy_index = entry_index(l, b, a*l.w*l.h, 0);
            activate_array_gpu(l.output_gpu + xy_index, 2*l.w*l.h, LOGISTIC);
            const int obj_index = entry_index(l, b, a*l.w*l.h, 4);
            activate_array_gpu(l.output_gpu + obj_index, (1+l.classes)*l.w*l.h, LOGISTIC);
        }
    }

    const int inference_only = (!net.train || l.onlyforward);
    if(inference_only){
        cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs);
        return;
    }

    cuda_pull_array(l.output_gpu, net.input, l.batch*l.inputs);
    forward_yolo_layer(l, net);
    cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs);
}

/* GPU backward pass: hand l.delta_gpu and net.delta_gpu to axpy_gpu with a factor of 1 over all batch*inputs values. */
void backward_yolo_layer_gpu(const layer l, network net)
{
    const int total = l.batch*l.inputs;
    axpy_gpu(total, 1, l.delta_gpu, 1, net.delta_gpu, 1);
}
#endif

@@ -0,0 +1,19 @@
#ifndef YOLO_LAYER_H
#define YOLO_LAYER_H

#include "darknet.h"
#include "layer.h"
#include "network.h"

layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes);
void forward_yolo_layer(const layer l, network net);
void backward_yolo_layer(const layer l, network net);
void resize_yolo_layer(layer *l, int w, int h);
int yolo_num_detections(layer l, float thresh);

#ifdef GPU
void forward_yolo_layer_gpu(const layer l, network net);
void backward_yolo_layer_gpu(layer l, network net);
#endif

#endif
152 yolo.cfg
@@ -0,0 +1,152 @@
[network]
batch=1
width=416
height=416
channels=3

saturation = 1.5
exposure = 1.5
hue=.1
jitter=.3

learning_rate=0.001
momentum=0.9
decay=0.0005
max_batches = 7002
policy=steps
steps=5000,6000
scales=.1,.1

[convolutional]
batch_normalize=1
filters=16
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=32
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=64
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=128
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=2

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[maxpool]
size=2
stride=1

[convolutional]
batch_normalize=1
filters=1024
size=3
stride=1
pad=1
activation=leaky

###########

[convolutional]
batch_normalize=1
filters=256
size=1
stride=1
pad=1
activation=leaky

[convolutional]
batch_normalize=1
filters=512
size=3
stride=1
pad=1
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=18
activation=linear


[route]
layers = -4

[convolutional]
batch_normalize=1
filters=128
size=1
stride=1
pad=1
activation=leaky



[convolutional]
batch_normalize=1
filters=256
size=3
stride=1
pad=1
activation=leaky

[convolutional]
size=1
stride=1
pad=1
filters=18
activation=linear