| @@ -0,0 +1,291 @@ | ||
| #include <unistd.h> | ||
| #include <sys/time.h> | ||
| #include <assert.h> | ||
|
|
||
| #include "utils.h" | ||
| #include "parser.h" | ||
| #include "data.h" | ||
| #include "option_list.h" | ||
| #include "network.h" | ||
|
|
||
| void train_detector(char *datacfg, char *cfgfile, char *weightfile) | ||
| { | ||
| char *base = basecfg(cfgfile); | ||
| srand(time(0)); | ||
| network *net = load_network(cfgfile, weightfile); | ||
| net->output_layer = net->n - 1; | ||
|
|
||
| struct list *options = read_data_cfg(datacfg); | ||
| char *backup_directory = option_find_str(options, "backup", "/backup/"); | ||
| char *train_list = option_find_str(options, "train", "data/train.list"); | ||
|
|
||
| int train_set_size = 0; | ||
| char **paths = NULL; | ||
| struct list *plist = NULL; | ||
| plist = get_paths(train_list); | ||
| paths = (char **)list_to_array(plist); | ||
| train_set_size = plist->size; | ||
| train_set_size = option_find_int(options, "train_num", train_set_size); | ||
| double time; | ||
| printf("Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay); | ||
| int max_epoch = (int)net->max_batches * net->batch / train_set_size; | ||
| printf("image net has seen: %lu, train_set_size: %d, max_batches of net: %d, net->classes: %d," | ||
| "net->batch: %d, max_epoch: %d\n\n", | ||
| net->seen, train_set_size, net->max_batches, net->classes, net->batch, max_epoch); | ||
|
|
||
| net->batch_train = net->seen / net->batch; | ||
| net->epoch = net->seen / train_set_size; | ||
| float avg_loss = -1; | ||
| float max_accuracy = -1; | ||
| int max_accuracy_batch = 0; | ||
| while(net->batch_train < net->max_batches){ | ||
| time = what_time_is_it_now(); | ||
| update_current_learning_rate(net); | ||
| batch_detect train; | ||
| train = load_data_detection(net->batch, paths, train_set_size, net->w, net->h, 30, net->classes, | ||
| net->jitter, net->hue, net->saturation, net->exposure, net->test); | ||
| /* | ||
| int k; | ||
| for(k = 0; k < l.max_boxes; ++k){ | ||
| box b = float_to_box(train.y.vals[10] + 1 + k*5); | ||
| if(!b.x) break; | ||
| printf("loaded: %f %f %f %f\n", b.x, b.y, b.w, b.h); | ||
| } | ||
| */ | ||
| /* | ||
| int zz; | ||
| for(zz = 0; zz < train.X.cols; ++zz){ | ||
| image im = float_to_image(net->w, net->h, 3, train.X.vals[zz]); | ||
| int k; | ||
| for(k = 0; k < l.max_boxes; ++k){ | ||
| box b = float_to_box(train.y.vals[zz] + k*5, 1); | ||
| printf("%f %f %f %f\n", b.x, b.y, b.w, b.h); | ||
| draw_bbox(im, b, 1, 1,0,0); | ||
| } | ||
| show_image(im, "truth11"); | ||
| cvWaitKey(0); | ||
| save_image(im, "truth11"); | ||
| } | ||
| printf("Loaded: %lf seconds\n", what_time_is_it_now()-time); | ||
| for(int zz = 0; zz < train.X.rows; ++zz){ | ||
| image im = float_to_image(net->w, net->h, 3, train.X.vals[zz]); | ||
| for(int k = 0; k < l.max_boxes; ++k){ | ||
| box b = float_to_box(train.y.vals[zz] + k*5, 1); | ||
| if(!b.x) break; | ||
| //printf("box value: %f %f %f %f\n", b.x, b.y, b.w, b.h); | ||
| draw_bbox(im, b, 1, 1,1,1); | ||
| } | ||
| //show_image(im, "truth11"); | ||
| //cvWaitKey(0); | ||
| save_image(im, "truth11"); | ||
| } | ||
| */ | ||
| //train_network_detect(net, train); | ||
| free_batch_detect(train); | ||
|
|
||
| int epoch_old = net->epoch; | ||
| net->epoch = net->seen / train_set_size; | ||
| float loss = net->loss; | ||
| if(loss > 999999 || loss < -999999 || loss != loss || (loss + 1.0 == loss)) { // NaN ≠NaN, Inf + 1 = Inf | ||
| fprintf(stderr, "\n\nloss too large: %f, exit\n", loss); | ||
| exit(-1); | ||
| } | ||
| if(avg_loss < 0){ | ||
| avg_loss = loss; | ||
| } else { | ||
| avg_loss = avg_loss*.9 + loss*.1; | ||
| } | ||
| if(net->correct_num / (net->accuracy_count + 0.00001F) > max_accuracy){ | ||
| max_accuracy = net->correct_num / (net->accuracy_count + 0.00001F); | ||
| max_accuracy_batch = net->batch_train; | ||
| } | ||
| printf("epoch: %d, batch: %d, accuracy: %.4f, loss: %f, avg_loss: %.2f, learning_rate: %.8f, %.4f s, " | ||
| "seen %lu images, max_accuracy: %.4f\n", net->epoch+1, net->batch_train, | ||
| net->correct_num / (net->accuracy_count + 0.00001F), | ||
| loss, avg_loss, net->learning_rate, what_time_is_it_now()-time, net->seen, max_accuracy); | ||
| if(epoch_old != net->epoch){ | ||
| char buff[256]; | ||
| sprintf(buff, "%s/%s_%06d.weights", backup_directory, base, net->epoch); | ||
| save_weights(net, buff); | ||
| } | ||
| } | ||
| printf("max_accuracy_batch: %d\n", max_accuracy_batch); | ||
| char buff[256]; | ||
| sprintf(buff, "%s/%s_final.weights", backup_directory, base); | ||
| save_weights(net, buff); | ||
| free_network(net); | ||
| printf("max_accuracy_batch: %d\n", max_accuracy_batch); | ||
| if(paths) free_ptr(paths); | ||
| if(plist){ | ||
| free_list_contents(plist); | ||
| free_list(plist); | ||
| } | ||
| free(base); | ||
| } | ||
| /* | ||
| void print_detector_detections(FILE **fps, char *id, detection *dets, int total, int classes, int w, int h) | ||
| { | ||
| int i, j; | ||
| for(i = 0; i < total; ++i){ | ||
| float xmin = dets[i].bbox.x - dets[i].bbox.w/2. + 1; | ||
| float xmax = dets[i].bbox.x + dets[i].bbox.w/2. + 1; | ||
| float ymin = dets[i].bbox.y - dets[i].bbox.h/2. + 1; | ||
| float ymax = dets[i].bbox.y + dets[i].bbox.h/2. + 1; | ||
| if (xmin < 1) xmin = 1; | ||
| if (ymin < 1) ymin = 1; | ||
| if (xmax > w) xmax = w; | ||
| if (ymax > h) ymax = h; | ||
| for(j = 0; j < classes; ++j){ | ||
| if (dets[i].prob[j]) fprintf(fps[j], "%s %f %f %f %f %f\n", id, dets[i].prob[j], | ||
| xmin/w, ymin/h, xmax/w, ymax/h); | ||
| } | ||
| } | ||
| } | ||
| void validate_detector(char *datacfg, char *cfgfile, char *weightfile) | ||
| { | ||
| int j; | ||
| list *options = read_data_cfg(datacfg); | ||
| char *valid_images = option_find_str(options, "valid", "data/train.list"); | ||
| char *name_list = option_find_str(options, "names", "data/names.list"); | ||
| char *prefix = option_find_str(options, "results", "results"); | ||
| char **names = get_labels(name_list); | ||
| char *mapf = option_find_str(options, "map", 0); | ||
| int *map = 0; | ||
| if (mapf) map = read_map(mapf); | ||
| network *net = load_network(cfgfile, weightfile, 0); | ||
| set_batch_network(net, 1); | ||
| fprintf(stderr, "Learning Rate: %g, Momentum: %g, Decay: %g\n", net->learning_rate, net->momentum, net->decay); | ||
| srand(time(0)); | ||
| list *plist = get_paths(valid_images); | ||
| char **paths = (char **)list_to_array(plist); | ||
| layer l = net->layers[net->n-1]; | ||
| int classes = l.classes; | ||
| char buff[1024]; | ||
| char *type = option_find_str(options, "eval", "voc"); | ||
| FILE *fp = 0; | ||
| FILE **fps = 0; | ||
| int coco = 0; | ||
| int imagenet = 0; | ||
| char *outfile = 0; | ||
| if(0==strcmp(type, "coco")){ | ||
| if(!outfile) outfile = "coco_results"; | ||
| snprintf(buff, 1024, "%s/%s.json", prefix, outfile); | ||
| fp = fopen(buff, "w"); | ||
| fprintf(fp, "[\n"); | ||
| coco = 1; | ||
| } else if(0==strcmp(type, "imagenet")){ | ||
| if(!outfile) outfile = "imagenet-detection"; | ||
| snprintf(buff, 1024, "%s/%s.txt", prefix, outfile); | ||
| fp = fopen(buff, "w"); | ||
| imagenet = 1; | ||
| classes = 200; | ||
| } else { | ||
| if(!outfile) outfile = "comp4_det_test_"; | ||
| fps = calloc(classes, sizeof(FILE *)); | ||
| for(j = 0; j < classes; ++j){ | ||
| snprintf(buff, 1024, "%s/%s%s.txt", prefix, outfile, names[j]); | ||
| fps[j] = fopen(buff, "w"); | ||
| } | ||
| } | ||
| int m = plist->size; | ||
| int i=0; | ||
| int t; | ||
| float thresh = .3; | ||
| float nms = .45; | ||
| int nthreads = 4; | ||
| image *val = calloc(nthreads, sizeof(image)); | ||
| image *val_resized = calloc(nthreads, sizeof(image)); | ||
| image *buf = calloc(nthreads, sizeof(image)); | ||
| image *buf_resized = calloc(nthreads, sizeof(image)); | ||
| pthread_t *thr = calloc(nthreads, sizeof(pthread_t)); | ||
| load_args args = {0}; | ||
| args.w = net->w; | ||
| args.h = net->h; | ||
| //args.type = IMAGE_DATA; | ||
| args.type = LETTERBOX_DATA; | ||
| for(t = 0; t < nthreads; ++t){ | ||
| args.path = paths[i+t]; | ||
| args.im = &buf[t]; | ||
| args.resized = &buf_resized[t]; | ||
| thr[t] = load_data_in_thread(args); | ||
| } | ||
| double start = what_time_is_it_now(); | ||
| for(i = nthreads; i < m+nthreads; i += nthreads){ | ||
| fprintf(stderr, "%d\n", i); | ||
| for(t = 0; t < nthreads && i+t-nthreads < m; ++t){ | ||
| pthread_join(thr[t], 0); | ||
| val[t] = buf[t]; | ||
| val_resized[t] = buf_resized[t]; | ||
| } | ||
| for(t = 0; t < nthreads && i+t < m; ++t){ | ||
| args.path = paths[i+t]; | ||
| args.im = &buf[t]; | ||
| args.resized = &buf_resized[t]; | ||
| thr[t] = load_data_in_thread(args); | ||
| } | ||
| for(t = 0; t < nthreads && i+t-nthreads < m; ++t){ | ||
| char *path = paths[i+t-nthreads]; | ||
| char *id = basecfg(path); | ||
| float *X = val_resized[t].data; | ||
| network_predict(net, X); | ||
| int w = val[t].w; | ||
| int h = val[t].h; | ||
| int nboxes = 0; | ||
| detection *dets = get_network_boxes(net, w, h, thresh, .5, map, 0, &nboxes); | ||
| if (nms) do_nms_sort(dets, nboxes, classes, nms); | ||
| print_detector_detections(fps, path, dets, nboxes, classes, w, h); | ||
| free_detections(dets, nboxes); | ||
| free(id); | ||
| free_image(val[t]); | ||
| free_image(val_resized[t]); | ||
| } | ||
| } | ||
| for(j = 0; j < classes; ++j){ | ||
| if(fps) fclose(fps[j]); | ||
| } | ||
| if(coco){ | ||
| fseek(fp, -2, SEEK_CUR); | ||
| fprintf(fp, "\n]\n"); | ||
| fclose(fp); | ||
| } | ||
| fprintf(stderr, "Total Detection Time: %f Seconds\n", what_time_is_it_now() - start); | ||
| } | ||
| */ | ||
/*
 * Command-line entry point for the detector sub-command.
 *
 * Expected arguments: argv[2] = mode, argv[3] = data cfg, argv[4] = network
 * cfg, argv[5] = weights (optional).  Only "train" is wired up; "valid" and
 * "recall" are recognised but their implementations are disabled.
 * The total wall time is printed to stderr once a mode has been handled.
 */
void run_detector(int argc, char **argv)
{
    double time_start = what_time_is_it_now();
    const char *prog = (argc > 0 && argv[0]) ? argv[0] : "";
    const char *sub = (argc > 1 && argv[1]) ? argv[1] : "";

    /* argv[4] (the network cfg) is mandatory, so at least 5 arguments are needed. */
    if(argc < 5){
        fprintf(stderr, "usage: %s %s [train/valid] [data cfg] [cfg] [weights (optional)]\n", prog, sub);
        return;
    }

    char *datacfg = argv[3];
    char *cfg = argv[4];
    char *weights = (argc > 5) ? argv[5] : 0;
    if(0==strcmp(argv[2], "train")){
        train_detector(datacfg, cfg, weights);
    } else if(0==strcmp(argv[2], "valid")){
        //validate_detector(datacfg, cfg, weights);
        fprintf(stderr, "%s %s: 'valid' is currently disabled\n", prog, sub);
    } else if(0==strcmp(argv[2], "recall")){
        //validate_detector_recall(cfg, weights);
        fprintf(stderr, "%s %s: 'recall' is currently disabled\n", prog, sub);
    } else {
        fprintf(stderr, "usage: %s %s [train/valid] [data cfg] [cfg] [weights (optional)]\n", prog, sub);
    }
    fprintf(stderr, "\n\ntotal %.2lf seconds\n\n\n", what_time_is_it_now() - time_start);
}
| @@ -77,6 +77,6 @@ void copy_image_into(image src, image dest); | ||
|
|
||
| image get_image_layer(image m, int l); | ||
| void flip_image(image a); | ||
| void fill_image(image m, float s); | ||
| #endif | ||
|
|
||
| @@ -0,0 +1,106 @@ | ||
| #include "upsample_layer.h" | ||
| #include "cuda.h" | ||
| #include "blas.h" | ||
|
|
||
| #include <stdio.h> | ||
|
|
||
| layer make_upsample_layer(int batch, int w, int h, int c, int stride) | ||
| { | ||
| layer l = {0}; | ||
| l.type = UPSAMPLE; | ||
| l.batch = batch; | ||
| l.w = w; | ||
| l.h = h; | ||
| l.c = c; | ||
| l.out_w = w*stride; | ||
| l.out_h = h*stride; | ||
| l.out_c = c; | ||
| if(stride < 0){ | ||
| stride = -stride; | ||
| l.reverse=1; | ||
| l.out_w = w/stride; | ||
| l.out_h = h/stride; | ||
| } | ||
| l.stride = stride; | ||
| l.outputs = l.out_w*l.out_h*l.out_c; | ||
| l.inputs = l.w*l.h*l.c; | ||
| l.delta = calloc(l.outputs*batch, sizeof(float)); | ||
| l.output = calloc(l.outputs*batch, sizeof(float));; | ||
|
|
||
| l.forward = forward_upsample_layer; | ||
| l.backward = backward_upsample_layer; | ||
| #ifdef GPU | ||
| l.forward_gpu = forward_upsample_layer_gpu; | ||
| l.backward_gpu = backward_upsample_layer_gpu; | ||
|
|
||
| l.delta_gpu = cuda_make_array(l.delta, l.outputs*batch); | ||
| l.output_gpu = cuda_make_array(l.output, l.outputs*batch); | ||
| #endif | ||
| if(l.reverse) fprintf(stderr, "downsample %2dx %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c); | ||
| else fprintf(stderr, "upsample %2dx %4d x%4d x%4d -> %4d x%4d x%4d\n", stride, w, h, c, l.out_w, l.out_h, l.out_c); | ||
| return l; | ||
| } | ||
|
|
||
| void resize_upsample_layer(layer *l, int w, int h) | ||
| { | ||
| l->w = w; | ||
| l->h = h; | ||
| l->out_w = w*l->stride; | ||
| l->out_h = h*l->stride; | ||
| if(l->reverse){ | ||
| l->out_w = w/l->stride; | ||
| l->out_h = h/l->stride; | ||
| } | ||
| l->outputs = l->out_w*l->out_h*l->out_c; | ||
| l->inputs = l->h*l->w*l->c; | ||
| l->delta = realloc(l->delta, l->outputs*l->batch*sizeof(float)); | ||
| l->output = realloc(l->output, l->outputs*l->batch*sizeof(float)); | ||
|
|
||
| #ifdef GPU | ||
| cuda_free(l->output_gpu); | ||
| cuda_free(l->delta_gpu); | ||
| l->output_gpu = cuda_make_array(l->output, l->outputs*l->batch); | ||
| l->delta_gpu = cuda_make_array(l->delta, l->outputs*l->batch); | ||
| #endif | ||
|
|
||
| } | ||
|
|
||
| void forward_upsample_layer(const layer l, network net) | ||
| { | ||
| fill_cpu(l.outputs*l.batch, 0, l.output, 1); | ||
| if(l.reverse){ | ||
| upsample_cpu(l.output, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, net.input); | ||
| }else{ | ||
| upsample_cpu(net.input, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output); | ||
| } | ||
| } | ||
|
|
||
| void backward_upsample_layer(const layer l, network net) | ||
| { | ||
| if(l.reverse){ | ||
| upsample_cpu(l.delta, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, net.delta); | ||
| }else{ | ||
| upsample_cpu(net.delta, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta); | ||
| } | ||
| } | ||
|
|
||
| #ifdef GPU | ||
| void forward_upsample_layer_gpu(const layer l, network net) | ||
| { | ||
| fill_gpu(l.outputs*l.batch, 0, l.output_gpu, 1); | ||
| if(l.reverse){ | ||
| upsample_gpu(l.output_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 0, l.scale, net.input_gpu); | ||
| }else{ | ||
| upsample_gpu(net.input_gpu, l.w, l.h, l.c, l.batch, l.stride, 1, l.scale, l.output_gpu); | ||
| } | ||
| } | ||
|
|
||
| void backward_upsample_layer_gpu(const layer l, network net) | ||
| { | ||
| if(l.reverse){ | ||
| upsample_gpu(l.delta_gpu, l.out_w, l.out_h, l.c, l.batch, l.stride, 1, l.scale, net.delta_gpu); | ||
| }else{ | ||
| upsample_gpu(net.delta_gpu, l.w, l.h, l.c, l.batch, l.stride, 0, l.scale, l.delta_gpu); | ||
| } | ||
| } | ||
| #endif |
| @@ -0,0 +1,15 @@ | ||
| #ifndef UPSAMPLE_LAYER_H | ||
| #define UPSAMPLE_LAYER_H | ||
| #include "darknet.h" | ||
|
|
||
| layer make_upsample_layer(int batch, int w, int h, int c, int stride); | ||
| void forward_upsample_layer(const layer l, network net); | ||
| void backward_upsample_layer(const layer l, network net); | ||
| void resize_upsample_layer(layer *l, int w, int h); | ||
|
|
||
| #ifdef GPU | ||
| void forward_upsample_layer_gpu(const layer l, network net); | ||
| void backward_upsample_layer_gpu(const layer l, network net); | ||
| #endif | ||
|
|
||
| #endif |
| @@ -0,0 +1,374 @@ | ||
| #include "yolo_layer.h" | ||
| #include "activations.h" | ||
| #include "blas.h" | ||
| #include "box.h" | ||
| #include "cuda.h" | ||
| #include "utils.h" | ||
|
|
||
| #include <stdio.h> | ||
| #include <assert.h> | ||
| #include <string.h> | ||
| #include <stdlib.h> | ||
|
|
||
| layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes) | ||
| { | ||
| int i; | ||
| layer l = {0}; | ||
| l.type = YOLO; | ||
|
|
||
| l.n = n; | ||
| l.total = total; | ||
| l.batch = batch; | ||
| l.h = h; | ||
| l.w = w; | ||
| l.c = n*(classes + 4 + 1); | ||
| l.out_w = l.w; | ||
| l.out_h = l.h; | ||
| l.out_c = l.c; | ||
| l.classes = classes; | ||
| l.cost = calloc(1, sizeof(float)); | ||
| l.biases = calloc(total*2, sizeof(float)); | ||
| if(mask) l.mask = mask; | ||
| else{ | ||
| l.mask = calloc(n, sizeof(int)); | ||
| for(i = 0; i < n; ++i){ | ||
| l.mask[i] = i; | ||
| } | ||
| } | ||
| l.bias_updates = calloc(n*2, sizeof(float)); | ||
| l.outputs = h*w*n*(classes + 4 + 1); | ||
| l.inputs = l.outputs; | ||
| l.truths = 90*(4 + 1); | ||
| l.delta = calloc(batch*l.outputs, sizeof(float)); | ||
| l.output = calloc(batch*l.outputs, sizeof(float)); | ||
| for(i = 0; i < total*2; ++i){ | ||
| l.biases[i] = .5; | ||
| } | ||
|
|
||
| l.forward = forward_yolo_layer; | ||
| l.backward = backward_yolo_layer; | ||
| #ifdef GPU | ||
| l.forward_gpu = forward_yolo_layer_gpu; | ||
| l.backward_gpu = backward_yolo_layer_gpu; | ||
| l.output_gpu = cuda_make_array(l.output, batch*l.outputs); | ||
| l.delta_gpu = cuda_make_array(l.delta, batch*l.outputs); | ||
| #endif | ||
|
|
||
| fprintf(stderr, "yolo\n"); | ||
| srand(0); | ||
|
|
||
| return l; | ||
| } | ||
|
|
||
| void resize_yolo_layer(layer *l, int w, int h) | ||
| { | ||
| l->w = w; | ||
| l->h = h; | ||
|
|
||
| l->outputs = h*w*l->n*(l->classes + 4 + 1); | ||
| l->inputs = l->outputs; | ||
|
|
||
| l->output = realloc(l->output, l->batch*l->outputs*sizeof(float)); | ||
| l->delta = realloc(l->delta, l->batch*l->outputs*sizeof(float)); | ||
|
|
||
| #ifdef GPU | ||
| cuda_free(l->delta_gpu); | ||
| cuda_free(l->output_gpu); | ||
|
|
||
| l->delta_gpu = cuda_make_array(l->delta, l->batch*l->outputs); | ||
| l->output_gpu = cuda_make_array(l->output, l->batch*l->outputs); | ||
| #endif | ||
| } | ||
|
|
||
| box get_yolo_box(float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, int stride) | ||
| { | ||
| box b; | ||
| b.x = (i + x[index + 0*stride]) / lw; | ||
| b.y = (j + x[index + 1*stride]) / lh; | ||
| b.w = exp(x[index + 2*stride]) * biases[2*n] / w; | ||
| b.h = exp(x[index + 3*stride]) * biases[2*n+1] / h; | ||
| return b; | ||
| } | ||
|
|
||
| float delta_yolo_box(box truth, float *x, float *biases, int n, int index, int i, int j, int lw, int lh, int w, int h, float *delta, float scale, int stride) | ||
| { | ||
| box pred = get_yolo_box(x, biases, n, index, i, j, lw, lh, w, h, stride); | ||
| float iou = box_iou(pred, truth); | ||
|
|
||
| float tx = (truth.x*lw - i); | ||
| float ty = (truth.y*lh - j); | ||
| float tw = log(truth.w*w / biases[2*n]); | ||
| float th = log(truth.h*h / biases[2*n + 1]); | ||
|
|
||
| delta[index + 0*stride] = scale * (tx - x[index + 0*stride]); | ||
| delta[index + 1*stride] = scale * (ty - x[index + 1*stride]); | ||
| delta[index + 2*stride] = scale * (tw - x[index + 2*stride]); | ||
| delta[index + 3*stride] = scale * (th - x[index + 3*stride]); | ||
| return iou; | ||
| } | ||
|
|
||
|
|
||
/*
 * Write class deltas for one prediction.
 *
 * output/delta - layer buffers; class k of this prediction lives at
 *                index + stride*k
 * class        - ground-truth class id
 * classes      - number of classes
 * avg_cat      - optional accumulator; when non-NULL the current output of
 *                the true class is added to it
 *
 * If delta[index] is already non-zero, only the true class entry is set
 * (to 1 - output) and the other entries are left alone.  Otherwise every
 * class gets target - output, with target 1 for the true class and 0 for
 * the rest.
 */
void delta_yolo_class(float *output, float *delta, int index, int class, int classes, int stride, float *avg_cat)
{
    int k;
    if (!delta[index]) {
        for (k = 0; k < classes; ++k) {
            const int pos = index + stride*k;
            const int target = (k == class) ? 1 : 0;
            delta[pos] = target - output[pos];
            if (target && avg_cat) *avg_cat += output[pos];
        }
        return;
    }

    /* Deltas were already written for this slot: only touch the true class. */
    const int hit = index + stride*class;
    delta[hit] = 1 - output[hit];
    if (avg_cat) *avg_cat += output[hit];
}
|
|
||
| static int entry_index(layer l, int batch, int location, int entry) | ||
| { | ||
| int n = location / (l.w*l.h); | ||
| int loc = location % (l.w*l.h); | ||
| return batch*l.outputs + n*l.w*l.h*(4+l.classes+1) + entry*l.w*l.h + loc; | ||
| } | ||
|
|
||
| void forward_yolo_layer(const layer l, network net) | ||
| { | ||
| int i,j,b,t,n; | ||
| memcpy(l.output, net.input, l.outputs*l.batch*sizeof(float)); | ||
|
|
||
| #ifndef GPU | ||
| for (b = 0; b < l.batch; ++b){ | ||
| for(n = 0; n < l.n; ++n){ | ||
| int index = entry_index(l, b, n*l.w*l.h, 0); | ||
| activate_array(l.output + index, 2*l.w*l.h, LOGISTIC); | ||
| index = entry_index(l, b, n*l.w*l.h, 4); | ||
| activate_array(l.output + index, (1+l.classes)*l.w*l.h, LOGISTIC); | ||
| } | ||
| } | ||
| #endif | ||
|
|
||
| memset(l.delta, 0, l.outputs * l.batch * sizeof(float)); | ||
| if(!net.train) return; | ||
| float avg_iou = 0; | ||
| float recall = 0; | ||
| float recall75 = 0; | ||
| float avg_cat = 0; | ||
| float avg_obj = 0; | ||
| float avg_anyobj = 0; | ||
| int count = 0; | ||
| int class_count = 0; | ||
| *(l.cost) = 0; | ||
| for (b = 0; b < l.batch; ++b) { | ||
| for (j = 0; j < l.h; ++j) { | ||
| for (i = 0; i < l.w; ++i) { | ||
| for (n = 0; n < l.n; ++n) { | ||
| int box_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 0); | ||
| box pred = get_yolo_box(l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.w*l.h); | ||
| float best_iou = 0; | ||
| int best_t = 0; | ||
| for(t = 0; t < l.max_boxes; ++t){ | ||
| box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1); | ||
| if(!truth.x) break; | ||
| float iou = box_iou(pred, truth); | ||
| if (iou > best_iou) { | ||
| best_iou = iou; | ||
| best_t = t; | ||
| } | ||
| } | ||
| int obj_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4); | ||
| avg_anyobj += l.output[obj_index]; | ||
| l.delta[obj_index] = 0 - l.output[obj_index]; | ||
| if (best_iou > l.ignore_thresh) { | ||
| l.delta[obj_index] = 0; | ||
| } | ||
| if (best_iou > l.truth_thresh) { | ||
| l.delta[obj_index] = 1 - l.output[obj_index]; | ||
|
|
||
| int class = net.truth[best_t*(4 + 1) + b*l.truths + 4]; | ||
| if (l.map) class = l.map[class]; | ||
| int class_index = entry_index(l, b, n*l.w*l.h + j*l.w + i, 4 + 1); | ||
| delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, 0); | ||
| box truth = float_to_box(net.truth + best_t*(4 + 1) + b*l.truths, 1); | ||
| delta_yolo_box(truth, l.output, l.biases, l.mask[n], box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); | ||
| } | ||
| } | ||
| } | ||
| } | ||
| for(t = 0; t < l.max_boxes; ++t){ | ||
| box truth = float_to_box(net.truth + t*(4 + 1) + b*l.truths, 1); | ||
|
|
||
| if(!truth.x) break; | ||
| float best_iou = 0; | ||
| int best_n = 0; | ||
| i = (truth.x * l.w); | ||
| j = (truth.y * l.h); | ||
| box truth_shift = truth; | ||
| truth_shift.x = truth_shift.y = 0; | ||
| for(n = 0; n < l.total; ++n){ | ||
| box pred = {0}; | ||
| pred.w = l.biases[2*n]/net.w; | ||
| pred.h = l.biases[2*n+1]/net.h; | ||
| float iou = box_iou(pred, truth_shift); | ||
| if (iou > best_iou){ | ||
| best_iou = iou; | ||
| best_n = n; | ||
| } | ||
| } | ||
|
|
||
| int mask_n = int_index(l.mask, best_n, l.n); | ||
| if(mask_n >= 0){ | ||
| int box_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 0); | ||
| float iou = delta_yolo_box(truth, l.output, l.biases, best_n, box_index, i, j, l.w, l.h, net.w, net.h, l.delta, (2-truth.w*truth.h), l.w*l.h); | ||
|
|
||
| int obj_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4); | ||
| avg_obj += l.output[obj_index]; | ||
| l.delta[obj_index] = 1 - l.output[obj_index]; | ||
|
|
||
| int class = net.truth[t*(4 + 1) + b*l.truths + 4]; | ||
| if (l.map) class = l.map[class]; | ||
| int class_index = entry_index(l, b, mask_n*l.w*l.h + j*l.w + i, 4 + 1); | ||
| delta_yolo_class(l.output, l.delta, class_index, class, l.classes, l.w*l.h, &avg_cat); | ||
|
|
||
| ++count; | ||
| ++class_count; | ||
| if(iou > .5) recall += 1; | ||
| if(iou > .75) recall75 += 1; | ||
| avg_iou += iou; | ||
| } | ||
| } | ||
| } | ||
| *(l.cost) = pow(mag_array(l.delta, l.outputs * l.batch), 2); | ||
| printf("Region %d Avg IOU: %f, Class: %f, Obj: %f, No Obj: %f, .5R: %f, .75R: %f, count: %d\n", net.index, avg_iou/count, avg_cat/class_count, avg_obj/count, avg_anyobj/(l.w*l.h*l.n*l.batch), recall/count, recall75/count, count); | ||
| } | ||
|
|
||
| void backward_yolo_layer(const layer l, network net) | ||
| { | ||
| axpy_cpu(l.batch*l.inputs, 1, l.delta, 1, net.delta, 1); | ||
| } | ||
|
|
||
| void correct_yolo_boxes(detection *dets, int n, int w, int h, int netw, int neth, int relative) | ||
| { | ||
| int i; | ||
| int new_w=0; | ||
| int new_h=0; | ||
| if (((float)netw/w) < ((float)neth/h)) { | ||
| new_w = netw; | ||
| new_h = (h * netw)/w; | ||
| } else { | ||
| new_h = neth; | ||
| new_w = (w * neth)/h; | ||
| } | ||
| for (i = 0; i < n; ++i){ | ||
| box b = dets[i].bbox; | ||
| b.x = (b.x - (netw - new_w)/2./netw) / ((float)new_w/netw); | ||
| b.y = (b.y - (neth - new_h)/2./neth) / ((float)new_h/neth); | ||
| b.w *= (float)netw/new_w; | ||
| b.h *= (float)neth/new_h; | ||
| if(!relative){ | ||
| b.x *= w; | ||
| b.w *= w; | ||
| b.y *= h; | ||
| b.h *= h; | ||
| } | ||
| dets[i].bbox = b; | ||
| } | ||
| } | ||
|
|
||
| int yolo_num_detections(layer l, float thresh) | ||
| { | ||
| int i, n; | ||
| int count = 0; | ||
| for (i = 0; i < l.w*l.h; ++i){ | ||
| for(n = 0; n < l.n; ++n){ | ||
| int obj_index = entry_index(l, 0, n*l.w*l.h + i, 4); | ||
| if(l.output[obj_index] > thresh){ | ||
| ++count; | ||
| } | ||
| } | ||
| } | ||
| return count; | ||
| } | ||
|
|
||
| void avg_flipped_yolo(layer l) | ||
| { | ||
| int i,j,n,z; | ||
| float *flip = l.output + l.outputs; | ||
| for (j = 0; j < l.h; ++j) { | ||
| for (i = 0; i < l.w/2; ++i) { | ||
| for (n = 0; n < l.n; ++n) { | ||
| for(z = 0; z < l.classes + 4 + 1; ++z){ | ||
| int i1 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + i; | ||
| int i2 = z*l.w*l.h*l.n + n*l.w*l.h + j*l.w + (l.w - i - 1); | ||
| float swap = flip[i1]; | ||
| flip[i1] = flip[i2]; | ||
| flip[i2] = swap; | ||
| if(z == 0){ | ||
| flip[i1] = -flip[i1]; | ||
| flip[i2] = -flip[i2]; | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
| for(i = 0; i < l.outputs; ++i){ | ||
| l.output[i] = (l.output[i] + flip[i])/2.; | ||
| } | ||
| } | ||
|
|
||
| int get_yolo_detections(layer l, int w, int h, int netw, int neth, float thresh, int *map, int relative, detection *dets) | ||
| { | ||
| int i,j,n; | ||
| float *predictions = l.output; | ||
| if (l.batch == 2) avg_flipped_yolo(l); | ||
| int count = 0; | ||
| for (i = 0; i < l.w*l.h; ++i){ | ||
| int row = i / l.w; | ||
| int col = i % l.w; | ||
| for(n = 0; n < l.n; ++n){ | ||
| int obj_index = entry_index(l, 0, n*l.w*l.h + i, 4); | ||
| float objectness = predictions[obj_index]; | ||
| if(objectness <= thresh) continue; | ||
| int box_index = entry_index(l, 0, n*l.w*l.h + i, 0); | ||
| dets[count].bbox = get_yolo_box(predictions, l.biases, l.mask[n], box_index, col, row, l.w, l.h, netw, neth, l.w*l.h); | ||
| dets[count].objectness = objectness; | ||
| dets[count].classes = l.classes; | ||
| for(j = 0; j < l.classes; ++j){ | ||
| int class_index = entry_index(l, 0, n*l.w*l.h + i, 4 + 1 + j); | ||
| float prob = objectness*predictions[class_index]; | ||
| dets[count].prob[j] = (prob > thresh) ? prob : 0; | ||
| } | ||
| ++count; | ||
| } | ||
| } | ||
| correct_yolo_boxes(dets, count, w, h, netw, neth, relative); | ||
| return count; | ||
| } | ||
|
|
||
| #ifdef GPU | ||
|
|
||
| void forward_yolo_layer_gpu(const layer l, network net) | ||
| { | ||
| copy_gpu(l.batch*l.inputs, net.input_gpu, 1, l.output_gpu, 1); | ||
| int b, n; | ||
| for (b = 0; b < l.batch; ++b){ | ||
| for(n = 0; n < l.n; ++n){ | ||
| int index = entry_index(l, b, n*l.w*l.h, 0); | ||
| activate_array_gpu(l.output_gpu + index, 2*l.w*l.h, LOGISTIC); | ||
| index = entry_index(l, b, n*l.w*l.h, 4); | ||
| activate_array_gpu(l.output_gpu + index, (1+l.classes)*l.w*l.h, LOGISTIC); | ||
| } | ||
| } | ||
| if(!net.train || l.onlyforward){ | ||
| cuda_pull_array(l.output_gpu, l.output, l.batch*l.outputs); | ||
| return; | ||
| } | ||
|
|
||
| cuda_pull_array(l.output_gpu, net.input, l.batch*l.inputs); | ||
| forward_yolo_layer(l, net); | ||
| cuda_push_array(l.delta_gpu, l.delta, l.batch*l.outputs); | ||
| } | ||
|
|
||
| void backward_yolo_layer_gpu(const layer l, network net) | ||
| { | ||
| axpy_gpu(l.batch*l.inputs, 1, l.delta_gpu, 1, net.delta_gpu, 1); | ||
| } | ||
| #endif | ||
|
|
| @@ -0,0 +1,19 @@ | ||
| #ifndef YOLO_LAYER_H | ||
| #define YOLO_LAYER_H | ||
|
|
||
| #include "darknet.h" | ||
| #include "layer.h" | ||
| #include "network.h" | ||
|
|
||
| layer make_yolo_layer(int batch, int w, int h, int n, int total, int *mask, int classes); | ||
| void forward_yolo_layer(const layer l, network net); | ||
| void backward_yolo_layer(const layer l, network net); | ||
| void resize_yolo_layer(layer *l, int w, int h); | ||
| int yolo_num_detections(layer l, float thresh); | ||
|
|
||
| #ifdef GPU | ||
| void forward_yolo_layer_gpu(const layer l, network net); | ||
| void backward_yolo_layer_gpu(layer l, network net); | ||
| #endif | ||
|
|
||
| #endif |
| @@ -0,0 +1,152 @@ | ||
| [network] | ||
| batch=1 | ||
| width=416 | ||
| height=416 | ||
| channels=3 | ||
|
|
||
| saturation = 1.5 | ||
| exposure = 1.5 | ||
| hue=.1 | ||
| jitter=.3 | ||
|
|
||
| learning_rate=0.001 | ||
| momentum=0.9 | ||
| decay=0.0005 | ||
| max_batches = 7002 | ||
| policy=steps | ||
| steps=5000,6000 | ||
| scales=.1,.1 | ||
|
|
||
| [convolutional] | ||
| batch_normalize=1 | ||
| filters=16 | ||
| size=3 | ||
| stride=1 | ||
| pad=1 | ||
| activation=leaky | ||
|
|
||
| [maxpool] | ||
| size=2 | ||
| stride=2 | ||
|
|
||
| [convolutional] | ||
| batch_normalize=1 | ||
| filters=32 | ||
| size=3 | ||
| stride=1 | ||
| pad=1 | ||
| activation=leaky | ||
|
|
||
| [maxpool] | ||
| size=2 | ||
| stride=2 | ||
|
|
||
| [convolutional] | ||
| batch_normalize=1 | ||
| filters=64 | ||
| size=3 | ||
| stride=1 | ||
| pad=1 | ||
| activation=leaky | ||
|
|
||
| [maxpool] | ||
| size=2 | ||
| stride=2 | ||
|
|
||
| [convolutional] | ||
| batch_normalize=1 | ||
| filters=128 | ||
| size=3 | ||
| stride=1 | ||
| pad=1 | ||
| activation=leaky | ||
|
|
||
| [maxpool] | ||
| size=2 | ||
| stride=2 | ||
|
|
||
| [convolutional] | ||
| batch_normalize=1 | ||
| filters=256 | ||
| size=3 | ||
| stride=1 | ||
| pad=1 | ||
| activation=leaky | ||
|
|
||
| [maxpool] | ||
| size=2 | ||
| stride=2 | ||
|
|
||
| [convolutional] | ||
| batch_normalize=1 | ||
| filters=512 | ||
| size=3 | ||
| stride=1 | ||
| pad=1 | ||
| activation=leaky | ||
|
|
||
| [maxpool] | ||
| size=2 | ||
| stride=1 | ||
|
|
||
| [convolutional] | ||
| batch_normalize=1 | ||
| filters=1024 | ||
| size=3 | ||
| stride=1 | ||
| pad=1 | ||
| activation=leaky | ||
|
|
||
| ########### | ||
|
|
||
| [convolutional] | ||
| batch_normalize=1 | ||
| filters=256 | ||
| size=1 | ||
| stride=1 | ||
| pad=1 | ||
| activation=leaky | ||
|
|
||
| [convolutional] | ||
| batch_normalize=1 | ||
| filters=512 | ||
| size=3 | ||
| stride=1 | ||
| pad=1 | ||
| activation=leaky | ||
|
|
||
| [convolutional] | ||
| size=1 | ||
| stride=1 | ||
| pad=1 | ||
| filters=18 | ||
| activation=linear | ||
|
|
||
|
|
||
| [route] | ||
| layers = -4 | ||
|
|
||
| [convolutional] | ||
| batch_normalize=1 | ||
| filters=128 | ||
| size=1 | ||
| stride=1 | ||
| pad=1 | ||
| activation=leaky | ||
|
|
||
|
|
||
|
|
||
| [convolutional] | ||
| batch_normalize=1 | ||
| filters=256 | ||
| size=3 | ||
| stride=1 | ||
| pad=1 | ||
| activation=leaky | ||
|
|
||
| [convolutional] | ||
| size=1 | ||
| stride=1 | ||
| pad=1 | ||
| filters=18 | ||
| activation=linear |