Skip to content
Permalink
Browse files

dwalk: add default file histogram

When file_histogram option is used this creates a file histogram
of all walked files. This option does not require any range inputs.
It will give the user a count of files in each bin. The bins
are created based on the max file size. So if the Max file size
is between 2^20 - 2^30, then this is the last bin. The bins
start with the first bin for 0 byte files, then goes up from
2^10, 2^20, 2^30, etc. The bins are created dynamically based
on the max file size.

Signed-off-by: Danielle Sikich <sikich1@llnl.gov>
  • Loading branch information...
Danielle Sikich
Danielle Sikich committed Mar 1, 2019
1 parent 3d91ba7 commit 0cd8eec514cd2ad9a5442a9df78803700cc841e9
Showing with 129 additions and 47 deletions.
  1. +1 −1 cmake/MFU_ADD_TOOL.cmake
  2. +12 −0 doc/rst/dwalk.1.rst
  3. +116 −46 src/dwalk/dwalk.c
@@ -1,7 +1,7 @@
FUNCTION(MFU_ADD_TOOL name)

ADD_EXECUTABLE(${name} ${name}.c)
TARGET_LINK_LIBRARIES(${name} mfu)
TARGET_LINK_LIBRARIES(${name} mfu m)
SET_TARGET_PROPERTIES(${name} PROPERTIES C_STANDARD 99)
INSTALL(TARGETS ${name} DESTINATION ${CMAKE_INSTALL_BINDIR})

@@ -54,6 +54,18 @@ OPTIONS
bytes, between 1-80 bytes, between 81-99 bytes, and 100 bytes or
greater.

.. option:: -f, --file-histogram

Creates a file histogram without requiring the user to provide
the bin sizes. The bins are created dynamically based on the
max file size. The first bin is always for only zero byte
files, and the rest go up until the max file size is included
in the very last bin. It always goes up by orders of magnitude
in powers of two. So, an example of bin separators would be:
0, 2^10, 2^20, 2^30. Assuming the max file size was somewhere
within the 2^20 - 2^30 range. The histogram also includes both
files and directories.

.. option:: -p, --print

Print files to the screen.
@@ -18,6 +18,7 @@
#include <assert.h>
#include <inttypes.h>
#include <string.h>
#include <math.h>

#include "libcircle.h"
#include "dtcmp.h"
@@ -39,18 +40,69 @@ struct distribute_option {
uint64_t separators[MAX_DISTRIBUTE_SEPARATORS];
};

static int print_flist_distribution(struct distribute_option *option, mfu_flist* pflist, int rank)
static void create_default_separators(struct distribute_option *option,
mfu_flist* flist,
uint64_t* size,
int* separators,
uint64_t* global_max_file_size)
{
/* get local max file size for Allreduce */
uint64_t local_max_file_size = 0;
for (int i = 0; i < *size; i++) {
uint64_t file_size = mfu_flist_file_get_size(*flist, i);
if (file_size > local_max_file_size) {
local_max_file_size = file_size;
}
}

/* get the max file size across all ranks */
MPI_Allreduce(&local_max_file_size, global_max_file_size, 1,
MPI_UINT64_T, MPI_MAX, MPI_COMM_WORLD);

/* print and convert max file size to appropriate units */
double max_size_tmp;
const char* max_size_units;
mfu_format_bytes(*global_max_file_size, &max_size_tmp, &max_size_units);
printf("Max File Size: %.3lf %s\n", max_size_tmp, max_size_units);

/* round next_pow_2 to next multiple of 10 */
uint64_t max_magnitude_bin = (ceil(log2(*global_max_file_size) / 10 )) * 10;

/* get bin ranges based on max file size */
option->separators[0] = 1;

/* plus one is for zero count bin */
*separators = max_magnitude_bin / 10;
int power = 10;
for (int i = 1; power <= max_magnitude_bin; i++) {
option->separators[i] = pow(2, power);
power+=10;
}
}

static int print_flist_distribution(int file_histogram,
struct distribute_option *option,
mfu_flist* pflist, int rank)
{
/* file list to use */
mfu_flist flist = *pflist;

/* get local size for each rank */
/* get local size for each rank, and max file sizes */
uint64_t size = mfu_flist_size(flist);
uint64_t global_max_file_size;

int separators = 0;
if (file_histogram) {
/* create default separators */
create_default_separators(option, &flist, &size, &separators,
&global_max_file_size);
} else {
separators = option->separator_number;
}

/* allocate a count for each bin, initialize the bin counts to 0
* it is separator + 1 because the last bin is the last separator
* to the DISTRIBUTE_MAX */
int separators = option->separator_number;
uint64_t* dist = (uint64_t*) MFU_MALLOC((separators + 1) * sizeof(uint64_t));

/* initialize the bin counts to 0 */
@@ -94,24 +146,39 @@ static int print_flist_distribution(struct distribute_option *option, mfu_flist*
if (rank == 0) {
/* number of files in a bin */
uint64_t number;
printf("Range\tNumber\n");
for (int i = 0; i <= option->separator_number; i++) {
printf("[");
double size_tmp;
const char* size_units;
printf("%-27s %s\n", "Range", "Number");
for (int i = 0; i <= separators; i++) {
printf("%s", "[ ");
if (i == 0) {
printf("0");
printf("%7.3lf %2s", 0.000, "B");
} else {
printf("%"PRIu64, option->separators[i - 1] + 1);
mfu_format_bytes((uint64_t)option->separators[i - 1],
&size_tmp, &size_units);
printf("%7.3lf %2s", size_tmp, size_units);
}

printf("~");
printf("%s", " - ");

if (i == option->separator_number) {
number = disttotal[i];
printf("MAX]\t%"PRIu64"\n", number);
if (file_histogram) {
mfu_format_bytes((uint64_t)option->separators[i],
&size_tmp, &size_units);
number = disttotal[i];
mfu_format_bytes((uint64_t)option->separators[i],
&size_tmp, &size_units);
printf("%7.3lf %2s ) %"PRIu64"\n", size_tmp,
size_units, number);
} else {
number = disttotal[i];
printf("%"PRIu64"]\t%"PRIu64"\n",
option->separators[i], number);
if (i == separators) {
number = disttotal[i];
printf("%10s ) %"PRIu64"\n", "MAX", number);
} else {
number = disttotal[i];
mfu_format_bytes((uint64_t)option->separators[i],
&size_tmp, &size_units);
printf("%7.3lf %2s ) %"PRIu64"\n", size_tmp, size_units, number);
}
}
}
}
@@ -123,10 +190,7 @@ static int print_flist_distribution(struct distribute_option *option, mfu_flist*
return 0;
}

/*
* Search the right position to insert the separator
* If the separator exists already, return failure
* Otherwise, locate the right position, and move the array forward to
/* * Search the right position to insert the separator * If the separator exists already, return failure * Otherwise, locate the right position, and move the array forward to
* save the separator.
*/
static int distribute_separator_add(struct distribute_option *option, uint64_t separator)
@@ -237,16 +301,16 @@ static void print_usage(void)
printf("Usage: dwalk [options] <path> ...\n");
printf("\n");
printf("Options:\n");
printf(" -i, --input <file> - read list from file\n");
printf(" -o, --output <file> - write processed list to file in binary format\n");
printf(" -t, --text - use with -o; write processed list to file in ascii format\n");
printf(" -l, --lite - walk file system without stat\n");
printf(" -s, --sort <fields> - sort output by comma-delimited fields\n");
printf(" -d, --distribution <field>:<separators>\n - print distribution by field\n");
printf(" -p, --print - print files to screen\n");
printf(" -v, --verbose - verbose output\n");
printf(" -q, --quiet - quiet output\n");
printf(" -h, --help - print usage\n");
printf(" -i, --input <file> - read list from file\n");
printf(" -o, --output <file> - write processed list to file\n");
printf(" -l, --lite - walk file system without stat\n");
printf(" -s, --sort <fields> - sort output by comma-delimited fields\n");
printf(" -f, --file_histogram - print default size distribution of items\n");
printf(" -d, --distribution <field>:<separators> - print distribution by field\n");
printf(" -p, --print - print files to screen\n");
printf(" -v, --verbose - verbose output\n");
printf(" -q, --quiet - quiet output\n");
printf(" -h, --help - print usage\n");
printf("\n");
printf("Fields: name,user,group,uid,gid,atime,mtime,ctime,size\n");
printf("\n");
@@ -281,37 +345,40 @@ int main(int argc, char** argv)
* - allow user to sort by different fields
* - allow user to group output (sum all bytes, group by user) */

char* inputname = NULL;
char* outputname = NULL;
char* sortfields = NULL;
char* distribution = NULL;
char* inputname = NULL;
char* outputname = NULL;
char* sortfields = NULL;
char* distribution = NULL;

int file_histogram = 0;
int walk = 0;
int print = 0;
int text = 0;

struct distribute_option option;

/* verbose by default */
mfu_debug_level = MFU_LOG_VERBOSE;

int option_index = 0;
static struct option long_options[] = {
{"input", 1, 0, 'i'},
{"output", 1, 0, 'o'},
{"lite", 0, 0, 'l'},
{"sort", 1, 0, 's'},
{"distribution", 1, 0, 'd'},
{"print", 0, 0, 'p'},
{"verbose", 0, 0, 'v'},
{"quiet", 0, 0, 'q'},
{"help", 0, 0, 'h'},
{"text", 0, 0, 't'},
{"input", 1, 0, 'i'},
{"output", 1, 0, 'o'},
{"lite", 0, 0, 'l'},
{"sort", 1, 0, 's'},
{"distribution", 1, 0, 'd'},
{"file_histogram", 0, 0, 'f'},
{"print", 0, 0, 'p'},
{"verbose", 0, 0, 'v'},
{"help", 0, 0, 'h'},
{"text", 0, 0, 't'},
{0, 0, 0, 0}
};

int usage = 0;
while (1) {
int c = getopt_long(
argc, argv, "i:o:ls:d:pvqht",
argc, argv, "i:o:ls:d:pvhtf",
long_options, &option_index
);

@@ -336,6 +403,9 @@ int main(int argc, char** argv)
case 'd':
distribution = MFU_STRDUP(optarg);
break;
case 'f':
file_histogram = 1;
break;
case 'p':
print = 1;
break;
@@ -513,8 +583,8 @@ int main(int argc, char** argv)
mfu_flist_print_summary(flist);

/* print distribution if user specified this option */
if (distribution != NULL) {
print_flist_distribution(&option, &flist, rank);
if (distribution != NULL || file_histogram) {
print_flist_distribution(file_histogram, &option, &flist, rank);
}

/* write data to cache file */

0 comments on commit 0cd8eec

Please sign in to comment.
You can’t perform that action at this time.