Avoid hashing initial part of files twice in threaded hash mode.

jvirkki committed on Feb 15, 2016
commit 8f6ea02a24773604a0d682c084987d2eeecb00a1 (1 parent: 4537afa)
Showing 8 changed files with 112 additions and 22 deletions.
  1. +11 −0 src/filecompare.c
  2. +5 −2 src/hashlist.c
  3. +7 −2 src/hashlist.h
  4. +36 −18 src/sizelist.c
  5. BIN tests/files6/files.tar.gz
  6. +3 −0 tests/output.56
  7. +28 −0 tests/test.56
  8. +22 −0 tests/test.57
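
The commit marks files already identified as unique by nulling the first byte of their path (see skim_uniques below), so later rounds skip them instead of re-reading and re-hashing their initial bytes. The standalone program below is a minimal sketch of that convention; the names (candidate, skim_uniques_sketch, full_hash_round_sketch) are illustrative only and are not dupd's actual structures.

// Illustrative sketch only (not dupd code): after a partial-hash round,
// entries whose hash appears exactly once are unique. Marking them by
// nulling the path lets the full-hash round skip them, so their initial
// bytes are never read or hashed a second time.
#include <stdio.h>

struct candidate {
  char path[64];        // path[0] == 0 means "already identified as unique"
  unsigned partial;     // stand-in for the hash of the file's first block
};

static void skim_uniques_sketch(struct candidate * c, int n)
{
  for (int i = 0; i < n; i++) {
    int matches = 0;
    for (int j = 0; j < n; j++) {
      if (j != i && c[j].partial == c[i].partial) { matches++; }
    }
    if (matches == 0) { c[i].path[0] = 0; }  // unique: null out the path
  }
}

static void full_hash_round_sketch(struct candidate * c, int n)
{
  for (int i = 0; i < n; i++) {
    if (c[i].path[0] == 0) { continue; }     // skip entries skimmed earlier
    printf("full-hash %s\n", c[i].path);     // only real dup candidates left
  }
}

int main(void)
{
  struct candidate c[3] = {
    { "/tmp/a", 11 }, { "/tmp/b", 22 }, { "/tmp/c", 22 }
  };
  skim_uniques_sketch(c, 3);
  full_hash_round_sketch(c, 3);  // prints /tmp/b and /tmp/c only
  return 0;
}

In dupd itself this skipping shows up as the new path[0] != 0 guards in sizelist.c and the early returns added to compare_two_files and compare_three_files below.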
@@ -102,6 +102,10 @@ static void compare_two_open_files(sqlite3 * dbh,
  */
 void compare_two_files(sqlite3 * dbh, char * path1, char * path2, off_t size)
 {
+  if (path1[0] == 0 || path2[0] == 0) {
+    return;
+  }
+
   if (verbosity >= 4) {
     printf("compare_two_files: [%s] vs [%s]\n", path1, path2);
   }
@@ -138,6 +142,13 @@ void compare_three_files(sqlite3 * dbh,
     printf("compare_three_files: [%s],[%s],[%s]\n", path1, path2, path3);
   }
 
+  // It is possible some of these files have been discarded already by
+  // skim_uniques(), in which case, ignore the first one seen and this
+  // becomes a compare_two.
+  if (path1[0] == 0) { compare_two_files(dbh, path2, path3, size); return; }
+  if (path2[0] == 0) { compare_two_files(dbh, path1, path3, size); return; }
+  if (path3[0] == 0) { compare_two_files(dbh, path1, path2, size); return; }
+
   int bread = 0;
   int file[4];
 
@@ -444,12 +444,15 @@ void print_hash_list(struct hash_list * src)
  * Public function, see header file.
  *
  */
-void record_uniques(sqlite3 * dbh, struct hash_list * src)
+void skim_uniques(sqlite3 * dbh, struct hash_list * src, int record_in_db)
 {
   struct hash_list * p = src;
   while (p != NULL && p->hash_valid) {
     if (p->next_index == 1) {
-      unique_to_db(dbh, *(p->pathptrs), "hashlist");
+      if (record_in_db) {
+        unique_to_db(dbh, *(p->pathptrs), "hashlist");
+      }
+      *(p->pathptrs)[0] = 0;
     }
     p = p->next;
   }
@@ -180,7 +180,12 @@ void print_hash_list(struct hash_list * src);
 
 
 /** ***************************************************************************
- * Export all unique files from the given hash list to the database.
+ * Look for unique files identified in the given hash list.
+ *
+ * The path of unique entries is nulled out (in its path list, given that the
+ * hash list path is a pointer to the path list).
+ *
+ * If record_in_db is true, these files are also saved in the database.
  *
  * Parameters:
  *    dbh - Database handle.
@@ -189,7 +194,7 @@ void print_hash_list(struct hash_list * src);
  * Return: none.
  *
  */
-void record_uniques(sqlite3 * dbh, struct hash_list * src);
+void skim_uniques(sqlite3 * dbh, struct hash_list * src, int record_in_db);
 
 
 #endif
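
A detail worth noting in the updated header comment: the hash list stores pointers into the shared path list, so writing a single 0 byte through that pointer marks the file as discarded for every later pass that walks the same storage (compare_two_files, the reader thread and the hash-list builders all check path[0]). The toy program below illustrates that aliasing; the types and names are hypothetical, not dupd's hash_list or pathlist.

// Toy illustration (not dupd code) of nulling a path through an aliased
// pointer: the "hash bucket" holds a pointer into the shared path-list
// storage, so a single-member bucket can discard its file for everyone by
// writing path[0] = 0.
#include <stdio.h>
#include <string.h>

int main(void)
{
  // Shared path-list storage, as the scanner would have filled it in.
  char pathlist[3][64];
  strcpy(pathlist[0], "/data/one");
  strcpy(pathlist[1], "/data/two");
  strcpy(pathlist[2], "/data/three");

  // A bucket that ended up with exactly one member: it stores only a
  // pointer to the path-list entry, not a copy of the string.
  char * bucket_single[1] = { pathlist[0] };
  int bucket_members = 1;

  // skim: a single-member bucket is unique, so null the path via the
  // pointer (optionally recording it in the database first, as the new
  // record_in_db parameter allows).
  if (bucket_members == 1) {
    bucket_single[0][0] = 0;
  }

  // Any later pass over the shared path list now sees the discard.
  for (int i = 0; i < 3; i++) {
    if (pathlist[i][0] == 0) {
      printf("entry %d: discarded\n", i);
    } else {
      printf("entry %d: %s\n", i, pathlist[i]);
    }
  }
  return 0;
}

Because the paths are NUL-terminated strings, zeroing the first byte is enough: every consumer that dereferences the same storage sees an empty path and treats the entry as gone.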
@@ -230,7 +230,7 @@ void process_size_list(sqlite3 * dbh)
   }
 
   if (save_uniques) {
-    record_uniques(dbh, hl_one);
+    skim_uniques(dbh, hl_one, save_uniques);
   }
 
   // If no potential dups after this round, we're done!
@@ -267,7 +267,7 @@ void process_size_list(sqlite3 * dbh)
   }
 
   if (save_uniques) {
-    record_uniques(dbh, hl_partial);
+    skim_uniques(dbh, hl_partial, save_uniques);
   }
 
   // If no potential dups after this round, we're done!
@@ -302,7 +302,7 @@ void process_size_list(sqlite3 * dbh)
   }
 
   if (save_uniques) {
-    record_uniques(dbh, hl_full);
+    skim_uniques(dbh, hl_full, save_uniques);
  }
 
   // If no potential dups after this round, we're done!
@@ -369,17 +369,24 @@ static void reader_read_bytes(struct size_list * size_node, off_t max_to_read)
       free(buffer);
     }
 
-    buffer = (char *)malloc(size_node->bytes_read);
-    received = read_file_bytes(path, buffer, size_node->bytes_read, 0);
-    if (received != size_node->bytes_read) {
-      printf("error: read %zd bytes from [%s] but wanted %ld\n",
-             received, path, size_node->bytes_read);
-      size_node->bytes_read = 0;
-    }
-    if (thread_verbosity >= 2) {
-      printf("%s%ld bytes from %s\n",spaces,size_node->bytes_read,path);
+    // The path may be null if this particular path within this pathlist
+    // has been discarded as a potential duplicate already. If so, skip.
+    if (path[0] != 0) {
+      buffer = (char *)malloc(size_node->bytes_read);
+      received = read_file_bytes(path, buffer, size_node->bytes_read, 0);
+      if (received != size_node->bytes_read) {
+        printf("error: read %zd bytes from [%s] but wanted %ld\n",
+               received, path, size_node->bytes_read);
+        size_node->bytes_read = 0;
+      }
+      if (thread_verbosity >= 2) {
+        printf("%s%ld bytes from %s\n",spaces,size_node->bytes_read,path);
+      }
+      pl_entry_set_buffer(node, buffer);
+
+    } else {
+      pl_entry_set_buffer(node, NULL);
     }
-    pl_entry_set_buffer(node, buffer);
 
     node = pl_entry_get_next(node);
 
@@ -499,8 +506,14 @@ static int build_hash_list_round(sqlite3 * dbh,
   // Build hash list for these files
   do {
     path = pl_entry_get_path(node);
-    buffer = pl_entry_get_buffer(node);
-    add_hash_list_from_mem(hl, path, buffer, size_node->bytes_read);
+
+    // The path may be null if this particular path within this pathlist
+    // has been discarded as a potential duplicate already. If so, skip.
+    if (path[0] != 0) {
+      buffer = pl_entry_get_buffer(node);
+      add_hash_list_from_mem(hl, path, buffer, size_node->bytes_read);
+    }
+
     node = pl_entry_get_next(node);
   } while (node != NULL);
 
@@ -510,7 +523,7 @@ static int build_hash_list_round(sqlite3 * dbh,
     print_hash_list(hl);
   }
 
-  if (save_uniques) { record_uniques(dbh, hl); }
+  skim_uniques(dbh, hl, save_uniques);
 
   // If no potential dups after this round, we're done!
   if (HASH_LIST_NO_DUPS(hl)) {
@@ -719,7 +732,12 @@ void threaded_process_size_list(sqlite3 * dbh)
   struct hash_list * hl_full = get_hash_list(HASH_LIST_FULL);
   do {
     path = pl_entry_get_path(node);
-    add_hash_list(hl_full, path, 0, hash_block_size, 0);
+
+    // The path may be null if this particular path within this pathlist
+    // has been discarded as a potential duplicate already. If so, skip.
+    if (path[0] != 0) {
+      add_hash_list(hl_full, path, 0, hash_block_size, 0);
+    }
     node = pl_entry_get_next(node);
   } while (node != NULL);
 
@@ -729,7 +747,7 @@ void threaded_process_size_list(sqlite3 * dbh)
   }
 
   if (save_uniques) {
-    record_uniques(dbh, hl_full);
+    skim_uniques(dbh, hl_full, save_uniques);
   }
 
   // If no potential dups after this round, we're done!
tests/files6/files.tar.gz: binary file not shown.
@@ -0,0 +1,3 @@
/3
/4
/files.tar.gz
@@ -0,0 +1,28 @@
#!/bin/bash

source common

DESC="scan(files6): uniques w/larger files, some differ early"

cd files6
cat files.tar.gz | gunzip | tar xf -
cd ..

$DUPD_CMD scan --path `pwd`/files6 --uniques > /dev/null
checkrv $?

# Test for a bug where unique files were identified more than once
# when belonging to a size set larger than three (thus forcing the full hash)
# if some of the files could've been discarded early in round 1.
# There is no way to detect this via normal dupd interfaces (that is, all
# report output is correct) so the only way to check is in database directly.

echo "select * from files;" | sqlite3 ~/.dupd_sqlite | sort | sed -e 's/.*files6//' > nreport

check_nreport output.56

cd files6
rm -f 1 2 3 4 5
cd ..

tdone
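
As the comment in test.56 notes, the double-recording bug is visible only in the scan database, which is why the test queries ~/.dupd_sqlite directly. The same kind of check could be scripted programmatically; the sketch below uses the sqlite3 C API and assumes the files table exposes a path column (the real dupd schema may differ), so treat it as a hypothetical illustration rather than part of the test suite.

/* Hypothetical check (assumed schema): list any path recorded more than
 * once in the 'files' table of the dupd scan database. Adjust the column
 * name for the real schema. Build with: cc check_uniques.c -lsqlite3 */
#include <stdio.h>
#include <stdlib.h>
#include <sqlite3.h>

static int row_cb(void * unused, int ncols, char ** vals, char ** names)
{
  (void)unused; (void)ncols; (void)names;
  printf("recorded more than once: %s\n", vals[0]);
  return 0;
}

int main(void)
{
  char dbpath[512];
  const char * home = getenv("HOME");
  if (home == NULL) { return 1; }
  snprintf(dbpath, sizeof(dbpath), "%s/.dupd_sqlite", home);

  sqlite3 * db = NULL;
  if (sqlite3_open(dbpath, &db) != SQLITE_OK) {
    fprintf(stderr, "cannot open %s\n", dbpath);
    return 1;
  }

  char * err = NULL;
  const char * sql =
    "SELECT path FROM files GROUP BY path HAVING COUNT(*) > 1;";
  if (sqlite3_exec(db, sql, row_cb, NULL, &err) != SQLITE_OK) {
    fprintf(stderr, "query failed: %s\n", err ? err : "unknown error");
    sqlite3_free(err);
  }

  sqlite3_close(db);
  return 0;
}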
@@ -0,0 +1,22 @@
#!/bin/bash

source common

DESC="scan(files6): uniques w/larger files, some differ early, no thread scan"

cd files6
cat files.tar.gz | gunzip | tar xf -
cd ..

$DUPD_CMD scan --path `pwd`/files6 --uniques --no-thread-hash > /dev/null
checkrv $?

echo "select * from files;" | sqlite3 ~/.dupd_sqlite | sort | sed -e 's/.*files6//' > nreport

check_nreport output.56

cd files6
rm -f 1 2 3 4 5
cd ..

tdone
