From 242a65d2fdf904632b334f63f44ead0e9b1501d3 Mon Sep 17 00:00:00 2001 From: Manfred Stock Date: Tue, 14 Mar 2023 19:31:22 +0100 Subject: [PATCH] Support extraction of large files The behaviour of `syswrite` depends on the platform and seems to be different when getting closer to writing about 2 GiB or more at once. On Linux, it will write at most (2**31 - 4096) bytes [1,2] and not return an error when more data was passed in but just return the amount of data that was actually written - so the original implementation was producing incomplete/corrupt files during extraction when they were larger than (2**31 - 4096) bytes. On macOS, the limit appears to be (2**31 - 1) bytes, otherwise, an error is returned. So in order to correctly extract files close to or larger than 2 GiB, it's necessary to write less than about 2 GiB at once and redo write operations until all data actually has been written. [1] https://www.man7.org/linux/man-pages/man2/write.2.html#NOTES [2] https://stackoverflow.com/questions/70368651/why-cant-linux-write-more-than-2147479552-bytes --- lib/Archive/Tar.pm | 26 ++++++++++++++++++++++---- t/02_methods.t | 28 ++++++++++++++++++++++++++++ 2 files changed, 50 insertions(+), 4 deletions(-) diff --git a/lib/Archive/Tar.pm b/lib/Archive/Tar.pm index 476e646..6e2ba4f 100644 --- a/lib/Archive/Tar.pm +++ b/lib/Archive/Tar.pm @@ -24,6 +24,7 @@ use strict; use vars qw[$DEBUG $error $VERSION $WARN $FOLLOW_SYMLINK $CHOWN $CHMOD $DO_NOT_USE_PREFIX $HAS_PERLIO $HAS_IO_STRING $SAME_PERMISSIONS $INSECURE_EXTRACT_MODE $ZERO_PAD_NUMBERS @ISA @EXPORT $RESOLVE_SYMLINK + $EXTRACT_BLOCK_SIZE ]; @ISA = qw[Exporter]; @@ -39,6 +40,7 @@ $DO_NOT_USE_PREFIX = 0; $INSECURE_EXTRACT_MODE = 0; $ZERO_PAD_NUMBERS = 0; $RESOLVE_SYMLINK = $ENV{'PERL5_AT_RESOLVE_SYMLINK'} || 'speed'; +$EXTRACT_BLOCK_SIZE = 1024 * 1024 * 1024; BEGIN { use Config; @@ -894,10 +896,18 @@ sub _extract_file { if( $entry->size ) { binmode $fh; - syswrite $fh, $entry->data or ( - $self->_error( qq[Could not 
write data to '$full'] ), - return - ); + my $offset = 0; + my $content = $entry->get_content_by_ref(); + while ($offset < $entry->size) { + my $written + = syswrite $fh, $$content, $EXTRACT_BLOCK_SIZE, $offset; + if (defined $written) { + $offset += $written; + } else { + $self->_error( qq[Could not write data to '$full': $!] ); + return; + } + } } close $fh or ( @@ -2163,6 +2173,14 @@ numbers. Added for compatibility with C<busybox> implementations. It won't work for terminal, pipe or sockets or every non seekable source. +=head2 $Archive::Tar::EXTRACT_BLOCK_SIZE + +This variable holds an integer with the block size that should be used when +writing files during extraction. It defaults to 1 GiB. Please note that this +cannot be arbitrarily large since some operating systems limit the number of +bytes that can be written in one call to C<syswrite>, so if this is too large, +extraction may fail with an error. + +=cut =head1 FAQ diff --git a/t/02_methods.t b/t/02_methods.t index 19d9212..37f4db2 100644 --- a/t/02_methods.t +++ b/t/02_methods.t @@ -570,6 +570,34 @@ SKIP: { ### pesky warnings } +### extract tests with different $EXTRACT_BLOCK_SIZE values ### +SKIP: { ### pesky warnings + skip $ebcdic_skip_msg, 517 if ord "A" != 65; + + skip('no IO::String', 517) if !$Archive::Tar::HAS_PERLIO && + !$Archive::Tar::HAS_PERLIO && + !$Archive::Tar::HAS_IO_STRING && + !$Archive::Tar::HAS_IO_STRING; + + my $tar = $Class->new; + ok( $tar->read( $TAR_FILE ), "Read in '$TAR_FILE'" ); + + for my $aref ( [$tar, \@EXPECT_NORMAL], + [$TARBIN, \@EXPECTBIN], + [$TARX, \@EXPECTX] + ) { + my($obj, $struct) = @$aref; + + for my $block_size ((1, BLOCK, 1024 * 1024, 2**31 - 4096, 2**31, 2**32)) { + local $Archive::Tar::EXTRACT_BLOCK_SIZE = $block_size; + + ok( $obj->extract, " Extracted with 'extract'" ); + check_tar_extract( $obj, $struct ); + } + } +} + + ### clear tests ### SKIP: { ### pesky warnings skip $ebcdic_skip_msg, 3 if ord "A" != 65;