From 0bd433d1539a70c83690ab8602ab3a38cd243d1f Mon Sep 17 00:00:00 2001
From: Andy Sarroff
Date: Fri, 17 Jan 2020 09:39:10 -0500
Subject: [PATCH] s3: copy large objects without chunking

When executing a copy with boto3's `S3.Client.copy`, large objects are
split into multiple parts. The default multipart threshold and chunk
size are both 8 MB. This caused an issue when caching an object larger
than 8 MB whose original had not been uploaded in multiple parts: the
original object and the cached copy would end up with different ETags,
raising an exception. This fixes the issue by dynamically setting the
multipart threshold to one byte larger than the size of the source
object, thereby avoiding a multipart copy.

Fixes #3174
---
 dvc/remote/s3.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py
index ba043afcdd..b6a24c6bb0 100644
--- a/dvc/remote/s3.py
+++ b/dvc/remote/s3.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import threading
 
+from boto3.s3.transfer import TransferConfig
 from funcy import cached_property, wrap_prop
 
@@ -169,7 +170,13 @@ def _copy(cls, s3, from_info, to_info, extra_args):
             )
         else:
             source = {"Bucket": from_info.bucket, "Key": from_info.path}
-            s3.copy(source, to_info.bucket, to_info.path, ExtraArgs=extra_args)
+            s3.copy(
+                source,
+                to_info.bucket,
+                to_info.path,
+                ExtraArgs=extra_args,
+                Config=TransferConfig(multipart_threshold=size + 1),
+            )
 
         cached_etag = cls.get_etag(s3, to_info.bucket, to_info.path)
         if etag != cached_etag:
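
For reviewers who want to reproduce the behavior outside DVC, here is a minimal standalone sketch of the same idea using plain boto3 (not part of the patch). The bucket and key names are hypothetical placeholders, and it assumes the source object was uploaded as a single part, so its ETag is a plain MD5 digest rather than a multipart `...-N` tag.

```python
import boto3
from boto3.s3.transfer import TransferConfig

s3 = boto3.client("s3")

# Hypothetical placeholders -- substitute real buckets/keys.
src_bucket, src_key = "example-src-bucket", "data/large.bin"
dst_bucket, dst_key = "example-cache-bucket", "cache/large.bin"

# The source object's ETag format depends on how it was uploaded
# (single-part vs. multipart); grab its size and ETag up front.
head = s3.head_object(Bucket=src_bucket, Key=src_key)
size = head["ContentLength"]

# Raise the multipart threshold above the object size so boto3 issues a
# single CopyObject request instead of a multipart copy.
s3.copy(
    {"Bucket": src_bucket, "Key": src_key},
    dst_bucket,
    dst_key,
    Config=TransferConfig(multipart_threshold=size + 1),
)

# If the source was uploaded in one part, the copy keeps the same ETag.
copied = s3.head_object(Bucket=dst_bucket, Key=dst_key)
assert copied["ETag"] == head["ETag"]
```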