From 0bd433d1539a70c83690ab8602ab3a38cd243d1f Mon Sep 17 00:00:00 2001
From: Andy Sarroff
Date: Fri, 17 Jan 2020 09:39:10 -0500
Subject: [PATCH] s3: copy large objects without chunking

When executing a copy with boto3's `S3.Client.copy`, large objects are
split into multiple parts. The default multipart threshold and chunk
size are both 8 MB. This caused an issue when caching an object larger
than 8 MB whose original had not been uploaded in multiple parts: the
original object and the cached copy would end up with different ETags,
raising an exception. This fixes the issue by dynamically setting the
multipart threshold to one byte larger than the size of the source
object, thereby avoiding a multipart copy.

Fixes #3174
---
 dvc/remote/s3.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/dvc/remote/s3.py b/dvc/remote/s3.py
index ba043afcdd..b6a24c6bb0 100644
--- a/dvc/remote/s3.py
+++ b/dvc/remote/s3.py
@@ -3,6 +3,7 @@
 import logging
 import os
 import threading
 
+from boto3.s3.transfer import TransferConfig
 from funcy import cached_property, wrap_prop
 
@@ -169,7 +170,13 @@ def _copy(cls, s3, from_info, to_info, extra_args):
             )
         else:
             source = {"Bucket": from_info.bucket, "Key": from_info.path}
-            s3.copy(source, to_info.bucket, to_info.path, ExtraArgs=extra_args)
+            s3.copy(
+                source,
+                to_info.bucket,
+                to_info.path,
+                ExtraArgs=extra_args,
+                Config=TransferConfig(multipart_threshold=size + 1),
+            )
 
         cached_etag = cls.get_etag(s3, to_info.bucket, to_info.path)
         if etag != cached_etag:
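
For reviewers who want to reproduce the behavior outside DVC, here is a minimal standalone sketch of the same idea using plain boto3 (not part of the patch). The bucket and key names are hypothetical placeholders, and it assumes the source object was uploaded as a single part, so its ETag is a plain MD5 digest rather than a multipart `...-N` tag.

```python
import boto3
from boto3.s3.transfer import TransferConfig

s3 = boto3.client("s3")

# Hypothetical placeholders -- substitute real buckets/keys.
src_bucket, src_key = "example-src-bucket", "data/large.bin"
dst_bucket, dst_key = "example-cache-bucket", "cache/large.bin"

# The source object's ETag format depends on how it was uploaded
# (single-part vs. multipart); grab its size and ETag up front.
head = s3.head_object(Bucket=src_bucket, Key=src_key)
size = head["ContentLength"]

# Raise the multipart threshold above the object size so boto3 issues a
# single CopyObject request instead of a multipart copy.
s3.copy(
    {"Bucket": src_bucket, "Key": src_key},
    dst_bucket,
    dst_key,
    Config=TransferConfig(multipart_threshold=size + 1),
)

# If the source was uploaded in one part, the copy keeps the same ETag.
copied = s3.head_object(Bucket=dst_bucket, Key=dst_key)
assert copied["ETag"] == head["ETag"]
```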