Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

support download of UTF-8 filenames #7188 #7503

Merged
merged 4 commits into from Jan 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 9 additions & 0 deletions doc/release-notes/7188-utf-8-filenames.md
@@ -0,0 +1,9 @@
## Notes for Tool Developers and Integrators

### UTF-8 Characters and Spaces in File Names

UTF-8 characters in filenames are now preserved when files are downloaded.

Dataverse installations will no longer replace spaces in file names of downloaded files with the + character. If your tool or integration has any special handling around this, you may need to make further adjustments to maintain backwards compatibility while also supporting Dataverse installations on 5.4+.

Note that this follows a change from 5.1 that only corrected this for installations running with S3 storage. This makes the behavior consistent across installations running all types of file storage.
Expand Up @@ -33,6 +33,7 @@
import java.io.FileInputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
Expand Down Expand Up @@ -301,9 +302,11 @@ public void writeTo(DownloadInstance di, Class<?> clazz, Type type, Annotation[]

// Provide both the "Content-disposition" and "Content-Type" headers,
// to satisfy the widest selection of browsers out there.

httpHeaders.add("Content-disposition", "attachment; filename=\"" + fileName + "\"");
httpHeaders.add("Content-Type", mimeType + "; name=\"" + fileName + "\"");
// Encode the filename as UTF-8, then deal with spaces. "encode" changes
// a space to + so we change it back to a space (%20).
String finalFileName = URLEncoder.encode(fileName, "UTF-8").replaceAll("\\+", "%20");
httpHeaders.add("Content-disposition", "attachment; filename=\"" + finalFileName + "\"");
httpHeaders.add("Content-Type", mimeType + "; name=\"" + finalFileName + "\"");

long contentSize;
boolean useChunkedTransfer = false;
Expand Down
83 changes: 83 additions & 0 deletions src/test/java/edu/harvard/iq/dataverse/api/DownloadFilesIT.java
Expand Up @@ -2,8 +2,10 @@

import com.jayway.restassured.RestAssured;
import com.jayway.restassured.path.json.JsonPath;
import com.jayway.restassured.response.Headers;
import com.jayway.restassured.response.Response;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
Expand All @@ -13,6 +15,7 @@
import java.util.HashSet;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import java.util.zip.ZipOutputStream;
import static javax.ws.rs.core.Response.Status.CREATED;
import static javax.ws.rs.core.Response.Status.FORBIDDEN;
import static javax.ws.rs.core.Response.Status.OK;
Expand Down Expand Up @@ -367,6 +370,86 @@ public void downloadAllFilesTabular() throws IOException {
Assert.assertEquals(new HashSet<>(Arrays.asList("50by1000.dta", "MANIFEST.TXT")), gatherFilenames(downloadFiles2.getBody().asInputStream()));
}

/**
 * Download a file with a UTF-8 filename with a space.
 *
 * Uploads a zip containing "MY READ–ME.md" (with an en-dash) via SWORD, then
 * checks that (1) the single-file download reports the name percent-encoded
 * in the "Content-disposition" and "Content-Type" headers, and (2) the
 * "download all files" zip contains an entry with the original name.
 *
 * @throws IOException if the temporary zip file cannot be written or the
 * downloaded zip cannot be read
 */
@Test
public void downloadFilenameUtf8() throws IOException {

    Response createUser = UtilIT.createRandomUser();
    createUser.prettyPrint();
    createUser.then().assertThat()
            .statusCode(OK.getStatusCode());
    String username = UtilIT.getUsernameFromResponse(createUser);
    String apiToken = UtilIT.getApiTokenFromResponse(createUser);

    Response createDataverseResponse = UtilIT.createRandomDataverse(apiToken);
    createDataverseResponse.prettyPrint();
    createDataverseResponse.then().assertThat()
            .statusCode(CREATED.getStatusCode());

    String dataverseAlias = UtilIT.getAliasFromResponse(createDataverseResponse);

    Response createDataset = UtilIT.createRandomDatasetViaNativeApi(dataverseAlias, apiToken);
    createDataset.prettyPrint();
    createDataset.then().assertThat()
            .statusCode(CREATED.getStatusCode());

    Integer datasetId = UtilIT.getDatasetIdFromResponse(createDataset);
    String datasetPid = UtilIT.getDatasetPersistentIdFromResponse(createDataset);

    // Put a filename with an en-dash ("MY READ–ME.md") into a zip file.
    // The charset is given explicitly so the file content does not depend on
    // the platform default encoding of the machine running the test.
    byte[] data = "This is my README.".getBytes(java.nio.charset.StandardCharsets.UTF_8);
    Path pathToTempDir = Files.createTempDirectory(null);
    String pathToZipFile = pathToTempDir + File.separator + "test.zip";
    // try-with-resources guarantees the streams are closed (and the zip is
    // finished) even if writing the entry throws.
    try (ZipOutputStream out = new ZipOutputStream(new FileOutputStream(new File(pathToZipFile)))) {
        out.putNextEntry(new ZipEntry("MY READ–ME.md"));
        out.write(data, 0, data.length);
        out.closeEntry();
    }

    // We upload via SWORD (as a zip) because the native API gives this error:
    // "Constraint violation found in FileMetadata. File Name cannot contain any
    // of the following characters: / : * ? " < > | ; # . The invalid value is "READ?ME.md"."
    // This error probably has something to do with the way REST Assured sends the filename
    // to the native API. The en-dash is turned into question mark, which is disallowed.
    Response uploadViaSword = UtilIT.uploadZipFileViaSword(datasetPid, pathToZipFile, apiToken);
    uploadViaSword.prettyPrint();
    uploadViaSword.then().assertThat()
            .statusCode(CREATED.getStatusCode());

    Response getDatasetJson = UtilIT.nativeGet(datasetId, apiToken);
    getDatasetJson.then().assertThat()
            .statusCode(OK.getStatusCode());

    int fileId = JsonPath.from(getDatasetJson.getBody().asString()).getInt("data.latestVersion.files[0].dataFile.id");

    // Download the file individually and assert "MY READ–ME.md" has an en-dash.
    // Integer.valueOf replaces the deprecated Integer(int) constructor.
    Response downloadFile = UtilIT.downloadFile(Integer.valueOf(fileId), apiToken);
    downloadFile.then().assertThat()
            .statusCode(OK.getStatusCode());
    Headers headers = downloadFile.getHeaders();
    // In "MY READ–ME.md" below the space is %20 and the en-dash ("–") is "%E2%80%93" (e2 80 93 in hex).
    Assert.assertEquals("attachment; filename=\"MY%20READ%E2%80%93ME.md\"", headers.getValue("Content-disposition"));
    Assert.assertEquals("text/markdown; name=\"MY%20READ%E2%80%93ME.md\";charset=UTF-8", headers.getValue("Content-Type"));

    // Download all files as a zip and assert "MY READ–ME.md" has an en-dash.
    Response downloadFiles = UtilIT.downloadFiles(datasetPid, apiToken);
    downloadFiles.then().assertThat()
            .statusCode(OK.getStatusCode());

    HashSet<String> filenamesFound = gatherFilenames(downloadFiles.getBody().asInputStream());

    // Note that a MANIFEST.TXT file is added.
    // "MY READ–ME.md" (with an en-dash) is correctly extracted from the downloaded zip
    HashSet<String> expectedFiles = new HashSet<>(Arrays.asList("MANIFEST.TXT", "MY READ–ME.md"));
    Assert.assertEquals(expectedFiles, filenamesFound);
}

private HashSet<String> gatherFilenames(InputStream inputStream) throws IOException {
HashSet<String> filenamesFound = new HashSet<>();
try (ZipInputStream zipStream = new ZipInputStream(inputStream)) {
Expand Down