Add support for charset option to cab format reader.

SVN-Revision: 3033
libarchive · Mar 20, 2011 · bc56f3b · bc56f3b
1 parent 79a8745
commit bc56f3b
Show file tree

Hide file tree

Showing 6 changed files with 202 additions and 29 deletions.
diff --git a/Makefile.am b/Makefile.am
@@ -289,6 +289,7 @@ libarchive_test_SOURCES=					\
 	libarchive/test/test_read_file_nonexistent.c		\
 	libarchive/test/test_read_format_ar.c			\
 	libarchive/test/test_read_format_cab.c			\
+	libarchive/test/test_read_format_cab_filename.c		\
 	libarchive/test/test_read_format_cpio_afio.c		\
 	libarchive/test/test_read_format_cpio_bin.c		\
 	libarchive/test/test_read_format_cpio_bin_Z.c		\
@@ -419,6 +420,7 @@ libarchive_test_EXTRA_DIST=\
 	libarchive/test/test_read_format_cab_1.cab.uu			\
 	libarchive/test/test_read_format_cab_2.cab.uu			\
 	libarchive/test/test_read_format_cab_3.cab.uu			\
+	libarchive/test/test_read_format_cab_cp932.cab.uu		\
 	libarchive/test/test_read_format_cpio_bin_be.cpio.uu		\
 	libarchive/test/test_read_format_cpio_svr4_bzip2_rpm.rpm.uu	\
 	libarchive/test/test_read_format_cpio_svr4_gzip_rpm.rpm.uu	\

diff --git a/libarchive/archive_read_support_format_cab.c b/libarchive/archive_read_support_format_cab.c
@@ -1,5 +1,5 @@
 /*-
- * Copyright (c) 2010 Michihiro NAKAJIMA
+ * Copyright (c) 2010-2011 Michihiro NAKAJIMA
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
@@ -290,6 +290,7 @@ struct cab {
 	unsigned char		*uncompressed_buffer;
 	size_t			 uncompressed_buffer_size;
 
+	char			*charset;
 	char			 format_name[64];
 
 #ifdef HAVE_ZLIB_H
@@ -300,6 +301,8 @@ struct cab {
 };
 
 static int	archive_read_format_cab_bid(struct archive_read *);
+static int	archive_read_format_cab_options(struct archive_read *,
+		    const char *, const char *);
 static int	archive_read_format_cab_read_header(struct archive_read *,
 		    struct archive_entry *);
 static int	archive_read_format_cab_read_data(struct archive_read *,
@@ -362,7 +365,7 @@ archive_read_support_format_cab(struct archive *_a)
 	    cab,
 	    "cab",
 	    archive_read_format_cab_bid,
-	    NULL,
+	    archive_read_format_cab_options,
 	    archive_read_format_cab_read_header,
 	    archive_read_format_cab_read_data,
 	    archive_read_format_cab_read_data_skip,
@@ -443,6 +446,48 @@ archive_read_format_cab_bid(struct archive_read *a)
 	return (0);
 }
 
+/*
+ * Handle a reader option addressed to the cab format.
+ *
+ * The only key acted on is "charset": its value is duplicated into
+ * cab->charset, replacing a name stored by an earlier call.
+ *
+ * Returns ARCHIVE_OK once the name is stored.  Returns ARCHIVE_FAILED,
+ * with an error message set on the archive, when the key is unknown,
+ * the value is missing or empty, or the name cannot be duplicated.
+ */
+static int
+archive_read_format_cab_options(struct archive_read *a,
+    const char *key, const char *val)
+{
+	struct cab *cab;
+	char *charset;
+
+	cab = (struct cab *)(a->format->data);
+	if (strcmp(key, "charset") != 0) {
+		archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
+		    "cab: unknown keyword ``%s''", key);
+		return (ARCHIVE_FAILED);
+	}
+	if (val == NULL || val[0] == 0) {
+		archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
+		    "cab: charset option needs a character-set name");
+		return (ARCHIVE_FAILED);
+	}
+
+	/* Duplicate first so a failed strdup() leaves the old name intact. */
+	charset = strdup(val);
+	if (charset == NULL) {
+		archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
+		    "cab: can't allocate memory for the charset name");
+		return (ARCHIVE_FAILED);
+	}
+	/* Setting the option again must not leak the previous name. */
+	free(cab->charset);
+	cab->charset = charset;
+	return (ARCHIVE_OK);
+}
+
 static int
 cab_skip_sfx(struct archive_read *a)
 {
@@ -526,38 +552,49 @@ cab_read_ahead_remaining(struct archive_read *a, size_t min, ssize_t *avail)
 /* Convert a path separator '\' -> '/' */
 static void
 cab_convert_path_separator(struct archive_read *a, struct cab *cab,
-    struct archive_string *pathname, unsigned char attr)
+    struct archive_string *fn, unsigned char attr)
 {
-	int l, r;
+	size_t i;
 
-	if (strchr(pathname->s, '\\') == NULL)
+	for (i = 0; i < archive_strlen(fn); i++) {
+		if (fn->s[i] == '\\')
+			fn->s[i] = '/';
+		else if (fn->s[i] & 0x80)
+			/* High bit set: possibly multibyte text. */
+			break;
+	}
+	if (i == archive_strlen(fn))
 		return;
 
-	archive_wstring_empty(&cab->ws);
-	if ((attr & ATTR_NAME_IS_UTF) != 0 ||
-	    archive_wstring_append_from_mbs(&a->archive, &(cab->ws), pathname->s, pathname->length) != 0) {
-		for (l = 0; pathname->s[l] != '\0'; l++) {
-			if (pathname->s[l] == '\\')
-				pathname->s[l] = '/';
+	/* Replace separators on the wide-character form.  NOTE(review):
+	 * unlike the removed code, cab->ws is not emptied first and failure
+	 * is tested as "!ret" rather than "ret != 0" -- confirm both. */
+
+	/* If the conversion to wide characters failed, replace byte by byte. */
+	if (!archive_wstring_append_from_mbs(&a->archive, &(cab->ws),
+	    fn->s, fn->length)) {
+		for (i = 0; i < archive_strlen(fn); i++) {
+			if (fn->s[i] == '\\')
+				fn->s[i] = '/';
 		}
 		return;
 	}
 
-	r = 0;
-	for (l = 0; cab->ws.s[l] != L'\0'; l++) {
-		if (cab->ws.s[l] == L'\\') {
-			cab->ws.s[l] = L'/';
-			r = 1;
-		}
-	}
-	if (r) {
-		archive_string_empty(&cab->mbs);
-		archive_string_append_from_unicode_to_mbs(&a->archive, &cab->mbs, cab->ws.s, cab->ws.length);
-		/* If mbs length is different to pathname, we broke the
-		 * pathname. We shouldn't use it. */
-		if (archive_strlen(&cab->mbs) == archive_strlen(pathname))
-			archive_string_copy(pathname, &cab->mbs);
+	for (i = 0; i < archive_strlen(&(cab->ws)); i++) {
+		if (cab->ws.s[i] == L'\\')
+			cab->ws.s[i] = L'/';
 	}
+
+	/* Convert back to multibyte so the result can be checked against fn.
+	 * NOTE(review): attr is no longer read anywhere in this function;
+	 * confirm the parameter is kept only for the existing caller. */
+	archive_string_empty(&(cab->mbs));
+	archive_string_append_from_unicode_to_mbs(&a->archive, &(cab->mbs),
+	    cab->ws.s, cab->ws.length);
+	/* Copy the result over fn only when the byte length is unchanged;
+	 * a different length means the round trip broke the filename. */
+	if (archive_strlen(&(cab->mbs)) == archive_strlen(fn))
+		archive_string_copy(fn, &(cab->mbs));
 }
 
 /*
@@ -753,7 +790,12 @@ cab_read_header(struct archive_read *a)
 		if ((len = cab_strnlen(p, avail-1)) <= 0)
 			goto invalid;
 		archive_string_init(&(file->pathname));
-		archive_strncpy(&(file->pathname),  p, len);
+		if ((file->attr & ATTR_NAME_IS_UTF) && cab->charset == NULL)
+			archive_strncpy_from_specific_locale(&a->archive,
+			    &(file->pathname),  (const char *)p, len, "UTF-8");
+		else
+			archive_strncpy_from_specific_locale(&a->archive,
+			    &(file->pathname),  (const char *)p, len, cab->charset);
 		__archive_read_consume(a, len + 1);
 		cab->cab_offset += len + 1;
 		/* Convert a path separator '\' -> '/' */
@@ -885,8 +927,6 @@ archive_read_format_cab_read_header(struct archive_read *a,
 	 * Set a default value and common data
 	 */
 	archive_entry_set_pathname(entry, file->pathname.s);
-	if (file->attr & ATTR_NAME_IS_UTF)
-		archive_entry_update_pathname_utf8(entry, file->pathname.s);
 
 	archive_entry_set_size(entry, file->uncompressed_size);
 	if (file->attr & ATTR_RDONLY)
@@ -1928,6 +1968,7 @@ archive_read_format_cab_cleanup(struct archive_read *a)
 	archive_wstring_free(&cab->ws);
 	archive_string_free(&cab->mbs);
 	free(cab->uncompressed_buffer);
+	free(cab->charset);
 	free(cab);
 	(a->format->data) = NULL;
 	return (ARCHIVE_OK);

diff --git a/libarchive/archive_read_support_format_lha.c b/libarchive/archive_read_support_format_lha.c
@@ -730,7 +730,7 @@ lha_replace_path_separator(struct archive_read *a, struct lha *lha,
 	 * Try to replace a character in wide character.
 	 */
 
-	/* If converting to wide character failed, force a replacement. */
+	/* If a conversion to wide character failed, force a replacement. */
 	if (!archive_wstring_append_from_mbs(&a->archive, &(lha->ws),
 	    fn->s, fn->length)) {
 		for (i = 0; i < archive_strlen(fn); i++) {

diff --git a/libarchive/test/CMakeLists.txt b/libarchive/test/CMakeLists.txt
@@ -60,6 +60,7 @@ IF(ENABLE_TEST)
     test_read_file_nonexistent.c
     test_read_format_ar.c
     test_read_format_cab.c
+    test_read_format_cab_filename.c
     test_read_format_cpio_afio.c
     test_read_format_cpio_bin.c
     test_read_format_cpio_bin_Z.c

diff --git a/libarchive/test/test_read_format_cab_cp932.cab.uu b/libarchive/test/test_read_format_cab_cp932.cab.uu
@@ -0,0 +1,7 @@
+begin 644 test_read_format_cab_cp932.cab
+M35-#1@````"4`````````"P``````````P$!``(````(_@``;@````$``Q(%
+M````````````=#ZO5"``E5R"OH+F7(J_CIHN='AT``4````%``````!T/KM4
+M(`"57(*^@N9<B.J7EY5<+G1X=``I]2&+'@`*`%N`@(T`,*```0````$````!
+-````:V%N:FE(96QL;P``
+`
+end
diff --git a/libarchive/test/test_read_format_cab_filename.c b/libarchive/test/test_read_format_cab_filename.c
@@ -0,0 +1,122 @@
+/*-
+ * Copyright (c) 2011 Michihiro NAKAJIMA
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "test.h"
+__FBSDID("$FreeBSD$");	/* keyword anchor needs its closing '$' */
+
+#include <locale.h>
+
+DEFINE_TEST(test_read_format_cab_filename)
+{
+	struct archive *a;
+	struct archive_entry *ae;
+	const char *refname = "test_read_format_cab_cp932.cab";
+
+	/*
+	 * Read CAB filename in ja_JP.eucJP with "charset=CP932" option.
+	 */
+	if (NULL == setlocale(LC_ALL, "ja_JP.eucJP")) {
+		skipping("ja_JP.eucJP locale not available on this system.");
+		return;
+	}
+
+	extract_reference_file(refname);
+	assert((a = archive_read_new()) != NULL);
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_compression_all(a));
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a));
+	assertEqualIntA(a, ARCHIVE_OK,
+	    archive_read_set_options(a, "charset=CP932"));
+	assertEqualIntA(a, ARCHIVE_OK,
+	    archive_read_open_filename(a, refname, 10240));
+
+	/* First entry: pathname as EUC-JP bytes ('/' is 0x2f), size 5. */
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
+	assertEqualString(
+	    "\xc9\xbd\xa4\xc0\xa4\xe8\x2f\xb4\xc1\xbb\xfa\x2e\x74\x78\x74",
+	    archive_entry_pathname(ae));
+	assertEqualInt(5, archive_entry_size(ae));
+
+	/* Second entry: same directory prefix, different file name. */
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
+	assertEqualString(
+	    "\xc9\xbd\xa4\xc0\xa4\xe8\x2f\xb0\xec\xcd\xf7\xc9\xbd\x2e\x74\x78\x74",
+	    archive_entry_pathname(ae));
+	assertEqualInt(5, archive_entry_size(ae));
+
+
+	/* End of archive. */
+	assertEqualIntA(a, ARCHIVE_EOF, archive_read_next_header(a, &ae));
+
+	/* Verify archive format. */
+	assertEqualIntA(a, ARCHIVE_COMPRESSION_NONE, archive_compression(a));
+	assertEqualIntA(a, ARCHIVE_FORMAT_CAB, archive_format(a));
+
+	/* Close the archive and free the reader. */
+	assertEqualInt(ARCHIVE_OK, archive_read_close(a));
+	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
+
+
+	/*
+	 * Read CAB filename in ja_JP.UTF-8 with "charset=CP932" option.
+	 */
+	if (NULL == setlocale(LC_ALL, "ja_JP.UTF-8")) {
+		skipping("ja_JP.UTF-8 locale not available on this system.");
+		return;
+	}
+
+	assert((a = archive_read_new()) != NULL);
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_compression_all(a));
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_support_format_all(a));
+	assertEqualIntA(a, ARCHIVE_OK,
+	    archive_read_set_options(a, "charset=CP932"));
+	assertEqualIntA(a, ARCHIVE_OK,
+	    archive_read_open_filename(a, refname, 10240));
+
+	/* First entry: pathname as UTF-8 bytes ('/' is 0x2f), size 5. */
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
+	assertEqualString("\xe8\xa1\xa8\xe3\x81\xa0\xe3\x82\x88\x2f"
+	    "\xe6\xbc\xa2\xe5\xad\x97\x2e\x74\x78\x74",
+	    archive_entry_pathname(ae));
+	assertEqualInt(5, archive_entry_size(ae));
+
+	/* Second entry: same directory prefix, different file name. */
+	assertEqualIntA(a, ARCHIVE_OK, archive_read_next_header(a, &ae));
+	assertEqualString("\xe8\xa1\xa8\xe3\x81\xa0\xe3\x82\x88\x2f"
+	    "\xe4\xb8\x80\xe8\xa6\xa7\xe8\xa1\xa8\x2e\x74\x78\x74",
+	    archive_entry_pathname(ae));
+	assertEqualInt(5, archive_entry_size(ae));
+
+
+	/* End of archive. */
+	assertEqualIntA(a, ARCHIVE_EOF, archive_read_next_header(a, &ae));
+
+	/* Verify archive format. */
+	assertEqualIntA(a, ARCHIVE_COMPRESSION_NONE, archive_compression(a));
+	assertEqualIntA(a, ARCHIVE_FORMAT_CAB, archive_format(a));
+
+	/* Close the archive and free the reader. */
+	assertEqualInt(ARCHIVE_OK, archive_read_close(a));
+	assertEqualInt(ARCHIVE_OK, archive_read_free(a));
+}
+