🐛 Legacy detect should return UTF-8-SIG if sig is detected (#38)

In order to be as close as possible to chardet's behavior regarding the detect function, this PR adjusts the return value when the given content actually contains the UTF-8 SIG. This minor fix is related to the possible integration of this library into requests; see psf/requests#5797. Why does Charset-Normalizer return 'utf-8' instead of 'utf-8-sig'? Here are the main reasons: the SIG is actually not very useful, and it is not widely recognized, even by the Unicode Consortium. The detection/normalization process strips the SIG but still takes it into account.
jawah · May 12, 2021 · 030039c · 030039c
1 parent e46ee12
commit 030039c
Show file tree

Hide file tree

Showing 2 changed files with 23 additions and 3 deletions.
diff --git a/charset_normalizer/legacy.py b/charset_normalizer/legacy.py
@@ -19,8 +19,17 @@ def detect(byte_str):
 
     r = CnM.from_bytes(byte_str).best().first()
 
+    encoding = r.encoding if r is not None else None
+    language = r.language if r is not None and r.language != 'Unknown' else ''
+    confidence = 1. - r.chaos if r is not None else None
+
+    # Note: CharsetNormalizer does not return 'UTF-8-SIG' as the sig get stripped in the detection/normalization process
+    # but chardet does return 'utf-8-sig' and it is a valid codec name.
+    if encoding == 'utf_8' and r.bom:
+        encoding += '_sig'
+
     return {
-        'encoding': r.encoding if r is not None else None,
-        'language': r.language if r is not None and r.language != 'Unknown' else '',
-        'confidence': 1. - r.chaos if r is not None else None
+        'encoding': encoding,
+        'language': language,
+        'confidence': confidence
     }
diff --git a/test/test_detect_legacy.py b/test/test_detect_legacy.py
@@ -62,3 +62,14 @@ def test_detect_dict_value(self):
                 r['encoding'],
                 'utf_7'
             )
+
+    def test_utf8_sig_not_striped(self):
+        r = detect(
+            "Hello World".encode('utf-8-sig')
+        )
+
+        with self.subTest("Verify that UTF-8-SIG is returned when using legacy detect"):
+            self.assertEqual(
+                r['encoding'],
+                "utf_8_sig"
+            )