### Fixing legacy non-UTF-8 encodings with existing tools can be disappointing:

In [1]:
import chardet

# A Cyrillic name that was decoded with the wrong legacy code page (mojibake).
s = "Ìèñíèê, Íèêîëàé Áîðèñîâè÷"

# Two byte-level views of the same text: Python's default str.encode()
# (UTF-8) and an explicit Latin-1 encoding.
bytes_default = s.encode()
bytes_latin = s.encode('latin-1')

# Ask chardet to guess the encoding of each byte string in turn.
for candidate_bytes in (bytes_default, bytes_latin):
    print(chardet.detect(candidate_bytes))

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}
{'encoding': 'windows-1251', 'confidence': 0.99, 'language': 'Russian'}


In [2]:
from chardet.universaldetector import UniversalDetector

# Sample whose punctuation marks show up as replacement characters.
s = "writer�s �nothing� virtue�"
utf8_bytes = s.encode()

# Incremental chardet API: feed the bytes, close the detector, read the result.
detector = UniversalDetector()
detector.feed(utf8_bytes)
detector.close()
print(detector.result)

{'encoding': 'utf-8', 'confidence': 0.938125, 'language': ''}


In [4]:
from chardet.universaldetector import UniversalDetector

# Same detection steps, this time on a doubly garbled French word.
detector = UniversalDetector()
double_garbled_bytes = "ÃƒÂ©chÃƒÂ©ancier".encode()
detector.feed(double_garbled_bytes)
detector.close()
print(detector.result)

{'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}


# Meet `whatever_disentangler` — a brute-force disentangler for legacy encodings

* It is available both as a self-hosted web service and as a package (an official release is hopefully coming soon!)
* It supports [as many encodings as Python itself](https://docs.python.org/3/library/codecs.html#standard-encodings)

In [1]:
from whatever_disentangler import whatever_disentangler as wd

# Endpoint of the homemade REST API used by the remote disentangler.
REMOTE_ENDPOINT = 'https://crac.ovh/fix_legacy_encoding'

# Offline disentangler.
disentangler = wd.Disentangler()

# Remote disentangler: calls the REST API at REMOTE_ENDPOINT.
remote_disentangler = wd.RemoteDisentangler(endpoint=REMOTE_ENDPOINT)

#### Use case: When you already know what the expected (disentangled) string looks like

In [2]:
expected_str = "Gocławski"
garbled_str = "GocÅ‚awski"

print(f"Result by {type(remote_disentangler).__name__}:")
response_obj = await remote_disentangler.fetch_response(str_to_fix=garbled_str, expected_str=expected_str)
remote_disentangler.flatten_legibly(response_obj)
print()
print(f"Result by {type(disentangler).__name__}:")
result_generator = disentangler.disentangle(str_to_fix=garbled_str, expected_str=expected_str)
disentangler.flatten_legibly(result_generator)

Result by RemoteDisentangler:
'GocÅ‚awski' ('cp1252') -> '[32mGocławski[0m' ('utf_8')
'GocÅ‚awski' ('cp1254') -> '[32mGocławski[0m' ('utf_8')
'GocÅ‚awski' ('cp1257') -> '[32mGocławski[0m' ('utf_8')
'GocÅ‚awski' ('cp1258') -> '[32mGocławski[0m' ('utf_8')

Result by Disentangler:
'GocÅ‚awski' ('cp1252') -> '[32mGocławski[0m' ('utf_8')
'GocÅ‚awski' ('cp1252') -> '[32mGocławski[0m' ('utf_8_sig')
'GocÅ‚awski' ('cp1254') -> '[32mGocławski[0m' ('utf_8')
'GocÅ‚awski' ('cp1254') -> '[32mGocławski[0m' ('utf_8_sig')
'GocÅ‚awski' ('cp1257') -> '[32mGocławski[0m' ('utf_8')
'GocÅ‚awski' ('cp1257') -> '[32mGocławski[0m' ('utf_8_sig')
'GocÅ‚awski' ('cp1258') -> '[32mGocławski[0m' ('utf_8')
'GocÅ‚awski' ('cp1258') -> '[32mGocławski[0m' ('utf_8_sig')


In [3]:
expected_str = "Contrôle"
garbled_str = "ContrÃ´le"

print(f"Result by {type(remote_disentangler).__name__}:")
response_obj = await remote_disentangler.fetch_response(str_to_fix=garbled_str, expected_str=expected_str)
remote_disentangler.flatten_legibly(response_obj)
print()
print(f"Result by {type(disentangler).__name__}:")
result_generator = disentangler.disentangle(str_to_fix=garbled_str, expected_str=expected_str)
disentangler.flatten_legibly(result_generator)

Result by RemoteDisentangler:
'ContrÃ´le' ('cp1252') -> '[32mContrôle[0m' ('utf_8')
'ContrÃ´le' ('cp1254') -> '[32mContrôle[0m' ('utf_8')
'ContrÃ´le' ('latin_1') -> '[32mContrôle[0m' ('utf_8')
'ContrÃ´le' ('iso8859_4') -> '[32mContrôle[0m' ('utf_8')
'ContrÃ´le' ('iso8859_9') -> '[32mContrôle[0m' ('utf_8')

Result by Disentangler:
'ContrÃ´le' ('cp1252') -> '[32mContrôle[0m' ('utf_8')
'ContrÃ´le' ('cp1252') -> '[32mContrôle[0m' ('utf_8_sig')
'ContrÃ´le' ('cp1254') -> '[32mContrôle[0m' ('utf_8')
'ContrÃ´le' ('cp1254') -> '[32mContrôle[0m' ('utf_8_sig')
'ContrÃ´le' ('latin_1') -> '[32mContrôle[0m' ('utf_8')
'ContrÃ´le' ('latin_1') -> '[32mContrôle[0m' ('utf_8_sig')
'ContrÃ´le' ('iso8859_4') -> '[32mContrôle[0m' ('utf_8')
'ContrÃ´le' ('iso8859_4') -> '[32mContrôle[0m' ('utf_8_sig')
'ContrÃ´le' ('iso8859_9') -> '[32mContrôle[0m' ('utf_8')
'ContrÃ´le' ('iso8859_9') -> '[32mContrôle[0m' ('utf_8_sig')


In [4]:
expected_str = 'Мисник, Николай Борисович'
garbled_str = 'Ìèñíèê, Íèêîëàé Áîðèñîâè÷'

print(f"Result by {type(remote_disentangler).__name__}:")
response_obj = await remote_disentangler.fetch_response(str_to_fix=garbled_str, expected_str=expected_str)
remote_disentangler.flatten_legibly(response_obj)
print()
print(f"Result by {type(disentangler).__name__}:")
result_generator = disentangler.disentangle(str_to_fix=garbled_str, expected_str=expected_str)
disentangler.flatten_legibly(result_generator)

Result by RemoteDisentangler:
Got an empty response [] (list) for the request 'http://localhost:3000/fix_legacy_encoding?str_to_fix=Ìèñíèê,+Íèêîëàé+Áîðèñîâè÷&expected_str=Мисник,+Николай+Борисович&recursivity_depth=1'

Result by Disentangler:
'Ìèñíèê, Íèêîëàé Áîðèñîâè÷' ('cp1252') -> '[32mМисник, Николай Борисович[0m' ('cp1251')
'Ìèñíèê, Íèêîëàé Áîðèñîâè÷' ('cp1252') -> '[32mМисник, Николай Борисович[0m' ('kz1048')
'Ìèñíèê, Íèêîëàé Áîðèñîâè÷' ('cp1252') -> '[32mМисник, Николай Борисович[0m' ('ptcp154')
'Ìèñíèê, Íèêîëàé Áîðèñîâè÷' ('latin_1') -> '[32mМисник, Николай Борисович[0m' ('cp1251')
'Ìèñíèê, Íèêîëàé Áîðèñîâè÷' ('latin_1') -> '[32mМисник, Николай Борисович[0m' ('kz1048')
'Ìèñíèê, Íèêîëàé Áîðèñîâè÷' ('latin_1') -> '[32mМисник, Николай Борисович[0m' ('ptcp154')
'Ìèñíèê, Íèêîëàé Áîðèñîâè÷' ('iso8859_15') -> '[32mМисник, Николай Борисович[0m' ('cp1251')
'Ìèñíèê, Íèêîëàé Áîðèñîâè÷' ('iso8859_15') -> '[32mМисник, Николай Борисович[0m' ('kz1048')
'Ìèñíèê, Íèêîëàé Áîðèñîâ

#### Use case: When you know which encodings you want to try, even without knowing what the expected string looks like

In [5]:
# Encodings to try pairwise as source and target.
selected_encodings = [
    'utf_8',
    'cp1252',
    'cp1251',
]

# NOTE: despite its name, this is a list of garbled samples, not a single string.
garbled_str = [
    "GocÅ‚awski",
    "ContrÃ´le",
    'Ìèñíèê, Íèêîëàé Áîðèñîâè÷',
    "ÃƒÂ©chÃƒÂ©ancier",
]

In [8]:
print(f"Result by {type(disentangler).__name__}:", end='\n\n')

# For each sample, try every ordered pair of distinct encodings offline.
for bad_str in garbled_str:
    for enc_from in selected_encodings:
        for enc_to in selected_encodings:
            # Re-encoding into the same codec is skipped.
            if enc_from != enc_to:
                result_generator = disentangler.disentangle(
                    str_to_fix=bad_str,
                    encoding_from=enc_from,
                    encoding_to=enc_to,
                )
                disentangler.flatten_legibly(result_generator)
    # Blank line between samples.
    print()

Result by Disentangler:

'GocÅ‚awski' ('utf_8') -> 'GocÃ…â€šawski' ('cp1252')
'GocÅ‚awski' ('utf_8') -> 'GocГ…вЂљawski' ('cp1251')
'GocÅ‚awski' ('cp1252') -> 'Gocławski' ('utf_8')
'GocÅ‚awski' ('cp1252') -> 'GocЕ‚awski' ('cp1251')

'ContrÃ´le' ('utf_8') -> 'ContrÃƒÂ´le' ('cp1252')
'ContrÃ´le' ('utf_8') -> 'ContrГѓВґle' ('cp1251')
'ContrÃ´le' ('cp1252') -> 'Contrôle' ('utf_8')
'ContrÃ´le' ('cp1252') -> 'ContrГґle' ('cp1251')

'Ìèñíèê, Íèêîëàé Áîðèñîâè÷' ('utf_8') -> 'ÃŒÃ¨Ã±Ã­Ã¨Ãª, Ã�Ã¨ÃªÃ®Ã«Ã Ã© Ã�Ã®Ã°Ã¨Ã±Ã®Ã¢Ã¨Ã·' ('cp1252')
'Ìèñíèê, Íèêîëàé Áîðèñîâè÷' ('utf_8') -> 'ГЊГЁГ±Г­ГЁГЄ, ГЌГЁГЄГ®Г«Г Г© ГЃГ®Г°ГЁГ±Г®ГўГЁГ·' ('cp1251')
'Ìèñíèê, Íèêîëàé Áîðèñîâè÷' ('cp1252') -> '������, ������� ���������' ('utf_8')
'Ìèñíèê, Íèêîëàé Áîðèñîâè÷' ('cp1252') -> 'Мисник, Николай Борисович' ('cp1251')

'ÃƒÂ©chÃƒÂ©ancier' ('utf_8') -> 'ÃƒÆ’Ã‚Â©chÃƒÆ’Ã‚Â©ancier' ('cp1252')
'ÃƒÂ©chÃƒÂ©ancier' ('utf_8') -> 'ГѓЖ’Г‚В©chГѓЖ’Г‚В©ancier' ('cp1251')
'ÃƒÂ©chÃƒÂ©ancier' ('cp1252') -> 'Ã©chÃ©ancier' ('utf_8')
'ÃƒÂ©ch

In [10]:
print(f"Result by {type(remote_disentangler).__name__}:", end='\n\n')
for bad_str in garbled_str:
    for enc_from in selected_encodings:
        for enc_to in selected_encodings:
            if enc_from == enc_to:
                continue
            response_obj = await remote_disentangler.fetch_response(str_to_fix=bad_str, encoding_from=enc_from, encoding_to=enc_to)
            remote_disentangler.flatten_legibly(response_obj)
    print()

Result by RemoteDisentangler:

'GocÅ‚awski' ('utf_8') -> 'GocÃ…â€šawski' ('cp1252')
'GocÅ‚awski' ('utf_8') -> 'GocГ…вЂљawski' ('cp1251')
'GocÅ‚awski' ('cp1252') -> 'Gocławski' ('utf_8')
'GocÅ‚awski' ('cp1252') -> 'GocЕ‚awski' ('cp1251')
Got an empty response [] (list) for the request 'http://localhost:3000/fix_legacy_encoding?str_to_fix=GocÅ‚awski&encoding_from=cp1251&encoding_to=utf_8&recursivity_depth=1'
Got an empty response [] (list) for the request 'http://localhost:3000/fix_legacy_encoding?str_to_fix=GocÅ‚awski&encoding_from=cp1251&encoding_to=cp1252&recursivity_depth=1'

'ContrÃ´le' ('utf_8') -> 'ContrÃƒÂ´le' ('cp1252')
'ContrÃ´le' ('utf_8') -> 'ContrГѓВґle' ('cp1251')
'ContrÃ´le' ('cp1252') -> 'Contrôle' ('utf_8')
'ContrÃ´le' ('cp1252') -> 'ContrГґle' ('cp1251')
Got an empty response [] (list) for the request 'http://localhost:3000/fix_legacy_encoding?str_to_fix=ContrÃ´le&encoding_from=cp1251&encoding_to=utf_8&recursivity_depth=1'
Got an empty response [] (list) for the request

#### Use case: A tough case that needs two-step disentangling

Use a recursive search (`recursivity_depth=2`) to explore one level deeper.

In [11]:
# Target text for the two-step example.
expected_str = "échéancier"
# Its doubly garbled form, used as the input to fix.
garbled_str = "ÃƒÂ©chÃƒÂ©ancier"

In [12]:
print(f"Result by {type(remote_disentangler).__name__}:", end='\n\n')
response_obj = await remote_disentangler.fetch_response(str_to_fix=garbled_str, expected_str=expected_str, encoding_to='utf_8', recursivity_depth=2)
remote_disentangler.flatten_legibly(response_obj)

Result by RemoteDisentangler:

'ÃƒÂ©chÃƒÂ©ancier' ('cp850') -> 'ǟ��chǟ��ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('cp858') -> 'ǟ��chǟ��ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('cp1252') -> 'Ã©chÃ©ancier' ('utf_8')
    -> 'Ã©chÃ©ancier' ('cp1252') -> '[32méchéancier[0m' ('utf_8')
    -> 'Ã©chÃ©ancier' ('cp1254') -> '[32méchéancier[0m' ('utf_8')
    -> 'Ã©chÃ©ancier' ('latin_1') -> '[32méchéancier[0m' ('utf_8')
    -> 'Ã©chÃ©ancier' ('iso8859_9') -> '[32méchéancier[0m' ('utf_8')
    -> 'Ã©chÃ©ancier' ('iso8859_14') -> '[32méchéancier[0m' ('utf_8')
    -> 'Ã©chÃ©ancier' ('iso8859_15') -> '[32méchéancier[0m' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('cp1254') -> 'Ã©chÃ©ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('gb18030') -> '�0�1�0�6�0�0�0�8ch�0�1�0�6�0�0�0�8ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('mac_iceland') -> '���ch���ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('mac_roman') -> '���ch���ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('mac_turkish') -> '���ch���ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('utf_32') -> '�

In [13]:
print(f"Result by {type(disentangler).__name__}:", end='\n\n')

# Offline search with recursivity_depth=2, targeting UTF-8.
result_generator = disentangler.disentangle(
    str_to_fix=garbled_str,
    expected_str=expected_str,
    encoding_to='utf_8',
    recursivity_depth=2,
)
disentangler.flatten_legibly(result_generator)

Result by Disentangler:

'ÃƒÂ©chÃƒÂ©ancier' ('cp850') -> 'ǟ��chǟ��ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('cp858') -> 'ǟ��chǟ��ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('cp1252') -> 'Ã©chÃ©ancier' ('utf_8')
    -> 'Ã©chÃ©ancier' ('cp1252') -> '[32méchéancier[0m' ('utf_8')
    -> 'Ã©chÃ©ancier' ('cp1254') -> '[32méchéancier[0m' ('utf_8')
    -> 'Ã©chÃ©ancier' ('latin_1') -> '[32méchéancier[0m' ('utf_8')
    -> 'Ã©chÃ©ancier' ('iso8859_9') -> '[32méchéancier[0m' ('utf_8')
    -> 'Ã©chÃ©ancier' ('iso8859_14') -> '[32méchéancier[0m' ('utf_8')
    -> 'Ã©chÃ©ancier' ('iso8859_15') -> '[32méchéancier[0m' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('cp1254') -> 'Ã©chÃ©ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('gb18030') -> '�0�1�0�6�0�0�0�8ch�0�1�0�6�0�0�0�8ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('mac_iceland') -> '���ch���ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('mac_roman') -> '���ch���ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('mac_turkish') -> '���ch���ancier' ('utf_8')
'ÃƒÂ©chÃƒÂ©ancier' ('utf_32') -> '��  �  

#### Still, sometimes you'll meet an unresolvable case:

In [14]:
garbled_str = "writer�s �nothing� virtue�"
expected_str = "writer’s ‘nothing’ virtue—"

print(f"Result by {type(remote_disentangler).__name__}:", end='\n\n')
response_obj = await remote_disentangler.fetch_response(str_to_fix=garbled_str, expected_str=expected_str, encoding_to='utf_8', recursivity_depth=2)
remote_disentangler.flatten_legibly(response_obj)
print()
print(f"Result by {type(disentangler).__name__}:", end='\n\n')
result_generator = disentangler.disentangle(str_to_fix=garbled_str, expected_str=expected_str, encoding_to='utf_8', recursivity_depth=2)
disentangler.flatten_legibly(result_generator)

Result by RemoteDisentangler:

'writer�s �nothing� virtue�' ('gb18030') -> 'writer�1�7s �1�7nothing�1�7 virtue�1�7' ('utf_8')
'writer�s �nothing� virtue�' ('utf_32') -> '��  w   r   i   t   e   r   ��  s       ��  n   o   t   h   i   n   g   ��      v   i   r   t   u   e   ��  ' ('utf_8')
'writer�s �nothing� virtue�' ('utf_32_be') -> '   w   r   i   t   e   r  ��   s      ��   n   o   t   h   i   n   g  ��       v   i   r   t   u   e  ��' ('utf_8')
'writer�s �nothing� virtue�' ('utf_32_le') -> 'w   r   i   t   e   r   ��  s       ��  n   o   t   h   i   n   g   ��      v   i   r   t   u   e   ��  ' ('utf_8')
'writer�s �nothing� virtue�' ('utf_16') -> '��w r i t e r ��s   ��n o t h i n g ��  v i r t u e ��' ('utf_8')
'writer�s �nothing� virtue�' ('utf_16_be') -> ' w r i t e r�� s  �� n o t h i n g��   v i r t u e��' ('utf_8')
'writer�s �nothing� virtue�' ('utf_16_le') -> 'w r i t e r ��s   ��n o t h i n g ��  v i r t u e ��' ('utf_8')
'writer�s �nothing� virtue�' ('utf_7') -> 'writer+//