In [None]:
<p style="text-align:center">
    <a href="https://https://github.com/kristajan/UTF8_Utilities" target="_blank">
    <img src=".png" width="300" alt="Skills Network Logo">
    </a>
</p>

# https://github.com/kristajan/UTF8_Utilities
These utilities help to understand why a utf-8 file can't be read, and provides
functions to convert to a readable approximation of the file.  The problem is caused
when Python's encoder/decoder is missing some byte sequences that are new or used
internationally, but not recognized yet in the USA.


In [None]:
# Problem: Some files are mostly utf8 but have some sequences
# that are not readable with utf8 decoders.
# The functions below try to change the sequences into newlines or - or quotes, etc
# in 2 steps, 1. printplaintext() and 2. fixnonreadables()
# Then there are optional functions to split long lines, clean up, and
# write a file with just those line numbers that have unreadable characters.
#
# There are 3 ways to run:
# 1. a. In cell # [6] marked __main__ set the input file name, limit, 
#       limitnbrlines, and MAXFILESIZE.
#    b. Run all the cells above it (1-5) and it.
#    c. Look at the files with .txt appended to your file name in your directory.
#
# 2. a. In the cell # [9] set your input file name at the bottom and run readdecode() with it.
#    b1. Look at the output comments.  Add any encodings you want to try,
#    b2. or Write a line to run runfixundecodables() with your file name.
#    c. Look at the files with .txt appended to your file name in your directory.
#
# 3. You can run just the first few functions, skip splitlonglines()
#    and then run cleanexcessblanklines(), after setting input file name, limit, 
#    limitnbrlines, and MAXFILESIZE.
#
# 4. At the end of this file, see findbadbytes() and writelineandbytenbrs()
#
# Security is helped by printing os.path.basename(file) instead of full path name
#                       not having unvalidated user input in print statements
#                       checking os.path.getsize(file) for a file > MAXFILESIZE Gb
#                       For longer files, change MAXFILESIZE below to a larger number.
#
# OVER_RIDE in CELL # [6] Globally used variables:  OVER-RIDE in CELL # [6]
if __name__ == '__main__':
    MAXFILESIZE = 2 * 1024 * 1024 * 1024  # 2 Gb
    ENCODER = 'utf-8'
    infile = 'tests.txt'
    limit = 75
    limitnbrlines = 1000000  # for testing to limit number of lines processed

import datetime
def prnow():  # shortcut
    print(datetime.datetime.now())


In [165]:
# convert the byte lines to strings
# that is, when a non-readable character is encountered, 
# output it like \\u2010 or \\xe2\\x80\\x90 etc
# the str() function does this, how nice
def printplaintext(infile):
    import os
    file_size = os.path.getsize(infile)
    if file_size > ( MAXFILESIZE):
        raise IOError("printplaintext(): IOError File " + os.path.basename(infile) + " is over ", MAXFILESIZE)
        return
    outfile = f"{infile}.txt"
    plainfile = os.path.join(os.getcwd(), outfile)
    lnnbr = 0
    limitlines = limitnbrlines  # can change this for testing
    print(f"printplaintext: {infile=} {limitlines=}")
    try:
        #print("printplaintext: trying open 'rb'")
        with open(infile, 'rb') as fin:
            # count number of lines in infile and then seek back to beginning
            line_count = sum(1 for line in fin)
            fin.seek(0,0)
            print(f"printplaintext: {line_count=}")
            with open(plainfile, 'w+') as fout:
                while(lnnbr < limitlines) and (lnnbr <= line_count):
                    lnnbr += 1  # in this case, increment before trying
                    try:
                        ln = fin.readline()
                        if len(ln) == 0:
                            print(f"len(ln) is 0, so break/done, {lnnbr=}")
                            break
                        # When we do readline(), each line has an appended newline.
                        # But because we read binary, it becomes 2 chars \ and n
                        # So we need to append a real newline since we will
                        # readline() utf8 in the next function.
                        fout.write(str(ln) + '\n')
                    except Exception as e:
                        print(f"printplaintext: Exception {e=}")
                        return
    except Exception as e:
        print(f"printplaintext: Exception {e=}")
    print('printplaintext: plainfile name is ', os.path.basename(outfile))
    return(plainfile)


In [166]:
# Read the plaintext file which has 4-7 character \xdd and \\udddd
# and replace them with estimates of what they mean.
def do_subs(text, replacements):
    # a safe way to do 1 regex at a time use a for loop, faster than many other ways
    import re
    cnt = 0
    for old, new in replacements.items():
        cnt = cnt + 1
        text = re.sub(old, new, text)
    return text

# infile is expected to be from printplaintext()
def fixnonreadables(infile):
    import os
    import re
    file_size = os.path.getsize(infile)
    if file_size > (MAXFILESIZE):
        raise IOError("fixnonreadables(): IOError File " + os.path.basename(infile) + " is over ", MAXFILESIZE)
        return
    lnnbr = 0
    limitlines = limitnbrlines  # CHANGE THIS FOR LONGER FILES
    #print(f"fixnonreadables: {infile=} {limitlines=}")
    outfile = f"{infile}.txt"
    fixedfile = os.path.join(os.getcwd(), outfile)
    # trim off binary stuff
    replacements = {r"^b'" : "" ,
        r'^b"' : "" ,
        r"\\n'$" : "" ,
        r'\\n"$' : '' ,
        r'\\\\n' : '' ,
        r"\\xe2\\x80\\x9[0-5]" : "-" ,
        r"\\xe2\\x80\\x9[6]" : "|" ,
        r"\\xe2\\x80\\x9[7]" : "_" ,
        r"\\xe2\\x80\\x9[89ab]" : "'" ,
        r"\\xe2\\x80\\x9[cdef]" : '"' ,
        r"\\xe2\\x80\\x[0-9]{2}" : '\n' ,
        r"\\xc2\\xa9" : "CR" ,  # Copyright
        r"\\x1b\[C\\x1b" : "SECTN" ,  # Section header, #1b is an escapt
        r"\\x16" : "-" ,
        r"\\x[0-9a-fA-F]{2}" : "?"  ,
        r"\\\\u201[012345]" : "-" ,
        r"\\\\u201[6]" : "|" ,
        r"\\\\u201[7]" : "_" ,
        r"\\\\u201[89ab]" : "'" ,
        r"\\\\u201[cdef]" : '"'  ,
        r"\\\\u[0-9a-fA-F]{8}" : "?" ,
        r"\\\\u[0-9a-fA-F]{4}" : "?" ,
        r"\\\\u[0-9a-fA-F]{3}" : "?" ,
        r"\\r" : '' ,
        r"\\" : '' }
    try:
        with open(infile, 'r', encoding=ENCODER) as fin:
            # count number of lines in infile and then seek back to beginning
            line_count = sum(1 for line in fin)
            fin.seek(0,0)
            print(f"fixnonreadables: {line_count=} {limitlines=}")
            with open(fixedfile, 'w+') as fout:
                while(lnnbr < limitlines) & (lnnbr <= line_count):
                    lnnbr += 1  # in this case, increment just for trying for safety
                    try:
                        ln = fin.readline()
                        newln = str(ln)  # Make it 1 long string so re can operate on it
                        # DO SUBS HERE
                        newln = do_subs(newln, replacements)
                        fout.write(newln)  # here don't need newline since readline() has it
                    except Exception as e:
                        print(f"fixnonreadables: Exception {e=}")
                        return
                ## end while
            ## end open write
        ## end open read
    except Exception as e:
        print(f"fixnonreadables: Exception {e=}")
        return
    finally:
        print('fixnonreadables: returning file ', os.path.basename(outfile))
        return fixedfile


In [167]:
# Another problem with the transcripts is that some lines
# are very long, making it hard to read.
# So split/break very long lines into 'limit' number of characters.
# If the break is in the middle of a word, then let the line go a bit longer
# to include the whole word.  Then concatenate the remainder of the line
# that is approximately 'limit' number of chaacters onto the beginning
# of the next line, so that we don't have excess newlines.
# To find the previous word boundary would be more complicated.

# split a line, but keep the delimiters.
#https://stackoverflow.com/questions/2136556/in-python-how-do-i-split-a-string-and-keep-the-separators
def splitkeep(s, delimiter):  # Stack Overflow
   splt = s.split(delimiter)
   return [substr + delimiter for substr in splt[:-1]] + [splt[-1]]

# Split very long lines into 'limit' number of characters
# For testing or safety, in the calling code, limit the number of lines
# processed to limitnbrlines.  Here it is initialized to satisfy the global
# in the next function.
def splitlonglines(infile, limit, limitlines=limitnbrlines):
    import os
    import re
    incount = 0
    outcount = 0
    remainder = ''  # so short remaining parts of lines are concated w/ next line
    #limitlines = 10  ## for testing only process this number of lines
    #limit = 40  ## testing max length of each output line
    inputfile = os.path.basename(infile)
    print(f"splitlonglines: {inputfile=} {limit=} {limitlines=}")
    file_size = os.path.getsize(infile)
    if file_size > (MAXFILESIZE):
        raise IOError("splitlonglines(): IOError File " + os.path.basename(infile) + " is over ", MAXFILESIZE)
        return
    outfile = f"{infile}.txt"
    # Here, we could either use os.path.dirname(infile)
    # or just use current working directory
    splitfile = os.path.join(os.getcwd(), outfile)
    with open(infile, 'r', encoding=ENCODER) as f:  # Exceptions caught by system
        # count number of lines in infile and then seek back to beginning
        # to validate that the file is not too big.
        line_count = sum(1 for line in f)
        f.seek(0,0)
        print(f"splitlonglines: {line_count=}")
        inlines = f.readlines()
        print(f"splitlonglines: number of inlines = {len(inlines)=}")
    with open(splitfile, 'w+', encoding=ENCODER) as fout:  # Exceptions caught by system
        remainder = ''
        for ln in inlines:
            incount += 1
            # print(f"{len(ln)=} {incount=} {ln=}")  # for debugging short file only
            if incount > limitlines:  # for testing
                #print(f"splitlonglines: ln[limit:]={ln[limit:]=}")  # for debugging
                fout.write(remainder)  # since might have it leftover, should have \n
                outcount += 1
                break  # we are done
            # Here we will concat the remainder with the next line unless next is newline
            # This is to make it look nicer.  Otherwise there are short lines between
            # the limit length lines.
            if not ln.strip():  # test for blank line
                if len(remainder) > 1:   # it is blank
                    # write the last part of the line by itself with the new \n line
                    # print(f"{len(ln)=} {incount=} {ln=}")  # debugging
                    fout.write(remainder + ln)  # keep the blank line
                    outcount += 1
                    remainder = ''
                    continue  # next line 
                else:  # keep the blank line
                    fout.write(ln)
                    outcount += 1
                    continue  # next line
            else:  # not a blank line
                if len(remainder) > 1: 
                    # still in the loop, concat rest of line to next line
                    # This is to make it look nicer.  Otherwise there are short lines between
                    # the limit length lines.
                    ln = remainder[:-1] + ' ' + ln  # remove \n from remainder & concat
                    #print(f"{len(ln)=} {incount=} {ln=}")
                    remainder = ''
            while len(ln) > limit:  ## the max lenth of each line plus or minus 1 word
                #print(f"{len(ln)=} {incount=} {ln=}")
                match = re.search(r'[\S]+[\s]' , ln[limit:])  # word boundary and word start
                if match:
                    #print(f"splitlonglines:start={match.start()=} end={match.end()=}") #testing
                    newlim = match.end() + limit
                else:
                    newlim = limit
                #print(f"splitlonglines: {newlim=}")  # for testing short file only
                outline = ln[0:newlim]
                fout.write(outline + '\n')  # must add newline here
                outcount += 1
                #print(f"len of outline = {len(outline)=}")  # for debugging only
                #print( "splitlonglines: 0123456789 123456789 123456789 123456789 1234567890 123456789 12345678")
                #print(f"splitlonglines: {outline}")
                ln = ln[newlim:]  # cut out the matched part of the line
            remainder = ln  # we will concat to front of next line, might seem to throw off counts
        if len(remainder) > 0:  # last part of text in the file
            fout.write(remainder)  # here there is still a newline at the end already
            outcount += 1
    print(f"splitlonglines: {incount=} {outcount=} approximate, may reflect some joins")
    print("splitlonglines: returning output file ", os.path.basename(outfile))
    fout.close()
    return splitfile


In [168]:
# Some of converting unreadable bytes may have left blank lines
# so remove most of them.
def cleanexcessblanklines(infile):
    import os
    import re
    file_size = os.path.getsize(infile)
    if file_size > (MAXFILESIZE):
        raise IOError("cleanexcessblanklines(): IOError File " + os.path.baename(infile) + " is over ", MAXFILESIZE)
        return
    lnnbr = 0
    outcount = 0
    # calling code should set limitnbrlines as a global.
    limitlines = limitnbrlines  # make this short for testing
    ln = ''
    outfile = f"{infile}.txt"
    cleanfile = os.path.join(os.getcwd(), outfile)
    try:
        with open(infile, 'r', encoding=ENCODER) as fin:
            line_count = sum(1 for line in fin)  # count number of lines and seek back to beginning
            fin.seek(0,0)
            print(f"cleanexcessblanklines: {line_count=} {limitlines=} set by calling code!!!")
            lastlineblank = 0  # a flag
            with open(cleanfile, 'w+') as fout:
                while(lnnbr <= limitlines) & (lnnbr <= line_count):
                    lnnbr += 1  # in this case, increment just for trying for safety
                    if lnnbr > limitlines:
                        print('cleanexcessblanklines: done limitlines = ', limitlines)
                        break  # done
                    try:
                        ln = fin.readline()
                        newln = str(ln)  # must be string so re can operate on it or strip
                        if not newln.strip():  # still a blank line
                            if lastlineblank == 1:  # skip it
                                continue
                            else:  # write it and set flag
                                fout.write(newln)
                                outcount += 1
                                lastlineblank = 1
                        else:
                            fout.write(newln)
                            outcount += 1
                            lastlineblank = 0
                    except Exception as e:
                        print(f"cleanexcessblanklines: Exception {e=}")
                        return
    except Exception as e:
        print(f"cleanexcessblanklines: Exception {e=}")
    print(f"cleanexcessblanklines: {lnnbr=} {outcount=} returning file ", os.path.basename(outfile))
    return cleanfile


In [169]:
# After printplaintext() run this with file.txt.txt to create file 
# with backslash lines.
# Then verify that the overall program got rid of nonreadable characters
# using file.txt.txt.txt as the infile.  The result should have 0 lines.
def findbackslash(infile):
    import re
    import os
    file_size = os.path.getsize(infile)
    filename = os.path.basename(infile)
    if file_size > (MAXFILESIZE):
        raise IOError("cleanexcessblanklines(): IOError File " + filename + " is over ", MAXFILESIZE)
        return
    incount = 0
    with open(infile, 'r') as fin:
        outname = f"{filename}Backslash.txt"
        outfile = os.path.join(os.getcwd(), outname)
        with open(outfile , 'w+') as fout:  # exceptions found by system
            ln = 'x'
            while len(ln) > 0:
                ln = fin.readline()
                incount += 1
                match = re.search(r'\\+[xu]' , ln)
                if match:
                    fout.write("line " + str(incount) + ":" + ln)
    fin.close()
    fout.close()
    print("findbackslash: returning file ", os.path.basename(outfile))
    return(outfile)


In [175]:
# The problem with transcripts file is unicode types of newlines, or literal backslashes
# Mostly in the range \u2010 to \u201f may also appear as \\xe2\\x80\\x9[0-0a-f]
# Other undecodable bytes are changed to newlines.
if __name__ == '__main__':
    MAXFILESIZE = 2 * 1024 * 1024 * 1024  # 2 Gb
    ENCODER = 'utf-8'
    infile = 'tests.txt'
    limit = 75
    limitnbrlines = 1000000  # for testing to limit number of lines processed

    import os
    import time
    !python --version
    prnow()
    numslist = []  # in future, this will collect err nbrs for UnicodeDecodeError if we run readdecode
    plainfile = printplaintext(infile)
    time.sleep(0.1)  # make sure each file is written in order
    fixedfile = fixnonreadables(plainfile)
    time.sleep(0.1)  # make sure each file is written in order
    splitfile = splitlonglines(fixedfile, limit, limitnbrlines)
    time.sleep(0.1)  # make sure each file is written in order
    cleanfile = cleanexcessblanklines(splitfile)
    # After printplaintext(), or that plus fixdecodeerror()
    # run findbackslash to see line numbers having backslashes
    myfile = findbackslash(plainfile)
    myfile = findbackslash(fixedfile)
    prnow()

Python 3.12.7
2025-03-07 16:39:24.781670
printplaintext: infile='tests.txt' limitlines=1000000
printplaintext: line_count=74
len(ln) is 0, so break/done, lnnbr=75
printplaintext: plainfile name is  tests.txt.txt
fixnonreadables: line_count=74 limitlines=1000000
fixnonreadables: returning file  tests.txt.txt.txt
splitlonglines: inputfile='tests.txt.txt.txt' limit=75 limitlines=1000000
splitlonglines: line_count=74
splitlonglines: number of inlines = len(inlines)=74
splitlonglines: incount=74 outcount=71 approximate, may reflect some joins
splitlonglines: returning output file  tests.txt.txt.txt.txt
cleanexcessblanklines: line_count=90 limitlines=1000000 set by calling code!!!
cleanexcessblanklines: lnnbr=91 outcount=90 returning file  tests.txt.txt.txt.txt.txt
findbackslash: returning file  tests.txt.txtBackslash.txt
findbackslash: returning file  tests.txt.txt.txtBackslash.txt
2025-03-07 16:39:25.124436


In [171]:
# We are here because readdecode failed, none of the enocdings
# that were tried could read the file
def runfixundecodable(infile):
    import time
    !python --version
    prnow()
    # unicodenewlines = u'[\u00a0\u1680\u180E\u2000\u2001\u2002\u2003\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u3000\uFEFF]'
    numslist = []  # this will collect err nbrs for UnicodeDecodeError if we do part-reads
    plainfile = printplaintext(infile)
    time.sleep(0.01)  # make sure each file is written in order
    fixedfile = fixnonreadables(plainfile)
    time.sleep(0.01)  # make sure each file is written in order
    splitfile = splitlonglines(fixedfile, limit, limitnbrlines)
    time.sleep(0.01)  # make sure each file is written in order
    cleanfile = cleanexcessblanklines(splitfile)
    # After printplaintext(), or that plus fixdecodeerror()
    # run findbackslash to see line numbers having backslashes
    myfile = findbackslash(plainfile)
    myfile = findbackslash(fixedfile)
    prnow()


In [182]:
# Problem: some files are mostly utf8 but have some sequences
# that are not readable with utf8 decoders.
# If desired, first run readdecode with your infile.
# Then the other functions will be called if non of the tried decoders works.
# Else, run as the main program which assumes fixing unreadables is needed.
# .txt is appended to each file name in the sequence.  For example, if
# infile is funnyfile.txt, then 
# This work ok, later try to read as json as well, and maybe Universal
numslist = []  # initialilze in case calling code doesn't
def readdecode(infile):
    import os, re
    #import json
    global numslist
    readok = 0
    file_size = os.path.getsize(infile)
    if file_size > (MAXFILESIZE):
        raise IOError("readdecode(): IOError File " + infile + " is over ", MAXFILESIZE)
        return
    try:
        with open(infile, 'r', encoding='utf-8') as fin:
            utf8str = fin.read()
            print(f"readdecode: SUCCESS utf-8 {len(utf8str)=}")
            decodeds = utf8str.decode('utf-8',errors='replace')
            print(f"readdecode: attempted utf8str.decode {len(decodeds)=}")
            readok += 1
            #return decodes
    except UnicodeDecodeError as e:
        print('readdecode: EXCEPTION UnicodeDecodeError utf-8\n', e)        
        nums = re.findall(r'\d+' , str(e))  # get the byte number of unreadable
        # numslist.append(nums)  # for future version
        print(f"{nums=}")
    except Exception as e:
        print('readdecode: other EXCEPTION for utf8\n', e)
    try:
        print('readdecode: try to read as cp1252')
        with open(infile,'r',encoding='cp1252') as fin:
            data = fin.read()
            print('readdecode: SUCCESS cp1252')
            readok += 1
            #return data
    except Exception as e:
        print('readdecode: EXCEPTION for cp1252\n', e)
    try:
        print('readdecode: try to read as latin-1')
        with open(infile,'r',encoding='latin=1') as fin:
            data = fin.read()
            print('readdecode: SUCCESS latin-1')
            print('readdecode: tail of data is ', data[-50:-1])
            readok += 1
            #return data
    except Exception as e:
        print('readdecode: EXCEPTION for latin-1\n', e)
    try:
        print('readdecode: try to read as utf-16')
        with open(infile, 'r', encoding='utf-16') as fin:
            utf16str = fin.read()
            print(f"readdecode: SUCCESS utf16 {len(utf16str)=}")
            readok += 1
            #return utf16str
    except Exception as e:
        print('readdecode: EXCEPTION on last try for open encoded file\n', e)
        #print('readdecode: calling fixdecodeerr')
        #return decodes  # return string
    print("readdecode: readok = ", readok)
    '''  UNCOMMENT if desired
    if readok < 1:
        runfixundecodable(infile)
    ''';
    

In [183]:
if __name__ == '__main__':
    readdecode('tests.txt')  # Comments show which simple encoding(s) work or not
    # runfixundecodable('tests.txt')  # UNCOMMENT if desired

readdecode: EXCEPTION UnicodeDecodeError utf-8
 'utf-8' codec can't decode byte 0x93 in position 1063: invalid start byte
nums=['8', '0', '93', '1063']
readdecode: try to read as cp1252
readdecode: EXCEPTION for cp1252
 'charmap' codec can't decode byte 0x81 in position 3379: character maps to <undefined>
readdecode: try to read as latin-1
readdecode: SUCCESS latin-1
readdecode: tail of data is  "ì¸" (se) = "EC 95 8C" 
"ì" (yo) = "EB 82 98" 
readdecode: try to read as utf-16
readdecode: EXCEPTION on last try for open encoded file
 'utf-16-le' codec can't decode bytes in position 2732-2733: illegal encoding
readdecode: readok =  1


In [174]:
# TEST CELL IN PROGRESS
# Problem: some files are mostly utf8 but have some sequences
# that are not readable with utf8 decoders.
#!pip install shutil
import os, re
# https://www.google.com/search?q=python3+truncate+file+before+after&num=10&sca_esv=d2412773b0b8d781&rlz=1C1UEAD_enUS1102US1102&ei=Y0zLZ8XoFubewN4P27Ll2QU&ved=0ahUKEwiFpt2y3fiLAxVmL9AFHVtZOVsQ4dUDCBA&uact=5&oq=python3+truncate+file+before+after&gs_lp=Egxnd3Mtd2l6LXNlcnAiInB5dGhvbjMgdHJ1bmNhdGUgZmlsZSBiZWZvcmUgYWZ0ZXIyBRAhGKsCSIYuUJgDWP0mcAF4AJABAJgBrQGgAZsMqgEDNi44uAEDyAEA-AEBmAIOoAKxDMICChAAGLADGNYEGEfCAgcQABiABBgNwgIGEAAYFhgewgILEAAYgAQYhgMYigXCAgUQABjvBcICCBAAGKIEGIkFwgIIEAAYgAQYogTCAgcQIRigARgKwgIFECEYnwWYAwDiAwUSATEgQIgGAZAGCJIHAzUuOaAHw1o&sclient=gws-wiz-serp
def truncate_before(filename, position):
    print(f"truncate_before:")
    with open(filename, "rb+") as file:
        file.seek(position)
        remaining_content = file.read()
        file.seek(0)
        file.write(remaining_content)
        file.truncate()

def truncate_after(filename, position):
    with open(filename, "rb+") as file:
        file.truncate(position)

### this will be a def to call
# This only locates unreadable bytes and makes no attempt to fix them
# For that use printplaintext(), fixunreadables() and cleanexcessblanklines()
def tryread(workfile):
    global numslist
    global displaynums
    global doneflag
    try:
        with open(workfile, 'r', encoding='utf-8') as fin:
            utf8str = fin.read()
            print(f"tryread: SUCCESS {os.path.basename(workfile)} {len(utf8str)=} {str(trycount)=}")
            print(f"{numslist=}")
            doneflag = 1
            return
    except UnicodeDecodeError as e:
        print('tryread: EXCEPTION UnicodeDecodeError\n', e)      
        nums = re.findall(r'\d+' , str(e))  # get the byte & position numbers of unreadable
        print(f"tryread: {nums=}")
        numy =[nums[2],nums[3]]  # list
        numslist.append(numy)
        print(f"{numslist=}")  # debugging
        # sum of past byte numbers to match original
        numz = displaynums[-1]  # accumulating sum to keep original byte numbers
        numz[1] = int(numz[1]) + int(numy[1])  # accumulating sum of positions
        numz[0] = numy[0]  # copy the offending byte
        displaynums.append([numz[0],numz[1]])
        print(f"{displaynums=}")  # debugging
        # expecting to repeat this case so leave doneflag as 0
        # fall through to truncate step
    except Exception as e:
        print(f"tryread: EXCEPTION unable to read {os.path.basename(workfile)}\n", e)
        doneflag = 1
        return
    # Truncate working file to remove the bad byte
    try:
        with open(workfile, 'rb+') as fin:
            lastnums = numslist[-1]
            lastbadbyte = int(lastnums[1])
            truncate_before(workfile,lastbadbyte+2)  # positions start with 0
            return
    except:
        print(f"tryread: cannot read {os.path.basename(workfile)} as rb+")
        doneflag = 1
        return

def findbadbytes(infile):
    import os, re
    import shutil
    global numslist
    global displaynums
    global doneflag
    global limitnbrtries
    numslist = [['BadByte', 'Position'],[0,0]]  # initialilze
    displaynums = [['BadByte', 'Position'],[0,0]]  # initialilze
    file_size = os.path.getsize(infile)
    if file_size > (MAXFILESIZE):
        raise IOError(f"findbadbytes e(): IOError {os.path.basename(infile)} is over ", MAXFILESIZE)
        doneflag = 1
        return
        # first time only copy the file to a working version
    workfile = f"{infile}WORKING"
    shutil.copy2(infile, workfile)
    limitnbrtries = 3
    doneflag = 0
    trycount = 0
    while(trycount < limitnbrtries) & (doneflag < 1):
        trycount += 1
        tryread(workfile)
if __name__ == '__main__':
    infile = "tests.txt"
    findbadbytes(infile)
    print(f"{displaynums[:-2]=}")


tryread: EXCEPTION
 'utf-8' codec can't decode byte 0x93 in position 1063: invalid start byte
tryread: nums=['8', '0', '93', '1063']
numslist=[['BadByte', 'Position'], [0, 0], ['93', '1063']]
displaynums=[['BadByte', 'Position'], ['93', 1063], ['93', 1063]]
truncate_before:
tryread: EXCEPTION
 'utf-8' codec can't decode byte 0x96 in position 376: invalid start byte
tryread: nums=['8', '0', '96', '376']
numslist=[['BadByte', 'Position'], [0, 0], ['93', '1063'], ['96', '376']]
displaynums=[['BadByte', 'Position'], ['93', 1063], ['96', 1439], ['96', 1439]]
truncate_before:
tryread: EXCEPTION
 'utf-8' codec can't decode byte 0x92 in position 13: invalid start byte
tryread: nums=['8', '0', '92', '13']
numslist=[['BadByte', 'Position'], [0, 0], ['93', '1063'], ['96', '376'], ['92', '13']]
displaynums=[['BadByte', 'Position'], ['93', 1063], ['96', 1439], ['92', 1452], ['92', 1452]]
truncate_before:
displaynums[:-1]=[['BadByte', 'Position'], ['93', 1063], ['96', 1439], ['92', 1452]]


In [181]:
# This prints the whole file with line numbers and byte numbers
# It has \n as a literal as well as \xdd and \udddd
def writelineandbytenbrs():
    global writecnt
    writecnt = 0
    lnnbr = 0
    limitlines = 300
    print(f"{limitlines=}")
    bytenbr = -1  # START with ZERO or make it 0 to start with 1
    oldbytenbr = -1
    try:
        with open(infile, 'rb') as fin:
            line_count = sum(1 for line in fin)
            print(f"writelineandbytenbrs: {line_count=}")
            fin.close()
        with open(infile, 'rb') as fin:
            with open(outfile, 'w+') as fout:
                while(lnnbr < limit) and (lnnbr < line_count):
                    try:
                        #print("trying rb readline")
                        ln = fin.readline()
                        lnnbr += 1  # in this case, set the actual line number to print
                        oldbytenbr = bytenbr
                        nextnbr = bytenbr + 1
                        bytenbr = bytenbr + len(ln)
                        outline = 'line ' + str(lnnbr) + ' byte ' + str(nextnbr) + ' ' + str(ln) + ' byte ' + str(bytenbr) + '\n'
                        fout.write(outline)
                        writecnt += 1
                    except Exception as e:
                        print(e)
                        return
    except Exception as e:
        print(f"{e=}")
    fin.close()  # not needed, but just to remember
    fout.close()
if __name__ == '__main__':
    infile = './tests.txt'
    outfile = './testsByteNumber.txt'
    writelineandbytenbrs()
    print("wrote ", writecnt, "lines to ", outfile)


limitlines=300
writelineandbytenbrs: line_count=74
wrote  74 lines to  ./testsByteNumber.txt


In [184]:
%%bash
ls


  i n d o w s   S u b s y s t e m   f o r   L i n u x   h a s   n o   i n s t a l l e d   d i s t r i b u t i o n s . 
 Y o u   c a n   r e s o l v e   t h i s   b y   i n s t a l l i n g   a   d i s t r i b u t i o n   w i t h   t h e   i n s t r u c t i o n s   b e l o w : 
 
 U s e   ' w s l . e x e   - - l i s t   - - o n l i n e '   t o   l i s t   a v a i l a b l e   d i s t r i b u t i o n s 
 a n d   ' w s l . e x e   - - i n s t a l l   < D i s t r o > '   t o   i n s t a l l . 
 

CalledProcessError: Command 'b'ls\n'' returned non-zero exit status 1.