In [1]:
import pandas as pd 

In [2]:
# Encodings of interest for the comparison.
codings = [
    'utf-8',
    'utf-16',
    'gbk',
]
# Example character whose encoded forms are inspected.
char = '工'

In [3]:
class UnicodeDetector(object):
    ''' Detector of Unicode

    Encodes an Example Char in every coding of interest and tabulates the
    chars obtained by replacing the last encoded byte with each value 0..255.
    '''

    # Codings used when the caller does not supply any.
    DEFAULT_CODINGS = ('utf-8', 'utf-16', 'gbk')

    def __init__(self, codings=None):
        ''' The initializer of the Detector

        Args:
        - @self: Self instance;
        - @codings: The codings of interests, default value is None which
                    selects DEFAULT_CODINGS.
        '''
        if codings is None:
            codings = self.DEFAULT_CODINGS
        # Keep a private copy so instances never share (or mutate) one list.
        self.codings = list(codings)

    def set_char(self, char):
        ''' Setup Example Char

        Prints one "<coding> -> <bytes>" line per coding as a side effect.

        Args:
        - @self: Self instance;
        - @char: The Example Char to be detected.

        Returns:
        - @char: The Example Char;
        - @char_table: The Unicode of the Example Char in the coding of interests.

        Raises:
        - UnicodeEncodeError: If the char can not be encoded in one of the codings.
        '''
        char_table = dict()
        # Use the codings of this instance, not the module-level list, so the
        # table always has an entry for every coding get_neighbors may select.
        for c in self.codings:
            e = char.encode(c)
            print(c, '->', e)
            char_table[c] = e

        self.char = char
        self.char_table = char_table

        return char, char_table

    def get_neighbors(self, coding=None):
        ''' Get neighbors of the Example Char

        The set_char method has to be called first.

        Args:
        - @self: Self instance;
        - @coding: The coding of interest, default value is None.
                   Any value not in self.codings falls back to self.codings[0].

        Returns:
        - @df: The neighbors DataFrame of the Example Char using the selected Coding.
               It is 16 x 16, the cell at (row, column) refers to the last byte
               being replaced by row * 16 + column, and the bytes that can not
               be decoded give a missing value.
        '''
        # Setup the default coding
        if coding not in self.codings:
            coding = self.codings[0]

        # Get the neighbors and save them into a 256 length array [lst]
        bs = list(self.char_table[coding])
        lst = []
        for j in range(256):
            # Only the last byte varies, the leading bytes stay untouched
            bs[-1] = j
            try:
                c = bytes(bs).decode(coding)
            except UnicodeDecodeError:
                # Not every byte sequence is valid in the coding
                c = None
            lst.append(c)

        # Convert the [lst] into the DataFrame
        tmp = pd.DataFrame(lst)
        df = pd.DataFrame(tmp.to_numpy().reshape((16, 16)))

        self.coding = coding
        self.df = df

        return df
    
# Build a detector with its default codings and register the example char.
# set_char prints one "<coding> -> <bytes>" line per coding and returns
# (char, char_table); as the cell's last expression that tuple is echoed.
ud = UnicodeDetector()
ud.set_char(char)

utf-8 -> b'\xe5\xb7\xa5'
utf-16 -> b'\xff\xfe\xe5]'
gbk -> b'\xb9\xa4'


('工',
 {'utf-8': b'\xe5\xb7\xa5', 'utf-16': b'\xff\xfe\xe5]', 'gbk': b'\xb9\xa4'})

In [4]:
# Neighbor table for the first configured coding ("utf-8" in the run below).
# get_neighbors stores its result on ud.coding / ud.df, which are read next.
ud.get_neighbors(ud.codings[0])
print('The Neighbor Table of "{}" in "{}"'.format(ud.char, ud.coding))
# Trailing expression so the notebook renders the 16 x 16 table.
ud.df

The Neighbor Table of "工" in "utf-8"


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,
5,,,,,,,,,,,,,,,,
6,,,,,,,,,,,,,,,,
7,,,,,,,,,,,,,,,,
8,巀,巁,巂,巃,巄,巅,巆,巇,巈,巉,巊,巋,巌,巍,巎,巏
9,巐,巑,巒,巓,巔,巕,巖,巗,巘,巙,巚,巛,巜,川,州,巟


In [5]:
# Neighbor table for the second configured coding ("utf-16" in the run below).
# get_neighbors stores its result on ud.coding / ud.df, which are read next.
ud.get_neighbors(ud.codings[1])
print('The Neighbor Table of "{}" in "{}"'.format(ud.char, ud.coding))
# Trailing expression so the notebook renders the 16 x 16 table.
ud.df

The Neighbor Table of "工" in "utf-16"


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,å,ǥ,˥,ϥ,ӥ,ץ,ۥ,ߥ,ࣥ,৥,૥,௥,೥,෥,໥,࿥
1,ქ,ᇥ,ዥ,Ꮵ,ᓥ,ᗥ,ᛥ,៥,ᣥ,᧥,᫥,ᯥ,᳥,ᷥ,ụ,ῥ
2,⃥,⇥,⋥,⏥,ⓥ,◥,⛥,⟥,⣥,⧥,⫥,⯥,⳥,ⷥ,⻥,⿥
3,ュ,㇥,㋥,㏥,㓥,㗥,㛥,㟥,㣥,㧥,㫥,㯥,㳥,㷥,㻥,㿥
4,䃥,䇥,䋥,䏥,䓥,䗥,䛥,䟥,䣥,䧥,䫥,䯥,䳥,䷥,以,俥
5,僥,凥,勥,句,哥,嗥,囥,埥,壥,姥,嫥,寥,峥,工,廥,忥
6,惥,懥,拥,揥,擥,日,曥,查,棥,槥,櫥,毥,泥,淥,滥,濥
7,烥,燥,狥,珥,瓥,痥,盥,知,磥,秥,童,篥,糥,緥,绥,翥
8,胥,臥,若,菥,蓥,藥,蛥,蟥,裥,觥,諥,该,賥,跥,軥,迥
9,郥,釥,鋥,鏥,铥,闥,雥,韥,飥,駥,髥,鯥,鳥,鷥,黥,鿥


In [6]:
# Neighbor table for the third configured coding ("gbk" in the run below).
# get_neighbors stores its result on ud.coding / ud.df, which are read next.
ud.get_neighbors(ud.codings[2])
print('The Neighbor Table of "{}" in "{}"'.format(ud.char, ud.coding))
# Trailing expression so the notebook renders the 16 x 16 table.
ud.df

The Neighbor Table of "工" in "gbk"


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,
4,笯,笰,笲,笴,笵,笶,笷,笹,笻,笽,笿,筀,筁,筂,筃,筄
5,筆,筈,筊,筍,筎,筓,筕,筗,筙,筜,筞,筟,筡,筣,筤,筥
6,筦,筧,筨,筩,筪,筫,筬,筭,筯,筰,筳,筴,筶,筸,筺,筼
7,筽,筿,箁,箂,箃,箄,箆,箇,箈,箉,箊,箋,箌,箎,箏,
8,箑,箒,箓,箖,箘,箙,箚,箛,箞,箟,箠,箣,箤,箥,箮,箯
9,箰,箲,箳,箵,箶,箷,箹,箺,箻,箼,箽,箾,箿,節,篂,篃
