In [None]:
# default_exp chars

# chars

> Set of functions used to preprocess french text characters.

In [None]:
#hide
from nbdev.showdoc import *

Configure the root directory of the french dataset on this computer :

In [None]:
import os
from pathlib import Path

# Root directory of the french dataset on this computer.
# Set the NLPTEXTDOC_ROOTDIR environment variable to use another location;
# when it is not set, the original hard-coded path is used.
rootdir = Path(os.environ.get(
    "NLPTEXTDOC_ROOTDIR",
    r"\\?\C:\Users\laure\OneDrive\Dev\Python\nlptextdoc\dataset 092019"))

Configure tabular data display in this notebook :

In [None]:
import pandas as pd

# Render up to 100 rows and 50 columns when a DataFrame is displayed
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 50)

# Character set normalization for french

## 1. Load Unicode character set data for latin script

In [None]:
# Directory holding the character tables (CSV files) loaded in this section
chardatadir = rootdir.joinpath("libdata", "chars")

### 1.1 Frequent encoding errors : windows1252 read as iso8859-1

In [None]:
# Table of chars seen when windows1252 text was decoded as iso8859-1
dfencodingwin1252 = pd.read_csv(
    chardatadir.joinpath("windows1252-iso8859-errors.csv"),
    sep=";",
)
dfencodingwin1252.head(10)

Unnamed: 0,Code,Char,DecodedCode,DecodedChar
0,146,,8217,’
1,128,,8364,€
2,133,,8230,…
3,150,,8211,–
4,156,,339,œ
5,149,,8226,•
6,147,,8220,“
7,148,,8221,”
8,151,,8212,—
9,145,,8216,‘


In [None]:
# Report the number of rows loaded from windows1252-iso8859-errors.csv
print(f"{len(dfencodingwin1252)} frequent encoding errors seen in french datasets : a character encoded as windows1252 was incorrectly decoded as iso8859-1")

10 frequent encoding errors seen in french datasets : a character encoded as windows1252 was incorrectly decoded as iso8859-1


Columns :
- Code/Char : incorrectly decoded control char seen in french text
- DecodedCode/DecodedChar : properly decoded char which should replace the original control char

In [None]:
# Lookup table : incorrectly decoded char => properly decoded char
win1252errorchars = {
    record["Char"]: record["DecodedChar"]
    for _, record in dfencodingwin1252.iterrows()
}

### 1.2 Frequent encoding errors : utf8 read as windows1252

In [None]:
# Table of substrings produced when UTF-8 text was decoded as windows1252
dfencodingutf8 = pd.read_csv(
    chardatadir.joinpath("utf8-windows1252-errors.csv"),
    sep=";",
)
dfencodingutf8.head()

Unnamed: 0,ErrorSubstring,DecodedCode,DecodedChar
0,â‚¬,8364,€
1,â€š,8218,‚
2,Æ’,402,ƒ
3,â€ž,8222,„
4,â€¦,8230,…


In [None]:
# Report the number of rows loaded from utf8-windows1252-errors.csv
print(f"{len(dfencodingutf8)} very unlikely substrings produced when text encoded with UTF-8 is decoded by mistake as iso8859-1 or windows1252")

117 very unlikely substrings produced when text encoded with UTF-8 is decoded by mistake as iso8859-1 or windows1252


Columns :
- ErrorSubstring : unlikely substring of length 2 or 3 characters produced when UTF-8 text is decoded by mistake as windows1252
- DecodedCode/DecodedChar : properly decoded char which should be used to replace the unlikely substring

In [None]:
# Lookup table : unlikely substring => properly decoded char
utf8errorchars = {
    record["ErrorSubstring"]: record["DecodedChar"]
    for _, record in dfencodingutf8.iterrows()
}

### 1.3 Unicode combining chars

In [None]:
# Table of base char + combining char pairs and the combined char they produce
dfcombiningchars = pd.read_csv(
    chardatadir.joinpath("combiningdiacritics.csv"),
    sep=";",
)
dfcombiningchars.head()

Unnamed: 0,BaseChar,Code,Char,Diacritic,CombinedChar
0,A,769,́,Acute,Á
1,E,769,́,Acute,É
2,I,769,́,Acute,Í
3,O,769,́,Acute,Ó
4,U,769,́,Acute,Ú


In [None]:
# Report the distinct combining chars, their diacritic names and the number of table rows
print(f"{len(dfcombiningchars['Char'].unique())} combining chars {list(dfcombiningchars['Diacritic'].unique())} should be recombined with {len(dfcombiningchars)} base latin characters to produce standard latin characters with diacritics")

12 combining chars ['Acute', 'Grave', 'Circumflex', 'Cedilla', 'Tilde', 'Diaeresis', 'Long Stroke Overlay', 'Macron', 'Caron', 'Dot Below', 'Dot Above', 'Ring Above'] should be recombined with 274 base latin characters to produce standard latin characters with diacritics


Columns :
- BaseChar : latin char encountered first in the string, which will be modified by the combining char immediately following it
- Code/Char : combining char immediately following BaseChar, which should be combined with it to produce CombinedChar
- Diacritic : type of accent / diacritic applied by the combining char
- CombinedChar : latin char with diacritic produced by the combination of BaseChar and the combining Char following it

In [None]:
# Lookup table : base char followed by combining char => combined char
combiningchars = {
    record["BaseChar"] + record["Char"]: record["CombinedChar"]
    for _, record in dfcombiningchars.iterrows()
}

### 1.4 Control chars

In [None]:
# Table of control chars which should be ignored
dfcontrolchars = pd.read_csv(
    chardatadir.joinpath("controlchars.csv"),
    sep=";",
)
# chr(0) can't be saved in CSV file : restore it in the first row after loading
dfcontrolchars.loc[0, "Char"] = chr(0)
dfcontrolchars.head()

Unnamed: 0,Code,Char,CharName
0,0,�,Char 0
1,1,,Char 1
2,2,,Char 2
3,3,,Char 3
4,4,,Char 4


In [None]:
# Report the number of rows loaded from controlchars.csv
print(f"{len(dfcontrolchars)} control chars seen in french datasets, which can't be displayed and should be ignored")

125 control chars seen in french datasets, which can't be displayed and should be ignored


Columns :
- Code : Unicode code point for the character
- Char : control character
- CharName : name of the character in the Python Unicode database

In [None]:
# Set of all control chars, used for fast membership tests
controlchars = {controlchar for controlchar in dfcontrolchars["Char"]}

### 1.5 Latin letter symbols

In [None]:
# Table of Unicode symbols representing latin letters with a specific layout
dflatinsymbols = pd.read_csv(
    chardatadir.joinpath("latinsymbols.csv"),
    sep=";",
)
dflatinsymbols.head()

Unnamed: 0,Code,Char,CharName,NormString,Layout
0,8253,‽,Interrobang,?!,
1,8265,⁉,Exclamation Question Mark,!?,
2,8448,℀,Account Of,a/c,
3,8449,℁,Addressed To The Subject,a/s,
4,8450,ℂ,Double-Struck Capital C,C,Double-Struck


In [None]:
# Report the number of rows loaded from latinsymbols.csv and the distinct Layout values
print(f"{len(dflatinsymbols)} Unicode symbols which represent latin letters with a specific layout like {list(dflatinsymbols['Layout'].unique())}")

917 Unicode symbols which represent latin letters with a specific layout like [nan, 'Double-Struck', 'Unit', 'Script', 'Black-Letter', 'Turned', 'Rotated', 'Turned Sans-Serif', 'Reversed Sans-Serif', 'Double-Struck Italic', 'Parenthesized', 'Circled', 'Mathematical Bold', 'Mathematical Italic', 'Mathematical Bold Italic', 'Mathematical Script', 'Mathematical Bold Script', 'Mathematical Fraktur', 'Mathematical Double-Struck', 'Mathematical Bold Fraktur', 'Mathematical Sans-Serif', 'Mathematical Sans-Serif Bold', 'Mathematical Sans-Serif Italic', 'Mathematical Sans-Serif Bold Italic', 'Mathematical Monospace', 'Tortoise Shell Bracketed', 'Circled Italic', 'Squared', 'Negative Circled', 'Negative Squared', 'Crossed Negative Squared', 'Regional Indicator']


Columns :
- Code/Char/CharName : Unicode symbol representing a latin letter with a specific layout
- NormString : normalized string using only very frequent chars
- Layout : info about the specific layout applied to the latin char

In [None]:
# Lookup table : symbol => normalized string
latinlettersnolayout = {
    record["Char"]: record["NormString"]
    for _, record in dflatinsymbols.iterrows()
}
# Lookup table : symbol => layout info lost by the normalization
latinlettersremovedlayout = {
    record["Char"]: record["Layout"]
    for _, record in dflatinsymbols.iterrows()
}

### 1.6 Latin letters

In [None]:
# Table of Unicode chars representing latin letters
dflatinletters = pd.read_csv(
    chardatadir.joinpath("latinletters.csv"),
    sep=";",
)
dflatinletters.head()

Unnamed: 0,Code,Char,LetterName,IsUpper,UpperChar,IsLower,LowerChar,IsDiacritic,BaseChar,Diacritics,IsLigature,MultiChars,CharName,Block,Category,SubCategory
0,65,A,A,True,A,False,a,False,,,False,,Latin Capital Letter A,Basic Latin,Letter,Uppercase
1,66,B,B,True,B,False,b,False,,,False,,Latin Capital Letter B,Basic Latin,Letter,Uppercase
2,67,C,C,True,C,False,c,False,,,False,,Latin Capital Letter C,Basic Latin,Letter,Uppercase
3,68,D,D,True,D,False,d,False,,,False,,Latin Capital Letter D,Basic Latin,Letter,Uppercase
4,69,E,E,True,E,False,e,False,,,False,,Latin Capital Letter E,Basic Latin,Letter,Uppercase


In [None]:
# Report row counts per boolean flag (IsUpper, IsLower, IsDiacritic, IsLigature) and the first 20 distinct Diacritics values
print(f"{len(dflatinletters)} chars representing latin letters, {len(dflatinletters[dflatinletters['IsUpper']])} upper case and {len(dflatinletters[dflatinletters['IsLower']])} lower case, {len(dflatinletters[dflatinletters['IsDiacritic']])} with diacritics like {list(dflatinletters[dflatinletters['IsDiacritic']]['Diacritics'].unique())[0:20]}, {len(dflatinletters[dflatinletters['IsLigature']])} representing multiple letters in ligature")

1230 chars representing latin letters, 459 upper case and 704 lower case, 1031 with diacritics like ['Grave', 'Acute', 'Circumflex', 'Tilde', 'Diaeresis', 'Ring Above', 'Cedilla', 'Stroke', 'Macron', 'Breve', 'Ogonek', 'Dot Above', 'Caron', 'Dotless', 'Middle Dot', 'Preceded By Apostrophe', 'Double Acute', 'Long', 'Hook', 'Topbar'], 88 representing multiple letters in ligature


Columns :
- Code/Char/CharName : Unicode character representing one or more latin letters
- LetterName : name of the latin letter (without case and diacritics qualifiers)
- IsUpper/UpperChar and IsLower/LowerChar : upper case or lower case equivalent chars
- IsDiacritic => BaseChar : equivalent char without any diacritic (accents ...), Diacritics : description of all diacritics applied to the char
- IsLigature => MultiChars : if the char represents multiple latin letters in a single ligature, string representing the equivalent list of letters
- Block/Category/SubCategory : Unicode classification for each char

In [None]:
# Set of all chars representing latin letters
latinletters = {letter for letter in dflatinletters["Char"]}

In [None]:
# Lookup table : letter => lower case equivalent, only for letters which differ from it
latinletterstolower = {
    record["Char"]: record["LowerChar"]
    for _, record in dflatinletters.iterrows()
    if record["Char"] != record["LowerChar"]
}

In [None]:
# Rows describing letters which carry at least one diacritic
diacriticrecords = [
    record
    for _, record in dflatinletters.iterrows()
    if record["IsDiacritic"]
]
# Lookup table : letter with diacritic => equivalent char without diacritic
latinlettersnodiacritics = {
    record["Char"]: record["BaseChar"] for record in diacriticrecords
}
# Lookup table : letter with diacritic => description of the diacritics removed
latinlettersremoveddiacritics = {
    record["Char"]: record["Diacritics"] for record in diacriticrecords
}

In [None]:
# Lookup table : ligature char => equivalent string of several letters
latinlettersnoligatures = {
    record["Char"]: record["MultiChars"]
    for _, record in dflatinletters.iterrows()
    if record["IsLigature"]
}

### 1.7 Latin numbers and number symbols

In [None]:
# Table of Unicode chars representing latin digits and number symbols
dflatinnumbers = pd.read_csv(
    chardatadir.joinpath("latinnumbers.csv"),
    sep=";",
)
dflatinnumbers.head()

Unnamed: 0,Code,Char,CharName,NormString,Layout
0,48,0,Digit Zero,0,Digit
1,49,1,Digit One,1,Digit
2,50,2,Digit Two,2,Digit
3,51,3,Digit Three,3,Digit
4,52,4,Digit Four,4,Digit


In [None]:
# Report the number of rows loaded from latinnumbers.csv and the distinct Layout values after the first one
print(f"{len(dflatinnumbers)} chars representing latin digits, some with specific layouts like {list(dflatinnumbers['Layout'].unique())[1:]}")

302 chars representing latin digits, some with specific layouts like ['Superscript', 'Vulgar Fraction', 'Subscript', 'Roman Numeral', 'Small Roman Numeral', 'Circled', 'Parenthesized', ' Full Stop', 'Negative Circled', 'Double Circled', 'Dingbat Negative Circled', 'Dingbat Circled Sans-Serif', 'Dingbat Negative Circled Sans-Serif ', 'Circled On Black Square', 'Fullwidth', 'Mathematical Bold', 'Mathematical Double-Struck', 'Mathematical Sans-Serif', 'Mathematical Sans-Serif Bold', 'Mathematical Monospace', 'Full Stop', 'Comma']


Columns :
- Code/Char/CharName : Unicode char representing one or more latin digits
- NormString : normalized string representing the equivalent number, plus punctuation if needed
- Layout : info about the specific layout applied to the latin digits

In [None]:
# Lookup table : number char => normalized string
latinnumbersnolayout = {
    record["Char"]: record["NormString"]
    for _, record in dflatinnumbers.iterrows()
}
# Lookup table : number char => layout info lost by the normalization
latinnumbersremovedlayout = {
    record["Char"]: record["Layout"]
    for _, record in dflatinnumbers.iterrows()
}

### 1.8 Variations on frequent chars to normalize

In [None]:
# Table of alternative chars and the more frequent char each one should be replaced with
dfnormchars = pd.read_csv(
    chardatadir.joinpath("normalizedchars.csv"),
    sep=";",
)
dfnormchars.head()

Unnamed: 0,Code,Char,CharName,NormCode,NormChar,NormCharName
0,11,,Char 11,10,\n,Char 10
1,13,\r,Char 13,10,\n,Char 10
2,182,¶,Pilcrow Sign,10,\n,Char 10
3,8232,,Line Separator,10,\n,Char 10
4,160,,No-Break Space,32,,Space


In [None]:
# Report the number of rows loaded from normalizedchars.csv and the number of distinct NormChar values
print(f"{len(dfnormchars)} alternative chars which are sometimes used as equivalent visual representations for {len(dfnormchars['NormChar'].unique())} other very frequent chars")

171 alternative chars which are sometimes used as equivalent visual representations for 53 other very frequent chars


Columns :
- Code/Char/CharName : alternative Unicode char often used as a visual equivalent of a more frequent char
- NormCode/NormChar/NormCharName : more frequent char which should be used to normalize text

In [None]:
# Lookup table : alternative char => more frequent char used to normalize text
normalizedchars = {
    record["Char"]: record["NormChar"]
    for _, record in dfnormchars.iterrows()
}

## 2. Text normalization

### 2.1 Normalization functions

We need to apply several replacement functions in a row, each replacement function building on the replacements already applied by the previous ones.

We can't simply use replace statements on immutable strings to do this : we would need to allocate new strings for each replacement at each level, and this would put a high load on the garbage collector.

A better solution is to implement our normalization function as a chain of iterators on chars.

In [None]:
import functools
import itertools

def ignorechars(chariterator, charset):
    """Yield the chars of `chariterator`, skipping every char found in `charset`."""
    yield from (char for char in chariterator if char not in charset)
            
def replacechars1to1(chariterator, chardict):
    """Yield the chars of `chariterator`, replacing each key of `chardict` by its value.

    Chars which are not keys of `chardict` are yielded unchanged.
    """
    for char in chariterator:
        yield chardict[char] if char in chardict else char
            
def replacechars1toN(chariterator, chardict):
    """Yield the chars of `chariterator`, expanding each key of `chardict`.

    When a char is a key of `chardict`, every element of the associated
    value (typically a string) is yielded in its place, one at a time.
    Other chars are yielded unchanged.
    """
    for char in chariterator:
        if char not in chardict:
            yield char
            continue
        yield from chardict[char]

In [None]:
# Bind each generic transformation to its lookup table : every resulting
# callable takes a char iterator as its single argument
_bind = functools.partial
replaceWin1252ErrorChars = _bind(replacechars1to1, chardict=win1252errorchars)
ignoreControlChars = _bind(ignorechars, charset=controlchars)
replaceLatinLettersSymbols = _bind(replacechars1toN, chardict=latinlettersnolayout)
replaceLatinLettersLigatures = _bind(replacechars1toN, chardict=latinlettersnoligatures)
replaceLatinNumbersSymbols = _bind(replacechars1toN, chardict=latinnumbersnolayout)
replaceNormalizedChars = _bind(replacechars1to1, chardict=normalizedchars)

In [None]:
testString = "ABCabcd"

# 1. Ignore both cases of the letter A
ignoreSet = {'A', 'a'}
ignoreAs = functools.partial(ignorechars, charset=ignoreSet)
result = ignoreAs(testString)
print("".join(result))

# 2. Replace both cases of the letter A by a single other char
replace1to1Dict = dict(A='X', a='x')
replaceAs = functools.partial(replacechars1to1, chardict=replace1to1Dict)
result = replaceAs(testString)
print("".join(result))

# 3. Replace both cases of the letter B by several chars
replace1toNDict = dict(B='XY', b='xyz')
replaceBs = functools.partial(replacechars1toN, chardict=replace1toNDict)
result = replaceBs(testString)
print("".join(result))

BCbcd
XBCxbcd
AXYCaxyzcd


To match several chars in an iterator, we have to build a hierarchical dictionary structure.

For example, if we want to implement the following replacements :
```
ABC => 1
ABD => 2
AC  => 3
BC  => 4
```
We build the following dictionary structure :

```
A : { B : { C : 1,
            D : 2 },
      C : 3 }

B : { C : 4 }
```

In [None]:
def buildhierarchicaldict(idict):
    """Convert a flat dict of multi-char patterns into nested per-char dicts.

    Parameters:
    - idict : dict mapping a pattern string to its replacement value

    Returns a dict keyed by the first char of each pattern; each value is
    either another dict keyed by the next char, or the replacement value
    once the last char of the pattern is reached.
    Example : {"ABC":1, "ABD":2, "AC":3, "BC":4} gives
              {"A": {"B": {"C":1, "D":2}, "C":3}, "B": {"C":4}}

    Keys of `idict` shorter than 2 chars are ignored. When one pattern is a
    strict prefix of another one ("AB" and "ABC"), only the longer pattern
    is kept, because a node can't be both a replacement value and a dict.
    """
    hdict = {}
    for key, value in idict.items():
        if len(key) < 2:
            continue
        # Walk down (and create when needed) one nested dict per char of the
        # pattern, except for the last char which holds the replacement value
        node = hdict
        for char in key[:-1]:
            child = node.get(char)
            if not isinstance(child, dict):
                # No entry yet, or a shorter pattern ending here : the longer pattern wins
                child = {}
                node[char] = child
            node = child
        lastchar = key[-1]
        # Don't overwrite the sub-dict of a longer pattern already registered
        if not isinstance(node.get(lastchar), dict):
            node[lastchar] = value
    return hdict

In [None]:
# Nested per-char dict built from the UTF-8 decoding error substrings
utf8errorshdict = buildhierarchicaldict(utf8errorchars)
# utf8errorshdict

In [None]:
# Nested per-char dict built from the base char + combining char pairs
combiningcharshdict = buildhierarchicaldict(combiningchars)
#combiningcharshdict

In [None]:
def replacecharsNto1(chariterator, hierarchicaldict):
    """Replace multi-char patterns read from a char iterator by a single value.

    Parameters:
    - chariterator : iterable yielding one char at a time
    - hierarchicaldict : nested dicts keyed by char; a dict value means the
      pattern continues with the next char, any other value is the
      replacement yielded when the pattern is complete

    Yields the input chars, with each matched pattern replaced by its value.
    Chars which started a pattern that never completes are yielded
    unchanged, including when the input ends in the middle of a pattern.
    """
    candidatechars = []  # chars read since the oldest pattern start still pending
    candidatedicts = []  # per pending char : next dict of the pattern started there, None if it failed
    for char in chariterator:
        # Try to match previously started patterns
        if len(candidatechars)>0:
            for idx,candidatedict in enumerate(candidatedicts):
                if candidatedict is not None:
                    if char in candidatedict:
                        value = candidatedict[char]
                        if isinstance(value,dict):
                            candidatedicts[idx] = value
                        else:
                            # Success : found a char to return
                            # (chars pending before the start of the matched pattern are returned first)
                            for ridx in range(0,idx):
                                yield candidatechars[ridx]
                            candidatechars = []
                            candidatedicts = []
                            char = None
                            yield value
                            break
                    else:
                        candidatedicts[idx] = None
            # Clean oldest failed attempts and return accumulated chars
            while len(candidatedicts)>0 and candidatedicts[0] is None:
                candidatedicts.pop(0)
                yield candidatechars.pop(0)
        # Handle the current char
        if char is not None:
            if len(candidatechars)==0:
                if char in hierarchicaldict:
                    value = hierarchicaldict[char]
                    if isinstance(value,dict):
                        candidatechars.append(char)
                        candidatedicts.append(value)
                    else:
                        yield value
                else:
                    yield char
            else:
                candidatechars.append(char)
                if char in hierarchicaldict:
                    candidatedicts.append(hierarchicaldict[char])
                else:
                    # This char can't start a pattern : no candidate dict for it
                    candidatedicts.append(None)
    # End of input : return the chars of the patterns which were still pending
    for char in candidatechars:
        yield char

In [None]:
# Bind the N to 1 replacement function to each nested lookup structure
_bindNto1 = functools.partial
replaceUtf8Errors = _bindNto1(replacecharsNto1, hierarchicaldict=utf8errorshdict)
replaceCombiningChars = _bindNto1(replacecharsNto1, hierarchicaldict=combiningcharshdict)

In [None]:
testString = "XABCDEFDXYEZ"

# Hand-written nested lookup structure mixing 3 chars, 2 chars and 1 char patterns
hdict = {
    "A": {"B": {"C": '1'}},
    "B": {"C": '2'},
    "C": {"D": '3'},
    "D": {"E": '4'},
    "E": '5',
    "F": '6',
    "X": {"Y": '0'},
}  # , "A":'9'
replaceTest = functools.partial(replacecharsNto1, hierarchicaldict=hdict)

result = replaceTest(testString)
print("".join(result))

X146D05Z


#### Unicode normalization pipeline 

In [None]:
def compose(*functions):
    """Return a function applying `functions` from the last one to the first one.

    compose(f, g, h)(x) is f(g(h(x))); compose()(x) is x.
    """
    def composed(x):
        # The rightmost function is applied first
        for function in reversed(functions):
            x = function(x)
        return x
    return composed

def tostring(iterator):
    """Concatenate all the strings yielded by `iterator` into a single string."""
    joined = "".join(iterator)
    return joined

In [None]:
# compose applies its arguments from the last one to the first one :
# replaceWin1252ErrorChars runs first, tostring runs last
unicodeNorm = compose(
    tostring,
    replaceNormalizedChars,
    replaceLatinNumbersSymbols,
    replaceLatinLettersLigatures,
    replaceLatinLettersSymbols,
    ignoreControlChars,
    replaceCombiningChars,
    replaceUtf8Errors,
    replaceWin1252ErrorChars,
)

In [None]:
teststring = chr(127995)+"① l`"+chr(156)+"uv"+chr(127)+"re est¨ "+chr(147)+"belle"+chr(148)+"¸ Ã  Â½ â‚¬ énième â€° "+chr(133)+" ⁽🇪ﬃc🇦ce⁾ ！"
teststring

'🏻① l`\x9cuv\x7fre est¨ \x93belle\x94¸ Ã\xa0\xa0Â½ â‚¬ énième â€° \x85 ⁽🇪ﬃc🇦ce⁾ ！'

In [None]:
# Apply the whole normalization pipeline to the test string
# (uncomment the next line to inspect the code point of each output char)
#[(ord(char),char) for char in unicodeNorm(teststring)]
unicodeNorm(teststring)

"(1) l'oeuvre est «belle», à 1/2 € énième ‰ … (EfficAce) !"

### 2.2 Normalization class with change tracking

In [None]:
import pandas as pd
from functools import partial
from operator import itemgetter
from io import StringIO

    
class TextNormalizer():
    """Callable text normalizer with change tracking.

    The constructor loads character tables from CSV files and builds an
    ordered list of transformations, each one working on a char iterator.
    Calling the instance on a string chains all the transformations and
    returns a NormResult holding the normalized text; every transformation
    reports the replacements it makes through `result.addChange`.
    """

    def __init__(self, rootdir):
        """Load the character tables and build the list of transformations.

        Parameters:
        - rootdir : path object (supporting the `/` operator); the CSV files
          are read from rootdir / "libdata" / "chars"
        """
        # 1. Load Unicode character set data for latin script
        chardatadir = rootdir / "libdata" / "chars"
        # 1.1 Frequent encoding errors : windows1252 read as iso8859-1
        dfencodingwin1252 = pd.read_csv(chardatadir / "windows1252-iso8859-errors.csv", sep=";")
        win1252errorchars = {}
        for rowidx,row in dfencodingwin1252.iterrows():
            win1252errorchars[row["Char"]] = row["DecodedChar"]
        # 1.2 Frequent encoding errors : utf8 read as windows1252
        dfencodingutf8 = pd.read_csv(chardatadir / "utf8-windows1252-errors.csv", sep=";")
        utf8errorchars = {}
        for rowidx,row in dfencodingutf8.iterrows():
            utf8errorchars[row["ErrorSubstring"]] = row["DecodedChar"]
        utf8errorshdict = self.buildhierarchicaldict(utf8errorchars)
        # 1.3 Frequent encoding errors : windows1252 read as utf8
        dfencodingwin1252utf8 = pd.read_csv(chardatadir / "windows1252-utf8-errors.csv", sep=";")
        win1252utf8errorchars = {}
        for rowidx,row in dfencodingwin1252utf8.iterrows():
            win1252utf8errorchars[row["Char"]] = row["DecodedChars"]
        # 1.4 Unicode combining chars
        dfcombiningchars = pd.read_csv(chardatadir / "combiningdiacritics.csv", sep=";")
        combiningchars = {}
        for rowidx,row in dfcombiningchars.iterrows():
            combiningchars[row["BaseChar"]+row["Char"]] = row["CombinedChar"]
        combiningcharshdict = self.buildhierarchicaldict(combiningchars)
        # 1.5 Control chars
        dfcontrolchars = pd.read_csv(chardatadir / "controlchars.csv", sep=";")
        dfcontrolchars.loc[0,"Char"] = chr(0) # chr(0) can't be saved in CSV file
        controlchars = set(dfcontrolchars["Char"])
        # 1.6 Latin letter symbols
        dflatinsymbols = pd.read_csv(chardatadir / "latinsymbols.csv", sep=";")
        latinlettersnolayout = {}
        for rowidx,row in dflatinsymbols.iterrows():
            latinlettersnolayout[row["Char"]] = row["NormString"]
        # 1.7 Latin letters
        dflatinletters = pd.read_csv(chardatadir / "latinletters.csv", sep=";")
        latinlettersnodiacritics = {}
        latinlettersnoligatures = {}
        for rowidx,row in dflatinletters.iterrows():
            if row["IsDiacritic"]:
                latinlettersnodiacritics[row["Char"]] = row["BaseChar"]
            if row["IsLigature"]:
                latinlettersnoligatures[row["Char"]] = row["MultiChars"]
        # 1.8 Latin numbers and number symbols
        dflatinnumbers = pd.read_csv(chardatadir / "latinnumbers.csv", sep=";")
        latinnumbersnolayout = {}
        for rowidx,row in dflatinnumbers.iterrows():
            # NOTE(review): presumably skips the ten plain digits 0-9 listed first
            # in the CSV, which map to themselves - confirm against the file
            if rowidx < 10:
                continue
            latinnumbersnolayout[row["Char"]] = row["NormString"]
        # 1.9 Variations on frequent chars to normalize
        dfnormchars = pd.read_csv(chardatadir / "normalizedchars.csv", sep=";")
        normalizedchars = {}
        for rowidx,row in dfnormchars.iterrows():
            normalizedchars[row["Char"]] = row["NormChar"]
        # 1.10 Optional replacement of cyrillic and greek chars looking like latin letters
        dfcgnormchars = pd.read_csv(chardatadir / "cyrillic-greek-chars.csv", sep=";")
        cgnormalizedchars = {}
        for rowidx,row in dfcgnormchars.iterrows():
            cgnormalizedchars[row["Char"]] = row["NormChar"]
        # 1.11 Final supported french charset
        dfsupportedchars = pd.read_csv(chardatadir / "charset-fr.csv", sep=";", quotechar='"')
        dfsupportedchars.loc[0,"Char"] = chr(0) # chr(0) can't be saved in CSV file
        supportedchars = set(dfsupportedchars["Char"])

        # 2.1 List successive transformations
        # The integer bound as first argument of each transformation is its
        # position in self.transformsDescs : it identifies the layer in the
        # changes reported to the result object
        self.transformsDescs = []
        transforms = []
        self.transformsDescs.append("Fix encoding errors : windows1252 read as iso8859-1")
        transforms.append(partial(self.replacechars1to1, 0, win1252errorchars))
        self.transformsDescs.append("Fix encoding errors : utf8 read as windows1252")
        transforms.append(partial(self.replacecharsNto1, 1, utf8errorshdict))
        self.transformsDescs.append("Fix encoding errors :  windows1252 read as utf8")
        transforms.append(partial(self.replacechars1toN, 2, win1252utf8errorchars))
        self.transformsDescs.append("Merge Unicode combining chars")
        transforms.append(partial(self.replacecharsNto1, 3, combiningcharshdict))
        self.transformsDescs.append("Ignore control chars")
        transforms.append(partial(self.ignorechars, 4, controlchars))
        self.transformsDescs.append("Replace latin letter symbols")
        transforms.append(partial(self.replacechars1toN, 5, latinlettersnolayout))
        self.transformsDescs.append("Replace latin letter ligatures")
        transforms.append(partial(self.replacechars1toN, 6, latinlettersnoligatures))
        self.transformsDescs.append("Replace latin number symbols")
        transforms.append(partial(self.replacechars1toN, 7, latinnumbersnolayout))
        self.transformsDescs.append("Normalize equivalent chars")
        transforms.append(partial(self.replacechars1to1, 8, normalizedchars))
        self.transformsDescs.append("Replace cyrillic and greek chars looking like latin letters")
        transforms.append(partial(self.replacechars1to1,9, cgnormalizedchars))
        self.transformsDescs.append("Replace infrequent chars : latin letters with diacritics")
        transforms.append(partial(self.replacecharsnotinset, 10, supportedchars, latinlettersnodiacritics))
        self.transformsDescs.append("Replace infrequent chars : other scripts")
        transforms.append(partial(self.replaceotherscripts, 11, supportedchars))
        self.transformsDescs.append("Replace infrequent chars : symbols")
        transforms.append(partial(self.replacesymbols, 12, supportedchars))
        self.transformsDescs.append("Replace infrequent chars : chars to ignore")
        transforms.append(partial(self.ignoreotherchars, 13, supportedchars))

        # 2.2 Combine all transformations
        # x is the input text (or char iterator), y is the result object which records the changes
        def func(x,y):
            ci = transforms[0](x,y)
            for transform in transforms[1:]:
                ci = transform(ci,y)
            return ci
        self.transformsFunc = func

    def __repr__(self):
        """Return the numbered list of transformation descriptions, one per line."""
        desc = StringIO()
        for idx,transformDesc in enumerate(self.transformsDescs):
            desc.write(f'{idx+1} - {transformDesc}\n')
        return desc.getvalue()

    def __call__(self, inputText):
        """Normalize `inputText` and return a NormResult holding the output and the changes."""
        result = NormResult(inputText, self.transformsDescs)
        result.setOutput(self.tostring(self.transformsFunc(inputText,result)))
        return result

    @staticmethod
    def buildhierarchicaldict(idict):
        """Convert a flat dict of multi-char patterns into nested per-char dicts.

        Example : {"ABC":1, "ABD":2, "AC":3, "BC":4} gives
                  {"A": {"B": {"C":1, "D":2}, "C":3}, "B": {"C":4}}

        Keys of `idict` shorter than 2 chars are ignored. When one pattern is
        a strict prefix of another one ("AB" and "ABC"), only the longer
        pattern is kept, because a node can't be both a value and a dict.
        """
        hdict = {}
        for key, value in idict.items():
            if len(key) < 2:
                continue
            # Walk down (and create when needed) one nested dict per char of the
            # pattern, except for the last char which holds the replacement value
            node = hdict
            for char in key[:-1]:
                child = node.get(char)
                if not isinstance(child, dict):
                    # No entry yet, or a shorter pattern ending here : the longer pattern wins
                    child = {}
                    node[char] = child
                node = child
            lastchar = key[-1]
            # Don't overwrite the sub-dict of a longer pattern already registered
            if not isinstance(node.get(lastchar), dict):
                node[lastchar] = value
        return hdict

    @staticmethod
    def ignorechars(layer, charset, chariterator, result):
        """Yield the input chars, dropping (and reporting) those found in `charset`.

        `index` reported to the result is the position of the char in `chariterator`.
        """
        for index,char in enumerate(chariterator):
            if not char in charset:
                yield char
            else:
                result.addChange(layer, index, char, '')

    @staticmethod
    def replacechars1to1(layer, chardict, chariterator, result):
        """Yield the input chars, replacing (and reporting) each key of `chardict` by its value."""
        for index,char in enumerate(chariterator):
            if char in chardict:
                resChar = chardict[char]
                result.addChange(layer, index, char, resChar)
                yield resChar
            else:
                yield char

    @staticmethod
    def replacechars1toN(layer, chardict, chariterator, result):
        """Yield the input chars, expanding (and reporting) each key of `chardict`.

        Every element of the value associated with a matched char is yielded in its place.
        """
        for index,char in enumerate(chariterator):
            if char in chardict:
                resStr = chardict[char]
                result.addChange(layer, index, char, resStr)
                for outchar in resStr:
                    yield outchar
            else:
                yield char

    @staticmethod
    def replacecharsNto1(layer, hierarchicaldict, chariterator, result):
        """Replace (and report) multi-char patterns by a single value.

        `hierarchicaldict` is a nested per-char dict : a dict value means the
        pattern continues with the next char, any other value is the
        replacement. Chars which started a pattern that never completes are
        yielded unchanged, including at the end of the input.
        """
        candidatechars = []  # chars read since the oldest pattern start still pending
        candidatedicts = []  # per pending char : next dict of the pattern started there, None if it failed
        for index,char in enumerate(chariterator):
            # Try to match previously started patterns
            if len(candidatechars)>0:
                for idx,candidatedict in enumerate(candidatedicts):
                    if not candidatedict is None:
                        if char in candidatedict:
                            value = candidatedict[char]
                            if isinstance(value,dict):
                                candidatedicts[idx] = value
                            else:
                                # Success : found a char to return
                                for ridx in range(0,idx):
                                    yield candidatechars[ridx]
                                replacedStr = "".join(candidatechars[idx:]) + char
                                # The change starts at the first char of the matched pattern
                                result.addChange(layer, index-len(replacedStr)+1, replacedStr, value)
                                candidatechars = []
                                candidatedicts = []
                                char = None
                                yield value
                                break
                        else:
                            candidatedicts[idx] = None
                # Clean oldest failed attempts and return accumulated chars
                while len(candidatedicts)>0 and candidatedicts[0] is None:
                    candidatedicts.pop(0)
                    yield candidatechars.pop(0)
            # Handle the current char
            if not char is None:
                if len(candidatechars)==0:
                    if char in hierarchicaldict:
                        value = hierarchicaldict[char]
                        if isinstance(value,dict):
                            candidatechars.append(char)
                            candidatedicts.append(value)
                        else:
                            result.addChange(layer, index, char, value)
                            yield value
                    else:
                        yield char
                else:
                    candidatechars.append(char)
                    if char in hierarchicaldict:
                        value = hierarchicaldict[char]
                        candidatedicts.append(value)
                    else:
                        candidatedicts.append(None)
        # End of input : return the chars of the patterns which were still pending
        if len(candidatechars)>0:
            for char in candidatechars:
                yield char

    @staticmethod
    def replacecharsnotinset(layer, charset, replacedict, chariterator, result):
        """Yield the input chars, replacing (and reporting) only chars outside `charset`.

        A char which is not in `charset` but is a key of `replacedict` is
        replaced by the associated value; any other char is yielded unchanged.
        """
        for index,char in enumerate(chariterator):
            if char in charset:
                yield char
            else:
                if char in replacedict:
                    resChar = replacedict[char]
                    result.addChange(layer, index, char, resChar)
                    yield resChar
                else:
                    yield char

    @staticmethod
    def replaceotherscripts(layer, charset, chariterator, result):
        """Encode (and report) chars outside `charset` whose block family is neither "Symbols" nor "Ignore".

        Such a char is replaced by chr(65532), followed by its decimal code
        point and '_'. The family comes from blockfamily(charblock(char)),
        two helpers which are not defined in this class.
        """
        for index,char in enumerate(chariterator):
            if char in charset:
                yield char
            else:
                family = blockfamily(charblock(char))
                if not family in ("Symbols","Ignore"):
                    resStr = chr(65532) + str(ord(char)) + '_'
                    result.addChange(layer, index, char, resStr)
                    for outchar in resStr:
                        yield outchar
                else:
                    yield char

    @staticmethod
    def replacesymbols(layer, charset, chariterator, result):
        """Encode (and report) chars outside `charset` whose block family is "Symbols".

        Such a char is replaced by '$', followed by its name without spaces
        and '_'. The name comes from charname(char) and the family from
        blockfamily(charblock(char)), helpers which are not defined in this class.
        """
        for index,char in enumerate(chariterator):
            if char in charset:
                yield char
            else:
                family = blockfamily(charblock(char))
                if family == "Symbols":
                    resStr ='$' + charname(char).replace(' ','') + '_'
                    result.addChange(layer, index, char, resStr)
                    for outchar in resStr:
                        yield outchar
                else:
                    yield char

    @staticmethod
    def ignoreotherchars(layer, charset, chariterator, result):
        """Drop (and report) chars outside `charset` whose block family is "Ignore".

        The family comes from blockfamily(charblock(char)), two helpers which
        are not defined in this class. Other chars are yielded unchanged.
        """
        for index,char in enumerate(chariterator):
            if char in charset:
                yield char
            else:
                family = blockfamily(charblock(char))
                if family == "Ignore":
                    result.addChange(layer, index, char, '')
                else:
                    yield char

    @staticmethod
    def tostring(iterator):
        """Concatenate all the strings yielded by `iterator` into a single string."""
        return "".join(iterator)
    
    
class NormResult():
    """Outcome of one normalization run: input text, output text and a per-layer change log.

    Attributes (all set in __init__):
        input: the text given to the normalizer.
        transforms: layer descriptions, indexed by layer number (read by describeChanges).
        layerChanges: None until the first addChange call; afterwards a list holding,
            for each layer index, the list of NormChange recorded for that layer
            (an empty list for layers that changed nothing).
        output: the normalized text, "" until setOutput is called.
    """

    def __init__(self, inputText, transformsDescs):
        self.input, self.transforms = inputText, transformsDescs
        self.layerChanges = None
        self.output = ""

    def addChange(self, layer, index, charsInput, charsOutput, removedInfo=None):
        """Record that `charsInput`, found at `index` in the input of `layer`, became `charsOutput`."""
        if self.layerChanges is None:
            self.layerChanges = []
        # Pad with empty lists so that self.layerChanges[layer] exists; layers
        # that never record a change therefore keep an empty list.
        while len(self.layerChanges) <= layer:
            self.layerChanges.append([])
        self.layerChanges[layer].append(NormChange(layer, index, charsInput, charsOutput, removedInfo))

    def describeChanges(self):
        """Return a text report of the recorded changes, or 'No change' if there are none.

        For each layer that recorded at least one change, three lines are written:
        the layer description, the layer input and the layer output. Changed spans
        are shown as ' [..] ' and padded (spaces on the input side, '_' on the
        output side) so that both sides have the same width.
        """
        if self.layerChanges is None:
            return 'No change'
        desc = StringIO()
        previousString = self.input
        for changes in self.layerChanges:
            if not changes:
                # Layer padded by addChange without any change of its own: its
                # output equals its input, and reading changes[0] would raise IndexError.
                continue
            layer = changes[0].layer
            desc.write(self.transforms[layer] + "\n")
            dispInparts = []
            outparts = []
            dispOutparts = []
            lastIndex = 0
            for change in changes:
                if change.index > lastIndex:
                    # Unchanged text between the previous change and this one
                    samePart = previousString[lastIndex:change.index]
                    dispInparts.append(samePart)
                    outparts.append(samePart)
                    dispOutparts.append(samePart)
                dispInpart = change.input
                outpart = change.output
                dispOutpart = outpart
                # Pad the shorter side so the bracketed spans line up
                if len(dispInpart) > len(outpart):
                    dispOutpart = outpart + ("_" * (len(dispInpart) - len(outpart)))
                elif len(outpart) > len(dispInpart):
                    dispInpart = dispInpart + (" " * (len(outpart) - len(dispInpart)))
                dispInparts.append(' [' + dispInpart + '] ')
                outparts.append(outpart)
                dispOutparts.append(' [' + dispOutpart + '] ')
                lastIndex = change.index + len(change.input)
            if lastIndex < len(previousString):
                # Unchanged tail after the last change
                samePart = previousString[lastIndex:]
                dispInparts.append(samePart)
                outparts.append(samePart)
                dispOutparts.append(samePart)
            # The output of this layer is the input of the next one
            previousString = "".join(outparts)
            # Both lines keep the same ' < ' prefix as the existing report format
            desc.write(" < " + "".join(dispInparts) + '\n')
            desc.write(" < " + "".join(dispOutparts) + '\n')
        return desc.getvalue()

    def mapOutputIndexToInput(self, outputIndex):
        """Map an index in the output text back to an index in the input text.

        Returns `outputIndex` unchanged when no change was recorded. Otherwise
        each layer's changes are scanned in recording order: a change located
        entirely before the index shifts it by len(input) - len(output), and an
        index falling inside a change is moved back to the start of that change.

        NOTE(review): layers are walked first-to-last and change.index (a position
        in the layer input) is compared directly with an output-side index; this
        looks like an approximation - confirm before relying on exact offsets.
        """
        if self.layerChanges is None:
            # Nothing was changed: output and input offsets coincide
            return outputIndex
        inputIndex = outputIndex
        for changes in self.layerChanges:
            outputIndex = inputIndex
            for change in changes:
                if outputIndex < change.index:
                    break
                elif outputIndex > (change.index + len(change.output)):
                    inputIndex = inputIndex + (len(change.input) - len(change.output))
                else:
                    inputIndex = inputIndex - (outputIndex - change.index)
                    break
        return inputIndex

    def setOutput(self, outputText):
        """Store the normalized text."""
        self.output = outputText

    def __repr__(self):
        # Displaying a NormResult shows the normalized text itself
        return self.output
    
class NormChange():
    """One replacement recorded by a normalization layer.

    Stores the layer number, the index of the replaced text, the replaced
    text (`input`), its replacement (`output`) and an optional `removedInfo`.
    """

    def __init__(self, layer, index, charsInput, charsOutput, removedInfo=None):
        self.layer = layer
        self.index = index
        self.input = charsInput
        self.output = charsOutput
        self.removedInfo = removedInfo

    def __repr__(self):
        location = f"{self.layer} - {self.index}"
        rewrite = f"{self.input} => {self.output}"
        return f"{location} : {rewrite}"

In [None]:
# Time the construction of the normalizer, then display it:
# its repr lists the normalization layers in the order they are applied.
%time norm = TextNormalizer(rootdir)
norm

Wall time: 1.35 s


1 - Fix encoding errors : windows1252 read as iso8859-1
2 - Fix encoding errors : utf8 read as windows1252
3 - Fix encoding errors :  windows1252 read as utf8
4 - Merge Unicode combining chars
5 - Ignore control chars
6 - Replace latin letter symbols
7 - Replace latin letter ligatures
8 - Replace latin number symbols
9 - Normalize equivalent chars
10 - Replace cyrillic and greek chars looking like latin letters
11 - Replace infrequent chars : latin letters with diacritics
12 - Replace infrequent chars : other scripts
13 - Replace infrequent chars : symbols
14 - Replace infrequent chars : chars to ignore

In [None]:
teststring = chr(127995)+"① l`"+chr(156)+"uv"+chr(127)+"re est¨ "+chr(147)+"belle"+chr(148)+"¸ Ã  Â½ â‚¬ énième â€° "+chr(133)+" ⁽🇪ﬃc🇦ce⁾ ！"
teststring

'🏻① l`\x9cuv\x7fre est¨ \x93belle\x94¸ Ã  Â½ â‚¬ énième â€° \x85 ⁽🇪ﬃc🇦ce⁾ ！'

In [None]:
# Normalize the test string; displaying the result shows the normalized text
# (NormResult.__repr__ returns its output).
result = norm(teststring)
result

(1) l'oeuvre est «belle», Ã  1/2 € énième ‰ … (EfficAce) !

In [None]:
# Layer-by-layer report of the changes applied to the test string.
print(result.describeChanges())

IndexError: list index out of range

In [None]:
# First 12 characters of the normalized text.
result.output[0:12]

"(1) l'oeuvre"

In [None]:
# Map both bounds of output[0:12] back to input offsets and slice the original text with them.
result.input[result.mapOutputIndexToInput(0):result.mapOutputIndexToInput(12)]

'🏻① l`\x9cuv\x7fre'

In [None]:
# Same check on a slice that does not start at the beginning of the text.
result.output[3:10]

" l'oeuv"

In [None]:
# Map both bounds of output[3:10] back to input offsets and slice the original text with them.
result.input[result.mapOutputIndexToInput(3):result.mapOutputIndexToInput(10)]

' l`\x9cuv\x7f'

In [None]:
# Measure the cost of normalizing the test string (100 loops per run).
%timeit -n100 norm(teststring)

197 µs ± 19.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## 3. Explore french dataset characters

### 3.1 Characters frequency in french datasets

In [None]:
# Raw character statistics: one row per character, with its count in the
# Business dataset (CountBusiness) and in the Wikipedia dataset (CountWikipedia).
dfcharstats = pd.read_csv(chardatadir / "charsetstats_raw.csv", sep=";")
dfcharstats.head()

Unnamed: 0.1,Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count
0,0,101,e,Latin Small Letter E,Letter,Lowercase,Basic Latin,3503992000.0,4595437000.0,8099428000.0
1,1,115,s,Latin Small Letter S,Letter,Lowercase,Basic Latin,1960554000.0,2534105000.0,4494658000.0
2,2,97,a,Latin Small Letter A,Letter,Lowercase,Basic Latin,1865590000.0,2447239000.0,4312829000.0
3,3,110,n,Latin Small Letter N,Letter,Lowercase,Basic Latin,1819350000.0,2388609000.0,4207959000.0
4,5,105,i,Latin Small Letter I,Letter,Lowercase,Basic Latin,1766427000.0,2331461000.0,4097888000.0


### 3.2 Characters stats in Wikipedia dataset

- 35.6 billion chars

In [None]:
# Total number of character occurrences counted in the Wikipedia dataset.
charsCountWikipedia = dfcharstats["CountWikipedia"].sum()
charsCountWikipedia

35682395281.0

- 13 502 distinct Unicode chars

In [None]:
# Number of distinct characters seen at least once in the Wikipedia dataset.
distinctCharsWikipedia = len(dfcharstats[dfcharstats["CountWikipedia"]>0])
distinctCharsWikipedia

13502

- Only 1316 chars more frequent than 1 in 100 million

In [None]:
# 356 is the "1 in 100 million" threshold: total Wikipedia chars (35 682 395 281) / 1e8, rounded down.
frequentCharsWikipedia = len(dfcharstats[dfcharstats["CountWikipedia"]>356])
frequentCharsWikipedia

1316

- Frequent chars represent 9.7 % of the distinct Unicode chars found in the Wikipedia dataset

In [None]:
# Share (in %) of the distinct Wikipedia chars that are above the frequency threshold.
pctFreqCharsWikipedia = frequentCharsWikipedia/distinctCharsWikipedia*100
pctFreqCharsWikipedia

9.74670419197156

- 99.9987 % of Wikipedia chars would be preserved if we only kept the frequent chars

In [None]:
# Share (in %) of Wikipedia char occurrences kept when dropping infrequent chars:
# 100 * (1 - occurrences of chars with count <= 356 / total occurrences).
pctPreservedCharsWikipedia = (1-dfcharstats[dfcharstats["CountWikipedia"]<=356]["CountWikipedia"].sum()/dfcharstats["CountWikipedia"].sum())*100
pctPreservedCharsWikipedia

99.99871204274157

### 3.3 Characters stats in Business dataset

- 27.5 billion chars

In [None]:
# Total number of character occurrences counted in the Business dataset.
charsCountBusiness = dfcharstats["CountBusiness"].sum()
charsCountBusiness

27577304956.0

-  3 763 distinct Unicode chars

In [None]:
# Number of distinct characters seen at least once in the Business dataset.
distinctCharsBusiness = len(dfcharstats[dfcharstats["CountBusiness"]>0])
distinctCharsBusiness

3763

- Only 531 chars more frequent than 1 in 100 million

In [None]:
# 275 is the "1 in 100 million" threshold: total Business chars (27 577 304 956) / 1e8, rounded down.
frequentCharsBusiness = len(dfcharstats[dfcharstats["CountBusiness"]>275])
frequentCharsBusiness

531

- Frequent chars represent 14.1 % of the distinct Unicode chars found in the Business dataset

In [None]:
# Share (in %) of the distinct Business chars that are above the frequency threshold.
pctFreqCharsBusiness = frequentCharsBusiness/distinctCharsBusiness*100
pctFreqCharsBusiness

14.11108158384268

- 99.9996 % of Business chars would be preserved if we only kept the frequent chars

In [None]:
# Share (in %) of Business char occurrences kept when dropping infrequent chars:
# 100 * (1 - occurrences of chars with count <= 275 / total occurrences).
pctPreservedCharsBusiness = (1-dfcharstats[dfcharstats["CountBusiness"]<=275]["CountBusiness"].sum()/dfcharstats["CountBusiness"].sum())*100
pctPreservedCharsBusiness

99.9996564385093

- 99.985 % of Wikipedia chars would be preserved if we only kept the frequent Business chars

In [None]:
# Cross-check: chars are selected as infrequent on their Business count (<= 275),
# but the occurrences lost are measured on the Wikipedia counts.
pctPreservedBizCharsInWikipedia = (1-dfcharstats[dfcharstats["CountBusiness"]<=275]["CountWikipedia"].sum()/dfcharstats["CountWikipedia"].sum())*100
pctPreservedBizCharsInWikipedia

99.9848317525845

### 3.4 Character stats after Unicode normalization

In [None]:
# Character statistics after normalization, most frequent Business chars first.
dfcharsnorm = (
    pd.read_csv(chardatadir / "charsetstats_norm.csv", sep=";")
    .sort_values(by="CountBusiness", ascending=False)
)
# Flag the chars that belong to the latinletters collection defined earlier in the notebook.
dfcharsnorm["IsLatinLetter"] = dfcharsnorm["Char"].apply(lambda c: c in latinletters)
dfcharsnorm

Unnamed: 0.1,Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count,IsLatinLetter
0,0,101,e,Latin Small Letter E,Letter,Lowercase,Basic Latin,3.504972e+09,4.598059e+09,8.103030e+09,True
1,1,115,s,Latin Small Letter S,Letter,Lowercase,Basic Latin,1.960554e+09,2.534105e+09,4.494659e+09,True
2,2,97,a,Latin Small Letter A,Letter,Lowercase,Basic Latin,1.865560e+09,2.447291e+09,4.312851e+09,True
3,3,110,n,Latin Small Letter N,Letter,Lowercase,Basic Latin,1.819350e+09,2.388609e+09,4.207959e+09,True
5,5,105,i,Latin Small Letter I,Letter,Lowercase,Basic Latin,1.766427e+09,2.331462e+09,4.097890e+09,True
...,...,...,...,...,...,...,...,...,...,...,...
6188,6188,32490,绪,Cjk Unified Ideograph-7Eea,Letter,Other,CJK Unified Ideographs,0.000000e+00,2.100000e+01,2.100000e+01,False
6189,6189,11397,ⲅ,Coptic Small Letter Gamma,Letter,Lowercase,Coptic,0.000000e+00,1.900000e+01,1.900000e+01,False
6190,6190,66466,𐎢,Old Persian Sign U,Letter,Other,Old Persian,0.000000e+00,1.900000e+01,1.900000e+01,False
6191,6191,2537,৩,Bengali Digit Three,Number,Decimal Digit,Bengali,0.000000e+00,2.200000e+01,2.200000e+01,False


#### Stats for the character families after normalization

In [None]:
# Sum only the CountBusiness column per (Block, Category): aggregating the whole
# frame would also try to "sum" the text columns (Char, Name, ...), which is wasted
# work and is not guaranteed to succeed on every pandas version.
dfblocks = dfcharsnorm.groupby(by=["Block","Category"])["CountBusiness"].sum().sort_values(ascending=False).reset_index()
# 2.75773e+10 matches the rounded total number of chars of the Business dataset
# (27 577 304 956, computed in section 3.3), so the column becomes a number of
# occurrences per million chars.
dfblocks["CountBusiness"] = dfblocks["CountBusiness"] / 2.75773e+10 * 1000000
dfblocks.head(20)

Unnamed: 0,Block,Category,CountBusiness
0,Basic Latin,Letter,881902.669297
1,Basic Latin,Punctuation,46315.655485
2,Basic Latin,Number,32711.158344
3,Latin-1 Supplement,Letter,32478.404267
4,Basic Latin,Separator,3208.964039
5,Latin-1 Supplement,Punctuation,1299.564461
6,Basic Latin,Symbol,712.644313
7,Basic Latin,Other,402.874828
8,Latin-1 Supplement,Symbol,333.857085
9,General Punctuation,Punctuation,216.378688


### Frequent characters to keep

In [None]:
# Keep the chars seen more than 275 times in the Business dataset
# (same "1 in 100 million" threshold as in section 3.3).
dfcharskeep = dfcharsnorm[dfcharsnorm["CountBusiness"]>275]
dfcharskeep

Unnamed: 0.1,Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count,IsLatinLetter
0,0,101,e,Latin Small Letter E,Letter,Lowercase,Basic Latin,3.504972e+09,4.598059e+09,8.103030e+09,True
1,1,115,s,Latin Small Letter S,Letter,Lowercase,Basic Latin,1.960554e+09,2.534105e+09,4.494659e+09,True
2,2,97,a,Latin Small Letter A,Letter,Lowercase,Basic Latin,1.865560e+09,2.447291e+09,4.312851e+09,True
3,3,110,n,Latin Small Letter N,Letter,Lowercase,Basic Latin,1.819350e+09,2.388609e+09,4.207959e+09,True
5,5,105,i,Latin Small Letter I,Letter,Lowercase,Basic Latin,1.766427e+09,2.331462e+09,4.097890e+09,True
...,...,...,...,...,...,...,...,...,...,...,...
449,449,1102,ю,Cyrillic Small Letter Yu,Letter,Lowercase,Cyrillic,2.790000e+02,2.814000e+03,3.093000e+03,False
414,414,8776,≈,Almost Equal To,Symbol,Math,Mathematical Operators,2.780000e+02,3.833000e+03,4.111000e+03,False
868,868,129318,🤦,Face Palm,Symbol,Other,Supplemental Symbols and Pictographs,2.770000e+02,1.900000e+02,4.670000e+02,False
190,190,962,ς,Greek Small Letter Final Sigma,Letter,Lowercase,Greek and Coptic,2.760000e+02,2.787900e+04,2.815500e+04,False


In [None]:
# Categories present among the kept chars; they are reviewed one group at a time below.
dfcharskeep["Category"].unique()

array(['Letter', 'Punctuation', 'Number', 'Separator', 'Other', 'Symbol',
       'Mark'], dtype=object)

1. Latin letters

In [None]:
# Kept chars flagged as latin letters. The frame is sorted by decreasing
# CountBusiness, so tail(50) shows the 50 least frequent of them.
dfkeep_latinletters = dfcharskeep[dfcharskeep["IsLatinLetter"]]
print(len(dfkeep_latinletters))
dfkeep_latinletters.tail(50)

127


Unnamed: 0.1,Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count,IsLatinLetter
135,135,241,ñ,Latin Small Letter N With Tilde,Letter,Lowercase,Latin-1 Supplement,34910.0,121216.0,156126.0,True
124,124,243,ó,Latin Small Letter O With Acute,Letter,Lowercase,Latin-1 Supplement,29186.0,291813.0,320999.0,True
162,162,214,Ö,Latin Capital Letter O With Diaeresis,Letter,Uppercase,Latin-1 Supplement,27400.0,49366.0,76766.0,True
123,123,237,í,Latin Small Letter I With Acute,Letter,Lowercase,Latin-1 Supplement,26639.0,311149.0,337788.0,True
145,145,227,ã,Latin Small Letter A With Tilde,Letter,Lowercase,Latin-1 Supplement,14743.0,78217.0,92960.0,True
218,218,207,Ï,Latin Capital Letter I With Diaeresis,Letter,Uppercase,Latin-1 Supplement,12459.0,12984.0,25443.0,True
229,229,203,Ë,Latin Capital Letter E With Diaeresis,Letter,Uppercase,Latin-1 Supplement,11456.0,11277.0,22733.0,True
184,184,219,Û,Latin Capital Letter U With Circumflex,Letter,Uppercase,Latin-1 Supplement,9748.0,8508.0,18256.0,True
142,142,250,ú,Latin Small Letter U With Acute,Letter,Lowercase,Latin-1 Supplement,8041.0,78223.0,86264.0,True
164,164,248,ø,Latin Small Letter O With Stroke,Letter,Lowercase,Latin-1 Supplement,7548.0,38132.0,45680.0,True


2. Numbers

In [None]:
# Kept chars whose category is "Number".
dfkeep_numbers = dfcharskeep[dfcharskeep["Category"]=="Number"]
print(len(dfkeep_numbers))
dfkeep_numbers

10


Unnamed: 0.1,Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count,IsLatinLetter
18,18,48,0,Digit Zero,Number,Decimal Digit,Basic Latin,205900253.0,203783618.0,409683871.0,False
22,22,49,1,Digit One,Number,Decimal Digit,Basic Latin,182578174.0,193454781.0,376032955.0,False
23,23,50,2,Digit Two,Number,Decimal Digit,Basic Latin,150468456.0,148890442.0,299358898.0,False
37,37,51,3,Digit Three,Number,Decimal Digit,Basic Latin,63204503.0,62978149.0,126182652.0,False
38,38,53,5,Digit Five,Number,Decimal Digit,Basic Latin,61928962.0,63015043.0,124944005.0,False
42,42,52,4,Digit Four,Number,Decimal Digit,Basic Latin,54315896.0,55117336.0,109433232.0,False
47,47,57,9,Digit Nine,Number,Decimal Digit,Basic Latin,49659061.0,64890554.0,114549615.0,False
49,49,56,8,Digit Eight,Number,Decimal Digit,Basic Latin,45892947.0,51573918.0,97466865.0,False
52,52,55,7,Digit Seven,Number,Decimal Digit,Basic Latin,44417739.0,48012755.0,92430494.0,False
51,51,54,6,Digit Six,Number,Decimal Digit,Basic Latin,43719436.0,46734704.0,90454140.0,False


3. Letters from non-Latin scripts

In [None]:
# Kept letters that are not latin letters. The Latin-1 Supplement block is excluded
# here: its non-latin "Letter" chars are grouped with the symbols (group 5).
dfkeep_otherscripts = dfcharskeep[(dfcharskeep["Category"] == "Letter") & (~dfcharskeep["IsLatinLetter"]) & (dfcharskeep["Block"] != "Latin-1 Supplement")]
print(len(dfkeep_otherscripts))
dfkeep_otherscripts

88


Unnamed: 0.1,Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count,IsLatinLetter
152,152,1077,е,Cyrillic Small Letter Ie,Letter,Lowercase,Cyrillic,6550.0,69243.0,75793.0,False
144,144,1072,а,Cyrillic Small Letter A,Letter,Lowercase,Cyrillic,5945.0,88915.0,94860.0,False
146,146,1086,о,Cyrillic Small Letter O,Letter,Lowercase,Cyrillic,5695.0,79919.0,85614.0,False
169,169,1090,т,Cyrillic Small Letter Te,Letter,Lowercase,Cyrillic,4452.0,40796.0,45248.0,False
160,160,1575,ا,Arabic Letter Alef,Letter,Other,Arabic,4403.0,60273.0,64676.0,False
148,148,1080,и,Cyrillic Small Letter I,Letter,Lowercase,Cyrillic,3699.0,75722.0,79421.0,False
159,159,1085,н,Cyrillic Small Letter En,Letter,Lowercase,Cyrillic,3115.0,59369.0,62484.0,False
163,163,1089,с,Cyrillic Small Letter Es,Letter,Lowercase,Cyrillic,3020.0,45943.0,48963.0,False
161,161,1088,р,Cyrillic Small Letter Er,Letter,Lowercase,Cyrillic,2953.0,52739.0,55692.0,False
312,312,38971,頻,Cjk Unified Ideograph-983B,Letter,Other,CJK Unified Ideographs,2562.0,1830.0,4392.0,False


4. Punctuation and separators

In [None]:
# Kept chars whose category is "Punctuation", "Separator" or "Other".
dfkeep_punctsep = dfcharskeep[(dfcharskeep["Category"] == "Punctuation") | (dfcharskeep["Category"] == "Separator") | (dfcharskeep["Category"] == "Other")]
print(len(dfkeep_punctsep))
dfkeep_punctsep

36


Unnamed: 0.1,Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count,IsLatinLetter
16,16,44,",",Comma,Punctuation,Other,Basic Latin,286106887.0,406440340.0,692547227.0,False
19,19,39,',Apostrophe,Punctuation,Other,Basic Latin,279745827.0,359597564.0,639343391.0,False
17,17,46,.,Full Stop,Punctuation,Other,Basic Latin,270047735.0,329582293.0,599630028.0,False
34,34,45,-,Hyphen-Minus,Punctuation,Dash,Basic Latin,100118715.0,120962325.0,221081040.0,False
28,28,32,,Space,Separator,Space,Basic Latin,88494564.0,102873788.0,191368352.0,False
30,30,58,:,Colon,Punctuation,Other,Basic Latin,80872165.0,71707754.0,152579919.0,False
43,43,47,/,Solidus,Punctuation,Other,Basic Latin,50243665.0,42180730.0,92424395.0,False
50,50,34,"""",Quotation Mark,Punctuation,Other,Basic Latin,47363272.0,58415460.0,105778732.0,False
58,58,41,),Right Parenthesis,Punctuation,Close,Basic Latin,39529388.0,55662250.0,95191638.0,False
59,59,40,(,Left Parenthesis,Punctuation,Open,Basic Latin,38061346.0,54844441.0,92905787.0,False


5. Symbols

In [None]:
# Kept symbols with a code point below 9000, plus the Latin-1 Supplement chars
# categorized as "Letter" that are not flagged as latin letters.
dfkeep_symbols = dfcharskeep[((dfcharskeep["Category"] == "Symbol") & (dfcharskeep["Code"]<9000)) | ((dfcharskeep["Block"] == "Latin-1 Supplement") & (dfcharskeep["Category"] == "Letter") & (~dfcharskeep["IsLatinLetter"]))]
print(len(dfkeep_symbols))
dfkeep_symbols

34


Unnamed: 0.1,Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count,IsLatinLetter
74,74,176,°,Degree Sign,Symbol,Other,Latin-1 Supplement,8558786.0,5457787.0,14016573.0,False
84,84,124,|,Vertical Line,Symbol,Math,Basic Latin,5506250.0,7417170.0,12923420.0,False
82,82,8364,€,Euro Sign,Symbol,Currency,Currency Symbols,5309670.0,3588551.0,8898221.0,False
83,83,61,=,Equals Sign,Symbol,Math,Basic Latin,3851169.0,4258407.0,8109576.0,False
100,100,62,>,Greater-Than Sign,Symbol,Math,Basic Latin,3822951.0,4608298.0,8431249.0,False
93,93,43,+,Plus Sign,Symbol,Math,Basic Latin,3728710.0,3279061.0,7007771.0,False
107,107,60,<,Less-Than Sign,Symbol,Math,Basic Latin,2007466.0,3255821.0,5263287.0,False
111,111,169,©,Copyright Sign,Symbol,Other,Latin-1 Supplement,432972.0,305566.0,738538.0,False
114,114,36,$,Dollar Sign,Symbol,Currency,Basic Latin,369961.0,314547.0,684508.0,False
116,116,94,^,Circumflex Accent,Symbol,Modifier,Basic Latin,344331.0,253219.0,597550.0,False


6. Graphical symbols

In [None]:
# Kept symbols with a code point of 9000 or more, outside the two emoticon blocks
# ("Emoticons" and "Supplemental Symbols and Pictographs") listed in group 7.
dfkeep_graphsymbols = dfcharskeep[((dfcharskeep["Category"] == "Symbol") & (dfcharskeep["Code"]>=9000) & (dfcharskeep["Block"] != "Emoticons") & (dfcharskeep["Block"] != "Supplemental Symbols and Pictographs"))]
print(len(dfkeep_graphsymbols))
dfkeep_graphsymbols

66


Unnamed: 0.1,Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count,IsLatinLetter
115,115,10003,✓,Check Mark,Symbol,Other,Dingbats,382579.0,212625.0,595204.0,False
167,167,9829,♥,Black Heart Suit,Symbol,Other,Miscellaneous Symbols,13095.0,7887.0,20982.0,False
134,134,65533,�,Replacement Character,Symbol,Other,Specials,9182.0,4585.0,13767.0,False
214,214,9654,▶,Black Right-Pointing Triangle,Symbol,Other,Geometric Shapes,7839.0,6770.0,14609.0,False
183,183,9888,⚠,Warning Sign,Symbol,Other,Miscellaneous Symbols,3448.0,2585.0,6033.0,False
207,207,9733,★,Black Star,Symbol,Other,Miscellaneous Symbols,3280.0,1836.0,5116.0,False
337,337,10145,➡,Black Rightwards Arrow,Symbol,Other,Dingbats,2808.0,3310.0,6118.0,False
358,358,128308,🔴,Large Red Circle,Symbol,Other,Miscellaneous Symbols and Pictographs,2383.0,3360.0,5743.0,False
401,401,10084,❤,Heavy Black Heart,Symbol,Other,Dingbats,2321.0,2367.0,4688.0,False
350,350,128170,💪,Flexed Biceps,Symbol,Other,Miscellaneous Symbols and Pictographs,2237.0,2820.0,5057.0,False


7. Emoticons

In [None]:
# Kept symbols with a code point of 9000 or more that belong to the
# "Emoticons" or "Supplemental Symbols and Pictographs" blocks.
dfkeep_emoticons = dfcharskeep[((dfcharskeep["Category"] == "Symbol") & (dfcharskeep["Code"]>=9000) & ((dfcharskeep["Block"] == "Emoticons") | (dfcharskeep["Block"] == "Supplemental Symbols and Pictographs")))]
print(len(dfkeep_emoticons))
dfkeep_emoticons

31


Unnamed: 0.1,Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count,IsLatinLetter
153,153,128578,🙂,Slightly Smiling Face,Symbol,Other,Emoticons,20346.0,18170.0,38516.0,False
150,150,128521,😉,Winking Face,Symbol,Other,Emoticons,18080.0,13563.0,31643.0,False
260,260,128512,😀,Grinning Face,Symbol,Other,Emoticons,3992.0,3645.0,7637.0,False
326,326,128514,😂,Face With Tears Of Joy,Symbol,Other,Emoticons,3682.0,3250.0,6932.0,False
387,387,128513,😁,Grinning Face With Smiling Eyes,Symbol,Other,Emoticons,2194.0,1699.0,3893.0,False
287,287,128522,😊,Smiling Face With Smiling Eyes,Symbol,Other,Emoticons,1958.0,1385.0,3343.0,False
527,527,128577,🙁,Slightly Frowning Face,Symbol,Other,Emoticons,1072.0,1175.0,2247.0,False
548,548,128591,🙏,Person With Folded Hands,Symbol,Other,Emoticons,1051.0,1345.0,2396.0,False
512,512,128517,😅,Smiling Face With Open Mouth And Cold Sweat,Symbol,Other,Emoticons,978.0,834.0,1812.0,False
562,562,128525,😍,Smiling Face With Heart-Shaped Eyes,Symbol,Other,Emoticons,957.0,1285.0,2242.0,False


Frequent characters of category `Mark` (combining marks) — these will be ignored:

In [None]:
 # Kept chars whose category is "Mark" (the only remaining category).
 dfcharskeep[dfcharskeep["Category"] == "Mark"]

Unnamed: 0.1,Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count,IsLatinLetter
278,278,769,́,Combining Acute Accent,Mark,Nonspacing,Combining Diacritical Marks,1448.0,9189.0,10637.0,False
441,441,3659,๋,Thai Character Mai Chattawa,Mark,Nonspacing,Thai,864.0,492.0,1356.0,False
475,475,2494,া,Bengali Vowel Sign Aa,Mark,Spacing Combining,Bengali,441.0,2840.0,3281.0,False
