In [None]:
from pathlib import Path

# Root folder of the dataset; every other path in this notebook is built from it.
# The leading \\?\ is the Windows extended-length path prefix.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
rootdir = Path(r"\\?\C:\Users\laure\OneDrive\Dev\Python\nlptextdoc\dataset 092019")

In [None]:
import numpy as np
import pandas as pd
# Let displayed DataFrames show up to 100 rows and 50 columns before truncation
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

# Dataset access functions

In [None]:
def list_datasets():
    """Read the dataset catalog ``datasets.csv`` located directly under ``rootdir``.

    The file is parsed as a semicolon-separated CSV.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    catalogpath = rootdir / "datasets.csv"
    catalogdf = pd.read_csv(catalogpath, sep=';')
    return catalogdf
    
# Dataset catalog, loaded once; its "Dataset" and "DatasetFile" columns are read by the cells below
datasetsdf = list_datasets()

In [None]:
def read_dataset_file(datasetfile):
    """Load the text-block DataFrame stored for one dataset file.

    The frame is read from ``rootdir / "datasets" / "<datasetfile>.dataset.feather"``
    and a one-line summary (number of text blocks) is printed.

    Args:
        datasetfile: dataset file name without the ``.dataset.feather`` suffix.

    Returns:
        The DataFrame read with ``pd.read_feather``.

    Raises:
        FileNotFoundError: if the feather file does not exist. This is a
            subclass of ``Exception``, so callers catching ``Exception`` are
            unaffected.
    """
    # NOTE(review): unlike loadVocabulary / loadCharset, the name is not lower-cased here - confirm this is intended
    datasetdffile = rootdir / "datasets" / (datasetfile+".dataset.feather")
    if not datasetdffile.exists():
        raise FileNotFoundError(f"No dataframe for dataset {datasetfile}")
    datasetdf = pd.read_feather(datasetdffile)
    print(f"Loaded dataframe for dataset {datasetfile} : {len(datasetdf)} text blocks")
    return datasetdf

In [None]:
def get_textblocks_from_dataset(dataset, minwords=5, maxwords=None, lang=None):
    """Yield the ``"Text"`` value of every selected row of a dataset.

    All distinct ``DatasetFile`` entries of ``datasetsdf`` whose ``Dataset``
    column equals ``dataset`` are loaded one after the other with
    ``read_dataset_file``.

    Args:
        dataset: value matched against the ``Dataset`` column of ``datasetsdf``.
        minwords, maxwords, lang: forwarded unchanged to
            ``get_rows_from_datasetdf``.
            NOTE(review): that helper is not defined in the visible part of
            the notebook - presumably it filters rows on word count and
            language; confirm.

    Yields:
        ``row["Text"]`` for each ``(rowidx, row)`` pair produced by
        ``get_rows_from_datasetdf``.
    """
    belongstodataset = datasetsdf["Dataset"] == dataset
    datasetfiles = datasetsdf.loc[belongstodataset, "DatasetFile"].unique()
    for datasetfile in datasetfiles:
        datasetdf = read_dataset_file(datasetfile)
        selectedrows = get_rows_from_datasetdf(datasetdf, minwords, maxwords, lang)
        for _, row in selectedrows:
            yield row["Text"]

In [None]:
def loadVocabulary(datasetfile):
    """Read the vocabulary DataFrame of one dataset file.

    Args:
        datasetfile: dataset file name; it is lower-cased to build the file
            name ``<datasetfile>.vocabulary.feather``.

    Returns:
        The DataFrame read from ``rootdir / "datasets"`` with ``pd.read_feather``.
    """
    datasetsdir = rootdir / "datasets"
    filename = datasetfile.lower() + ".vocabulary.feather"
    return pd.read_feather(datasetsdir / filename)

In [None]:
def loadCharset(datasetfile):
    """Read the charset DataFrame of one dataset file.

    Args:
        datasetfile: dataset file name; it is lower-cased to build the file
            name ``<datasetfile>.charset.feather``.

    Returns:
        The DataFrame read from ``rootdir / "datasets"`` with ``pd.read_feather``.
    """
    datasetsdir = rootdir / "datasets"
    filename = datasetfile.lower() + ".charset.feather"
    return pd.read_feather(datasetsdir / filename)

In [None]:
# Print the number of rows of the vocabulary table of each distinct dataset file
for datasetfile in datasetsdf["DatasetFile"].unique():
    vocabdf = loadVocabulary(datasetfile)
    print(f"{datasetfile} : {len(vocabdf)}")

assurance : 159060
banque : 235269
bourse : 392732
comparateur : 186390
crédit : 48299
forum : 1873326
institution : 52982
presse-1 : 843583
presse-2 : 610211
presse-3 : 679899
presse-4 : 892912
presse-5 : 740413
presse-6 : 741077
siteinfo : 445874
wikipedia-1 : 1273696
wikipedia-2 : 1339014
wikipedia-3 : 1376058
wikipedia-4 : 1412029
wikipedia-5 : 1404220


# Unicode character properties

In [None]:
from unicodedata import name as unicodename
from unicodedata import category as unicodecategory

# Folder holding the semicolon-separated character reference tables
chardatadir = rootdir / "libdata" / "chars"

# Unicode blocks: the "Start" / "End" columns are parsed as base-16 strings into
# the integer columns "StartNum" / "EndNum", which charblock() compares with ord(char)
unicodeblocks = pd.read_csv(chardatadir / "unicode_blocks.csv", sep=";")
unicodeblocks["StartNum"] = unicodeblocks["Start"].map(lambda hex:int(hex,16))
unicodeblocks["EndNum"] = unicodeblocks["End"].map(lambda hex:int(hex,16))

# Unicode general categories: looked up by "Code" in charcategory() / charsubcategory()
unicodecategories = pd.read_csv(chardatadir / "unicode_categories.csv", sep=";")

# Mapping from "UnicodeBlock" to "CharFamily", used by blockfamily()
unicodefamilies = pd.read_csv(chardatadir / "unicode_families.csv", sep=";")

def charname(char):
    """Return the title-cased Unicode name of ``char``.

    Characters without a Unicode name fall back to ``"Char <code point>"``.

    Args:
        char: a single-character string.
    """
    fallback = f"Char {ord(char)}"
    rawname = unicodename(char, fallback)
    return rawname.title()

def charcategory(char):
    """Return the ``"Category"`` value of ``unicodecategories`` for ``char``.

    The row is selected by comparing the ``"Code"`` column with the Unicode
    general category code of ``char``; the first matching row is used.

    Raises:
        IndexError: if no row of ``unicodecategories`` has that code.
    """
    code = unicodecategory(char)
    hascode = unicodecategories["Code"] == code
    matchingrows = unicodecategories.loc[hascode]
    return matchingrows["Category"].iloc[0]

def charsubcategory(char):
    """Return the ``"Subcategory"`` value of ``unicodecategories`` for ``char``.

    The row is selected by comparing the ``"Code"`` column with the Unicode
    general category code of ``char``; the first matching row is used.

    Raises:
        IndexError: if no row of ``unicodecategories`` has that code.
    """
    code = unicodecategory(char)
    hascode = unicodecategories["Code"] == code
    matchingrows = unicodecategories.loc[hascode]
    return matchingrows["Subcategory"].iloc[0]

def charblock(char):
    """Return the ``"Block"`` value of the first Unicode block containing ``char``.

    A block contains the character when ``StartNum <= ord(char) <= EndNum``.

    Raises:
        IndexError: if no row of ``unicodeblocks`` contains the code point.
    """
    codepoint = ord(char)
    startsbefore = unicodeblocks["StartNum"] <= codepoint
    endsafter = unicodeblocks["EndNum"] >= codepoint
    containingblocks = unicodeblocks[startsbefore & endsafter]
    return containingblocks["Block"].iloc[0]

def blockfamily(block):
    """Return the character family associated with a Unicode block name.

    Args:
        block: value compared with the ``"UnicodeBlock"`` column of
            ``unicodefamilies``.

    Returns:
        The ``"CharFamily"`` of the first matching row, or ``"Other"`` when no
        row matches.
    """
    matchingrows = unicodefamilies[unicodefamilies["UnicodeBlock"] == block]
    if len(matchingrows) > 0:
        return matchingrows["CharFamily"].iloc[0]
    return "Other"

In [None]:
def enhanceCharset(charsetdf):
    """Add descriptive columns to a charset DataFrame, in place.

    Reads the ``"Code"`` (code point) and ``"Count"`` columns and adds, in this
    order: ``Char``, ``isAlpha``, ``isDigit``, ``isSpace``, ``Percent``,
    ``Name``, ``Category``, ``Subcategory`` and ``Block``.

    ``Percent`` is the cumulative sum of ``Count`` in the current row order,
    expressed as a percentage of the total count.

    Args:
        charsetdf: DataFrame modified in place; nothing is returned.
    """
    chars = charsetdf["Code"].map(chr)
    charsetdf["Char"] = chars
    charsetdf["isAlpha"] = chars.map(str.isalpha)
    charsetdf["isDigit"] = chars.map(str.isdigit)
    charsetdf["isSpace"] = chars.map(str.isspace)
    counts = charsetdf["Count"]
    charsetdf["Percent"] = 100*counts.cumsum()/counts.sum()
    charsetdf["Name"] = chars.map(charname)
    charsetdf["Category"] = chars.map(charcategory)
    charsetdf["Subcategory"] = chars.map(charsubcategory)
    charsetdf["Block"] = chars.map(charblock)

# Generate character statistics after character normalization

In [None]:
import pandas as pd
from functools import partial
from operator import itemgetter
from io import StringIO

    
class TextNormalizer():
    """Character-level text normalizer applying a fixed sequence of 14 transforms.

    Each transform is a generator: it consumes an iterator of characters and
    yields the transformed characters, reporting every modification through
    ``result.addChange(layer, index, inputChars, outputChars)``. ``index`` is
    the position of the modified span in the character stream received by
    that transform, i.e. in the output of the previous layer.

    Calling the instance on a string returns a ``NormResult``.
    """

    def __init__(self, rootdir):
        """Load the character tables and build the transform pipeline.

        Args:
            rootdir: path-like object supporting ``/``; the semicolon-separated
                CSV tables are read from ``rootdir / "libdata" / "chars"``.
        """
        # 1. Load Unicode character set data for latin script
        chardatadir = rootdir / "libdata" / "chars"
        # 1.1 Frequent encoding errors : windows1252 read as iso8859-1
        dfencodingwin1252 = pd.read_csv(chardatadir / "windows1252-iso8859-errors.csv", sep=";")
        win1252errorchars = {}
        for _, row in dfencodingwin1252.iterrows():
            win1252errorchars[row["Char"]] = row["DecodedChar"]
        # 1.2 Frequent encoding errors : utf8 read as windows1252
        dfencodingutf8 = pd.read_csv(chardatadir / "utf8-windows1252-errors.csv", sep=";")
        utf8errorchars = {}
        for _, row in dfencodingutf8.iterrows():
            utf8errorchars[row["ErrorSubstring"]] = row["DecodedChar"]
        utf8errorshdict = self.buildhierarchicaldict(utf8errorchars)
        # 1.3 Frequent encoding errors : windows1252 read as utf8
        dfencodingwin1252utf8 = pd.read_csv(chardatadir / "windows1252-utf8-errors.csv", sep=";")
        win1252utf8errorchars = {}
        for _, row in dfencodingwin1252utf8.iterrows():
            win1252utf8errorchars[row["Char"]] = row["DecodedChars"]
        # 1.4 Unicode combining chars : base char followed by combining char => combined char
        dfcombiningchars = pd.read_csv(chardatadir / "combiningdiacritics.csv", sep=";")
        combiningchars = {}
        for _, row in dfcombiningchars.iterrows():
            combiningchars[row["BaseChar"]+row["Char"]] = row["CombinedChar"]
        combiningcharshdict = self.buildhierarchicaldict(combiningchars)
        # 1.5 Control chars
        dfcontrolchars = pd.read_csv(chardatadir / "controlchars.csv", sep=";")
        dfcontrolchars.loc[0,"Char"] = chr(0) # chr(0) can't be saved in CSV file
        controlchars = set(dfcontrolchars["Char"])
        # 1.6 Latin letter symbols
        dflatinsymbols = pd.read_csv(chardatadir / "latinsymbols.csv", sep=";")
        latinlettersnolayout = {}
        for _, row in dflatinsymbols.iterrows():
            latinlettersnolayout[row["Char"]] = row["NormString"]
        # 1.7 Latin letters : base char of letters with diacritics, expansion of ligatures
        dflatinletters = pd.read_csv(chardatadir / "latinletters.csv", sep=";")
        latinlettersnodiacritics = {}
        latinlettersnoligatures = {}
        for _, row in dflatinletters.iterrows():
            if row["IsDiacritic"]:
                latinlettersnodiacritics[row["Char"]] = row["BaseChar"]
            if row["IsLigature"]:
                latinlettersnoligatures[row["Char"]] = row["MultiChars"]
        # 1.8 Latin numbers and number symbols
        dflatinnumbers = pd.read_csv(chardatadir / "latinnumbers.csv", sep=";")
        latinnumbersnolayout = {}
        for rowidx,row in dflatinnumbers.iterrows():
            # Rows with index label 0..9 are left untouched
            # (presumably the plain digits 0-9 - TODO confirm against the CSV)
            if rowidx < 10:
                continue
            latinnumbersnolayout[row["Char"]] = row["NormString"]
        # 1.9 Variations on frequent chars to normalize
        dfnormchars = pd.read_csv(chardatadir / "normalizedchars.csv", sep=";")
        normalizedchars = {}
        for _, row in dfnormchars.iterrows():
            normalizedchars[row["Char"]] = row["NormChar"]
        # 1.10 Optional replacement of cyrillic and greek chars looking like latin letters
        dfcgnormchars = pd.read_csv(chardatadir / "cyrillic-greek-chars.csv", sep=";")
        cgnormalizedchars = {}
        for _, row in dfcgnormchars.iterrows():
            cgnormalizedchars[row["Char"]] = row["NormChar"]
        # 1.11 Final supported french charset
        dfsupportedchars = pd.read_csv(chardatadir / "charset-fr.csv", sep=";", quotechar='"')
        dfsupportedchars.loc[0,"Char"] = chr(0) # chr(0) can't be saved in CSV file
        supportedchars = set(dfsupportedchars["Char"])

        # 2.1 List successive transformations : (description, transform, extra arguments).
        # The position in this list is the layer number passed to the transform.
        pipeline = [
            ("Fix encoding errors : windows1252 read as iso8859-1", self.replacechars1to1, (win1252errorchars,)),
            ("Fix encoding errors : utf8 read as windows1252", self.replacecharsNto1, (utf8errorshdict,)),
            ("Fix encoding errors :  windows1252 read as utf8", self.replacechars1toN, (win1252utf8errorchars,)),
            ("Merge Unicode combining chars", self.replacecharsNto1, (combiningcharshdict,)),
            ("Ignore control chars", self.ignorechars, (controlchars,)),
            ("Replace latin letter symbols", self.replacechars1toN, (latinlettersnolayout,)),
            ("Replace latin letter ligatures", self.replacechars1toN, (latinlettersnoligatures,)),
            ("Replace latin number symbols", self.replacechars1toN, (latinnumbersnolayout,)),
            ("Normalize equivalent chars", self.replacechars1to1, (normalizedchars,)),
            ("Replace cyrillic and greek chars looking like latin letters", self.replacechars1to1, (cgnormalizedchars,)),
            ("Replace infrequent chars : latin letters with diacritics", self.replacecharsnotinset, (supportedchars, latinlettersnodiacritics)),
            ("Replace infrequent chars : other scripts", self.replaceotherscripts, (supportedchars,)),
            ("Replace infrequent chars : symbols", self.replacesymbols, (supportedchars,)),
            ("Replace infrequent chars : chars to ignore", self.ignoreotherchars, (supportedchars,)),
        ]
        self.transformsDescs = [desc for desc, _, _ in pipeline]
        transforms = [partial(transform, layer, *args)
                      for layer, (_, transform, args) in enumerate(pipeline)]

        # 2.2 Combine all transformations : each generator consumes the previous one
        def func(x,y):
            ci = x
            for transform in transforms:
                ci = transform(ci,y)
            return ci
        self.transformsFunc = func

    def __repr__(self):
        """Return the numbered list (starting at 1) of transform descriptions, one per line."""
        desc = StringIO()
        for idx,transformDesc in enumerate(self.transformsDescs):
            desc.write(f'{idx+1} - {transformDesc}\n')
        return desc.getvalue()

    def __call__(self, inputText):
        """Normalize ``inputText`` and return a ``NormResult`` holding the output and the changes."""
        result = NormResult(inputText, self.transformsDescs)
        result.setOutput(self.tostring(self.transformsFunc(inputText,result)))
        return result

    @staticmethod
    def buildhierarchicaldict(idict):
        """Convert a flat ``{pattern: replacement}`` dict into a nested per-character dict.

        Each level maps one character either to a nested dict (more characters
        are expected) or to the replacement value (the pattern ends here), which
        is the structure walked by ``replacecharsNto1``.

        Rules:
            * patterns shorter than two characters are ignored;
            * when one pattern is a strict prefix of another, only the longer
              one is kept;
            * patterns sharing a prefix but having different lengths are all
              kept (previously the shorter ones were silently dropped, e.g.
              ``"ad"`` was lost next to ``"abc"``).

        Args:
            idict: dict mapping pattern strings to replacement values
                (replacement values must not be dicts).

        Returns:
            The nested dict.
        """
        hdict = {}
        for key, value in idict.items():
            if len(key) < 2:
                continue
            node = hdict
            for char in key[:-1]:
                child = node.get(char)
                if not isinstance(child, dict):
                    # Either a new branch, or a shorter pattern ended here :
                    # the longer pattern takes precedence
                    child = {}
                    node[char] = child
                node = child
            lastchar = key[-1]
            if not isinstance(node.get(lastchar), dict):
                node[lastchar] = value
        return hdict

    @staticmethod
    def ignorechars(layer, charset, chariterator, result):
        """Drop every char found in ``charset``; each removal is recorded with an empty output."""
        for index,char in enumerate(chariterator):
            if char not in charset:
                yield char
            else:
                result.addChange(layer, index, char, '')

    @staticmethod
    def replacechars1to1(layer, chardict, chariterator, result):
        """Replace each char found in ``chardict`` by its mapped value, yielded as one item."""
        for index,char in enumerate(chariterator):
            if char in chardict:
                resChar = chardict[char]
                result.addChange(layer, index, char, resChar)
                yield resChar
            else:
                yield char

    @staticmethod
    def replacechars1toN(layer, chardict, chariterator, result):
        """Replace each char found in ``chardict`` by its mapped string, yielded char by char."""
        for index,char in enumerate(chariterator):
            if char in chardict:
                resStr = chardict[char]
                result.addChange(layer, index, char, resStr)
                for outchar in resStr:
                    yield outchar
            else:
                yield char

    @staticmethod
    def replacecharsNto1(layer, hierarchicaldict, chariterator, result):
        """Replace multi-char patterns by a single value.

        ``hierarchicaldict`` is the nested dict built by ``buildhierarchicaldict``.
        Several match attempts can be in progress at the same time :
        ``candidatechars`` holds the chars read since the oldest pending attempt
        started, and ``candidatedicts[i]`` is the current dict node of the
        attempt started at ``candidatechars[i]`` (``None`` once it has failed).
        """
        candidatechars = []
        candidatedicts = []
        for index,char in enumerate(chariterator):
            # Try to match previously started patterns
            if len(candidatechars)>0:
                for idx,candidatedict in enumerate(candidatedicts):
                    if not candidatedict is None:
                        if char in candidatedict:
                            value = candidatedict[char]
                            if isinstance(value,dict):
                                candidatedicts[idx] = value
                            else:
                                # Success : found a char to return
                                # Chars read before the start of the matched pattern are emitted unchanged
                                for ridx in range(0,idx):
                                    yield candidatechars[ridx]
                                replacedStr = "".join(candidatechars[idx:]) + char
                                result.addChange(layer, index-len(replacedStr)+1, replacedStr, value)
                                candidatechars = []
                                candidatedicts = []
                                # The current char was consumed by the match
                                char = None
                                yield value
                                break
                        else:
                            candidatedicts[idx] = None
                # Clean oldest failed attempts and return accumulated chars
                while len(candidatedicts)>0 and candidatedicts[0] is None:
                    candidatedicts.pop(0)
                    yield candidatechars.pop(0)
            # Handle the current char
            if not char is None:
                if len(candidatechars)==0:
                    if char in hierarchicaldict:
                        value = hierarchicaldict[char]
                        if isinstance(value,dict):
                            candidatechars.append(char)
                            candidatedicts.append(value)
                        else:
                            result.addChange(layer, index, char, value)
                            yield value
                    else:
                        yield char
                else:
                    # Other attempts are still pending : buffer the char,
                    # and start a new attempt from it when possible
                    candidatechars.append(char)
                    if char in hierarchicaldict:
                        value = hierarchicaldict[char]
                        candidatedicts.append(value)
                    else:
                        candidatedicts.append(None)
        # End of input : emit the chars of the unfinished attempts unchanged
        if len(candidatechars)>0:
            for char in candidatechars:
                yield char

    @staticmethod
    def replacecharsnotinset(layer, charset, replacedict, chariterator, result):
        """Replace chars which are not in ``charset`` but have an entry in ``replacedict``."""
        for index,char in enumerate(chariterator):
            if char in charset:
                yield char
            else:
                if char in replacedict:
                    resChar = replacedict[char]
                    result.addChange(layer, index, char, resChar)
                    yield resChar
                else:
                    yield char

    @staticmethod
    def replaceotherscripts(layer, charset, chariterator, result):
        """Encode unsupported chars whose block family is neither "Symbols" nor "Ignore".

        Such a char is replaced by ``chr(65532)`` (U+FFFC), its decimal code
        point and ``'_'``. The family comes from the module-level functions
        ``blockfamily`` and ``charblock``.
        """
        for index,char in enumerate(chariterator):
            if char in charset:
                yield char
            else:
                family = blockfamily(charblock(char))
                if family not in ("Symbols","Ignore"):
                    resStr = chr(65532) + str(ord(char)) + '_'
                    result.addChange(layer, index, char, resStr)
                    for outchar in resStr:
                        yield outchar
                else:
                    yield char

    @staticmethod
    def replacesymbols(layer, charset, chariterator, result):
        """Encode unsupported chars of the "Symbols" family.

        Such a char is replaced by ``'$'``, its name as returned by the
        module-level ``charname`` with spaces removed, and ``'_'``.
        """
        for index,char in enumerate(chariterator):
            if char in charset:
                yield char
            else:
                family = blockfamily(charblock(char))
                if family == "Symbols":
                    resStr ='$' + charname(char).replace(' ','') + '_'
                    result.addChange(layer, index, char, resStr)
                    for outchar in resStr:
                        yield outchar
                else:
                    yield char

    @staticmethod
    def ignoreotherchars(layer, charset, chariterator, result):
        """Drop unsupported chars of the "Ignore" family; each removal is recorded."""
        for index,char in enumerate(chariterator):
            if char in charset:
                yield char
            else:
                family = blockfamily(charblock(char))
                if family == "Ignore":
                    result.addChange(layer, index, char, '')
                else:
                    yield char

    @staticmethod
    def tostring(iterator):
        """Concatenate the strings produced by ``iterator``."""
        return "".join(iterator)
    
    
class NormResult():
    """Result of one normalization : input text, output text and changes recorded per layer.

    Attributes:
        input: the original text.
        transforms: descriptions of the transforms, indexed by layer number.
        layerChanges: ``None`` while no change was recorded; otherwise a list
            indexed by layer number, each item being the list of changes
            recorded for that layer (empty for layers without change).
        output: the normalized text, set with ``setOutput``.
    """

    def __init__(self, inputText, transformsDescs):
        self.input, self.transforms = inputText, transformsDescs
        self.layerChanges = None
        self.output = ""

    def addChange(self, layer, index, charsInput, charsOutput, removedInfo=None):
        """Record that ``charsInput``, found at ``index`` in the input of ``layer``, became ``charsOutput``."""
        if self.layerChanges is None:
            self.layerChanges = []
        # Pad with empty lists so that layerChanges[layer] exists
        while len(self.layerChanges) <= layer:
            self.layerChanges.append([])
        change = NormChange(layer,index,charsInput,charsOutput,removedInfo)
        self.layerChanges[layer].append(change)

    def describeChanges(self):
        """Return a text report showing, for each layer with changes, the text before and after.

        Changed spans are shown between ``' ['`` and ``'] '``; the shorter side
        of a change is padded (spaces on the input line, underscores on the
        output line) so that both lines stay aligned. Returns ``'No change'``
        when nothing was recorded.
        """
        if self.layerChanges is None:
            return 'No change'
        desc = StringIO()
        previousString = self.input
        for changes in self.layerChanges:
            # Layers without change only hold a padding list : nothing to report
            # (previously changes[0] raised IndexError here)
            if not changes:
                continue
            layer = changes[0].layer
            layerDesc = self.transforms[layer]
            desc.write(layerDesc+"\n")
            dispInparts = []
            outparts = []
            dispOutparts = []
            lastIndex = 0
            for change in changes:
                # Unchanged text between the previous change and this one
                if change.index > lastIndex:
                    samePart = previousString[lastIndex:change.index]
                    dispInparts.append(samePart)
                    outparts.append(samePart)
                    dispOutparts.append(samePart)
                dispInpart = change.input
                outpart = change.output
                dispOutpart = outpart
                if len(dispInpart)>len(outpart):
                    dispOutpart = outpart + ("_"*(len(dispInpart)-len(outpart)))
                elif len(outpart)>len(dispInpart):
                    dispInpart = dispInpart + (" "*(len(outpart)-len(dispInpart)))
                dispInparts.append(' ['+dispInpart+'] ')
                outparts.append(outpart)
                dispOutparts.append(' ['+dispOutpart+'] ')
                lastIndex = change.index + len(change.input)
            if lastIndex < len(previousString):
                samePart = previousString[lastIndex:]
                dispInparts.append(samePart)
                outparts.append(samePart)
                dispOutparts.append(samePart)
            # The output of this layer is the input of the next layer with changes
            previousString = "".join(outparts)
            desc.write(" < ")
            for inpart in dispInparts:
                desc.write(inpart)
            desc.write('\n')
            # NOTE(review): the output line uses the same " < " prefix as the input line - confirm " > " was not intended
            desc.write(" < ")
            for outpart in dispOutparts:
                desc.write(outpart)
            desc.write('\n')
        return desc.getvalue()

    def mapOutputIndexToInput(self,outputIndex):
        """Map a position in the output text to the matching position in the input text.

        Layers are undone from the last one to the first one. Within a layer,
        a position located after a change is shifted by the length difference
        of that change; a position located inside the output of a change maps
        to the start of the replaced input span.

        Args:
            outputIndex: 0-based position in the output text.

        Returns:
            The 0-based position in the input text (``outputIndex`` itself when
            no change was recorded).
        """
        if self.layerChanges is None:
            return outputIndex
        index = outputIndex
        for changes in reversed(self.layerChanges):
            # Cumulated (output length - input length) of the changes located before index
            shift = 0
            startOfSpan = None
            for change in changes:
                # change.index is expressed in the layer input : convert to layer output
                outStart = change.index + shift
                outEnd = outStart + len(change.output)
                if index < outStart:
                    break
                if index < outEnd:
                    startOfSpan = change.index
                    break
                shift = shift + (len(change.output)-len(change.input))
            if startOfSpan is None:
                index = index - shift
            else:
                index = startOfSpan
        return index

    def setOutput(self, outputText):
        """Store the normalized text."""
        self.output = outputText

    def __repr__(self):
        return self.output
    
class NormChange():
    """One recorded modification : ``input`` found at ``index`` in ``layer`` became ``output``."""

    def __init__(self, layer, index, charsInput, charsOutput, removedInfo=None):
        self.layer = layer
        self.index = index
        self.input = charsInput
        self.output = charsOutput
        self.removedInfo = removedInfo

    def __repr__(self):
        location = f"{self.layer} - {self.index}"
        replacement = f"{self.input} => {self.output}"
        return f"{location} : {replacement}"

In [None]:
# Preview the first 20 rows of the supported french charset table
# (the same file TextNormalizer.__init__ reads as its final supported charset)
dfsupportedchars = pd.read_csv(chardatadir / "charset-fr.csv", sep=";", quotechar='"')
dfsupportedchars.head(20)

Unnamed: 0,FrCode,Category,SubCategory,Code,Char,CharName,CountBusiness
0,0,separator,control,0,,Reserved - End of string,0
1,1,separator,space,32,,Space,88494564
2,2,separator,space,10,\n,Char 10,9588147
3,3,separator,space,9,\t,Char 9,1522053
4,4,separator,punctuation,44,",",Comma,286106887
5,5,separator,punctuation,39,',Apostrophe,279745827
6,6,separator,punctuation,46,.,Full Stop,270047735
7,7,separator,punctuation,45,-,Hyphen-Minus,100118715
8,8,separator,punctuation,58,:,Colon,80872165
9,9,separator,punctuation,47,/,Solidus,50243665


In [None]:
# Normalizer instance shared by the cells below (its character tables are read from rootdir / "libdata" / "chars")
norm = TextNormalizer(rootdir)

In [None]:
from collections import defaultdict

# Step 1 - normalize the vocabulary of each dataset file.
# datasets[datasetfile] is a list with one dict per transform layer,
# mapping each replaced input string to its count weighted by word "Count".
datasets = {}
for datasetfile in datasetsdf["DatasetFile"].unique():
    print(datasetfile)
    vocabdf = loadVocabulary(datasetfile)
    layers = []
    for layerIdx in range(0,len(norm.transformsDescs)):
        layers.append(defaultdict(int))        
    datasets[datasetfile] = layers
    # Normalized word => summed count of all the words normalized to it
    vocabnorm = defaultdict(int)
    tw = len(vocabdf)
    wc = 0
    for idx,row in vocabdf.iterrows():
        wc = wc + 1
        # Progress report (percentage of words processed) every 10000 words
        if wc%10000 == 0:
            print(f"{wc/tw*100}")
        result = norm(row["Word"])
        normWord = result.output
        # NOTE(review): the unary plus is a no-op; presumably `count = row["Count"]` was meant
        count = + row["Count"]
        vocabnorm[normWord] = vocabnorm[normWord] + count
        if result.layerChanges is not None:
            for changes in result.layerChanges:
                for change in changes:
                    chgDict = layers[change.layer]
                    chgDict[change.input] = chgDict[change.input] + count
    # Save the normalized vocabulary, most frequent words first
    dfvocabnorm = pd.DataFrame.from_dict({"Word":list(vocabnorm.keys()), "Count":list(vocabnorm.values())})
    dfvocabnorm.sort_values(by="Count",inplace=True,ascending=False)
    dfvocabnorm.reset_index(drop=True,inplace=True)
    dfvocabnorm.to_feather(rootdir / "datasets" / (datasetfile+".vocabnorm.feather"))

# Step 2 - regroup the counts : (layer index, input string) => {dataset file: count}
replacements = {}
for dataset in datasets:
    for layerIdx,layer in enumerate(datasets[dataset]):
        for inp in layer:
            key = (layerIdx,inp)
            if not key in replacements:
                replacements[key] = {}
            replacements[key][dataset] = layer[inp]

# Step 3 - build the columns of the statistics table, one row per (layer, input).
# Note : `layers` is rebound here from the per-layer dicts of step 1 to the list of layer indexes.
layers = []
inputs = []
outputs = []
datasetCounts = []
for dataset in datasets:
    datasetCounts.append([])
for key in replacements:
    layerIdx,inp = key
    dictDatasets = replacements[key]
    layers.append(layerIdx)
    inputs.append(inp)
    # Output of the whole pipeline applied to the input string alone (not of this single layer)
    outputs.append(norm(inp).output)
    for idx,dataset in enumerate(datasets):
        if dataset in dictDatasets:
            datasetCounts[idx].append(dictDatasets[dataset])
        else:
            datasetCounts[idx].append(0)
            
# Step 4 - assemble the table : one count column per dataset file plus the total "Count"
dfnorms = pd.DataFrame.from_dict({"Layer":layers, "Input":inputs,  "Output":outputs})
for idx,dataset in enumerate(datasets):
        dfnorms[dataset] = datasetCounts[idx]
dfnorms["Count"] = dfnorms[[dataset for dataset in datasets]].sum(axis=1)

# Sort by layer, then by decreasing total count
dfnorms.sort_values(by=["Layer","Count"],ascending=[True,False],inplace=True)
dfnorms.reset_index(drop=True,inplace=True)

# The row index is written as an extra first column (index=False is not passed)
dfnorms.to_csv(rootdir / "datasets" / "normalization.stats.csv")
dfnorms

assurance
6.28693574751666
12.57387149503332
18.86080724254998
25.14774299006664
31.434678737583297
37.72161448509996
44.00855023261662
50.29548598013328
56.582421727649944
62.869357475166595
69.15629322268326
75.44322897019993
81.73016471771658
88.01710046523324
94.3040362127499
banque
4.250453735936311
8.500907471872623
12.751361207808934
17.001814943745245
21.252268679681556
25.502722415617868
29.75317615155418
34.00362988749049
38.2540836234268
42.50453735936311
46.754991095299424
51.005444831235735
55.255898567172046
59.50635230310836
63.75680603904467
68.00725977498098
72.25771351091728
76.5081672468536
80.7586209827899
85.00907471872623
89.25952845466254
93.50998219059885
97.76043592653517
bourse
2.5462656468023996
5.092531293604799
7.6387969404071985
10.185062587209599
12.731328234011999
15.277593880814397
17.823859527616797
20.370125174419197
22.916390821221597
25.462656468023997
28.008922114826397
30.555187761628794
33.1014534084312
35.647719055233594
38.19398470203599
40.740

17.6496803201652
19.120487013512303
20.5912937068594
22.062100400206504
23.5329070935536
25.0037137869007
26.474520480247804
27.945327173594904
29.416133866942
30.8869405602891
32.357747253636205
33.828553946983305
35.2993606403304
36.7701673336775
38.240974027024606
39.711780720371706
41.1825874137188
42.6533941070659
44.12420080041301
45.5950074937601
47.0658141871072
48.5366208804543
50.0074275738014
51.4782342671485
52.94904096049561
54.4198476538427
55.89065434718981
57.361461040536895
58.832267733884
60.30307442723111
61.7738811205782
63.2446878139253
64.71549450727241
66.1863012006195
67.65710789396661
69.1279145873137
70.5987212806608
72.06952797400791
73.540334667355
75.01114136070211
76.48194805404921
77.9527547473963
79.42356144074341
80.8943681340905
82.3651748274376
83.83598152078471
85.3067882141318
86.7775949074789
88.24840160082601
89.7192082941731
91.1900149875202
92.6608216808673
94.1316283742144
95.60243506756152
97.0732417609086
98.5440484542557
presse-4
1.119931191

88.71818707132628
89.50330377107252
90.28842047081879
91.07353717056503
91.85865387031129
92.64377057005754
93.42888726980378
94.21400396955003
94.99912066929629
95.78423736904253
96.56935406878878
97.35447076853504
98.13958746828129
98.92470416802753
99.70982086777379
wikipedia-2
0.7468181811392561
1.4936363622785123
2.2404545434177683
2.9872727245570245
3.7340909056962808
4.480909086835537
5.227727267974793
5.974545449114049
6.721363630253306
7.4681818113925615
8.21499999253182
8.961818173671073
9.70863635481033
10.455454535949587
11.202272717088842
11.949090898228098
12.695909079367354
13.442727260506611
14.189545441645867
14.936363622785123
15.68318180392438
16.42999998506364
17.176818166202892
17.923636347342146
18.670454528481407
19.41727270962066
20.164090890759915
20.910909071899173
21.65772725303843
22.404545434177685
23.151363615316942
23.898181796456196
24.644999977595454
25.391818158734708
26.138636339873965
26.885454521013223
27.632272702152477
28.379090883291735
29.125909

16.379199840480837
17.091338963980004
17.803478087479167
18.515617210978334
19.2277563344775
19.939895457976668
20.652034581475835
21.364173704975002
22.07631282847417
22.788451951973336
23.500591075472503
24.21273019897167
24.924869322470837
25.637008445970004
26.34914756946917
27.061286692968338
27.773425816467505
28.48556493996667
29.19770406346584
29.909843186965006
30.621982310464173
31.33412143396334
32.04626055746251
32.75839968096167
33.470538804460844
34.18267792796001
34.89481705145917
35.606956174958334
36.319095298457505
37.03123442195667
37.74337354545584
38.455512668955
39.16765179245417
39.879790915953336
40.59193003945251
41.30406916295167
42.01620828645084
42.728347409950004
43.440486533449175
44.15262565694834
44.86476478044751
45.57690390394667
46.28904302744584
47.001182150945006
47.713321274444176
48.42546039794334
49.13759952144251
49.849738644941674
50.561877768440844
51.27401689194001
51.98615601543918
52.69829513893834
53.41043426243751
54.122573385936676
54.83

Unnamed: 0,Layer,Input,Output,assurance,banque,bourse,comparateur,crédit,forum,institution,presse-1,presse-2,presse-3,presse-4,presse-5,presse-6,siteinfo,wikipedia-1,wikipedia-2,wikipedia-3,wikipedia-4,wikipedia-5,Count
0,0,,',39,45,32,134,0,65079,0,1400,8359,1018,15,3,2,738,24,21,31,9,21,76970
1,0,,€,8,5,13,145,0,11797,0,16,465,136,3,21,37,1789,1,2,2,2,9,14451
2,0,,…,8,0,1,5,0,2502,0,56,1051,208,347,0,32,106,3,6,5,4,5,4339
3,0,,oe,0,0,4,4,0,663,2,139,567,61,6,2,3,95,2,3,0,3,1,1555
4,0,,-,0,21,0,8,0,1025,0,22,289,46,11,1,7,3,6,3,9,9,0,1460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13328,13,̷,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
13329,13,̫,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
13330,13,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1
13331,13,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1


In [None]:
# Reload the normalization statistics, replacing the dfnorms computed above.
# NOTE(review): this reads from libdata/chars/stats, whereas the cells of this notebook
# write normalization.stats.csv under datasets - presumably the file was copied by hand; confirm.
dfnorms = pd.read_csv(rootdir / "libdata" / "chars" / "stats" / "normalization.stats.csv")
dfnorms

Unnamed: 0.1,Unnamed: 0,Layer,Input,Output,assurance,banque,bourse,comparateur,crédit,forum,institution,presse-1,presse-2,presse-3,presse-4,presse-5,presse-6,siteinfo,wikipedia-1,wikipedia-2,wikipedia-3,wikipedia-4,wikipedia-5,Count,CountBusiness,CountForum,CountPresse,CountWikipedia,CharName,CharFamily
0,0,0,,',39,45,32,134,0,65079,0,1400,8359,1018,15,3,2,738,24,21,31,9,21,76970,988,65079,10797,106,Char 146,Latin
1,1,0,,€,8,5,13,145,0,11797,0,16,465,136,3,21,37,1789,1,2,2,2,9,14451,1960,11797,678,16,Char 128,Latin
2,2,0,,…,8,0,1,5,0,2502,0,56,1051,208,347,0,32,106,3,6,5,4,5,4339,120,2502,1694,23,Char 133,Latin
3,3,0,,oe,0,0,4,4,0,663,2,139,567,61,6,2,3,95,2,3,0,3,1,1555,103,665,778,9,Char 156,Latin
4,4,0,,-,0,21,0,8,0,1025,0,22,289,46,11,1,7,3,6,3,9,9,0,1460,32,1025,376,27,Char 150,Latin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13328,13328,13,̷,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,Combining Short Solidus Overlay,Ignore
13329,13329,13,̫,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,Combining Inverted Double Arch Below,Ignore
13330,13330,13,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,Char 61552,Ignore
13331,13331,13,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,Char 61538,Ignore


In [None]:
# Dataset groups : name of the aggregated count column => dataset files summed into it
dsgroups = {
    "CountBusiness": ["assurance","banque","bourse","comparateur","crédit","siteinfo"],
    "CountForum": ["forum","institution"],
    "CountPresse": ["presse-1","presse-2","presse-3","presse-4","presse-5","presse-6"],
    "CountWikipedia": ["wikipedia-1","wikipedia-2","wikipedia-3","wikipedia-4","wikipedia-5"],
}

In [None]:
# Add (or overwrite) one column per dataset group : row-wise sum of the count columns of its dataset files
for group in dsgroups:
    dfnorms[group] = dfnorms[dsgroups[group]].sum(axis=1)
dfnorms

Unnamed: 0.1,Unnamed: 0,Layer,Input,Output,assurance,banque,bourse,comparateur,crédit,forum,institution,presse-1,presse-2,presse-3,presse-4,presse-5,presse-6,siteinfo,wikipedia-1,wikipedia-2,wikipedia-3,wikipedia-4,wikipedia-5,Count,CountBusiness,CountForum,CountPresse,CountWikipedia,CharName,CharFamily
0,0,0,,',39,45,32,134,0,65079,0,1400,8359,1018,15,3,2,738,24,21,31,9,21,76970,988,65079,10797,106,Char 146,Latin
1,1,0,,€,8,5,13,145,0,11797,0,16,465,136,3,21,37,1789,1,2,2,2,9,14451,1960,11797,678,16,Char 128,Latin
2,2,0,,…,8,0,1,5,0,2502,0,56,1051,208,347,0,32,106,3,6,5,4,5,4339,120,2502,1694,23,Char 133,Latin
3,3,0,,oe,0,0,4,4,0,663,2,139,567,61,6,2,3,95,2,3,0,3,1,1555,103,665,778,9,Char 156,Latin
4,4,0,,-,0,21,0,8,0,1025,0,22,289,46,11,1,7,3,6,3,9,9,0,1460,32,1025,376,27,Char 150,Latin
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13328,13328,13,̷,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,Combining Short Solidus Overlay,Ignore
13329,13329,13,̫,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,Combining Inverted Double Arch Below,Ignore
13330,13330,13,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,Char 61552,Ignore
13331,13331,13,,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,Char 61538,Ignore


In [None]:
# "CharName" : names of all the chars of the replaced input string, joined with ' , '
# "CharFamily" : family of the Unicode block of the first char of the input string
# NOTE(review): both lambdas assume every "Input" value is a non-empty string - confirm
# no value was parsed as NaN when the table is reloaded from CSV
dfnorms["CharName"] = dfnorms["Input"].apply(lambda inp: ' , '.join([charname(c) for c in inp]))
dfnorms["CharFamily"] = dfnorms["Input"].apply(lambda inp: blockfamily(charblock(inp[0])))
dfnorms.head(20)

In [None]:
# Save the enriched statistics without the row index (overwrites the file written by the statistics cell above)
dfnorms.to_csv(rootdir / "datasets" / "normalization.stats.csv",index=False)

In [None]:
from collections import defaultdict

# Total number of characters per group: sum of the "Count" column of the
# charset of every dataset belonging to the group.
dsgroupcounts = defaultdict(int)
for group, datasets in dsgroups.items():
    for dataset in datasets:
        dsgroupcounts[group] += loadCharset(dataset)["Count"].sum()
dsgroupcounts

defaultdict(int,
            {'CountBusiness': 6663269722,
             'CountForum': 1964693068,
             'CountPresse': 18949342166,
             'CountWikipedia': 35682395281})

In [None]:
# Total number of normalized characters per normalization layer and per group,
# plus the frequency per million characters of each group.
groupcols = ["CountBusiness","CountForum","CountPresse","CountWikipedia"]
normstats = dfnorms.groupby(by="Layer")[groupcols].sum().reset_index()
normstats["Transform"] = normstats["Layer"].map(lambda layer: norm.transformsDescs[layer])
for group in dsgroups:
    # group names look like "Count<Name>": strip the "Count" prefix to build "Freq<Name>"
    freqcol = "Freq"+group[len("Count"):]
    normstats[freqcol] = normstats[group]*1000000/dsgroupcounts[group]
normstats.to_csv(rootdir / "datasets" / "normalization.total.stats.csv")
normstats

Unnamed: 0,Layer,CountBusiness,CountForum,CountPresse,CountWikipedia,Transform,FreqBusiness,FreqForum,FreqPresse,FreqWikipedia
0,0,3402,82161,15415,215,Fix encoding errors : windows1252 read as iso8...,0.51056,41.818746,0.813485,0.006025
1,1,845,114,1373,37,Fix encoding errors : utf8 read as windows1252,0.126815,0.058024,0.072456,0.001037
2,2,0,0,366,0,Fix encoding errors : windows1252 read as utf8,0.0,0.0,0.019315,0.0
3,3,18737,850,10766,5,Merge Unicode combining chars,2.811983,0.432638,0.568146,0.00014
4,4,42983,685782,122306,146961,Ignore control chars,6.450737,349.052995,6.454367,4.118586
5,5,129,78,5635,5372,Replace latin letter symbols,0.01936,0.039701,0.297372,0.15055
6,6,44003,12852,191337,613895,Replace latin letter ligatures,6.603815,6.54148,10.09729,17.204422
7,7,16847,8178,48528,15336,Replace latin number symbols,2.528338,4.162482,2.560933,0.429792
8,8,5426083,2452744,12967674,8649100,Normalize equivalent chars,814.327384,1248.410777,684.33373,242.391239
9,9,416,1494,9323,266901,Replace cyrillic and greek chars looking like ...,0.062432,0.760424,0.491996,7.479907


In [None]:
# Statistics of layer 8, aggregated per output character.
# numeric_only=True keeps only the count columns: recent pandas versions no longer
# drop string columns (Input, CharName, CharFamily) silently when summing groups.
replacestats = dfnorms[dfnorms["Layer"]==8].groupby(by="Output").sum(numeric_only=True).sort_values(by="Count",ascending=False)
# The index (output character) is not written to the csv file: keep it as a regular column
replacestats["Char"] = replacestats.index
replacestats["CharName"] = replacestats["Char"].apply(lambda c:charname(c))
for group in dsgroups:
    # group names look like "Count<Name>": the frequency column is "Freq<Name>", per million characters
    replacestats["Freq"+group[5:]] = replacestats[group]*1000000/dsgroupcounts[group]
replacestats.to_csv(rootdir / "datasets" / "normalization.layer8.stats.csv",index=False)
replacestats

Unnamed: 0_level_0,Unnamed: 0,Layer,assurance,banque,bourse,comparateur,crédit,forum,institution,presse-1,presse-2,presse-3,presse-4,presse-5,presse-6,siteinfo,wikipedia-1,wikipedia-2,wikipedia-3,wikipedia-4,wikipedia-5,Count,CountBusiness,CountForum,CountPresse,CountWikipedia,Char,CharName,FreqBusiness,FreqForum,FreqPresse,FreqWikipedia
Output,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1
',11173,128,306069,384494,306097,689149,56749,290947,23923,764443,519991,380485,3232110,709491,1520422,1496023,517639,745307,1178936,1231249,1131813,15485337,3238581,314870,7126942,4804944,',Apostrophe,486.034805,160.264219,376.104982,134.658673
,7699,88,110027,214136,127257,248824,13648,2063418,64042,573498,323560,207429,2050495,939744,1374736,1354461,1336179,993868,258940,279354,267344,12800960,2068353,2127460,5469462,3135685,,Space,310.411117,1082.845985,288.635983,87.877649
-,11333,128,7414,17916,17244,11805,1440,5184,521,30702,23571,15829,119547,29822,23615,40340,238777,175894,53732,57506,52972,923831,96159,5705,243086,578881,-,Hyphen-Minus,14.431203,2.903761,12.828203,16.223154
«,1362,16,2403,1286,1617,837,309,1290,47,16251,9989,6299,19492,524,4339,3073,3244,3400,4503,4473,4349,87725,9525,1337,56894,19969,«,Left-Pointing Double Angle Quotation Mark,1.429478,0.680513,3.002426,0.559632
»,1361,16,1886,1192,1554,775,284,1002,47,15864,3133,3583,19258,549,4264,3128,3131,3439,4372,4266,4208,75935,8819,1049,46651,19416,»,Right-Pointing Double Angle Quotation Mark,1.323524,0.533926,2.46188,0.544134
|,6358,72,0,12,5,0,0,2,0,3,0,15,76,10,0,6,24762,5792,322,288,90,31383,23,2,104,31254,|,Vertical Line,0.003452,0.001018,0.005488,0.875894
•,15598,168,336,306,171,417,4,470,8,2057,17,196,1068,224,32,126,9155,9898,134,79,118,24816,1360,478,3594,19384,•,Bullet,0.204104,0.243295,0.189664,0.543237
.,2022,24,66,199,83,5,0,155,0,68,36,7537,8544,32,8,42,982,900,175,210,205,19247,395,155,16225,2472,.,Full Stop,0.05928,0.078893,0.85623,0.069278
"""",2115,24,485,4,27,12,2,46,0,56,8,33,102,10,9,37,5434,3858,326,445,370,11264,567,46,218,10433,"""",Quotation Mark,0.085093,0.023413,0.011504,0.292385
:,659,8,0,0,0,0,0,1,0,0,0,1,0,0,0,1,1531,1257,1180,985,1079,6035,1,1,1,6032,:,Colon,0.00015,0.000509,5.3e-05,0.169047


In [None]:
# Statistics of layer 11, aggregated per character family.
# numeric_only=True keeps only the count columns: recent pandas versions no longer
# drop string columns (Input, Output, CharName) silently when summing groups.
scriptsstats = dfnorms[dfnorms["Layer"]==11].groupby(by="CharFamily").sum(numeric_only=True).sort_values(by="Count",ascending=False)
for group in dsgroups:
    # group names look like "Count<Name>": the frequency column is "Freq<Name>", per million characters
    scriptsstats["Freq"+group[5:]] = scriptsstats[group]*1000000/dsgroupcounts[group]
# Summing row ids and layer numbers is meaningless: remove these columns
scriptsstats = scriptsstats.drop(columns=["Unnamed: 0","Layer"])
# Turn the CharFamily index back into a regular column so that it is written to the csv file
scriptsstats = scriptsstats.reset_index()
scriptsstats.to_csv(rootdir / "datasets" / "normalization.layer11.stats.csv",index=False)
scriptsstats

Unnamed: 0,CharFamily,assurance,banque,bourse,comparateur,crédit,forum,institution,presse-1,presse-2,presse-3,presse-4,presse-5,presse-6,siteinfo,wikipedia-1,wikipedia-2,wikipedia-3,wikipedia-4,wikipedia-5,Count,CountBusiness,CountForum,CountPresse,CountWikipedia,FreqBusiness,FreqForum,FreqPresse,FreqWikipedia
0,ChineseJapaneseKorean,0,31,25,0,0,348,0,195,5,41,1448,1446,554,27,30042,23426,24564,30859,35950,148961,83,348,3689,144841,0.012456,0.177127,0.194677,4.059173
1,Arabic,0,58,8,2,0,52,0,64,13,248,2675,5295,427,14,40512,24866,12665,22605,11399,120903,82,52,8722,112047,0.012306,0.026467,0.46028,3.14012
2,Cyrillic,0,22,36,0,0,321,6,283,0,479,2118,1523,91,105,33877,25080,12028,19861,20446,116276,163,327,4494,111292,0.024462,0.166438,0.237159,3.118961
3,Greek,2,2,13,2,0,45,0,103,13,42,277,81,78,88,21912,19767,14565,15117,15133,87240,107,45,594,86494,0.016058,0.022904,0.031347,2.423996
4,Hebrew,0,0,0,0,0,0,0,1,0,26,601,2826,50,1,12516,6170,5931,8116,7665,43903,1,0,3504,40398,0.00015,0.0,0.184914,1.132155
5,Other,0,0,0,1,0,57,0,6,0,6,54,10,1,4,8398,6290,4392,5007,4490,28716,5,57,77,28577,0.00075,0.029012,0.004063,0.800871
6,Indian,0,0,5,0,0,74,0,448,0,0,70,100,16,0,10531,5610,3164,3439,3588,27045,5,74,634,26332,0.00075,0.037665,0.033458,0.737955
7,Phonetic,2,3,0,1,0,3,0,2,0,4,22,3,0,10,4216,3041,992,1130,1275,10704,16,3,31,10654,0.002401,0.001527,0.001636,0.298579
8,Latin,0,15,17,0,1,12,0,8,1,45,61,11,12,57,1472,2021,1771,1600,2748,9852,90,12,138,9612,0.013507,0.006108,0.007283,0.269377
9,Math,1,3,0,5,0,1,0,2,0,2,4,0,2,3,3427,2994,754,455,959,8612,12,1,10,8589,0.001801,0.000509,0.000528,0.240707


# Generate new charset after normalization

In [None]:
def saveNormCharset(rootdir):
    """Compute and save the character set of each normalized vocabulary.

    For every dataset file listed in the global `datasetsdf`, reads
    "<datasetfile>.vocabnorm.feather" (columns "Word" and "Count"), adds the count
    of each word to every character it contains, then writes the resulting table
    (sorted by decreasing count and completed by `enhanceCharset`) to
    "<datasetfile>.charsetnorm.feather" and "<datasetfile>.charsetnorm.csv".

    Parameters
    ----------
    rootdir : pathlib.Path
        Directory containing the "datasets" subdirectory.
    """
    print("Saving the normalized character set ...")
    datasetsdir = rootdir / "datasets"
    for datasetfile in datasetsdf["DatasetFile"].unique():
        print(datasetfile)
        basename = datasetfile.lower()
        vocabdf = pd.read_feather(datasetsdir / (basename+".vocabnorm.feather"))
        # Start from empty counts for each dataset: a counter shared by all datasets
        # would write cumulative counts of the previous datasets in each file
        charcounts = defaultdict(int)
        for token,count in zip(vocabdf["Word"],vocabdf["Count"]):
            for char in token:
                charcounts[ord(char)] += count
        charsetdf = pd.DataFrame({"Code" : [*charcounts.keys()], "Count" : [*charcounts.values()]})
        charsetdf = charsetdf.sort_values("Count", ascending=False).reset_index(drop=True)
        # enhanceCharset is called for its effect on charsetdf (its return value is not used)
        enhanceCharset(charsetdf)
        charsetdf.to_feather(datasetsdir / (basename+".charsetnorm.feather"))
        charsetdf.to_csv(datasetsdir / (basename+".charsetnorm.csv"),sep=";")
        print(f"- {len(charsetdf)} distinct characters")
            
def loadNormCharset(datasetfile):
    """Load the normalized charset of a dataset file and add a "Freq" column.

    Reads "<datasetfile>.charsetnorm.feather" (lowercased name) from the "datasets"
    directory of the global `rootdir`. "Freq" is the count of each character
    per million characters of the dataset.
    """
    filename = datasetfile.lower()+".charsetnorm.feather"
    normcharsetdf = pd.read_feather(rootdir / "datasets" / filename)
    totalcount = normcharsetdf["Count"].sum()
    normcharsetdf["Freq"] = 1000000*normcharsetdf["Count"]/totalcount
    return normcharsetdf

In [None]:
# Generate the "<datasetfile>.charsetnorm.feather" / ".csv" files for all dataset files.
saveNormCharset(rootdir)

Saving the normalized character set ...
assurance
- 184 distinct characters
banque
- 228 distinct characters
bourse
- 241 distinct characters
comparateur
- 249 distinct characters
crédit
- 249 distinct characters
forum
- 253 distinct characters
institution
- 253 distinct characters
presse-1
- 253 distinct characters
presse-2
- 253 distinct characters
presse-3
- 254 distinct characters
presse-4
- 255 distinct characters
presse-5
- 255 distinct characters
presse-6
- 255 distinct characters
siteinfo
- 255 distinct characters
wikipedia-1
- 255 distinct characters
wikipedia-2
- 255 distinct characters
wikipedia-3
- 255 distinct characters
wikipedia-4
- 255 distinct characters
wikipedia-5
- 255 distinct characters


In [None]:
# Merge the normalized charsets of all dataset files in a single table keyed by
# character code, with one "Count_<datasetfile>" and one "Freq_<datasetfile>"
# column per dataset file.
mergeddf = None
freqcolslist=[]
countcolslist=[]
for datasetfile in datasetsdf["DatasetFile"].unique():
    chrdf = loadNormCharset(datasetfile)
    del chrdf["Percent"]
    if(mergeddf is None):
        mergeddf = chrdf
    else:
        chrdf = chrdf[["Code","Char","Count","Freq"]]
        mergeddf = pd.merge(mergeddf, chrdf, how='outer', on="Code",suffixes=("", "_"+datasetfile))
        # Rows coming only from this dataset have no "Char" yet: take it from the merged column.
        # The result is assigned back: an inplace fillna on the selected column is a chained
        # assignment, which is deprecated and has no effect with pandas Copy-on-Write.
        mergeddf["Char"] = mergeddf["Char"].fillna(mergeddf["Char_"+datasetfile])
        del mergeddf["Char_"+datasetfile]
    # After the merge, the unsuffixed "Freq" and "Count" columns are those of the current dataset
    mergeddf.rename(columns = {"Freq":"Freq_"+datasetfile, "Count":"Count_"+datasetfile}, inplace = True)
    freqcolslist.append("Freq_"+datasetfile)
    countcolslist.append("Count_"+datasetfile)
# (Re)compute the descriptive columns for all characters, including those added by the merges
mergeddf["Name"] = mergeddf["Char"].map(lambda c:charname(c))
mergeddf["Category"] = mergeddf["Char"].map(lambda c:charcategory(c))
mergeddf["Subcategory"] = mergeddf["Char"].map(lambda c:charsubcategory(c))
mergeddf["Block"] = mergeddf["Char"].map(lambda c:charblock(c))
# A character missing from a dataset has a count and a frequency of 0 in this dataset
mergeddf = mergeddf.fillna(0)
mergeddf["Freq_max"] = mergeddf[freqcolslist].values.max(1)
mergeddf["Freq_min"] = mergeddf[freqcolslist].values.min(1)
mergeddf["Freq_mean"] = mergeddf[freqcolslist].values.mean(1)
mergeddf = mergeddf.sort_values(by="Freq_max",ascending=False).reset_index(drop=True)
#mergeddf= mergeddf[mergeddf["Freq_max"]<0.7]
# Frequency in each dataset, as an integer percentage of the mean frequency over all datasets
for col in freqcolslist:
    mergeddf["Rel"+col] = (100 * mergeddf[col] / mergeddf["Freq_mean"]).astype("int32")
# "Business" gathers here every dataset file which is not a wikipedia one
mergeddf["CountBusiness"] = mergeddf[[col for col in mergeddf.columns if (col.startswith("Count_") and "wiki" not in col)]].sum(axis=1)
mergeddf["CountWikipedia"] = mergeddf[[col for col in mergeddf.columns if (col.startswith("Count_") and "wiki" in col)]].sum(axis=1)
mergeddf["Count"] = mergeddf[countcolslist].values.sum(1)
# Number of characters per million which are NOT covered by the rows up to and including this one
mergeddf["PerMillion"] = 1000000-mergeddf["Count"].cumsum()/mergeddf["Count"].sum()*1000000
mergeddf.to_csv(rootdir / "stats" / "charset-normalized.csv",sep=";")

In [None]:
# Keep only the descriptive columns and the aggregated counts.
# .copy() makes normcharsdf independent from mergeddf, so that columns can be
# added to it later without triggering a SettingWithCopyWarning.
normcharsdf = mergeddf[["Code","Char","Name","Category","Subcategory","Block","CountBusiness","CountWikipedia","Count"]].copy()
normcharsdf.to_csv(rootdir / "libdata" / "chars" / "charsetstats_norm.csv",sep=";")

In [None]:
# Display the normalized characters statistics.
normcharsdf

Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count
0,101,e,Latin Small Letter E,Letter,Lowercase,Basic Latin,3.505036e+09,4598275413,8.103312e+09
1,115,s,Latin Small Letter S,Letter,Lowercase,Basic Latin,1.960573e+09,2534185235,4.494759e+09
2,97,a,Latin Small Letter A,Letter,Lowercase,Basic Latin,1.865613e+09,2447667654,4.313281e+09
3,110,n,Latin Small Letter N,Letter,Lowercase,Basic Latin,1.819382e+09,2388677306,4.208059e+09
4,114,r,Latin Small Letter R,Letter,Lowercase,Basic Latin,1.751654e+09,2277785371,4.029440e+09
...,...,...,...,...,...,...,...,...,...
250,129321,🤩,Grinning Face With Star Eyes,Symbol,Other,Supplemental Symbols and Pictographs,4.560000e+02,550,1.006000e+03
251,128539,😛,Face With Stuck-Out Tongue,Symbol,Other,Emoticons,3.740000e+02,405,7.790000e+02
252,10060,❌,Cross Mark,Symbol,Other,Dingbats,4.190000e+02,470,8.890000e+02
253,9994,✊,Raised Fist,Symbol,Other,Dingbats,3.470000e+02,485,8.320000e+02


In [None]:
# Flag the characters which belong to the supported characters list.
supportedchars = set(dfsupportedchars["Char"])
# assign() returns a new DataFrame: setting the column directly on normcharsdf, which
# may be a slice of another DataFrame, raises a SettingWithCopyWarning
normcharsdf = normcharsdf.assign(Supported=normcharsdf["Char"].isin(supportedchars))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Order the characters by decreasing business count and renumber the rows accordingly.
normcharsdf = (
    normcharsdf
    .sort_values(by="CountBusiness",ascending=False)
    .reset_index(drop=True)
)

In [None]:
# Derive the family of each character from its block.
normcharsdf["Family"] = normcharsdf["Block"].map(blockfamily)
normcharsdf

Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count,Supported,Family
0,101,e,Latin Small Letter E,Letter,Lowercase,Basic Latin,3.505036e+09,4598275413,8.103312e+09,True,Latin
1,115,s,Latin Small Letter S,Letter,Lowercase,Basic Latin,1.960573e+09,2534185235,4.494759e+09,True,Latin
2,97,a,Latin Small Letter A,Letter,Lowercase,Basic Latin,1.865613e+09,2447667654,4.313281e+09,True,Latin
3,110,n,Latin Small Letter N,Letter,Lowercase,Basic Latin,1.819382e+09,2388677306,4.208059e+09,True,Latin
4,105,i,Latin Small Letter I,Letter,Lowercase,Basic Latin,1.766467e+09,2331584399,4.098052e+09,True,Latin
...,...,...,...,...,...,...,...,...,...,...,...
250,8734,∞,Infinity,Symbol,Math,Mathematical Operators,1.150000e+02,3719,3.834000e+03,True,Math
251,8730,√,Square Root,Symbol,Math,Mathematical Operators,1.140000e+02,1052,1.166000e+03,True,Math
252,8592,←,Leftwards Arrow,Symbol,Math,Arrows,1.080000e+02,10576,1.068400e+04,True,Symbols
253,9993,✉,Envelope,Symbol,Other,Dingbats,5.500000e+01,2301,2.356000e+03,True,Symbols


In [None]:
# Save the statistics again, overwriting the file written before the "Supported"
# and "Family" columns were added and the rows sorted by business count.
normcharsdf.to_csv(rootdir / "libdata" / "chars" / "charsetstats_norm.csv",sep=";")

In [None]:
# Total number of characters of each corpus, in millions: dividing a count by these
# values gives a frequency per million characters.
# The values match the dsgroupcounts totals displayed above, divided by 1,000,000 :
# - wikipedia : CountWikipedia (35682395281)
# - business  : CountBusiness + CountForum + CountPresse (27577304956)
wikicharcountpM = 35682.395281
bizcharcountpM = 27577.304956

In [None]:
# Characters remaining after normalization which are not in the supported list
unsupporteddf = normcharsdf[~normcharsdf["Supported"]]
# Total counts of unsupported characters per family, biggest business count first
unsupstats = (
    unsupporteddf[["Family","CountBusiness","CountWikipedia"]]
    .groupby(by="Family")
    .sum()
    .sort_values(by="CountBusiness",ascending=False)
)
# Frequencies per million characters of each corpus
unsupstats["FreqBusiness"] = unsupstats["CountBusiness"] / bizcharcountpM
unsupstats["FreqWikipedia"] = unsupstats["CountWikipedia"] / wikicharcountpM
unsupstats

Unnamed: 0_level_0,CountBusiness,CountWikipedia,FreqBusiness,FreqWikipedia
Family,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [None]:
# Save the per-family statistics of unsupported characters.
unsupstats.to_csv(rootdir / "libdata" / "chars" / "unsupported.stats.csv",sep=";")

In [None]:
# First 10 unsupported characters of each family, families being taken in their
# order of first appearance in unsupporteddf.
topfamilies = [unsupporteddf[unsupporteddf["Family"]==family].head(10)
               for family in unsupporteddf["Family"].unique()]
# pd.concat replaces DataFrame.append (removed in pandas 2.0) and rejects an empty list:
# when there is no unsupported character, fall back to an empty DataFrame with the same
# columns instead of None, so that the result can still be displayed and saved
topunsupdf = pd.concat(topfamilies) if topfamilies else unsupporteddf.head(0)
topunsupdf

In [None]:
# Save the top unsupported characters of each family (business selection).
topunsupdf.to_csv(rootdir / "libdata" / "chars" / "charsetstats_norm.unsupported_business.csv",sep=";")

AttributeError: 'NoneType' object has no attribute 'to_csv'

In [None]:
# First 20 unsupported characters of each family by decreasing wikipedia count,
# families being taken in their order of first appearance in the sorted table.
unsupwikidf = unsupporteddf.sort_values(by="CountWikipedia",ascending=False)
topfamilies = [unsupwikidf[unsupwikidf["Family"]==family].head(20)
               for family in unsupwikidf["Family"].unique()]
# pd.concat replaces DataFrame.append (removed in pandas 2.0) and rejects an empty list:
# when there is no unsupported character, fall back to an empty DataFrame with the same
# columns instead of None, so that the result can still be displayed and saved
topunsupdf = pd.concat(topfamilies) if topfamilies else unsupwikidf.head(0)
topunsupdf

In [None]:
# Save the top unsupported characters of each family (wikipedia selection).
topunsupdf.to_csv(rootdir / "libdata" / "chars" / "charsetstats_norm.unsupported_wikipedia.csv",sep=";")