In [1]:
from pathlib import Path

rootdir = Path(r"\\?\C:\Users\laure\OneDrive\Dev\Python\nlptextdoc\dataset 092019")

In [2]:
import numpy as np
import pandas as pd
pd.options.display.max_rows = 100
pd.options.display.max_columns = 50

In [3]:
def list_datasets():
    return pd.read_csv(rootdir / "datasets.csv",sep=';')
    
datasetsdf = list_datasets()

In [4]:
def read_dataset_file(datasetfile):
    datasetdffile = rootdir / "datasets" / (datasetfile+".dataset.feather")
    if(datasetdffile.exists()):
        datasetdf = pd.read_feather(datasetdffile)
        print(f"Loaded dataframe for dataset {datasetfile} : {len(datasetdf)} text blocks")
        return datasetdf
    else:
        raise Exception(f"No dataframe for dataset {datasetfile}")

In [5]:
def get_textblocks_from_dataset(dataset, minwords=5, maxwords=None, lang=None):
    for datasetfile in datasetsdf.loc[datasetsdf["Dataset"]==dataset,"DatasetFile"].unique():
        datasetdf = read_dataset_file(datasetfile)
        for rowidx,row in get_rows_from_datasetdf(datasetdf,minwords,maxwords,lang):
            yield row["Text"]

# Extract vocabulary for each dataset

In [16]:
!python -m spacy info



spaCy version    2.2.1                         
Location         C:\Users\laure\Anaconda3\envs\spacy\lib\site-packages\spacy
Platform         Windows-10-10.0.18362-SP0     
Python version   3.7.5                         
Models           fr                            



In [17]:
import spacy

nlp = spacy.load("fr_core_news_sm",disable=["tagger","ner","parser"])
nlp.pipeline

[]

In [16]:
def saveVocabulary(vocabdict, dataset=None):
    if(dataset is None):
        dataset = "All"
    print(f"Saving vocabulary for dataset {dataset}")
    vocabdf = pd.DataFrame({"Word" : [*vocabdict.keys()], "Count" : [*vocabdict.values()]})	
    vocabdf.sort_values("Count", ascending=False, inplace=True)
    vocabdf.reset_index(inplace=True)    
    vocabdf.drop('index', axis=1, inplace=True)
    vocabfile = rootdir / "datasets" / (dataset.lower()+".vocabulary.feather")
    vocabdf.to_feather(vocabfile)
    vocabdf.to_csv(rootdir / "datasets" / (dataset.lower()+".vocabulary.csv"),sep=";")
    print(f"- {len(vocabdf)} distinct words")
    return vocabdf

def loadVocabulary(datasetfile):
    vocabfile = rootdir / "datasets" / (datasetfile.lower()+".vocabulary.feather")
    return pd.read_feather(vocabfile)

In [None]:
from collections import defaultdict

for dataset in datasetsdf["Dataset"].unique():
    print(f"Extracting vocabulary for dataset {dataset}")
    vocabds = defaultdict(lambda:0)
    textiterator = get_textblocks_from_dataset(dataset, minwords=None)
    i = 0
    for text in textiterator:
        i = i+1
        if(i%1000 == 0):
            print(i)
        doc = nlp(text)
        for token in doc:
            tokentext = token.text
            vocabds[tokentext] = vocabds[tokentext] + 1   
    saveVocabulary(vocabds, dataset=dataset)

In [None]:
datasetfiles = ["presse-1","presse-2","presse-3","presse-4","presse-5","presse-6"]
vocabds = defaultdict(lambda:0)
i = 0
for datasetfile in datasetfiles:
    vocabfiledf = pd.read_feather(rootdir / "datasets" / (datasetfile.lower()+".vocabulary.feather"))
    for word in vocabfiledf["Word"]:
        i = i+1
        if(i%10000 == 0):
            print(i)
        vocabds[word] = vocabds[word] + 1  
saveVocabulary(vocabds, dataset="presse")

# Extract character set for each dataset

In [6]:
from unicodedata import name as unicodename
from unicodedata import category as unicodecategory

unicodeblocks = pd.read_csv(rootdir.parent / "dataset_studies" / "unicode" / "unicode_blocks.csv", sep=";")
unicodeblocks["StartNum"] = unicodeblocks["Start"].map(lambda hex:int(hex,16))
unicodeblocks["EndNum"] = unicodeblocks["End"].map(lambda hex:int(hex,16))

unicodecategories = pd.read_csv(rootdir.parent / "dataset_studies" / "unicode" / "unicode_categories.csv", sep=";")

def charname(char):
    return unicodename(char,f"Char {ord(char)}").title()

def charcategory(char):
    catcode = unicodecategory(char)
    catrow = unicodecategories.loc[unicodecategories["Code"]==catcode]
    return catrow["Category"].iloc[0]

def charsubcategory(char):
    catcode = unicodecategory(char)
    catrow = unicodecategories.loc[unicodecategories["Code"]==catcode]
    return catrow["Subcategory"].iloc[0]

def charblock(char):
    charnum = ord(char)
    blockrow = unicodeblocks[(unicodeblocks["StartNum"]<= charnum) & (unicodeblocks["EndNum"]>= charnum)]
    return blockrow["Block"].iloc[0]

In [5]:
def enhanceCharset(charsetdf):
    charsetdf["Char"] = charsetdf["Code"].map(lambda x:chr(x))
    charsetdf["isAlpha"] = charsetdf["Char"].map(lambda x:x.isalpha())
    charsetdf["isDigit"] = charsetdf["Char"].map(lambda x:x.isdigit())
    charsetdf["isSpace"] = charsetdf["Char"].map(lambda x:x.isspace())
    charsetdf["Percent"] = 100*charsetdf["Count"].cumsum()/charsetdf["Count"].sum()
    charsetdf["Name"] = charsetdf["Char"].map(lambda c:charname(c))
    charsetdf["Category"] = charsetdf["Char"].map(lambda c:charcategory(c))
    charsetdf["Subcategory"] = charsetdf["Char"].map(lambda c:charsubcategory(c))
    charsetdf["Block"] = charsetdf["Char"].map(lambda c:charblock(c))

def saveCharset(rootdir):
    print("Saving the character set ...")
    charcounts = defaultdict(lambda:0)
    for datasetfile in datasetsdf["DatasetFile"].unique():
        print(datasetfile)
        vocabdf = pd.read_feather(rootdir / "datasets" / (datasetfile.lower()+".vocabulary.feather"))
        for idx,row in vocabdf.iterrows():
            token = row["Word"]
            count = row["Count"]
            for char in token:
                charcode = ord(char)
                charcounts[charcode] = charcounts[charcode] + count
        charsetdf = pd.DataFrame({"Code" : [*charcounts.keys()], "Count" : [*charcounts.values()]})
        charsetdf.sort_values("Count", ascending=False, inplace=True)
        charsetdf.reset_index(inplace=True)
        charsetdf.drop('index', axis=1, inplace=True)
        enhanceCharset(charsetdf)
        charsetfile = rootdir / "datasets" / (datasetfile.lower()+".charset.feather")
        charsetdf.to_feather(charsetfile)
        charsetdf.to_csv(rootdir / "datasets" / (datasetfile.lower()+".charset.csv"),sep=";")
        print(f"- {len(charsetdf)} distinct characters")
            
def loadCharset(datasetfile):
    charsetfile = rootdir / "datasets" / (datasetfile.lower()+".charset.feather")
    chrdf = pd.read_feather(charsetfile)
    chrdf["Freq"] = 1000000*chrdf["Count"]/chrdf["Count"].sum()
    return chrdf

In [None]:
saveCharset(rootdir)

# Generate latinletters.csv file

In [21]:
def charlatindiacritics(char):
    if not char.isalpha():
        return None    
    name = charname(char)
    if not "Latin " in name:
        if "Modifier Letter Small" in name:
            parts = name.split()
            if len(parts)==4:
                alllatinchars.add(char)
                modifierchars[char] = parts[3]
            else:
                return None
        else:
            return None
    alllatinchars.add(char)
    parts = name.split()
    if parts[0]=="Latin" and (parts[2]=="Letter" or parts[2]=="Ligature") and (parts[1]=="Small" or parts[1]=="Capital") and (len(parts)==4 or (len(parts)==5 and (parts[3]=="Sharp" or parts[3]=="Tone" or parts[3]=="Rams"))):
        latinchars[char] = name
        normchar = char.lower()
        if len(parts)==5:
            latincharsfromname[parts[-2] + " " + parts[-1]] = normchar
        else:
            latincharsfromname[parts[-1]] = normchar
        if (len(parts[-1])==2 and parts[-1]!="Um") or parts[2]=="Ligature":
            if parts[1]=="Small":
                equivchars = parts[-1].lower()
            else:
                equivchars = parts[-1].upper()
            latincharsligatures[char] = equivchars
        return None
    elif len(parts)>=6 and parts[0]=="Latin" and parts[2]=="Letter" and parts[4]=="With":
        latinLetter = parts[3]
        if len(latinLetter) > 1:
            if latinLetter in latincharsfromname:
                latinLetter = latincharsfromname[latinLetter]
            else:
                #print("Ignored len>1 : " + str((ord(char), char, name)))
                return None
        if parts[1]=="Small":
            latinLetter = latinLetter.lower()
        elif parts[1]=="Capital":
            latinLetter = latinLetter.upper()
        else:
            #print("Ignored not Small/Capital : " + str((ord(char), char, name)))
            return None
        latinDiacritic = " ".join(parts[5:])
        if "Letter" in latinDiacritic:
            #print("Ignored Letter : " + str((ord(char), char, name)))
            return None
        props = (ord(char), char, name, latinLetter, latinDiacritic)
        return props
    else:        
        if name.startswith("Latin Letter Small Capital"):
            diacritic = "Small Capital"
            if len(parts[4])==1:            
                if(len(parts)>5):
                    diacritic = diacritic + " " + " ".join(parts[5:])
                props = (ord(char), char, name, parts[4], diacritic)
                return props
            elif len(parts)==6 and len(parts[5])==1:            
                diacritic = diacritic + " " + parts[4]
                props = (ord(char), char, name, parts[5], diacritic)
                return props
            else: 
                latinLetterName = parts[4]
                if latinLetterName in latincharsfromname:
                    latinLetter = latincharsfromname[latinLetterName]
                    props = (ord(char), char, name, latinLetter.upper(), diacritic)
                    return props
                else:
                    #print("Ignored len>1 Small Cap : " + str((ord(char), char, name)))
                    return None
        elif name.startswith("Latin Small Letter Dotless") and len(parts[4])==1:
            diacritic = "Dotless"
            if(len(parts)>5):
                diacritic = diacritic + " " + " ".join(parts[5:])
            props = (ord(char), char, name, parts[4].lower(), diacritic)
            return props
        elif len(parts[-1])==1 and code>255:
            if (name.startswith("Latin Small Letter") or name.startswith("Latin Capital Letter")):
                diacritic = " ".join(parts[3:-1])
                latinLetter = parts[-1]
                if parts[1]=="Small":
                    latinLetter = latinLetter.lower()
                props = (ord(char), char, name, latinLetter, diacritic)
                return props
            elif parts[0] == "Fullwidth":
                diacritic = "Fullwidth"
                latinLetter = parts[-1]
                if parts[2]=="Small":
                    latinLetter = latinLetter.lower()
                props = (ord(char), char, name, latinLetter, diacritic)
                return props
            elif parts[1] == "Subscript":
                diacritic = "Subscript"
                latinLetter = parts[-1]
                if parts[2]=="Small":
                    latinLetter = latinLetter.lower()
                props = (ord(char), char, name, latinLetter, diacritic)
                return props
            elif parts[0] == "Superscript":
                diacritic = "Superscript"
                latinLetter = parts[-1]
                if parts[2]=="Small":
                    latinLetter = latinLetter.lower()
                props = (ord(char), char, name, latinLetter, diacritic)
                return props
            elif parts[1] == "Epigraphic":
                diacritic = "Epigraphic " + parts[3]
                latinLetter = parts[-1]
                props = (ord(char), char, name, latinLetter, diacritic)
                return props
            else: 
                #print("Ignored parts[-1]==1 : " + str((ord(char), char, name)))
                return None
        elif parts[0]=="Latin" and (parts[1]=="Small" or parts[1]=="Capital") and parts[2]=="Letter":
            if parts[3] in latincharsfromname or parts[4] in latincharsfromname:
                if parts[3] in latincharsfromname:
                    latinLetter = latincharsfromname[parts[3]]
                    diacritic = " ".join(parts[4:])
                elif parts[4] in latincharsfromname:
                    latinLetter = latincharsfromname[parts[4]]
                    diacritic = parts[3] + " " + " ".join(parts[5:])
                if parts[1]=="Small":
                    latinLetter = latinLetter.lower()
                else:
                    latinLetter = latinLetter.upper()
                props = (ord(char), char, name, latinLetter, diacritic)
                return props
            if parts[4]=="Digraph":
                if parts[3]=="Tc":
                    return None
                if len(parts)==5:
                    latinLetter = char
                    latincharsfromname[parts[3]+" "+parts[4]] = latinLetter
                else:
                    latinLetter = latincharsfromname[parts[3]+" "+parts[4]]
                latinchars[char] = name
                #latincharsfromname[latinLetter] = char
                if len(parts)==5:
                    firstLetter = parts[3][0]
                    secondLetter = parts[3][1:]
                    if len(secondLetter)>1:
                        secondLetter = secondLetter[0].upper() + secondLetter[1:]
                        secondLetter = latincharsfromname[secondLetter]                    
                    latincharsligatures[char] = firstLetter + secondLetter
                    return None
                else:
                    diacritic = parts[6]
                    props = (ord(char), char, name, latinLetter, diacritic)
                    return props
            else:
                #print("Ignored else 1 : " + str((ord(char), char, name)))
                pass
            return None
        else:
            #print("Ignored else 2 : " + str((ord(char), char, name)))
            return None


latinunicodeblocks = []
for _,row in unicodeblocks.iterrows():
    start = row["StartNum"]
    end = row["EndNum"]
    for code in range(start,end+1):
        char = chr(code)
        if(char.isalpha()):
            name = charname(char)
            if "Latin " in name:
                latinunicodeblocks.append(row["Block"])
                break
           
alllatinchars = set()  
modifierchars = {}
latinchars = {}
latincharsfromname = {}
# Need to prepare these three chars because diacritics are declared before main char
latincharsfromname["Dz"]="ǳ"
latincharsfromname["Heng"]="ꜧ"
latincharsfromname["Delta"]="ẟ"
latincharsfromname["Omega"]="ꞷ"
latincharsfromname["Rum"]="ꝵ"
latincharsdiacritics = {}
latincharsligatures = {}
for _,row in unicodeblocks[unicodeblocks["Block"].isin(latinunicodeblocks)].iterrows():
    start = row["StartNum"]
    end = row["EndNum"]
    for code in range(start,end+1):
        props = charlatindiacritics(chr(code))
        if not props is None:
            latincharsdiacritics[props[1]] = (props[3],props[4])        
# Additional special cases            
#  => no equivalent without stroke !
latinchars["ƛ"] = "Latin Small Letter Lambda With Stroke"
latincharsfromname["Lambda"] = "ƛ"
latinchars["ᵺ"] = "Latin Small Letter Th with Strikethrough"
latincharsfromname["Th"] = "ᵺ"
latincharsligatures["ᵺ"]="th"
latinchars["ƻ"] = "Latin Letter Two With Stroke"
latincharsfromname["Two"] = "ƻ"
latinchars["ʨ"] = "Latin Small Letter Tc Digraph with Curl"
latincharsfromname["Tc Digraph"] = "ʨ"
# last cases not covered
latinchars["Ʀ"] = "Latin Letter Yr"
latincharsfromname["Yr"] = "Ʀ"
latincharsdiacritics['ƪ'] = ("ʃ", 'Reversed Loop')
latincharsdiacritics['ƾ'] = ("ɂ", 'Inverted Stroke')
latinchars["ƿ"] = "Latin Letter Wynn"
latincharsfromname["Wynn"] = "ƿ"
latinchars["ǀ"] = "Latin Letter Dental Click"
latincharsfromname["Dental Click"] = "ǀ"
latinchars["ǁ"] = "Latin Letter Lateral Click"
latincharsfromname["Lateral Click"] = "ǁ"
latinchars["ǂ"] = "Latin Letter Alveolar Click"
latincharsfromname["Alveolar Click"] = "ǂ"
latinchars["ǃ"] = "Latin Letter Retroflex Click"
latincharsfromname["Retroflex Click"] = "ǃ"
latincharsdiacritics['ǅ'] = ("ǲ", 'Caron')
latinchars["ǈ"] = "Latin Capital Letter L With Small Letter J"
latincharsfromname["L With Small Letter J"] = "ǈ"
latincharsligatures["ǈ"]="Lj"
latinchars["ǋ"] = "Latin Capital Letter N With Small Letter J"
latincharsfromname["N With Small Letter J"] = "ǋ"
latincharsligatures["ǋ"]="Nj"
latinchars["ǲ"] = "Latin Capital Letter D With Small Letter Z"
latincharsfromname["D With Small Letter Z"] = "ǲ"
latincharsligatures["ǲ"]="Dz"
latinchars["Ɂ"] = "Latin Capital Letter Glottal Stop"
latinchars["ɂ"] = "Latin Small Letter Glottal Stop"
latincharsfromname["Glottal Stop"] = "ʔ"
latincharsdiacritics['ɝ'] = ("e", "Reversed Open Hook")
latincharsdiacritics['ʅ'] = ("ʃ", "Squat Reversed")
latinchars["ʔ"] = "Latin Letter Glottal Stop"
latinchars["ʕ"] = "Latin Letter Pharyngeal Voiced Fricative"
latincharsfromname["Pharyngeal Voiced Fricative"] = "ʕ"
latincharsdiacritics['ʖ'] = ("ʔ", "Inverted")
latincharsdiacritics['ʗ'] = ("c", "Stretched")
latinchars["ʘ"] = "Latin Letter Bilabial Click"
latincharsfromname["Bilabial Click"] = "ʘ"
latincharsdiacritics['ʡ'] = ("ʔ", "Stroke")
latincharsdiacritics['ʢ'] = ("ʔ", "Reversed Stroke")
latinchars["ʬ"] = "Latin Letter Bilabial Percussive"
latincharsfromname["Bilabial Percussive"] = "ʬ"
latinchars["ʭ"] = "Latin Letter Bidental Percussive"
latincharsfromname["Bidental Percussive"] = "ʭ"
latinchars["ᴤ"] = "Latin Letter Voiced Laryngeal Spirant"
latincharsfromname["Voiced Laryngeal Spirant"] = "ᴤ"
latinchars["ᴥ"] = "Latin Letter Ain"
latincharsfromname["Ain"] = "ᴥ"
latincharsdiacritics['ᵻ'] = ("I", "Small Capital Stroke")
latincharsdiacritics['ᵾ'] = ("U", "Small Capital Stroke")
latincharsdiacritics['ᶔ'] = ("e", "Reversed Open Retroflex Hook")
latincharsdiacritics['ₔ'] = ("ə", "Subscript")
latincharsdiacritics['ꟾ'] = ("I", "Epigraphic Longa")
latinchars["Ỻ"] = "Latin Capital Letter Middle-Welsh Ll"
latinchars["ỻ"] = "Latin Small Letter Middle-Welsh Ll"
latincharsfromname["Middle-Welsh Ll"] = "ỻ"
latincharsligatures["Ỻ"]="Ll"
latincharsligatures["ỻ"]="ll"
latinchars["Ꜣ"] = "Latin Capital Letter Egyptological Alef"
latinchars["ꜣ"] = "Latin Small Letter Egyptological Alef"
latincharsfromname["Egyptological Alef"] = "ꜣ"
latinchars["Ꜥ"] = "Latin Capital Letter Egyptological Ain"
latinchars["ꜥ"] = "Latin Small Letter Egyptological Ain"
latincharsfromname["Egyptological Ain"] = "ꜥ"
latinchars["ꞏ"] = "Latin Letter Sinological Dot"
latincharsfromname["Sinological Dot"] = "ꞏ"
latinchars["ꭠ"] = "Latin Small Letter Sakha Yat"
latincharsfromname["Sakha Yat"] = "ꭠ"
latincharsdiacritics['ﬅ'] = ("ﬆ", "Long")
latincharsligatures["ﬅ"]="st"

# Modifier chars
for char,latinchar in modifierchars.items():
    if latinchar in latincharsfromname:
        latincharsdiacritics[char] = (latincharsfromname[latinchar], "Modifier")
    elif char=="ᶿ":
        latinchars["ᶿ"] = "Latin Small Letter Egyptological Alef"
        latincharsfromname["Theta"] = "ᶿ"
        
latinnamesfromchar = {value:key for (key,value) in latincharsfromname.items()}
latinnamesfromchar["ɂ"] = "Glottal Stop"

In [22]:
additionalligatures = [452,453,454,482,483,508,509,630,675,677,7425,7426,7444,7445,42810,42811,42906,42907,42908,42909,42910,42911,43840,43841,43842,43857,43874]

In [23]:
codes = []
chars = []
descs = []
blocks = []
cats = []
scats = []
isupper = []
islower = []
uppers = []
lowers = []
names = []
isdiacritic = []
normchars = []
diacritics = []
isligature = []
ligatures = []
for char in alllatinchars:
    code = ord(char)
    codes.append(code)
    chars.append(char)
    descs.append(charname(char))
    blocks.append(charblock(char))
    cats.append(charcategory(char))
    subcat = charsubcategory(char)
    scats.append(subcat)
    upper = subcat=="Uppercase"
    isupper.append(upper)
    lower = subcat=="Lowercase"
    islower.append(lower)
    uppers.append(char.upper())
    lowers.append(char.lower())
    normchar = char
    if(char in latincharsdiacritics):
        res = latincharsdiacritics[char]
        isdiacritic.append(True)
        normchar = res[0]
        normchars.append(normchar)
        diacritics.append(res[1].replace("And ","").replace("With ",""))
    else:
        isdiacritic.append(False)
        normchars.append(None)
        diacritics.append(None)
    if normchar!="Ʀ":
        normchar = normchar.lower()
    if normchar in latinnamesfromchar:
        names.append(latinnamesfromchar[normchar])
    else:
        print("ERROR : " + char + " - " + normchar)
        if char in latinnamesfromchar:
            print(latinnamesfromchar[char])
        continue
    if(char in latincharsligatures):
        isligature.append(True)
        ligatures.append(latincharsligatures[char])
    elif(code in additionalligatures):
        isligature.append(True)
        multichars = names[-1]
        if lower:
            multichars = multichars.lower()
        if upper:
            multichars = multichars.upper()
        ligatures.append(multichars)
    else:
        isligature.append(False)
        ligatures.append(None)
df = pd.DataFrame.from_dict({"Code":codes, "Char":chars, "Name":names, "IsUpper":isupper, "Upper":uppers, "IsLower":islower, "Lower":lowers, "IsDiacritic":isdiacritic, "BaseChar":normchars, "Diacritics":diacritics, "IsLigature":isligature, "MultiChars":ligatures,  "Description":descs, "Block":blocks, "Category":cats, "SubCategory":scats })
df.sort_values(by=['Code'],inplace=True)
df.reset_index(inplace=True)
df.drop(columns="index",inplace=True)
df

Unnamed: 0,Code,Char,Name,IsUpper,Upper,IsLower,Lower,IsDiacritic,BaseChar,Diacritics,IsLigature,MultiChars,Description,Block,Category,SubCategory
0,65,A,A,True,A,False,a,False,,,False,,Latin Capital Letter A,Basic Latin,Letter,Uppercase
1,66,B,B,True,B,False,b,False,,,False,,Latin Capital Letter B,Basic Latin,Letter,Uppercase
2,67,C,C,True,C,False,c,False,,,False,,Latin Capital Letter C,Basic Latin,Letter,Uppercase
3,68,D,D,True,D,False,d,False,,,False,,Latin Capital Letter D,Basic Latin,Letter,Uppercase
4,69,E,E,True,E,False,e,False,,,False,,Latin Capital Letter E,Basic Latin,Letter,Uppercase
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1225,65366,ｖ,V,False,Ｖ,True,ｖ,True,v,Fullwidth,False,,Fullwidth Latin Small Letter V,Halfwidth and Fullwidth Forms,Letter,Lowercase
1226,65367,ｗ,W,False,Ｗ,True,ｗ,True,w,Fullwidth,False,,Fullwidth Latin Small Letter W,Halfwidth and Fullwidth Forms,Letter,Lowercase
1227,65368,ｘ,X,False,Ｘ,True,ｘ,True,x,Fullwidth,False,,Fullwidth Latin Small Letter X,Halfwidth and Fullwidth Forms,Letter,Lowercase
1228,65369,ｙ,Y,False,Ｙ,True,ｙ,True,y,Fullwidth,False,,Fullwidth Latin Small Letter Y,Halfwidth and Fullwidth Forms,Letter,Lowercase


In [None]:
df.to_csv(rootdir / "latinletters.csv", sep=";")

# Generate combiningdiacritics.csv file

In [24]:
combiningdiacritics = [(769,"Acute"),
(768,"Grave"),
(770,"Circumflex"),
(807,"Cedilla"),
(771,"Tilde"),
(776,"Diaeresis"),
(822,"Long Stroke Overlay"),
(772,"Macron"),
(780,"Caron"),
(803,"Dot Below"),
(775,"Dot Above"),
(778,"Ring Above")]
basechars=[]
codes=[]
combchars=[]
diacritics=[]
chars=[]
for code,diacritic in combiningdiacritics:
    for idx,row in df[df["Diacritics"]==diacritic][["BaseChar","Char"]].iterrows():
        basechars.append(row["BaseChar"])
        codes.append(code)
        combchars.append(chr(code))
        diacritics.append(diacritic)
        chars.append(row["Char"])
dfcomb = pd.DataFrame.from_dict({"BaseChar":basechars, "CombiningCode":codes, "CombiningChar":combchars, "Diacritic":diacritics, "CombinedChar":chars })
dfcomb

Unnamed: 0,BaseChar,CombiningCode,CombiningChar,Diacritic,CombinedChar
0,A,769,́,Acute,Á
1,E,769,́,Acute,É
2,I,769,́,Acute,Í
3,O,769,́,Acute,Ó
4,U,769,́,Acute,Ú
...,...,...,...,...,...
269,a,778,̊,Ring Above,å
270,U,778,̊,Ring Above,Ů
271,u,778,̊,Ring Above,ů
272,w,778,̊,Ring Above,ẘ


In [None]:
dfcomb.to_csv(rootdir / "combiningdiacritics.csv", sep=";")

# Test encodings

In [25]:
import unicodedata

ntype = "NFKC"
text = "jusqu’à"
print(text)
norm = unicodedata.normalize(ntype,text)
text2 = "jusqu’à"
print(text2)
norm2 = unicodedata.normalize(ntype,text2)
print((text.encode("utf-8"),text2.encode("utf-8")))
print((norm.encode("utf-8"),norm2.encode("utf-8")))

jusqu’à
jusqu’à
(b'jusqu\xe2\x80\x99\xc3\xa0', b'jusqu\xe2\x80\x99a\xcc\x80')
(b'jusqu\xe2\x80\x99\xc3\xa0', b'jusqu\xe2\x80\x99\xc3\xa0')


In [3]:
"ét頻,".encode("utf-8").decode("iso8859-1")

'Ã©té\xa0»,'

In [27]:
def testencoding(char):
    try:
        utoi = char.encode("utf-8").decode("iso8859-1")
        print(f"{char} =(ui)=> {utoi} = {list(utoi)} ")
    except:
        print("ui ERROR")
    try:
        itou = char.encode("iso8859-1").decode("utf-8")
        print(f"{char} =(iu)=> {itou} = {list(itou)} ")
    except:
        print("iu ERROR")
    try:
        utoi = char.encode("utf-8").decode("Windows-1252")
        print(f"{char} =(uw)=> {utoi} = {list(utoi)} ")
    except:
         print("uw ERROR")
    try:
        itou = char.encode("Windows-1252").decode("utf-8")
        print(f"{char} =(wu)=> {itou} = {list(itou)} ")
    except:
         print("wu ERROR")
    try:
        utoi = char.encode("iso8859-1").decode("Windows-1252")
        print(f"{char} =(iw)=> {utoi} = {list(utoi)} ")
    except:
         print("iw ERROR")
    try:
        itou = char.encode("Windows-1252").decode("iso8859-1")
        print(f"{char} =(wi)=> {itou} = {list(itou)} ")
    except:
         print("wi ERROR")
    if(len(char)==1 and ord(char)>256):
        try:
            high = int(ord(char)/256)
            low = ord(char)%256
            it2u = bytes([high,low]).decode("utf-8")
            print(f"{ord(char)} =(it2u)=> {it2u}")
        except:
            print("it2u ERROR")

In [28]:
testencoding("températureâ?¦")

températureâ?¦ =(ui)=> tempÃ©ratureÃ¢?Â¦ = ['t', 'e', 'm', 'p', 'Ã', '©', 'r', 'a', 't', 'u', 'r', 'e', 'Ã', '¢', '?', 'Â', '¦'] 
iu ERROR
températureâ?¦ =(uw)=> tempÃ©ratureÃ¢?Â¦ = ['t', 'e', 'm', 'p', 'Ã', '©', 'r', 'a', 't', 'u', 'r', 'e', 'Ã', '¢', '?', 'Â', '¦'] 
wu ERROR
températureâ?¦ =(iw)=> températureâ?¦ = ['t', 'e', 'm', 'p', 'é', 'r', 'a', 't', 'u', 'r', 'e', 'â', '?', '¦'] 
températureâ?¦ =(wi)=> températureâ?¦ = ['t', 'e', 'm', 'p', 'é', 'r', 'a', 't', 'u', 'r', 'e', 'â', '?', '¦'] 


In [29]:
#!conda install ftfy

In [None]:
from collections import defaultdict

errCounts = defaultdict(int)
for dataset in datasetsdf["DatasetFile"].unique():
    if(dataset.startswith("wiki")):
        continue
    textdf = get_dataset_df(dataset)
    for idx,errStr in enumerate(encodingErrors):
        errCount = len(textdf[textdf["Text"].str.contains(errStr,regex=False)])
        errCounts[idx] += errCount
errCounts

# Generate windows1252-iso8859-errors.csv file

In [30]:
windows1252chars = [(146,8217),
(128,8364),
(133,8230),
(150,8211),
(156,339),
(149,8226),
(147,8220),
(148,8221),
(151,8212),
(145,8216)]
ctrlcodes=[]
ctrlchars=[]
unicodes=[]
unichars=[]
for ctrlcode,unicode in windows1252chars:
    ctrlcodes.append(ctrlcode)
    ctrlchars.append(chr(ctrlcode))
    unicodes.append(unicode)
    unichars.append(chr(unicode))
dfwin1252 = pd.DataFrame.from_dict({"ControlCode":ctrlcodes, "ControlChar":ctrlchars, "DecodedCode":unicodes, "DecodedChar":unichars })
dfwin1252

Unnamed: 0,ControlCode,ControlChar,DecodedCode,DecodedChar
0,146,,8217,’
1,128,,8364,€
2,133,,8230,…
3,150,,8211,–
4,156,,339,œ
5,149,,8226,•
6,147,,8220,“
7,148,,8221,”
8,151,,8212,—
9,145,,8216,‘


In [None]:
dfwin1252.to_csv(rootdir / "windows1252-iso8859-errors.csv", sep=";")

# Generate utf8-windows1252-errors.csv file

In [3]:
errsubstrings = []
codes = []
chars = []
for i in range(128,256):
    try:
        char = bytes([i]).decode("Windows-1252")
        errsubstrings.append(char.encode("utf-8").decode("Windows-1252"))
        codes.append(ord(char))
        chars.append(char)
    except:
        print(str(i)+" undefined")
dfutf8 = pd.DataFrame.from_dict({"ErrorSubstring":errsubstrings, "DecodedCode":codes, "DecodedChar":chars })
dfutf8   

129 undefined
141 undefined
143 undefined
144 undefined
148 undefined
157 undefined
193 undefined
205 undefined
207 undefined
208 undefined
221 undefined


Unnamed: 0,ErrorSubstring,DecodedCode,DecodedChar
0,â‚¬,8364,€
1,â€š,8218,‚
2,Æ’,402,ƒ
3,â€ž,8222,„
4,â€¦,8230,…
...,...,...,...
112,Ã»,251,û
113,Ã¼,252,ü
114,Ã½,253,ý
115,Ã¾,254,þ


In [None]:
dfutf8.to_csv(rootdir / "utf8-windows1252-errors.csv", sep=";")

# Generate windows1252-utf8-errors.csv file

In [4]:
charstatsdf = pd.read_csv(rootdir / "libdata" / "chars" / "charsetstats_raw.csv", sep=";")[["Code","Char","CountBusiness","CountWikipedia"]]
charstatsdf["Freq"] = charstatsdf["CountBusiness"]/(charstatsdf["CountBusiness"].sum()/100)
charstatsdf.iloc[77,1] = chr(10)
charstatsdf.iloc[106,1] = chr(9)

In [5]:
charprob = {}
charcountbiz = {}
charcountwiki = {}
for idx,row in charstatsdf.iterrows():
    charprob[row["Code"]] = row["Freq"]
    charcountbiz[row["Code"]] = row["CountBusiness"]
    charcountwiki[row["Code"]] = row["CountWikipedia"]

In [23]:
unicodetextintervals = [(32,126),(160,55295),(63744,64975),(65008,65533),(65536,131069),(131072,196605)]
inputcodes = []
inputchars = []
inputcharcountsbiz = []
inputcharcountswiki = []
outputchars = []
outputcodes = []
outputprobs = []
for low,high in unicodetextintervals:
    for code in range(low,high+1):
        if code in charprob:
            utf8codes =chr(code).encode("utf-8")
            if len(utf8codes) > 1:
                probn = 1
                for utf8code in utf8codes:
                    if utf8code in charprob:
                        probn = probn * charprob[utf8code]
                    else:
                        probn = 0
                        break
                if probn > 0:
                    try:               
                        decodedchars = utf8codes.decode('Windows-1252')
                        if decodedchars in errsubstrings:
                            continue
                        else:
                            outputchars.append(decodedchars)
                            outputcodes.append([ord(char) for char in decodedchars])
                            inputcodes.append(code)
                            inputchars.append(chr(code))
                            inputcharcountsbiz.append(charcountbiz[code])
                            inputcharcountswiki.append(charcountwiki[code])
                            outputprobs.append(probn)
                    except:
                        pass
dfutf8win = pd.DataFrame.from_dict({"Code":inputcodes, "Char":inputchars, "CountBusiness":inputcharcountsbiz, "CountWikipedia":inputcharcountswiki, "DecodedChars":outputchars, "DecodedCodes":outputcodes, "Prob":outputprobs})
dfutf8win.sort_values(by="Prob",ascending=False,inplace=True)
dfutf8win.reset_index(inplace=True,drop=True)
dfutf8win["DecodedLenth"] = dfutf8win["DecodedChars"].apply(lambda s:len(s))
dfutf8win["RelProb"] = dfutf8win["Prob"]/(dfutf8win["Prob"].max()/100)
dfutf8win.drop(columns="Prob",inplace=True)
dfutf8win = dfutf8win[((dfutf8win["CountBusiness"]>=200) | (dfutf8win["CountWikipedia"]>=300)) & (dfutf8win["RelProb"]>=0.0001)]

In [25]:
dfutf8win.head(1).to_csv(rootdir / "libdata" / "chars" / "windows1252-utf8-errors.csv", sep=";")

# Generate controlchars.csv file

In [32]:
controlcodes = \
[x for x in range(0,9)] + \
[12] + \
[x for x in range(14,32)] + \
[127] + \
[x for x in range(129,133)] + \
[x for x in range(134,145)] + \
[x for x in range(152,156)] + \
[x for x in range(157,160)] + \
[168] + \
[x for x in range(172,174)] + \
[822,824,8203,8204,8205,8206,8207,\
8234,8236,8294,8297,8419,58893,58912,\
58929,58936,58937,58998,59700,61472,\
61485,61596,61607,61623,61656,61664,\
61672,61680,61692,61701,65038,65039,\
65279,65532,127995,127996,127997,127998]

controlchars = []
charnames = []
for code in controlcodes:
    char = chr(code)
    controlchars.append(char)
    charnames.append(charname(char))

dfctrlchars= pd.DataFrame.from_dict({"ControlCode":controlcodes, "ControlChar":controlchars, "CharName":charnames })
dfctrlchars

Unnamed: 0,ControlCode,ControlChar,CharName
0,0,�,Char 0
1,1,,Char 1
2,2,,Char 2
3,3,,Char 3
4,4,,Char 4
5,5,,Char 5
6,6,,Char 6
7,7,,Char 7
8,8,,Char 8
9,12,,Char 12


In [None]:
dfctrlchars.to_csv(rootdir / "controlchars.csv", sep=";")

# Generate latinnumbers.csv file

In [33]:
codes = []
chars = []
names = []
for code in range(0,256):
    char = chr(code)
    if charcategory(char)=="Number":
        codes.append(code)
        chars.append(char),
        names.append(charname(char))
for code in range(8000,8576):
    if code==8543:
        continue
    char = chr(code)
    if charcategory(char)=="Number":
        codes.append(code)
        chars.append(char),
        names.append(charname(char))
for code in range(9312,10132):
    char = chr(code)
    if charcategory(char)=="Number":
        codes.append(code)
        chars.append(char),
        names.append(charname(char))  
for code in range(12872,12896):
    char = chr(code)
    if charcategory(char)=="Number":
        codes.append(code)
        chars.append(char),
        names.append(charname(char))
for code in range(12977,12992):
    char = chr(code)
    if charcategory(char)=="Number":
        codes.append(code)
        chars.append(char),
        names.append(charname(char))
for code in range(65296,65306):
    char = chr(code)
    if charcategory(char)=="Number":
        codes.append(code)
        chars.append(char),
        names.append(charname(char))
for code in range(120782,120832):
    char = chr(code)
    if charcategory(char)=="Number":
        codes.append(code)
        chars.append(char),
        names.append(charname(char))
for code in range(127232,127244):
    char = chr(code)
    if charcategory(char)=="Number":
        codes.append(code)
        chars.append(char),
        names.append(charname(char))
        
dfnumbers= pd.DataFrame.from_dict({"Code":codes, "Char":chars, "CharName":names })
dfnumbers

Unnamed: 0,Code,Char,CharName
0,48,0,Digit Zero
1,49,1,Digit One
2,50,2,Digit Two
3,51,3,Digit Three
4,52,4,Digit Four
...,...,...,...
297,127239,🄇,Digit Six Comma
298,127240,🄈,Digit Seven Comma
299,127241,🄉,Digit Eight Comma
300,127242,🄊,Digit Nine Comma


In [None]:
dfnumbers.to_csv(rootdir / "latinnumbers.csv", sep=";")

In [None]:
dfnumbers = pd.read_csv(rootdir / "latinnumbers.csv", sep=";")
dfnumbers

# Generate latinsymbols.csv file

In [17]:
codes = []
chars = []
names = []

for code in range(8448,8528):
    char = chr(code)
    name = charname(char)
    codes.append(code)
    chars.append(char)
    names.append(name)
for code in range(9372,9450):
    char = chr(code)
    if charcategory(char)=="Symbol":
        name = charname(char)
        if "Latin " in name:
            codes.append(code)
            chars.append(char)
            names.append(name)
for code in range(119808,120483):
    char = chr(code)
    name = charname(char)
    codes.append(code)
    chars.append(char)
    names.append(name)        
for code in range(127248,127398):
    char = chr(code)
    if charcategory(char)=="Symbol":
        name = charname(char)
        if "Latin " in name:
            codes.append(code)
            chars.append(char)
            names.append(name)
for code in range(127462,127488):
    char = chr(code)
    name = charname(char)
    codes.append(code)
    chars.append(char)
    names.append(name)
    
dfsymbols= pd.DataFrame.from_dict({"Code":codes, "Char":chars, "CharName":names })
dfsymbols

Unnamed: 0,Code,Char,CharName
0,8448,℀,Account Of
1,8449,℁,Addressed To The Subject
2,8450,ℂ,Double-Struck Capital C
3,8451,℃,Degree Celsius
4,8452,℄,Centre Line Symbol
...,...,...,...
963,127483,🇻,Regional Indicator Symbol Letter V
964,127484,🇼,Regional Indicator Symbol Letter W
965,127485,🇽,Regional Indicator Symbol Letter X
966,127486,🇾,Regional Indicator Symbol Letter Y


In [18]:
dfsymbols.to_csv(rootdir / "latinsymbols.csv", sep=";")

# Generate normalizedchars.csv file

The first version of normalizedchgars.csv is created manually after a study of the dataset.

In [None]:
dfnorm = pd.read_csv(rootdir / "normalizedchars.csv", sep=";")
dfnorm

In [None]:
codes = []
chars = []
names = []
normcodes = []
normchars = []
normnames = []

for _,row in dfnorm.iterrows():
    code = row["Code"]
    char = chr(code)
    name = charname(char)
    normcode = row["NormCode"]
    normchar = chr(normcode)
    normname = charname(normchar)
    codes.append(code)
    chars.append(char)
    names.append(name)
    normcodes.append(normcode)
    normchars.append(normchar)
    normnames.append(normname)
    
dfnormfull = pd.DataFrame.from_dict({"Code":codes, "Char":chars, "CharName":names,  "NormCode":normcodes, "NormChar":normchars, "NormCharName":normnames})
dfnormfull

In [None]:
dfnormfull.to_csv(rootdir / "normalizedchars2.csv", sep=";")

# Generate characters stats after char normalization

In [121]:
import pandas as pd
from functools import partial
from operator import itemgetter
from io import StringIO

    
class TextNormalizer():
    
    def __init__(self, rootdir):
        
        # 1. Load Unicode character set data for latin script
        chardatadir = rootdir / "libdata" / "chars"
        # 1.1 Frequent encoding errors : windows1252 read as iso8859-1
        dfencodingwin1252 = pd.read_csv(chardatadir / "windows1252-iso8859-errors.csv", sep=";")
        win1252errorchars = {}
        for rowidx,row in dfencodingwin1252.iterrows():
            win1252errorchars[row["Char"]] = row["DecodedChar"]
        # 1.2 Frequent encoding errors : utf8 read as windows1252
        dfencodingutf8 = pd.read_csv(chardatadir / "utf8-windows1252-errors.csv", sep=";")
        utf8errorchars = {}
        for rowidx,row in dfencodingutf8.iterrows():
            utf8errorchars[row["ErrorSubstring"]] = row["DecodedChar"]
        utf8errorshdict = self.buildhierarchicaldict(utf8errorchars)
        # 1.3 Unicode combining chars
        dfcombiningchars = pd.read_csv(chardatadir / "combiningdiacritics.csv", sep=";")
        combiningchars = {}
        for rowidx,row in dfcombiningchars.iterrows():
            combiningchars[row["BaseChar"]+row["Char"]] = row["CombinedChar"]
        combiningcharshdict = self.buildhierarchicaldict(combiningchars)
        # 1.4 Control chars
        dfcontrolchars = pd.read_csv(chardatadir / "controlchars.csv", sep=";")
        dfcontrolchars.loc[0,"Char"] = chr(0) # chr(0) can't be saved in CSV file
        controlchars = set(dfcontrolchars["Char"])
        # 1.5 Latin letter symbols
        dflatinsymbols = pd.read_csv(chardatadir / "latinsymbols.csv", sep=";")
        latinlettersnolayout = {}
        latinlettersremovedlayout = {}
        for rowidx,row in dflatinsymbols.iterrows():
            latinlettersnolayout[row["Char"]] = row["NormString"]
            latinlettersremovedlayout[row["Char"]] = row["Layout"]
        # 1.6 Latin letters
        dflatinletters = pd.read_csv(chardatadir / "latinletters.csv", sep=";")
        latinletterstoupper = {}
        for rowidx,row in dflatinletters.iterrows():
            if row["Char"] != row["UpperChar"]:
                latinletterstoupper[row["Char"]] = row["UpperChar"]
        latinlettersnodiacritics = {}
        latinlettersremoveddiacritics = {}
        for rowidx,row in dflatinletters.iterrows():
            if row["IsDiacritic"]:
                latinlettersnodiacritics[row["Char"]] = row["BaseChar"]
                latinlettersremoveddiacritics[row["Char"]] = row["Diacritics"]
        latinlettersnoligatures = {}
        for rowidx,row in dflatinletters.iterrows():
            if row["IsLigature"]:
                latinlettersnoligatures[row["Char"]] = row["MultiChars"]
        # 1.7 Latin numbers and number symbols
        dflatinnumbers = pd.read_csv(chardatadir / "latinnumbers.csv", sep=";")
        latinnumbersnolayout = {}
        latinnumbersremovedlayout = {}
        for rowidx,row in dflatinnumbers.iterrows():
            if rowidx < 10:
                continue
            latinnumbersnolayout[row["Char"]] = row["NormString"]
            latinnumbersremovedlayout[row["Char"]] = row["Layout"]
        # 1.8 Variations on frequent chars to normalize
        dfnormchars = pd.read_csv(chardatadir / "normalizedchars.csv", sep=";")
        normalizedchars = {}
        for rowidx,row in dfnormchars.iterrows():
            normalizedchars[row["Char"]] = row["NormChar"]
        
        # 2.1 List successive transformations    
        self.transformsDescs = []
        transforms = []
        self.transformsDescs.append("Fix encoding errors : windows1252 read as iso8859-1")
        transforms.append(partial(self.replacechars1to1, 0, win1252errorchars))
        self.transformsDescs.append("Fix encoding errors : utf8 read as windows1252")
        transforms.append(partial(self.replacecharsNto1, 1, utf8errorshdict))
        self.transformsDescs.append("Merge Unicode combining chars")
        transforms.append(partial(self.replacecharsNto1, 2, combiningcharshdict))
        self.transformsDescs.append("Ignore control chars")
        transforms.append(partial(self.ignorechars, 3, controlchars))
        self.transformsDescs.append("Replace latin letter symbols")
        transforms.append(partial(self.replacechars1toN, 4, latinlettersnolayout))
        self.transformsDescs.append("Replace latin letter ligatures")
        transforms.append(partial(self.replacechars1toN, 5, latinlettersnoligatures))
        self.transformsDescs.append("Replace latin number symbols")
        transforms.append(partial(self.replacechars1toN, 6, latinnumbersnolayout))
        self.transformsDescs.append("Normalize equivalent chars") 
        transforms.append(partial(self.replacechars1to1, 7, normalizedchars))        
        
        # 2.2 Combine all transformations
        def func(x,y):
            ci = transforms[0](x,y)
            for transform in transforms[1:]:
                ci = transform(ci,y)
            return ci
        self.transformsFunc = func

    def __repr__(self):
        desc = StringIO()
        for idx,transformDesc in enumerate(self.transformsDescs):
            desc.write(f'{idx+1} - {transformDesc}\n')
        return desc.getvalue()
        
    def __call__(self, inputText):
        result = NormResult(inputText, self.transformsDescs)
        result.setOutput(self.tostring(self.transformsFunc(inputText,result)))
        return result
        
    @staticmethod
    def buildhierarchicaldict(idict):
        hdict = {}
        odicts = []
        for key in idict:
            if len(key) > 1:
                firstchar = key[0]
                remainingstring = key[1:]
                if not firstchar in hdict:
                    newdict = {}
                    hdict[firstchar] = newdict
                    odicts.append((firstchar,newdict))
                hdict[firstchar][remainingstring] = idict[key]
        for pkey,odict in odicts: 
            dictwithlongkey = False
            for key in odict:
                if len(key)>1:
                    dictwithlongkey = True
                    break
            if dictwithlongkey:
                hdict[pkey] = TextNormalizer.buildhierarchicaldict(odict)
        return hdict

    @staticmethod
    def ignorechars(layer, charset, chariterator, result):
        for index,char in enumerate(chariterator):
            if not char in charset:
                yield char
            else:
                result.addChange(layer, index, char, '')

    @staticmethod  
    def replacechars1to1(layer, chardict, chariterator, result):
        for index,char in enumerate(chariterator):
            if char in chardict:
                resChar = chardict[char]
                result.addChange(layer, index, char, resChar)
                yield resChar
            else:
                yield char

    @staticmethod  
    def replacechars1toN(layer, chardict, chariterator, result):
        for index,char in enumerate(chariterator):
            if char in chardict:
                resStr = chardict[char]
                result.addChange(layer, index, char, resStr)
                for outchar in resStr:
                    yield outchar
            else:
                yield char

    @staticmethod
    def replacecharsNto1(layer, hierarchicaldict, chariterator, result):
        candidatechars = []
        candidatedicts = []
        for index,char in enumerate(chariterator):
            # Try to match previously started patterns
            if len(candidatechars)>0:    
                for idx,candidatedict in enumerate(candidatedicts):
                    if not candidatedict is None:
                        if char in candidatedict:
                            value = candidatedict[char]
                            if isinstance(value,dict):
                                candidatedicts[idx] = value
                            else:   
                                # Success : found a char to return
                                for ridx in range(0,idx):
                                    yield candidatechars[ridx]
                                replacedStr = "".join(candidatechars[idx:]) + char
                                result.addChange(layer, index-len(replacedStr)+1, replacedStr, value)
                                candidatechars = []
                                candidatedicts = []
                                char = None
                                yield value
                                break
                        else:   
                            candidatedicts[idx] = None
                # Clean oldest failed attemps and return accumulated chars           
                while len(candidatedicts)>0 and candidatedicts[0] is None:
                    candidatedicts.pop(0)                  
                    yield candidatechars.pop(0)
            # Handle the current char  
            if not char is None:
                if len(candidatechars)==0:
                    if char in hierarchicaldict:
                        value = hierarchicaldict[char]
                        if isinstance(value,dict):
                            candidatechars.append(char)
                            candidatedicts.append(value)
                        else:
                            result.addChange(layer, index, char, value)
                            yield value
                    else:
                        yield char
                else:
                    candidatechars.append(char)
                    if char in hierarchicaldict:
                        value = hierarchicaldict[char]
                        candidatedicts.append(value)
                    else:
                        candidatedicts.append(None)     
        if len(candidatechars)>0:
            for char in candidatechars:
                yield char
    
    @staticmethod
    def tostring(iterator):
        return "".join(iterator)
    
    
class NormResult():
    
    def __init__(self, inputText, transformsDescs):
        self.input, self.transforms = inputText, transformsDescs
        self.layerChanges = None
        self.output = ""
    
    def addChange(self, layer, index, charsInput, charsOutput, removedInfo=None):
        if self.layerChanges is None:
            self.layerChanges = []
        if layer > (len(self.layerChanges)-1):
            for i in range(0,layer-len(self.layerChanges)+1):
                self.layerChanges.append([])
        changes = self.layerChanges[layer]
        change = NormChange(layer,index,charsInput,charsOutput,removedInfo)
        changes.append(change)   
        
    def describeChanges(self):
         if self.layerChanges is None:
            return 'No change'
         else:
            desc = StringIO()
            previousString = self.input
            for changes in self.layerChanges:
                layer = changes[0].layer
                layerDesc = self.transforms[layer]
                desc.write(layerDesc+"\n")                
                dispInparts = []     
                outparts = []
                dispOutparts = []
                lastIndex = 0
                for change in changes:
                    if change.index > lastIndex:
                        samePart = previousString[lastIndex:change.index]
                        dispInparts.append(samePart)
                        outparts.append(samePart)
                        dispOutparts.append(samePart) 
                    dispInpart = change.input
                    outpart = change.output
                    dispOutpart = outpart
                    if len(dispInpart)>len(outpart):
                        dispOutpart = outpart + ("_"*(len(dispInpart)-len(outpart)))
                    elif len(outpart)>len(dispInpart):
                        dispInpart = dispInpart + (" "*(len(outpart)-len(dispInpart)))
                    dispInparts.append(' ['+dispInpart+'] ')
                    outparts.append(outpart)
                    dispOutparts.append(' ['+dispOutpart+'] ')
                    lastIndex = change.index + len(change.input)
                if lastIndex < len(previousString):
                    samePart = previousString[lastIndex:]
                    dispInparts.append(samePart)
                    outparts.append(samePart)
                    dispOutparts.append(samePart)
                previousString = "".join(outparts)
                desc.write(" < ")
                for inpart in dispInparts:
                    desc.write(inpart)
                desc.write('\n')
                desc.write(" < ")
                for outpart in dispOutparts:
                    desc.write(outpart)
                desc.write('\n')
            return desc.getvalue()
            
    def mapOutputIndexToInput(self,outputIndex):
        inputIndex = outputIndex
        for changes in self.layerChanges:
            outputIndex = inputIndex
            for change in changes:
                if outputIndex < change.index:
                    break
                elif outputIndex > (change.index + len(change.output)):
                    inputIndex = inputIndex + (len(change.input)-len(change.output))
                else:
                    inputIndex = inputIndex -(outputIndex-change.index)
                    break
        return inputIndex        
            
    def setOutput(self, outputText):
        self.output = outputText
        
    def __repr__(self):
        return self.output
    
class NormChange():
    
    def __init__(self, layer, index, charsInput, charsOutput, removedInfo=None):
        self.layer, self.index, self.input, self.output, self.removedInfo = layer, index, charsInput, charsOutput, removedInfo
        
    def __repr__(self):
        return f"{self.layer} - {self.index} : {self.input} => {self.output}"

In [42]:
for datasetfile in datasetsdf["DatasetFile"].unique():
    vocabdf = loadVocabulary(datasetfile)
    print(f"{datasetfile} : {len(vocabdf)}")

assurance : 159060
banque : 235269
bourse : 392732
comparateur : 186390
crédit : 48299
forum : 1873326
institution : 52982
presse-1 : 843583
presse-2 : 610211
presse-3 : 679899
presse-4 : 892912
presse-5 : 740413
presse-6 : 741077
siteinfo : 445874
wikipedia-1 : 1273696
wikipedia-2 : 1339014
wikipedia-3 : 1376058
wikipedia-4 : 1412029
wikipedia-5 : 1404220


In [122]:
norm = TextNormalizer(rootdir)

In [124]:
from collections import defaultdict

datasets = {}
for datasetfile in datasetsdf["DatasetFile"].unique():
    print(datasetfile)
    vocabdf = loadVocabulary(datasetfile)
    layers = []
    for layerIdx in range(0,len(norm.transformsDescs)):
        layers.append(defaultdict(int))        
    datasets[datasetfile] = layers
    vocabnorm = defaultdict(int)
    tw = len(vocabdf)
    wc = 0
    for idx,row in vocabdf.iterrows():
        wc = wc + 1
        if wc%10000 == 0:
            print(f"{wc/tw*100}")
        result = norm(row["Word"])
        normWord = result.output
        count = + row["Count"]
        vocabnorm[normWord] = vocabnorm[normWord] + count
        if result.layerChanges is not None:
            for changes in result.layerChanges:
                for change in changes:
                    chgDict = layers[change.layer]
                    chgDict[change.input] = chgDict[change.input] + count
    dfvocabnorm = pd.DataFrame.from_dict({"Word":list(vocabnorm.keys()), "Count":list(vocabnorm.values())})
    dfvocabnorm.sort_values(by="Count",inplace=True,ascending=False)
    dfvocabnorm.reset_index(drop=True,inplace=True)
    dfvocabnorm.to_feather(rootdir / "datasets" / (datasetfile+".vocabnorm.feather"))

replacements = {}
for dataset in datasets:
    for layerIdx,layer in enumerate(datasets[dataset]):
        for inp in layer:
            key = (layerIdx,inp)
            if not key in replacements:
                replacements[key] = {}
            replacements[key][dataset] = layer[inp]

layers = []
inputs = []
outputs = []
datasetCounts = []
for dataset in datasets:
    datasetCounts.append([])
for key in replacements:
    layerIdx,inp = key
    dictDatasets = replacements[key]
    layers.append(layerIdx)
    inputs.append(inp)
    outputs.append(norm(inp).output)
    for idx,dataset in enumerate(datasets):
        if dataset in dictDatasets:
            datasetCounts[idx].append(dictDatasets[dataset])
        else:
            datasetCounts[idx].append(0)
            
dfnorms = pd.DataFrame.from_dict({"Layer":layers, "Input":inputs,  "Output":outputs})
for idx,dataset in enumerate(datasets):
        dfnorms[dataset] = datasetCounts[idx]
dfnorms["Count"] = dfnorms[[dataset for dataset in datasets]].sum(axis=1)

dfnorms.sort_values(by=["Layer","Count"],ascending=[True,False],inplace=True)
dfnorms.reset_index(drop=True,inplace=True)

dfnorms.to_csv(rootdir / "datasets" / "normalization.stats.csv")
dfnorms

assurance
6.28693574751666
12.57387149503332
18.86080724254998
25.14774299006664
31.434678737583297
37.72161448509996
44.00855023261662
50.29548598013328
56.582421727649944
62.869357475166595
69.15629322268326
75.44322897019993
81.73016471771658
88.01710046523324
94.3040362127499
banque
4.250453735936311
8.500907471872623
12.751361207808934
17.001814943745245
21.252268679681556
25.502722415617868
29.75317615155418
34.00362988749049
38.2540836234268
42.50453735936311
46.754991095299424
51.005444831235735
55.255898567172046
59.50635230310836
63.75680603904467
68.00725977498098
72.25771351091728
76.5081672468536
80.7586209827899
85.00907471872623
89.25952845466254
93.50998219059885
97.76043592653517
bourse
2.5462656468023996
5.092531293604799
7.6387969404071985
10.185062587209599
12.731328234011999
15.277593880814397
17.823859527616797
20.370125174419197
22.916390821221597
25.462656468023997
28.008922114826397
30.555187761628794
33.1014534084312
35.647719055233594
38.19398470203599
40.740

17.6496803201652
19.120487013512303
20.5912937068594
22.062100400206504
23.5329070935536
25.0037137869007
26.474520480247804
27.945327173594904
29.416133866942
30.8869405602891
32.357747253636205
33.828553946983305
35.2993606403304
36.7701673336775
38.240974027024606
39.711780720371706
41.1825874137188
42.6533941070659
44.12420080041301
45.5950074937601
47.0658141871072
48.5366208804543
50.0074275738014
51.4782342671485
52.94904096049561
54.4198476538427
55.89065434718981
57.361461040536895
58.832267733884
60.30307442723111
61.7738811205782
63.2446878139253
64.71549450727241
66.1863012006195
67.65710789396661
69.1279145873137
70.5987212806608
72.06952797400791
73.540334667355
75.01114136070211
76.48194805404921
77.9527547473963
79.42356144074341
80.8943681340905
82.3651748274376
83.83598152078471
85.3067882141318
86.7775949074789
88.24840160082601
89.7192082941731
91.1900149875202
92.6608216808673
94.1316283742144
95.60243506756152
97.0732417609086
98.5440484542557
presse-4
1.119931191

88.71818707132628
89.50330377107252
90.28842047081879
91.07353717056503
91.85865387031129
92.64377057005754
93.42888726980378
94.21400396955003
94.99912066929629
95.78423736904253
96.56935406878878
97.35447076853504
98.13958746828129
98.92470416802753
99.70982086777379
wikipedia-2
0.7468181811392561
1.4936363622785123
2.2404545434177683
2.9872727245570245
3.7340909056962808
4.480909086835537
5.227727267974793
5.974545449114049
6.721363630253306
7.4681818113925615
8.21499999253182
8.961818173671073
9.70863635481033
10.455454535949587
11.202272717088842
11.949090898228098
12.695909079367354
13.442727260506611
14.189545441645867
14.936363622785123
15.68318180392438
16.42999998506364
17.176818166202892
17.923636347342146
18.670454528481407
19.41727270962066
20.164090890759915
20.910909071899173
21.65772725303843
22.404545434177685
23.151363615316942
23.898181796456196
24.644999977595454
25.391818158734708
26.138636339873965
26.885454521013223
27.632272702152477
28.379090883291735
29.125909

16.379199840480837
17.091338963980004
17.803478087479167
18.515617210978334
19.2277563344775
19.939895457976668
20.652034581475835
21.364173704975002
22.07631282847417
22.788451951973336
23.500591075472503
24.21273019897167
24.924869322470837
25.637008445970004
26.34914756946917
27.061286692968338
27.773425816467505
28.48556493996667
29.19770406346584
29.909843186965006
30.621982310464173
31.33412143396334
32.04626055746251
32.75839968096167
33.470538804460844
34.18267792796001
34.89481705145917
35.606956174958334
36.319095298457505
37.03123442195667
37.74337354545584
38.455512668955
39.16765179245417
39.879790915953336
40.59193003945251
41.30406916295167
42.01620828645084
42.728347409950004
43.440486533449175
44.15262565694834
44.86476478044751
45.57690390394667
46.28904302744584
47.001182150945006
47.713321274444176
48.42546039794334
49.13759952144251
49.849738644941674
50.561877768440844
51.27401689194001
51.98615601543918
52.69829513893834
53.41043426243751
54.122573385936676
54.83

In [145]:
datasetfile = "forum"
dfvocabnorm = pd.read_feather(rootdir / "datasets" / (datasetfile+".vocabnorm.feather"))
dfvocabnorm

Unnamed: 0,Word,Count
0,",",4675272
1,de,4591981
2,à,3167078
3,.,2997682
4,le,2981764
...,...,...
1867746,2427,1
1867747,1/16,1
1867748,684,1
1867749,1532,1


In [147]:
dfnorms = pd.read_csv(rootdir / "datasets" / "normalization.stats.csv")
dfnorms

Unnamed: 0.1,Unnamed: 0,Layer,Input,Output,assurance,banque,bourse,comparateur,crédit,forum,institution,presse-1,presse-2,presse-3,presse-4,presse-5,presse-6,siteinfo,wikipedia-1,wikipedia-2,wikipedia-3,wikipedia-4,wikipedia-5,Count
0,0,0,,',39,45,32,134,0,65079,0,1400,8359,1018,15,3,2,738,24,21,31,9,21,76970
1,1,0,,€,8,5,13,145,0,11797,0,16,465,136,3,21,37,1789,1,2,2,2,9,14451
2,2,0,,…,8,0,1,5,0,2502,0,56,1051,208,347,0,32,106,3,6,5,4,5,4339
3,3,0,,oe,0,0,4,4,0,663,2,139,567,61,6,2,3,95,2,3,0,3,1,1555
4,4,0,,-,0,21,0,8,0,1025,0,22,289,46,11,1,7,3,6,3,9,9,0,1460
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
458,458,7,⁾,),27,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,32
459,459,7,ℹ,•,0,5,0,0,0,0,1,1,0,3,5,16,0,0,0,0,0,0,0,31
460,460,7,○,•,3,13,3,0,0,2,0,0,0,0,1,0,0,0,2,2,1,0,1,28
461,461,7,ʺ,"""",2,0,0,0,0,0,0,12,1,0,0,0,0,0,6,2,2,0,2,27


In [158]:
normstats = pd.DataFrame.from_dict({"Transform":norm.transformsDescs, "Count":dfnorms.groupby(by="Layer")["Count"].sum()})
normstats

Unnamed: 0_level_0,Transform,Count
Layer,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Fix encoding errors : windows1252 read as iso8...,101193
1,Fix encoding errors : utf8 read as windows1252,2369
2,Merge Unicode combining chars,30358
3,Ignore control chars,986288
4,Replace latin letter symbols,5398
5,Replace latin letter ligatures,862087
6,Replace latin number symbols,88889
7,Normalize equivalent chars,29449375


# Generate new charset after normalization

In [161]:
def saveNormCharset(rootdir):
    print("Saving the normalized character set ...")
    charcounts = defaultdict(lambda:0)
    for datasetfile in datasetsdf["DatasetFile"].unique():
        print(datasetfile)
        vocabdf = pd.read_feather(rootdir / "datasets" / (datasetfile.lower()+".vocabnorm.feather"))
        for idx,row in vocabdf.iterrows():
            token = row["Word"]
            count = row["Count"]
            for char in token:
                charcode = ord(char)
                charcounts[charcode] = charcounts[charcode] + count
        charsetdf = pd.DataFrame({"Code" : [*charcounts.keys()], "Count" : [*charcounts.values()]})
        charsetdf.sort_values("Count", ascending=False, inplace=True)
        charsetdf.reset_index(inplace=True)
        charsetdf.drop('index', axis=1, inplace=True)
        enhanceCharset(charsetdf)
        charsetfile = rootdir / "datasets" / (datasetfile.lower()+".charsetnorm.feather")
        charsetdf.to_feather(charsetfile)
        charsetdf.to_csv(rootdir / "datasets" / (datasetfile.lower()+".charsetnorm.csv"),sep=";")
        print(f"- {len(charsetdf)} distinct characters")
            
def loadNormCharset(datasetfile):
    charsetfile = rootdir / "datasets" / (datasetfile.lower()+".charsetnorm.feather")
    chrdf = pd.read_feather(charsetfile)
    chrdf["Freq"] = 1000000*chrdf["Count"]/chrdf["Count"].sum()
    return chrdf

In [162]:
saveNormCharset(rootdir)

Saving the normalized character set ...
assurance
- 219 distinct characters
banque
- 499 distinct characters
bourse
- 686 distinct characters
comparateur
- 788 distinct characters
crédit
- 795 distinct characters
forum
- 1108 distinct characters
institution
- 1113 distinct characters
presse-1
- 1418 distinct characters
presse-2
- 1440 distinct characters
presse-3
- 1602 distinct characters
presse-4
- 2610 distinct characters
presse-5
- 3356 distinct characters
presse-6
- 3453 distinct characters
siteinfo
- 3495 distinct characters
wikipedia-1
- 8968 distinct characters
wikipedia-2
- 10526 distinct characters
wikipedia-3
- 11631 distinct characters
wikipedia-4
- 12385 distinct characters
wikipedia-5
- 13123 distinct characters


In [163]:
mergeddf = None
freqcolslist=[]
countcolslist=[]
for datasetfile in datasetsdf["DatasetFile"].unique():
    chrdf = loadNormCharset(datasetfile)
    del chrdf["Percent"]
    if(mergeddf is None):
        mergeddf = chrdf
    else:
        chrdf = chrdf[["Code","Char","Count","Freq"]]
        mergeddf = pd.merge(mergeddf, chrdf, how='outer', on="Code",suffixes=("", "_"+datasetfile))
        mergeddf["Char"].fillna(mergeddf["Char_"+datasetfile],inplace=True)
        del mergeddf["Char_"+datasetfile]
    mergeddf.rename(columns = {"Freq":"Freq_"+datasetfile}, inplace = True)
    freqcolslist.append("Freq_"+datasetfile)
    mergeddf.rename(columns = {"Count":"Count_"+datasetfile}, inplace = True)
    countcolslist.append("Count_"+datasetfile)
mergeddf["Name"] = mergeddf["Char"].map(lambda c:charname(c))
mergeddf["Category"] = mergeddf["Char"].map(lambda c:charcategory(c))
mergeddf["Subcategory"] = mergeddf["Char"].map(lambda c:charsubcategory(c))
mergeddf["Block"] = mergeddf["Char"].map(lambda c:charblock(c))
mergeddf.fillna(0,inplace=True)
mergeddf["Freq_max"] = mergeddf[freqcolslist].values.max(1)
mergeddf["Freq_min"] = mergeddf[freqcolslist].values.min(1)
mergeddf["Freq_mean"] = mergeddf[freqcolslist].values.mean(1)
mergeddf.sort_values(by="Freq_max",inplace=True,ascending=False)
mergeddf.reset_index(inplace=True)
del mergeddf["index"]
#mergeddf= mergeddf[mergeddf["Freq_max"]<0.7]
for col in freqcolslist:
    mergeddf["Rel"+col] = (100 * mergeddf[col] / mergeddf["Freq_mean"]).astype("int32")
mergeddf["CountBusiness"] = mergeddf[[col for col in mergeddf.columns if (col.startswith("Count_") and "wiki" not in col)]].sum(axis=1)
mergeddf["CountWikipedia"] = mergeddf[[col for col in mergeddf.columns if (col.startswith("Count_") and "wiki" in col)]].sum(axis=1)
mergeddf["Count"] = mergeddf[countcolslist].values.sum(1)
mergeddf["PerMillion"] = 1000000-mergeddf["Count"].cumsum()/mergeddf["Count"].sum()*1000000
mergeddf.to_csv(rootdir / "stats" / "charset-normalized.csv",sep=";")

In [188]:
mergeddf.to_csv(rootdir / "stats" / "charset-normalized.csv",sep=";")

In [189]:
normcharsdf = mergeddf[["Code","Char","Name","Category","Subcategory","Block","CountBusiness","CountWikipedia","Count"]]
normcharsdf.to_csv(rootdir / "libdata" / "chars" / "charsetstats_norm.csv",sep=";")

In [190]:
normcharsdf

Unnamed: 0,Code,Char,Name,Category,Subcategory,Block,CountBusiness,CountWikipedia,Count
0,101,e,Latin Small Letter E,Letter,Lowercase,Basic Latin,3.504972e+09,4.598059e+09,8.103030e+09
1,115,s,Latin Small Letter S,Letter,Lowercase,Basic Latin,1.960554e+09,2.534105e+09,4.494659e+09
2,97,a,Latin Small Letter A,Letter,Lowercase,Basic Latin,1.865560e+09,2.447291e+09,4.312851e+09
3,110,n,Latin Small Letter N,Letter,Lowercase,Basic Latin,1.819350e+09,2.388609e+09,4.207959e+09
4,114,r,Latin Small Letter R,Letter,Lowercase,Basic Latin,1.751622e+09,2.277707e+09,4.029330e+09
...,...,...,...,...,...,...,...,...,...
13118,25712,摰,Cjk Unified Ideograph-6470,Letter,Other,CJK Unified Ideographs,0.000000e+00,1.000000e+00,1.000000e+00
13119,39122,飒,Cjk Unified Ideograph-98D2,Letter,Other,CJK Unified Ideographs,0.000000e+00,1.000000e+00,1.000000e+00
13120,30364,皜,Cjk Unified Ideograph-769C,Letter,Other,CJK Unified Ideographs,0.000000e+00,1.000000e+00,1.000000e+00
13121,21989,嗥,Cjk Unified Ideograph-55E5,Letter,Other,CJK Unified Ideographs,0.000000e+00,1.000000e+00,1.000000e+00
