In [1]:
import re
import pandas as pd
from os import walk
from utils.Lexicon_process import build_lexicon, modifier_process

## Create a lexicon from description 

In [2]:
mydir='../../CF_data/synthese/Result_traitem/database_Memoris3/'

In [None]:
def build_lexicon(path=None, kind_list=None, df_dict=None, desc_col='Description', 
                  kw_com={}, com_kw_file=None, auto=True, update=False):
    """
    Generate a lexicon from lithological descriptions
    
    Parameters
    ------------
    path: str
        root dir from which CSV files (also in subdirs) containing lithological description will be retrieve
    kind_list : list
        list of lexicon thematic ['lithology','material','colour']. Default is ['lithology']
    desc_col : str
        name of the dataframe column that contains descriptions
    kw_com: dict
        dict of common keywords to be considered for each lexicon thematic
    
    Returns
    --------
    
    """
    from importlib import import_module
    
    if com_kw_file is None:
        kw_module = import_module('Lexicon_FRA') # ('common_keywords_FR')
    else:
        kw_module = import_module(com_kw_file)
    
    assert isinstance(path, str), "Expected a str for parameter *path*!"
    assert isinstance(kind_list, list), "Expected a list for parameter *kind*!"
    assert isinstance(desc_col, str), "Expected a str for parameter *desc_col*!"
    assert isinstance(kw_com, dict), "Expected a list for parameter *kw_com*!"
   
    kind_def = ['lithology', 'material', 'modifier', 'colour']
    
    LEXICON  =  kw_module.LEXICON
    litho_com = LEXICON['lithology'] 
    mat_com = LEXICON['material'] 
    modf_com = LEXICON['modifier'] 
    colour_com = LEXICON['colour']
    #split_com = LEXICON['splitters'] 
    
    #--------------------------- Processing --------------------------------
    def process(kind_list):
        flag = re.IGNORECASE
        kw, desc = [], []
        litho_lex, mat_lex, modf_lex, colour_lex = [], [], [], []
        filter_lex = []

        lex = {'lithology':litho_lex, 'material':mat_lex, 'modifier':modf_lex, 'colour':colour_lex, } # generated from the processing
        com = {'lithology':litho_com, 'material':mat_com, 'modifier':modf_com,  'colour':colour_com,} # manualy build to contain default values
        
        
        for i in range(len(df)):
            if not pd.isnull(df.loc[i, desc_col]): 
                kw = kw + df.loc[i, desc_col].split(' ')
        kw = list(set(kw))

        for i in range(len(kw)):
            if len(kw[i]) > 2 and not re.search('\d.*',kw[i]): # to eliminate all 'one letter words' and all numbers
                wlist  =  [re.sub(r"^/|\.|l'|d'","",kw[i]).rstrip('[.|...|,|;|(|)|?]').lstrip('?|(|+').replace(',…',"")]
                desc = desc + wlist
        desc = list(set(desc))

        if kind_list == ['all']:
            kind_list = kind_def
                
        for kind in kind_list:
            filter_lex = []
            tmp_lex = lex[kind]
            if len(kw_com) == 0: 
                tmp_com = com[kind]
            else: 
                tmp_com = kw_com[kind]

            if kind not in kind_def: 
                print("Parameter *kind* must be 'lithology' or compatible str : see docstring !")

            for v in tmp_com:
                if kind == 'colour':
                    w = v.rstrip('[e|t]')
                    r = re.compile("{:s}(e|es|s)?([-|/]\w+)?([â|a]tre)?$".format(w), flags=flag)
                elif kind == 'modifier':
                    w = v.rstrip('[e|es]')
                    r = re.compile('(?:-)?(\w+eu(?:x|se|ses))') #("^\w+-\w+(eu)+(x|se|ses)?$", flags=flag)
                elif kind == 'material':
                    w = v.rstrip('[e|es]')
                    r = re.compile("^{:s}(e|es)?(-)*(.^/)*$".format(w), flags=flag)
                elif kind == 'lithology':
                    w = v.rstrip('[e|es]')
                    r = re.compile("^{:s}(e|es|o)?([-|/]\w+)*$".format(w), flags=flag)
                else:
                    w = v.rstrip('[e|es]')
                    r = re.compile("^\w+-\w+(eu)+(x|se|ses)?$", flags=flag)
                
                tmp_lex = tmp_lex + list(filter(r.findall, desc))
                    
            for l in tmp_lex:
                w = l.capitalize()
                if re.search('.+o$|.+s$',w) and kind == 'lithology':
                    w = re.sub('o$','e',w)
                    w = re.sub('s$','',w)
                    
                if w not in filter_lex: 
                    filter_lex = filter_lex + [w]
            
            lex[kind] = lex[kind] + filter_lex
            
            if kind == 'modifier':
                print(f'{len(lex[kind])} {lex[kind]}')
            
            #print(f"{len(desc)} total keywords found")
            print(f"|>>> Processing for '{kind}' : {len(filter_lex)} keywords extracted")
            
            """#----------------------------------------------
            mdf = []
            for w in desc:
                r = re.search('(?:-)?(\w+eu(?:x|se|ses))',w)
                if r: 
                    mdf = mdf + [re.sub(r'(x|se|ses)$','(?:x|se|ses)?',r.group(1))]
            mdf = list(set(mdf))
            
            for v in LEXICON[kind]: # to manage composites lithologies  
                w = re.sub('(e|es)$','',v.replace('(?:s)?',''))
                #print(w)
                r = re.compile("^{:s}(\w+)?eu?".format(w), flags=flag)
                tmp_lex = tmp_lex + list(set(filter(r.findall, mdf)))
            print(f'{len(tmp_lex)} {tmp_lex}')
            
            #------------------------------------------"""
        
        return lex, desc
    
    #--------------------------- Global lexicon updating -----------------------
    def update_lexicon_db(lexicon_db=None, svg=None, colour_db=None):

        if svg is None: svg = 'Lexicon_FR.py'
        else: svg = f"{svg}.py"
        
        with open(svg, 'w+') as f:
            f.write(f'"""\nDefinition de mots clés pour les descriptions de cuttings de forages.\n'+
            ':copyright: 2021  Y. N\'DEPO & O. Kaufmann \n"""')

            f.write(f'\n\n#====================LEXIQUE================================ \n')
            f.write(f'LEXICON = {lexicon_db}')

            f.write(f'\n\n#===============================COULEURS=============================== \n')
            f.write(f'COLORS = {colour_db}')

        print('The lexicon have been updated')

    #------------------------------------- Main ----------------------------
    
    if auto and path is not None:
        flist, df_list = [], []

        for path, dirs, files in walk(mydir):
            for f in files:
                if f[0] != '.' and re.compile(r".+[L|l]ith.+\.csv").match(f) and f is not None:
                    p = path + "/" + f
                    flist.append('{}'.format(p))
                    df_list.append(pd.read_csv('{}'.format(p)))

        df_dict = dict(list(enumerate(df_list))) #dict(zip(keys, values)) where key = range(len(df_list))
    
    elif not auto and df_dict is not None: pass 
    else: pass
    
    if not isinstance(df_dict, type(None)):
        for nb, df in df_dict.items():
            print(f"\nkeywords extraction and filtering from '{flist[nb].replace(mydir,'..')}' ...")
            lexicon, desc = process(kind_list)
    
    #----------------- Lexicon_db update --------- 
    #(pas sûr de continuer avec cette idée, ça devient trop complexe au regard de la 
    # structure du fichier default_lexicon.py [présence d'altérations de mots [..|..] )
    if update:
        lexicon_db = LEXICON
        for kind in kind_def:
            lexicon_db[kind] = list(set(lexicon_db[kind] + lexicon[kind]))

        update_lexicon_db(lexicon_db)    
    else:
        print('\nNo lexicon update set !')

    return lexicon, desc

In [3]:
lexicon, desc=build_lexicon(mydir, ['all'])#, kw_com=colour_com, auto=True)

# put good keywords in ..._com for better results


keywords extraction and filtering from '../source_Lithology.csv' ...
|>>> Processing for 'lithology' : 14 keywords extracted
|>>> Processing for 'material' : 7 keywords extracted
|>>> Processing for 'modifier' : 48 keywords extracted
|>>> Processing for 'colour' : 51 keywords extracted

No lexicon update set !


In [5]:
def lexicon_process(desc, lexicon_mod=None,):    
        
    from importlib import import_module
    
    if lexicon_mod is None:
        lex_module = import_module('Lexicon_FRA') # ('common_keywords_FR')
    else:
        lex_module = import_module(lexicon_mod)
    
    LEXICON = lex_module.LEXICON
    l, md = '', ''
    mdf, litho = [], []
    mdf_lex, lack_mdf, filter_lex = [], [], []
    lex={'lithology':[]}
    
    match = filter(re.compile('(?:-)?(\w+eu(?:x|se|ses))').findall, desc)

    for i in match:
        for j in i.split('-'):
            # retrieve modifier
            if not re.search(r'\w+o$',j):
                md = re.sub(r'eu(x|se|ses)$','eu(?:x|se|ses)?',j)

            # retrieve lithology
            if re.search(r'(velo)$',j):
                #l = re.sub(r'(velo)$','vier(?:s)?',j)
                l = re.sub(r'(velo)$','vier',j)
            elif re.search(r'(ono)$',j):
                #l = re.sub(r'o$','(?:s)?',j)
                l = re.sub(r'o$','',j)
            elif re.search(r'o$',j):
                #l = re.sub(r'(o)$','e(?:s)?',j)
                l = re.sub(r'(o)$','e',j)

            if md not in mdf and md != '':
                mdf = mdf + [md]
            if l not in litho and l != '':
                litho = litho + [l]


    for v in LEXICON['lithology']: # to manage modifiers  
        w=re.sub('(e|es)$','',v.replace('(?:s)?',''))
        #print(w)
        r=re.compile("^{:s}\w?eu".format(w), flags=re.IGNORECASE)
        mdf_lex=mdf_lex+list(set(filter(r.findall, mdf)))

    lack_mdf = list(set(mdf)-set(mdf_lex))

    if len(lack_mdf) > 0 :   
        log_file = "Lexicon_Log.py"

        with open(log_file, 'w+') as f:
            f.write('"""This file contains all keywords skipped during lexicon building processes"""')
            f.write(f'\n\n#====================MODIFIERS LOG================================ \n')
            f.write(f'skipped_words = {lack_mdf}')

        print(f'Extraction completed with skipped modifiers : check log file ({log_file}) ! ')
    else:
        print('Extraction completed !')
    
    return mdf_lex, 

In [4]:
lexicon_process(desc)

NameError: name 'lexicon_process' is not defined

In [None]:
from Lexicon_FRA_mod import LEXICON

kind = 'lithology'
tmp_lex = []
filter_lex = []
lex={'lithology':[]}

for v in LEXICON[kind]: # to manage modifiers  
    w=re.sub('(e|es)$','',v.replace('(?:s)?',''))
    #print(w)
    r=re.compile("^{:s}(\w+)?eu?".format(w), flags=re.IGNORECASE)
    tmp_lex=tmp_lex+list(set(filter(r.findall, mdf)))
print(f'{len(tmp_lex)} {tmp_lex}')

In [43]:
re.compile('(?:-)?(\w+eu(?:x|se|ses))').match(desc)

TypeError: expected string or bytes-like object

In [None]:
for w in desc:
    r = re.search('(?:-)?(\w+eu(?:x|se|ses))',w)
    if r: 
        mdf = mdf + [re.sub(r'(x|se|ses)$','(?:x|se|ses)?',r.group(1))]
mdf=list(set(mdf))

r=re.compile("^{:s}(\w+)?eu?".format(w), flags=re.IGNORECASE)
tmp_lex=tmp_lex+list(set(filter(r.findall, mdf)))

In [None]:
mdf = []
lith = []
for w in desc:
    r = re.search('(?:-)?(\w+eu(?:x|se|ses))',w)
    if r: 
        mdf = mdf + [re.sub(r'(x|se|ses)$','(?:x|se|ses)?',r.group(1))]
mdf=list(set(mdf))
        
for w in desc: # to manage composite lithologies retrieving the primary and use the secondary as modifier
    r = re.search('(\w+o)(?:-|to|so)',w)
    if r:
        if re.search(r'(velo)$',r.group(1)):
            #l=re.sub(r'(velo)$','vier(?:s)?',r.group(1))
            l=re.sub(r'(velo)$','vier',r.group(1))
        elif re.search(r'(ono)$',r.group(1)):
            #l=re.sub(r'o$','(?:s)?',r.group(1))
            l=re.sub(r'o$','',r.group(1))
        elif re.search(r'o$',r.group(1)):
            #l=re.sub(r'(o)$','e(?:s)?',r.group(1))
            l=re.sub(r'(o)$','e',r.group(1))
            
        lith = lith + [l]
lith=list(set(lith))

print(f'{len(mdf)} {mdf} \n\n {len(lith)} {lith}') #modifiers

In [None]:
# TODO : create a function to add keywords in the lexicon (if not exist)

In [None]:
from Lexicon_FRA import LEXICON

kind = 'lithology'
tmp_lex = []
filter_lex = []
lex={'lithology':[]}

for v in LEXICON[kind]: # to manage composites lithologies  
    w=re.sub('(e|es)$','',v.replace('(?:s)?',''))
    #print(w)
    r=re.compile("^{:s}(\w+)?eu?".format(w), flags=re.IGNORECASE)
    tmp_lex=tmp_lex+list(set(filter(r.findall, mdf)))
print(f'{len(tmp_lex)} {tmp_lex}')

In [None]:
tmp_lex = []
filter_lex = []
lex={'lithology':[]}

for v in LEXICON[kind]: # to manage non composites lithologies  
    w=re.sub('(e|es)$','',v)#.replace('(?:s)?',''))
    #print(w)
    r=re.compile("^{:s}(e|es)?([-|/]\w+)*$".format(w), flags=re.IGNORECASE)
    tmp_lex=tmp_lex+list(set(filter(r.findall, desc)))
print(f'{len(tmp_lex)} {tmp_lex}')

for l in tmp_lex:
    w=l.capitalize()
    if re.search('.+[^èo]s$',w) and kind=='lithology':
        #w=re.sub('o$','e',w)
        w=re.sub('s$','',w)

    if w not in filter_lex: 
        filter_lex=filter_lex+[w]

lex[kind]=lex[kind]+filter_lex
print(f'{len(lex[kind])} {lex[kind]}')

In [None]:
LEXICON[kind][:5]

In [None]:
# look for a part of a word
print(list(filter(re.compile('(?:-)?\w+eu(?:x|se|ses)?').search, desc)))
#print(list(filter(lambda v: v if re.search('(?:-)?\w+eu(?:x|se|ses)?',v) else '', desc)))
#print(list(filter(lambda v:v if re.search('\+[B|b]',v) else '', desc)))
#re.match('.+([o-])?.$', desc).groups()#, re.sub('o$', 'e', desc)

#### striplog test

In [None]:
from striplog import Component, Legend, Decor, Lexicon
from Lexicon_FRA import LEXICON
#import Lexicon_FR as lexique

In [None]:
descrip="sables hétérogène gris foncé sablo-silteuses pluricentimétriques avec du naphtalène 20-25% charge pierreuse de type laitier"#, à peu de grains fins à. Vers 2 - 2.4 m, présence d'eau."
#desc='shaly sand with vf'

#lexicon=Lexicon(LEXICON) # our french lexicon
#lexicon=Lexicon.default() # the striplog default lexicon (english)

comp=Component.from_text(descrip, Lexicon(LEXICON))#lexique.LEXIQUE)
comp

In [None]:
re.search('-(\w+eu)(?:x|se|ses)?', descrip).group(1)

===================================== other tests ============================

In [None]:
kw=lexicon['colour']
d=[]
for i in range(len(kw)):
    if len(kw[i])>2 and not re.search('\d.*',kw[i]): #to eliminate all 'one letter words' and all numbers
        wlist=[re.sub(r"^/|\.|l'|d'","",kw[i]).rstrip('[.|...|,|;|(|)|?]').lstrip('?|(|+')\
               .replace(',…',"").replace('es',"e")]
        d=d+wlist
d=list(set(d))
print(len(d),'\n', d)

In [None]:
#print(list(filter(lambda v:v if re.search('[/|-]',v) else '', desc)))
print(list(filter(lambda v:v if re.search('[B|b]l',v) else '', d)))

In [None]:
tmp=[]
for v in colour_com: 
    w=v.rstrip('[e|es]')
    r=re.compile("{:s}(e|es|s)?([-|/]\w+)?([â|a]tre)?$".format(w), flags=re.IGNORECASE)
    tmp=tmp+list(filter(r.findall, desc))
    
print(tmp)

In [None]:
def update_lexicon_db(svg=None):

    if svg is None: svg='Lexicon_FR.py'
    else: svg=f"{svg}.py"

    with open(svg, 'w+') as f:
        f.write(f'"""\nDefinition de mots clés pour les descriptions de cuttings.\n'+
        ':copyright: 2021  Y. N\'DEPO & O. Kaufmann \n"""')

        f.write(f'\n\n# =========== LEXIQUE ===========\n')
        f.write(f'LEXICON = {lexicon_db}')

        f.write(f'\n\n# =========== COULEURS ===========\n')
        f.write(f'COLORS = {colour_db}')

========================== draft ================================================

# Definition of common keywords 