# ORGANISATION DES DONNEES

In [None]:
import re, os
import numpy as np
import pandas as pd
import datetime as dtm
from difflib import get_close_matches

from utils.config import DEFAULT_POL_LEXICON, POL_NAMES_MODEL
from definitions import ROOT_DIR
from utils.io import dataframe_viewer, data_merger, data_validation, data_slicer, \
collect_time_data, replicate_values, gen_id_from_ech, na_col_drop, na_line_drop, col_ren, \
dble_col_drop, find_borehole_by_position, compute_borehole_length

### Création du répertoire de sauvegarde

In [None]:
save_dir = ROOT_DIR + '/CF_data/Result_traitem/organisation/'

In [None]:
# Create the output directory (with any missing parents). exist_ok=True
# replaces the check-then-create pattern and keeps the cell safe to re-run.
os.makedirs(save_dir, exist_ok=True)

### Définition d'en-têtes usuelles

In [None]:
# Maps long-form measurement headers to the short column names; the values
# are reused further down when building mes_cols and mes_crit.
MEAS_NAMES_MODEL = {
    # grain-size fractions
    'Fraction   2000 µm': 'Fract_2000µ',
    'Fraction   63 µm': 'Fract_63µ',
    'Fraction   45 µm': 'Fract_45µ',
    'Fraction   16 µm': 'Fract_16µ',
    'Fraction   2 µm': 'Fract_2µ',
    'Fraction 2 mm': 'Fract_2',
    'Fraction +2 mm': 'Fract_2+',
    'Fract_2': 'Fract_2',
    # organic matter / dry matter
    'Mat. organique': 'MO',
    'Mat. sèche': 'MS',
    # clay fraction (two spellings, same target)
    'Argile': 'Fract_arg',
    'Fraction argileuse': 'Fract_arg',
}

In [None]:
# Keyword / regex fragments for parameter columns.
# NOTE(review): params_kw is not used in the visible part of the notebook.
params_kw = ['O_diss', 'Niv_eau', 'temp', '^T$', '^CE$', 'pH$', 'ORP']
# Canonical measurement column names (appended to mes_cols and mes_crit below).
meas_kw_col = ['O_diss', 'pH', 'CE', 'ORP', 'Niv_eau_pz', 'Niv_eau_sol', 'Temp']
# Suffix / prefix patterns and capture regex passed to gen_id_from_ech.
# Raw strings: '\d', '\s', '\w' and '\(' are invalid escapes in a plain literal
# (DeprecationWarning, SyntaxWarning from Python 3.12); the values are unchanged.
sufx = ['sup', 'prof', 'inf', r'/\dM(\*)?']
prefx = ['eau forage ']
id_reg = r'\s*(?P<id>(?:^canne |Piezair )*\w*\d+\w*)\s*'
# De-duplicated pollutant names: lexicon abbreviations first, then the
# normalised names from POL_NAMES_MODEL. dict.fromkeys keeps first-seen order,
# so the list is identical from run to run (list(set(...)) ordering of strings
# changes with the interpreter's hash seed).
pollutants_names = list(dict.fromkeys(
    list(DEFAULT_POL_LEXICON.abbreviations.keys()) + list(POL_NAMES_MODEL.values())
))

In [None]:
# Column names allowed in each data category handed to data_slicer.
# Every list is de-duplicated with dict.fromkeys, which (unlike set) keeps a
# stable, first-seen order from one run to the next.

# Boreholes
bh_cols = list(dict.fromkeys([
    'Date_for', 'ID', 'X', 'Y', 'Z', 'Zsol', 'Type', 'Long_for', 'Long_pz', 'Sect_crep',
    'Long_pz_sol', 'Ht_pz_sol', 'Diam_for', 'Diam_int_pz', 'Diam_ext_pz', 'Ht_chbre',
    'Refus', 'Societe', 'Zone', 'Sous_zone', 'Etude', 'Method', 'Resp_chantier',
    'Emplacement', 'Rmq',
]))

# Measures
# Fix: 'sulfures_tot' and 'N_Kjdl' were written without a separating comma, so
# Python concatenated them into the single bogus name 'sulfures_totN_Kjdl'.
# NOTE(review): both 'pH_H2O' (letter O) and 'pH_H20' (digit 0) are listed; the
# latter is the spelling used when renaming the soil-result columns — confirm.
mes_cols = list(dict.fromkeys([
    'Date_mes', 'ID', 'ID_ech', 'X', 'Y', 'Z', 'Zsol', 'pH_H2O', 'Temp_pH_H2O',
    'Temp_pH_CaCl2', 'pH_CaCl2', 'Temp_pH_KCl', 'pH_KCl', 'Residu_perte_feu',
    'Fract_arg', 'Fract_min_2µ', 'Fract_min_50µ', 'Fract_min_2', 'Temp_pH_mes',
    'pH_H20', 'sulfures_tot', 'N_Kjdl', 'Temp_CE', 'Temp_pH', 'Nappe', 'Rmq',
    'Fract_2000µ', 'Fract_63µ', 'Fract_45µ', 'Fract_16µ', 'Fract_2µ', 'Temp_ech',
    'Periode',
] + meas_kw_col + list(MEAS_NAMES_MODEL.values())))

# Equipment
eqp_cols = list(dict.fromkeys([
    'Date_for', 'ID', 'X', 'Y', 'Z', 'Zsol', 'Type_equip', 'Equip_base', 'Equip_top',
    'Equip_epais', 'Rmq',
]))

# Lithology
litho_cols = list(dict.fromkeys([
    'Date_for', 'ID', 'ID_ech', 'X', 'Y', 'Z', 'Zsol', 'Long_for', 'Litho_top',
    'Litho_base', 'Intv_top', 'Intv_base', 'Litho_epais', 'Intv_epais', 'Description',
    'Rmq',
]))

# Analyses (sample metadata + every known pollutant name)
an_cols = list(dict.fromkeys([
    'ID', 'X', 'Y', 'Z', 'Zsol', 'Date_ech', 'ID_ech', 'Type_ech', 'Ech_top', 'Ech_base',
    'Ech_epais', 'Intv_top', 'Intv_base', 'Description', 'Nappe', 'Organo', 'Intensite',
    'Min_organo', 'Max_organo', 'Polluant', 'Surnageant', 'Sousnageant', 'Caractere',
    'Opacite', 'Rmq',
] + pollutants_names))

# Unknown objects
ukw_cols = list(dict.fromkeys([
    'Date_for', 'ID', 'X', 'Y', 'Z', 'Zsol', 'Type', 'Long_for', 'Method', 'Societe', 'Rmq',
]))

cols_dict = {'borehole': bh_cols, 'measure': mes_cols, 'lithology': litho_cols, 'analysis': an_cols,
             'equipement': eqp_cols, 'unknown': ukw_cols}

In [None]:
# Per-category criteria columns, passed to data_slicer together with cols_dict.
bh_crit = [
    'ID', 'X', 'Y', 'Z', 'Zsol', 'Type',
    'Long_for', 'Long_pz', 'Diam_for', 'Diam_int_pz', 'Diam_ext_pz',
]

# Date column + canonical measurement names + normalised measurement headers.
mes_crit = ['Date_mes', *meas_kw_col, *MEAS_NAMES_MODEL.values()]

eqp_crit = ['Type_equip', 'Equip_base', 'Equip_top']

litho_crit = ['Litho_top', 'Litho_base', 'Intv_top', 'Intv_base', 'Description']

# Sample descriptors + every pollutant abbreviation of the default lexicon.
an_crit = [
    'ID_ech', 'Type_ech', 'Organo', 'Surnageant', 'Sousnageant',
    *DEFAULT_POL_LEXICON.abbreviations.keys(),
]

ukw_crit = ['ID', 'X', 'Y', 'Z', 'Zsol', 'Long_for', 'Type']

crit_dict = {
    'borehole': bh_crit,
    'measure': mes_crit,
    'lithology': litho_crit,
    'analysis': an_crit,
    'equipement': eqp_crit,
    'unknown': ukw_crit,
}

Variables utilisées par jeu de données
================================
- bh 	: 	forages (simple ou piezo)
- equip	:	equipements d'un forage (outils, méthodes utilisés, ...)
- ukw	:	objets physiques indéterminés
- litho :	descriptions lithologiques
- an 	: 	analyses de contaminants sur des échantillons (sol, eau)
- mes	:	mesures de propriétés sur des échantillons (sol, eau), de paramètres hydrochimiques, ...


# ---------------------------------------------------------

In [None]:
# Initialise the per-category accumulators.
# Each name gets its own empty DataFrame: binding all six to one shared object
# would let any in-place operation on one accumulator show up in the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (pd.DataFrame() for _ in range(3))
source_litho, source_an, source_mes = (pd.DataFrame() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 14-Logs_forages_vUmons_2018-03-20.xlsx
* **Sheet : 'Analyse_eau_Phases1&2'**

In [None]:
tmp_dir= save_dir + 'vUmons_logsFor/'
sheet='Analyse_eau_Phases1&2'

In [None]:
# Load the 'Analyse_eau_Phases1&2' sheet of the UMONS borehole-log workbook.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Analyse_eau_Phases1&2', skiprows=0)
# Remove NaN rows / columns with the project helpers (the meaning of the
# numeric argument is defined in utils.io — not visible here).
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Strip '<' and '>' from string cells (presumably detection-limit markers — TODO confirm).
df.replace(r'<|>','', inplace=True, regex=True)
# Turn cells matching '-$' into NaN.
# NOTE(review): the pattern is only anchored at the end, so any string that
# ends with '-' matches, not just a lone '-' — confirm this is intended.
df.replace(r'-$',np.nan, inplace=True, regex=True)

# Preview the first rows.
dataframe_viewer(df, rows=5)

In [None]:
df.drop(list(range(4)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Blank out the 9999 placeholder.
# Numeric cells (9999 / 9999.0): plain value replacement.
df.replace(9999, np.nan, inplace=True)
# String cells: match the placeholder only as the whole value ('9999',
# '9999.', '9999.0', optional surrounding blanks). The former pattern
# '[9999|9999].' was a character class meaning "a '9' or '|' followed by any
# character", so it also erased legitimate values such as '0.95' or '19.2'.
df.replace(r'^\s*9999(?:\.0*)?\s*$', np.nan, inplace=True, regex=True)

In [None]:
# Divide CE by 1000 where the cell starts with a digit; every other cell
# (NaN, text) becomes NaN. The raw string avoids the invalid '\d' escape of
# the former plain literal; the pattern itself is unchanged.
df['CE'] = df['CE'].apply(
    lambda x: pd.to_numeric(x) / 1000
    if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan
)

In [None]:
df=col_ren(df,mode=1,name=[re.sub('9999','-',x) for x in df.columns])

In [None]:
# Remove every column whose name contains '_vn'.
drop_it = [col for col in df.columns if re.search('_vn', col)]
df.drop(columns=drop_it, inplace=True)

In [None]:
name=['ID', 'ID_ech', 'Date_ech', 'X', 'Y', 'Z', 'Long_for','Long_pz_sol', 'Niv_eau_sol', 'pH', 'CE', 'Temp', 
      'Arsenic', 'Cadmium', 'Chrome', 'Chrome VI', 'Cuivre', 'Mercure', 'Plomb', 'Nickel', 'Zinc', 
      'Cyanures (libres)', 'CN_totaux', 'cyanure (APE)', 'cyanure complex', 'thiocyanate', 'Benzène', 'Toluene', 
      'Éthylbenzène', 'Orthoxylène', 'Para_métaxylène', 'Xylenes', 'Styrène', 'Phénol', 'Naphtalène', 
      'Acénaphtylène', 'Acénaphtène', 'Fluorène', 'Phénanthrène', 'Anthracène', 'Fluoranthène', 'Pyrène', 
      'Benzo(a)anthracène', 'Chrysène', 'Benzo(b)fluoranthène', 'Benzo(k)fluoranthène', 'Benzo(a)pyrène', 
      'Dibenzo(ah)anthracène', 'Benzo(ghi)pérylène', 'Indéno(1,2,3-cd)pyrène', 'HAP Totaux (16) - EPA', 
      '1,1-Dichloroéthane', '1,2-Dichloroéthane', '1,1-dichloroéthène', 'Cis-1,2-dichloroéthène', 
      'Totaux (cis,trans) 1,2-dichloroéthènes', 'Trans 1,2-dichloroéthylène', 'Dichlorométhane', 
      '1,2-dichloropropane', 'Tétrachloroéthylène ', 'Tétrachlorométhane', '1,1,1-Trichloroéthane', 
      '1,1,2-Trichloroéthane', 'Trichloroéthylène', 'Chloroforme', 'Chlorure de vinyle', 'fraction aromat. >C6-C7',
      'fraction aromat. >C7-C8', 'fraction aromat. >C8-C10', 'fraction aliphat. C5-C6', 'fraction aliphat. >C6-C8',
      'fraction aliphat. >C8-C10', 'Fraction C5 - C8', 'Fraction C8 - C10', 'Fraction C10-C12', 'Fraction C12-C16',
      'Fraction C16 - C21', 'Fraction C21 - C35', 'Hydrocarbures totaux C10-C35', 'MTBE', 'Chlorures']
df=col_ren(df, mode=1,name=name)

In [None]:
df = col_ren(df,mode=1, name=POL_NAMES_MODEL)#, verbose=True)

In [None]:
# Flatten multi-line sample IDs (newline -> space). The result is assigned
# back to the column: replace(inplace=True) on a column selection acts on a
# temporary object under pandas Copy-on-Write and would leave df unchanged.
df['ID_ech'] = df['ID_ech'].replace('\n', ' ', regex=True)
# All samples of this sheet are tagged as water ('Eau').
df.insert(1,'Type_ech','Eau')

In [None]:
df.drop([20,39], axis=0,inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# From row label 38 onwards Date_ech holds an integer day count; convert it to
# a datetime as ordinal(1900-01-01) + x - 2.
# NOTE(review): this looks like the Excel serial-date convention — confirm.
df.loc[38:,'Date_ech']=df.loc[38:,'Date_ech'].apply(lambda x : dtm.datetime.fromordinal(dtm.datetime(1900, 1, 1).toordinal() + x - 2))

In [None]:
# Where the sample ID is missing, derive it from the borehole ID with any
# trailing 'M' characters removed.
for row in df.index[df['ID_ech'].isnull()]:
    df.loc[row, 'ID_ech'] = df.loc[row, 'ID'].rstrip('M')

In [None]:
# Parse the sampling dates. pd.to_datetime replaces astype('datetime64'):
# the unit-less 'datetime64' dtype string raises TypeError in pandas >= 2.0.
df['Date_ech'] = pd.to_datetime(df['Date_ech'])
# The measurement date is taken to be the sampling date.
df['Date_mes'] = df['Date_ech']

In [None]:
df['Type'] = 'Piezo'

In [None]:
df['ID'] = df['ID'].apply(lambda x: re.sub('^P', 'F', str(x)) if not pd.isnull(x) else x)

In [None]:
# Make sure the identifier columns contain strings; null cells and values
# that are already strings are left untouched.
id_cols = ['ID', 'ID_ech']
dtf = df  # same object as df: the conversion below modifies df itself
for id_col in (name for name in id_cols if name in dtf.columns):
    dtf[id_col] = dtf[id_col].apply(
        lambda x: x if isinstance(x, str) or pd.isnull(x) else str(x)
    )

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Pull the per-category frames out of the data_slicer result.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Rows whose index label also appears in the 'unknown' frame are removed from
# the borehole frame.
bh = bh.drop(index=ukw.index)
# Keep one row per ID in each frame.
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# 'col==col' is False for NaN, so these queries keep only the rows where ID
# (and X, when that column exists) is not null.
if 'X' in bh.columns: 
    bh = bh.query('ID==ID and X==X')
else:
    bh = bh.query('ID==ID')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# Row counts per category, as a quick sanity check.
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_an = an
source_bh = bh
source_mes = mes

In [None]:
# Make sure both output folders exist. Creating the nested folder with
# exist_ok=True also covers the case where tmp_dir already exists but
# 'source_merge/' does not; the former existence check skipped both makedirs
# calls in that situation and the source_merge exports below then failed.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

# Per-sheet exports (disabled categories are kept commented out).
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated data for the whole source file.
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Analyse_sol_Phases1&2'**

In [None]:
tmp_dir= save_dir + 'vUmons_logsFor/'
sheet='Analyse_sol_Phases1&2'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Analyse_sol_Phases1&2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID_ech','Date_ech','ID','X','Y','Z','Nature_ech','Organo','Long_for','Refus','Ech_top','Ech_base',
      'MS','Broyage < 150 µm','Broyage ','Fract_2','Fract_2+','Arsenic', 'Cadmium', 'Chrome', 'Chrome VI','Cuivre',
      'Mercure', 'Plomb', 'Nickel', 'Zinc', 'Cyanure (libre)', 'Cyanure (totaux)', 'cyanure (APE)', 
      'cyanure complex', 'thiocyanate', 'Benzène', 'Toluène', 'Éthylbenzène', 'Orthoxylène', 'Para- et métaxylène',
      'Xylènes', 'Styrène', 'Phénol', 'Naphtalène', 'Acénaphtylène', 'Acénaphtène', 'Fluorène', 'Phénanthrène', 
      'Anthracène', 'Fluoranthène', 'Pyrène', 'Benzo(a)anthracène', 'Chrysène', 'Benzo(b)fluoranthène', 
      'Benzo(k)fluoranthène', 'Benzo(a)pyrène', 'Dibenzo(ah)anthracène', 'Benzo(ghi)pérylène', 
      'Indéno(1,2,3-cd)pyrène', 'HAP Totaux (16) - EPA', '1,1-Dichloroéthane', '1,2-Dichloroéthane', 
      '1,1-dichloroéthène', 'Cis-1,2-dichloroéthène', 'Trans 1,2-dichloroéthylène', 'Dichlorométhane', 
      'Totaux (cis,trans) 1,2-dichloroéthènes', '1,2-dichloropropane', 'Tétrachloroéthylène', 
      'Tétrachlorométhane', '1,1,1-Trichloroéthane', '1,1,2-Trichloroéthane', 'Trichloroéthylène', 'Chloroforme', 
      'Chlorure de vinyle', 'fraction aromat. >C6-C7', 'fraction aromat. >C7-C8', 'fraction aromat. >C8-C10', 
      'fraction aliphat. C5-C6', 'fraction aliphat. >C6-C8', 'fraction aliphat. >C8-C10', 'Fraction C5 - C8', 
      'Fraction C8 - C10', 'Fraction C10-C12', 'Fraction C12-C16', 'Fraction C16 - C21', 'Fraction C21 - C35', 
      'Hydrocarbures totaux C10-C35']
df=col_ren(df, mode=1, name=name)

In [None]:
df.drop(list(range(4)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Blank out the 9999 placeholder.
# Numeric cells (9999 / 9999.0): plain value replacement.
df.replace(9999, np.nan, inplace=True)
# String cells: match the placeholder only as the whole value ('9999',
# '9999.', '9999.0', optional surrounding blanks). The former pattern
# '[9999|9999].' was a character class meaning "a '9' or '|' followed by any
# character", so it also erased legitimate values such as '0.95' or '19.2'.
df.replace(r'^\s*9999(?:\.0*)?\s*$', np.nan, inplace=True, regex=True)

In [None]:
# Expand the short soil codes of Nature_ech into full labels; any value that
# is not a known code is left as it is.
nature_labels = {
    'R': 'Remblais',
    'R ': 'Remblais',
    'L': 'Limons',
    'LA': 'Limons et argiles',
    'LS': 'Limons et sables',
}
for i in df.index:
    x = df.loc[i, 'Nature_ech']
    if x in nature_labels:
        df.loc[i, 'Nature_ech'] = nature_labels[x]

# Refus becomes a flag: 'x' when a value is present, empty string otherwise.
df['Refus'] = df['Refus'].apply(lambda x: '' if pd.isnull(x) else 'x')
# All samples of this sheet are tagged as soil ('Sol').
df.insert(1, 'Type_ech', 'Sol')

In [None]:
df.drop(14, axis=0, inplace=True)
df.drop(['Broyage < 150 µm', 'Broyage '], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)
df.insert(1,'Description', df.pop('Nature_ech'))

In [None]:
df.loc[8, 'ID_ech']='F4/2M'
df.loc[31, 'ID_ech']='F19/1'
df.loc[32, 'ID_ech']='F19/2'

In [None]:
df = col_ren(df, mode=1, name=POL_NAMES_MODEL) #,verbose=True)

In [None]:
# Parse the sampling dates. pd.to_datetime replaces astype('datetime64'):
# the unit-less 'datetime64' dtype string raises TypeError in pandas >= 2.0.
df['Date_ech'] = pd.to_datetime(df['Date_ech'])
# The measurement date is taken to be the sampling date.
df['Date_mes'] = df['Date_ech']

In [None]:
df['Type'] = 'Piezo'

In [None]:
df['ID'] = df['ID'].apply(lambda x: re.sub('^P', 'F', str(x)) if not pd.isnull(x) else x)

In [None]:
id_cols = ['ID', 'ID_ech']
dtf = df
for id_col in id_cols:
    if id_col in dtf.columns:
        dtf[id_col] = dtf[id_col].apply(lambda x: str(x) if not isinstance(x, str) and not pd.isnull(x) else x)

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: 
    bh = bh.query('ID==ID and X==X')
else:
    bh = bh.query('ID==ID')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = source_bh
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Long_for_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Make sure both output folders exist. Creating the nested folder with
# exist_ok=True also covers the case where tmp_dir already exists but
# 'source_merge/' does not; the former existence check skipped both makedirs
# calls in that situation and the source_merge exports below then failed.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

# Per-sheet exports (disabled categories are kept commented out).
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated data for the whole source file.
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Synthèse'**

In [None]:
tmp_dir= save_dir + 'vUmons_logsFor/'
sheet='Synthese'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Synthèse', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Keep the first 29 rows. .copy() gives an independent frame, so the in-place
# edits below (and the row-by-row writes in the following cells) do not act on
# a slice of the frame returned by read_excel (SettingWithCopyWarning).
df = df[:29].copy()
# Remove asterisks from string cells (raw string: '\*' is an invalid escape
# in a plain literal; the pattern is unchanged).
df.replace(r'\*', '', inplace=True, regex=True)
# Refus becomes a flag: 'x' when a value is present, empty string otherwise.
df['Refus'] = df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')

In [None]:
name=['ID','X','Y','Z', 'Refus','Long_for', 'RB', 'ALL', 'S_A', 'S_S', 
      'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top']
df=col_ren(df, mode=1, name=name)

In [None]:
# Expand each borehole row into one row per lithological unit present
# (RB, ALL, S_A, S_S columns not null). The first unit found for RB is written
# on row i itself; the others are added as new rows labelled i+.2, i+.5 and
# i+.7 so that sort_index() below places them right after row i.
# Columns copied from the borehole row onto every added row:
cols=['ID','X','Y','Z', 'Refus','Long_for']

# range(len(df)) is evaluated once, so rows added inside the loop are not visited.
for i in range(len(df)):
    # RB -> 'Remblais': from depth 0 down to Rb_base, or to Long_for if Rb_base is missing.
    if not pd.isnull(df.loc[i, 'RB']): 
        df.loc[i, 'Nappe']='Remblais'
        df.loc[i, 'Litho_top']=0
        if not pd.isnull(df.loc[i, 'Rb_base']):
            df.loc[i, 'Litho_base']=df.loc[i, 'Rb_base']
        else:
            df.loc[i, 'Litho_base']=df.loc[i, 'Long_for']
    
    # ALL -> 'Alluvions': from All_top down to Soc_alt_top when S_A is present, else to Long_for.
    if not pd.isnull(df.loc[i, 'ALL']):
        df.loc[i+.2,cols]=df.loc[i,cols]
        df.loc[i+.2, 'Nappe']='Alluvions'
        df.loc[i+.2, 'Litho_top']=df.loc[i, 'All_top']
        if not pd.isnull(df.loc[i, 'S_A']):
            df.loc[i+.2, 'Litho_base']=df.loc[i, 'Soc_alt_top']
        else:
            df.loc[i+.2, 'Litho_base']=df.loc[i, 'Long_for']
    
    # S_A -> 'Socle altéré': from Soc_alt_top down to Soc_sn_top when S_S is present, else to Long_for.
    if not pd.isnull(df.loc[i, 'S_A']):
        df.loc[i+.5,cols]=df.loc[i,cols]
        df.loc[i+.5, 'Nappe']='Socle altéré'
        df.loc[i+.5, 'Litho_top']=df.loc[i, 'Soc_alt_top']
        if not pd.isnull(df.loc[i, 'S_S']):
            df.loc[i+.5, 'Litho_base']=df.loc[i, 'Soc_sn_top']
        else:
            df.loc[i+.5, 'Litho_base']=df.loc[i, 'Long_for']
            
    # S_S -> 'Socle sain': from Soc_sn_top down to Long_for.
    if not pd.isnull(df.loc[i, 'S_S']):
        df.loc[i+.7,cols]=df.loc[i,cols]
        df.loc[i+.7, 'Nappe']='Socle sain'
        df.loc[i+.7, 'Litho_top']=df.loc[i, 'Soc_sn_top']
        df.loc[i+.7, 'Litho_base']=df.loc[i, 'Long_for']

# Drop the helper columns, then restore row order and a clean integer index.
df.drop(columns=['RB', 'ALL', 'S_A', 'S_S', 'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top'], inplace=True)
df.sort_index(inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df['Description'] = df['Nappe']
df['Type'] = 'Forage'

In [None]:
id_cols = ['ID', 'ID_ech']
dtf = df
for id_col in id_cols:
    if id_col in dtf.columns:
        dtf[id_col] = dtf[id_col].apply(lambda x: str(x) if not isinstance(x, str) and not pd.isnull(x) else x)

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: 
    bh = bh.query('ID==ID and X==X')
else:
    bh = bh.query('ID==ID')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = source_bh
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Type_x':list(conflict_df.index), 'Long_for_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_bh = dataset

In [None]:
source_litho = litho

In [None]:
# Make sure both output folders exist. Creating the nested folder with
# exist_ok=True also covers the case where tmp_dir already exists but
# 'source_merge/' does not; the former existence check skipped both makedirs
# calls in that situation and the source_merge exports below then failed.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

# Per-sheet exports (disabled categories are kept commented out).
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated data for the whole source file.
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Sond2017v2'**

In [None]:
tmp_dir= save_dir + 'vUmons_logsFor/'
sheet='Sond2017v2'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Sond2017v2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Remove asterisks from string cells (raw string: '\*' is an invalid escape
# in a plain literal; the pattern is unchanged).
df.replace(r'\*', '', inplace=True, regex=True)
# Refus becomes a flag: 'x' where the cell equals 1, empty string otherwise.
df['Refus'] = df['Refus'].apply(lambda x: 'x' if x == 1 else '')

In [None]:
# Positional renaming of the sheet columns.
name=['R_ID','ID','X','Y','Z','Refus','Date_for','Long_for','Z_fond','RB','ALL', 'S_A', 'S_S', 
      'Rb_base','cote_rb','All_top', 'Soc_alt_top','Soc_sn_top']
df=col_ren(df, mode=1, name=name)
# Keep the working columns (R_ID and cote_rb are left out). .copy() gives an
# independent frame: the following cell writes to it row by row, which would
# otherwise operate on a column subset of the renamed frame
# (SettingWithCopyWarning).
df=df[['ID','X','Y','Z','Refus','Date_for','Long_for','Z_fond','RB','ALL', 'S_A', 'S_S', 
      'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top']].copy()

In [None]:
# Expand each borehole row into one row per lithological unit flagged with 1
# (RB, ALL, S_A, S_S). The RB unit is written on row i itself; the others are
# added as new rows labelled i+.2, i+.5 and i+.7 so that sort_index() below
# places them right after row i.
# Columns copied from the borehole row onto every added row:
cols = ['ID', 'Date_for', 'X', 'Y', 'Z', 'Z_fond', 'Refus', 'Long_for']

# range(len(df)) is evaluated once, so rows added inside the loop are not visited.
for i in range(len(df)):
    # RB -> 'Remblais': from depth 0 down to Rb_base, or to Long_for if Rb_base is missing.
    if df.loc[i, 'RB'] == 1:
        df.loc[i, 'Nappe'] = 'Remblais'
        df.loc[i, 'Litho_top'] = 0

        if not pd.isnull(df.loc[i, 'Rb_base']):
            df.loc[i, 'Litho_base'] = df.loc[i, 'Rb_base']
        else:
            df.loc[i, 'Litho_base'] = df.loc[i, 'Long_for']

    # Fallback top for the units below when their own top depth is missing:
    # the base stored on row i. 'Litho_base' only exists once some row has
    # gone through the RB branch, so guard the lookup instead of raising
    # KeyError when the first rows have no RB flag.
    val_def = df.loc[i, 'Litho_base'] if 'Litho_base' in df.columns else np.nan

    # ALL -> 'Alluvions': top is All_top (or the fallback); base is Soc_alt_top
    # when S_A is flagged, else Long_for.
    if df.loc[i, 'ALL'] == 1:
        df.loc[i+.2, cols] = df.loc[i, cols]
        df.loc[i+.2, 'Nappe'] = 'Alluvions'

        if not pd.isnull(df.loc[i, 'All_top']):
            df.loc[i+.2, 'Litho_top'] = df.loc[i, 'All_top']
        else:
            df.loc[i+.2, 'Litho_top'] = val_def

        if df.loc[i, 'S_A'] == 1:
            df.loc[i+.2, 'Litho_base'] = df.loc[i, 'Soc_alt_top']
        else:
            df.loc[i+.2, 'Litho_base'] = df.loc[i, 'Long_for']

    # S_A -> 'Socle altéré': top is Soc_alt_top (or the fallback); base is
    # Soc_sn_top when S_S is flagged, else Long_for.
    # NOTE(review): the fallback is the base of row i (Remblais), not the base
    # of the Alluvions row — confirm this is intended.
    if df.loc[i, 'S_A'] == 1:
        df.loc[i+.5, cols] = df.loc[i, cols]
        df.loc[i+.5, 'Nappe'] = 'Socle altéré'

        if not pd.isnull(df.loc[i, 'Soc_alt_top']):
            df.loc[i+.5, 'Litho_top'] = df.loc[i, 'Soc_alt_top']
        else:
            df.loc[i+.5, 'Litho_top'] = val_def

        if df.loc[i, 'S_S'] == 1:
            df.loc[i+.5, 'Litho_base'] = df.loc[i, 'Soc_sn_top']
        else:
            df.loc[i+.5, 'Litho_base'] = df.loc[i, 'Long_for']

    # S_S -> 'Socle sain': from Soc_sn_top down to Long_for.
    if df.loc[i, 'S_S'] == 1:
        df.loc[i+.7, cols] = df.loc[i, cols]
        df.loc[i+.7, 'Nappe'] = 'Socle sain'
        df.loc[i+.7, 'Litho_top'] = df.loc[i, 'Soc_sn_top']
        df.loc[i+.7, 'Litho_base'] = df.loc[i, 'Long_for']

# Drop the helper columns, then restore row order and a clean integer index.
df.drop(columns=['RB', 'ALL', 'S_A', 'S_S','Rb_base','All_top', 'Soc_alt_top','Soc_sn_top'], inplace=True)
df.sort_index(inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df['Description'] = df['Nappe']
df['Type'] = 'Forage'

In [None]:
id_cols = ['ID', 'ID_ech']
dtf = df
for id_col in id_cols:
    if id_col in dtf.columns:
        dtf[id_col] = dtf[id_col].apply(lambda x: str(x) if not isinstance(x, str) and not pd.isnull(x) else x)

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: 
    bh = bh.query('ID==ID and X==X')
else:
    bh = bh.query('ID==ID')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = source_bh
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Type_x':list(conflict_df.index), 'Long_for_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_bh = dataset

In [None]:
source_litho, conflict_df = data_merger(source_litho, litho, how='outer', on=['ID', 'Litho_top'], dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = source_litho
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Litho_base_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_litho = dataset

In [None]:
# Make sure both output folders exist. Creating the nested folder with
# exist_ok=True also covers the case where tmp_dir already exists but
# 'source_merge/' does not; the former existence check skipped both makedirs
# calls in that situation and the source_merge exports below then failed.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

# Per-sheet exports (disabled categories are kept commented out).
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated data for the whole source file.
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

# Processing for new data added - April 2021

#### ======================================================================================

In [None]:
# Re-initialise the per-category accumulators for the next source file.
# Each name gets its own empty DataFrame: binding all six to one shared object
# would let any in-place operation on one accumulator show up in the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (pd.DataFrame() for _ in range(3))
source_litho, source_an, source_mes = (pd.DataFrame() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 15-Résultats SOL extension pilote et piézairs.xlsx
* **Sheet : 'Résult SOL'**

In [None]:
tmp_dir= save_dir + 'result_sol_ext_pilote/'
sheet='Result_Sol'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Resultats SOL extension pilote et piezairs.xlsx', 
                   sheet_name='Résult SOL', skiprows=5)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Split the sheet by row label: sample description rows (up to 35) and
# analysis rows (36 onwards). .copy() makes both independent frames; `an`
# receives an extra row right after this cell, which would otherwise be a
# write on a slice of df (SettingWithCopyWarning).
ech_df = df.loc[:35].copy()
an = df.loc[36:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(3)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,3)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df=ech_df[:-1]
ech_df.drop(columns=['broyage'], inplace=True)

In [None]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_ech','Long_for','Refus','Description','MO','COT','pH_KCl', 
      'Temp_pH','pH_H20','Fract_2','Fract_2+', 'Fract_min_2µ','Fract_min_50µ','Fract_min_2']
ech_df=col_ren(ech_df, name=name, mode=1)

In [None]:
set(ech_df.Description)

In [None]:
# Expand the short codes of Description into full labels; any value that is
# not a known code is left as it is.
description_labels = {
    'R': 'Remblais',
    'R ': 'Remblais',
    'TN': 'Terrain naturel',
    'TN ': 'Terrain naturel',
}
for i in range(len(ech_df['Description'])):
    x = ech_df.loc[i, 'Description']
    if x in description_labels:
        ech_df.loc[i, 'Description'] = description_labels[x]

# Refus becomes a flag: 'x' when a value is present, empty string otherwise.
ech_df['Refus'] = ech_df['Refus'].apply(lambda x: '' if pd.isnull(x) else 'x')
# All samples of this sheet are tagged as soil ('Sol').
ech_df.insert(1, 'Type_ech', 'Sol')

In [None]:
# Sample IDs written over two lines ("<id>\n<text>") are rewritten as
# '226/<id>'. The pattern is unchanged but now a raw string ('\w', '\d' are
# invalid escapes in a plain literal) and compiled once outside the loop.
# Non-string cells (e.g. NaN) are skipped instead of raising TypeError.
two_line_id = re.compile(r'([\w|\d]+)\n.+$')
for i in range(len(ech_df)):
    x = ech_df.loc[i, 'ID_ech']
    r = two_line_id.search(x) if isinstance(x, str) else None
    if r:
        ech_df.loc[i, 'ID_ech'] = '226/' + r.group(1)  # Rename borehole 304 to 226 because of conflict with piezair

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.rename(columns={an.columns[0]:'ID_ech','col_35':'Phénanthrène'}, inplace=True)

In [None]:
an=an[an.columns[:-17]]

In [None]:
an = col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an = na_line_drop(an, 1)
an.insert(1,'Type_ech','Sol')

In [None]:
# Sample IDs written over two lines ("<id>\n<text>") are rewritten as
# '226/<id>'. The pattern is unchanged but now a raw string ('\w', '\d' are
# invalid escapes in a plain literal) and compiled once outside the loop.
# Non-string cells (e.g. NaN) are skipped instead of raising TypeError.
data = an  # same object as `an`: the loop below modifies `an` itself
two_line_id = re.compile(r'([\w|\d]+)\n.+$')
for i in range(len(data)):
    x = data.loc[i, 'ID_ech']
    r = two_line_id.search(x) if isinstance(x, str) else None
    if r:
        data.loc[i, 'ID_ech'] = '226/' + r.group(1)  # Rename borehole 304 to 226 because of conflict with piezair

In [None]:
df, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
df = gen_id_from_ech(df, id_ech_col='ID_ech', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
# Drop every row whose ID contains no digit. The value is passed through
# str() so a missing or non-string ID no longer makes re.search raise
# TypeError: str(NaN) is 'nan', which has no digit and is therefore dropped.
# Raw string: '\d' is an invalid escape in a plain literal.
to_drop = [i for i in df.index if not re.search(r'\d', str(df.loc[i, 'ID']))]
df.drop(index=to_drop, inplace=True)

In [None]:
# Parse the sampling dates. pd.to_datetime replaces astype('datetime64'):
# the unit-less 'datetime64' dtype string raises TypeError in pandas >= 2.0.
df['Date_ech'] = pd.to_datetime(df['Date_ech'])
# The measurement date is taken to be the sampling date.
df['Date_mes'] = df['Date_ech']

In [None]:
df['Type'] = 'Piezo'

In [None]:
id_cols = ['ID', 'ID_ech']
dtf = df
for id_col in id_cols:
    if id_col in dtf.columns:
        dtf[id_col] = dtf[id_col].apply(lambda x: str(x) if not isinstance(x, str) and not pd.isnull(x) else x)

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: 
    bh = bh.query('ID==ID and X==X')
else:
    bh = bh.query('ID==ID')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh = bh
source_mes = mes
source_an = an

In [None]:
# Make sure both output folders exist. Creating the nested folder with
# exist_ok=True also covers the case where tmp_dir already exists but
# 'source_merge/' does not; the former existence check skipped both makedirs
# calls in that situation and the source_merge exports below then failed.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

# Per-sheet exports (disabled categories are kept commented out).
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated data for the whole source file.
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'inorganiques et composés majeur'**

In [None]:
tmp_dir= save_dir + 'donnees_terrain_2019/'
sheet='Inorg_comp_majeur'

In [None]:
# Load the sheet, skipping its first row
df = pd.read_excel(
    ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
               'Resultats SOL extension pilote et piezairs.xlsx',
    sheet_name='inorganiques et composés majeur',
    skiprows=1,
)
# Project helpers (utils.io): NaN-based row clean-up first, then column clean-up
df = na_col_drop(na_line_drop(df, 0), 1)
# Strip the '<' / '>' signs, then turn strings ending with '-' into NaN
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Rows up to label 20 go to ech_df (not really interesting here), the rest to an.
# Independent copies are taken because rows are assigned into `an` afterwards;
# writing into a plain slice of df triggers pandas' SettingWithCopyWarning.
ech_df = df.loc[:20].copy()
an = df.loc[21:].copy()

In [None]:
# Add the rows of df labelled 0 and 2 under the fractional labels 0.5 and 1.5
an.loc[0.5] = df.loc[0] # put data on first line
an.loc[1.5] = df.loc[2]
# Sorting by label then renumbering the rows from 0 orders them by those labels
an = an.sort_index().reset_index(drop=True)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.rename(columns={an.columns[0]:'ID_ech', 'Date de prélèvement':'Date_ech'}, inplace=True)

In [None]:
an=an[an.columns[:-7]]

In [None]:
an=dble_col_drop(an)

In [None]:
# Remove the rows labelled 0, 1 and 2, then renumber the remaining rows from 0
an.drop(index=[0, 1, 2], inplace=True)
an.reset_index(drop=True, inplace=True)
# Project helpers (utils.io): NaN-based column clean-up first, then row clean-up
an = na_line_drop(na_col_drop(an, 2), 1)
# Constant sample type 'Sol', inserted as the second column
an.insert(1, 'Type_ech', 'Sol')

In [None]:
an = col_ren(an, name=POL_NAMES_MODEL, mode=1)#, verbose=True)

In [None]:
an = dble_col_drop(an)

In [None]:
# Rename borehole 304 to 226 because of conflict with piezair: a label made of a
# token, a line break and some more text is rewritten as '226/<token>'.
data = an  # alias: `an` itself is modified
for i in data.index:  # use the real row labels instead of assuming 0..n-1
    x = data.loc[i, 'ID_ech']
    if not isinstance(x, str):  # NaN or numeric labels cannot be matched
        continue
    # raw string: '\w' and '\d' are invalid escapes in a normal string literal
    r = re.search(r'([\w|\d]+)\n.+$', x)
    if r:
        data.loc[i, 'ID_ech'] = '226/' + r.group(1)

In [None]:
# Convert the sampling date to datetimes; the unit must be explicit because
# pandas >= 2.0 refuses the unit-less 'datetime64' dtype.
an['Date_ech'] = an['Date_ech'].astype('datetime64[ns]')

In [None]:
df = gen_id_from_ech(an, id_ech_col='ID_ech', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
# Cast non-string, non-null identifiers to str; strings and missing values are kept as they are
id_cols = ['ID', 'ID_ech']
dtf = df  # same object as df: the conversion is applied to df itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
source_an, conflict_df = data_merger(source_an, df, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Create the output folder and its 'source_merge/' sub-folder.
# makedirs builds the intermediate tmp_dir too, and exist_ok=True makes sure
# 'source_merge/' is also created when tmp_dir already exists.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Variable initialisation: every source_* table starts as its own empty
# DataFrame. Separate copies are used so that an in-place change to one table
# can never leak into the others (they previously shared a single object).
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df.copy(), _df.copy(), _df.copy()
source_litho, source_an, source_mes = _df.copy(), _df.copy(), _df.copy()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 16-Profils de sol et données de terrain 2019.xlsx
* **Sheet : 'Log'**

In [None]:
tmp_dir= save_dir + 'donnees_terrain_2019/'
sheet='Log'

In [None]:
# Load the 'Log' sheet from its first row
df = pd.read_excel(
    ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
               'Profils de sol et donnees de terrain 2019.xlsx',
    sheet_name='Log',
    skiprows=0,
)
# Project helpers (utils.io): NaN-based row clean-up first, then column clean-up
df = na_col_drop(na_line_drop(df, 0), 2)
# Strip the '<' / '>' signs, then turn strings ending with '-' into NaN
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Standard lithology headers applied through the project helper col_ren
name = ['ID','Litho_top', 'Litho_base', 'Keyword', 'Description']
df = col_ren(df, name=name, mode=1, )
# Skip the first row; .copy() gives an independent frame so that adding a
# column below does not write into a slice (SettingWithCopyWarning)
df = df[1:].copy()
# Same drilling date for every row of this sheet
df['Date_for'] = dtm.datetime(2019,12,18)

In [None]:
# Discard the rows lacking a Litho_base or a Litho_top value
incomplete = df['Litho_base'].isnull() | df['Litho_top'].isnull()
df.drop(index=df.index[incomplete], inplace=True)
# The row labelled 64 is removed as well
df.drop(index=[64], inplace=True)

In [None]:
# Default borehole type, overridden for the rows labelled 65 to 80.
# Spelled 'Piezair' (capital P) to match the label given to the same
# boreholes in the 'Données de forage' sheet; 'piezair' would create a
# second, distinct category.
# NOTE(review): confirm no downstream filter relies on lowercase 'piezair'.
df['Type'] = 'Piezo'
df.loc[65:80, 'Type'] = 'Piezair'

In [None]:
df['Emplacement'] = 'Extension Pilote'
df.loc[83:, 'Emplacement'] = 'Mini-pilote' #piezair

In [None]:
# Cast non-string, non-null identifiers to str; strings and missing values are kept as they are
id_cols = ['ID', 'ID_ech']
dtf = df  # same object as df: the conversion is applied to df itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
litho = df
source_litho = litho

In [None]:
# Create the output folder and its 'source_merge/' sub-folder.
# makedirs builds the intermediate tmp_dir too, and exist_ok=True makes sure
# 'source_merge/' is also created when tmp_dir already exists.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Echantillon'+'Organoleptique'**

In [None]:
tmp_dir= save_dir + 'donnees_terrain_2019/'
sheet='Echantillon'

In [None]:
# Load the 'Echantillon' sheet, skipping its first row
df = pd.read_excel(
    ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
               'Profils de sol et donnees de terrain 2019.xlsx',
    sheet_name='Echantillon',
    skiprows=1,
)
# Project helpers (utils.io): NaN-based row clean-up first, then column clean-up
df = na_col_drop(na_line_drop(df, 0), 2)
# Strip the '<' / '>' signs, then turn strings ending with '-' into NaN
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID','Ech_top', 'Ech_base', 'ID_ech']
df=col_ren(df, name=name, mode=1)
df.insert(1,'Type_ech','Sol')

In [None]:
df.drop(index=[43,44,55,56,66], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
ech=df.copy()

In [None]:
# Load the 'Organoleptique' sheet, skipping its first row
df = pd.read_excel(
    ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
               'Profils de sol et donnees de terrain 2019.xlsx',
    sheet_name='Organoleptique',
    skiprows=1,
)
# Project helpers (utils.io): NaN-based row clean-up first, then column clean-up
df = na_col_drop(na_line_drop(df, 0), 4)
# Strip the '<' / '>' signs, then turn strings ending with '-' into NaN
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID','Pol_top', 'Pol_base','Polluant','Intensite']
df=col_ren(df, name=name, mode=1)

In [None]:
df.drop(index=[10,11,14,15], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df, conflict_df =data_merger(ech, df, on='ID', how='outer')

In [None]:
# Cast non-string, non-null identifiers to str; strings and missing values are kept as they are
id_cols = ['ID', 'ID_ech']
dtf = df  # same object as df: the conversion is applied to df itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
an = df
source_an = an

In [None]:
# Create the output folder and its 'source_merge/' sub-folder.
# makedirs builds the intermediate tmp_dir too, and exist_ok=True makes sure
# 'source_merge/' is also created when tmp_dir already exists.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Données de forage'**

In [None]:
tmp_dir= save_dir + 'donnees_terrain_2019/'
sheet='Donnees_forage'

In [None]:
# Load the 'Données de forage' sheet, skipping its first row
df = pd.read_excel(
    ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
               'Profils de sol et donnees de terrain 2019.xlsx',
    sheet_name='Données de forage',
    skiprows=1,
)
# Project helpers (utils.io): NaN-based row clean-up first, then column clean-up
df = na_col_drop(na_line_drop(df, 0), 2)
# Strip the '<' / '>' signs, then turn strings ending with '-' into NaN
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID', 'X', 'Y', 'Z', 'Date_for', 'Long_for', 'Method', 'Diam_for','Rmq', 'Long_pz', 'Diam_pz', 
      'Sect_crep','Societe', 'Resp_chantier']
df=col_ren(df, name=name, mode=1)
df.drop(index=[16,23], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.insert(5, 'Type', 'Piezo')
df.loc[16:21,'Type']='Piezair'

In [None]:
df.loc[9,'ID']='224 bis'

In [None]:
# Refusal flag and refusal type, both derived from the free-text remark column
df['Refus'] = ''
df['Type_refus'] = ''

# (pattern searched in the remark, label stored in 'Type_refus'); first match wins
refusal_kinds = (
    ('[lL]aitier', 'Laitier'),
    ('[Bb]éton', 'Béton'),
    ('[Mm]atériaux', 'Matériaux indurés'),
)
for row in range(len(df['Rmq'])):
    remark = str(df.loc[row, 'Rmq'])
    if not re.search('[Bb]loqué', remark):
        # no blocked drilling reported in the remark: flag stays empty
        df.loc[row, 'Refus'] = ''
        continue
    df.loc[row, 'Refus'] = 'x'
    for pattern, label in refusal_kinds:
        if re.search(pattern, remark):
            df.loc[row, 'Type_refus'] = label
            break

# 'Diam_pz' is split on 'x' once 'mm' is removed: the second part gives the
# internal diameter, the first part the external one; missing values are kept
df['Diam_int_pz'] = df['Diam_pz'].apply(
    lambda d: d if pd.isnull(d) else pd.to_numeric(d.replace('mm', '').split('x')[1]))
df['Diam_ext_pz'] = df['Diam_pz'].apply(
    lambda d: d if pd.isnull(d) else pd.to_numeric(d.replace('mm', '').split('x')[0]))
df['Diam_for'] = df['Diam_for'].apply(lambda d: d if pd.isnull(d) else pd.to_numeric(d))

# Move both diameter columns to fixed positions
df.insert(10, 'Diam_ext_pz', df.pop('Diam_ext_pz'))
df.insert(11, 'Diam_int_pz', df.pop('Diam_int_pz'))
# The raw columns are no longer needed
df.drop(columns=['Rmq', 'Diam_pz'], axis=1, inplace=True)
# 'ID!=ID' is True only for NaN: delete the rows without ID, then renumber
df.drop(df.query("ID!=ID").index, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Cast non-string, non-null identifiers to str; strings and missing values are kept as they are
id_cols = ['ID', 'ID_ech']
dtf = df  # same object as df: the conversion is applied to df itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
bh = df
source_bh = bh

In [None]:
# Create the output folder and its 'source_merge/' sub-folder.
# makedirs builds the intermediate tmp_dir too, and exist_ok=True makes sure
# 'source_merge/' is also created when tmp_dir already exists.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Equipement'**

In [None]:
tmp_dir= save_dir + 'donnees_terrain_2019/'
sheet='Equipement'

In [None]:
# Load the 'Equipement' sheet, skipping its first row
df = pd.read_excel(
    ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
               'Profils de sol et donnees de terrain 2019.xlsx',
    sheet_name='Equipement',
    skiprows=1,
)
# Project helpers (utils.io): NaN-based row clean-up first, then column clean-up
df = na_col_drop(na_line_drop(df, 0), 2)
# Strip the '<' / '>' signs, then turn strings ending with '-' into NaN
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.drop(columns=['Déplacement'], inplace=True)
name=['ID','Equip_top', 'Equip_base', 'Diam_for', 'Diam_ext_pz', 'Legende']
df=col_ren(df, mode=1, name=name)

In [None]:
df.drop(index=[24,25], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df['Type'] = 'Piezo'

In [None]:
# Cast non-string, non-null identifiers to str; strings and missing values are kept as they are
id_cols = ['ID', 'ID_ech']
dtf = df  # same object as df: the conversion is applied to df itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
eqp = df
source_eqp = eqp

In [None]:
# Create the output folder and its 'source_merge/' sub-folder.
# makedirs builds the intermediate tmp_dir too, and exist_ok=True makes sure
# 'source_merge/' is also created when tmp_dir already exists.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Piézométrie'**

In [None]:
tmp_dir= save_dir + 'donnees_terrain_2019/'
sheet='piezometrie'

In [None]:
# Load the 'Piézométrie' sheet, skipping its first row
df = pd.read_excel(
    ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
               'Profils de sol et donnees de terrain 2019.xlsx',
    sheet_name='Piézométrie',
    skiprows=1,
)
# Project helpers (utils.io): NaN-based row clean-up first, then column clean-up
df = na_col_drop(na_line_drop(df, 0), 2)
# Strip the '<' / '>' signs, then turn strings ending with '-' into NaN
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID','Niv_pz_sol', 'Type_ech', 'Date_mes']
df=col_ren(df, name=name, mode=1)

In [None]:
# Cast non-string, non-null identifiers to str; strings and missing values are kept as they are
id_cols = ['ID', 'ID_ech']
dtf = df  # same object as df: the conversion is applied to df itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
mes = df
source_mes = mes

In [None]:
# Create the output folder and its 'source_merge/' sub-folder.
# makedirs builds the intermediate tmp_dir too, and exist_ok=True makes sure
# 'source_merge/' is also created when tmp_dir already exists.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Variable initialisation: every source_* table starts as its own empty
# DataFrame. Separate copies are used so that an in-place change to one table
# can never leak into the others (they previously shared a single object).
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df.copy(), _df.copy(), _df.copy()
source_litho, source_an, source_mes = _df.copy(), _df.copy(), _df.copy()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 17-coordonnees extension pilote.xls
* **Sheet : 'échant sol'**

In [None]:
tmp_dir= save_dir + 'coord_ext_pilote/'
sheet='échant sol'

In [None]:
# Load the sheet named by `sheet` from its first row
df = pd.read_excel(
    ROOT_DIR + '/CF_data/Data_SITEREM/coordonnees extension pilote.xls',
    sheet_name=sheet,
    skiprows=0,
)
# Project helpers (utils.io): NaN-based row clean-up first, then column clean-up
df = na_col_drop(na_line_drop(df, 0), 2)
# Strip the '<' / '>' signs, then turn strings ending with '-' into NaN
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Standard coordinate headers applied through the project helper col_ren
name = ['ID','X','Y','Z']
df = col_ren(df, name=name, mode=1, )
# Keep rows at positions 3 to 18; .copy() gives an independent frame so that
# the assignments made afterwards do not write into a slice (SettingWithCopyWarning)
df = df[3:19].copy()
# Same drilling date for every row of this sheet
df['Date_for'] = dtm.datetime(2019,10,9)

In [None]:
df['Emplacement'] = 'Extension Pilote'

In [None]:
df.loc[9, 'ID'] = 219

In [None]:
# Cast non-string, non-null identifiers to str; strings and missing values are kept as they are
id_cols = ['ID', 'ID_ech']
dtf = df  # same object as df: the conversion is applied to df itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
bh = df
source_bh = bh

In [None]:
# Create the output folder and its 'source_merge/' sub-folder.
# makedirs builds the intermediate tmp_dir too, and exist_ok=True makes sure
# 'source_merge/' is also created when tmp_dir already exists.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'canne chauffe'**

In [None]:
tmp_dir= save_dir + 'coord_ext_pilote/'
sheet='canne chauffe'

In [None]:
# Load the sheet named by `sheet` from its first row
df = pd.read_excel(
    ROOT_DIR + '/CF_data/Data_SITEREM/coordonnees extension pilote.xls',
    sheet_name=sheet,
    skiprows=0,
)
# Project helpers (utils.io): NaN-based row clean-up first, then column clean-up
df = na_col_drop(na_line_drop(df, 0), 2)
# Strip the '<' / '>' signs, then turn strings ending with '-' into NaN
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Standard coordinate headers applied through the project helper col_ren
name = ['ID','X','Y','Z']
df = col_ren(df, name=name, mode=1, )
# Keep rows at positions 3 to 32; .copy() gives an independent frame so that
# the assignments made afterwards do not write into a slice (SettingWithCopyWarning)
df = df[3:33].copy()
# Same drilling date for every row of this sheet
df['Date_for'] = dtm.datetime(2019,10,9)

In [None]:
df['Emplacement'] = 'Extension Pilote'

In [None]:
# Cast non-string, non-null identifiers to str; strings and missing values are kept as they are
id_cols = ['ID', 'ID_ech']
dtf = df  # same object as df: the conversion is applied to df itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
bh = df

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Create the output folder and its 'source_merge/' sub-folder.
# makedirs builds the intermediate tmp_dir too, and exist_ok=True makes sure
# 'source_merge/' is also created when tmp_dir already exists.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Feuil1'**

In [None]:
tmp_dir= save_dir + 'coord_ext_pilote/'
sheet='Feuil1'

In [None]:
# Load the sheet named by `sheet` from its first row
df = pd.read_excel(
    ROOT_DIR + '/CF_data/Data_SITEREM/coordonnees extension pilote.xls',
    sheet_name=sheet,
    skiprows=0,
)
# Project helpers (utils.io): NaN-based row clean-up first, then column clean-up
df = na_col_drop(na_line_drop(df, 0), 2)
# Strip the '<' / '>' signs, then turn strings ending with '-' into NaN
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Standard coordinate headers applied through the project helper col_ren
name = ['ID','X','Y','Z']
df = col_ren(df, name=name, mode=1, )
# Skip the first three rows; .copy() gives an independent frame so that the
# assignments made afterwards do not write into a slice (SettingWithCopyWarning)
df = df[3:].copy()
# Same drilling date for every row of this sheet
df['Date_for'] = dtm.datetime(2019,10,9)

In [None]:
df['Emplacement'] = 'Extension Pilote'

In [None]:
# Give every row without ID a placeholder name 'BH_name_<k>', numbered from 1 in row order
k = 1
for i in df.index[df['ID'].isnull()]:
    df.loc[i, 'ID'] = f'BH_name_{k}'
    k += 1

In [None]:
# Cast non-string, non-null identifiers to str; strings and missing values are kept as they are
id_cols = ['ID', 'ID_ech']
dtf = df  # same object as df: the conversion is applied to df itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
bh = df

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['X','Y'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Run the project helper data_validation on the merged boreholes, passing every
# conflict row label under the 'ID_x' key of valid_dict
dataset = source_bh
conflict_rows = list(conflict_df.index)
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',
                valid_dict={'ID_x': conflict_rows})

# When a 'level_0' column is present it becomes the 'index' column,
# replacing any existing 'index' column
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [None]:
# Create the output folder and its 'source_merge/' sub-folder.
# makedirs builds the intermediate tmp_dir too, and exist_ok=True makes sure
# 'source_merge/' is also created when tmp_dir already exists.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')