# ORGANISATION DES DONNEES

In [None]:
import datetime as dtm
import os
import re
import shutil
from difflib import get_close_matches

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from definitions import ROOT_DIR
from utils.config import DEFAULT_POL_LEXICON, POL_NAMES_MODEL
from utils.io import (update_dict, gen_dated_id, dataframe_viewer, gen_geodf_geom, data_merger, data_validation,
                      data_slicer, replicate_values, collect_measure, collect_time_data, gen_id_from_ech,
                      na_col_drop, na_line_drop, col_ren, dble_col_drop, dict_viewer)

In [None]:
def compute_BH_length(df, id_col='ID', length_col_name='Long_for', top_col='Intv_top', base_col='Intv_base', verbose=False):
    """Add, in place, a per-ID total length column derived from interval tops and bases.

    For every distinct value of ``id_col`` the length is
    ``max(base_col) - min(top_col)`` over all rows sharing that ID (NaN depths
    are ignored). The result is written to every row of the group, in a new
    column inserted immediately after ``id_col``.

    Side effects on ``df`` (the function returns None):
      * ``top_col`` and ``base_col`` are converted to float64; values that
        cannot be parsed as numbers become NaN;
      * rows where both ``top_col`` and ``base_col`` are NaN are dropped;
      * the index is not reset.

    Parameters
    ----------
    df : pandas.DataFrame
        Table holding one row per interval.
    id_col : str
        Column identifying the object the intervals belong to.
    length_col_name : str
        Name of the column to create.
    top_col, base_col : str
        Columns holding the top and base depth of each interval.
    verbose : bool
        When True, print the per-ID minimum top and maximum base.

    Raises
    ------
    NameError
        If ``length_col_name`` already exists in ``df``.
    """
    if length_col_name in df.columns:
        raise NameError(f'{length_col_name} is already in columns. Give another name')

    # Coerce both depth columns to float64; anything unparsable (text, None, ...) becomes NaN.
    for depth_col in (top_col, base_col):
        df[depth_col] = pd.to_numeric(df[depth_col], errors='coerce').astype('float64')

    # Per-ID extrema. Series.max/min skip NaN, unlike the builtin max/min whose result
    # depends on where the NaN sits in the sequence. Grouping also works for any ID type
    # (str, Python or numpy numbers) and costs a single pass instead of one scan per ID.
    grouped = df.groupby(id_col, sort=False)
    deepest_base = grouped[base_col].max()
    shallowest_top = grouped[top_col].min()
    if verbose:
        print(pd.DataFrame({'min_top': shallowest_top, 'max_base': deepest_base}))

    # Broadcast the group result back to the rows; rows with a missing ID get NaN.
    lengths = df[id_col].map(deepest_base) - df[id_col].map(shallowest_top)
    df.insert(df.columns.to_list().index(id_col) + 1, length_col_name, lengths)

    # Rows without any depth information carry no interval: remove them.
    no_depth = df[top_col].isna() & df[base_col].isna()
    df.drop(index=df.index[no_depth], inplace=True)
    

### Création du répertoire de sauvegarde

In [None]:
save_dir = ROOT_DIR + '/CF_data/Result_traitem/organisation/'

In [None]:
# Start every run from an empty output directory.
# shutil.rmtree replaces the former `rm -fr` shell call: it is portable and does not
# pass the path through a shell (a path with spaces would have been word-split).
# ignore_errors=True keeps the "missing directory is fine" behaviour of `rm -f`.
shutil.rmtree(save_dir, ignore_errors=True)
os.makedirs(save_dir)

### Définition des variables usuelles

In [None]:
# Mapping from measurement labels (as spelled in the source files) to the short
# column names used in this notebook. Insertion order is kept as declared.
MEAS_NAMES_MODEL = dict([
    ('Fraction   2000 µm', 'Fract_2000µ'),
    ('Fraction   63 µm', 'Fract_63µ'),
    ('Fraction   45 µm', 'Fract_45µ'),
    ('Fraction   16 µm', 'Fract_16µ'),
    ('Fraction   2 µm', 'Fract_2µ'),
    ('Fraction 2 mm', 'Fract_2'),
    ('Fraction +2 mm', 'Fract_2+'),
    ('Fract_2', 'Fract_2'),
    ('Fract_2+', 'Fract_2+'),
    ('Mat. organique', 'MO'),
    ('Mat. sèche', 'MS'),
    ('Argile', 'Fract_arg'),
    ('Fraction argileuse', 'Fract_arg'),
])

In [None]:
# Extend the pollutant-name mapping imported from utils.config with the measurement-name mapping;
# on duplicate keys the MEAS_NAMES_MODEL entry wins. This rebinds the imported name in the notebook
# namespace only.
POL_NAMES_MODEL = {**POL_NAMES_MODEL, **MEAS_NAMES_MODEL}

In [None]:
# Keyword / regex fragments used to recognise parameters, sample-name affixes and IDs.
# Patterns containing backslashes are raw strings: '\d', '\s', '\w' and '\*' are invalid escape
# sequences in a normal string literal (DeprecationWarning, SyntaxWarning on recent Python).
# The resulting string values are unchanged.
params_kw = ['O_diss', 'Niv_eau', 'temp', '^T$', '^CE$', 'pH$', 'ORP']
meas_kw_col = ['O_diss', 'pH', 'CE', 'ORP', 'Niv_eau_pz', 'Niv_eau_sol', 'Temp']
sufx = ['sup', 'prof', 'inf', r'/\dM(\*)?']
prefx = ['eau forage ']
# Named group 'id': optional 'canne ' / 'Piezair ' prefix followed by a token containing at least one digit.
id_reg = r'\s*(?P<id>(?:^canne |Piezair )*\w*\d+\w*)\s*'
# De-duplicated union of the lexicon abbreviation keys and the mapped names from POL_NAMES_MODEL.
# Going through set() makes the order of the resulting list arbitrary.
pollutants_names = list(set(list(DEFAULT_POL_LEXICON.abbreviations.keys()) + list(POL_NAMES_MODEL.values())))

In [None]:
# Column names recognised for each kind of dataset (passed to data_slicer through cols_dict).
# Duplicates are removed with dict.fromkeys, which keeps the declaration order: the former
# list(set(...)) produced a different column order on every kernel start (string hash randomisation).

# boreholes / piezometers
bh_cols = list(dict.fromkeys([
    'Date_for', 'ID', 'X', 'Y', 'Z', 'Zsol', 'Type', 'Long_for', 'Long_pz', 'Sect_crep', 'Long_pz_sol', 'Ht_pz_sol',
    'Diam_for', 'Diam_int_pz', 'Diam_ext_pz', 'Ht_chbre', 'Refus', 'Societe', 'Zone', 'Sous_zone', 'Etude', 'Method',
    'Resp_chantier', 'Emplacement', 'ID_date', 'Rmq']))

# measurements
# 'sulfures_tot' and 'N_Kjdl' are now two entries: a missing comma used to fuse them into the
# single, non-existent name 'sulfures_totN_Kjdl'.
# NOTE(review): both 'pH_H2O' (letter O) and 'pH_H20' (digit 0) are listed; kept as is, confirm
# whether the second spelling really occurs in the source files.
mes_cols = list(dict.fromkeys([
    'Date_mes', 'ID', 'ID_ech', 'X', 'Y', 'Z', 'Zsol', 'pH_H2O', 'Temp_pH_H2O', 'Temp_pH_CaCl2', 'pH_CaCl2',
    'Temp_pH_KCl', 'pH_KCl', 'Residu_perte_feu', 'Fract_arg', 'Fract_min_2µ', 'Fract_min_50µ', 'Fract_min_2',
    'Temp_pH_mes', 'pH_H20', 'sulfures_tot', 'N_Kjdl', 'Temp_CE', 'Temp_pH', 'Nappe', 'Rmq', 'Fract_2000µ',
    'Fract_63µ', 'Fract_45µ', 'Fract_16µ', 'Fract_2µ', 'Temp_ech', 'Periode']
    + meas_kw_col + list(MEAS_NAMES_MODEL.values())))

# equipment
eqp_cols = list(dict.fromkeys([
    'Date_for', 'ID', 'X', 'Y', 'Z', 'Zsol', 'Type_equip', 'Equip_base', 'Equip_top', 'Rmq']))

# lithology
litho_cols = list(dict.fromkeys([
    'Date_for', 'ID', 'ID_ech', 'X', 'Y', 'Z', 'Zsol', 'Long_for', 'Litho_top', 'Litho_base', 'Intv_top', 'Intv_base',
    'Description', 'Rmq']))

# sample analyses
an_cols = list(dict.fromkeys([
    'ID', 'X', 'Y', 'Z', 'Zsol', 'Date_ech', 'ID_ech', 'Type_ech', 'Ech_top', 'Ech_base', 'Intv_top', 'Intv_base',
    'Description', 'Nappe', 'Organo', 'Intensite', 'Min_organo', 'Max_organo', 'Polluant',
    'Surnageant', 'Sousnageant', 'Caractere', 'Opacite', 'Rmq'] + pollutants_names))

# undetermined objects
ukw_cols = list(dict.fromkeys([
    'Date_for', 'ID', 'X', 'Y', 'Z', 'Zsol', 'Type', 'Long_for', 'Method', 'Societe', 'Rmq']))

cols_dict = {'borehole': bh_cols, 'measure': mes_cols, 'lithology': litho_cols, 'analysis': an_cols,
             'equipement': eqp_cols, 'unknown': ukw_cols}

In [None]:
# Per-dataset criterion columns, gathered in crit_dict and handed to data_slicer together with cols_dict.
# NOTE(review): presumably a row/column set is attributed to a dataset kind when some of these
# columns are present — confirm against utils.io.data_slicer.

# boreholes / piezometers
bh_crit = ['ID','X','Y','Z','Zsol','Type','Long_for','Long_pz','Diam_for','Diam_int_pz','Diam_ext_pz']

# measurements: identifiers, date and the measured-parameter columns
mes_crit = ['ID','ID_ech','Date_mes'] + meas_kw_col

# equipment
eqp_crit = ['Type_equip','Equip_base','Equip_top']

# lithology
litho_crit = ['Litho_top','Litho_base','Intv_top','Intv_base','Description']

# analyses: sample descriptors plus every abbreviation key of the pollutant lexicon
an_crit = ['ID_ech','Type_ech','Organo','Surnageant','Sousnageant'] + list(DEFAULT_POL_LEXICON.abbreviations.keys()) 

# undetermined objects
ukw_crit = ['ID','X','Y','Z','Zsol','Long_for','Type']

# same keys as cols_dict
crit_dict = {'borehole': bh_crit, 'measure': mes_crit, 'lithology': litho_crit, 'analysis': an_crit, 
 'equipement': eqp_crit, 'unknown': ukw_crit}

variables utilisées par jeu de données
================================
- bh 	: 	forages (simple ou piezo)
- equip	:	equipements d'un forage (outils, méthodes utilisés, ...)
- ukw	:	objets physiques indéterminés
- litho :	descriptions lithologiques
- an 	: 	analyses de contaminants sur des échantillons (sol, eau)
- mes	:	mesures de propriétés sur des échantillons (sol, eau), de paramètres hydrochimiques, ...


# ---------------------------------------------------------

In [None]:
# Initialise the per-source accumulators.
# Each name gets its OWN empty DataFrame: binding all six names to a single object meant that
# an in-place change made through one of them would have shown up in all the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
source_litho, source_an, source_mes = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 1- Profils sols et données forages.xls
* **Sheet : 'Données de forage'**

In [None]:
tmp_dir= save_dir + 'profils_sols_donnees_forages/'
sheet='donnees_forage'

In [None]:
# Load the 'Données de forage' sheet and normalise its text cells.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', 
                   sheet_name='Données de forage')#, skiprows=2)
# strip the '<' and '>' characters from string cells
df.replace(r'<|>','', inplace=True, regex=True)
# turn cells matching '-$' into NaN
# NOTE(review): with a non-string replacement pandas appears to swap the WHOLE cell whenever the
# pattern matches, so any text merely ending in '-' would become NaN — confirm '^-$' was not intended.
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.rename(columns={'Date':'Date_for','Profondeur':'Long_for', 'Méthode':'Method', 
                        'Diamètre forage':'Diam_for','Niv. Eau p/r sol':'Niv_eau_sol',
                        'PZ Prof.':'Long_pz', 'PZ Diamètre':'Diam_pz','PZ L.crépinée':'Sect_crep', 
                        'Société forage':'Societe', 'Resp. chantier':'Resp_chantier'}, inplace=True)

In [None]:
# A row with a piezometer depth ('Long_pz') is a piezometer, otherwise a plain borehole.
df['Type'] = df['Long_pz'].apply(lambda x: 'Forage' if pd.isnull(x) else 'Piezo')
df['Refus'] = ''

# Derive the refusal cause from the free-text remark.
# The loop addresses rows by label 0..n-1, so it relies on the default RangeIndex from read_excel.
# Rows mentioning 'bloqué' without any of the three known causes keep the initial '' value;
# rows not mentioning 'bloqué' get NaN.
for i in range(len(df['Remarque'])):
    val = str(df.loc[i,'Remarque'])
    if re.search('[Bb]loqué', val) :        
        if re.search('[lL]aitier', val):
            df.loc[i,'Refus'] = 'Laitier'
        elif re.search('[Bb]éton', val):
            df.loc[i,'Refus'] = 'Béton'
        elif re.search('[Mm]atériaux', val):
            df.loc[i,'Refus'] = 'Matériaux indurés' 
    else: 
        df.loc[i,'Refus'] = np.nan

# convert diameter values unit from mm to m
# 'Diam_pz' is split on 'x': the first part feeds Diam_ext_pz, the second part Diam_int_pz.
df['Diam_int_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace(' mm','').split('x')[1].strip(' m'))/1000 
                                        if not pd.isnull(x) else x)
df['Diam_ext_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace(' mm','').split('x')[0].strip(' m'))/1000 
                                        if not pd.isnull(x) else x)
df['Diam_for'] = df['Diam_for'].apply(lambda x: pd.to_numeric(x)/1000 if not pd.isnull(x) else x)

df.insert(7, 'Diam_ext_pz', df.pop('Diam_ext_pz')) # move to a specified position
df.insert(8, 'Diam_int_pz', df.pop('Diam_int_pz'))
df.drop(columns=['Remarque', 'Diam_pz'], axis=1, inplace=True)
df.drop(df.query("ID!=ID").index, inplace=True) # delete all ID='NaN' lines
# the drilling date is reused as the measurement date for this sheet
df['Date_mes'] = df['Date_for']

In [None]:
# Convert the date columns that exist to datetime64[ns].
# pd.to_datetime replaces astype('datetime64'): pandas >= 2.0 rejects the unit-less 'datetime64' dtype.
for date_col in ('Date_for', 'Date_mes'):
    if date_col in df.columns:
        df[date_col] = pd.to_datetime(df[date_col])

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Unpack the slices returned by data_slicer.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Rows sharing an index label with the 'unknown' slice are removed from the boreholes,
# then both tables keep a single row per ID.
bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# 'ID==ID and X==X' is false for NaN, so this keeps only boreholes with both an ID and an X coordinate
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# row count of each slice
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_mes = mes
source_bh = bh

In [None]:
# Create the sheet output folder and its 'source_merge/' subfolder (parents included).
# exist_ok=True makes the cell re-runnable and also creates 'source_merge/' when tmp_dir
# already exists, a case the former existence test skipped.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Piézométrie'**

In [None]:
tmp_dir= save_dir + 'profils_sols_donnees_forages/'
sheet='piezometrie'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Piézométrie', skiprows=1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
sdf = na_col_drop(df[:12], 3)
sdf.rename(columns={'z':'Z'}, inplace=True)

In [None]:
# Row 16 holds the real header: give every empty header cell a placeholder name 'col<position>'.
for pos, col_label in enumerate(df.columns):
    if pd.isnull(df.loc[16, col_label]):
        df.loc[16, col_label] = 'col' + str(pos)

In [None]:
# Keep a copy of the raw sheet the first time this cell runs, so that re-running the cell
# always restarts from the same frame.
# NOTE(review): tmp_df is only created when the name is absent from the namespace; re-reading
# the sheet in an earlier cell does not refresh it within the same kernel session.
if not 'tmp_df' in vars().keys():
    tmp_df = df.copy()
    
df = tmp_df.copy()
# replace remaining nulls in row 16 by '' and use that row as the column header
df.loc[16]=df.loc[16].apply(lambda x : x if not pd.isnull(x) else '')
df.columns = df.loc[16]

In [None]:
df=df[17:]
df.reset_index(inplace=True, drop=True)

#df.drop(columns=[df.columns.to_list()[x] for x in range(0,8)
#                      if re.compile(r"col|unnamed").match(df.columns.to_list()[x])], axis=1, inplace=True) 

In [None]:
df.rename(columns={'col8':'Date_mes', 'col9':'Nappe', 'col10':'ID', 'NP/piézo [m]':'Niv_eau_pz', 
                        'dim. piezo hors sol [m]':'Ht_pz_sol', 'NP/sol [m]':'Niv_eau_sol', 
                        'Prof. piézo/piézo [m]':'Long_pz', 'Prof. piézo/sol [m]':'Long_pz_sol', 
                        't° [°C]':'Temp', 'Observations':'Rmq'}, inplace=True)

In [None]:
df = na_col_drop(df, 3)
df.reset_index(drop=True, inplace=True)

In [None]:
# Single conductivity column in mS/cm: take the mS/cm value when present, else the µS/cm value / 1000.
# Row values are read by column label: positional access (x[0], x[1]) on a label-indexed Series is
# deprecated in recent pandas.
df['CE'] = df[['CE [µS/cm]', 'CE [mS/cm]']].apply(
    lambda row: row['CE [µS/cm]']/1000 if pd.isnull(row['CE [mS/cm]']) else row['CE [mS/cm]'], axis=1) # mS/cm
df.drop(columns=['CE [µS/cm]', 'CE [mS/cm]'], inplace=True)
# every 'P' in the identifier is replaced by 'F'
df['ID'] = df['ID'].apply(lambda x: re.sub('P','F',x) if not pd.isnull(x) else x)
df.insert(0, 'ID', df.pop('ID')) # move to first column
df['Type'] = 'Piezo'

In [None]:
df.rename_axis(None, inplace=True, axis=1)
df.drop(df.query("ID!=ID").index, inplace=True) # supprimer les lignes avec ID='NaN'
df.reset_index(inplace=True, drop=True)

In [None]:
# Convert the date columns that exist to datetime64[ns].
# pd.to_datetime replaces astype('datetime64'): pandas >= 2.0 rejects the unit-less 'datetime64' dtype.
for date_col in ('Date_for', 'Date_mes'):
    if date_col in df.columns:
        df[date_col] = pd.to_datetime(df[date_col])

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

##### Data merging

In [None]:
bh, conflict_df = data_merger(bh, sdf[['ID', 'Z']], how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
mdf, conflict_df = data_merger(source_bh, bh, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = mdf
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Long_pz_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
source_bh = dataset.copy()

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
cols_rep = ['X', 'Y', 'Z']
source_bh = replicate_values(source_bh, id_col='ID', cols_to_replicate=cols_rep, suffix=['sup', 'inf'], replace_id=True)
source_mes = replicate_values(source_mes, id_col='ID', cols_to_replicate=cols_rep, suffix=['sup', 'inf'], replace_id=False)

In [None]:
# Create the sheet output folder and its 'source_merge/' subfolder (parents included).
# exist_ok=True makes the cell re-runnable and also creates 'source_merge/' when tmp_dir
# already exists, a case the former existence test skipped.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Equipement'**

In [None]:
tmp_dir= save_dir + 'profils_sols_donnees_forages/'
sheet='Equipement'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', 
                   sheet_name='Equipement')#, skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.drop(columns=['Déplacement'], inplace=True)
name=['ID', 'Equip_top', 'Equip_base', 'Diam_for','Diam_int_pz', 'Type_equip']
df=col_ren(df, mode=1, name=name)

In [None]:
compute_BH_length(df, id_col='ID', length_col_name='Long_pz', top_col='Equip_top', base_col='Equip_base')

In [None]:
df['Diam_for'] = df['Diam_for'].apply(lambda x: pd.to_numeric(x)/1000 if not pd.isnull(x) else x)
df['Diam_int_pz'] = df['Diam_int_pz'].apply(lambda x: pd.to_numeric(x)/1000 if not pd.isnull(x) else x)

In [None]:
bh_ = source_bh[['ID', 'X', 'Y', 'Z']]
df, conflict_df = data_merger(bh_, df, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
df = na_line_drop(df, 3, 2)

In [None]:
# Convert the drilling date, when present, to datetime64[ns].
# pd.to_datetime replaces astype('datetime64'): pandas >= 2.0 rejects the unit-less 'datetime64' dtype.
if 'Date_for' in df.columns:
    df['Date_for'] = pd.to_datetime(df['Date_for'])

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

##### Data merging

In [None]:
mdf, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Long_pz_x':list(conflict_df.index), 'Diam_for_y':list(conflict_df.index), 
                           'Diam_int_pz_y':list(conflict_df.index)})

In [None]:
dataset = mdf
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
source_bh = mdf.copy()

In [None]:
data = source_bh
source_bh = replicate_values(data, 'ID', list(data.columns)).drop_duplicates(list(data.columns))
source_bh.reset_index(drop=True, inplace=True)

In [None]:
source_eqp = eqp

In [None]:
# Create the sheet output folder and its 'source_merge/' subfolder (parents included).
# exist_ok=True makes the cell re-runnable and also creates 'source_merge/' when tmp_dir
# already exists, a case the former existence test skipped.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheets: 'Echantillon' + 'Organoleptique'**

In [None]:
tmp_dir= save_dir + 'profils_sols_donnees_forages/'
sheet='Echant-organo'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Echantillon')#, skiprows=1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.rename(columns={'De':'Ech_top', 'A':'Ech_base', 'Numéro':'ID_ech'}, inplace=True)

In [None]:
# df, conflict_df = data_merger(df, sdf, 'outer', ['ID', 'Ech_top', 'Ech_base'])
df['Type_ech']='Sol'

In [None]:
# Convert the date columns that exist to datetime64[ns].
# pd.to_datetime replaces astype('datetime64'): pandas >= 2.0 rejects the unit-less 'datetime64' dtype.
for date_col in ('Date_for', 'Date_mes'):
    if date_col in df.columns:
        df[date_col] = pd.to_datetime(df[date_col])

##### Data merging

In [None]:
bh_ = source_bh[['ID', 'X', 'Y', 'Z']]
df, conflict_df = data_merger(bh_, df, how='inner', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_an = an

In [None]:
# Create the sheet output folder and its 'source_merge/' subfolder (parents included).
# exist_ok=True makes the cell re-runnable and also creates 'source_merge/' when tmp_dir
# already exists, a case the former existence test skipped.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Log'**

In [None]:
tmp_dir= save_dir + 'profils_sols_donnees_forages/'
sheet='Log'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Log')#, skiprows=1)
dataframe_viewer(df, rows=5)

In [None]:
df.rename(columns={'De':'Litho_top', 'A':'Litho_base'}, inplace=True)

In [None]:
compute_BH_length(df, id_col='ID', length_col_name='Long_for', top_col='Litho_top', base_col='Litho_base')

In [None]:
bh_ = source_bh[['ID', 'X', 'Y', 'Z','Long_for']]
df, conflict_df = data_merger(bh_, df, how='inner', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
data_validation(overall_data=df, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Long_for_x':list(conflict_df.index)})

In [None]:
dataset = df
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
dataframe_viewer(df, rows=10, un_val=['ID','ID_ech'])

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh, conflict_df = data_merger(bh, source_bh, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
source_litho=litho

In [None]:
# Create the sheet output folder and its 'source_merge/' subfolder (parents included).
# exist_ok=True makes the cell re-runnable and also creates 'source_merge/' when tmp_dir
# already exists, a case the former existence test skipped.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

### $\color{red}{\textbf{Excel data final merge}}$

In [None]:
bh_coords = source_bh[['ID', 'X', 'Y', 'Z','Date_for']].copy()

In [None]:
source_eqp, conflict_df = data_merger(bh_coords, source_eqp, how='inner', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
source_litho, conflict_df = data_merger(bh_coords, source_litho, how='inner', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
source_an, conflict_df = data_merger(bh_coords, source_an, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
# Create the output folder and its 'source_merge/' subfolder (parents included).
# exist_ok=True makes the cell re-runnable and also creates 'source_merge/' when tmp_dir
# already exists, a case the former existence test skipped.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Re-initialise the per-source accumulators before processing the next workbook.
# Each name gets its OWN empty DataFrame: binding all six names to a single object meant that
# an in-place change made through one of them would have shown up in all the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
source_litho, source_an, source_mes = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 2-Database MEMORIS3.xlsx
* **Sheet : 'PROFILS_SOL'**

In [None]:
tmp_dir= save_dir + 'database_Memoris3/'
sheet='Profils_sol'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. '+
                        'Siterem - 2017/Database MEMORIS3.xlsx', sheet_name='PROFILS_SOL')#, skiprows=2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=3)

In [None]:
df = na_col_drop(df, 3)

In [None]:
df.rename({'Date':'Date_for', 'N°':'Ref', 'Id':'idx', 'Piézo':'Type', 'Unnamed: 6':'Societe',
                'MFT Ø145':'MFT_145', 'Gouge Ø75':'Gouge_75', 'Liner Ø60': 'Liner_60'}, axis=1, inplace=True)

In [None]:
print(list(set(df['Date_for'].apply(lambda x: x.year if not pd.isnull(x) else x))))

In [None]:
df.loc[df.fillna('').query("Societe.str.contains('x|X')").index, 'Type']='X'

In [None]:
df.loc[df.fillna('').query("Gouge_75.str.contains('SBS|SITER')").index, 'Societe']='SBS Environnement'
df.loc[df.fillna('').query("Gouge_75.str.contains('SBS|SITER')").index, 'Gouge_75']=''

In [None]:
# Propagate header values down to the rows below them.
# Rows are visited in order, so a value copied into row i+1 is itself copied further down on the
# next iteration: 'Date_for' and 'Societe' are effectively forward-filled.
# Rows are addressed by label 0..n-1, so the frame must carry a default RangeIndex here.
for i in range(len(df['Date_for'])-1):
    if not pd.isnull(df.loc[i, 'Date_for']) and pd.isnull(df.loc[i+1, 'Date_for']):
        df.loc[i+1, 'Date_for']=df.loc[i, 'Date_for']
        
    if not pd.isnull(df.loc[i, 'Societe']) and pd.isnull(df.loc[i+1, 'Societe']):
        df.loc[i+1, 'Societe']=df.loc[i, 'Societe']
        
    # 'Type' is only propagated between consecutive rows sharing the same 'Ref'
    if not pd.isnull(df.loc[i, 'Type']) and pd.isnull(df.loc[i+1, 'Type']) and \
       df.loc[i, 'Ref']==df.loc[i+1, 'Ref']:
        df.loc[i+1, 'Type']=df.loc[i, 'Type']

In [None]:
# Build the object identifier 'idx' = <prefix letter> + <Ref> for the rows of each object.
# A 'Profondeur' cell containing 'Forage' or 'Tranch' marks the start of an object; its first
# character becomes the current prefix, which is reused for the following rows of the same 'Ref'.
# The two former if/elif pairs produced exactly this result but read `w` before it was
# necessarily assigned (NameError when no marker row had been met yet, or a stale value left
# over from a previous run of the cell).
w = None  # current prefix letter; None until a marker row is met
for i in range(len(df['idx'])-1):
    depth_txt = df.loc[i, 'Profondeur']
    if not (df.loc[i,'Ref']==df.loc[i+1,'Ref']) or pd.isnull(depth_txt):
        continue
    if re.findall('Forage|Tranch', depth_txt):
        w = depth_txt[0]
    if w is not None:
        df.loc[i+1,'idx'] = w + str(df.loc[i, 'Ref'])
     
   # if df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur'])\
   # and re.findall('Moni',df.loc[i, 'Profondeur']):
   #     df.loc[i+1,'idx']=df.loc[i, 'Profondeur'][0]+str(df.loc[i, 'Ref'])
   #     w=df.loc[i, 'Profondeur'][0]
   # elif df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur']):
   #     df.loc[i+1,'idx']=w+str(df.loc[i, 'Ref'])

In [None]:
# NOTE(review): the second statement is computed from df['idx'] again, so it overwrites 'Ref'
# and discards the result of the first statement entirely. As written, 'Ref' ends up as 'idx'
# with 'Monito ' replaced by 'Mon' in values containing 'Monit'. Confirm whether the second
# line was meant to start from df['Ref'].
df['Ref']=df['idx'].apply(lambda x : x if re.findall('F|T', str(x)) else '')
df['Ref']=df['idx'].apply(lambda x : x.replace('Monito ', 'Mon') if re.findall('Monit', str(x)) else x)

In [None]:
df['Type']=df['Type'].apply(lambda x: 'Piezo' if not pd.isnull(x) else '')

In [None]:
# Replace the numeric sub-reference '.1' … '.4' by the letters 'a' … 'd' (e.g. 'F12.1' -> 'F12a').
# Substitutions are applied one after the other, in the same order as before; values without a
# match are left untouched (not converted to str). Patterns are raw strings: '\.' is an invalid
# escape sequence in a normal string literal.
for digit, letter in (('1', 'a'), ('2', 'b'), ('3', 'c'), ('4', 'd')):
    pattern = r"\." + digit
    df['Ref'] = df['Ref'].apply(
        lambda x, pattern=pattern, letter=letter: re.sub(pattern, letter, str(x)) if re.search(pattern, str(x)) else x)

In [None]:
gen_dated_id(df, ref_col='Ref', date_col='Date_for')

In [None]:
df.loc[df.query('Profondeur!=Profondeur' ).index,'Profondeur']=''

In [None]:
df['Method']=''
            
# Set 'Method' from the drilling-tool columns: a non-null cell marks the tool as used.
# The checks run in sequence, so when several tool columns are filled the last one listed wins.
# NOTE(review): pd.isnull('') is False, so cells emptied to '' (as done for 'Gouge_75' in an
# earlier cell) still count as "used" — confirm this is intended.
# NOTE(review): the stored value 'carrotier' is spelled differently from the column name 'carottier'.
for i in range(len(df['Method'])):
    if not pd.isnull(df.loc[i, 'Gouge_75']) : df.loc[i, 'Method']='Gouge_75'
    if not pd.isnull(df.loc[i, 'MFT_145']) : df.loc[i, 'Method']='MFT_145'
    if not pd.isnull(df.loc[i, 'Liner_60']) : df.loc[i, 'Method']='Liner_60'
    if not pd.isnull(df.loc[i, 'carottier']) : df.loc[i, 'Method']='carrotier'
    if not pd.isnull(df.loc[i, 'tarrière']) : df.loc[i, 'Method']='tarrière'

In [None]:
df.drop(df.query('Profondeur.str.contains("Forage") and Profondeur!="Forage bloqué"', engine='python').index, inplace=True)
df.drop(df.query('Profondeur.str.contains("Tranc") and Profondeur!="Tranchée bloqué"', engine='python').index, inplace=True)
df.drop(df.query('Profondeur.str.contains(".orage|..ranch", regex=True)', engine='python').index, inplace=True)
df.drop(df.fillna('').query('Description.str.contains("^.orage bloq|^.ranc.* bloq|^.*efus", regex=True)', engine='python').index, inplace=True)
df.drop(df.query('Ref!=Ref').index, inplace=True)
df.drop(columns=['MFT_145','Gouge_75','Liner_60', 'carottier', 'tarrière', 'idx'], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df['Litho_top'] = df['Profondeur'].apply(lambda x: x.replace(',','.').split('-')[0].strip(' m'))
df['Litho_base'] = df['Profondeur'].apply(lambda x: x.replace(',','.').split('-')[-1].strip(' m'))

In [None]:
df.rename({'Ref':'ID'}, axis=1, inplace=True)
if 'Profondeur' in df.columns: df.drop(columns=['Profondeur'], axis=1, inplace=True)

In [None]:
set([x[0] for x in list(set(df.ID)) if isinstance(x,str)])

In [None]:
df.loc[df.query('ID_date.str.contains("T")', engine='python').index, 'Type'] = 'Tranchee'
df.loc[df.query('Type==""', engine='python').index, 'Type'] = 'Forage'

In [None]:
# Manual patch: row 1268 takes the 'ID_date' and 'ID' of row 1267.
# NOTE(review): these are hard-coded index labels; they only designate the intended rows as long
# as the source sheet and every earlier drop/reset_index stay unchanged.
df.loc[1268, ['ID_date','ID']] = df.loc[1267, ['ID_date','ID']]
# missing or empty descriptions are normalised to ''
df.loc[df.query('Description.isnull() or Description.str.len()<1').index, 'Description'] = ''

In [None]:
df.drop(index=df.query('Litho_base.isnull() or Litho_base.str.len()<1').index, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
compute_BH_length(df, id_col='ID', length_col_name='Long_for', top_col='Litho_top', base_col='Litho_base')

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
# Move the trenches ('Tranchee') out of the borehole table into the 'unknown' table.
# The trench index labels are captured BEFORE ukw is re-indexed: the former code dropped
# `ukw.index` from bh after ukw.reset_index(), i.e. labels 0..k-1, which removed the first k
# boreholes instead of the trench rows.
trench_idx = bh.query('Type=="Tranchee"', engine='python').index
ukw = bh.loc[trench_idx].copy()  # explicit copy: ukw is modified below; it keeps all bh columns, as before
ukw['Type'] = 'Inconnu'
ukw = ukw.drop_duplicates(['ID']).reset_index(drop=True)

bh = bh.drop(index=trench_idx).reset_index(drop=True)

In [None]:
source_litho = litho
source_bh = bh
source_ukw = ukw

In [None]:
# Create the sheet output folder and its 'source_merge/' subfolder (parents included).
# exist_ok=True makes the cell re-runnable and also creates 'source_merge/' when tmp_dir
# already exists, a case the former existence test skipped.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)
ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'DONNEES PIEZOS'**

In [None]:
tmp_dir= save_dir + 'database_Memoris3/'
sheet='Donnees_piezos'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. '+
                        'Siterem - 2017/Database MEMORIS3.xlsx', sheet_name='DONNEES PIEZOS', skiprows=2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=3)

In [None]:
names = ['Ref_id','ID','Societe','Zone','Sous_zone','X','Y','Zsol','Z','Nappe','Long_pz','Sect_crep',
         'Diam_int_pz','Niv_eau_pz_27/04/2010','Niv_eau_pz_08/09/2010','Niv_eau_sol_27/04/2010',
         'Niv_eau_sol_08/09/2010','Surnageant','Sousnageant','Caractere','Opacite','Rmq']
df = col_ren(df, mode=1, name=names)
df = na_col_drop(df, 3)

In [None]:
df=df.query("ID==ID")
df.replace('-',np.nan, inplace=True)

In [None]:
df['Sousnageant']=df['Sousnageant'].apply(lambda x: x/100 if not pd.isnull(x) else x) #convert unit in [m]
df['Surnageant']=df['Surnageant'].apply(lambda x: x/100 if not pd.isnull(x) else x)
df['Type']=df['Sect_crep'].apply(lambda x: 'Piezo' if not pd.isnull(x) else 'Inconnu')

In [None]:
df = df[['ID','X','Y','Z','Zsol','Type','Long_pz','Diam_int_pz','Sect_crep','Nappe','Societe','Zone','Sous_zone',
         'Niv_eau_pz_27/04/2010','Niv_eau_pz_08/09/2010','Niv_eau_sol_27/04/2010','Niv_eau_sol_08/09/2010',
         'Surnageant','Sousnageant','Caractere',
      'Opacite','Rmq']]

In [None]:
df = collect_time_data(df)

In [None]:
df = gen_id_from_ech(df, id_ech_col='ID', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

id_col = 'ID_ech'
if 'X' in df.columns: 
    df = df.query(f'{id_col}=={id_col} and X==X')
else:
    df = df.query(f'{id_col}=={id_col}')

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
an['Type_ech'] = 'Eau'
an = an.drop_duplicates('ID_ech').reset_index(drop=True)

##### Data merging

In [None]:
source_an = an

In [None]:
source_mes = mes

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_ukw, conflict_df = data_merger(source_ukw, ukw, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Create the output folders; exist_ok also covers an existing tmp_dir that lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)
ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'DRAINS ET PIEZOS ENEL'**

In [None]:
tmp_dir= save_dir + 'database_Memoris3/'
sheet='Drains_Pz_ENEL'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. Siterem - 2017/Database MEMORIS3.xlsx', 
                        sheet_name='DRAINS ET PIEZOS ENEL', skiprows=1)

df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=3)

In [None]:
df.insert(5, 'Z', df.pop('PZ absolue (m)'))
df.rename(columns={'N°':'ID', 'Date ':'Date_ech','Hauteur de la chambre ':'Ht_chbre','T':'Temp', 'ETUDE':'Etude',
                   'Niv_EAU_SOL (m)': 'Niv_eau_sol_01/10/2013', 'Niv_EAU_SOL (m).1':'Niv_eau_sol_14/12/2016', 
                   'Prof_PZ':'Long_pz','Section_crépinée':'Sect_crep', 'Diamètre_int':'Diam_int_pz', 'Odiss':'O_diss',
                   '\nC5-C8':'C5-C8'}, inplace=True)
df = df.query('ID==ID')

In [None]:
df = collect_time_data(df)

In [None]:
# CE is divided by 1000; entries that do not start with a digit, or that cannot be
# parsed as a number (e.g. decimal comma), become NaN instead of raising.
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x, errors='coerce')/1000 
                                  if not pd.isnull(x) and re.search(r'^\d+', str(x)) else np.nan) # -> in mS/cm

In [None]:
df.drop(index=df.query('ID.str.contains("nan", regex=True)', engine='python').index, inplace=True)

In [None]:
df = gen_id_from_ech(df, id_ech_col='ID', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
df = col_ren(df, name=POL_NAMES_MODEL, mode=1)

In [None]:
# Normalise column names: strip trailing whitespace / newlines, remove blanks inside
# 'A - B' style names, and discard columns whose cleaned name looks like 'name_<n>'.
# Renames and drops are keyed on the ORIGINAL label, because the cleaned label does
# not exist in the frame until the rename has been applied.
rename_map = {}
drop = []
for c in df.columns:
    c_mod = re.sub(r'\s+$|\n', '', c)
    if re.match(r'\s*\w+\s*-\s*\w+\s*', c_mod):
        c_mod = c_mod.replace(' ', '')
    if re.search(r'\w+_<\d*>', c_mod):
        drop.append(c)  # original label, still present in df
    elif c_mod != c:
        rename_map[c] = c_mod
data = df.drop(columns=drop).rename(columns=rename_map)
df = data.copy()

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
bh.insert(0, 'Type', 'Piezo')
an.insert(0, 'Type_ech', 'Eau')

##### Data merging

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID','Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Create the output folders; exist_ok also covers an existing tmp_dir that lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)
ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'RESULTS_EAU' (F)**

In [None]:
tmp_dir= save_dir + 'database_Memoris3/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. Siterem - 2017/Database MEMORIS3.xlsx', 
                        sheet_name='RESULTS_EAU', skiprows=1)

df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.rename(columns={'Campagne':'Societe','N_piezo.':'ID','Z tête PZ':'Z', 'Prof_PZ':'Long_pz',
                   'Niv_EAU_TETE (m)':'Niv_eau_pz_27/04/2010','Niv_EAU_SOL (m)':'Niv_eau_sol_27/04/2010',
                   'Unnamed: 13':'Niv_eau_pz_08/09/2010','Unnamed: 15':'Niv_eau_sol_08/09/2010','T':'Temp',
                   'Section_crépinée':'Sect_crep','Diamètre_int':'Diam_int_pz','Description éch.':'Opacite',
                   'Odiss':'O_diss','Remarques':'Rmq','Aquifère_échantillonné':'Nappe', 
                   'Caractéristique':'Caractere'}, inplace=True)

df=df.query("ID ==ID")
df.replace('-',np.nan, inplace=True)

In [None]:
df['Type']=df['Sect_crep'].apply(lambda x: 'Piezo' if not pd.isnull(x) else 'Inconnu')
df.insert(8, 'Type', df.pop('Type'))

In [None]:
# to express value in [m]
df['Surnageant']=df['Surnageant'].apply(lambda x: x/100)
df['Sousnageant']=df['Sousnageant'].apply(lambda x: x/100)
# CE is divided by 1000; entries that do not start with a digit, or that cannot be
# parsed as a number (e.g. decimal comma), become NaN instead of raising.
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x, errors='coerce')/1000 
                        if not pd.isnull(x) and re.search(r'^\d+', str(x)) else np.nan)

In [None]:
df = collect_time_data(df)

In [None]:
# Normalise column names: strip trailing whitespace / newlines, remove blanks inside
# 'A - B' style names, and discard columns whose cleaned name looks like 'name_<n>'.
# Columns to discard are dropped by their ORIGINAL label and are never renamed first,
# so the drop cannot fail on a label that has already been changed.
rename_map = {}
drop = []
for c in df.columns:
    c_mod = re.sub(r'\s+$|\n', '', c)
    if re.match(r'\s*\w+\s*-\s*\w+\s*', c_mod):
        c_mod = c_mod.replace(' ', '')
    if re.search(r'\w+_<\d*>', c_mod):
        drop.append(c)  # original label, still present in df
    elif c_mod != c:
        rename_map[c] = c_mod
data = df.drop(columns=drop).rename(columns=rename_map)

In [None]:
df = data.copy()

In [None]:
df = col_ren(df, name=POL_NAMES_MODEL, mode=1, cutoff=0.7)#, verbose=True)

In [None]:
df.rename(columns={'3,5+2,3-dimethylphénol+4-ethylphénol' : 'DMetPhn_4-EthPhn', 'chrome (VI)': 'Cr_VI',
                   '2,4+2,5-dichlorophénol' : '2.4_5-DCPhn', 'sulfites':'Sulfite'}, inplace=True)

In [None]:
df = gen_id_from_ech(df, id_ech_col='ID', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
# Remove fully duplicated sample rows and tag them as water samples.
# 'Eau' (capitalised) matches the label used for the other water-sample sheets,
# so the same sample does not end up with two spellings of Type_ech after merging.
data = an.drop_duplicates(list(an.columns)).reset_index(drop=True)
data['Type_ech'] = 'Eau'

In [None]:
an = data.copy()

##### data merging

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech', 'Z', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_ukw, conflict_df = data_merger(source_ukw, ukw, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Create the output folders; exist_ok also covers an existing tmp_dir that lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)
ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'RESULTS_SOL'**

In [None]:
tmp_dir= save_dir + 'database_Memoris3/'
sheet='Result_sol'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. Siterem - 2017/'
                   'Database MEMORIS3.xlsx', sheet_name='RESULTS_SOL', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df)

In [None]:
df.rename(columns={'Unnamed: 92':'EOX', 'Unnamed: 93':'Idc_phenol','Campagne':'Societe','N_forage':'ID','refus':'Refus',
                   'Prof.\nforage':'Long_for', 'N_ech':'ID_ech', 'Min_Ech':'Ech_top','Max_Ech':'Ech_base',
                   'Terrain':'Nappe','Epaisseur remblais':'Ep_remb', 'Epaisseur alluvions':'Ep_alluv',
                   'pH H2O':'pH_H2O','T° pH H2O':'Temp_pH_H2O','T° pH CaCl2':'Temp_pH_CaCl2','pH CaCl2':'pH_CaCl2', 
                   'T° pH KCl':'Temp_pH_KCl', 'pH KCl':'pH_KCl', 'T° CE':'Temp_CE', 'Argile ':'Argile', 
                   'Résidus chauffage':'Residu_perte_feu','Nature':'Polluant', 'Intensité':'Intensite',
                   'Libres':'CN_libre','Fraction   2000 µm':'Fract_2000µ','Fraction   63 µm':'Fract_63µ', 
                   'Fraction   45 µm':'Fract_45µ','Fraction   16 µm':'Fract_16µ','Fraction   2 µm':'Fract_2µ',
                   'Totaux':'CN_tot'
                  }, inplace=True)

In [None]:
df.drop(columns=[df.columns.to_list()[x] for x in range(len(df.columns))
                      if re.search(r"Unnamed",df.columns.to_list()[x])], axis=1, inplace=True) 
df.replace(r'<|>','', inplace=True, regex=True)
df=df.query('ID==ID')
df['ID']=df['ID'].apply(lambda x : x.replace('Monito ', 'Mon') if re.findall('Monit', str(x)) else x)
df['ID_ech']=df['ID_ech'].apply(lambda x : x.replace('Monito ', 'Mon') if re.findall('Monit', str(x)) else x)
df.replace('-',np.nan, inplace=True)
df.insert(5, 'Type', 'Piezo')
df.insert(6, 'Type_ech', 'Sol')

In [None]:
for i in df.index:
    #r=re.search('(\w+)/.+',str(df.loc[i, 'ID_ech']))
    #if r : df.loc[i, 'ID']=r.group(1)
    r=re.search('^\d+',str(df.loc[i, 'ID']))
    if r : df.loc[i, 'ID']='F'+str(df.loc[i, 'ID'])

In [None]:
df['Refus']=df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
df.replace('#',np.nan, inplace=True)

In [None]:
# Flag rows whose ID does not start with 'F' or 'Mo' as 'Inconnu', and normalise the
# aquifer label. Character classes are written [Rr] rather than [R|r]: inside [...]
# the '|' is a literal pipe, not an alternation.
for i in df.index:
    x=df.loc[i,'Nappe']
    if not re.search('^F|^Mo', str(df.loc[i,'ID'])) : df.loc[i,'Type']='Inconnu'
        
    if re.search('[Rr]em', str(x)) : df.loc[i,'Nappe']='Remblais'
    elif re.search('[Aa]ll', str(x)) : df.loc[i,'Nappe']='Alluvions'
    elif re.search('[Ss]oc', str(x)) : df.loc[i,'Nappe']='Socle'
    elif re.search('[Aa]rg', str(x)) : df.loc[i,'Nappe']='Argile'
    else : df.loc[i,'Nappe']=''  # anything unrecognised (including missing values) becomes ''

In [None]:
# NOTE(review): every row gets the same constant date string; '2050-01-01' looks like a
# placeholder for a missing measurement date — confirm the intended value.
df['Date_mes'] = '2050-01-01'
#df['Date_mes'] = df['Date_mes'].astype('datetime64')

In [None]:
df = col_ren(df, name=POL_NAMES_MODEL, mode=1, cutoff=0.7)#, verbose=True)

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

##### data merging

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = source_bh
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID',
                valid_dict={'Societe_x':list(conflict_df.index), 'Type_x':list(conflict_df.index),
                            'Long_for_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
source_bh = dataset.copy()

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID','Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = source_mes
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_mes = dataset.copy()

In [None]:
source_ukw, conflict_df = data_merger(source_ukw, ukw, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Create the output folders; exist_ok also covers an existing tmp_dir that lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

### $\color{red}{\textbf{Excel data final merge}}$

In [None]:
bh_coords = source_bh[['ID', 'X', 'Y', 'Z','Date_for']].copy()

In [None]:
source_an, conflict_df = data_merger(source_an, bh_coords, how='left', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
source_litho, conflict_df = data_merger(source_litho, bh_coords, how='left', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = source_litho
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID', 
                valid_dict={'Date_for_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_litho = dataset.copy()

In [None]:
source_mes, conflict_df = data_merger(source_mes, bh_coords, how='left', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_ukw, conflict_df = data_merger(source_ukw, bh_coords, how='left', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Create the output folders; exist_ok also covers an existing tmp_dir that lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Reset the accumulators before processing the next workbook.
# Each name gets its OWN empty DataFrame: binding all six to one object would make
# any in-place change to one of them show up in all the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (_df.copy() for _ in range(3))
source_litho, source_an, source_mes = (_df.copy() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 3-obsrevations terrain et mesures piézos phase 2.xlsx

* **Sheet : 'Piézométrie'**

In [None]:
tmp_dir= save_dir + 'observ_terrain/'
sheet='Piezometrie'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'obsrevations terrain et mesures piézos phase 2.xlsx', sheet_name='Piézométrie', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df)

In [None]:
sdf=df[df.columns.to_list()[:3]]
sdf=na_line_drop(sdf,0)
sdf.rename(columns={'Niveau \npiézométrique':'Niv_eau_sol', 'Commentaires ':'Date_ech'}, inplace=True)

In [None]:
sdf2=df.loc[:11, df.columns.to_list()[3:-1]]
sdf2.rename(columns={'Unnamed: 7':'Date_mes', 'Unnamed: 8':'Nappe', 'Unnamed: 9':'ID', 'NP/piézo [m]':'Niv_eau_pz',
       'dim. piezo hors sol [m]':'Ht_pz_sol', 'NP/sol [m]':'Niv_eau_sol', 'Prof. piézo/piézo [m]':'Long_pz',
       'Prof. piézo/sol [m]':'Long_pz_sol', 'CE [mS/cm]':'CE','t° [°C]':'Temp','O2 dissous\n[%]':'O_diss', 
        'Observations':'Rmq'}, 
           inplace=True)

In [None]:
# IDs written 'P..' are renamed 'F..'; non-string IDs (e.g. NaN) are left untouched
# instead of making re.sub raise.
sdf2['ID'] = sdf2['ID'].apply(lambda x: re.sub(r'^P','F', x) if isinstance(x, str) else x)

# Fill missing CE from the µS/cm column (divided by 1000). Rows are selected with a
# boolean mask rather than range(len(...)), so a non-contiguous index cannot raise KeyError.
need_ce = sdf2['CE'].isnull() & sdf2['CE [µS/cm]'].notnull()
sdf2.loc[need_ce, 'CE'] = sdf2.loc[need_ce, 'CE [µS/cm]']/1000

sdf2.drop(['CE [µS/cm]'], axis=1, inplace=True)

In [None]:
df=df.loc[14:, df.columns.to_list()[3:-1]]
df.rename(columns={'Unnamed: 7':'Date_mes', 'Unnamed: 8':'Nappe', 'Unnamed: 9':'ID', 'NP/piézo [m]':'Niv_eau_pz',
       'dim. piezo hors sol [m]':'Ht_pz_sol', 'NP/sol [m]':'Niv_eau_sol', 'Prof. piézo/piézo [m]':'Long_pz',
       'Prof. piézo/sol [m]':'Long_pz_sol', 'CE [mS/cm]':'CE','t° [°C]':'Temp','O2 dissous\n[%]':'O_diss', 
        'Observations':'Rmq'}, 
           inplace=True)
df.drop([19,20], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# IDs written 'P..' are renamed 'F..'; non-string IDs (e.g. NaN) are left untouched
# instead of making re.sub raise.
df['ID'] = df['ID'].apply(lambda x: re.sub(r'^P','F', x) if isinstance(x, str) else x)

# Fill missing CE from the µS/cm column (divided by 1000), selecting rows with a mask
# so the result does not depend on the index being 0..n-1.
need_ce = df['CE'].isnull() & df['CE [µS/cm]'].notnull()
df.loc[need_ce, 'CE'] = df.loc[need_ce, 'CE [µS/cm]']/1000
        
df.drop(['CE [µS/cm]', 'O_diss'], axis=1, inplace=True)

In [None]:
df, conflict_df=data_merger(sdf2, df, how='outer', on='ID')

In [None]:
df = na_col_drop(df, 5)
df['Type'] = 'Piezo'

In [None]:
dataframe_viewer(df, rows=3, un_val=['ID','ID_ech'])

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh = bh
source_mes = mes

In [None]:
# Create the output folders; exist_ok also covers an existing tmp_dir that lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Reset the accumulators before processing the next workbook.
# Each name gets its OWN empty DataFrame: binding all six to one object would make
# any in-place change to one of them show up in all the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (_df.copy() for _ in range(3))
source_litho, source_an, source_mes = (_df.copy() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 4-profondeur de contact campagne de forages octobre 2019.xlsx

* **Sheet : 'Feuil1'**

In [None]:
tmp_dir= save_dir + 'Prof_contact_sol_forage/'
sheet='Feuil1'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/profondeur de contact campagne de forages octobre 2019.xlsx', 
                   sheet_name='Feuil1', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df)

In [None]:
df.rename(columns={'n°forage ':'ID','profondeur(m)':'Long_for','x':'X', 'y':'Y', 'z':'Z'}, inplace=True)
df['Type']='Forage' # type is not defined clearly in data
# Prefix IDs with 'F'. Only a TRAILING '.0' (integer ID read as float) is removed;
# str.replace('.0', '') would also eat a '.0' inside the ID (e.g. '10.05' -> '105').
df['ID']=df['ID'].apply(lambda x: 'F'+re.sub(r'\.0$', '', str(x)))

bh=df

In [None]:
source_bh=bh

In [None]:
# Create the output folders; exist_ok also covers an existing tmp_dir that lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Reset the accumulators before processing the next workbook.
# Each name gets its OWN empty DataFrame: binding all six to one object would make
# any in-place change to one of them show up in all the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (_df.copy() for _ in range(3))
source_litho, source_an, source_mes = (_df.copy() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 5-Forages_Pilote_Decoupe.xlsx

* **Sheet : 'leve'**

In [None]:
tmp_dir= save_dir + 'Forage_Pilote/'
sheet='leve_Z_elect_pos'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/geometrie_electrodes_et_sondes/Forages_Pilote_Decoupe.xlsx', 
                   sheet_name='leve')#, skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.rename(columns={'Ref_puits':'ID','Niveau mesuré':'Z_mes', 'Niveau corrigé':'Z','Z_diff [m] repere_local':'Diff_Z_local',
                   'long_fin [m]':'Long_for','Pos_Inox_#1 [m]':'Pos_Inox_#1', 'Unnamed: 11':'Rmq',
                   'Pos_Inox_#6 [m]':'Pos_Inox_#6', 'Pos_Impol_#3 [m]':'Pos_Impol_#3'}, inplace=True)

In [None]:
df['Type']='Forage' # type is not defined clearly in data
# Prefix IDs with 'F'. Only a TRAILING '.0' (integer ID read as float) is removed;
# str.replace('.0', '') would also eat a '.0' inside the ID (e.g. '10.05' -> '105').
df['ID']=df['ID'].apply(lambda x: 'F'+re.sub(r'\.0$', '', str(x)))

elc = df[['ID','Pos_Inox_#6', 'Pos_Impol_#3']] # 'ID' is for boreholes
bh = df[['ID','Z','Diff_Z_local','Long_for', 'Type']]# Z_local origin = 145.5 [m]

In [None]:
dataframe_viewer(df, rows=3, un_val=['ID','ID_ech'])

In [None]:
source_bh = bh
source_elc = elc

In [None]:
# Create the output folders; exist_ok also covers an existing tmp_dir that lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
elc.to_csv(tmp_dir+sheet+'_Electrodes.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_elc.to_csv(tmp_dir+'source_merge/source_Electrodes.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_elect:{len(source_elc)} ;')

#### ======================================================================================

In [None]:
# Reset the accumulators before processing the next workbook.
# Each name gets its OWN empty DataFrame: binding all six to one object would make
# any in-place change to one of them show up in all the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (_df.copy() for _ in range(3))
source_litho, source_an, source_mes = (_df.copy() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 6-Liste XY investigations.xlsx
* **Sheets : 'SOL', 'EAU PR', 'EAU RB', 'EAU ALL'**

In [None]:
tmp_dir= save_dir + 'Liste_XY/'
sheet='Sol_Eau'

In [None]:
# The four sheets come from the same workbook, so the path is built once.
xy_file = (ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
           'supplémentaires/Liste XY investigations.xlsx')

# Soil samples
df = pd.read_excel(xy_file, sheet_name='SOL')#, skiprows=4)
df['Type_ech']='Sol'
df.rename(columns={'N°':'ID_ech'}, inplace=True)

# Water samples, one sheet per aquifer label
df1 = pd.read_excel(xy_file, sheet_name='EAU PR')#, skiprows=4)
df1['Type_ech']='Eau'
df1['Nappe']='Socle'
df1.rename(columns={'N°':'ID_ech'}, inplace=True)

df2 = pd.read_excel(xy_file, sheet_name='EAU RB')#, skiprows=4)
df2['Type_ech']='Eau'
df2['Nappe']='Remblais'  # capitalised like the other aquifer labels ('Socle', 'Alluvions')
df2.rename(columns={'N°':'ID_ech'}, inplace=True)

df3 = pd.read_excel(xy_file, sheet_name='EAU ALL')#, skiprows=4)
df3['Type_ech']='Eau'
df3['Nappe']='Alluvions'
df3.rename(columns={'N°':'ID_ech'}, inplace=True)

In [None]:
dataframe_viewer(df, rows=3, un_val=['ID','ID_ech'])

In [None]:
mdf, conflict_df=data_merger(df1, df, 'outer', 'ID_ech')

In [None]:
mdf, conflict_df=data_merger(mdf, df2, 'outer', 'ID_ech')

In [None]:
mdf, conflict_df=data_merger(mdf, df3, 'outer', 'ID_ech')

In [None]:
dataset = mdf
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID', 
                valid_dict={'Nappe_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
mdf = dataset.copy()

In [None]:
source_an = mdf
#source_an.insert(0,'ID', source_an.pop('ID_ech'))

In [None]:
source_an = gen_id_from_ech(source_an, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
# Create the output folder together with its 'source_merge/' sub-folder.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/'
# when tmp_dir itself already exists (the former guard skipped it in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Initialise the per-source accumulators.
# Each name gets its own empty DataFrame: they used to be six aliases of one
# shared object, so an in-place change to one would have shown up in all.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
source_litho, source_an, source_mes = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 7-Résultats phase 1_MEMORIS.xls
* **Sheet : 'Résult SOL'**

In [None]:
tmp_dir= save_dir + 'Phase_1_Memoris/'
sheet='Result_sol'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 1/'
                   'Résultats phase 1_MEMORIS.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:35]
# Take an independent copy: the next cell adds a row to `an`, which should not
# be done on a slice that may still be tied to `df` (SettingWithCopyWarning).
an=df.loc[36:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
ech_df=col_ren(ech_df, name=name, mode=1)

In [None]:
# Expand the one-letter description codes; any other value is left unchanged.
# Mapping the whole column at once replaces the former positional loop, which
# only worked while the index was exactly 0..n-1.
description_codes = {'R': 'Remblais', 'L': 'Limons', 'A': 'Argiles', 'S': 'Sables'}
ech_df['Description'] = ech_df['Description'].map(
    lambda code: description_codes.get(code, code))

# NOTE(review): this writes 'x' when the source cell does NOT contain x/X and
# '' when it does — confirm the inversion matches the meaning of the column.
ech_df['Refus']=ech_df['Refus'].apply(lambda x: 'x' if not re.search('x|X', str(x)) else '')
ech_df.insert(1,'Type_ech','Sol')

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={'col_0':'ID_ech', 'col_34':'phénanthrène', 'col_63':'EOX'}, inplace=True)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')

In [None]:
an = col_ren(an, name=POL_NAMES_MODEL, mode=1)#, verbose=True)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
an = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)
an['ID'] = an['ID'].apply(lambda x: x+'M')

In [None]:
df_dict = data_slicer(an, cols_dict, crit_dict)

In [None]:
dataframe_viewer(an, rows=10, un_val=['ID','ID_ech'])

In [None]:
source_an=an

In [None]:
# Create the output folder together with its 'source_merge/' sub-folder.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/'
# when tmp_dir itself already exists (the former guard skipped it in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Résult EAU'**

In [None]:
tmp_dir= save_dir + 'Phase_1_Memoris/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 1/'
                   'Résultats phase 1_MEMORIS.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)
dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:32]
# Take an independent copy: the next cell adds a row to `an`, which should not
# be done on a slice that may still be tied to `df` (SettingWithCopyWarning).
an=df.loc[33:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
# Divide CE by 1000 for cells whose text starts with a digit; every other cell
# becomes NaN. The pattern is a raw string because '\d' in a plain literal is
# an invalid escape sequence.
# NOTE(review): re-running this cell divides the values by 1000 again.
ech_df['CE'] = ech_df['CE'].apply(
    lambda x: pd.to_numeric(x) / 1000
    if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech','Date_ech','Num_maille','Affectation','X','Y','Zsol','Long_for','Prof_crep','Long_pz',
      'Niv_eau_sol','pH','CE','Temp']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df.insert(1,'Type_ech','Eau')

In [None]:
# Strip the square brackets, then split 'top-base' into the two new columns.
# The result is assigned back instead of using replace(inplace=True) on the
# column selection, which is not guaranteed to modify ech_df itself.
ech_df['Prof_crep'] = ech_df['Prof_crep'].replace(r'\[|\]', '', regex=True)
# Column-wise split: non-string cells (e.g. NaN) give NaN instead of raising
# as the former row-by-row loop did.
_crep_bounds = ech_df['Prof_crep'].str.split('-')
ech_df['Equip_top'] = _crep_bounds.str[0]
ech_df['Equip_base'] = _crep_bounds.str[1]

ech_df['Type_equip'] = 'Crepine'
ech_df.drop(columns=['Prof_crep'], inplace=True)

In [None]:
#ech_df['ID_ech'].replace('Canne ', 'Can', inplace=True, regex=True)
# Replace line breaks inside the sample IDs by a space. Assign the result back:
# replace(inplace=True) on a column selection may act on a temporary copy.
ech_df['ID_ech'] = ech_df['ID_ech'].replace('\n', ' ', regex=True)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech', 'col_43':'phénanthrène'}, inplace=True)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [None]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
df['Type'] = 'Piezo'
df['Date_mes'] = df['Date_ech']

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Unpack the tables returned by data_slicer and de-duplicate the borehole and
# unknown tables on 'ID'.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Drop from the boreholes the rows labelled by the index of `ukw`.
bh = bh.drop(index=ukw.index)
# In-place calls: they act on the very object taken from df_dict['unknown'].
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# `col==col` is False for NaN, so this keeps only rows where ID and X are both set.
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# Row count of every table, as a quick visual check.
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Type_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_bh = bh
source_mes = mes
source_eqp = eqp

In [None]:
# Create the output folder together with its 'source_merge/' sub-folder.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/'
# when tmp_dir itself already exists (the former guard skipped it in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Initialise the per-source accumulators.
# Each name gets its own empty DataFrame: they used to be six aliases of one
# shared object, so an in-place change to one would have shown up in all.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
source_litho, source_an, source_mes = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 8-Résultats phase 2_MEMORIS.xls
* **Sheet : 'Résult SOL'**

In [None]:
tmp_dir= save_dir + 'Phase_2_Memoris/'
sheet='Result_SOL'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'Résultats phase 2_MEMORIS.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)
dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:35]
# Take an independent copy: the next cell adds a row to `an`, which should not
# be done on a slice that may still be tied to `df` (SettingWithCopyWarning).
an=df.loc[36:].copy()

In [None]:
an.loc[0.5] = df.loc[1] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech', 'Date_ech', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
ech_df=col_ren(ech_df, name=name, mode=1)

In [None]:
set(ech_df['Description'])

In [None]:
# Expand the short description codes; any other value is left unchanged.
# Mapping the whole column at once replaces the former positional loop, which
# only worked while the index was exactly 0..n-1.
description_codes = {'R': 'Remblais', 'L': 'Limons',
                     'LA': 'Limons et argiles', 'LS': 'Limons et sables'}
ech_df['Description'] = ech_df['Description'].map(
    lambda code: description_codes.get(code, code))

# NOTE(review): this writes 'x' when the source cell does NOT contain '#' and
# '' when it does — confirm this matches the meaning of the source column.
ech_df['Refus']=ech_df['Refus'].apply(lambda x: 'x' if not re.search('#', str(x)) else '')
ech_df.insert(1,'Type_ech','Sol')

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={'col_0':'ID_ech', 'col_34':'phénanthrène'}, inplace=True)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
# Renumber the rows from 0 after dropping the first five, as done for the
# other sheets processed in this notebook (the reset was missing here only).
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
df['Type'] = 'Forage'

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Unpack the tables returned by data_slicer and de-duplicate the borehole and
# unknown tables on 'ID'.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Drop from the boreholes the rows labelled by the index of `ukw`.
bh = bh.drop(index=ukw.index)
# In-place calls: they act on the very object taken from df_dict['unknown'].
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# `col==col` is False for NaN, so this keeps only rows where ID and X are both set.
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# Row count of every table, as a quick visual check.
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh = bh
source_an = an

In [None]:
# Create the output folder together with its 'source_merge/' sub-folder.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/'
# when tmp_dir itself already exists (the former guard skipped it in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Résult EAU'**

In [None]:
tmp_dir= save_dir + 'Phase_2_Memoris/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'Résultats phase 2_MEMORIS.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:32]
# Take an independent copy: the next cell adds a row to `an`, which should not
# be done on a slice that may still be tied to `df` (SettingWithCopyWarning).
an=df.loc[33:].copy()

In [None]:
an.loc[0.5] = df.loc[1] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
# Divide CE by 1000 for cells whose text starts with a digit; every other cell
# becomes NaN. The pattern is a raw string because '\d' in a plain literal is
# an invalid escape sequence.
# NOTE(review): re-running this cell divides the values by 1000 again.
ech_df['CE'] = ech_df['CE'].apply(
    lambda x: pd.to_numeric(x) / 1000
    if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech', 'Date_ech','Affectation','X', 'Y','Zsol', 'Long_for','Prof_crep','Long_pz_sol',
      'Niv_eau_sol','pH', 'CE', 'Temp']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df.insert(1,'Type_ech','Eau')

In [None]:
# Replace line breaks inside the sample IDs by a space. Assign the result back:
# replace(inplace=True) on a column selection may act on a temporary copy.
ech_df['ID_ech'] = ech_df['ID_ech'].replace('\n', ' ', regex=True)

In [None]:
# Strip the square brackets, then split 'top-base' into the two new columns.
# The result is assigned back instead of using replace(inplace=True) on the
# column selection, which is not guaranteed to modify ech_df itself.
ech_df['Prof_crep'] = ech_df['Prof_crep'].replace(r'\[|\]', '', regex=True)
# Column-wise split: non-string cells (e.g. NaN) give NaN instead of raising
# as the former row-by-row loop did.
_crep_bounds = ech_df['Prof_crep'].str.split('-')
ech_df['Equip_top'] = _crep_bounds.str[0]
ech_df['Equip_base'] = _crep_bounds.str[1]

ech_df.drop(columns=['Prof_crep'], inplace=True)
ech_df['Type_equip'] = 'Crepine'
ech_df['Type']='Piezo'

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={'col_0':'ID_ech', 'col_43':'phénanthrène'}, inplace=True)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [None]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']

In [None]:
# Pass the ID capture pattern like the gen_id_from_ech calls made for the other
# sheets; it was left out for this sheet only.
# NOTE(review): confirm the omission was not deliberate for this workbook.
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Unpack the tables returned by data_slicer and de-duplicate the borehole and
# unknown tables on 'ID'.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Drop from the boreholes the rows labelled by the index of `ukw`.
bh = bh.drop(index=ukw.index)
# In-place calls: they act on the very object taken from df_dict['unknown'].
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# `col==col` is False for NaN, so this keeps only rows where ID and X are both set.
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# Row count of every table, as a quick visual check.
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Type_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_mes = mes
source_eqp = eqp

In [None]:
# Create the output folder together with its 'source_merge/' sub-folder.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/'
# when tmp_dir itself already exists (the former guard skipped it in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Initialise the per-source accumulators.
# Each name gets its own empty DataFrame: they used to be six aliases of one
# shared object, so an in-place change to one would have shown up in all.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
source_litho, source_an, source_mes = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 9-Ensemble des résultats Memoris version Seafile.xls
* **Sheet : 'Résult SOL'**

In [None]:
tmp_dir= save_dir + 'Memoris_seafile/'
sheet='Result_SOL'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Ensemble des résultats Memoris version Seafile.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:37]
# Take an independent copy: the next cell adds a row to `an`, which should not
# be done on a slice that may still be tied to `df` (SettingWithCopyWarning).
an=df.loc[38:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df.drop(columns=ech_df.columns[[-3,-4]], axis=1, inplace=True)

In [None]:
name=['ID_ech', 'Date_ech', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
ech_df=col_ren(ech_df, name=name, mode=1)

In [None]:
set(ech_df['Description'])

In [None]:
# Expand the short description codes ('R' with or without a trailing space
# both become 'Remblais'); any other value is left unchanged.
description_codes = {'R': 'Remblais', 'R ': 'Remblais', 'L': 'Limons',
                     'LA': 'Limons et argiles', 'LS': 'Limons et sables'}
ech_df['Description'] = ech_df['Description'].map(
    lambda code: description_codes.get(code, code))

# 'x' when the source cell does not contain '#', empty string otherwise.
refus_flags = ['x' if not re.search('#', str(cell)) else '' for cell in ech_df['Refus']]
ech_df['Refus'] = pd.Series(refus_flags, index=ech_df.index, dtype=object)
ech_df.insert(1,'Type_ech','Sol')

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)

In [None]:
an=dble_col_drop(an)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)

In [None]:
an.rename(columns={'col_0':'ID_ech', 'col_34':'phénanthrène'}, inplace=True)
an.insert(1,'Type_ech','Sol')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
mdf['Type'] = 'Forage'

In [None]:
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Unpack the tables returned by data_slicer and de-duplicate the borehole and
# unknown tables on 'ID'.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Drop from the boreholes the rows labelled by the index of `ukw`.
bh = bh.drop(index=ukw.index)
# In-place calls: they act on the very object taken from df_dict['unknown'].
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# `col==col` is False for NaN, so this keeps only rows where ID and X are both set.
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# Row count of every table, as a quick visual check.
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh = bh
source_an = an

In [None]:
# Create the output folder together with its 'source_merge/' sub-folder.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/'
# when tmp_dir itself already exists (the former guard skipped it in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Résult EAU'**

In [None]:
tmp_dir= save_dir + 'Memoris_seafile/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Ensemble des résultats Memoris version Seafile.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:32]
# Take an independent copy: the next cell adds a row to `an`, which should not
# be done on a slice that may still be tied to `df` (SettingWithCopyWarning).
an=df.loc[33:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
# Divide CE by 1000 for cells whose text starts with a digit; every other cell
# becomes NaN. The pattern is a raw string because '\d' in a plain literal is
# an invalid escape sequence.
# NOTE(review): re-running this cell divides the values by 1000 again.
ech_df['CE'] = ech_df['CE'].apply(
    lambda x: pd.to_numeric(x) / 1000
    if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
# Drop the third column. `columns=` already selects the axis; the former
# axis=2 is not a valid DataFrame axis and was only tolerated because it is
# not used when `columns=` is given.
ech_df.drop(columns=ech_df.columns[[2]], inplace=True)

In [None]:
name=['ID_ech', 'Date_ech','Affectation','X', 'Y','Zsol', 'Long_for','Prof_crep','Long_pz_sol', 
      'Niv_eau_sol','pH', 'CE', 'Temp']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df.insert(1,'Type_ech','Eau')

In [None]:
# Strip the square brackets, then split 'top-base' into the two new columns.
# The result is assigned back instead of using replace(inplace=True) on the
# column selection, which is not guaranteed to modify ech_df itself.
ech_df['Prof_crep'] = ech_df['Prof_crep'].replace(r'\[|\]', '', regex=True)
# Column-wise split: non-string cells (e.g. NaN) give NaN instead of raising
# as the former row-by-row loop did.
_crep_bounds = ech_df['Prof_crep'].str.split('-')
ech_df['Equip_top'] = _crep_bounds.str[0]
ech_df['Equip_base'] = _crep_bounds.str[1]

ech_df.drop(columns=['Prof_crep'], inplace=True)
ech_df['Type_equip'] = 'Crepine'

In [None]:
# Replace line breaks inside the sample IDs by a space. Assign the result back:
# replace(inplace=True) on a column selection may act on a temporary copy.
ech_df['ID_ech'] = ech_df['ID_ech'].replace('\n', ' ', regex=True)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech','Température pour mes. pH':'Temp_pH', 'col_43':'phénanthrène'}, inplace=True)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [None]:
an=dble_col_drop(an)

In [None]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
# Replace line breaks inside the sample IDs by a space. Assign the result back:
# replace(inplace=True) on a column selection may act on a temporary copy.
an['ID_ech'] = an['ID_ech'].replace('\n', ' ', regex=True)

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']

In [None]:
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
dataframe_viewer(df, rows=3, un_val=['ID','ID_ech'])

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Unpack the tables returned by data_slicer and de-duplicate the borehole and
# unknown tables on 'ID'.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Drop from the boreholes the rows labelled by the index of `ukw`.
bh = bh.drop(index=ukw.index)
# In-place calls: they act on the very object taken from df_dict['unknown'].
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# `col==col` is False for NaN, so this keeps only rows where ID and X are both set.
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# Row count of every table, as a quick visual check.
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Type_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_mes = mes
source_eqp = eqp

In [None]:
# Create the output folder together with its 'source_merge/' sub-folder.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/'
# when tmp_dir itself already exists (the former guard skipped it in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Initialise the per-source accumulators.
# Each name gets its own empty DataFrame: they used to be six aliases of one
# shared object, so an in-place change to one would have shown up in all.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
source_litho, source_an, source_mes = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 10-Résultats SOL container phyto t=0_décret sol.xls
* **Sheet : 'Résult SOL'**

In [None]:
tmp_dir= save_dir + 'Container_phyto/'
sheet='Result_SOL'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Résultats SOL container phyto t=0_décret sol.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:21]
# Take an independent copy: the next cell adds a row to `an`, which should not
# be done on a slice that may still be tied to `df` (SettingWithCopyWarning).
an=df.loc[22:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,2)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df.drop(columns=ech_df.columns[[-3]], axis=1, inplace=True)

In [None]:
name=['ID_ech', 'Ech_top', 'Ech_base','MS','Date_ech','Fract_2','Fract_2+']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df=ech_df.query('ID_ech==ID_ech')
ech_df.insert(1,'Type_ech','Sol')

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)

In [None]:
an=dble_col_drop(an)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.rename(columns={an.columns[0]:'ID_ech',  'col_35':'phénanthrène'}, inplace=True)
an.insert(1,'Type_ech','Sol')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

In [None]:
# Unpack the tables returned by data_slicer and de-duplicate the borehole and
# unknown tables on 'ID'.
# NOTE(review): unlike the other sheets, this section passes mdf to data_slicer
# without calling gen_id_from_ech first — confirm an 'ID' column exists here.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Drop from the boreholes the rows labelled by the index of `ukw`.
bh = bh.drop(index=ukw.index)
# In-place calls: they act on the very object taken from df_dict['unknown'].
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# `col==col` is False for NaN, so this keeps only rows where ID and X are both set.
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# Row count of every table, as a quick visual check.
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_an = an

In [None]:
# Create the output folder together with its 'source_merge/' sub-folder.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/'
# when tmp_dir itself already exists (the former guard skipped it in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Paramètres agro.'**

In [None]:
tmp_dir= save_dir + 'Container_phyto/'
sheet='Param_agro'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Résultats SOL container phyto t=0_décret sol.xls', sheet_name='Paramètres agro.', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df=df.transpose()
df.reset_index(drop=True, inplace=True)
df=col_ren(df, 0)

In [None]:
df.drop(list(range(1)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df=dble_col_drop(df)

In [None]:
df=na_col_drop(df,1)
df=na_line_drop(df,3)
df.reset_index(drop=True, inplace=True)

In [None]:
# Drop the columns at positions 5 and 6. `columns=` already names the axis; the former
# `axis=2` is not a valid DataFrame axis and only passed because pandas ignores `axis`
# when `columns` is given.
df.drop(columns=df.columns[[5,6]], inplace=True)

In [None]:
name=['ID_ech','Ech_top','Ech_base','MS','Date_ech','MO','Residu_perte_feu','COT','Fract_arg','Fract_min_2µ', 
      'Fract_min_50µ', 'Fract_min_2', 'Fract_2', 'Fract_2+', 'pH_KCl','Temp_pH_mes', 'pH_H20', 'Sulfure_tot', 
      'Chlorure', 'N_Kjdl']
df=col_ren(df, name=name, mode=1)
df.insert(1,'Type_ech','Sol')

In [None]:
#mdf['Type'] = 'Forage'
df['Date_mes'] = df['Date_ech']

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Work on the accumulated samples table (same object as source_an, not a copy).
dataset = source_an
# Resolve the merge conflicts: for every conflicting row the listed '*_x' columns are selected.
# NOTE(review): presumably '_x' is the value coming from the left (accumulated) frame and
# data_validation updates `dataset` in place -- confirm against utils.io.data_validation.
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID', 
                valid_dict={'Fract_2_x':list(conflict_df.index), 'Fract_2+_x':list(conflict_df.index),
                           'MS_x':list(conflict_df.index)})

# If a 'level_0' column is present, it replaces the 'index' column:
# drop the existing 'index' (if any) and rename 'level_0' to 'index'.
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_an = dataset

In [None]:
source_mes = mes

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/' when
# tmp_dir is already there (the former exists() guard skipped it in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Reset the per-workbook accumulators before processing the next source file.
# Every name gets its own empty DataFrame: binding all six to one shared object
# would let an in-place change on one accumulator show up in all the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (_df.copy() for _ in range(3))
source_litho, source_an, source_mes = (_df.copy() for _ in range(3))

# All counts are expected to be 0 at this point.
print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 11-Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx
* **Sheet : 'Résult EAU'**

In [None]:
tmp_dir= save_dir + 'Siterem_Ext_Pilote/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='Résult EAU', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Label-based split of the raw sheet (with .loc the slice end is inclusive):
# labels up to and including 31 go to ech_df; labels 0-3 plus 32..len(df)-1 go to `an`.
# NOTE(review): presumably sample metadata vs. analysis results, with rows 0-3 shared as
# identification rows; this also assumes a gap-free 0..n-1 index -- confirm.
ech_df=df.loc[:31]
an=df.loc[list(range(0,4))+list(range(32, len(df)))]

In [None]:
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,2)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech', 'Periode', 'Emplacement','Date_ech','pH','Temp_ech','Temp_pH']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df=ech_df.query('ID_ech==ID_ech')
ech_df.insert(1,'Type_ech','Eau')

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)

In [None]:
an.rename(columns={an.columns[0]:'ID_ech', 'col_35':'phénanthrène', 'Période ':'Periode', 
                   'Date de prélèvement':'Date_ech'}, inplace=True)

In [None]:
# rename in a first time before dropping columns (because of names like 'col_xx' in columns)
name=['ID_ech', 'Periode', 'Emplacement','Date_ech', 'METAUX LOURDS', 'Arsenic', 'Cadmium', 'Chrome', 'Cuivre',
      'Mercure', 'Plomb', 'Nickel', 'Zinc', 
      'CYANURES',"Cyanures (libres)  -  NEN-EN-ISO 14403", "CN_totaux - NEN-EN-ISO 14403", 
      "cyanure (APE)", "cyanure complex - méthode interne ", 
      "thiocyanate - méthode interne", "COMPOSES AROMATIQUES VOLATILS", "Benzène", "Toluène", "Éthylbenzène",
      "Orthoxylène", "Para- et métaxylène", "Xylènes", "Styrène", "PHENOLS", "Phénol", "Indice phénol", 
      "HYDROCARBURES AROMATIQUES POLYCYCLIQUES", "Naphtalène", "Acénaphtylène", "Acénaphtène", "Fluorène", 
      "Phénanthrène", "Anthracène", "Fluoranthène", "Pyrène", "Benzo(a)anthracène", "Chrysène", 
      "Benzo(b)fluoranthène", "Benzo(k)fluoranthène", "Benzo(a)pyrène", "Dibenzo(ah)anthracène", 
      "Benzo(ghi)pérylène", "Indéno(1,2,3-cd)pyrène", "HAP Totaux (16) - EPA", 
      "COMPOSES ORGANOCHLORES VOLATILS", "1,1-Dichloroéthane", "1,2-Dichloroéthane", 
      "1,1-dichloroéthène", "Cis-1,2-dichloroéthène", "Totaux (cis,trans) 1,2-dichloroéthènes", 
      "Trans 1,2-dichloroéthylène", "Dichlorométhane", "1,2-dichloropropane", "Tétrachloroéthylène ", 
      "Tétrachlorométhane", "1,1,1-Trichloroéthane", "1,1,2-Trichloroéthane", "Trichloroéthylène",
      "Chloroforme", "Chlorure de vinyle", "HYDROCARBURES TOTAUX", "fraction aromat. >C6-C7", 
      "fraction aromat. >C7-C8", "fraction aromat. >C8-C10", "fraction aliphat. C5-C6", "fraction aliphat. >C6-C8",
      "fraction aliphat. >C8-C10", "Fraction C5 - C8", "Fraction C8 - C10", "Fraction C10-C12", "Fraction C12-C16",
      "Fraction C16 - C21", "Fraction C21 - C35", "Hydrocarbures totaux C10-C35", "METHYL-TERT-BUTYL-ETHER", "MTBE",
     "a","b","c","d","e","f","g"]

an=col_ren(an, name=name, mode=1)
an=an.iloc[:,:-7]

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']
mdf['ID'] = mdf['ID_ech']

In [None]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: 
    bh = bh.query('ID==ID and X==X')
else:
    bh = bh.query('ID==ID')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_an = an
source_bh = bh
source_mes = mes

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/' when
# tmp_dir is already there (the former exists() guard skipped it in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Param physico'**

In [None]:
tmp_dir= save_dir + 'Siterem_Ext_Pilote/'
sheet='Param_physico'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='param. physico', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df=df.transpose()
df.reset_index(drop=True, inplace=True)

In [None]:
df=col_ren(df, 1)

In [None]:
df.drop(list(range(2)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
sdf=df.iloc[:,:33]
df=df.iloc[:,34:]

In [None]:
df=dble_col_drop(df)
sdf=dble_col_drop(sdf)

In [None]:
df=na_line_drop(df,1)
sdf=na_line_drop(sdf,1)

In [None]:
df=na_col_drop(df,1)
sdf=na_col_drop(sdf,1)

In [None]:
name=['ID_ech','Periode','Emplacement','Date_ech','Niv_eau_pz','Long_pz','Temp_ech','pH', 'CE', 'ORP','O_diss']
df=col_ren(df, mode=1, name=name)

In [None]:
# Discard the last column, then hand the canonical column names to col_ren.
sdf=sdf.iloc[:,:-1]
# 'Temp_ech' is written without the stray trailing space it used to carry, so it matches
# the 'Temp_ech' column of `df` and of the other sheets instead of forming a separate column.
name=['ID_ech','Periode','Emplacement','Date_ech','Niv_eau_pz','Niv_eau_chbre','pH','Niv_eau_sol','Long_pz',
      'Temp_ech','CE','ORP','O_diss']
sdf=col_ren(sdf, mode=1, name=name)

In [None]:
# Divide the conductivity by 1000 (presumably a unit conversion -- confirm which units).
# Missing values and values that do not start with a digit become NaN.
# The pattern is a raw string: '\d' inside a plain literal is an invalid escape sequence.
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000
                                  if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)
sdf['CE']=sdf['CE'].apply(lambda x: pd.to_numeric(x)/1000
                                  if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
# Line breaks inside 'Periode' become spaces; in every other column they are removed.
# The column is assigned back instead of using inplace=True on the selection: that chained
# form is deprecated and has no effect on `sdf` under pandas copy-on-write.
sdf['Periode'] = sdf['Periode'].replace('\n',' ', regex=True)
sdf.replace('\n','', regex=True, inplace=True)

In [None]:
# Normalise 'Emplacement' and move the '*' markers of the sample ids into a remark column.
data=[df, sdf]
for d in data:
    d['Rmq']=''
    # Walk the real index labels: range(len(d)) only matches them on a gap-free 0..n-1
    # index, and .loc raises KeyError as soon as a label is missing.
    for i in d.index:
        e=str(d.loc[i, 'Emplacement'])
        n=str(d.loc[i, 'ID_ech'])  # still carries the '*' markers tested below
        d.loc[i,'ID_ech']=n.replace('*', '')

        # Case-insensitive prefix test: 'S...' -> simulator, 'HZS...' -> outside simulator.
        if re.match('S',e, re.I):
            d.loc[i,'Emplacement']='Simulateur'
        elif re.match('HZS',e, re.I):
            d.loc[i,'Emplacement']='Hors simulateur'
        else:
            d.loc[i,'Emplacement']=np.nan

        # One trailing '*' and two trailing '**' after the digits map to different remarks.
        # Raw strings avoid the invalid '\d' / '\*' escapes of a plain literal.
        if re.match(r'\d+\*{1}$',n, re.I):
            d.loc[i,'Rmq']="mesures faites dans un seau (débit non continu ou peu de débit)"
        elif re.match(r'\d+\*{2}$',n, re.I):
            d.loc[i,'Rmq']="mésures faites dans une eau quasi-stagnante (Piezo rempli de sédiment et débit très faible)"

In [None]:
df.insert(1, 'Type_ech', 'Eau')
sdf.insert(1, 'Type_ech', 'Eau')

In [None]:
ech_df=data_merger(sdf, df, 'outer', 'ID_ech')[0]

In [None]:
# Clean the merged table. Both helpers used to receive `df`, which threw away the
# sdf/df merge held in ech_df as well as the result of na_col_drop.
# NOTE(review): downstream validation cells were tuned on the previous output -- re-check them.
ech_df=na_col_drop(ech_df,2)
ech_df=na_line_drop(ech_df,1)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
# Propagate the last known 'Emplacement' down to the rows where it is missing.
# ffill leaves leading missing rows untouched, whereas the former loop hit an unbound
# (or stale) `val` when the first row had no value.
ech_df['Emplacement'] = ech_df['Emplacement'].ffill()

In [None]:
mdf = ech_df

In [None]:
# Cast to nanosecond datetimes; pandas >= 2.0 rejects the unit-less 'datetime64' dtype.
mdf['Date_ech'] = mdf['Date_ech'].astype('datetime64[ns]')

In [None]:
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']
mdf['ID'] = mdf['ID_ech']

In [None]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: 
    bh = bh.query('ID==ID and X==X')
else:
    bh = bh.query('ID==ID')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
# conflict with 'emplacement' (ext_pilote | simulateur)
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = source_bh
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID', 
                valid_dict={'Emplacement_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_bh = dataset

In [None]:
# Cast to nanosecond datetimes; pandas >= 2.0 rejects the unit-less 'datetime64' dtype.
source_an['Date_ech'] = source_an['Date_ech'].astype('datetime64[ns]')

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_mes.info()

In [None]:
# Cast to nanosecond datetimes; pandas >= 2.0 rejects the unit-less 'datetime64' dtype.
source_mes['Date_mes'] = source_mes['Date_mes'].astype('datetime64[ns]')

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = source_mes
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID', 
                valid_dict={'Periode_y':list(conflict_df.index), 'pH_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_mes = dataset

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/' when
# tmp_dir is already there (the former exists() guard skipped it in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Inorganiques et composés majeurs'**

In [None]:
tmp_dir= save_dir + 'Siterem_Ext_Pilote/'
sheet='Inorganic_major'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='inorganiques et composés majeur', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:21]
an=df.loc[list(range(0,4))+list(range(22, len(df)))]

In [None]:
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(2)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,2)
ech_df=na_line_drop(ech_df,2)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech', 'Periode', 'Emplacement','Date_ech','Temp_ech']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df=ech_df.query('ID_ech==ID_ech')
ech_df.insert(1,'Type_ech','Eau')

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)

In [None]:
an=dble_col_drop(an)

In [None]:
an=na_col_drop(an,3)

In [None]:
an.rename(columns={an.columns[0]:'ID_ech','ammoniaque - libre':'ammoniaque libre','Période ':'Periode', 
                   'Date de prélèvement':'Date_ech', 'Emplacement ':'Emplacement'}, inplace=True)

In [None]:
an.drop(list(range(2)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
an = col_ren(an, name=POL_NAMES_MODEL, mode=1)#,verbose=True)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Cast to nanosecond datetimes; pandas >= 2.0 rejects the unit-less 'datetime64' dtype.
mdf['Date_ech'] = mdf['Date_ech'].astype('datetime64[ns]')

In [None]:
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']
mdf['ID'] = mdf['ID_ech']

In [None]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: 
    bh = bh.query('ID==ID and X==X')
else:
    bh = bh.query('ID==ID')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = source_mes
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID', 
                valid_dict={'Periode_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_mes = dataset

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/' when
# tmp_dir is already there (the former exists() guard skipped it in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Reset the per-workbook accumulators before processing the next source file.
# Every name gets its own empty DataFrame: binding all six to one shared object
# would let an in-place change on one accumulator show up in all the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (_df.copy() for _ in range(3))
source_litho, source_an, source_mes = (_df.copy() for _ in range(3))

# All counts are expected to be 0 at this point.
print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 12-Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx
* **Sheet : 'Résult EAU'**

In [None]:
tmp_dir= save_dir + 'Siterem_Pilote/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='Résult EAU', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:32]
an=df.loc[list(range(0,4))+list(range(33, len(df)))]

In [None]:
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(3)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,2)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df.columns

In [None]:
name=['ID_ech', 'Periode', 'Emplacement','Date_ech','Niv_eau_pz','Niv_eau_chbre','pH','Temp_ech','CE','ORP',
      'O_diss','col_29','Temp_pH']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df=ech_df.query('ID_ech==ID_ech')
ech_df.insert(1,'Type_ech','Eau')

In [None]:
# Remove the placeholder column that was only named so the rename list could line up.
ech_df.drop(columns=['col_29'], inplace=True)
# Divide the conductivity by 1000 (presumably a unit conversion -- confirm which units).
# Missing values and values that do not start with a digit become NaN.
# The pattern is a raw string: '\d' inside a plain literal is an invalid escape sequence.
ech_df['CE']=ech_df['CE'].apply(lambda x: pd.to_numeric(x)/1000
                                  if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
# Line breaks inside 'Periode' become spaces; in every other column they are removed.
# The column is assigned back instead of using inplace=True on the selection: that chained
# form is deprecated and has no effect on `ech_df` under pandas copy-on-write.
ech_df['Periode'] = ech_df['Periode'].replace('\n',' ', regex=True)
ech_df.replace('\n','', regex=True, inplace=True)

In [None]:
# Normalise the 'Emplacement' codes of the sample table.
data=[ech_df]
for d in data:
    # Walk the real index labels: ech_df was filtered with query() without a reset_index,
    # so range(len(d)) may name labels that no longer exist (KeyError with .loc).
    for i in d.index:
        e=str(d.loc[i, 'Emplacement'])
        # Case-insensitive prefix test: 'P...' -> pilot, 'HZP...' -> outside pilot.
        if re.match('P',e, re.I):
            d.loc[i,'Emplacement']='Pilote'
        elif re.match('HZP',e, re.I):
            d.loc[i,'Emplacement']='Hors Pilote'
        else:
            d.loc[i,'Emplacement']=np.nan

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)

In [None]:
an=dble_col_drop(an)

In [None]:
an.rename(columns={an.columns[0]:'ID_ech','Période ':'Periode', 'Emplacement \n- P : Pilote \n- HZP : Hors zone pilote':'Emplacement',
                  'Date de prélèvement':'Date_ech'}, inplace=True)

In [None]:
# rename in a first time before dropping columns (because of names like 'col_xx' in columns)
name=['ID_ech', 'Periode', 'Emplacement', 'Date_ech', 'METAUX LOURDS', 'Arsenic', 'Cadmium', 'Chrome', 'Cuivre','Mercure', 'Plomb', 'Nickel', 'Zinc', 
      'CYANURES',"Cyanures (libres)  -  NEN-EN-ISO 14403", "CN_totaux - NEN-EN-ISO 14403", 
      "cyanure (APE)", "cyanure complex", 
      "thiocyanate - méthode interne", "COMPOSES AROMATIQUES VOLATILS", "Benzène", "Toluène", "Éthylbenzène",
      "Orthoxylène", "Para- et métaxylène", "Xylènes", "Styrène", "PHENOLS", "Phénol", "Indice phénol", 
      "HYDROCARBURES AROMATIQUES POLYCYCLIQUES", "Naphtalène", "Acénaphtylène", "Acénaphtène", "Fluorène", 
      "Phénanthrène", "Anthracène", "Fluoranthène", "Pyrène", "Benzo(a)anthracène", "Chrysène", 
      "Benzo(b)fluoranthène", "Benzo(k)fluoranthène", "Benzo(a)pyrène", "Dibenzo(ah)anthracène", 
      "Benzo(ghi)pérylène", "Indéno(1,2,3-cd)pyrène", "HAP Totaux (16) - EPA", 
      "COMPOSES ORGANOCHLORES VOLATILS", "1,1-Dichloroéthane", "1,2-Dichloroéthane", 
      "1,1-dichloroéthène", "Cis-1,2-dichloroéthène", "Totaux (cis,trans) 1,2-dichloroéthènes", 
      "Trans 1,2-dichloroéthylène", "Dichlorométhane", "1,2-dichloropropane", "Tétrachloroéthylène ", 
      "Tétrachlorométhane", "1,1,1-Trichloroéthane", "1,1,2-Trichloroéthane", "Trichloroéthylène",
      "Chloroforme", "Chlorure de vinyle", "HYDROCARBURES TOTAUX", "fraction aromat. >C6-C7", 
      "fraction aromat. >C7-C8", "fraction aromat. >C8-C10", "fraction aliphat. C5-C6", "fraction aliphat. >C6-C8",
      "fraction aliphat. >C8-C10", "Fraction C5 - C8", "Fraction C8 - C10", "Fraction C10-C12", "Fraction C12-C16",
      "Fraction C16 - C21", "Fraction C21 - C35", "Hydrocarbures totaux C10-C35", "METHYL-TERT-BUTYL-ETHER", "MTBE",
     "a","b","c","d","e","f","g","h"]

In [None]:
an=col_ren(an, name=name, mode=1)
an=an.iloc[:,:-8]

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
# Line breaks inside 'Periode' become spaces; in every other column they are removed.
# The column is assigned back instead of using inplace=True on the selection: that chained
# form is deprecated and has no effect on `an` under pandas copy-on-write.
an['Periode'] = an['Periode'].replace('\n',' ', regex=True)
an.replace('\n','', regex=True, inplace=True)

In [None]:
# Normalise the 'Emplacement' codes of the analysis table.
data=[an]
for d in data:
    # Walk the real index labels rather than assuming a gap-free 0..n-1 index.
    for i in d.index:
        e=str(d.loc[i, 'Emplacement'])
        # Case-insensitive prefix test: 'P...' -> pilot, 'HZP...' -> outside pilot.
        if re.match('P',e, re.I):
            d.loc[i,'Emplacement']='Pilote'
        elif re.match('HZP',e, re.I):
            d.loc[i,'Emplacement']='Hors Pilote'
        else:
            d.loc[i,'Emplacement']=np.nan

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Cast to nanosecond datetimes; pandas >= 2.0 rejects the unit-less 'datetime64' dtype.
mdf['Date_ech'] = mdf['Date_ech'].astype('datetime64[ns]')

In [None]:
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']
mdf['ID'] = mdf['ID_ech']

In [None]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

In [None]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: 
    bh = bh.query('ID==ID and X==X')
else:
    bh = bh.query('ID==ID')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh = bh
source_mes = mes
source_an = an

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/' when
# tmp_dir is already there (the former exists() guard skipped it in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Param physico'**

In [None]:
tmp_dir= save_dir + 'Siterem_Pilote/'
sheet='Param_physico'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='param. physico', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df=df.transpose()
df.reset_index(drop=True, inplace=True)

In [None]:
df=col_ren(df, 1)

In [None]:
df.drop(list(range(2)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
sdf=df.iloc[:,:33]
df=df.iloc[:,34:]

In [None]:
df=dble_col_drop(df)
sdf=dble_col_drop(sdf)

In [None]:
df=na_line_drop(df,1)
sdf=na_line_drop(sdf,1)

In [None]:
df=na_col_drop(df,1)
sdf=na_col_drop(sdf,1)

In [None]:
df=df.iloc[:,:-1]
name=['ID_ech','Periode','Emplacement','Date_ech','Niv_eau_pz','Long_pz','Temp_ech','pH', 'CE', 'ORP','O_diss']
df=col_ren(df, mode=1, name=name)

In [None]:
sdf.drop(columns=['col_29'], inplace=True)
name=['ID_ech','Periode','Emplacement','Date_ech','Niv_eau_pz','Long_pz','pH','Niv_eau_sol','Temp_ech','CE',
      'ORP','O_diss','Temp_pH']
sdf=col_ren(sdf, mode=1, name=name)

In [None]:
def _scale_ce(value):
    """Return value/1000 when the text form of value starts with a digit, otherwise NaN."""
    # Raw-string pattern: '\d' inside a plain literal is an invalid escape sequence.
    if pd.isnull(value) or not re.search(r'^\d+', str(value)):
        return np.nan
    return pd.to_numeric(value)/1000

df['CE']=df['CE'].apply(_scale_ce)
sdf['CE']=sdf['CE'].apply(_scale_ce)

In [None]:
sdf['Periode'].replace('\n',' ', regex=True, inplace=True)
sdf.replace('\n','', regex=True, inplace=True)
sdf.drop(columns=["Niv_eau_sol"], inplace=True)

In [None]:
set(sdf['Emplacement'])

In [None]:
def _label_emplacement_pilote(raw):
    """Map a raw 'Emplacement' value to 'Pilote', 'Hors Pilote' or NaN (case-insensitive prefix test)."""
    text = str(raw)
    if re.match('P', text, re.I):
        return 'Pilote'
    if re.match('HZP', text, re.I):
        return 'Hors Pilote'
    return np.nan

# Series.map works whatever the index looks like; the former range(len())/.loc[i] loop raised
# KeyError as soon as a frame did not carry a 0..n-1 index.
data=[df, sdf]
for d in data:
    d['Emplacement'] = d['Emplacement'].map(_label_emplacement_pilote)

In [None]:
# Blank every string cell containing '*' or 'à compléter'. Raw string: '\*' in a plain literal
# is an invalid escape sequence (SyntaxWarning on recent Python versions).
df.replace(r'\*|à compléter',np.nan, inplace=True, regex=True)

In [None]:
mdf, conflict_df = data_merger(sdf, df, 'outer', 'ID_ech')

In [None]:
# Explicit [ns] unit: the unit-less 'datetime64' alias is rejected by pandas >= 2.0.
mdf['Date_ech'] = mdf['Date_ech'].astype('datetime64[ns]')

In [None]:
mdf = na_col_drop(mdf, 3)

In [None]:
mdf['Type'] = 'Piezo'
mdf.rename(columns={'Date_ech':'Date_mes', 'ID_ech':'ID'}, inplace=True)

In [None]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

In [None]:
ukw, bh = df_dict['unknown'], df_dict['borehole']

# Rows classified as unknown are removed from the boreholes (using ukw's index before it is
# renumbered below), then a single row per ID is kept.
bh = bh.drop(index=ukw.index).drop_duplicates(['ID'])
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

# A self-comparison is False for NaN, so this keeps only rows that have an ID
# (and an X value when that column exists).
bh_filter = 'ID==ID and X==X' if 'X' in bh.columns else 'ID==ID'
bh = bh.query(bh_filter).reset_index(drop=True)

mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Run data_validation on the merged measures with the conflicts reported by the last merge.
# Its return value is not captured, so it presumably edits `dataset` in place — TODO confirm.
dataset = source_mes
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID',
                valid_dict={'Periode_y': list(conflict_df.index)})

# If a 'level_0' column is present it takes over the 'index' name
# (an already existing 'index' column is dropped first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', inplace=True, errors='ignore')
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_mes = dataset

In [None]:
# Create both output folders idempotently. The previous guard only tested tmp_dir, so an
# existing tmp_dir without 'source_merge/' made the source_*.to_csv calls below fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet exports
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated exports for the whole source file
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Inorganiques et composés majeurs'**

In [None]:
tmp_dir= save_dir + 'Siterem_Pilote/'
sheet='Inorganic_major'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='inorganiques et composés majeur', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:21]
an=df.loc[list(range(0,4))+list(range(22, len(df)))]

In [None]:
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(2)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,2)
ech_df=na_line_drop(ech_df,2)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech', 'Periode', 'Emplacement','Date_ech','Temp_ech']
ech_df.replace(r'\n',' ', inplace=True, regex=True)
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df=ech_df.query('ID_ech==ID_ech')
ech_df.insert(1,'Type_ech','Eau')

In [None]:
dataframe_viewer(ech_df, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
# Strip the '<' / '>' detection-limit markers from string cells.
an.replace(r'<|>','', inplace=True, regex=True)
# NOTE(review): unlike the r'-$' pattern applied when the sheet is loaded, this unanchored '-'
# blanks every string cell that contains a hyphen anywhere — confirm that this is intended.
an.replace(r'-',np.nan, inplace=True, regex=True)
# The first column holds the sample identifier.
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an=dble_col_drop(an)

In [None]:
an=na_col_drop(an,3)

In [None]:
an.rename(columns={'Période ':'Periode', 'Emplacement \n- S : Simulateur \n- HZS : Hors zone simulateur':'Emplacement',
                  'Date de prélèvement':'Date_ech', 'col_9':'ammoniaque libre'}, inplace=True)

In [None]:
an.drop(list(range(2)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
def _label_emplacement_simulateur(raw):
    """Map a raw 'Emplacement' value to 'Simulateur', 'Hors simulateur' or NaN (case-insensitive prefix test)."""
    text = str(raw)
    if re.match('S', text, re.I):
        return 'Simulateur'
    if re.match('HZS', text, re.I):
        return 'Hors simulateur'
    return np.nan

# Series.map works whatever the index looks like; the former range(len())/.loc[i] loop raised
# KeyError when a frame (e.g. ech_df once filtered by query) did not carry a 0..n-1 index.
data=[ech_df, an]
for d in data:
    d['Emplacement'] = d['Emplacement'].map(_label_emplacement_simulateur)

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)
#an=an.iloc[:,:-7]

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Run data_validation on the merged sheet with the conflicts reported by the last merge.
# Its return value is not captured, so it presumably edits `dataset` in place — TODO confirm.
dataset = mdf
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID',
                valid_dict={'Periode_x': list(conflict_df.index)})

# If a 'level_0' column is present it takes over the 'index' name
# (an already existing 'index' column is dropped first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', inplace=True, errors='ignore')
    dataset.rename(columns={'level_0':'index'}, inplace=True)
mdf = dataset

In [None]:
# Explicit [ns] unit: the unit-less 'datetime64' alias is rejected by pandas >= 2.0.
mdf['Date_ech'] = mdf['Date_ech'].astype('datetime64[ns]')

In [None]:
dataframe_viewer(mdf, rows=3)

In [None]:
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']
mdf['ID'] = mdf['ID_ech']

In [None]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

In [None]:
ukw, bh = df_dict['unknown'], df_dict['borehole']

# Rows classified as unknown are removed from the boreholes (using ukw's index before it is
# renumbered below), then a single row per ID is kept.
bh = bh.drop(index=ukw.index).drop_duplicates(['ID'])
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

# A self-comparison is False for NaN, so this keeps only rows that have an ID
# (and an X value when that column exists).
bh_filter = 'ID==ID and X==X' if 'X' in bh.columns else 'ID==ID'
bh = bh.query(bh_filter).reset_index(drop=True)

mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Run data_validation on the merged boreholes with the conflicts reported by the last merge.
# Its return value is not captured, so it presumably edits `dataset` in place — TODO confirm.
dataset = source_bh
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID',
                valid_dict={'Emplacement_x': list(conflict_df.index)})

# If a 'level_0' column is present it takes over the 'index' name
# (an already existing 'index' column is dropped first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', inplace=True, errors='ignore')
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_bh = dataset

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Run data_validation on the merged measures with the conflicts reported by the last merge.
# Its return value is not captured, so it presumably edits `dataset` in place — TODO confirm.
dataset = source_mes
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID',
                valid_dict={col: list(conflict_df.index) for col in ('Temp_ech_x', 'Periode_x')})

# If a 'level_0' column is present it takes over the 'index' name
# (an already existing 'index' column is dropped first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', inplace=True, errors='ignore')
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_mes = dataset

In [None]:
# Create both output folders idempotently. The previous guard only tested tmp_dir, so an
# existing tmp_dir without 'source_merge/' made the source_*.to_csv calls below fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet exports
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated exports for the whole source file
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Reset the per-source accumulators before starting a new source file.
# Each name gets its OWN empty DataFrame: binding all six to a single object meant that an
# in-place change made through one name would have shown up under the other five.
_df = pd.DataFrame()  # kept so the name stays defined, as before
source_bh, source_eqp, source_ukw = (pd.DataFrame() for _ in range(3))
source_litho, source_an, source_mes = (pd.DataFrame() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 13-Resultats_Siterem_SOL.xlsx
* **Sheet : 'Résult SOL ext. pilote'**

In [None]:
tmp_dir= save_dir + 'Siterem_Result_Sol/'
sheet='Result_sol_ExtP'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_SOL.xlsx', 
                   sheet_name='Résult SOL ext. pilote', skiprows=5)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:22]
an=df.loc[23:]

In [None]:
# `an` is a slice of df: take a real copy before enlarging it, so the new row is not written
# through a view of df (SettingWithCopyWarning).
an = an.copy()
an.loc[0.5] = df.loc[0] # the fractional label 0.5 makes sort_index() place this row first
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(3)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
# Leave out the last row and the 'broyage' column.
ech_df = ech_df.iloc[:-1].drop(columns=['broyage'])

In [None]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_ech','Long_for','Refus','Description','MO','COT','pH_KCl', 
      'Temp_pH','pH_H20','Fract_2','Fract_2+', 'Fract_min_2µ','Fract_min_50µ','Fract_min_2']
ech_df=col_ren(ech_df, name=name, mode=1)

In [None]:
set(ech_df.Description)

In [None]:
for i in range(len(ech_df['Description'])):
    x = ech_df.loc[i,'Description']
    if x in ['R','R ']: ech_df.loc[i,'Description']='Remblais'
    elif x in ['TN','TN ']: ech_df.loc[i,'Description']='Terrain naturel'

ech_df['Refus']=ech_df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
ech_df.insert(1,'Type_ech','Sol')#

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.rename(columns={an.columns[0]:'ID_ech', 'col_33':'Phénanthrène'}, inplace=True)

In [None]:
an=dble_col_drop(an)

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Explicit [ns] unit: the unit-less 'datetime64' alias is rejected by pandas >= 2.0.
mdf['Date_ech'] = mdf['Date_ech'].astype('datetime64[ns]')
# The measure date is taken to be the sampling date.
mdf['Date_mes'] = mdf['Date_ech']

In [None]:
mdf = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
mdf['Type'] = 'Piezo'

In [None]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

In [None]:
ukw, bh = df_dict['unknown'], df_dict['borehole']

# Rows classified as unknown are removed from the boreholes (using ukw's index before it is
# renumbered below), then a single row per ID is kept.
bh = bh.drop(index=ukw.index).drop_duplicates(['ID'])
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

# A self-comparison is False for NaN, so this keeps only rows that have an ID
# (and an X value when that column exists).
bh_filter = 'ID==ID and X==X' if 'X' in bh.columns else 'ID==ID'
bh = bh.query(bh_filter).reset_index(drop=True)

mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh = bh
source_mes = mes
source_an = an

In [None]:
# Create both output folders idempotently. The previous guard only tested tmp_dir, so an
# existing tmp_dir without 'source_merge/' made the source_*.to_csv calls below fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet exports
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated exports for the whole source file
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'SOL T1 pilote'**

In [None]:
tmp_dir= save_dir + 'Siterem_Result_Sol/'
sheet='SOL_T1_Pilote'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_SOL.xlsx', 
                   sheet_name='SOL T1 pilote', skiprows=5)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:35]
an=df.loc[36:]

In [None]:
# `an` is a slice of df: take a real copy before enlarging it, so the new row is not written
# through a view of df (SettingWithCopyWarning).
an = an.copy()
an.loc[0.5] = df.loc[0] # the fractional label 0.5 makes sort_index() place this row first
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(3)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
# Leave out the last row and the 'broyage' column.
ech_df = ech_df.iloc[:-1].drop(columns=['broyage'])

In [None]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_ech','Long_for','Refus','Description','Fract_2','Fract_2+']
ech_df=col_ren(ech_df, name=name, mode=1)

In [None]:
for i in range(len(ech_df['Description'])):
    x = ech_df.loc[i,'Description']
    if x in ['R','R ']: ech_df.loc[i,'Description']='Remblais'
    elif x in ['TN','TN ']: ech_df.loc[i,'Description']='Terrain naturel'

ech_df['Refus']=ech_df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
ech_df.insert(1,'Type_ech','Sol')#

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an=an.iloc[:,:-17]
an=dble_col_drop(an)

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,3)
an.insert(1,'Type_ech','Sol')

In [None]:
an.rename(columns={an.columns[0]:'ID_ech', 'col_35':'Phénanthrène'}, inplace=True)

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Explicit [ns] unit: the unit-less 'datetime64' alias is rejected by pandas >= 2.0.
mdf['Date_ech'] = mdf['Date_ech'].astype('datetime64[ns]')
# The measure date is taken to be the sampling date.
mdf['Date_mes'] = mdf['Date_ech']

In [None]:
mdf = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
mdf['Type'] = 'Piezo'

In [None]:
# IDs shaped like "<word> <digits>" are typed 'Forage'; the other rows keep their current Type.
# Vectorised with a raw-string pattern: casting to str first means a missing ID (NaN) is simply
# not matched, where re.search on the raw value raised TypeError.
forage_mask = mdf['ID'].astype(str).str.contains(r'\w+\s+\d+', regex=True)
mdf.loc[forage_mask, 'Type'] = 'Forage'

In [None]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

In [None]:
ukw, bh = df_dict['unknown'], df_dict['borehole']

# Rows classified as unknown are removed from the boreholes (using ukw's index before it is
# renumbered below), then a single row per ID is kept.
bh = bh.drop(index=ukw.index).drop_duplicates(['ID'])
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

# A self-comparison is False for NaN, so this keeps only rows that have an ID
# (and an X value when that column exists).
bh_filter = 'ID==ID and X==X' if 'X' in bh.columns else 'ID==ID'
bh = bh.query(bh_filter).reset_index(drop=True)

mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Create both output folders idempotently. The previous guard only tested tmp_dir, so an
# existing tmp_dir without 'source_merge/' made the source_*.to_csv calls below fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet exports
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated exports for the whole source file
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Reset the per-source accumulators before starting a new source file.
# Each name gets its OWN empty DataFrame: binding all six to a single object meant that an
# in-place change made through one name would have shown up under the other five.
_df = pd.DataFrame()  # kept so the name stays defined, as before
source_bh, source_eqp, source_ukw = (pd.DataFrame() for _ in range(3))
source_litho, source_an, source_mes = (pd.DataFrame() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 14-Logs_forages_vUmons_2018-03-20.xlsx
* **Sheet : 'Analyse_eau_Phases1&2'**

In [None]:
tmp_dir= save_dir + 'vUmons_logsFor/'
sheet='Analyse_eau_Phases1&2'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Analyse_eau_Phases1&2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.drop(list(range(4)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Turn the 9999 sentinel into NaN.
# Numeric cells: a plain value replacement covers both 9999 and 9999.0.
df.replace(9999, np.nan, inplace=True)
# Text cells: match only a cell that IS the sentinel ('9999', '9999.', '9999.0', ...).
# The former pattern '[9999|9999].' was a character class, so it blanked every string in which
# a '9' or '|' was followed by any character (e.g. an identifier such as 'P19/1').
df.replace(r'^\s*9999(\.0*)?\s*$', np.nan, inplace=True, regex=True)

In [None]:
def _scale_ce(value):
    """Return value/1000 when the text form of value starts with a digit, otherwise NaN."""
    # Raw-string pattern: '\d' inside a plain literal is an invalid escape sequence.
    if pd.isnull(value) or not re.search(r'^\d+', str(value)):
        return np.nan
    return pd.to_numeric(value)/1000

df['CE']=df['CE'].apply(_scale_ce)

In [None]:
df=col_ren(df,mode=1,name=[re.sub('9999','-',x) for x in df.columns])

In [None]:
# Remove every column whose name contains '_vn'.
drop_it = [col for col in df.columns if re.search('_vn', col)]
df.drop(columns=drop_it, inplace=True)

In [None]:
name=['ID', 'ID_ech', 'Date_ech', 'X', 'Y', 'Z', 'Long_for','Long_pz_sol', 'Niv_eau_sol', 'pH', 'CE', 'Temp', 
      'Arsenic', 'Cadmium', 'Chrome', 'Chrome VI', 'Cuivre', 'Mercure', 'Plomb', 'Nickel', 'Zinc', 
      'Cyanures (libres)', 'CN_totaux', 'cyanure (APE)', 'cyanure complex', 'thiocyanate', 'Benzène', 'Toluene', 
      'Éthylbenzène', 'Orthoxylène', 'Para_métaxylène', 'Xylenes', 'Styrène', 'Phénol', 'Naphtalène', 
      'Acénaphtylène', 'Acénaphtène', 'Fluorène', 'Phénanthrène', 'Anthracène', 'Fluoranthène', 'Pyrène', 
      'Benzo(a)anthracène', 'Chrysène', 'Benzo(b)fluoranthène', 'Benzo(k)fluoranthène', 'Benzo(a)pyrène', 
      'Dibenzo(ah)anthracène', 'Benzo(ghi)pérylène', 'Indéno(1,2,3-cd)pyrène', 'HAP Totaux (16) - EPA', 
      '1,1-Dichloroéthane', '1,2-Dichloroéthane', '1,1-dichloroéthène', 'Cis-1,2-dichloroéthène', 
      'Totaux (cis,trans) 1,2-dichloroéthènes', 'Trans 1,2-dichloroéthylène', 'Dichlorométhane', 
      '1,2-dichloropropane', 'Tétrachloroéthylène ', 'Tétrachlorométhane', '1,1,1-Trichloroéthane', 
      '1,1,2-Trichloroéthane', 'Trichloroéthylène', 'Chloroforme', 'Chlorure de vinyle', 'fraction aromat. >C6-C7',
      'fraction aromat. >C7-C8', 'fraction aromat. >C8-C10', 'fraction aliphat. C5-C6', 'fraction aliphat. >C6-C8',
      'fraction aliphat. >C8-C10', 'Fraction C5 - C8', 'Fraction C8 - C10', 'Fraction C10-C12', 'Fraction C12-C16',
      'Fraction C16 - C21', 'Fraction C21 - C35', 'Hydrocarbures totaux C10-C35', 'MTBE', 'Chlorures']
df=col_ren(df, mode=1,name=name)

In [None]:
df = col_ren(df,mode=1, name=POL_NAMES_MODEL)#, verbose=True)

In [None]:
df['ID_ech'].replace('\n', ' ', inplace=True, regex=True)
df.insert(1,'Type_ech','Eau')

In [None]:
df.drop([20,39], axis=0,inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# From row label 38 onward Date_ech holds a day count instead of a date: rebuild the date by
# offsetting from 1900-01-01 minus 2 days (presumably Excel serial dates — TODO confirm against
# the workbook). fromordinal() needs an integer, so these cells must not be NaN or float.
df.loc[38:,'Date_ech']=df.loc[38:,'Date_ech'].apply(lambda x : dtm.datetime.fromordinal(dtm.datetime(1900, 1, 1).toordinal() + x - 2))

In [None]:
for i in df.index:
    if pd.isnull(df.loc[i,'ID_ech']): 
        df.loc[i,'ID_ech']=df.loc[i,'ID'].rstrip('M')

In [None]:
mdf = df

In [None]:
# Explicit [ns] unit: the unit-less 'datetime64' alias is rejected by pandas >= 2.0.
mdf['Date_ech'] = mdf['Date_ech'].astype('datetime64[ns]')
# The measure date is taken to be the sampling date.
mdf['Date_mes'] = mdf['Date_ech']

In [None]:
mdf['Type'] = 'Piezo'

In [None]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

In [None]:
ukw, bh = df_dict['unknown'], df_dict['borehole']

# Rows classified as unknown are removed from the boreholes (using ukw's index before it is
# renumbered below), then a single row per ID is kept.
bh = bh.drop(index=ukw.index).drop_duplicates(['ID'])
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

# A self-comparison is False for NaN, so this keeps only rows that have an ID
# (and an X value when that column exists).
bh_filter = 'ID==ID and X==X' if 'X' in bh.columns else 'ID==ID'
bh = bh.query(bh_filter).reset_index(drop=True)

mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_an = an
source_bh = bh
source_mes = mes

In [None]:
# Create both output folders idempotently. The previous guard only tested tmp_dir, so an
# existing tmp_dir without 'source_merge/' made the source_*.to_csv calls below fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet exports
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated exports for the whole source file
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Analyse_sol_Phases1&2'**

In [None]:
tmp_dir= save_dir + 'vUmons_logsFor/'
sheet='Analyse_sol_Phases1&2'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Analyse_sol_Phases1&2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID_ech','Date_ech','ID','X','Y','Z','Nature_ech','Organo','Long_for','Refus','Ech_top','Ech_base',
      'MS','Broyage < 150 µm','Broyage ','Fraction 2 mm','col_+2 mm','Arsenic', 'Cadmium', 'Chrome', 'Chrome VI','Cuivre',
      'Mercure', 'Plomb', 'Nickel', 'Zinc', 'Cyanure (libre)', 'Cyanure (totaux)', 'cyanure (APE)', 
      'cyanure complex', 'thiocyanate', 'Benzène', 'Toluène', 'Éthylbenzène', 'Orthoxylène', 'Para- et métaxylène',
      'Xylènes', 'Styrène', 'Phénol', 'Naphtalène', 'Acénaphtylène', 'Acénaphtène', 'Fluorène', 'Phénanthrène', 
      'Anthracène', 'Fluoranthène', 'Pyrène', 'Benzo(a)anthracène', 'Chrysène', 'Benzo(b)fluoranthène', 
      'Benzo(k)fluoranthène', 'Benzo(a)pyrène', 'Dibenzo(ah)anthracène', 'Benzo(ghi)pérylène', 
      'Indéno(1,2,3-cd)pyrène', 'HAP Totaux (16) - EPA', '1,1-Dichloroéthane', '1,2-Dichloroéthane', 
      '1,1-dichloroéthène', 'Cis-1,2-dichloroéthène', 'Trans 1,2-dichloroéthylène', 'Dichlorométhane', 
      'Totaux (cis,trans) 1,2-dichloroéthènes', '1,2-dichloropropane', 'Tétrachloroéthylène', 
      'Tétrachlorométhane', '1,1,1-Trichloroéthane', '1,1,2-Trichloroéthane', 'Trichloroéthylène', 'Chloroforme', 
      'Chlorure de vinyle', 'fraction aromat. >C6-C7', 'fraction aromat. >C7-C8', 'fraction aromat. >C8-C10', 
      'fraction aliphat. C5-C6', 'fraction aliphat. >C6-C8', 'fraction aliphat. >C8-C10', 'Fraction C5 - C8', 
      'Fraction C8 - C10', 'Fraction C10-C12', 'Fraction C12-C16', 'Fraction C16 - C21', 'Fraction C21 - C35', 
      'Hydrocarbures totaux C10-C35']
df=col_ren(df, mode=1, name=name)

In [None]:
df.drop(list(range(4)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Turn the 9999 sentinel into NaN.
# Numeric cells: a plain value replacement covers both 9999 and 9999.0.
df.replace(9999, np.nan, inplace=True)
# Text cells: match only a cell that IS the sentinel ('9999', '9999.', '9999.0', ...).
# The former pattern '[9999|9999].' was a character class, so it blanked every string in which
# a '9' or '|' was followed by any character (e.g. an identifier such as 'P19/1').
df.replace(r'^\s*9999(\.0*)?\s*$', np.nan, inplace=True, regex=True)

In [None]:
for i in df.index:
    x = df.loc[i,'Nature_ech']
    if x in ['R','R ']: df.loc[i,'Nature_ech']='Remblais'
    elif x in ['L']: df.loc[i,'Nature_ech']='Limons'
    elif x in ['LA']: df.loc[i,'Nature_ech']='Limons et argiles'
    elif x in ['LS']: df.loc[i,'Nature_ech']='Limons et sables'

df['Refus']=df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
df.insert(1,'Type_ech','Sol')

In [None]:
df.drop(14, axis=0, inplace=True)
df.drop(['Broyage < 150 µm', 'Broyage '], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)
df.insert(1,'Description', df.pop('Nature_ech'))

In [None]:
df.loc[8, 'ID_ech']='F4/2M'
df.loc[31, 'ID_ech']='P19/1'
df.loc[32, 'ID_ech']='P19/2'

In [None]:
df = col_ren(df, mode=1, name=POL_NAMES_MODEL)#,verbose=True)

In [None]:
df.rename(columns={'col_+2 mm':'Fract_+2'}, inplace = True)

In [None]:
# Bind the soil frame prepared in this section to mdf. Without this, a fresh Restart & Run All
# leaves mdf pointing at the frame of the 'Analyse_eau_Phases1&2' sheet, and every cell below
# silently re-processes the water data instead of this sheet.
mdf = df
# Explicit [ns] unit: the unit-less 'datetime64' alias is rejected by pandas >= 2.0.
mdf['Date_ech'] = mdf['Date_ech'].astype('datetime64[ns]')
# NOTE(review): the other sheets also set Date_mes here and a later merge uses 'Date_mes' as a
# key — confirm whether leaving it disabled for this sheet is intended.
#mdf['Date_mes'] = mdf['Date_ech']

In [None]:
mdf['Type'] = 'Piezo'

In [None]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

In [None]:
ukw, bh = df_dict['unknown'], df_dict['borehole']

# Rows classified as unknown are removed from the boreholes (using ukw's index before it is
# renumbered below), then a single row per ID is kept.
bh = bh.drop(index=ukw.index).drop_duplicates(['ID'])
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

# A self-comparison is False for NaN, so this keeps only rows that have an ID
# (and an X value when that column exists).
bh_filter = 'ID==ID and X==X' if 'X' in bh.columns else 'ID==ID'
bh = bh.query(bh_filter).reset_index(drop=True)

mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Run data_validation on the merged samples with the conflicts reported by the last merge.
# Its return value is not captured, so it presumably edits `dataset` in place — TODO confirm.
dataset = source_an
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID',
                valid_dict={col: list(conflict_df.index)
                            for col in ('CN_tot_APE_y', 'CN_tot_y', 'CN_libre_y', 'CN_cplx_y')})

# If a 'level_0' column is present it takes over the 'index' name
# (an already existing 'index' column is dropped first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', inplace=True, errors='ignore')
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_an = dataset

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Create both output folders idempotently. The previous guard only tested tmp_dir, so an
# existing tmp_dir without 'source_merge/' made the source_*.to_csv calls below fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet exports
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated exports for the whole source file
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Synthèse'**

In [None]:
tmp_dir= save_dir + 'vUmons_logsFor/'
sheet='Synthese'

In [None]:
# Load the 'Synthèse' sheet (first row skipped), then clean it:
#  - na_line_drop / na_col_drop are project helpers; presumably they drop (nearly)
#    empty rows / columns — TODO confirm the meaning of their numeric argument
#  - '<' and '>' characters are stripped from every string cell
#  - string cells ending with '-' are replaced by NaN
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Synthèse', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Keep the first 29 rows (presumably the rows below are notes — TODO confirm) as an
# independent copy, so the in-place edits below do not act on a slice of the loaded
# frame (SettingWithCopyWarning).
df=df[:29].copy()
# Raw string: '\*' is an invalid escape sequence in a normal string literal.
df.replace(r'\*','', inplace=True, regex=True)
# Any non-null 'Refus' entry becomes the flag 'x', missing ones an empty string.
df['Refus']=df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')

In [None]:
name=['ID','X','Y','Z', 'Refus','Long_for', 'RB', 'ALL', 'S_A', 'S_S', 
      'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top']
df=col_ren(df, mode=1, name=name)

In [None]:
# Columns copied from the borehole row onto every layer row derived from it.
cols=['ID','X','Y','Z', 'Refus','Long_for']

# Expand each borehole row into one row per lithological unit whose flag column
# (RB, ALL, S_A, S_S) is not null. The first unit reuses row i; the others are written
# under fractional labels (i+.2, i+.5, i+.7) so that sort_index() below places them
# right after their source row.
for i in range(len(df)):
    if not pd.isnull(df.loc[i, 'RB']): 
        df.loc[i, 'Nappe']='Remblais'
        df.loc[i, 'Litho_top']=0
        # base = recorded base of the unit, else the borehole length
        if not pd.isnull(df.loc[i, 'Rb_base']):
            df.loc[i, 'Litho_base']=df.loc[i, 'Rb_base']
        else:
            df.loc[i, 'Litho_base']=df.loc[i, 'Long_for']
    
    if not pd.isnull(df.loc[i, 'ALL']):
        df.loc[i+.2,cols]=df.loc[i,cols]
        df.loc[i+.2, 'Nappe']='Alluvions'
        df.loc[i+.2, 'Litho_top']=df.loc[i, 'All_top']
        # base = top of the next unit when that unit is flagged, else the borehole length
        if not pd.isnull(df.loc[i, 'S_A']):
            df.loc[i+.2, 'Litho_base']=df.loc[i, 'Soc_alt_top']
        else:
            df.loc[i+.2, 'Litho_base']=df.loc[i, 'Long_for']
    
    if not pd.isnull(df.loc[i, 'S_A']):
        df.loc[i+.5,cols]=df.loc[i,cols]
        df.loc[i+.5, 'Nappe']='Socle altéré'
        df.loc[i+.5, 'Litho_top']=df.loc[i, 'Soc_alt_top']
        if not pd.isnull(df.loc[i, 'S_S']):
            df.loc[i+.5, 'Litho_base']=df.loc[i, 'Soc_sn_top']
        else:
            df.loc[i+.5, 'Litho_base']=df.loc[i, 'Long_for']
            
    if not pd.isnull(df.loc[i, 'S_S']):
        df.loc[i+.7,cols]=df.loc[i,cols]
        df.loc[i+.7, 'Nappe']='Socle sain'
        df.loc[i+.7, 'Litho_top']=df.loc[i, 'Soc_sn_top']
        # deepest unit: always extends to the borehole length
        df.loc[i+.7, 'Litho_base']=df.loc[i, 'Long_for']

# The wide flag/depth columns are no longer needed once the rows are expanded.
df.drop(columns=['RB', 'ALL', 'S_A', 'S_S', 'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top'], inplace=True)
df.sort_index(inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df['Description'] = df['Nappe']
df['Type'] = 'Forage'

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Rows whose index labels appear in the 'unknown' table are removed from the borehole
# table; both tables are then de-duplicated on the borehole ID.
ukw = df_dict['unknown']
bh = df_dict['borehole'].drop(index=ukw.index)

ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

bh.drop_duplicates(['ID'], inplace=True)
# 'col==col' is False for NaN, so the query keeps only rows with an ID
# (and with an X value when that column exists).
bh = bh.query('ID==ID and X==X' if 'X' in bh.columns else 'ID==ID')
bh.reset_index(drop=True, inplace=True)

mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Settle the merge conflicts of the borehole table: every conflicting row is listed
# under 'Type_y' (data_validation is a project helper).
dataset = source_bh
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID',
                valid_dict=dict(Type_y=list(conflict_df.index)))

# When a 'level_0' column is present it replaces the 'index' column (if any).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', inplace=True, errors='ignore')
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_bh = dataset

In [None]:
source_litho = litho

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# os.makedirs creates tmp_dir on the way; exist_ok=True keeps the cell re-runnable and
# also covers the case where tmp_dir exists without 'source_merge/' (the former
# `if not os.path.exists(tmp_dir)` guard skipped both creations in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet tables (commented-out exports are disabled for this sheet)
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Tables accumulated over the current source file
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Sond2017v2'**

In [None]:
tmp_dir= save_dir + 'vUmons_logsFor/'
sheet='Sond2017v2'

In [None]:
# Load the 'Sond2017v2' sheet (no row skipped), then clean it:
#  - na_line_drop / na_col_drop are project helpers; presumably they drop (nearly)
#    empty rows / columns — TODO confirm the meaning of their numeric argument
#  - '<' and '>' characters are stripped from every string cell
#  - string cells ending with '-' are replaced by NaN
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Sond2017v2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Remove asterisks from every string cell. Raw string: '\*' is an invalid escape
# sequence in a normal string literal (DeprecationWarning / SyntaxWarning).
df.replace(r'\*','', inplace=True, regex=True)
# 'Refus' is coded 1 in this sheet; turn it into the 'x' / '' flag used elsewhere.
df['Refus']=df['Refus'].apply(lambda x: 'x' if x==1 else '')

In [None]:
# Apply the working column names through the project helper col_ren, then keep the
# columns used afterwards ('R_ID' and 'cote_rb' are left out).
name=['R_ID','ID','X','Y','Z','Refus','Date_for','Long_for','Z_fond','RB','ALL', 'S_A', 'S_S', 
      'Rb_base','cote_rb','All_top', 'Soc_alt_top','Soc_sn_top']
df=col_ren(df, mode=1, name=name)
# .copy() makes df an independent frame: the next cell assigns into it with .loc, which
# would otherwise act on a column selection of the renamed frame (SettingWithCopyWarning).
df=df[['ID','X','Y','Z','Refus','Date_for','Long_for','Z_fond','RB','ALL', 'S_A', 'S_S', 
      'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top']].copy()

In [None]:
# Columns copied from the borehole row onto every layer row derived from it.
cols=['ID','Date_for','X','Y','Z','Z_fond','Refus','Long_for']

# Expand each borehole row into one row per lithological unit flagged with 1
# (RB, ALL, S_A, S_S). The first unit reuses row i; the others are written under
# fractional labels (i+.2, i+.5, i+.7) so that sort_index() below places them right
# after their source row.
for i in range(len(df)):    
    if df.loc[i, 'RB']==1: 
        df.loc[i, 'Nappe']='Remblais'
        df.loc[i, 'Litho_top']=0
        
        # base = recorded base of the unit, else the borehole length
        if not pd.isnull(df.loc[i, 'Rb_base']): df.loc[i, 'Litho_base']=df.loc[i, 'Rb_base']
        else: df.loc[i, 'Litho_base']=df.loc[i, 'Long_for']
    
    # Fallback top for the deeper units when their own top depth is missing.
    # The 'Litho_base' column only exists once a 'Remblais' row has been written, so the
    # lookup is guarded instead of raising KeyError when the first rows have RB != 1.
    val_def=df.loc[i, 'Litho_base'] if 'Litho_base' in df.columns else np.nan
    
    if df.loc[i, 'ALL']==1:
        df.loc[i+.2,cols]=df.loc[i,cols]
        df.loc[i+.2, 'Nappe']='Alluvions'
        
        if not pd.isnull(df.loc[i, 'All_top']): df.loc[i+.2, 'Litho_top']=df.loc[i, 'All_top']
        else: df.loc[i+.2, 'Litho_top']=val_def #df.loc[i, 'litho_base']
            
        # base = top of the next unit when that unit is flagged, else the borehole length
        if df.loc[i, 'S_A']==1: df.loc[i+.2, 'Litho_base']=df.loc[i, 'Soc_alt_top']
        else: df.loc[i+.2, 'Litho_base']=df.loc[i, 'Long_for']
    
    if df.loc[i, 'S_A']==1:
        df.loc[i+.5,cols]=df.loc[i,cols]
        df.loc[i+.5, 'Nappe']='Socle altéré'
        
        if not pd.isnull(df.loc[i, 'Soc_alt_top']): df.loc[i+.5, 'Litho_top']=df.loc[i, 'Soc_alt_top']
        else: df.loc[i+.5, 'Litho_top']=val_def #df.loc[i+.2, 'litho_base']
        
        if df.loc[i, 'S_S']==1: df.loc[i+.5, 'Litho_base']=df.loc[i, 'Soc_sn_top']
        else: df.loc[i+.5, 'Litho_base']=df.loc[i, 'Long_for']
            
    if df.loc[i, 'S_S']==1:
        df.loc[i+.7,cols]=df.loc[i,cols]
        df.loc[i+.7, 'Nappe']='Socle sain'
        df.loc[i+.7, 'Litho_top']=df.loc[i, 'Soc_sn_top']
        # deepest unit: always extends to the borehole length
        df.loc[i+.7, 'Litho_base']=df.loc[i, 'Long_for']

# The wide flag/depth columns are no longer needed once the rows are expanded.
df.drop(columns=['RB', 'ALL', 'S_A', 'S_S','Rb_base','All_top', 'Soc_alt_top','Soc_sn_top'], inplace=True)
df.sort_index(inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df['Description'] = df['Nappe']
df['Type'] = 'Forage'

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Rows whose index labels appear in the 'unknown' table are removed from the borehole
# table; both tables are then de-duplicated on the borehole ID.
ukw = df_dict['unknown']
bh = df_dict['borehole'].drop(index=ukw.index)

ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

bh.drop_duplicates(['ID'], inplace=True)
# 'col==col' is False for NaN, so the query keeps only rows with an ID
# (and with an X value when that column exists).
bh = bh.query('ID==ID and X==X' if 'X' in bh.columns else 'ID==ID')
bh.reset_index(drop=True, inplace=True)

mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Settle the merge conflicts of the borehole table: every conflicting row is listed
# under 'Type_x' (data_validation is a project helper).
dataset = source_bh
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID',
                valid_dict=dict(Type_x=list(conflict_df.index)))

# When a 'level_0' column is present it replaces the 'index' column (if any).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', inplace=True, errors='ignore')
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_bh = dataset

In [None]:
source_litho, conflict_df = data_merger(source_litho, litho, how='outer', on=['ID', 'Litho_top'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Settle the merge conflicts of the lithology table: every conflicting row is listed
# under 'Litho_base_y' (data_validation is a project helper).
dataset = source_litho
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID',
                valid_dict=dict(Litho_base_y=list(conflict_df.index)))

# When a 'level_0' column is present it replaces the 'index' column (if any).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', inplace=True, errors='ignore')
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_litho = dataset

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# os.makedirs creates tmp_dir on the way; exist_ok=True keeps the cell re-runnable and
# also covers the case where tmp_dir exists without 'source_merge/' (the former
# `if not os.path.exists(tmp_dir)` guard skipped both creations in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet tables (commented-out exports are disabled for this sheet)
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Tables accumulated over the current source file
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

# Processing for new data added - April 2021

#### ======================================================================================

In [None]:
# Initialise the per-source accumulators.
# Each name gets its OWN empty DataFrame: binding all six to one shared object would let
# an in-place edit of one accumulator show up in every other one.
_df = pd.DataFrame()  # kept for any later cell that may still refer to it
source_bh, source_eqp, source_ukw = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
source_litho, source_an, source_mes = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 15-Résultats SOL extension pilote et piézairs.xlsx
* **Sheet : 'Résult SOL'**

In [None]:
tmp_dir= save_dir + 'result_sol_ext_pilote/'
sheet='Result_Sol'

In [None]:
# Load the 'Résult SOL' sheet (first 5 rows skipped), then clean it:
#  - na_line_drop / na_col_drop are project helpers; presumably they drop (nearly)
#    empty rows / columns — TODO confirm the meaning of their numeric argument
#  - '<' and '>' characters are stripped from every string cell
#  - string cells ending with '-' are replaced by NaN
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Resultats SOL extension pilote et piezairs.xlsx', 
                   sheet_name='Résult SOL', skiprows=5)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Split the sheet by row label: rows up to 35 go to ech_df, rows from 36 on go to an.
# Copies are taken because the next cells write into these frames (a row is added to
# `an` with .loc), which must not act on a slice of df.
ech_df=df.loc[:35].copy()
an=df.loc[36:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(3)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,3)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
# Drop the last row and the 'broyage' column. Chained so the column drop returns a new
# frame instead of mutating a slice in place (SettingWithCopyWarning).
ech_df=ech_df[:-1].drop(columns=['broyage'])

In [None]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_ech','Long_for','Refus','Description','MO','COT','pH_KCl', 
      'Temp_pH','pH_H20','Fract_2','Fract_2+', 'Fract_min_2µ','Fract_min_50µ','Fract_min_2']
ech_df=col_ren(ech_df, name=name, mode=1)

In [None]:
set(ech_df.Description)

In [None]:
# Expand the description codes: 'R' -> 'Remblais', 'TN' -> 'Terrain naturel'
# (a trailing blank is tolerated). Boolean masks replace the per-row .loc loop, so the
# cell no longer relies on ech_df having a default 0..n-1 index.
is_fill = ech_df['Description'].isin(['R','R '])
is_natural = ech_df['Description'].isin(['TN','TN '])
ech_df.loc[is_fill, 'Description'] = 'Remblais'
ech_df.loc[is_natural, 'Description'] = 'Terrain naturel'

# Any non-null 'Refus' entry becomes the flag 'x', missing ones an empty string.
ech_df['Refus']=ech_df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
ech_df.insert(1,'Type_ech','Sol')#

In [None]:
# For values of the form '<code>\n<text>', keep the code found before the line break
# and prefix it with the borehole number.
# Raw string: '\w' and '\d' are invalid escapes in a normal string literal.
id_pattern = re.compile(r'([\w|\d]+)\n.+$')
for i in ech_df.index:  # iterate the real labels, not range(len())
    x=ech_df.loc[i,'ID_ech']
    # non-string cells (e.g. NaN) are skipped instead of making re raise TypeError
    r=id_pattern.search(x) if isinstance(x, str) else None
    if r: 
        ech_df.loc[i,'ID_ech']='226/'+r.group(1) # Rename borehole 304 to 226 because of conflict with piezair

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.rename(columns={an.columns[0]:'ID_ech','col_35':'Phénanthrène'}, inplace=True)

In [None]:
an=an[an.columns[:-17]]

In [None]:
an = col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an = na_line_drop(an, 1)
an.insert(1,'Type_ech','Sol')

In [None]:
# For values of the form '<code>\n<text>', keep the code found before the line break
# and prefix it with the borehole number. `data` is an alias of `an`, so `an` is edited.
# Raw string: '\w' and '\d' are invalid escapes in a normal string literal.
data = an
id_pattern = re.compile(r'([\w|\d]+)\n.+$')
for i in data.index:  # iterate the real labels, not range(len())
    x=data.loc[i,'ID_ech']
    # non-string cells (e.g. NaN) are skipped instead of making re raise TypeError
    r=id_pattern.search(x) if isinstance(x, str) else None
    if r: 
        data.loc[i,'ID_ech']='226/'+r.group(1) # Rename borehole 304 to 226 because of conflict with piezair

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
mdf = gen_id_from_ech(mdf, id_ech_col='ID_ech', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
# Drop every row whose borehole ID contains no digit.
# Vectorised with a raw-string pattern; IDs are compared as text, so non-string values
# (NaN, numbers) no longer make re.search raise TypeError.
has_digit = mdf['ID'].astype(str).str.contains(r'\d')
to_drop = list(mdf.index[~has_digit])
mdf.drop(index=to_drop, inplace=True)

In [None]:
# Parse the sampling dates as nanosecond timestamps. The unit-less 'datetime64' alias
# is rejected by pandas >= 2.0 (TypeError), so the unit is spelled out.
mdf['Date_ech'] = mdf['Date_ech'].astype('datetime64[ns]')
# The measurement date is taken equal to the sampling date.
mdf['Date_mes'] = mdf['Date_ech']

In [None]:
mdf['Type'] = 'Piezo'

In [None]:
source_vars = [source_bh, source_mes, source_litho, source_an, source_eqp, source_ukw]
[i for i in range(len(source_vars)) if len(source_vars[i]) != 0]

In [None]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

In [None]:
# Rows whose index labels appear in the 'unknown' table are removed from the borehole
# table; both tables are then de-duplicated on the borehole ID.
ukw = df_dict['unknown']
bh = df_dict['borehole'].drop(index=ukw.index)

ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

bh.drop_duplicates(['ID'], inplace=True)
# 'col==col' is False for NaN, so the query keeps only rows with an ID
# (and with an X value when that column exists).
bh = bh.query('ID==ID and X==X' if 'X' in bh.columns else 'ID==ID')
bh.reset_index(drop=True, inplace=True)

mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh = bh
source_mes = mes
source_an = an

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# os.makedirs creates tmp_dir on the way; exist_ok=True keeps the cell re-runnable and
# also covers the case where tmp_dir exists without 'source_merge/' (the former
# `if not os.path.exists(tmp_dir)` guard skipped both creations in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet tables (commented-out exports are disabled for this sheet)
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Tables accumulated over the current source file
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'inorganiques et composés majeur'**

In [None]:
# This sheet is read from the same workbook as 'Résult SOL', so it is saved in that
# workbook's folder. It previously pointed at 'donnees_terrain_2019/', the folder used
# for the 2019 field-data workbook, whose 'Echantillon' export later overwrites
# source_merge/source_Samples.csv.
tmp_dir= save_dir + 'result_sol_ext_pilote/'
sheet='Inorg_comp_majeur'

In [None]:
# Load the 'inorganiques et composés majeur' sheet (first row skipped), then clean it:
#  - na_line_drop / na_col_drop are project helpers; presumably they drop (nearly)
#    empty rows / columns — TODO confirm the meaning of their numeric argument
#  - '<' and '>' characters are stripped from every string cell
#  - string cells ending with '-' are replaced by NaN
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Resultats SOL extension pilote et piezairs.xlsx', 
                   sheet_name='inorganiques et composés majeur', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Split the sheet by row label: rows up to 20 go to ech_df, rows from 21 on go to an.
# Copies are taken because the next cell adds rows to `an` with .loc, which must not
# act on a slice of df.
ech_df=df.loc[:20].copy() # not really interesting here!
an=df.loc[21:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an.loc[1.5] = df.loc[2]
an = an.sort_index().reset_index(drop=True)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.rename(columns={an.columns[0]:'ID_ech', 'Date de prélèvement':'Date_ech'}, inplace=True)

In [None]:
an=an[an.columns[:-7]]

In [None]:
an=dble_col_drop(an)

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,2)
an = na_line_drop(an,1)
an.insert(1,'Type_ech','Sol')

In [None]:
an = col_ren(an, name=POL_NAMES_MODEL, mode=1)#, verbose=True)

In [None]:
an = dble_col_drop(an)

In [None]:
# For values of the form '<code>\n<text>', keep the code found before the line break
# and prefix it with the borehole number. `data` is an alias of `an`, so `an` is edited.
# Raw string: '\w' and '\d' are invalid escapes in a normal string literal.
data = an
id_pattern = re.compile(r'([\w|\d]+)\n.+$')
for i in data.index:  # iterate the real labels, not range(len())
    x=data.loc[i,'ID_ech']
    # non-string cells (e.g. NaN) are skipped instead of making re raise TypeError
    r=id_pattern.search(x) if isinstance(x, str) else None
    if r: 
        data.loc[i,'ID_ech']='226/'+r.group(1) # Rename borehole 304 to 226 because of conflict with piezair

In [None]:
# Parse the sampling dates as nanosecond timestamps. The unit-less 'datetime64' alias
# is rejected by pandas >= 2.0 (TypeError), so the unit is spelled out.
an['Date_ech'] = an['Date_ech'].astype('datetime64[ns]')

In [None]:
an = gen_id_from_ech(an, id_ech_col='ID_ech', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# os.makedirs creates tmp_dir on the way; exist_ok=True keeps the cell re-runnable and
# also covers the case where tmp_dir exists without 'source_merge/' (the former
# `if not os.path.exists(tmp_dir)` guard skipped both creations in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet tables (commented-out exports are disabled for this sheet)
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Tables accumulated over the current source file
#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Initialise the per-source accumulators.
# Each name gets its OWN empty DataFrame: binding all six to one shared object would let
# an in-place edit of one accumulator show up in every other one.
_df = pd.DataFrame()  # kept for any later cell that may still refer to it
source_bh, source_eqp, source_ukw = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
source_litho, source_an, source_mes = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 16-Profils de sol et données de terrain 2019.xlsx
* **Sheet : 'Log'**

In [None]:
tmp_dir= save_dir + 'donnees_terrain_2019/'
sheet='Log'

In [None]:
# Load the 'Log' sheet (no row skipped), then clean it:
#  - na_line_drop / na_col_drop are project helpers; presumably they drop (nearly)
#    empty rows / columns — TODO confirm the meaning of their numeric argument
#  - '<' and '>' characters are stripped from every string cell
#  - string cells ending with '-' are replaced by NaN
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Log', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Apply the working column names through the project helper col_ren.
name = ['ID','Litho_top', 'Litho_base', 'Keyword', 'Description']
df = col_ren(df, name=name, mode=1, )
# Skip the first row and take a copy, so that the column added below (and the in-place
# edits of the following cells) act on an independent frame rather than on a slice.
df = df[1:].copy()
# Every log row gets the same drilling date.
df['Date_for'] = dtm.datetime(2019,12,18)

In [None]:
df.drop(index=df.query('Litho_base.isnull() or Litho_top.isnull()').index, inplace=True)

In [None]:
compute_BH_length(df, id_col='ID', length_col_name='Long_for', top_col='Litho_top', base_col='Litho_base',)

In [None]:
# Default type for every row, then override the rows labelled 65 to 80
# (.loc slicing is label-based and inclusive on both ends).
df['Type'] = 'Piezo'
# NOTE(review): lower-case 'piezair' here, while the 'Données de forage' cell of this
# notebook writes 'Piezair' — confirm which spelling downstream code expects.
df.loc[65:80, 'Type'] = 'piezair'

In [None]:
df['Emplacement'] = 'Extension Pilote'
df.loc[83:, 'Emplacement'] = 'Mini-pilote' #piezair

In [None]:
litho = df
source_litho = litho

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# os.makedirs creates tmp_dir on the way; exist_ok=True keeps the cell re-runnable and
# also covers the case where tmp_dir exists without 'source_merge/' (the former
# `if not os.path.exists(tmp_dir)` guard skipped both creations in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet tables (commented-out exports are disabled for this sheet)
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Tables accumulated over the current source file
#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Echantillon' + 'Organoleptique'**

In [None]:
tmp_dir= save_dir + 'donnees_terrain_2019/'
sheet='Echantillon'

In [None]:
# Load the 'Echantillon' sheet (first row skipped), then clean it:
#  - na_line_drop / na_col_drop are project helpers; presumably they drop (nearly)
#    empty rows / columns — TODO confirm the meaning of their numeric argument
#  - '<' and '>' characters are stripped from every string cell
#  - string cells ending with '-' are replaced by NaN
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Echantillon', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID','Ech_top', 'Ech_base', 'ID_ech']
df=col_ren(df, name=name, mode=1)
df.insert(1,'Type_ech','Sol')

In [None]:
df.drop(index=[43,44,55,56,66], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
ech=df.copy()

In [None]:
# Load the 'Organoleptique' sheet (first row skipped), then clean it:
#  - na_line_drop / na_col_drop are project helpers; presumably they drop (nearly)
#    empty rows / columns — TODO confirm the meaning of their numeric argument
#  - '<' and '>' characters are stripped from every string cell
#  - string cells ending with '-' are replaced by NaN
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Organoleptique', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,4)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID','Pol_top', 'Pol_base','Polluant','Intensite']
df=col_ren(df, name=name, mode=1)

In [None]:
df.drop(index=[10,11,14,15], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
mdf, conflict_df =data_merger(ech, df, on='ID', how='outer')

In [None]:
an = mdf
source_an = an

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# os.makedirs creates tmp_dir on the way; exist_ok=True keeps the cell re-runnable and
# also covers the case where tmp_dir exists without 'source_merge/' (the former
# `if not os.path.exists(tmp_dir)` guard skipped both creations in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet tables (commented-out exports are disabled for this sheet)
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Tables accumulated over the current source file
#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Données de forage'**

In [None]:
tmp_dir= save_dir + 'donnees_terrain_2019/'
sheet='Donnees_forage'

In [None]:
# Load the 'Données de forage' sheet (first row skipped), then clean it:
#  - na_line_drop / na_col_drop are project helpers; presumably they drop (nearly)
#    empty rows / columns — TODO confirm the meaning of their numeric argument
#  - '<' and '>' characters are stripped from every string cell
#  - string cells ending with '-' are replaced by NaN
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Données de forage', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID', 'X', 'Y', 'Z', 'Date_for', 'Long_for', 'Methode', 'Diam_for','Rmq', 'Long_pz', 'Diam_pz', 
      'Sect_crep','Societe', 'Resp_chantier']
df=col_ren(df, name=name, mode=1)
df.drop(index=[16,23], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.insert(5, 'Type', 'Piezo')
df.loc[16:21,'Type']='Piezair'

In [None]:
df.loc[9,'ID']='224 bis'

In [None]:
# Derive the refusal flag / refusal type from the free-text remark column, and split
# the piezometer diameter into external and internal values.
df['Refus'] = ''
df['Type_refus']=''

# Relies on df having a default 0..n-1 index (labels are generated with range()).
for i in range(len(df['Rmq'])):
    val = str(df.loc[i,'Rmq'])
    # a remark mentioning 'bloqué' marks the borehole as refused
    if re.search('[Bb]loqué', val) :
        df.loc[i,'Refus'] = 'x'
        
        # the first matching keyword gives the refusal type; it stays '' otherwise
        if re.search('[lL]aitier', val):
            df.loc[i,'Type_refus'] = 'Laitier'
        elif re.search('[Bb]éton', val):
            df.loc[i,'Type_refus'] = 'Béton'
        elif re.search('[Mm]atériaux', val):
            df.loc[i,'Type_refus'] = 'Matériaux indurés' 
    else: 
        df.loc[i,'Refus'] = '' 

# 'Diam_pz' is parsed as '<ext>x<int>' with any 'mm' removed: the part before 'x' is
# stored as the external diameter, the part after it as the internal one.
# Null entries are passed through unchanged.
df['Diam_int_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace('mm','').split('x')[1]) if not pd.isnull(x) else x)
df['Diam_ext_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace('mm','').split('x')[0]) if not pd.isnull(x) else x)
df['Diam_for'] = df['Diam_for'].apply(lambda x: pd.to_numeric(x) if not pd.isnull(x) else x)

df.insert(10, 'Diam_ext_pz', df.pop('Diam_ext_pz')) # move to a specified position
df.insert(11, 'Diam_int_pz', df.pop('Diam_int_pz'))
df.drop(columns=['Rmq', 'Diam_pz'], axis=1, inplace=True)
df.drop(df.query("ID!=ID").index, inplace=True) # delete all ID='NaN' lines
df.reset_index(drop=True, inplace=True)

#gen_dated_id(df,'ID','Date_for')  

In [None]:
bh = df
source_bh = bh

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# os.makedirs creates tmp_dir on the way; exist_ok=True keeps the cell re-runnable and
# also covers the case where tmp_dir exists without 'source_merge/' (the former
# `if not os.path.exists(tmp_dir)` guard skipped both creations in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet tables (commented-out exports are disabled for this sheet)
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Tables accumulated over the current source file
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Equipement'**

In [None]:
tmp_dir= save_dir + 'donnees_terrain_2019/'
sheet='Equipement'

In [None]:
# Load the 'Equipement' sheet (first row skipped), then clean it:
#  - na_line_drop / na_col_drop are project helpers; presumably they drop (nearly)
#    empty rows / columns — TODO confirm the meaning of their numeric argument
#  - '<' and '>' characters are stripped from every string cell
#  - string cells ending with '-' are replaced by NaN
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Equipement', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.drop(columns=['Déplacement'], inplace=True)
name=['ID','Equip_top', 'Equip_base', 'Diam_for', 'Diam_ext_pz', 'Legende']
df=col_ren(df, mode=1, name=name)

In [None]:
df.drop(index=[24,25], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
compute_BH_length(df, top_col='Equip_top', base_col='Equip_base')

In [None]:
df['Type'] = 'Piezo'

In [None]:
dataframe_viewer(df)

In [None]:
eqp = df
source_eqp = eqp

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# os.makedirs creates tmp_dir on the way; exist_ok=True keeps the cell re-runnable and
# also covers the case where tmp_dir exists without 'source_merge/' (the former
# `if not os.path.exists(tmp_dir)` guard skipped both creations in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet tables (commented-out exports are disabled for this sheet)
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Tables accumulated over the current source file
#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Piézométrie'**

In [None]:
tmp_dir= save_dir + 'donnees_terrain_2019/'
sheet='piezometrie'

In [None]:
# Load the 'Piézométrie' sheet (first row skipped), then clean it:
#  - na_line_drop / na_col_drop are project helpers; presumably they drop (nearly)
#    empty rows / columns — TODO confirm the meaning of their numeric argument
#  - '<' and '>' characters are stripped from every string cell
#  - string cells ending with '-' are replaced by NaN
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Piézométrie', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID','Niv_pz_sol', 'Type_ech', 'Date_mes']
df=col_ren(df, name=name, mode=1)

In [None]:
mes = df
source_mes = mes

In [None]:
# Make sure the sheet folder and its 'source_merge/' sub-folder both exist.
# os.makedirs creates tmp_dir on the way; exist_ok=True keeps the cell re-runnable and
# also covers the case where tmp_dir exists without 'source_merge/' (the former
# `if not os.path.exists(tmp_dir)` guard skipped both creations in that case).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet tables (commented-out exports are disabled for this sheet)
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Tables accumulated over the current source file
#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')