# DATA ORGANIZATION

In [None]:
from utils.io import update_dict, gen_id_dated, dataframe_viewer, gen_geodf_geom, data_merger, data_validation, \
data_slicer, replicate_values, collect_measure, collect_time_data, na_col_drop, na_line_drop, col_ren, \
dble_col_drop, dict_viewer

from utils.config import DEFAULT_POL_LEXICON, pol_field_model 
from difflib import get_close_matches

import re, os
import numpy as np
import geopandas as gpd
import pandas as pd
import datetime as dtm
import matplotlib.pyplot as plt
from definitions import ROOT_DIR

In [None]:
def compute_BH_length(df, id_col='ID', length_col_name='Long_for', top_col='Intv_top', base_col='Intv_base', verbose=False):
    """Add, in place, a per-ID length column computed from interval tops and bases.

    For every distinct value of ``id_col`` the length is the largest value of
    ``base_col`` minus the smallest value of ``top_col`` among the rows sharing
    that ID (missing values are ignored). The result is written to every row of
    the ID and the new column is placed right after ``id_col``.

    Side effects on ``df``:
      * ``top_col`` and ``base_col`` are converted to float64; values that
        ``float()`` cannot convert become NaN;
      * rows where both ``top_col`` and ``base_col`` are missing are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table of intervals, modified in place.
    id_col : str
        Column identifying the object each interval belongs to.
    length_col_name : str
        Name of the column to create.
    top_col, base_col : str
        Columns holding the top and the base of each interval.
    verbose : bool
        When True, print the rows that are read and the computed lengths.

    Returns
    -------
    None

    Raises
    ------
    NameError
        If ``length_col_name`` already exists in ``df``.
    """
    if length_col_name in df.columns:
        raise NameError(f'{length_col_name} is already in columns. Give another name')

    def _to_float(value):
        # Same conversion rule as float(); None and other unsupported objects
        # are treated as missing instead of aborting the whole computation.
        try:
            return float(value)
        except (ValueError, TypeError):
            return np.nan

    for col in (top_col, base_col):
        df[col] = df[col].map(_to_float).astype('float64')

    if verbose:
        for label, id_, top, base in zip(df.index, df[id_col], df[top_col], df[base_col]):
            print(label, id_, top, base)

    # One aggregation per ID instead of re-filtering the frame for each row.
    # groupby max/min skip NaN, so the result no longer depends on row order,
    # and any hashable ID type (str, Python or numpy numbers) is supported.
    groups = df.groupby(id_col, sort=False)
    span = groups[base_col].max() - groups[top_col].min()
    if verbose:
        print(span)

    # Rows whose ID is missing match no group and simply get NaN.
    df[length_col_name] = df[id_col].map(span)

    both_missing = df[base_col].isnull() & df[top_col].isnull()
    df.drop(index=df.index[both_missing], inplace=True)
    df.insert(df.columns.to_list().index(id_col) + 1, length_col_name, df.pop(length_col_name))
    

In [None]:
# Column names kept for each kind of table; coi_dict maps a table kind to its list.
bh_cols = ['Date_for','ID','X','Y','Z','Zsol','Type','Long_for','Long_pz','Long_crep','Long_pz-sol','haut_pz-sol',
           'Diam_for','Diam_int_pz','Diam_ext_pz','Refus','Societe','Method','ID_date','Rmq']

mes_cols = ['Date_mes','ID','X','Y','Z','Zsol','Params','Rmq']

eqp_cols = ['Date_for','ID','X','Y','Z','Zsol','Type_equip','Equip_base','Equip_top','Rmq']

litho_cols = ['Date_for','ID','X','Y','Z','Zsol','Long_for','Intv_top','Intv_base','Description','Rmq']

an_cols = ['Date_for','ID','X','Y','Z','Zsol','Date_ech','ID_ech','Type_ech','Intv_top','Intv_base','Organo','Rmq']

# 'Type' used to appear twice in this list; selecting columns with a duplicated
# label would return the column twice.
ukw_cols = ['Date_for','ID','X','Y','Z','Zsol','Type','Long_for','Method','Societe','Rmq']

coi_dict = {'borehole': bh_cols, 'measure': mes_cols, 'lithology': litho_cols, 'analysis': an_cols, 
 'equipement': eqp_cols, 'unknown': ukw_cols}

In [None]:
params_kw = ['niv_eau', 'temp', 'CE', 'pH', 'ORP']

variables utilisées 
==========================
- bh 	: 	forages (simple ou piezo)
- equip	:	equipements d'un forage (outils, méthodes utilisés, ...)
- ukw	:	objets physiques indéterminés
- litho :	descriptions lithologiques
- an 	: 	analyses de contaminants sur des échantillons (sol, eau)
- mes	:	mesures de propriétés sur des échantillons (sol, eau), de paramètres hydrochimiques, ...


source dataframes initialization

In [None]:
# Variable initialisation: every source table starts as its own empty DataFrame.
# Binding all names to one shared object would let an in-place change made
# through one name show up under the five others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
source_litho, source_an, source_mes = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 1- Profils sols et données forages.xls
* **Sheet : 'Données de forage'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/profils_sols_donnees_forages/'
sheet='donnees_forage'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', 
                   sheet_name='Données de forage')#, skiprows=2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.rename(columns={'Date':'Date_for','Profondeur':'Long_for', 'Méthode':'Method', 
                        'Diamètre forage':'Diam_for','Niv. Eau p/r sol':'Niv_eau_sol',
                        'PZ Prof.':'Long_pz', 'PZ Diamètre':'Diam_pz','PZ L.crépinée':'Long_crep', 
                        'Société forage':'Societe'}, inplace=True)

In [None]:
# A row with a piezometer depth is a piezometer, otherwise a plain borehole
df['Type'] = df['Long_pz'].apply(lambda x: 'Forage' if pd.isnull(x) else 'Piezo')


def _refusal_cause(remark):
    """Return the obstacle named in a 'bloqué' remark.

    NaN when the remark does not mention a blocked borehole, '' when it does
    but names none of the known obstacles.
    """
    text = str(remark)
    if not re.search('[Bb]loqué', text):
        return np.nan
    if re.search('[lL]aitier', text):
        return 'Laitier'
    if re.search('[Bb]éton', text):
        return 'Béton'
    if re.search('[Mm]atériaux', text):
        return 'Matériaux indurés'
    return ''


# Applied value by value, so the result does not depend on the index being 0..n-1
df['Refus'] = df['Remarque'].apply(_refusal_cause).astype(object)

# convert diameter values unit from mm to m
# 'Diam_pz' is split on 'x': the first part is stored as external diameter, the second as internal
df['Diam_int_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace(' mm','').split('x')[1].strip(' m'))/1000 
                                        if not pd.isnull(x) else x)
df['Diam_ext_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace(' mm','').split('x')[0].strip(' m'))/1000 
                                        if not pd.isnull(x) else x)
df['Diam_for'] = df['Diam_for'].apply(lambda x: pd.to_numeric(x)/1000 if not pd.isnull(x) else x)

df.insert(7, 'Diam_ext_pz', df.pop('Diam_ext_pz')) # move to a specified position
df.insert(8, 'Diam_int_pz', df.pop('Diam_int_pz'))
df.drop(columns=['Remarque', 'Diam_pz'], inplace=True)
df.drop(df.query("ID!=ID").index, inplace=True) # delete all ID='NaN' lines

gen_id_dated(df,'ID','Date_for')  

In [None]:
# pandas >= 2.0 rejects the unit-less 'datetime64' dtype, so the unit is spelled out
df['Date_for'] = df['Date_for'].astype('datetime64[ns]')

In [None]:
# Borehole attributes kept from this sheet
bh = df[['ID_date','ID','X','Y','Z','Date_for','Type','Long_for','Diam_for','Long_pz','Long_crep',
         'Diam_ext_pz','Diam_int_pz','Refus','Method','Societe']]

# Water-level columns; the drilling date is reused as the measurement date.
# rename() returns a new frame, so nothing is modified in place on a selection of df.
mes = df[['ID','X','Y','Z','Date_for','Niv_eau_sol']].rename(columns={'Date_for':'Date_mes'})

In [None]:
mes = collect_measure(mes, params_kw)

In [None]:
source_mes = mes
source_bh = bh

In [None]:
# Create the output folder together with its 'source_merge' subfolder. exist_ok also
# covers the case where tmp_dir already exists but the subfolder does not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Piézométrie'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/profils_sols_donnees_forages/'
sheet='piezometrie'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Piézométrie', skiprows=1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
sdf = na_col_drop(df[:12], 3)
sdf.rename(columns={'z':'Z'}, inplace=True)

In [None]:
# Row 16 is used as the header row: give each empty cell of that row a
# placeholder name built from the position of its column ('col0', 'col1', ...).
for position, column in enumerate(df.columns):
    if pd.isnull(df.loc[16, column]):
        df.loc[16, column] = 'col' + str(position)

In [None]:
# Keep an untouched copy of the sheet the first time this cell runs, so the
# cell can be re-executed from the same starting point.
if 'tmp_df' not in vars():
    tmp_df = df.copy()

df = tmp_df.copy()
# Replace remaining missing cells of row 16 by '' and use that row as column names
df.loc[16] = df.loc[16].apply(lambda x: '' if pd.isnull(x) else x)
df.columns = df.loc[16]

In [None]:
df=df[17:]
df.reset_index(inplace=True, drop=True)

#df.drop(columns=[df.columns.to_list()[x] for x in range(0,8)
#                      if re.compile(r"col|unnamed").match(df.columns.to_list()[x])], axis=1, inplace=True) 

In [None]:
df.rename(columns={'col8':'Date_mes', 'col9':'Nappe', 'col10':'ID', 'NP/piézo [m]':'Niv_eau_pz', 
                        'dim. piezo hors sol [m]':'haut_pz-sol', 'NP/sol [m]':'Niv_eau_sol', 
                        'Prof. piézo/piézo [m]':'Long_pz', 'Prof. piézo/sol [m]':'Long_pz-sol', 
                        't° [°C]':'Temp', 'Observations':'Rmq'}, inplace=True)

In [None]:
df = na_col_drop(df, 3)
df.reset_index(drop=True, inplace=True)

In [None]:
# Single conductivity column in mS/cm: use the mS/cm value when present, otherwise
# the µS/cm value divided by 1000. Cells are read by column label, because
# integer keys on a label-indexed row are deprecated as positional access.
df['CE'] = df[['CE [µS/cm]', 'CE [mS/cm]']].apply(
    lambda row: row['CE [µS/cm]']/1000 if pd.isnull(row['CE [mS/cm]']) else row['CE [mS/cm]'], axis=1) # mS/cm
df.drop(columns=['CE [µS/cm]', 'CE [mS/cm]'], inplace=True)
# Every 'P' of an ID is replaced by 'F'
df['ID'] = df['ID'].apply(lambda x: re.sub('P','F',x) if not pd.isnull(x) else x)
df.insert(0, 'ID', df.pop('ID')) # move to first column
df['Type'] = 'Piezo'

In [None]:
df.rename_axis(None, inplace=True, axis=1)
df.drop(df.query("ID!=ID").index, inplace=True) # supprimer les lignes avec ID='NaN'
df.reset_index(inplace=True, drop=True)

In [None]:
df = collect_measure(df, params_kw)

In [None]:
bh = df[['ID', 'Type','Long_pz', 'haut_pz-sol', 'Long_pz-sol']]
mes = df[['ID', 'Date_mes', 'Params', 'Nappe', 'Rmq']]

In [None]:
# pandas >= 2.0 rejects the unit-less 'datetime64' dtype; assign() builds a new
# frame instead of writing into a selection of df.
mes = mes.assign(Date_mes=mes['Date_mes'].astype('datetime64[ns]'))

##### Data merging

In [None]:
bh, conflict_df = data_merger(bh, sdf[['ID', 'Z']], how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
mdf, conflict_df = data_merger(source_bh, bh, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Long_pz_y':list(conflict_df.index)})

In [None]:
dataset = mdf
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
source_bh = mdf.copy()

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
cols_rep = ['X', 'Y', 'Z']
source_bh = replicate_values(source_bh, id_col='ID', cols_to_replicate=cols_rep, suffix=['sup', 'inf'], replace_id=True)
source_mes = replicate_values(source_mes, id_col='ID', cols_to_replicate=cols_rep, suffix=['sup', 'inf'], replace_id=False)

In [None]:
# Create the output folder together with its 'source_merge' subfolder. exist_ok also
# covers the case where tmp_dir already exists but the subfolder does not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Equipement'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/profils_sols_donnees_forages/'
sheet='Equipement'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', 
                   sheet_name='Equipement')#, skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.drop(columns=['Déplacement'], inplace=True)
name=['ID', 'Equip_top', 'Equip_base', 'Diam_for','Diam_int_pz', 'Type_equip']
df=col_ren(df, mode=1, name=name)

In [None]:
compute_BH_length(df, id_col='ID', length_col_name='Long_pz', top_col='Equip_top', base_col='Equip_base')

In [None]:
df['Diam_for'] = df['Diam_for'].apply(lambda x: pd.to_numeric(x)/1000 if not pd.isnull(x) else x)
df['Diam_int_pz'] = df['Diam_int_pz'].apply(lambda x: pd.to_numeric(x)/1000 if not pd.isnull(x) else x)

In [None]:
bh_ = source_bh[['ID', 'X', 'Y', 'Z']]
df, conflict_df = data_merger(bh_, df, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
df = na_line_drop(df, 3, 2)

In [None]:
bh = df[['ID', 'Long_pz','Diam_for', 'Diam_int_pz']]
bh.drop_duplicates(['ID'], inplace=True)
eqp = df[['ID', 'Equip_top', 'Equip_base', 'Type_equip']]

##### Data merging

In [None]:
mdf, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Long_pz_x':list(conflict_df.index), 'Diam_for_y':list(conflict_df.index), 
                           'Diam_int_pz_y':list(conflict_df.index)})

In [None]:
dataset = mdf
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
source_bh = mdf.copy()

In [None]:
data = source_bh
source_bh = replicate_values(data, 'ID', list(data.columns)).drop_duplicates(list(data.columns))

In [None]:
source_eqp = eqp

In [None]:
# Create the output folder together with its 'source_merge' subfolder. exist_ok also
# covers the case where tmp_dir already exists but the subfolder does not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheets: 'Echantillon' + 'Organoleptique'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/profils_sols_donnees_forages/'
sheet='Echant-organo'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Echantillon')#, skiprows=1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.rename(columns={'De':'Ech_top', 'A':'Ech_base', 'Numéro':'ID_ech'}, inplace=True)

In [None]:
sdf = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Organoleptique')#, skiprows=1)
sdf.replace(r'<|>','', inplace=True, regex=True)
sdf.replace(r'-$',np.nan, inplace=True, regex=True)
dataframe_viewer(sdf, rows=5)

In [None]:
name=['ID', 'Ech_top', 'Ech_base', 'Polluant','Intensite']
sdf=col_ren(sdf, mode=1, name=name)

In [None]:
sdf = collect_measure(sdf, params_kw=['pol', 'inten'], params_col='Organo')

##### Data merging

In [None]:
mdf, conflict_df = data_merger(df, sdf, 'outer', ['ID', 'Ech_top', 'Ech_base'])
mdf['Type_ech']='Sol'
mdf.insert(4, 'Type_ech', mdf.pop('Type_ech'))

In [None]:
# Rows without a sample ID get a generated one: '<ID>_org_<n>', where n counts
# the generated IDs in row order, starting at 1.
missing_ids = mdf.index[mdf['ID_ech'].isnull()]
cnt = 0
for cnt, i in enumerate(missing_ids, start=1):
    mdf.loc[i, 'ID_ech'] = mdf.loc[i, 'ID'] + '_org_' + str(cnt)

In [None]:
an = mdf.copy()
source_an = an

In [None]:
# Create the output folder together with its 'source_merge' subfolder. exist_ok also
# covers the case where tmp_dir already exists but the subfolder does not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Log'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/profils_sols_donnees_forages/'
sheet='Log'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Log')#, skiprows=1)
dataframe_viewer(df, rows=5)

In [None]:
df.rename(columns={'De':'Litho_top', 'A':'Litho_base'}, inplace=True)

In [None]:
q=df.query('Keyword.str.contains(".ointe", regex=True)', engine='python').index
df.drop(q, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
compute_BH_length(df, id_col='ID', length_col_name='Long_for', top_col='Litho_top', base_col='Litho_base')

In [None]:
litho=df
source_litho=litho

In [None]:
# Create the output folder together with its 'source_merge' subfolder. exist_ok also
# covers the case where tmp_dir already exists but the subfolder does not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

### $\color{red}{\textbf{Excel data final merge}}$

In [None]:
bh_coords = source_bh[['ID', 'X', 'Y', 'Z','Date_for']]

In [None]:
source_eqp, conflict_df = data_merger(bh_coords, source_eqp, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
source_an, conflict_df = data_merger(bh_coords, source_an, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
source_litho, conflict_df = data_merger(bh_coords, source_litho, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
# Create the output folder together with its 'source_merge' subfolder. exist_ok also
# covers the case where tmp_dir already exists but the subfolder does not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Variable initialisation: every source table starts as its own empty DataFrame.
# Binding all names to one shared object would let an in-place change made
# through one name show up under the five others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
source_litho, source_an, source_mes = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 2-Database MEMORIS3.xlsx
* **Sheet : 'PROFILS_SOL'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/database_Memoris3/'
sheet='Profils_sol'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. '+
                        'Siterem - 2017/Database MEMORIS3.xlsx', sheet_name='PROFILS_SOL')#, skiprows=2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=3)

In [None]:
df = na_col_drop(df, 3)

In [None]:
df.rename({'Date':'Date_for', 'N°':'Ref', 'Id':'idx', 'Piézo':'Type', 'Unnamed: 6':'Societe',
                'MFT Ø145':'MFT_145', 'Gouge Ø75':'Gouge_75', 'Liner Ø60': 'Liner_60'}, axis=1, inplace=True)

In [None]:
print(list(set(df['Date_for'].apply(lambda x: x.year if not pd.isnull(x) else x))))

In [None]:
df.loc[df.fillna('').query("Societe.str.contains('x|X')").index, 'Type']='X'

In [None]:
df.loc[df.fillna('').query("Gouge_75.str.contains('SBS|SITER')").index, 'Societe']='SBS Environnement'
df.loc[df.fillna('').query("Gouge_75.str.contains('SBS|SITER')").index, 'Gouge_75']=''

In [None]:
# Propagate values downwards, one row at a time: a filled row copies its value into
# the next row when that one is empty, so a run of empty rows is filled step by step.
# Rows are addressed with .loc[i] for i in 0..len-2, which assumes the index labels
# are the default 0..n-1.
for i in range(len(df['Date_for'])-1):
    # drilling date
    if not pd.isnull(df.loc[i, 'Date_for']) and pd.isnull(df.loc[i+1, 'Date_for']):
        df.loc[i+1, 'Date_for']=df.loc[i, 'Date_for']
        
    # company
    if not pd.isnull(df.loc[i, 'Societe']) and pd.isnull(df.loc[i+1, 'Societe']):
        df.loc[i+1, 'Societe']=df.loc[i, 'Societe']
        
    # type: only propagated between rows sharing the same 'Ref'
    if not pd.isnull(df.loc[i, 'Type']) and pd.isnull(df.loc[i+1, 'Type']) and \
       df.loc[i, 'Ref']==df.loc[i+1, 'Ref']:
        df.loc[i+1, 'Type']=df.loc[i, 'Type']

In [None]:
# Build the 'idx' identifier of each row from the row above it. When two consecutive
# rows share the same 'Ref' and the upper row has a 'Profondeur' text:
#   * if that text contains 'Forage' (first test) or 'Tranch' (second test), the lower
#     row gets <first character of the text> + <Ref>, and that character is kept in w;
#   * otherwise the lower row gets w + <Ref>, i.e. the last remembered character.
# Both tests run on every row, the second one possibly overwriting the first.
# NOTE(review): w is only assigned inside the 'Forage'/'Tranch' branches; if an elif
# branch is reached before any of them, w is undefined here -- confirm the sheet
# always starts with a 'Forage'/'Tranch' row.
for i in range(len(df['idx'])-1):    
    if df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur'])\
    and re.findall('Forage',df.loc[i, 'Profondeur']):
        df.loc[i+1,'idx']=df.loc[i, 'Profondeur'][0]+str(df.loc[i, 'Ref'])
        w=df.loc[i, 'Profondeur'][0]
    elif df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur']):
        df.loc[i+1,'idx']=w+str(df.loc[i, 'Ref'])
    
    # same rule for the 'Tranch' keyword
    if df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur'])\
    and re.findall('Tranch',df.loc[i, 'Profondeur']):
        df.loc[i+1,'idx']=df.loc[i, 'Profondeur'][0]+str(df.loc[i, 'Ref'])
        w=df.loc[i, 'Profondeur'][0]
    elif df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur']):
        df.loc[i+1,'idx']=w+str(df.loc[i, 'Ref'])
     
   # if df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur'])\
   # and re.findall('Moni',df.loc[i, 'Profondeur']):
   #     df.loc[i+1,'idx']=df.loc[i, 'Profondeur'][0]+str(df.loc[i, 'Ref'])
   #     w=df.loc[i, 'Profondeur'][0]
   # elif df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur']):
   #     df.loc[i+1,'idx']=w+str(df.loc[i, 'Ref'])

In [None]:
# NOTE(review): both lines assign df['Ref'] from df['idx'], so the result of the first
# line (keep only values containing 'F' or 'T', '' otherwise) is overwritten by the
# second one. After this cell 'Ref' is 'idx' with 'Monito ' shortened to 'Mon' --
# confirm whether the second line was meant to read from df['Ref'].
df['Ref']=df['idx'].apply(lambda x : x if re.findall('F|T', str(x)) else '')
df['Ref']=df['idx'].apply(lambda x : x.replace('Monito ', 'Mon') if re.findall('Monit', str(x)) else x)

In [None]:
df['Type']=df['Type'].apply(lambda x: 'Piezo' if not pd.isnull(x) else '')

In [None]:
# Turn decimal sub-references into letter suffixes: '.1' -> 'a', '.2' -> 'b',
# '.3' -> 'c', '.4' -> 'd'. Values without such a suffix are left untouched.
# Raw strings keep '\.' a regex escape rather than an invalid string escape.
for digit, letter in (('1', 'a'), ('2', 'b'), ('3', 'c'), ('4', 'd')):
    sub_ref = re.compile(r"\." + digit)
    df['Ref'] = df['Ref'].apply(
        lambda x, sub_ref=sub_ref, letter=letter: sub_ref.sub(letter, str(x)) if sub_ref.search(str(x)) else x)

In [None]:
gen_id_dated(df, ref_col='Ref', date_col='Date_for')

In [None]:
df.loc[df.query('Profondeur!=Profondeur' ).index,'Profondeur']=''

In [None]:
# Name the drilling method after the method column that holds a value.
# The columns are visited in a fixed order and each one overrides the previous,
# so the last filled column of the list wins.
df['Method'] = ''

method_labels = [('Gouge_75', 'Gouge_75'), ('MFT_145', 'MFT_145'), ('Liner_60', 'Liner_60'),
                 ('carottier', 'carrotier'), ('tarrière', 'tarrière')]
for column, label in method_labels:
    df.loc[df[column].notnull(), 'Method'] = label

In [None]:
df.drop(df.query('Profondeur.str.contains("Forage") and Profondeur!="Forage bloqué"', engine='python').index, inplace=True)
df.drop(df.query('Profondeur.str.contains("Tranc") and Profondeur!="Tranchée bloqué"', engine='python').index, inplace=True)
df.drop(df.query('Profondeur.str.contains(".orage|..ranch", regex=True)', engine='python').index, inplace=True)
df.drop(df.fillna('').query('Description.str.contains("^.orage bloq|^.ranc.* bloq|^.*efus", regex=True)', engine='python').index, inplace=True)
df.drop(df.query('Ref!=Ref').index, inplace=True)
df.drop(columns=['MFT_145','Gouge_75','Liner_60', 'carottier', 'tarrière', 'idx'], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df['Litho_top'] = df['Profondeur'].apply(lambda x: x.replace(',','.').split('-')[0].strip(' m'))
df['Litho_base'] = df['Profondeur'].apply(lambda x: x.replace(',','.').split('-')[-1].strip(' m'))

In [None]:
df.rename({'Ref':'ID'}, axis=1, inplace=True)
if 'Profondeur' in df.columns: df.drop(columns=['Profondeur'], axis=1, inplace=True)

In [None]:
set([x[0] for x in list(set(df.ID)) if isinstance(x,str)])

In [None]:
df.loc[df.query('ID_date.str.contains("T")', engine='python').index, 'Type'] = 'Tranchee'
df.loc[df.query('Type==""', engine='python').index, 'Type'] = 'Forage'

In [None]:
# Manual patch: the row labelled 1268 takes its ID and dated ID from the row labelled 1267.
# NOTE(review): these labels are hard-coded, so the patch only hits the intended rows
# as long as the input sheet and the filtering above are unchanged -- verify after any
# change to the source file.
df.loc[1268, ['ID_date','ID']] = df.loc[1267, ['ID_date','ID']]
# Missing or zero-length descriptions are set to ''
df.loc[df.query('Description.isnull() or Description.str.len()<1').index, 'Description'] = ''

In [None]:
df.drop(index=df.query('Litho_base.isnull() or Litho_base.str.len()<1 or ' 
                       'Description.str.contains("Bloqu")').index, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
compute_BH_length(df, id_col='ID', length_col_name='Long_for', top_col='Litho_top', base_col='Litho_base')

In [None]:
df.columns

In [None]:
litho = df[['Date_for','ID','Long_for','Litho_top','Litho_base','Description']]
df = df[['Date_for','ID','Type','Long_for','Societe','Method']]

In [None]:
# One row per ID. drop_duplicates() returns a new frame, so nothing is modified
# in place on the column selection made in the previous cell.
df = df.drop_duplicates('ID')
# Trenches are stored apart as objects of unknown type; the copy makes the
# 'Type' assignment independent of df.
ukw = df.loc[df.query('Type=="Tranchee"', engine='python').index].copy() # trenches
ukw['Type'] = 'inconnu'
# Everything that is not a trench is kept as a borehole
bh = df.drop(index=ukw.index)

In [None]:
source_litho = litho
source_bh = bh
source_ukw = ukw

In [None]:
# Create the output folder together with its 'source_merge' subfolder. exist_ok also
# covers the case where tmp_dir already exists but the subfolder does not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)
ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'DONNEES PIEZOS'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/database_Memoris3/'
sheet='Donnees_piezos'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. '+
                        'Siterem - 2017/Database MEMORIS3.xlsx', sheet_name='DONNEES PIEZOS', skiprows=2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=3)

In [None]:
names = ['Ref_id','ID','Societe','Zone','Sous-zone','X','Y','Zsol','Z','Nappe','Long_pz','Sect_crep',
         'Diam_int_pz','Niv_eau_pz_27/04/2010','Niv_eau_pz_08/09/2010','Niv_eau_sol_27/04/2010',
         'Niv_eau_sol_08/09/2010','Surnageant','Sousnageant',
         'Caractere','Opacite','Rmq']
df = col_ren(df, mode=1, name=names)
df = na_col_drop(df, 3)

In [None]:
df=df.query("ID==ID")
df.replace('-',np.nan, inplace=True)

In [None]:
df['Sousnageant']=df['Sousnageant'].apply(lambda x: x/100 if not pd.isnull(x) else x) #convert unit in [m]
df['Surnageant']=df['Surnageant'].apply(lambda x: x/100 if not pd.isnull(x) else x)
df['Type']=df['Sect_crep'].apply(lambda x: 'Piezo' if not pd.isnull(x) else 'inconnu')

In [None]:
df = df[['ID','X','Y','Z','Zsol','Type','Long_pz','Diam_int_pz','Sect_crep','Nappe','Societe','Zone','Sous-zone',
         'Niv_eau_pz_27/04/2010','Niv_eau_pz_08/09/2010','Niv_eau_sol_27/04/2010','Niv_eau_sol_08/09/2010',
         'Surnageant','Sousnageant','Caractere',
      'Opacite','Rmq']]

In [None]:
df = collect_time_data(df)

In [None]:
# Each table is built from a column selection of df followed by drop_duplicates(),
# which returns a new frame: later column additions no longer write into a
# selection of df.

# Facility description, one row per ID
bh = df[['ID','X','Y','Z','Zsol','Type','Long_pz','Diam_int_pz','Sect_crep','Societe',
         'Zone','Sous-zone']].drop_duplicates('ID')

# Water levels, one row per measurement
mes = df[['Date_mes','ID','X','Y','Z','Niv_eau_pz','Niv_eau_sol']]

# Water sample observations, one row per ID; the facility ID is reused as sample ID
an = df[['Date_mes','ID','X','Y','Z','Nappe','Surnageant','Sousnageant','Caractere','Opacite',
         'Rmq']].drop_duplicates('ID')
an['Type_ech'] = 'Eau'
an.insert(1, 'ID_ech', an.ID)

In [None]:
mes = collect_measure(mes, ['niv'])

In [None]:
# unknown facilities' type (it seems they are not boreholes)
# The copy makes the 'Type' assignment independent of bh.
ukw = bh.query("Type!='Piezo'")[['ID','X','Y','Z','Zsol','Type','Societe']].copy()
ukw['Type'] = 'inconnu'

##### Data merging

In [None]:
source_an = an

In [None]:
source_mes = mes

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_ukw, conflict_df = data_merger(source_ukw, ukw, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Create the output folder together with its 'source_merge' subfolder. exist_ok also
# covers the case where tmp_dir already exists but the subfolder does not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)
ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'DRAINS ET PIEZOS ENEL'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/database_Memoris3/'
sheet='Drains_Pz_ENEL'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. Siterem - 2017/Database MEMORIS3.xlsx', 
                        sheet_name='DRAINS ET PIEZOS ENEL', skiprows=1)

df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=3)

In [None]:
df.insert(5, 'Z', df.pop('PZ absolue (m)'))
df.rename(columns={'N°':'ID', 'Date ':'Date_prv','Hauteur de la chambre ':'Ht_Chbre','T':'Temp', 'ETUDE':'Etude',
                   'Niv_EAU_SOL (m)': 'Niv_eau_sol_01/10/2013', 'Niv_EAU_SOL (m).1':'Niv_eau_sol_14/12/2016', 
                   'Prof_PZ':'Long_pz','Section_crépinée':'Sect_crep', 'Diamètre_int':'Diam_int_pz', 
                   '\nC5-C8':'C5-C8'}, inplace=True)
df = df.query('ID==ID')

In [None]:
df = collect_time_data(df)

In [None]:
cols = list(df.columns)[:-2]
df = replicate_values(df, 'ID', cols, suffix=['prof', 'sup', 'inf'], replace_id=False)

In [None]:
# Divide the values that start with a digit by 1000; anything else becomes NaN.
# The pattern is a raw string so that \d is not parsed as a string escape.
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan) # -> in mS/cm

In [None]:
# Keywords handed to collect_measure to pick the measurement columns.
params_kw = ['Niv_','pH$','CE','Temp','ORP','Odiss']
df = collect_measure(df, params_kw) # '$' in a keyword marks the end of the word
# Remove the rows whose ID contains the text "nan".
df.drop(index=df.query('ID.str.contains("nan", regex=True)', engine='python').index, inplace=True)

In [None]:
mes = df[['ID','X', 'Y', 'Z','Etude','Date_mes','Params']]

In [None]:
df.drop_duplicates('ID', inplace=True)
bh = df[['ID','Etude','X','Y','Z','Zsol','Ht_Chbre','Long_pz','Sect_crep','Diam_int_pz']]
an = df[['ID','Date_prv','Etude','X','Y','Z','arsenic','cadmium','chrome','cobalt','cuivre','mercure','plomb',
         'nickel','zinc','CN_libre','CN_totaux','CN_totaux.1','CN_totaux.2','thiocyanate','benzène','toluène',
         'éthylbenzène','orthoxylène','para- et métaxylène','xylènes','BTEX total','styrène','Iph.','naphtalène',
         'anthracène','phénanthrène','fluoranthène','benzo(a)anthracène','chrysène','benzo(a)pyrène',
         'benzo(ghi)pérylène','benzo(k)fluoranthène','indéno(1,2,3-cd)pyrène','C5-C8','C8-C10','C10-C12','C12-C16',
         ' C16 - C21','C21 - C35','C35 - C40','totaux C10-C35','C10-C12.1','C12-C22','C22-C30','C30-C40','Totaux C10-C40']]

In [None]:
bh.insert(1, 'Type', 'Piezo')
an.insert(2, 'Type_ech', 'Eau')
an.insert(2, 'ID_ech', an.ID)

In [None]:
# Remove the spaces from column names shaped like "<word> - <word>"
# (e.g. ' C16 - C21' -> 'C16-C21'). The mapping is built first and applied in
# one rename, so `an` (a slice of df) is never modified in place.
range_name = re.compile(r'\s*\w+\s*-\s*\w+\s*')
data = an.rename(columns={c: c.replace(' ','') for c in an.columns if range_name.match(c)})
an = data.copy()

In [None]:
an = col_ren(an, name=pol_field_model, mode=1)

In [None]:
an.drop(columns=['CN_totaux_<1>', 'CN_totaux_<2>', 'C12_<1>'], inplace=True)

##### Data merging

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID','Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Create tmp_dir and its source_merge/ sub-folder. exist_ok also covers the case
# where tmp_dir already exists but source_merge/ does not.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)
ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'RESULTS_EAU' (F)**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/database_Memoris3/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. Siterem - 2017/Database MEMORIS3.xlsx', 
                        sheet_name='RESULTS_EAU', skiprows=1)

df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.rename(columns={'Campagne':'Societe','N_piezo.':'ID','Z tête PZ':'Z', 'Prof_PZ':'Long_pz',
                   'Niv_EAU_TETE (m)':'Niv_eau_tete_27/04/2010','Niv_EAU_SOL (m)':'Niv_eau_sol_27/04/2010',
                   'Unnamed: 13':'Niv_eau_tete_08/09/2010','Unnamed: 15':'Niv_eau_sol_08/09/2010',
                   'Section_crépinée':'Sect_crep','Diamètre_int':'Diam_int_pz','Description éch.':'Opacite_eau',
                   'Remarques':'Rmq','Aquifère_échantillonné':'Nappe', 'Caractéristique':'Caractere'}, inplace=True)

df=df.query("ID ==ID")
df.replace('-',np.nan, inplace=True)

In [None]:
an = df[['ID','X','Y','Z','Zsol'] + list(df.columns)[26:]]
df = df[list(df.columns)[:26]]

In [None]:
df = collect_time_data(df)

In [None]:
params_kw = ['pH$', 'CE$', '^T$', 'ORP', 'Odiss', 'Niv_']
df = collect_measure(df, params_kw, params_col='Params')

In [None]:
df.columns

In [None]:
#dataframe_viewer(test.query('ID=="F16M"'), rows=5)
#dataframe_viewer(data.sort_values('ID'), rows=3)
dataframe_viewer(df, rows=5, cols=12, un_val=['ID'])
#dataframe_viewer(source_litho, rows=5), dataframe_viewer(source_an, rows=5)
#dataframe_viewer(bh, rows=5), dataframe_viewer(eqp, rows=5)

In [None]:
df_dict = data_slicer(df, coi_dict)

In [None]:
df_dict['borehole']

In [None]:
# splitting
sdf=df[['ID','X','Y','Z','Zsol']+df.columns.to_list()[12:16]+df.columns.to_list()[21:26]]
an=df[['ID','X','Y','Z','Zsol']+df.columns.to_list()[26:]]
prv_eau=df[df.columns.to_list()[:3]+['X','Y','Z','Zsol']+df.columns.to_list()[16:21]+['Nappe']]
df=df[df.columns.to_list()[:12]]

In [None]:
df['Type']=df['Long_crep'].apply(lambda x: 'Piezo' if not pd.isnull(x) else 'inconnu')
df.insert(8, 'Type', df.pop('Type'))
df['Diam_int_pz'] = df['Diam_int_pz'].apply(lambda x: x*1000 if not pd.isnull(x) else x)

In [None]:
pz=df.query("Type=='Piezo'")
ukw=df.query("Type!='Piezo'")[['ID', 'Societe', 'Zone', 'Sous_zone', 'X', 'Y', 'Z', 'Type']]

In [None]:
prv_eau['Surnageant']=prv_eau['Surnageant'].apply(lambda x: x/100) # to express value in [m]
prv_eau['Sousnageant']=prv_eau['Sousnageant'].apply(lambda x: x/100)

In [None]:
df_tmp=sdf[['ID','X','Y','Z','Zsol']]
cols=sdf.columns.to_list()[9:]
ID_mes=0
d=['27/04/2010', '08/09/2010']

for i in range(len(sdf)):
    for k in [0,1]:
        df_tmp.loc[ID_mes,'ID_mes']='Mes_'+str(ID_mes)
        df_tmp.loc[ID_mes,'ID']=str(sdf.loc[i,'ID'])
        
        dt=d[k].split('/')
        df_tmp.loc[ID_mes,'Date_mes']=dtm.date(int(dt[2]), int(dt[1]),int(dt[0]))
        df_tmp.loc[ID_mes,'Niv_eau_pz']=sdf.iloc[i,k+1]
        df_tmp.loc[ID_mes,'Niv_eau_sol']=sdf.iloc[i,k+3]
        df_tmp.loc[ID_mes, cols]=list(sdf.loc[i,cols])
        ID_mes+=1          
        
df_tmp.replace('-', np.nan, inplace=True)
df_tmp=df_tmp.sort_values('Date_mes').reset_index(drop=True)
df_tmp['ID_mes']=df_tmp['ID'].apply(lambda x: 'Mes_'+str(x))
df_tmp.insert(0, 'ID_mes', df_tmp.pop('ID_mes'))

In [None]:
# Drop the rows that hold no water level at all.
q=df_tmp.query('Niv_eau_pz.isnull() and Niv_eau_sol.isnull()').index
df_tmp.drop(q, inplace=True)
# Divide the CE values that start with a digit by 1000; anything else becomes NaN.
# The pattern is a raw string so that \d is not parsed as a string escape.
df_tmp['CE']=df_tmp['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)
mes_pz=df_tmp
mes_pz['Type_mes'] = 'Phys-chim'

In [None]:
source_mes_pz=source_mes_pz.sort_values('Date_mes').reset_index(drop=True)

In [None]:
dataframe_viewer(mes_pz, rows=3), dataframe_viewer(source_mes_pz, rows=3)

In [None]:
source_mes_pz.columns, mes_pz.columns

In [None]:
# source_mes_pz, conflict_df = data_merger(source_mes_pz, mes_pz, on=['ID', 'Date_mes'], how='outer')
test, conflict_df = data_merger(source_mes_pz, mes_pz, on=['ID', 'Date_mes'], how='outer')

In [None]:
dataframe_viewer(conflict_df, rows=3)

In [None]:
# NOTE(review): `pause` is not assigned anywhere in the visible cells; presumably a
# deliberate NameError used to stop a "Run All" at this point — confirm.
pause

In [None]:
conflict_df.columns

In [None]:
data_validation(overall_data=source_litho, conflict_data=conflict_df, index_col='index', pass_col='ID',
                valid_dict={'Description_x':list(conflict_df.index), 'Societe_x':list(conflict_df.index)})

In [None]:
dataset = source_mes_pz
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
an=na_line_drop(an,col_n=3)

In [None]:
an=dble_col_drop(an)

In [None]:
an.insert(1, 'Type_ech', 'Eau')
an.rename(columns={'ID':'ID_ech'}, inplace=True)
#an['Anl_ID']=an['ID'].apply(lambda x: 'Anl_'+str(x))
#an.insert(an.columns.to_list().index('ID')+1, 'Type_ech', 'Eau')

In [None]:
an=col_ren(an, name=pol_field_model, mode=1)

##### Data merging

In [None]:
source_pz, conflict_df=data_merger(source_pz, pz, 'outer', 'ID')

In [None]:
# NOTE(review): the measures are merged on 'ID' alone here, whereas the trial merge
# earlier in this section keys on ['ID', 'Date_mes'] — confirm which key is intended.
source_mes_pz, conflict_df=data_merger(source_mes_pz, mes_pz, 'outer', 'ID')
if len(conflict_df)>0 :dataframe_viewer(conflict_df, rows=3)

In [None]:
dataframe_viewer(source_mes_pz, rows=3)

In [None]:
source_ukw, conflict_df=data_merger(source_ukw, ukw, 'outer', 'ID')
if len(conflict_df)>0 :dataframe_viewer(conflict_df, rows=3)

In [None]:
len(source_an.columns),len(set(source_an.columns))
source_an.columns

In [None]:
source_an, conflict_df=data_merger(source_an, an, 'outer', 'ID_ech')
if len(conflict_df)>0 :dataframe_viewer(conflict_df, rows=3)

In [None]:
dataframe_viewer(source_an)

In [None]:
# Create tmp_dir and its source_merge/ sub-folder. exist_ok also covers the case
# where tmp_dir already exists but source_merge/ does not.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all lithologies or descriptions data in the source
source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_prv_sol:{len(source_prv_sol)} ; source_prv_eau:{len(source_prv_eau)} ;\n'
     f'source_mes_pz:{len(source_mes_pz)} ; source_an:{len(source_an)} ;')

* **Sheet : 'RESULTS_SOL'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/database_Memoris3/'
sheet='Result_sol'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. Siterem - 2017/'
                   'Database MEMORIS3.xlsx', sheet_name='RESULTS_SOL', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df)

In [None]:
df.rename(columns={'Unnamed: 92':'EOX', 'Unnamed: 93':'Idc_phenol','Campagne':'Societe','N_forage':'ID','refus':'Refus',
                   'Prof.\nforage':'Long', 'N_ech':'ID_ech', 'Min_Ech':'Ech_top','Max_Ech':'Ech_base',
                  'Terrain':'Nappe','Epaisseur remblais':'Ep_remb', 'Epaisseur alluvions':'Ep_alluv', 
                   'Nature':'Polluant','Min_organo':'Pol_top', 'Max_organo':'Pol_base', 'Fraction   2000 µm':'Fract_2000µ',
                   'Fraction   63 µm':'Fract_63µ', 'Fraction   45 µm':'Fract_45µ','Fraction   16 µm':'Fract_16µ',
                   'Fraction   2 µm':'Fract_2µ'}, inplace=True)

In [None]:
# Drop the columns pandas auto-named "Unnamed: N" (str() guards against non-string labels).
unnamed_cols = [c for c in df.columns if re.search(r"Unnamed", str(c))]
df.drop(columns=unnamed_cols, inplace=True)
df.replace(r'<|>','', inplace=True, regex=True)
# 'ID==ID' is False for NaN, so this keeps only the rows that have an ID;
# .copy() makes the result safe to modify below.
df=df.query('ID==ID').copy()
# Shorten 'Monito <n>' to 'Mon<n>' in both identifier columns.
for col in ('ID', 'ID_ech'):
    df[col]=df[col].apply(lambda x : x.replace('Monito ', 'Mon') if re.findall('Monit', str(x)) else x)
df.replace('-',np.nan, inplace=True)

In [None]:
# Prefix with 'F' every ID that starts with a digit.
# Done value by value rather than with a 0..n-1 counter passed to .loc: after
# rows have been filtered out, the index labels no longer match the positions.
df['ID']=df['ID'].apply(lambda x: 'F'+str(x) if re.search(r'^\d+', str(x)) else x)

In [None]:
# .copy(): sdf is modified afterwards, so it must not be a slice of df
sdf=df[['ID','X','Y','Z','Long','ID_ech', 'Ech_top', 'Ech_base', 'Description','Nappe','Ep_remb','Ep_alluv','Refus','Societe','Zone','Sous_zone']].copy()
sdf.insert(5, 'Type', '')

prv_sol=df[['ID','ID_ech', 'X','Y','Z', 'Ech_top', 'Ech_base','Polluant','Intensité', 'Pol_top','Pol_base','MS','pH H2O',
            'T° pH H2O', 'T° pH CaCl2','pH CaCl2','T° pH KCl','pH KCl','T° CE','CE','MO',
       'Résidus chauffage', 'Argile ', 'Fract_2000µ','Fract_63µ','Fract_45µ','Fract_16µ','Fract_2µ']]
#mes_sol=df[['ID','ID_ech','MS','pH H2O', 'T° pH H2O', 'T° pH CaCl2','pH CaCl2','T° pH KCl','pH KCl','T° CE','CE','MO',
#       'Résidus chauffage', 'Argile ', 'Fract_2000','Fract_63','Fract_45','Fract_16','Fract_2']]
prv_sol.insert(2, 'Type_ech', 'Sol')

an=df[['ID','ID_ech','X','Y','Z','Arsenic','Cadmium','Chrome_total','Chrome_VI','Cobalt','Cuivre','Mercure','Plomb','Nickel','Zinc','Libres',
       'Totaux', 'Non chloro destruct.', 'Thiocyantes', 'Cyanures totaux EPA','Benzène', 'Toluène', 'Ethylbenzène',
       'o-Xylènes','mp-Xylènes','Xylènes','SOM BTEX','Styrène','Naphtalène','Anthracene','Phénanthrène',
       'Fluoranthène', 'Benzoaanthracène', 'Chrysène','Benzo(a)pyrene','Benzo(ghi)pérylène','Benzo(k)fluoranthène',
       'Indéno[123cd]pyrène', 'Acenaphtylene', 'Acenaphthene', 'Fluorène','Pyrène', 'Benzo_b_fluoranthene', 
       'Dibenzo[ah]anthracène','SOM VROM 10', 'SOM EPA 16', 'C5_C8', 'C8_C10', 'C10_C12', 'C12_C16','C16_C21', 
       'C21_C35', 'C35_C40', 'SOM_C5_C35', 'C21_C30', 'C30_C35','SOM C10_C40', 'EOX', 'Idc_phenol']]

##### Dataframes processing

In [None]:
sdf['Refus']=sdf['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')

In [None]:
def _normalise_nappe(value):
    """Map a free-text 'Nappe' entry to its canonical name ('' when none matches)."""
    text = str(value)
    # Order matters: the first matching keyword wins.
    # [Rr] instead of [R|r]: inside a character class '|' is a literal pipe.
    for pattern, label in (('[Rr]em', 'Remblais'), ('[Aa]ll', 'Alluvions'),
                           ('[Ss]oc', 'Socle'), ('[Aa]rg', 'Argile')):
        if re.search(pattern, text):
            return label
    return ''

# Applied value by value rather than with a 0..n-1 counter passed to .loc:
# assigning to a label that is missing from the index would append a new row.
sdf['Nappe']=sdf['Nappe'].apply(_normalise_nappe)

In [None]:
litho=sdf #lithologies and all facilities without distinction here (because type of facility not defined clearly !)

In [None]:
prv_sol=na_line_drop(prv_sol, 3)

In [None]:
prv_sol=na_col_drop(prv_sol, col_non_na=5, verbose=False)

In [None]:
an.replace('#',np.nan, inplace=True)
an=na_line_drop(an, 2)
an.insert(1, 'Type_ech', 'Sol')
#an['Anl_ID']=an['ID'].apply(lambda x: 'Anl_'+str(x))
#an.insert(0, 'Anl_ID', an.pop('Anl_ID'))

##### Data merging

In [None]:
dataframe_viewer(litho, rows=3), dataframe_viewer(source_litho, rows=3)

In [None]:
#source_mes_sol=mes_sol
source_prv_sol=prv_sol

In [None]:
dataframe_viewer(conflict_df, rows=3) 

In [None]:
dataset = source_litho
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
source_an, conflict_df=data_merger(source_an, an, 'outer', 'ID_ech', ) 

In [None]:
dataframe_viewer(prv_sol, un_val='ID', rows=3), dataframe_viewer(an, un_val='ID', rows=3) 

In [None]:
# Create tmp_dir and its source_merge/ sub-folder. exist_ok also covers the case
# where tmp_dir already exists but source_merge/ does not.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

### $\color{red}{\textbf{Excel source data merging}}$

In [None]:
excel_bhs, conflict_df = data_merger(source_bh, source_pz, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = excel_bhs
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
excel_bh_litho, conflict_df = data_merger(source_bh, source_litho, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
excel_bh_soil_sp, conflict_df = data_merger(source_bh, source_prv_sol, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
source_bh.columns, source_mes_pz.columns

In [None]:
excel_bh_mes, conflict_df = data_merger(source_bh, source_mes_pz, how='outer', on='ID', dist_max=1., 
                                        drop_skip_col=['index'])

In [None]:
# NOTE(review): save_dir has no trailing '/', so makedirs creates a directory named
# 'final_' inside source_merge/ while the CSVs below are written next to it as
# 'source_merge/final_<name>.csv' — confirm this is the intended layout.
save_dir = tmp_dir + 'source_merge/final_'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
# NOTE(review): excel_bh_equip is not built in this merging section — presumably
# defined earlier in the notebook; verify.
excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)

In [None]:
dataframe_viewer(excel_bh_mes, rows=3)

#### ------------------------------------------------------------------------------------------------------------------

## 3-obsrevations terrain et mesures piézos phase 2.xlsx

* **Sheet : 'Piézométrie'**

In [None]:
# New file, so the source variables must be overwritten !!
# Each name gets its own empty DataFrame: binding them all to one shared object
# would let an in-place operation on one of them silently change the others.
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol = (pd.DataFrame() for _ in range(5))
source_ukw, source_an, source_litho, source_bh = (pd.DataFrame() for _ in range(4))

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/observ_terrain/'
sheet='Piezometrie'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'obsrevations terrain et mesures piézos phase 2.xlsx', sheet_name='Piézométrie', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df)

In [None]:
sdf=df[df.columns.to_list()[:3]]
sdf=na_line_drop(sdf,0)
sdf.rename(columns={'Niveau \npiézométrique':'Niv_eau_sol', 'Commentaires ':'Date_prv'}, inplace=True)

In [None]:
sdf2=df.loc[:11, df.columns.to_list()[3:-1]]
sdf2.rename(columns={'Unnamed: 7':'Date_prv', 'Unnamed: 8':'Nappe', 'Unnamed: 9':'ID', 'NP/piézo [m]':'Niv_eau_pz',
       'dim. piezo hors sol [m]':'Dim_pz_sol', 'NP/sol [m]':'Niv_eau_sol', 'Prof. piézo/piézo [m]':'Long_pz',
       'Prof. piézo/sol [m]':'Long_pz_sol', 'CE [mS/cm]':'CE','t° [°C]':'Temp','O2 dissous\n[%]':'O_diss', 
        'Observations':'Rmq'}, 
           inplace=True)

In [None]:
# Replace a leading 'P' with 'F' in the string IDs; other values are left untouched
# (re.sub would raise on a non-string such as NaN).
sdf2['ID']=sdf2['ID'].apply(lambda x: re.sub(r'^P','F', x) if isinstance(x, str) else x)

# Where CE is missing but the µS/cm column is filled, use that value divided by 1000.
# A boolean mask is used instead of a 0..n-1 counter passed to .loc, because sdf2
# keeps the index labels of df, which need not be contiguous.
only_us = sdf2['CE'].isnull() & sdf2['CE [µS/cm]'].notnull()
sdf2.loc[only_us,'CE']=sdf2.loc[only_us,'CE [µS/cm]']/1000

sdf2.drop(['CE [µS/cm]'], axis=1, inplace=True)

In [None]:
df=df.loc[14:, df.columns.to_list()[3:-1]]
df.rename(columns={'Unnamed: 7':'Date_prv', 'Unnamed: 8':'Nappe', 'Unnamed: 9':'ID', 'NP/piézo [m]':'Niv_eau_pz',
       'dim. piezo hors sol [m]':'Dim_pz_sol', 'NP/sol [m]':'Niv_eau_sol', 'Prof. piézo/piézo [m]':'Long_pz',
       'Prof. piézo/sol [m]':'Long_pz_sol', 'CE [mS/cm]':'CE','t° [°C]':'Temp','O2 dissous\n[%]':'O_diss', 
        'Observations':'Rmq'}, 
           inplace=True)
df.drop([19,20], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Replace a leading 'P' with 'F' in the string IDs; other values are left untouched
# (re.sub would raise on a non-string such as NaN).
df['ID']=df['ID'].apply(lambda x: re.sub(r'^P','F', x) if isinstance(x, str) else x)

# Where CE is missing but the µS/cm column is filled, use that value divided by 1000.
# The boolean mask does not depend on the index labels matching row positions.
only_us = df['CE'].isnull() & df['CE [µS/cm]'].notnull()
df.loc[only_us,'CE']=df.loc[only_us,'CE [µS/cm]']/1000

df.drop(['CE [µS/cm]', 'O_diss'], axis=1, inplace=True)

In [None]:
df=na_col_drop(df, 5)
sdf2=na_col_drop(sdf2, 5,)

In [None]:
prv_eau, conflict_df=data_merger(sdf2, df, how='outer', on='ID')

In [None]:
dataframe_viewer(prv_eau, rows=5, un_val='ID')

In [None]:
prv_eau=prv_eau[['ID','Date_prv','Long_pz', 'Long_pz_sol','Dim_pz_sol','Nappe','Niv_eau_sol', 'Niv_eau_pz',
                 'pH', 'Temp', 'CE', 'ORP','Rmq']]
prv_eau.insert(1,'Type_ech','Eau')

source_prv_eau=prv_eau

In [None]:
# Create tmp_dir and its source_merge/ sub-folder. exist_ok also covers the case
# where tmp_dir already exists but source_merge/ does not.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)

source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

--------------------------------------------------------------------------------------------------------

## 4-profondeur de contact campagne de forages octobre 2019.xlsx

* **Sheet : 'Feuil1'**

In [None]:
# New file, so the source variables must be overwritten !!
# Each name gets its own empty DataFrame: binding them all to one shared object
# would let an in-place operation on one of them silently change the others.
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol = (pd.DataFrame() for _ in range(5))
source_ukw, source_an, source_litho, source_bh = (pd.DataFrame() for _ in range(4))

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Prof_contact_sol_forage/'
sheet='Feuil1'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/profondeur de contact campagne de forages octobre 2019.xlsx', 
                   sheet_name='Feuil1', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df)

In [None]:
df.rename(columns={'n°forage ':'ID','profondeur(m)':'Long_for','x':'X', 'y':'Y', 'z':'Z'}, inplace=True)
df['Type']='Forage' # type is not defined clearly in data
# Build 'F<number>' IDs. Only a trailing '.0' (float formatting of a whole number)
# is removed; a plain str.replace would also delete '.0' in the middle of a value.
df['ID']=df['ID'].apply(lambda x: 'F'+re.sub(r'\.0$', '', str(x)))

bh=df

In [None]:
source_bh=bh

In [None]:
# Create tmp_dir and its source_merge/ sub-folder. exist_ok also covers the case
# where tmp_dir already exists but source_merge/ does not.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

--------------------------------------------------------------------------------------------------------

## 5-Forages_Pilote_Decoupe.xlsx

* **Sheet : 'leve'**

In [None]:
# New file, so the source variables must be overwritten !!
# Each name gets its own empty DataFrame: binding them all to one shared object
# would let an in-place operation on one of them silently change the others.
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol = (pd.DataFrame() for _ in range(5))
source_ukw, source_an, source_litho, source_bh = (pd.DataFrame() for _ in range(4))

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Forage_Pilote/'
sheet='leve_Z_elect_pos'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/geometrie_electrodes_et_sondes/Forages_Pilote_Decoupe.xlsx', 
                   sheet_name='leve')#, skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.columns

In [None]:
df.rename(columns={'Ref_puits':'ID','Niveau mesuré':'Z_mes', 'Niveau corrigé':'Z','Z_diff [m] repere_local':'Diff_Z_local',
                   'long_fin [m]':'Long_for','Pos_Inox_#1 [m]':'Pos_Inox_#1', 
                   'Pos_Inox_#6 [m]':'Pos_Inox_#6', 'Pos_Impol_#3 [m]':'Pos_Impol_#3'}, inplace=True)

In [None]:
df['Type']='Forage' # type is not defined clearly in data
# Build 'F<number>' IDs. Only a trailing '.0' (float formatting of a whole number)
# is removed; a plain str.replace would also delete '.0' in the middle of a value.
df['ID']=df['ID'].apply(lambda x: 'F'+re.sub(r'\.0$', '', str(x)))

elc = df[['ID','Pos_Inox_#6', 'Pos_Impol_#3']] # 'ID' is for boreholes
bh = df[['ID','Z','Diff_Z_local','Long_for', 'Type']]# Z_local origin = 145.5 [m]

In [None]:
source_bh = bh
source_elc = elc

In [None]:
# Create tmp_dir and its source_merge/ sub-folder. exist_ok also covers the case
# where tmp_dir already exists but source_merge/ does not.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
elc.to_csv(tmp_dir+sheet+'_Electrodes.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_elc.to_csv(tmp_dir+'source_merge/source_Electrodes.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

--------------------------------------------------------------------------------------------------------

## 6-Liste XY investigations.xlsx
* **Sheet : 'SOL_EAU'**

In [None]:
# New file, so the source variables must be overwritten !!
# Each name gets its own empty DataFrame: binding them all to one shared object
# would let an in-place operation on one of them silently change the others.
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol = (pd.DataFrame() for _ in range(5))
source_ukw, source_an, source_litho, source_bh = (pd.DataFrame() for _ in range(4))

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Liste_XY/'
sheet='Sol_Eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
                   'supplémentaires/Liste XY investigations.xlsx', sheet_name='SOL')#, skiprows=4)
df['Type_ech']='Sol'

df1 = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
                   'supplémentaires/Liste XY investigations.xlsx', sheet_name='EAU PR')#, skiprows=4)
df1['Type_ech']='Eau'
df1['Nappe']='Socle'

df2 = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
                   'supplémentaires/Liste XY investigations.xlsx', sheet_name='EAU RB')#, skiprows=4)
df2['Type_ech']='Eau'
df2['Nappe']='remblais'

df3 = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
                   'supplémentaires/Liste XY investigations.xlsx', sheet_name='EAU ALL')#, skiprows=4)
df3['Type_ech']='Eau'
df3['Nappe']='Alluvions'

In [None]:
df2=na_line_drop(df2,0)
df2=na_col_drop(df2,1)

In [None]:
mdf, conflict_df=data_merger(df1, df2, 'outer', 'N°')

In [None]:
# DataFrame.append was removed in pandas 2.0; pd.concat performs the same row-wise stacking.
mdf=pd.concat([mdf, df3])
mdf=mdf.dropna(how='any', subset=['N°']) # keep only the rows that have an 'N°'

In [None]:
mdf, conflict_df=data_merger(mdf, df, 'outer', 'N°') 

In [None]:
mdf.rename(columns={'N°':'ID'}, inplace=True)
mdf['Type'] = 'Piezo'
pz=mdf
source_pz = pz # we only have boreholes 'ID' here, no Z, no date

In [None]:
dataframe_viewer(source_bh, rows=5)

In [None]:
# Create tmp_dir and its source_merge/ sub-folder. exist_ok also covers the case
# where tmp_dir already exists but source_merge/ does not.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)    
source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

--------------------------------------------------------------------------------------------------------

## 7-Résultats phase 1_MEMORIS.xls
* **Sheet : 'Résult SOL'**

In [None]:
# New file, so the source variables must be overwritten !!
# Each name gets its own empty DataFrame: binding them all to one shared object
# would let an in-place operation on one of them silently change the others.
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol = (pd.DataFrame() for _ in range(5))
source_ukw, source_an, source_litho, source_bh = (pd.DataFrame() for _ in range(4))

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Phase_1_Memoris/'
sheet='Result_sol'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 1/'
                   'Résultats phase 1_MEMORIS.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
prv_sol=df.loc[:35]
an=df.loc[36:]

In [None]:
# Copy the row of df labelled 0 into `an` under the fractional label 0.5: `an` holds
# the rows labelled 36 and up, so after sort_index() the new row comes first, and
# reset_index then renumbers the rows from 0.
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [None]:
prv_sol.drop(list(range(5)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,1)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
prv_sol=col_ren(prv_sol, name=name, mode=1)

In [None]:
# Expand the one-letter description codes; any other value is kept as is.
prv_sol['Description']=prv_sol['Description'].replace({'R':'Remblais', 'L':'Limons'})

# Flag a refusal with 'x' when the source cell contains an x/X mark, '' otherwise.
# (The previous condition was negated, which flagged exactly the unmarked rows.)
prv_sol['Refus']=prv_sol['Refus'].apply(lambda x: 'x' if re.search('x|X', str(x)) else '')
prv_sol.insert(1,'Type_ech','Sol')

In [None]:
dataframe_viewer(prv_sol, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')

In [None]:
an=col_ren(an, name=pol_field_model, mode=1)

In [None]:
dataframe_viewer(prv_sol, rows=3)

In [None]:
dataframe_viewer(an, rows=5) 

In [None]:
source_prv_sol=prv_sol
source_an=an

In [None]:
# Create tmp_dir and its source_merge/ sub-folder. exist_ok also covers the case
# where tmp_dir already exists but source_merge/ does not.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

In [None]:
excel_soil_an, conflict_df = data_merger(source_an, source_prv_sol, how='outer', on='ID_ech', dist_max=1., drop_skip_col=['index'])

In [None]:
excel_soil_an['ID'] = excel_soil_an['ID_ech'].apply(lambda x: x.split('/')[0]) 

In [None]:
save_dir = tmp_dir + 'source_merge/final_'
# save_dir is used as a file-name prefix (save_dir+'Soil_analysis.csv'), not as
# a folder: create its parent folder rather than a stray, empty 'final_' directory.
os.makedirs(os.path.dirname(save_dir), exist_ok=True)

#excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
#excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
#excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
#excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
#excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
#excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
#excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

excel_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)

* **Sheet : 'Résult EAU'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Phase_1_Memoris/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 1/'
                   'Résultats phase 1_MEMORIS.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)
dataframe_viewer(df, rows=5)

In [None]:
# Rows up to label 32 go to prv_eau, rows from label 33 on go to an.
prv_eau=df.loc[:32]
# Explicit copy: a row is added to 'an' right after (an.loc[0.5] = ...), and
# writing into a bare slice of df raises SettingWithCopyWarning.
an=df.loc[33:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=col_ren(prv_eau, 1)

In [None]:
# Divide 'CE' values that start with a digit by 1000; every other cell (text,
# NaN, ...) becomes NaN. Raw string: '\d' in a plain literal is an invalid escape.
prv_eau['CE']=prv_eau['CE'].apply(lambda x: pd.to_numeric(x)/1000
                                  if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
prv_eau.drop(list(range(5)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)

In [None]:
prv_eau=dble_col_drop(prv_eau)

In [None]:
prv_eau=na_col_drop(prv_eau,1)
prv_eau=na_line_drop(prv_eau,3)
prv_eau.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech','Date_prv','Num_maille','Affectation','X','Y','Zsol','Long_for','Prof_crep','Long_pz',
      'Niv_eau_sol','pH','CE','T']
prv_eau=col_ren(prv_eau, name=name, mode=1)
prv_eau.insert(1,'Type_ech','Eau')

In [None]:
# Strip the square brackets from 'Prof_crep'. The result is assigned back: an
# in-place replace on a column selection may act on a temporary copy.
prv_eau['Prof_crep']=prv_eau['Prof_crep'].replace(r'\[|\]','', regex=True)

# Split 'top-base' on '-' for the whole column at once. A missing cell gives NaN
# for both bounds and a cell without '-' gives NaN for the base, instead of raising.
crep_bounds=prv_eau['Prof_crep'].str.split('-')
prv_eau['Equip_top']=crep_bounds.str[0]
prv_eau['Equip_base']=crep_bounds.str[1]

prv_eau['Type_equip'] = 'Crepine'
prv_eau.drop(columns=['Prof_crep'], inplace=True)

In [None]:
# Normalise the sample IDs: 'Canne ' -> 'Can', line breaks -> spaces. Results are
# assigned back because an in-place replace on a column selection may not reach prv_eau.
prv_eau['ID_ech']=prv_eau['ID_ech'].replace('Canne ', 'Can', regex=True)
prv_eau['ID_ech']=prv_eau['ID_ech'].replace('\n', ' ', regex=True)

In [None]:
# Piezometer table taken from prv_eau. rename() returns a new DataFrame, so the
# later writes to pz do not land on a slice of prv_eau (SettingWithCopyWarning).
pz=prv_eau[['ID_ech','X', 'Y', 'Zsol', 'Long_for','Long_pz', 'Equip_top', 'Equip_base', 'Type_equip']].rename(
    columns={'ID_ech':'ID'})
pz['Type'] = 'Piezo'

In [None]:
# Keep, for each ID, the first run of word characters ending in digit(s) plus one
# optional trailing word character. Done on the whole column (no reliance on a
# 0..n-1 index); an ID without a match becomes NaN instead of raising.
pz['ID']=pz['ID'].str.extract(r"(\w+\d+(?:\w)?)", expand=False)

In [None]:
pz.drop_duplicates(subset=['ID'], keep='first', ignore_index=True, inplace=True)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
# Remove the '<' and '>' signs from string cells.
an.replace(r'<|>','', inplace=True, regex=True)
# Set to NaN every string cell that contains a '-'.
# NOTE(review): the pattern is not anchored (df was cleaned with r'-$' above),
# so a value with an inner hyphen is blanked as well — confirm this is intended.
an.replace(r'-',np.nan, inplace=True, regex=True)
# The first column is renamed 'ID_ech'.
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [None]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
an=col_ren(an, name=pol_field_model, mode=1)

In [None]:
an.columns

In [None]:
dataframe_viewer(an, rows=5) 

In [None]:
source_pz=pz
source_prv_eau=prv_eau
#source_an=source_an.append(an)

In [None]:
excel_water_an, conflict_df = data_merger(an, source_prv_eau, how='outer', on='ID_ech', dist_max=1., drop_skip_col=['index'])

In [None]:
dataframe_viewer(excel_water_an, rows=5)

In [None]:
# Derive 'ID' from 'ID_ech': first run of word characters ending in digit(s) plus
# one optional trailing word character. Column-wise, so it works whatever index
# the merge returned; an ID_ech without a match gives NaN instead of raising.
excel_water_an['ID']=excel_water_an['ID_ech'].str.extract(r"(\w+\d+(?:\w)?)", expand=False)

In [None]:
save_dir = tmp_dir + 'source_merge/final_'
# save_dir is used as a file-name prefix (save_dir+'Water_analysis.csv'), not as
# a folder: create its parent folder rather than a stray, empty 'final_' directory.
os.makedirs(os.path.dirname(save_dir), exist_ok=True)

#excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
#excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
#excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
#excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
#excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
#excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
#excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

excel_water_an.to_csv(save_dir+'Water_analysis.csv', index=False)

In [None]:
# Make sure both output folders exist. makedirs creates tmp_dir on the way, and
# exist_ok=True also covers an existing tmp_dir that still lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

## 8-Résultats phase 2_MEMORIS.xls
* **Sheet : 'Résult SOL'**

In [None]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
# Each variable gets its own empty frame: binding them all to the same _df
# object would make an in-place change to one of them show up in all the others.
(source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol,
 source_ukw, source_an, source_litho, source_bh) = (_df.copy() for _ in range(9))

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Phase_2_Memoris/'
sheet='Result_SOL'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'Résultats phase 2_MEMORIS.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)
dataframe_viewer(df, rows=5)

In [None]:
# Rows up to label 35 go to prv_sol, rows from label 36 on go to an.
prv_sol=df.loc[:35]
# Explicit copy: a row is added to 'an' right after (an.loc[0.5] = ...), and
# writing into a bare slice of df raises SettingWithCopyWarning.
an=df.loc[36:].copy()

In [None]:
an.loc[0.5] = df.loc[1] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [None]:
prv_sol.drop(list(range(5)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,1)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech', 'Date_prv', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
prv_sol=col_ren(prv_sol, name=name, mode=1)

In [None]:
set(prv_sol['Description'])

In [None]:
# Expand the codes found in 'Description' to full names. Only cells equal to a
# code are changed; done column-wise so it does not rely on a 0..n-1 index.
prv_sol['Description']=prv_sol['Description'].replace({
    'R':'Remblais',
    'L':'Limons',
    'LA':'Limons et argiles',
    'LS':'Limons et sables',
})

# 'Refus' becomes '' for cells containing '#' and 'x' for every other cell.
prv_sol['Refus']=prv_sol['Refus'].apply(lambda x: 'x' if not re.search('#', str(x)) else '')
# Constant 'Type_ech' column inserted at position 1.
prv_sol.insert(1,'Type_ech','Sol')

In [None]:
dataframe_viewer(prv_sol, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
# Remove the '<' and '>' signs from string cells.
an.replace(r'<|>','', inplace=True, regex=True)
# Set to NaN every string cell that contains a '-'.
# NOTE(review): the pattern is not anchored (df was cleaned with r'-$' above),
# so a value with an inner hyphen is blanked as well — confirm this is intended.
an.replace(r'-',np.nan, inplace=True, regex=True)
# The first column is renamed 'ID_ech'.
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
# Drop the rows labelled 0-4, then renumber the remaining rows from 0 — the
# same step the other sheets' analysis tables go through after this drop.
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
# Constant 'Type_ech' column inserted at position 1.
an.insert(1,'Type_ech','Sol')

In [None]:
an=col_ren(an, name=pol_field_model, mode=1)

In [None]:
dataframe_viewer(an, rows=5, cols=20) 

In [None]:
source_prv_sol=prv_sol
source_an=an

In [None]:
excel_soil_an, conflict_df = data_merger(prv_sol, an, how='outer', on='ID_ech', dist_max=1., drop_skip_col=['index'])

In [None]:
excel_soil_an['ID'] = excel_soil_an['ID_ech'].apply(lambda x: x.split('/')[0]) 

In [None]:
save_dir = tmp_dir + 'source_merge/final_'
# save_dir is used as a file-name prefix (save_dir+'Soil_analysis.csv'), not as
# a folder: create its parent folder rather than a stray, empty 'final_' directory.
os.makedirs(os.path.dirname(save_dir), exist_ok=True)

#excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
#excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
#excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
#excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
#excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
#excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
#excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

excel_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)

In [None]:
# Make sure both output folders exist. makedirs creates tmp_dir on the way, and
# exist_ok=True also covers an existing tmp_dir that still lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Résult EAU'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Phase_2_Memoris/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'Résultats phase 2_MEMORIS.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Rows up to label 32 go to prv_eau, rows from label 33 on go to an.
prv_eau=df.loc[:32]
# Explicit copy: a row is added to 'an' right after (an.loc[0.5] = ...), and
# writing into a bare slice of df raises SettingWithCopyWarning.
an=df.loc[33:].copy()

In [None]:
an.loc[0.5] = df.loc[1] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=col_ren(prv_eau, 1)

In [None]:
# Divide 'CE' values that start with a digit by 1000; every other cell (text,
# NaN, ...) becomes NaN. Raw string: '\d' in a plain literal is an invalid escape.
prv_eau['CE']=prv_eau['CE'].apply(lambda x: pd.to_numeric(x)/1000
                                  if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
prv_eau.drop(list(range(5)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)

In [None]:
prv_eau=dble_col_drop(prv_eau)

In [None]:
prv_eau=na_col_drop(prv_eau,1)
prv_eau=na_line_drop(prv_eau,3)
prv_eau.reset_index(drop=True, inplace=True)

In [None]:
prv_eau.columns

In [None]:
name=['ID_ech', 'Date_prv','Affectation','X', 'Y','Zsol', 'Long_for','Prof_crep','Long_pz_sol',
      'Niv_eau_sol','pH', 'CE', 'T']
prv_eau=col_ren(prv_eau, name=name, mode=1)
prv_eau.insert(1,'Type_ech','Eau')

In [None]:
# Normalise the sample IDs: 'Canne ' -> 'Can', line breaks -> spaces. Results are
# assigned back because an in-place replace on a column selection may not reach prv_eau.
prv_eau['ID_ech']=prv_eau['ID_ech'].replace('Canne ', 'Can', regex=True)
prv_eau['ID_ech']=prv_eau['ID_ech'].replace('\n', ' ', regex=True)

In [None]:
# Strip the square brackets from 'Prof_crep'. The result is assigned back: an
# in-place replace on a column selection may act on a temporary copy.
prv_eau['Prof_crep']=prv_eau['Prof_crep'].replace(r'\[|\]','', regex=True)

# Split 'top-base' on '-' for the whole column at once. A missing cell gives NaN
# for both bounds and a cell without '-' gives NaN for the base, instead of raising.
crep_bounds=prv_eau['Prof_crep'].str.split('-')
prv_eau['Equip_top']=crep_bounds.str[0]
prv_eau['Equip_base']=crep_bounds.str[1]

prv_eau.drop(columns=['Prof_crep'], inplace=True)
prv_eau['Type_equip'] = 'Crepine'

In [None]:
# Normalise the sample IDs: 'Canne ' -> 'Can', line breaks -> spaces. Results are
# assigned back because an in-place replace on a column selection may not reach prv_eau.
# NOTE(review): the same two replacements already run two cells earlier.
prv_eau['ID_ech']=prv_eau['ID_ech'].replace('Canne ', 'Can', regex=True)
prv_eau['ID_ech']=prv_eau['ID_ech'].replace('\n', ' ', regex=True)

In [None]:
# Piezometer table taken from prv_eau. rename() returns a new DataFrame, so the
# later writes to pz do not land on a slice of prv_eau (SettingWithCopyWarning).
# NOTE(review): 'Type_equip' is not selected here although it is set on prv_eau
# just above — confirm whether it should be part of pz.
pz=prv_eau[['ID_ech', 'X', 'Y', 'Zsol', 'Long_for','Long_pz_sol', 'Equip_top', 'Equip_base']].rename(
    columns={'ID_ech':'ID'})
pz['Type']='Piezo'

In [None]:
# Keep, for each ID, the first run of word characters ending in digit(s) plus one
# optional trailing word character. Done on the whole column (no reliance on a
# 0..n-1 index); an ID without a match becomes NaN instead of raising.
pz['ID']=pz['ID'].str.extract(r"(\w+\d+(?:\w)?)", expand=False)

In [None]:
pz.drop_duplicates(subset=['ID'], keep='first', ignore_index=True, inplace=True)

In [None]:
prv_eau=prv_eau[['ID_ech', 'Date_prv', 'X', 'Y', 'Zsol','Niv_eau_sol', 'pH', 'CE', 'T','Affectation']]

In [None]:
dataframe_viewer(prv_eau, rows=5)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
# Remove the '<' and '>' signs from string cells.
an.replace(r'<|>','', inplace=True, regex=True)
# Set to NaN every string cell that contains a '-'.
# NOTE(review): the pattern is not anchored (df was cleaned with r'-$' above),
# so a value with an inner hyphen is blanked as well — confirm this is intended.
an.replace(r'-',np.nan, inplace=True, regex=True)
# The first column is renamed 'ID_ech'.
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [None]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
an=col_ren(an, name=pol_field_model, mode=1)

In [None]:
dataframe_viewer(an, rows=5) 

In [None]:
source_prv_eau=prv_eau
# Stack the water analyses under the rows already held in source_an.
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row
# order and the original index labels.
source_an=pd.concat([source_an, an])

In [None]:
# Make sure both output folders exist. makedirs creates tmp_dir on the way, and
# exist_ok=True also covers an existing tmp_dir that still lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

In [None]:
excel_water_an, conflict_df = data_merger(an, source_prv_eau, how='outer', on='ID_ech', dist_max=1., drop_skip_col=['index'])

In [None]:
dataframe_viewer(excel_water_an, rows=5)

In [None]:
# Derive 'ID' from 'ID_ech': first run of word characters ending in digit(s) plus
# one optional trailing word character. Column-wise, so it works whatever index
# the merge returned; an ID_ech without a match gives NaN instead of raising.
excel_water_an['ID']=excel_water_an['ID_ech'].str.extract(r"(\w+\d+(?:\w)?)", expand=False)

In [None]:
save_dir = tmp_dir + 'source_merge/final_'
# save_dir is used as a file-name prefix (save_dir+'Water_analysis.csv'), not as
# a folder: create its parent folder rather than a stray, empty 'final_' directory.
os.makedirs(os.path.dirname(save_dir), exist_ok=True)

#excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
#excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
#excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
#excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
#excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
#excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
#excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

excel_water_an.to_csv(save_dir+'Water_analysis.csv', index=False)

## 9-Ensemble des résultats Memoris version Seafile.xls
* **Sheet : 'Résult SOL'**

In [None]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
# Each variable gets its own empty frame: binding them all to the same _df
# object would make an in-place change to one of them show up in all the others.
(source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol,
 source_ukw, source_an, source_litho, source_bh) = (_df.copy() for _ in range(9))

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Memoris_seafile/'
sheet='Result_SOL'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Ensemble des résultats Memoris version Seafile.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
dataframe_viewer(df, rows=5)

In [None]:
# Rows up to label 37 go to prv_sol, rows from label 38 on go to an.
prv_sol=df.loc[:37]
# Explicit copy: a row is added to 'an' right after (an.loc[0.5] = ...), and
# writing into a bare slice of df raises SettingWithCopyWarning.
an=df.loc[38:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [None]:
prv_sol.drop(list(range(5)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,1)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)

In [None]:
prv_sol.drop(columns=prv_sol.columns[[-3,-4]], axis=1, inplace=True)

In [None]:
name=['ID_ech', 'Date_prv', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
prv_sol=col_ren(prv_sol, name=name, mode=1)

In [None]:
set(prv_sol['Description'])

In [None]:
# Expand the codes found in 'Description' to full names ('R ' with a trailing
# space is accepted too). Only cells equal to a code are changed; done
# column-wise so it does not rely on a 0..n-1 index.
prv_sol['Description']=prv_sol['Description'].replace({
    'R':'Remblais',
    'R ':'Remblais',
    'L':'Limons',
    'LA':'Limons et argiles',
    'LS':'Limons et sables',
})

# 'Refus' becomes '' for cells containing '#' and 'x' for every other cell.
prv_sol['Refus']=prv_sol['Refus'].apply(lambda x: 'x' if not re.search('#', str(x)) else '')
# Constant 'Type_ech' column inserted at position 1.
prv_sol.insert(1,'Type_ech','Sol')

In [None]:
dataframe_viewer(prv_sol, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
# Remove the '<' and '>' signs from string cells.
an.replace(r'<|>','', inplace=True, regex=True)
# Set to NaN every string cell that contains a '-'.
# NOTE(review): the pattern is not anchored, so a value with an inner hyphen
# is blanked as well — confirm this is intended.
an.replace(r'-',np.nan, inplace=True, regex=True)
# The first column is renamed 'ID_ech'.
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an=dble_col_drop(an)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')

In [None]:
an=col_ren(an, name=pol_field_model, mode=1)

In [None]:
dataframe_viewer(an, rows=5) 

In [None]:
source_prv_sol=prv_sol
source_an=an

In [None]:
# Make sure both output folders exist. makedirs creates tmp_dir on the way, and
# exist_ok=True also covers an existing tmp_dir that still lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Résult EAU'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Memoris_seafile/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Ensemble des résultats Memoris version Seafile.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Rows up to label 32 go to prv_eau, rows from label 33 on go to an.
prv_eau=df.loc[:32]
# Explicit copy: a row is added to 'an' right after (an.loc[0.5] = ...), and
# writing into a bare slice of df raises SettingWithCopyWarning.
an=df.loc[33:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=col_ren(prv_eau, 1)

In [None]:
# Divide 'CE' values that start with a digit by 1000; every other cell (text,
# NaN, ...) becomes NaN. Raw string: '\d' in a plain literal is an invalid escape.
prv_eau['CE']=prv_eau['CE'].apply(lambda x: pd.to_numeric(x)/1000
                                  if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
prv_eau.drop(list(range(5)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)

In [None]:
prv_eau=dble_col_drop(prv_eau)

In [None]:
prv_eau=na_col_drop(prv_eau,1)
prv_eau=na_line_drop(prv_eau,3)
prv_eau.reset_index(drop=True, inplace=True)

In [None]:
# Drop the column at position 2. The former axis=2 argument was removed: a
# DataFrame only has axes 0 and 1, and 'columns=' already states what to drop.
prv_eau.drop(columns=prv_eau.columns[[2]], inplace=True)

In [None]:
name=['ID_ech', 'Date_prv','Affectation','X', 'Y','Zsol', 'Long_for','Prof_crep','Long_pz_sol', 
      'Niv_eau_sol','pH', 'CE', 'T']
prv_eau=col_ren(prv_eau, name=name, mode=1)
prv_eau.insert(1,'Type_ech','Eau')

In [None]:
# Strip the square brackets from 'Prof_crep'. The result is assigned back: an
# in-place replace on a column selection may act on a temporary copy.
prv_eau['Prof_crep']=prv_eau['Prof_crep'].replace(r'\[|\]','', regex=True)

# Split 'top-base' on '-' for the whole column at once. A missing cell gives NaN
# for both bounds and a cell without '-' gives NaN for the base, instead of raising.
crep_bounds=prv_eau['Prof_crep'].str.split('-')
prv_eau['Equip_top']=crep_bounds.str[0]
prv_eau['Equip_base']=crep_bounds.str[1]

prv_eau.drop(columns=['Prof_crep'], inplace=True)
prv_eau['Type_equip'] = 'Crepine'

In [None]:
# Normalise the sample IDs: 'Canne ' -> 'Can', line breaks -> spaces. Results are
# assigned back because an in-place replace on a column selection may not reach prv_eau.
prv_eau['ID_ech']=prv_eau['ID_ech'].replace('Canne ', 'Can', regex=True)
prv_eau['ID_ech']=prv_eau['ID_ech'].replace('\n', ' ', regex=True)

In [None]:
# Piezometer table taken from prv_eau. rename() returns a new DataFrame, so the
# later writes to pz do not land on a slice of prv_eau (SettingWithCopyWarning).
# NOTE(review): no 'Type' column is added here, unlike the phase 1 / phase 2
# piezometer tables (pz['Type']='Piezo') — confirm whether that is intended.
pz=prv_eau[['ID_ech', 'X', 'Y', 'Zsol', 'Long_for','Long_pz_sol', 'Equip_top', 'Equip_base', 'Type_equip']].rename(
    columns={'ID_ech':'ID'})

In [None]:
# Keep, for each ID, the first run of word characters ending in digit(s).
# Done on the whole column (no reliance on a 0..n-1 index); an ID without a
# match becomes NaN instead of raising.
pz['ID']=pz['ID'].str.extract(r"(\w+\d+)", expand=False)

In [None]:
pz.drop_duplicates(subset=['ID'], keep='first', ignore_index=True, inplace=True)

In [None]:
prv_eau=prv_eau[['ID_ech', 'Date_prv', 'X', 'Y', 'Zsol','Niv_eau_sol', 'pH', 'CE', 'T','Affectation']]

In [None]:
dataframe_viewer(prv_eau, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
# Remove the '<' and '>' signs from string cells.
an.replace(r'<|>','', inplace=True, regex=True)
# Set to NaN every string cell that contains a '-'.
# NOTE(review): the pattern is not anchored (df was cleaned with r'-$' above),
# so a value with an inner hyphen is blanked as well — confirm this is intended.
an.replace(r'-',np.nan, inplace=True, regex=True)
# The first column is renamed 'ID_ech'.
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [None]:
an=dble_col_drop(an)

In [None]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
# Apply the same ID clean-up as on prv_eau: 'Canne ' -> 'Can', line breaks ->
# spaces. Results are assigned back because an in-place replace on a column
# selection may not reach 'an'.
an['ID_ech']=an['ID_ech'].replace('Canne ', 'Can', regex=True)
an['ID_ech']=an['ID_ech'].replace('\n', ' ', regex=True)

In [None]:
an=col_ren(an, name=pol_field_model, mode=1)

In [None]:
dataframe_viewer(an, rows=5) 

In [None]:
source_pz=pz
source_prv_eau=prv_eau
# Stack the water analyses under the rows already held in source_an.
# DataFrame.append was removed in pandas 2.0; pd.concat keeps the same row
# order and the original index labels.
source_an=pd.concat([source_an, an])

In [None]:
# Make sure both output folders exist. makedirs creates tmp_dir on the way, and
# exist_ok=True also covers an existing tmp_dir that still lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

## 10-Résultats SOL container phyto t=0_décret sol.xls
* **Sheet : 'Résult SOL'**

In [None]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
# Each variable gets its own empty frame: binding them all to the same _df
# object would make an in-place change to one of them show up in all the others.
(source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol,
 source_ukw, source_an, source_litho, source_bh) = (_df.copy() for _ in range(9))

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Container_phyto/'
sheet='Result_SOL'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Résultats SOL container phyto t=0_décret sol.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Rows up to label 21 go to prv_sol, rows from label 22 on go to an.
prv_sol=df.loc[:21]
# Explicit copy: a row is added to 'an' right after (an.loc[0.5] = ...), and
# writing into a bare slice of df raises SettingWithCopyWarning.
an=df.loc[22:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [None]:
prv_sol=dble_col_drop(prv_sol)

In [None]:
prv_sol.drop(list(range(5)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,2)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)

In [None]:
prv_sol.drop(columns=prv_sol.columns[[-3]], axis=1, inplace=True)

In [None]:
name=['ID_ech', 'Ech_top', 'Ech_base','MS','Date_prv','Fract_2','Fract_2+']
prv_sol=col_ren(prv_sol, name=name, mode=1)
prv_sol=prv_sol.query('ID_ech==ID_ech')
prv_sol.insert(1,'Type_ech','Sol')

In [None]:
dataframe_viewer(prv_sol, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
# Remove the '<' and '>' signs from string cells.
an.replace(r'<|>','', inplace=True, regex=True)
# Set to NaN every string cell that contains a '-'.
# NOTE(review): the pattern is not anchored (df was cleaned with r'-$' above),
# so a value with an inner hyphen is blanked as well — confirm this is intended.
an.replace(r'-',np.nan, inplace=True, regex=True)
# The first column is renamed 'ID_ech'.
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an=dble_col_drop(an)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')

In [None]:
an=col_ren(an, name=pol_field_model, mode=1)

In [None]:
dataframe_viewer(an, rows=5) 

In [None]:
source_prv_sol=prv_sol
source_an=an

In [None]:
# Make sure both output folders exist. makedirs creates tmp_dir on the way, and
# exist_ok=True also covers an existing tmp_dir that still lacks 'source_merge/'.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Paramètres agro.'**

In [None]:
# Output folder and file-name prefix used by the save cell of this sheet.
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Container_phyto/'
sheet='Param_agro'

In [None]:
# Load the sheet, skipping its first 4 rows.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Résultats SOL container phyto t=0_décret sol.xls', sheet_name='Paramètres agro.', skiprows=4)
# NOTE(review): the numeric arguments of na_line_drop / na_col_drop are defined in utils.io.
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Strip '<' / '>' from string cells; a string cell ending with '-' becomes NaN as a whole.
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Flip the table so each former row becomes a column; the old column labels are discarded.
df=df.transpose()
df.reset_index(drop=True, inplace=True)
# presumably promotes row 0 to column names (col_ren is a project helper) — TODO confirm
df=col_ren(df, 0)

In [None]:
# Remove the row labelled 0, then renumber from 0.
df.drop(list(range(1)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df=dble_col_drop(df)

In [None]:
df=na_col_drop(df,1)
df=na_line_drop(df,3)
df.reset_index(drop=True, inplace=True)

In [None]:
# Drop the columns at positions 5 and 6. No axis argument: columns= already selects the axis,
# and the former axis=2 is not a valid DataFrame axis (it was only tolerated because it was ignored).
df.drop(columns=df.columns[[5,6]], inplace=True)

In [None]:
# Display the current headers before they are replaced below.
df.columns

In [None]:
# Target column names for the agronomic-parameter table, applied through col_ren (mode=1).
name=['ID_ech','Ech_top','Ech_base','MS','Date_prv','MO','Residu_perte_feu','COT','Fract_arg','Fract_min_2µ', 
      'Fract_min_50µ', 'Fract_min_2', 'Fract_2', 'Fract_2+', 'pH_KCl','Tem_pH_mes', 'pH_H20', 'sulfures_tot', 
      'chlorures', 'azote_Kjeldahl']
df=col_ren(df, name=name, mode=1)
# Constant 'Sol' tag inserted as the second column.
df.insert(1,'Type_ech','Sol')

In [None]:
# prv_sol is another name for the same DataFrame object, not a copy.
prv_sol=df

In [None]:
dataframe_viewer(prv_sol, rows=5)

In [None]:
# Store the merged table instead of only displaying it: the save cell of this sheet writes
# source_prv_sol, so without the assignment the export would not contain this sheet's columns.
# data_merger is unpacked as (merged table, conflicts), as in the other merge cells.
source_prv_sol, conflict_df = data_merger(source_prv_sol, prv_sol, on='ID_ech', how='outer', )
source_prv_sol

In [None]:
# Create the whole output tree unconditionally. The previous check only looked at tmp_dir,
# so an existing tmp_dir without its 'source_merge/' sub-folder made the source_* exports fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


# Row counts of every per-file source table, as a quick sanity check.
print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

--------------------------------------------------------------------------------------------------------

## 11-Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx
* **Sheet : 'Résult EAU'**

In [None]:
# New file, so the source variables must be overwritten !!
# Every name gets its own empty DataFrame: binding all of them to one shared object would make
# an in-place change made through one name show up under every other name.
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol = (pd.DataFrame() for _ in range(5))
source_ukw, source_an, source_litho, source_bh = (pd.DataFrame() for _ in range(4))

In [None]:
# Output folder and file-name prefix used by the save cell of this sheet.
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Ext_Pilote/'
sheet='Result_eau'

In [None]:
# Load the sheet, skipping its first 2 rows.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='Résult EAU', skiprows=2)
# NOTE(review): the numeric arguments of na_line_drop / na_col_drop are defined in utils.io.
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Strip '<' / '>' from string cells; a string cell ending with '-' becomes NaN as a whole.
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Split by row label: labels up to 31 form the sample table; the analysis table takes
# labels 0-3 plus every label from 32 up to len(df)-1.
# NOTE(review): range(32, len(df)) assumes the labels run 0..len(df)-1 without gaps.
prv_eau=df.loc[:31]
an=df.loc[list(range(0,4))+list(range(32, len(df)))]

In [None]:
an = an.sort_index().reset_index(drop=True)

In [None]:
# Flip the sample block so each former row becomes a column; the old column labels are discarded.
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
# presumably promotes row 1 to column names (col_ren is a project helper) — TODO confirm
prv_eau=col_ren(prv_eau, 1)

In [None]:
prv_eau=dble_col_drop(prv_eau)

In [None]:
# Remove the rows labelled 0-4, then renumber from 0.
prv_eau.drop(list(range(5)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=na_col_drop(prv_eau,2)
prv_eau=na_line_drop(prv_eau,3)
prv_eau.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech', 'Periode', 'Emplacement','Date_prv','pH','Temp_prv','Temp_pH_mes']
prv_eau=col_ren(prv_eau, name=name, mode=1)
# NaN never equals itself, so this keeps only the rows that have an ID_ech value.
prv_eau=prv_eau.query('ID_ech==ID_ech')
# Constant 'Eau' tag inserted as the second column.
prv_eau.insert(1,'Type_ech','Eau')

In [None]:
dataframe_viewer(prv_eau, rows=3)

In [None]:
# Flip the analysis block so each former row becomes a column.
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
# Strip '<' and '>' characters from string cells.
an.replace(r'<|>','', inplace=True, regex=True)
# NOTE(review): unanchored pattern — every string cell containing a '-' anywhere becomes NaN
# as a whole (the raw sheet was cleaned with the anchored r'-$'). Confirm this is intended.
an.replace(r'-',np.nan, inplace=True, regex=True)
# The first column is renamed to the sample identifier.
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an=dble_col_drop(an)

In [None]:
# rename in a first time before dropping columns (because of names like 'col_xx' in columns)
name=['ID_ech', 'Periode', 'Emplacement','Date_prv', 'METAUX LOURDS', 'Arsenic', 'Cadmium', 'Chrome', 'Cuivre',
      'Mercure', 'Plomb', 'Nickel', 'Zinc', 
      'CYANURES',"Cyanures (libres)  -  NEN-EN-ISO 14403", "CN_totaux - NEN-EN-ISO 14403", 
      "cyanure (APE) - méthode basée sur EPA 335.3", "cyanure complex - méthode interne ", 
      "thiocyanate - méthode interne", "COMPOSES AROMATIQUES VOLATILS", "Benzène", "Toluène", "Éthylbenzène",
      "Orthoxylène", "Para- et métaxylène", "Xylènes", "Styrène", "PHENOLS", "Phénol", "Indice phénol", 
      "HYDROCARBURES AROMATIQUES POLYCYCLIQUES", "Naphtalène", "Acénaphtylène", "Acénaphtène", "Fluorène", 
      "Phénanthrène", "Anthracène", "Fluoranthène", "Pyrène", "Benzo(a)anthracène", "Chrysène", 
      "Benzo(b)fluoranthène", "Benzo(k)fluoranthène", "Benzo(a)pyrène", "Dibenzo(ah)anthracène", 
      "Benzo(ghi)pérylène", "Indéno(1,2,3-cd)pyrène", "HAP Totaux (16) - EPA", 
      "COMPOSES ORGANOCHLORES VOLATILS", "1,1-Dichloroéthane", "1,2-Dichloroéthane", 
      "1,1-dichloroéthène", "Cis-1,2-dichloroéthène", "Totaux (cis,trans) 1,2-dichloroéthènes", 
      "Trans 1,2-dichloroéthylène", "Dichlorométhane", "1,2-dichloropropane", "Tétrachloroéthylène ", 
      "Tétrachlorométhane", "1,1,1-Trichloroéthane", "1,1,2-Trichloroéthane", "Trichloroéthylène",
      "Chloroforme", "Chlorure de vinyle", "HYDROCARBURES TOTAUX", "fraction aromat. >C6-C7", 
      "fraction aromat. >C7-C8", "fraction aromat. >C8-C10", "fraction aliphat. C5-C6", "fraction aliphat. >C6-C8",
      "fraction aliphat. >C8-C10", "Fraction C5 - C8", "Fraction C8 - C10", "Fraction C10-C12", "Fraction C12-C16",
      "Fraction C16 - C21", "Fraction C21 - C35", "Hydrocarbures totaux C10-C35", "METHYL-TERT-BUTYL-ETHER", "MTBE",
     "a","b","c","d","e","f","g"]

an=col_ren(an, name=name, mode=1)
# Drop the last 7 columns, i.e. the ones that received the placeholder names "a".."g".
an=an.iloc[:,:-7]

In [None]:
# Remove the rows labelled 0-2, then renumber from 0.
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
# Constant 'Eau' tag inserted as the second column.
an.insert(1,'Type_ech','Eau')

In [None]:
# presumably maps the raw pollutant headers onto the names in pol_field_model — TODO confirm
an=col_ren(an, name=pol_field_model, mode=1)

In [None]:
dataframe_viewer(an, rows=3)

In [None]:
source_prv_eau=prv_eau
source_an=an

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Param physico'**

In [None]:
# Output folder and file-name prefix used by the save cell of this sheet.
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Ext_Pilote/'
sheet='Param_physico'

In [None]:
# Load the sheet, skipping its first 2 rows.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='param. physico', skiprows=2)
# NOTE(review): the numeric arguments of na_line_drop / na_col_drop are defined in utils.io.
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Strip '<' / '>' from string cells; a string cell ending with '-' becomes NaN as a whole.
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Flip the table so each former row becomes a column; the old column labels are discarded.
df=df.transpose()
df.reset_index(drop=True, inplace=True)

In [None]:
df=col_ren(df, 1)

In [None]:
# Remove the rows labelled 0-1, then renumber from 0.
df.drop(list(range(2)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Split by column position: sdf keeps positions 0-32, df keeps positions 34 onward.
# The column at position 33 lands in neither table.
sdf=df.iloc[:,:33]
df=df.iloc[:,34:]

In [None]:
df=dble_col_drop(df)
sdf=dble_col_drop(sdf)

In [None]:
df=na_line_drop(df,1)
sdf=na_line_drop(sdf,1)

In [None]:
df=na_col_drop(df,1)
sdf=na_col_drop(sdf,1)

In [None]:
# NOTE(review): 'Temp_prv ' carries a trailing space, unlike the 'Temp_prv' column used for the
# other sheets of this notebook; a merge would treat them as two different columns. Confirm.
name=['ID_ech','Periode','Emplacement','Date_prv','Niv_eau_pz','Long_pz','Temp_prv ','pH', 'CE', 'ORP','O_diss']
df=col_ren(df, mode=1, name=name)

In [None]:
# Drop the last column before renaming.
sdf=sdf.iloc[:,:-1]
name=['ID_ech','Periode','Emplacement','Date_prv','Niv_eau_pz','Niv_eau_chbre','pH','Niv_eau_sol','Long_pz',
      'Temp_prv ','CE','ORP','O_diss']
sdf=col_ren(sdf, mode=1, name=name)

In [None]:
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)
sdf['CE']=sdf['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
sdf['Periode'].replace('\n',' ', regex=True, inplace=True)
sdf.replace('\n','', regex=True, inplace=True)

In [None]:
# Clean ID_ech / Emplacement in both tables and record in Rmq what the '*' markers stood for.
# Rows are visited by their real index labels: counting 0..len-1 only works on a gap-free
# index; otherwise a read raises KeyError and a write silently appends a new row.
data=[df, sdf]
for d in data:
    d['Rmq']=''
    for i in d.index:
        e=str(d.loc[i, 'Emplacement'])
        n=str(d.loc[i, 'ID_ech'])  # original text, '*' markers still attached
        d.loc[i,'ID_ech']=n.replace('*', '')

        # Text starting with 'S' -> 'Simulateur', with 'HZS' -> 'Hors simulateur', else missing.
        if re.match('S',e, re.I):
            d.loc[i,'Emplacement']='Simulateur'
        elif re.match('HZS',e, re.I):
            d.loc[i,'Emplacement']='Hors simulateur'
        else:
            d.loc[i,'Emplacement']=np.nan

        # Digits followed by exactly one '*' or exactly two '*' carry different field remarks.
        # Raw strings avoid the invalid '\d' / '\*' escapes of a plain literal.
        if re.match(r'\d+\*{1}$',n, re.I):
            d.loc[i,'Rmq']="mesures faites dans un seau (débit non continu ou peu de débit)"
        elif re.match(r'\d+\*{2}$',n, re.I):
            d.loc[i,'Rmq']="mésures faites dans une eau quasi-stagnante (Piezo rempli de sédiment et débit très faible)"

In [None]:
df.insert(1, 'Type_ech', 'Eau')
sdf.insert(1, 'Type_ech', 'Eau')

In [None]:
prv_eau=data_merger(sdf, df, 'outer', 'ID_ech')[0]

In [None]:
prv_eau=na_col_drop(df,2)
prv_eau=na_line_drop(df,1)
prv_eau.reset_index(drop=True, inplace=True)

In [None]:
# Propagate the last known Emplacement downwards into the missing cells. ffill() leaves leading
# missing values untouched, whereas the manual loop failed with a NameError (or reused a stale
# value from an earlier run) when the very first row was missing.
prv_eau['Emplacement'] = prv_eau['Emplacement'].ffill()

In [None]:
dataframe_viewer(prv_eau, rows=3)

In [None]:
# Outer-merge this sheet's samples into the per-file table on (ID_ech, Date_prv);
# the second returned value holds the conflicts.
source_prv_eau, conflict_df = data_merger(source_prv_eau, prv_eau, on=['ID_ech', 'Date_prv'], how='outer')

In [None]:
dataframe_viewer(conflict_df, rows=3)

In [None]:
# For every conflicting row, name the column variant to use: Emplacement_x, pH_y, Periode_x.
# presumably _x is the left (source_prv_eau) side and _y the right (prv_eau) side — TODO confirm
data_validation(overall_data=source_prv_eau, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Emplacement_x':list(conflict_df.index), 'pH_y':list(conflict_df.index), 
                           'Periode_x':list(conflict_df.index)})

In [None]:
# Create the whole output tree unconditionally. The previous check only looked at tmp_dir,
# so an existing tmp_dir without its 'source_merge/' sub-folder made the source_* exports fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


# Row counts of every per-file source table, as a quick sanity check.
print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Inorganiques et composés majeurs'**

In [None]:
# Output folder and file-name prefix used by the save cell of this sheet.
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Ext_Pilote/'
sheet='Inorganic_major'

In [None]:
# Load the sheet, skipping its first 2 rows.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='inorganiques et composés majeur', skiprows=2)
# NOTE(review): the numeric arguments of na_line_drop / na_col_drop are defined in utils.io.
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Strip '<' / '>' from string cells; a string cell ending with '-' becomes NaN as a whole.
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Split by row label: labels up to 21 form the sample table; the analysis table takes
# labels 0-3 plus every label from 22 up to len(df)-1.
# NOTE(review): range(22, len(df)) assumes the labels run 0..len(df)-1 without gaps.
prv_eau=df.loc[:21]
an=df.loc[list(range(0,4))+list(range(22, len(df)))]

In [None]:
an = an.sort_index().reset_index(drop=True)

In [None]:
# Flip the sample block so each former row becomes a column; the old column labels are discarded.
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=col_ren(prv_eau, 1)

In [None]:
prv_eau=dble_col_drop(prv_eau)

In [None]:
# Remove the rows labelled 0-1, then renumber from 0.
prv_eau.drop(list(range(2)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=na_col_drop(prv_eau,2)
prv_eau=na_line_drop(prv_eau,2)
prv_eau.reset_index(drop=True, inplace=True)

In [None]:
# Display the current headers before they are replaced below.
prv_eau.columns

In [None]:
name=['ID_ech', 'Periode', 'Emplacement','Date_prv','Temp_prv']
prv_eau=col_ren(prv_eau, name=name, mode=1)
# NaN never equals itself, so this keeps only the rows that have an ID_ech value.
prv_eau=prv_eau.query('ID_ech==ID_ech')
# Constant 'Eau' tag inserted as the second column.
prv_eau.insert(1,'Type_ech','Eau')

In [None]:
dataframe_viewer(prv_eau, rows=3)

In [None]:
# Flip the analysis block so each former row becomes a column.
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
# Strip '<' and '>' characters from string cells.
an.replace(r'<|>','', inplace=True, regex=True)
# NOTE(review): unanchored pattern — every string cell containing a '-' anywhere becomes NaN
# as a whole (the raw sheet was cleaned with the anchored r'-$'). Confirm this is intended.
an.replace(r'-',np.nan, inplace=True, regex=True)
# The first column is renamed to the sample identifier.
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an=dble_col_drop(an)

In [None]:
an=na_col_drop(an,3)

In [None]:
# Display the current headers.
an.columns

In [None]:
# Remove the hyphen from this one header.
an.rename(columns={'ammoniaque - libre':'ammoniaque libre'}, inplace=True)

In [None]:
# Remove the rows labelled 0-1, then renumber from 0.
an.drop(list(range(2)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
# Constant 'Eau' tag inserted as the second column.
an.insert(1,'Type_ech','Eau')

In [None]:
# presumably maps the raw pollutant headers onto the names in pol_field_model — TODO confirm
an=col_ren(an, name=pol_field_model, mode=1)
# Align the two remaining raw headers with the names used as merge keys.
an.rename(columns={'Période ':'Periode', 'Date de prélèvement':'Date_prv'}, inplace=True)
#an=an.iloc[:,:-7]

In [None]:
dataframe_viewer(an, rows=3)

In [None]:
dataframe_viewer(source_prv_eau, rows=3)

In [None]:
# Cast the merge key to object dtype before merging.
# presumably so that it compares equal to the object-typed Date_prv of prv_eau — TODO confirm
source_prv_eau.Date_prv = source_prv_eau.Date_prv.astype(object)

In [None]:
# Outer-merge this sheet's samples into the per-file table on (ID_ech, Date_prv).
source_prv_eau, conflict_df=data_merger(source_prv_eau, prv_eau, 'outer', ['ID_ech', 'Date_prv'])

In [None]:
# For every conflicting row, use the Periode_y and Emplacement_y variants.
data_validation(overall_data=source_prv_eau, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Periode_y':list(conflict_df.index), 'Emplacement_y':list(conflict_df.index)})

In [None]:
# Same merge for the analysis table.
# NOTE(review): the conflicts returned here are not inspected or validated afterwards.
source_an, conflict_df=data_merger(source_an, an, 'outer', ['ID_ech', 'Date_prv'])

In [None]:
source_an.Date_prv = source_an.Date_prv.astype(object)

In [None]:
dataframe_viewer(source_an, rows=5, cols=13), dataframe_viewer(source_prv_eau, rows=5, cols=13)

In [None]:
# Create the whole output tree unconditionally. The previous check only looked at tmp_dir,
# so an existing tmp_dir without its 'source_merge/' sub-folder made the source_* exports fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


# Row counts of every per-file source table, as a quick sanity check.
print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

### $\color{red}{\textbf{Excel source data merging}}$

In [None]:
excel_bh_water_an, conflict_df = data_merger(source_an, source_prv_eau, how='outer', on=['ID_ech', 'Date_prv'], drop_skip_col=['index'])

In [None]:
save_dir = tmp_dir + 'source_merge/final_'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

#excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
#excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
#excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
#excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
#excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
#excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

## 12-Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx
* **Sheet : 'Résult EAU'**

In [None]:
# New file, so the source variables must be overwritten !!
# Every name gets its own empty DataFrame: binding all of them to one shared object would make
# an in-place change made through one name show up under every other name.
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol = (pd.DataFrame() for _ in range(5))
source_ukw, source_an, source_litho, source_bh = (pd.DataFrame() for _ in range(4))

In [None]:
# Output folder and file-name prefix used by the save cell of this sheet.
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Pilote/'
sheet='Result_eau'

In [None]:
# Load the sheet, skipping its first 2 rows.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='Résult EAU', skiprows=2)
# NOTE(review): the numeric arguments of na_line_drop / na_col_drop are defined in utils.io.
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Strip '<' / '>' from string cells; a string cell ending with '-' becomes NaN as a whole.
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Split by row label: labels up to 32 form the sample table; the analysis table takes
# labels 0-3 plus every label from 33 up to len(df)-1.
# NOTE(review): range(33, len(df)) assumes the labels run 0..len(df)-1 without gaps.
prv_eau=df.loc[:32]
an=df.loc[list(range(0,4))+list(range(33, len(df)))]

In [None]:
an = an.sort_index().reset_index(drop=True)

In [None]:
# Flip the sample block so each former row becomes a column; the old column labels are discarded.
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=col_ren(prv_eau, 1)

In [None]:
prv_eau=dble_col_drop(prv_eau)

In [None]:
# Remove the rows labelled 0-2, then renumber from 0.
prv_eau.drop(list(range(3)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=na_col_drop(prv_eau,2)
prv_eau=na_line_drop(prv_eau,3)
prv_eau.reset_index(drop=True, inplace=True)

In [None]:
# Display the current headers before they are replaced below.
prv_eau.columns

In [None]:
# 'col_29' is kept in the list as a placeholder name; the column is dropped in a later cell.
name=['ID_ech', 'Periode', 'Emplacement','Date_prv','Niv_eau_pz','Niv_eau_chbre','pH','Temp_prv','CE','ORP',
      'O_diss','col_29','Temp_pH_mes']
prv_eau=col_ren(prv_eau, name=name, mode=1)
# NaN never equals itself, so this keeps only the rows that have an ID_ech value.
# The index is not renumbered afterwards, so it may contain gaps.
prv_eau=prv_eau.query('ID_ech==ID_ech')
# Constant 'Eau' tag inserted as the second column.
prv_eau.insert(1,'Type_ech','Eau')

In [None]:
prv_eau.drop(columns=['col_29'], inplace=True)
prv_eau['CE']=prv_eau['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
prv_eau['Periode'].replace('\n',' ', regex=True, inplace=True)
prv_eau.replace('\n','', regex=True, inplace=True)

In [None]:
data=[prv_eau]
for d in data:
    for i in range(len(d['ID_ech'])):
        e=str(d.loc[i, 'Emplacement'])        
        if re.match('P',e, re.I): 
            d.loc[i,'Emplacement']='Pilote'
        elif re.match('HZP',e, re.I): 
            d.loc[i,'Emplacement']='Hors Pilote'
        else:
            d.loc[i,'Emplacement']=np.nan

In [None]:
dataframe_viewer(prv_eau, rows=3)

In [None]:
# Flip the analysis block so each former row becomes a column.
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
# Strip '<' and '>' characters from string cells.
an.replace(r'<|>','', inplace=True, regex=True)
# NOTE(review): unanchored pattern — every string cell containing a '-' anywhere becomes NaN
# as a whole (the raw sheet was cleaned with the anchored r'-$'). Confirm this is intended.
an.replace(r'-',np.nan, inplace=True, regex=True)
# The first column is renamed to the sample identifier.
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an=dble_col_drop(an)

In [None]:
# Shorten the raw headers (the Emplacement header embeds its own legend) to the standard names.
an.rename(columns={'Période ':'Periode', 'Emplacement \n- P : Pilote \n- HZP : Hors zone pilote':'Emplacement',
                  'Date de prélèvement':'Date_prv'}, inplace=True)

In [None]:
# rename in a first time before dropping columns (because of names like 'col_xx' in columns)
name=['ID_ech', 'Periode', 'Emplacement', 'Date_prv', 'METAUX LOURDS', 'Arsenic', 'Cadmium', 'Chrome', 'Cuivre','Mercure', 'Plomb', 'Nickel', 'Zinc', 
      'CYANURES',"Cyanures (libres)  -  NEN-EN-ISO 14403", "CN_totaux - NEN-EN-ISO 14403", 
      "cyanure (APE) - méthode basée sur EPA 335.3", "cyanure complex - méthode interne ", 
      "thiocyanate - méthode interne", "COMPOSES AROMATIQUES VOLATILS", "Benzène", "Toluène", "Éthylbenzène",
      "Orthoxylène", "Para- et métaxylène", "Xylènes", "Styrène", "PHENOLS", "Phénol", "Indice phénol", 
      "HYDROCARBURES AROMATIQUES POLYCYCLIQUES", "Naphtalène", "Acénaphtylène", "Acénaphtène", "Fluorène", 
      "Phénanthrène", "Anthracène", "Fluoranthène", "Pyrène", "Benzo(a)anthracène", "Chrysène", 
      "Benzo(b)fluoranthène", "Benzo(k)fluoranthène", "Benzo(a)pyrène", "Dibenzo(ah)anthracène", 
      "Benzo(ghi)pérylène", "Indéno(1,2,3-cd)pyrène", "HAP Totaux (16) - EPA", 
      "COMPOSES ORGANOCHLORES VOLATILS", "1,1-Dichloroéthane", "1,2-Dichloroéthane", 
      "1,1-dichloroéthène", "Cis-1,2-dichloroéthène", "Totaux (cis,trans) 1,2-dichloroéthènes", 
      "Trans 1,2-dichloroéthylène", "Dichlorométhane", "1,2-dichloropropane", "Tétrachloroéthylène ", 
      "Tétrachlorométhane", "1,1,1-Trichloroéthane", "1,1,2-Trichloroéthane", "Trichloroéthylène",
      "Chloroforme", "Chlorure de vinyle", "HYDROCARBURES TOTAUX", "fraction aromat. >C6-C7", 
      "fraction aromat. >C7-C8", "fraction aromat. >C8-C10", "fraction aliphat. C5-C6", "fraction aliphat. >C6-C8",
      "fraction aliphat. >C8-C10", "Fraction C5 - C8", "Fraction C8 - C10", "Fraction C10-C12", "Fraction C12-C16",
      "Fraction C16 - C21", "Fraction C21 - C35", "Hydrocarbures totaux C10-C35", "METHYL-TERT-BUTYL-ETHER", "MTBE",
     "a","b","c","d","e","f","g","h"]

In [None]:
an=col_ren(an, name=name, mode=1)
# Drop the last 8 columns, i.e. the ones that received the placeholder names "a".."h".
an=an.iloc[:,:-8]

In [None]:
# Remove the rows labelled 0-2, then renumber from 0.
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
# Constant 'Eau' tag inserted as the second column.
an.insert(1,'Type_ech','Eau')

In [None]:
# presumably maps the raw pollutant headers onto the names in pol_field_model — TODO confirm
an=col_ren(an, name=pol_field_model, mode=1)

In [None]:
an['Periode'].replace('\n',' ', regex=True, inplace=True)
an.replace('\n','', regex=True, inplace=True)

In [None]:
data=[an]
for d in data:
    for i in range(len(d['ID_ech'])):
        e=str(d.loc[i, 'Emplacement'])        
        if re.match('P',e, re.I): 
            d.loc[i,'Emplacement']='Pilote'
        elif re.match('HZP',e, re.I): 
            d.loc[i,'Emplacement']='Hors Pilote'
        else:
            d.loc[i,'Emplacement']=np.nan

In [None]:
dataframe_viewer(an, rows=3)

In [None]:
# Blank every non-numeric value in the measurement columns of both tables; the five descriptive
# columns are left alone. The check uses isinstance instead of comparing type-name strings, so
# numpy numeric scalars (e.g. numpy.float64) count as numbers too and are no longer wiped.
# Booleans are still treated as non-numeric, as before.
data = [prv_eau, an]
for d in data:
    print('-------------')
    for i, r in d.iterrows():
        for c in d.columns:
            if c in ['ID_ech', 'Type_ech', 'Periode', 'Emplacement', 'Date_prv']:
                continue
            v = r[c]
            if isinstance(v, (bool, np.bool_)) or not isinstance(v, (int, float, np.number)):
                d.loc[i, c] = np.nan
                #print(f'{i}- {str(type(r[c]))}- {c} : {r[c]}')

In [None]:
source_prv_eau=prv_eau
source_an=an

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Param physico'**

In [None]:
# Output folder and file-name prefix used by the save cell of this sheet.
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Pilote/'
sheet='Param_physico'

In [None]:
# Load the sheet, skipping its first 2 rows.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='param. physico', skiprows=2)
# NOTE(review): the numeric arguments of na_line_drop / na_col_drop are defined in utils.io.
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Strip '<' / '>' from string cells; a string cell ending with '-' becomes NaN as a whole.
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Flip the table so each former row becomes a column; the old column labels are discarded.
df=df.transpose()
df.reset_index(drop=True, inplace=True)

In [None]:
df=col_ren(df, 1)

In [None]:
# Remove the rows labelled 0-1, then renumber from 0.
df.drop(list(range(2)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Split by column position: sdf keeps positions 0-32, df keeps positions 34 onward.
# The column at position 33 lands in neither table.
sdf=df.iloc[:,:33]
df=df.iloc[:,34:]

In [None]:
df=dble_col_drop(df)
sdf=dble_col_drop(sdf)

In [None]:
df=na_line_drop(df,1)
sdf=na_line_drop(sdf,1)

In [None]:
df=na_col_drop(df,1)
sdf=na_col_drop(sdf,1)

In [None]:
# Drop the last column before renaming.
df=df.iloc[:,:-1]
# NOTE(review): 'Temp_prv ' carries a trailing space, unlike the 'Temp_prv' column of the
# 'Result_eau' sheet of this file; a merge would treat them as two different columns. Confirm.
name=['ID_ech','Periode','Emplacement','Date_prv','Niv_eau_pz','Long_pz','Temp_prv ','pH', 'CE', 'ORP','O_diss']
df=col_ren(df, mode=1, name=name)

In [None]:
sdf.drop(columns=['col_29'], inplace=True)
name=['ID_ech','Periode','Emplacement','Date_prv','Niv_eau_pz','Long_pz','pH','Niv_eau_sol','Temp_prv ','CE',
      'ORP','O_diss','Temp_pH_mes']
sdf=col_ren(sdf, mode=1, name=name)

In [None]:
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)
sdf['CE']=sdf['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
sdf['Periode'].replace('\n',' ', regex=True, inplace=True)
sdf.replace('\n','', regex=True, inplace=True)
sdf.drop(columns=["Niv_eau_sol"], inplace=True)

In [None]:
set(sdf['Emplacement'])

In [None]:
# Normalise Emplacement in both tables: text starting with 'P' -> 'Pilote', with 'HZP' ->
# 'Hors Pilote', anything else -> missing. Rows are visited by their real index labels:
# counting 0..len-1 only works on a gap-free index; otherwise a read raises KeyError and a
# write silently appends a new row.
data=[df, sdf]
for d in data:
    for i in d.index:
        e=str(d.loc[i, 'Emplacement'])

        if re.match('P',e, re.I):
            d.loc[i,'Emplacement']='Pilote'
        elif re.match('HZP',e, re.I):
            d.loc[i,'Emplacement']='Hors Pilote'
        else:
            d.loc[i,'Emplacement']=np.nan

In [None]:
df.insert(1, 'Type_ech', 'Eau')
sdf.insert(1, 'Type_ech', 'Eau')

In [None]:
df.replace('\*|à compléter',np.nan, inplace=True, regex=True)

In [None]:
prv_eau, conflict_df = data_merger(sdf, df, 'outer', 'ID_ech')

In [None]:
source_prv_eau, conflict_df=data_merger(source_prv_eau, prv_eau, on=['ID_ech', 'Date_prv', 'Periode'], how='outer')

In [None]:
dataframe_viewer(source_prv_eau, rows=3)

In [None]:
# Create the whole output tree unconditionally. The previous check only looked at tmp_dir,
# so an existing tmp_dir without its 'source_merge/' sub-folder made the source_* exports fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


# Row counts of every per-file source table, as a quick sanity check.
print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Inorganiques et composés majeurs'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Pilote/'
sheet='Inorganic_major'

In [None]:
# Load the 'inorganiques et composés majeur' sheet, skipping its first two rows.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='inorganiques et composés majeur', skiprows=2)
# Project helpers; presumably they drop empty rows / columns — see utils.io for
# the meaning of the numeric argument.
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Remove the '<' and '>' qualifiers from text cells.
df.replace(r'<|>','', inplace=True, regex=True)
# Text cells matching r'-$' (i.e. ending with a hyphen) are set to NaN.
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Rows up to label 21 describe the samples.
prv_eau = df.loc[:21]
# The analysis part reuses rows 0-3 and adds everything from row 22 onwards.
an_rows = [*range(4), *range(22, len(df))]
an = df.loc[an_rows]

In [None]:
an = an.sort_index().reset_index(drop=True)

In [None]:
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=col_ren(prv_eau, 1)

In [None]:
prv_eau=dble_col_drop(prv_eau)

In [None]:
prv_eau.drop(list(range(2)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=na_col_drop(prv_eau,2)
prv_eau=na_line_drop(prv_eau,2)
prv_eau.reset_index(drop=True, inplace=True)

In [None]:
prv_eau.columns

In [None]:
name=['ID_ech', 'Periode', 'Emplacement','Date_prv','Temp_prv']
# Line breaks inside text cells become spaces.
prv_eau.replace(r'\n',' ', inplace=True, regex=True)
prv_eau=col_ren(prv_eau, name=name, mode=1)
# 'ID_ech==ID_ech' is False for NaN, so this keeps only rows with an identifier.
# The index is renumbered afterwards because a later cell addresses the rows
# with .loc[0..n-1]; gaps left by the filter would break it.
prv_eau=prv_eau.query('ID_ech==ID_ech').reset_index(drop=True)
prv_eau.insert(1,'Type_ech','Eau')

In [None]:
dataframe_viewer(prv_eau, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
# Remove the '<' and '>' qualifiers from text cells.
an.replace(r'<|>','', inplace=True, regex=True)
# NOTE(review): unlike the r'-$' pattern applied when the sheet is loaded, r'-'
# matches a hyphen anywhere, so presumably every text cell containing '-' is set
# to NaN (identifiers or date strings included) — confirm this is intended.
an.replace(r'-',np.nan, inplace=True, regex=True)
# The first column is used as the sample identifier.
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an=dble_col_drop(an)

In [None]:
an=na_col_drop(an,3)

In [None]:
an.rename(columns={'Période ':'Periode', 'Emplacement \n- S : Simulateur \n- HZS : Hors zone simulateur':'Emplacement',
                  'Date de prélèvement':'Date_prv', 'col_9':'ammoniaque libre'}, inplace=True)

In [None]:
an.drop(list(range(2)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
an=col_ren(an, name=pol_field_model, mode=1)
#an=an.iloc[:,:-7]

In [None]:
def _simulator_zone(value):
    """Translate a raw 'Emplacement' code into 'Simulateur', 'Hors simulateur' or NaN.

    The match is case-insensitive and anchored at the start of the text:
    codes starting with 'S' are inside the simulator, codes starting with
    'HZS' are outside of it, anything else (including missing values) is NaN.
    """
    code = str(value)
    if re.match('S', code, re.I):
        return 'Simulateur'
    if re.match('HZS', code, re.I):
        return 'Hors simulateur'
    return np.nan


data=[prv_eau, an]
for d in data:
    # Column-wise mapping: independent of the index labels. The former
    # range(len(...)) + .loc loop needed labels 0..n-1, which a filtered
    # table such as prv_eau does not necessarily have.
    d['Emplacement'] = d['Emplacement'].map(_simulator_zone)

In [None]:
# Descriptive columns that are left untouched.
_meta_cols = ['ID', 'ID_ech', 'Type', 'Type_ech', 'Periode', 'Emplacement', 'Date_prv']


def _is_plain_number(value):
    """True for int/float values (Python or numpy), False for bool and everything else."""
    if isinstance(value, (bool, np.bool_)):
        return False
    return isinstance(value, (int, float, np.integer, np.floating))


data = [prv_eau, an]
for d in data:
    print('-------------')
    for c in d.columns:
        if c in _meta_cols:
            continue
        # Anything that is not a number (text, dates, numeric-looking strings, ...)
        # becomes NaN. isinstance replaces the former comparison on str(type(x)),
        # which also discarded numpy scalars such as numpy.float64.
        d[c] = d[c].map(lambda v: v if _is_plain_number(v) else np.nan)
                #print(f'{i}- {str(type(r[c]))}- {c} : {r[c]}')

In [None]:
dataframe_viewer(an, rows=3)

In [None]:
source_prv_eau.Date_prv = source_prv_eau.Date_prv.astype(object)

In [None]:
source_prv_eau, conflict_df=data_merger(source_prv_eau, prv_eau, 'outer', ['ID_ech', 'Date_prv'] )

In [None]:
data_validation(overall_data=source_prv_eau, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Periode_y':list(conflict_df.index), 'Temp_prv_y':list(conflict_df.index), 
                            'Emplacement_y':list(conflict_df.index)})

In [None]:
source_an, conflict_df=data_merger(source_an, an, 'outer', ['ID_ech', 'Date_prv', 'Periode'])

In [None]:
data_validation(overall_data=source_an, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Emplacement_y':list(conflict_df.index)})

In [None]:
# Make sure both output folders exist. The former check only created
# 'source_merge/' when tmp_dir itself was missing, so an existing tmp_dir
# without that sub-folder made the source_* exports below fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet exports: this sheet produces water samples and their analyses.
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)

# Cumulative (merged) tables for the current source file.
source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)

# Row count of every cumulative table.
print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

### $\color{red}{\textbf{Excel source data merging}}$

In [None]:
excel_bh_water_an, conflict_df = data_merger(source_an, source_prv_eau, how='outer', on=['ID_ech', 'Date_prv', 'Periode'],
                                             drop_skip_col=['index'])

In [None]:
dataframe_viewer(excel_bh_water_an, rows=5)

In [None]:
# save_dir is a path PREFIX ('.../source_merge/final_'), not a folder: files are
# written as '<prefix><name>.csv'. Only its parent folder has to exist; the
# former os.makedirs(save_dir) created a stray, empty 'final_' directory.
save_dir = tmp_dir + 'source_merge/final_'
os.makedirs(os.path.dirname(save_dir), exist_ok=True)

# Only the merged water analyses are exported for this file.
excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)

## 13-Resultats_Siterem_SOL.xlsx
* **Sheet : 'Résult SOL ext. pilote'**

In [None]:
# New file, so the source variables must be overwritten !!
# Every table gets its OWN empty DataFrame: when they were all bound to the same
# object, an in-place change to one of them showed up in all the others.
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol = (pd.DataFrame() for _ in range(5))
source_ukw, source_an, source_litho, source_bh = (pd.DataFrame() for _ in range(4))

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Result_Sol/'
sheet='Result_sol_ExtP'

In [None]:
# Load the 'Résult SOL ext. pilote' sheet, skipping its first five rows.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_SOL.xlsx', 
                   sheet_name='Résult SOL ext. pilote', skiprows=5)
# Project helpers; presumably they drop empty rows / columns — see utils.io for
# the meaning of the numeric argument.
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Remove the '<' and '>' qualifiers from text cells.
df.replace(r'<|>','', inplace=True, regex=True)
# Text cells matching r'-$' (i.e. ending with a hyphen) are set to NaN.
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Rows up to label 22 describe the samples, the rest holds the analyses.
# Independent copies: 'an' gets a row added in place afterwards, which must not
# be attempted on a slice still tied to df.
prv_sol=df.loc[:22].copy()
an=df.loc[23:].copy()

In [None]:
# Re-insert row 0 of df ahead of the analysis rows: it is added under the
# fractional label 0.5, which sorts before the labels selected from 23 onwards,
# then the index is renumbered from 0.
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [None]:
prv_sol=dble_col_drop(prv_sol)

In [None]:
prv_sol.drop(list(range(3)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,1)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)

In [None]:
dataframe_viewer(prv_sol, rows=3)

In [None]:
# Drop the last row and the 'broyage' column. drop() returns a new table here,
# instead of modifying in place the slice produced by the row selection.
prv_sol=prv_sol.iloc[:-1].drop(columns=['broyage'])

In [None]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_prv','Long_for','Refus','Description','MO','COT','pH_KCl', 
      'Temp_pH_mes','pH_H20','Fract_2','Fract_2+', 'Fract_min_2µ','Fract_min_50µ','Fract_min_2']
prv_sol=col_ren(prv_sol, name=name, mode=1)

In [None]:
set(prv_sol.Description)

In [None]:
def _soil_label(code):
    """Expand the 'R' / 'TN' codes; every other value is returned unchanged."""
    if code in ['R','R ']:
        return 'Remblais'
    if code in ['TN','TN ']:
        return 'Terrain naturel'
    return code


prv_sol['Description'] = prv_sol['Description'].map(_soil_label)

# 'Refus' becomes a flag: 'x' when a value is present, empty string otherwise.
prv_sol['Refus'] = prv_sol['Refus'].apply(lambda x: '' if pd.isnull(x) else 'x')
prv_sol.insert(1, 'Type_ech', 'Sol')

In [None]:
dataframe_viewer(prv_sol, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
name=['ID_ech','METAUX LOURDS','Arsenic','Cadmium','Chrome','Chrome VI','Cuivre','Mercure','Plomb','Nickel',
'Zinc','CYANURES','cyanure (libre)','cyanure (totaux)','cyanure (APE)','cyanure complex','thiocyanate',
'COMPOSES AROMATIQUES VOLATILS','Benzène','Toluène','Éthylbenzène','Orthoxylène','Para- et métaxylène','Xylènes',
'Styrène','BTEX totaux','PHENOLS','Phénol','HYDROCARBURES AROMATIQUES POLYCYCLIQUES','Naphtalène','Acénaphtylène',
'Acénaphtène','Fluorène','Phénanthrène','Anthracène','Fluoranthène','Pyrène','Benzo(a)anthracène','Chrysène',
'Benzo(b)fluoranthène','Benzo(k)fluoranthène','Benzo(a)pyrène','Dibenzo(ah)anthracène','Benzo(ghi)pérylène',
'Indéno(1,2,3-cd)pyrène','HAP Totaux (16) - EPA','COMPOSES ORGANOHALOGENES VOLATILS','Tétrachloroéthylène',
'Trichloroéthylène','1,1-dichloroéthène','Cis-1,2-dichloroéthène','Trans 1,2-dichloroéthylène',
'Totaux (cis,trans) 1,2-dichloroéthènes','Chlorure de vinyle','1,1,1-Trichloroéthane','1,1,2-Trichloroéthane',
'1,1-Dichloroéthane','1,2-Dichloroéthane','Tétrachlorométhane','Chloroforme','Dichlorométhane',
'1,2-dichloropropane','HYDROCARBURES TOTAUX','fraction aromat. >C6-C7','fraction aromat. >C7-C8',
'fraction aromat. >C8-C10','fraction aliphat. C5-C6','fraction aliphat. >C6-C8','fraction aliphat. >C8-C10',
'Fraction C5 - C8','Fraction C8 - C10','Fraction C10-C12','Fraction C12-C16','Fraction C16 - C21',
'Fraction C21 - C35','Fraction C35 - C40','Hydrocarbures totaux C10-C35','Hydrocarbures totaux C10-C40',
'Teneur mesurée','Teneur mesurée','VS : Valeur seuil']

an=col_ren(an, name=name, mode=1)

In [None]:
an=dble_col_drop(an)

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')

In [None]:
an=col_ren(an, name=pol_field_model, mode=1)

In [None]:
an.rename(columns={'cyanure (totaux)':'CN_tot', 'cyanure (APE)':'CN_EPA'}, inplace=True)

In [None]:
dataframe_viewer(an, rows=5) 

In [None]:
dataframe_viewer(prv_sol, rows=5)

In [None]:
source_prv_sol=prv_sol
source_an=an

In [None]:
# Make sure both output folders exist. The former check only created
# 'source_merge/' when tmp_dir itself was missing, so an existing tmp_dir
# without that sub-folder made the source_* exports below fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet exports: this sheet produces soil samples and their analyses.
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)

# Cumulative tables for the current source file.
source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)

# Row count of every cumulative table.
print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'SOL T1 pilote'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Result_Sol/'
sheet='SOL_T1_Pilote'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_SOL.xlsx', 
                   sheet_name='SOL T1 pilote', skiprows=5)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Rows up to label 35 describe the samples, the rest holds the analyses.
# Independent copies: 'an' gets a row added in place afterwards, which must not
# be attempted on a slice still tied to df.
prv_sol=df.loc[:35].copy()
an=df.loc[36:].copy()

In [None]:
# Re-insert row 0 of df ahead of the analysis rows: it is added under the
# fractional label 0.5, which sorts before the labels selected from 36 onwards,
# then the index is renumbered from 0.
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [None]:
prv_sol=dble_col_drop(prv_sol)

In [None]:
prv_sol.drop(list(range(3)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,1)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)

In [None]:
# Drop the last row and the 'broyage' column. drop() returns a new table here,
# instead of modifying in place the slice produced by the row selection.
prv_sol=prv_sol.iloc[:-1].drop(columns=['broyage'])

In [None]:
prv_sol.columns

In [None]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_prv','Long_for','Refus','Nature_ech','Fract_2','Fract_2+']
prv_sol=col_ren(prv_sol, name=name, mode=1)

In [None]:
def _soil_label(code):
    """Expand the 'R' / 'TN' codes; every other value is returned unchanged."""
    if code in ['R','R ']:
        return 'Remblais'
    if code in ['TN','TN ']:
        return 'Terrain naturel'
    return code


prv_sol['Nature_ech'] = prv_sol['Nature_ech'].map(_soil_label)

# 'Refus' becomes a flag: 'x' when a value is present, empty string otherwise.
prv_sol['Refus'] = prv_sol['Refus'].apply(lambda x: '' if pd.isnull(x) else 'x')
prv_sol.insert(1, 'Type_ech', 'Sol')

In [None]:
dataframe_viewer(prv_sol, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
name=['ID_ech','METAUX LOURDS','Arsenic','Cadmium','Chrome','Chrome VI','Cobalt','Cuivre','Mercure','Plomb', 
'Nickel','Zinc','CYANURES','cyanure (libre)','cyanure (totaux)','cyanure (APE)','cyanure complex','thiocyanate',
'COMPOSES AROMATIQUES VOLATILS','Benzène','Toluène','Éthylbenzène', 'Orthoxylène','Para- et métaxylène','Xylènes',
'Styrène','BTEX totaux','PHENOLS','Phénol','Indice phénol','HYDROCARBURES AROMATIQUES POLYCYCLIQUES','Naphtalène',
'Acénaphtylène','Acénaphtène', 'Fluorène','Phénanthrène','Anthracène','Fluoranthène','Pyrène','Benzo(a)anthracène',
'Chrysène','Benzo(b)fluoranthène','Benzo(k)fluoranthène','Benzo(a)pyrène','Dibenzo(ah)anthracène',
'Benzo(ghi)pérylène','Indéno(1,2,3-cd)pyrène','HAP Totaux (16) - EPA','COMPOSES ORGANOHALOGENES VOLATILS',
'Tétrachloroéthylène','Trichloroéthylène','1,1-dichloroéthène','Cis-1,2-dichloroéthène',
'Trans 1,2-dichloroéthylène','Totaux (cis,trans) 1,2-dichloroéthènes','Chlorure de vinyle',
'1,1,1-Trichloroéthane','1,1,2-Trichloroéthane','1,1-Dichloroéthane','1,2-Dichloroéthane','Tétrachlorométhane',
'Chloroforme','Dichlorométhane','1,2-dichloropropane','EOX','HYDROCARBURES TOTAUX',
'fraction aromat. >C6-C7','fraction aromat. >C7-C8','fraction aromat. >C8-C10','fraction aliphat. C5-C6',
'fraction aliphat. >C6-C8','fraction aliphat. >C8-C10','Fraction C5 - C8','Fraction C8 - C10','Fraction C10-C12',
'Fraction C12-C16','Fraction C16 - C21','Fraction C21 - C35','Fraction C35 - C40','Hydrocarbures totaux C10-C35',
'Hydrocarbures totaux C10-C40','METHYL-TERT-BUTYL-ETHER','MTBE']

an=an.iloc[:,:-17]
an=col_ren(an, name=name, mode=1)

In [None]:
an=dble_col_drop(an)

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,3)
an.insert(1,'Type_ech','Sol')

In [None]:
an=col_ren(an, name=pol_field_model, mode=1)

In [None]:
an.rename(columns={'cyanure (totaux)':'CN_tot', 'cyanure (APE)':'CN_EPA'}, inplace=True)

In [None]:
dataframe_viewer(an, rows=5) 

In [None]:
dataframe_viewer(prv_sol, rows=5) 

In [None]:
#source_prv_sol.info()#, prv_sol.info()

In [None]:
# Keep only these columns of the samples gathered so far, before merging the
# current sheet into them.
# NOTE(review): this table stores the expanded R/TN codes under 'Description',
# while the current sheet's prv_sol stores them under 'Nature_ech' — confirm
# whether both should share one column name before the merge.
source_prv_sol=source_prv_sol[['ID_ech', 'Type_ech', 'Date_prv','Long_for', 'Refus', 'Description', 
                               'Ech_top', 'Ech_base', 'MS', 'Fract_2', 'Fract_2+']]

In [None]:
source_prv_sol=data_merger(source_prv_sol, prv_sol, on='ID_ech', how='outer')[0]

In [None]:
source_an, conflict=data_merger(source_an,an, on='ID_ech', how='outer')

In [None]:
source_an=source_an.query('ID_ech==ID_ech')

In [None]:
# Make sure both output folders exist. The former check only created
# 'source_merge/' when tmp_dir itself was missing, so an existing tmp_dir
# without that sub-folder made the source_* exports below fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet exports: this sheet produces soil samples and their analyses.
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)

# Cumulative (merged) tables for the current source file.
source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)

# Row count of every cumulative table.
print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

### $\color{red}{\textbf{Excel source data merging}}$

In [None]:
excel_bh_soil_an, conflict_df = data_merger(source_prv_sol, source_an, how='outer', on='ID_ech', drop_skip_col=['index'])

In [None]:
# save_dir is a path PREFIX ('.../source_merge/final_'), not a folder: files are
# written as '<prefix><name>.csv'. Only its parent folder has to exist; the
# former os.makedirs(save_dir) created a stray, empty 'final_' directory.
save_dir = tmp_dir + 'source_merge/final_'
os.makedirs(os.path.dirname(save_dir), exist_ok=True)

# Only the merged soil analyses are exported for this file.
excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)

## 14-Logs_forages_vUmons_2018-03-20.xlsx
* **Sheet : 'Analyse_eau_Phases1&2'**

In [None]:
# New file, so the source variables must be overwritten !!
# Every table gets its OWN empty DataFrame: when they were all bound to the same
# object, an in-place change to one of them showed up in all the others.
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol = (pd.DataFrame() for _ in range(5))
source_ukw, source_an, source_litho, source_bh = (pd.DataFrame() for _ in range(4))

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/vUmons_logsFor/'
sheet='Analyse_eau_Phases1&2'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Analyse_eau_Phases1&2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.drop(list(range(4)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# 9999 is used as a "no value" marker in this sheet.
# Numeric cells: plain value replacement (a number is not a regex).
df.replace(9999, np.nan, inplace=True)
# Text cells: only cells that ARE the marker ('9999', '9999.', '9999.0', ...).
# The former pattern '[9999|9999].' was a character class: it blanked every text
# cell containing a '9' (or '|') followed by any character, e.g. 'P19/1'.
df.replace(r'^\s*9999(?:\.0*)?\s*$', np.nan, inplace=True, regex=True)

In [None]:
# CE is divided by 1000 when the value starts with a digit; missing values and
# anything else become NaN (presumably a unit change — confirm).
# Raw string: '\d' in a normal string literal is an invalid escape sequence.
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
# Headers containing the '9999' marker get '-' in its place first.
df=col_ren(df,mode=1,name=[re.sub('9999','-',x) for x in df.columns])
# presumably maps the headers onto the short names of pol_field_model — verify in utils.config
df=col_ren(df,mode=1, name=pol_field_model)

In [None]:
name=['ID', 'ID_ech', 'Date_prv', 'X', 'Y', 'Z', 'Long_for','Long_pz_sol', 'Niv_eau_sol', 'pH', 'CE', 'T', 
      'As', 'Cd', 'Cr', 'Cr_VI', 'Cu', 'Hg','Pb', 'Ni', 'Zn', 'CN_libre', 'CN_tot', 'CN_APE', 'CN_comp',
      'thioCN', 'Bnz_vn', 'Bnz', 'Toln_vn', 'Toln', 'EthylBnz','O-Xyl', 'P-M-Xyl', 'Xyl_vn', 'Xyl', 'Styr', 
      'Phenol','Naphta_vn', 'Naphta', 'Acenaphtyl', 'Acenaphtn', 'Fluorene',
       'Phenanthr', 'Anthrc', 'Flranth', 'Pyr', 'Bnz(a)anthrc', 'Chrys',
       'Bnz(b)flranth', 'Bnz(k)flranth', 'Bnz(a)pyr', 'Dibnz(ah)anthrc',
       'Bnz(ghi)peryl', 'Indeno(1,2,3-cd)pyr', 'HAP_tot_EPA',
       '1,1-DCE', '1,2-DCE', '1,1-DCEn', 'Cis-1,2-DCEn',
       '(cis,trans) 1,2-DCE_tot', 'Trans 1,2-DCEyl', 'DCM', '1,2-DCP',
       'TetraCEyn', 'TCM', '1,1,1-TCE', '1,1,2-TCE', 'TCEyn', 'Chloroforme',
       'CVinyl', 'Arom_C6C7', 'Arom_C7C8', 'Arom_C8C10', 'Aliphat_C5C6',
       'Aliphat_C6C8', 'Aliphat_C8C10', 'Fract_C5C8', 'Fract_C8C10',
       'Fract_C10C12', 'Fract_C12C16', 'Fract_C16C21', 'Fract_C21C35',
       'HC_tot_C10C35', 'MTBE', 'Chlorure']
df=col_ren(df, mode=1,name=name)

In [None]:
# Shorten 'Canne ' to 'Can' and turn line breaks into spaces in the sample IDs.
# The result is assigned back: an in-place replace on the selected column is
# not guaranteed to reach df.
df['ID_ech'] = df['ID_ech'].replace('Canne ', 'Can', regex=True)
df['ID_ech'] = df['ID_ech'].replace('\n', ' ', regex=True)
df.insert(1,'Type_ech','Eau')

In [None]:
# Remove the rows labelled 20 and 39 (hard-coded for this sheet; the reason is
# not visible here — presumably non-data rows, confirm), then renumber from 0.
df.drop([20,39], axis=0,inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# From row 38 onwards Date_prv holds day counts: the expression evaluates to
# 1899-12-30 plus x days (presumably Excel serial dates — confirm).
# x has to be an int: datetime.fromordinal rejects floats and NaN.
df.loc[38:,'Date_prv']=df.loc[38:,'Date_prv'].apply(lambda x : dtm.datetime.fromordinal(dtm.datetime(1900, 1, 1).toordinal() + x - 2))

In [None]:
# Samples without their own identifier inherit the ID of the same row.
missing = df['ID_ech'].isnull()
df.loc[missing, 'ID_ech'] = df.loc[missing, 'ID']

In [None]:
# The three tables are independent copies of column subsets of df: they are
# modified afterwards (new column, in-place de-duplication), which must not be
# done on selections still tied to df.

# Piezometer description.
pz=df[['ID', 'X', 'Y', 'Z', 'Long_for','Long_pz_sol']].copy()
pz['Type'] = 'Piezo'

# Water samples with their field measurements.
prv_eau=df[['ID','ID_ech','Type_ech','Date_prv', 'X', 'Y', 'Z','Niv_eau_sol', 'pH', 'CE', 'T']].copy()
# Analysis results.
an=df[['ID','ID_ech','Type_ech','Date_prv', 'X', 'Y', 'Z','As', 'Cd', 'Cr', 'Cr_VI', 'Cu', 'Hg','Pb', 'Ni', 'Zn', 'CN_libre', 'CN_tot', 'CN_APE', 
       'CN_comp','thioCN', 'Bnz_vn', 'Bnz', 'Toln_vn', 'Toln', 'EthylBnz','O-Xyl', 'P-M-Xyl', 'Xyl_vn', 'Xyl',
       'Styr', 'Phenol','Naphta_vn', 'Naphta', 'Acenaphtyl', 'Acenaphtn', 'Fluorene',
       'Phenanthr', 'Anthrc', 'Flranth', 'Pyr', 'Bnz(a)anthrc', 'Chrys',
       'Bnz(b)flranth', 'Bnz(k)flranth', 'Bnz(a)pyr', 'Dibnz(ah)anthrc',
       'Bnz(ghi)peryl', 'Indeno(1,2,3-cd)pyr', 'HAP_tot_EPA',
       '1,1-DCE', '1,2-DCE', '1,1-DCEn', 'Cis-1,2-DCEn',
       '(cis,trans) 1,2-DCE_tot', 'Trans 1,2-DCEyl', 'DCM', '1,2-DCP',
       'TetraCEyn', 'TCM', '1,1,1-TCE', '1,1,2-TCE', 'TCEyn', 'Chloroforme',
       'CVinyl', 'Arom_C6C7', 'Arom_C7C8', 'Arom_C8C10', 'Aliphat_C5C6',
       'Aliphat_C6C8', 'Aliphat_C8C10', 'Fract_C5C8', 'Fract_C8C10',
       'Fract_C10C12', 'Fract_C12C16', 'Fract_C16C21', 'Fract_C21C35',
       'HC_tot_C10C35', 'MTBE', 'Chlorure']].copy()

In [None]:
pz.drop_duplicates(subset=['ID'], keep='first', ignore_index=True, inplace=True)

In [None]:
dataframe_viewer(an, rows=5)

In [None]:
source_an=an
source_pz=pz
source_prv_eau=prv_eau

In [None]:
# NOTE(review): 'excel_water_an' is not assigned in the visible part of this
# notebook — confirm it exists at this point (or whether another table was meant).
# The ID is the leading "word characters + digit (+ one optional character)" run
# of the sample identifier.
_id_pattern = re.compile(r"(\w+\d+(?:\w)?)")
for i in excel_water_an.index:
    c = excel_water_an.loc[i, 'ID_ech']
    # Non-text identifiers (e.g. NaN) and texts without a match are skipped
    # instead of raising on the missing match object.
    found = _id_pattern.search(c) if isinstance(c, str) else None
    if found:
        excel_water_an.loc[i, 'ID'] = found.group(1)

In [None]:
# save_dir is a path PREFIX ('.../source_merge/final_'), not a folder: files are
# written as '<prefix><name>.csv'. Only its parent folder has to exist; the
# former os.makedirs(save_dir) created a stray, empty 'final_' directory.
save_dir = tmp_dir + 'source_merge/final_'
os.makedirs(os.path.dirname(save_dir), exist_ok=True)

# The water analyses of this sheet are exported directly as the final table.
an.to_csv(save_dir+'Water_analysis.csv', index=False)

In [None]:
# Make sure both output folders exist. The former check only created
# 'source_merge/' when tmp_dir itself was missing, so an existing tmp_dir
# without that sub-folder made the source_* exports below fail.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Per-sheet exports: piezometers, water samples and analyses.
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)

# Cumulative tables for the current source file.
source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)

# Row count of every cumulative table.
print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Analyse_sol_Phases1&2'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/vUmons_logsFor/'
sheet='Analyse_sol_Phases1&2'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Analyse_sol_Phases1&2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df=col_ren(df, mode=1, name=pol_field_model)

In [None]:
name=['ID_ech','Date_prv','ID','X','Y','Z','Nature_ech','Organo','Long_for','Refus','Ech_top','Ech_base',
      'MS','Broyage < 150 µm','Broyage ','Fract_2','Fract_2+','As','Cd','Cr','Cr_VI','Cu',
       'Hg','Pb','Ni','Zn','CN_libre','CN_tot','CN_APE',
       'CN_comp','thioCN','Bnz','Toln','EthylBnz','O-Xyl','P-M-Xyl',
       'Xyl','Styr','Phenol','Naphta','Acenaphtyl','Acenaphtn',
       'Fluorene','Phenanthr','Anthrc','Flranth','Pyr','Bnz(a)anthrc',
       'Chrys','Bnz(b)flranth','Bnz(k)flranth','Bnz(a)pyr',
       'Dibnz(ah)anthrc','Bnz(ghi)peryl','Indeno(1,2,3-cd)pyr',
       'HAP_tot_EPA','1,1-DCE','1,2-DCE','1,1-DCEn',
       'Cis-1,2-DCEn','Trans 1,2-DCEyl','DCM',
       '(cis,trans) 1,2-DCE_tot','1,2-DCP','TetraCEyn','TCM',
       '1,1,1-TCE','1,1,2-TCE','TCEyn','Chloroforme','CVinyl','Arom_C6C7',
       'Arom_C7C8','Arom_C8C10','Aliphat_C5C6','Aliphat_C6C8',
       'Aliphat_C8C10','Fract_C5C8','Fract_C8C10','Fract_C10C12',
       'Fract_C12C16','Fract_C16C21','Fract_C21C35','HC_tot_C10C35']
df=col_ren(df, mode=1, name=name)

In [None]:
df.drop(list(range(4)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# 9999 is used as a "no value" marker in this sheet.
# Numeric cells: plain value replacement (a number is not a regex).
df.replace(9999, np.nan, inplace=True)
# Text cells: only cells that ARE the marker ('9999', '9999.', '9999.0', ...).
# The former pattern '[9999|9999].' was a character class: it blanked every text
# cell containing a '9' (or '|') followed by any character, e.g. 'P19/1'.
df.replace(r'^\s*9999(?:\.0*)?\s*$', np.nan, inplace=True, regex=True)

In [None]:
def _soil_label(code):
    """Expand the soil-type codes; every other value is returned unchanged."""
    if code in ['R','R ']:
        return 'Remblais'
    if code in ['L']:
        return 'Limons'
    if code in ['LA']:
        return 'Limons et argiles'
    if code in ['LS']:
        return 'Limons et sables'
    return code


df['Nature_ech'] = df['Nature_ech'].map(_soil_label)

# 'Refus' becomes a flag: 'x' when a value is present, empty string otherwise.
df['Refus'] = df['Refus'].apply(lambda x: '' if pd.isnull(x) else 'x')
df.insert(1, 'Type_ech', 'Sol')

In [None]:
# Remove the row labelled 14 (hard-coded for this sheet; the reason is not
# visible here — confirm) and the two grinding columns, then renumber from 0.
df.drop(14, axis=0, inplace=True)
df.drop(['Broyage < 150 µm', 'Broyage '], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
# Manual correction of three sample identifiers, addressed by row label
# (valid only for the row numbering in effect at this point of the notebook).
# NOTE(review): why these IDs need to be set by hand is not visible here — confirm.
df.loc[8, 'ID_ech']='F4/2M'
df.loc[31, 'ID_ech']='P19/1'
df.loc[32, 'ID_ech']='P19/2'

In [None]:
# The three tables are independent copies of column subsets of df: they are
# modified afterwards (new column, rewritten IDs), which must not be done on
# selections still tied to df.

# Piezometer / borehole description.
pz=df[['ID', 'X', 'Y', 'Z', 'Long_for','Refus']].copy()
pz['Type'] = 'Piezo'

# Soil samples.
prv_sol=df[['ID_ech', 'Type_ech', 'Date_prv', 'X', 'Y', 'Z', 'Nature_ech','Organo', 
            'Ech_top', 'Ech_base', 'MS', 'Fract_2','Fract_2+']].copy()
# Analysis results.
an=df[['ID','ID_ech', 'Date_prv', 'X', 'Y', 'Z', 'Type_ech','As', 'Cd', 'Cr', 'Cr_VI', 'Cu', 'Hg', 'Pb', 'Ni', 'Zn',
       'CN_libre', 'CN_tot', 'CN_APE', 'CN_comp', 'thioCN', 'Bnz', 'Toln',
       'EthylBnz', 'O-Xyl', 'P-M-Xyl', 'Xyl', 'Styr', 'Phenol', 'Naphta',
       'Acenaphtyl', 'Acenaphtn', 'Fluorene', 'Phenanthr', 'Anthrc', 'Flranth',
       'Pyr', 'Bnz(a)anthrc', 'Chrys', 'Bnz(b)flranth', 'Bnz(k)flranth',
       'Bnz(a)pyr', 'Dibnz(ah)anthrc', 'Bnz(ghi)peryl', 'Indeno(1,2,3-cd)pyr',
       'HAP_tot_EPA', '1,1-DCE', '1,2-DCE', '1,1-DCEn', 'Cis-1,2-DCEn',
       'Trans 1,2-DCEyl', 'DCM', '(cis,trans) 1,2-DCE_tot', '1,2-DCP',
       'TetraCEyn', 'TCM', '1,1,1-TCE', '1,1,2-TCE', 'TCEyn', 'Chloroforme',
       'CVinyl', 'Arom_C6C7', 'Arom_C7C8', 'Arom_C8C10', 'Aliphat_C5C6',
       'Aliphat_C6C8', 'Aliphat_C8C10', 'Fract_C5C8', 'Fract_C8C10',
       'Fract_C10C12', 'Fract_C12C16', 'Fract_C16C21', 'Fract_C21C35',
       'HC_tot_C10C35']].copy()

In [None]:
# IDs starting with 'P' get 'F' as their first letter instead.
# assign() builds a new table rather than writing into a selection of df, and
# non-text IDs (e.g. NaN) are passed through instead of making re.sub raise.
an = an.assign(ID=an['ID'].map(lambda x: re.sub(r'^P', 'F', x) if isinstance(x, str) else x))

In [None]:
dataframe_viewer(an, rows=5)

In [None]:
# save_dir is a path PREFIX ('.../source_merge/final_'), not a folder: files are
# written as '<prefix><name>.csv'. Only its parent folder has to exist; the
# former os.makedirs(save_dir) created a stray, empty 'final_' directory.
save_dir = tmp_dir + 'source_merge/final_'
os.makedirs(os.path.dirname(save_dir), exist_ok=True)

# The soil analyses of this sheet are exported directly as the final table.
an.to_csv(save_dir+'Soil_analysis.csv', index=False)

In [None]:
# Stack this sheet's analyses under the ones already collected.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
source_an = pd.concat([source_an, an], ignore_index=True)
source_pz=pz
source_prv_sol=prv_sol

In [None]:
# Ensure the output folder and its 'source_merge/' subfolder both exist.
# Creating the deepest path with exist_ok=True also covers the case where
# tmp_dir is already there but 'source_merge/' is not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Synthèse'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/vUmons_logsFor/'
sheet='Synthese'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Synthèse', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Keep only the first 29 rows; .copy() so the in-place edits below act on an
# independent frame rather than on a slice of the frame read from Excel.
df = df[:29].copy()
# Strip literal asterisks (raw string: '\*' is a regex escape, not a Python one).
df.replace(r'\*', '', inplace=True, regex=True)
# Any non-null 'Refus' value becomes the flag 'x', a null becomes ''.
df['Refus'] = df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')

In [None]:
name=['ID','X','Y','Z', 'Refus','Long_for', 'RB', 'ALL', 'S_A', 'S_S', 
      'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top']
df=col_ren(df, mode=1, name=name)

In [None]:
# Expand each row of df into one row per layer that is flagged (non-null) in the
# 'RB', 'ALL', 'S_A' and 'S_S' columns. The first layer is written on row i itself;
# the others are added under the fractional labels i+.2, i+.5 and i+.7 so that the
# final sort_index() places them right after row i, in that order.
# Columns copied from row i onto every added row:
cols=['ID','X','Y','Z', 'Refus','Long_for']

# range(len(df)) is evaluated once, before any row is added, so only the rows
# present at the start are visited.
# NOTE(review): df.loc[i, ...] assumes the index labels are 0..n-1 at this point — confirm.
for i in range(len(df)):
    # 'RB' flagged: interval starts at 0 and ends at 'Rb_base', or at 'Long_for' if 'Rb_base' is missing
    if not pd.isnull(df.loc[i, 'RB']): 
        df.loc[i, 'Nappe']='Remblais'
        df.loc[i, 'Litho_top']=0
        if not pd.isnull(df.loc[i, 'Rb_base']):
            df.loc[i, 'Litho_base']=df.loc[i, 'Rb_base']
        else:
            df.loc[i, 'Litho_base']=df.loc[i, 'Long_for']
    
    # 'ALL' flagged: interval from 'All_top' down to 'Soc_alt_top' when 'S_A' is flagged, else down to 'Long_for'
    if not pd.isnull(df.loc[i, 'ALL']):
        df.loc[i+.2,cols]=df.loc[i,cols]
        df.loc[i+.2, 'Nappe']='Alluvions'
        df.loc[i+.2, 'Litho_top']=df.loc[i, 'All_top']
        if not pd.isnull(df.loc[i, 'S_A']):
            df.loc[i+.2, 'Litho_base']=df.loc[i, 'Soc_alt_top']
        else:
            df.loc[i+.2, 'Litho_base']=df.loc[i, 'Long_for']
    
    # 'S_A' flagged: interval from 'Soc_alt_top' down to 'Soc_sn_top' when 'S_S' is flagged, else down to 'Long_for'
    if not pd.isnull(df.loc[i, 'S_A']):
        df.loc[i+.5,cols]=df.loc[i,cols]
        df.loc[i+.5, 'Nappe']='Socle altéré'
        df.loc[i+.5, 'Litho_top']=df.loc[i, 'Soc_alt_top']
        if not pd.isnull(df.loc[i, 'S_S']):
            df.loc[i+.5, 'Litho_base']=df.loc[i, 'Soc_sn_top']
        else:
            df.loc[i+.5, 'Litho_base']=df.loc[i, 'Long_for']
            
    # 'S_S' flagged: interval from 'Soc_sn_top' down to 'Long_for'
    if not pd.isnull(df.loc[i, 'S_S']):
        df.loc[i+.7,cols]=df.loc[i,cols]
        df.loc[i+.7, 'Nappe']='Socle sain'
        df.loc[i+.7, 'Litho_top']=df.loc[i, 'Soc_sn_top']
        df.loc[i+.7, 'Litho_base']=df.loc[i, 'Long_for']

# The per-layer flag/depth columns are no longer needed once the rows are expanded.
df.drop(columns=['RB', 'ALL', 'S_A', 'S_S', 'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top'], inplace=True)
# Sorting on the (now fractional) labels interleaves the added rows, then the index is renumbered.
df.sort_index(inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
dataframe_viewer(df, rows=5, cols=15)

In [None]:
df.columns

In [None]:
# Borehole table for this sheet. .copy() detaches the selection from df, so adding
# 'Type' writes to an independent frame (no SettingWithCopyWarning).
bh = df[['ID','X','Y','Z','Long_for','Refus']].copy()
bh['Type'] = 'Forage'

# Lithology intervals built in the previous cell; first sheet of this file, so the
# accumulator simply starts from this table.
litho = df[['ID','X','Y','Z','Litho_top','Litho_base','Nappe']]
source_litho = litho

In [None]:
# Ensure the output folder and its 'source_merge/' subfolder both exist.
# Creating the deepest path with exist_ok=True also covers the case where
# tmp_dir is already there but 'source_merge/' is not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Sond2017v2'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/vUmons_logsFor/'
sheet='Sond2017v2'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Sond2017v2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Strip literal asterisks (raw string: '\*' is a regex escape, not a Python one).
df.replace(r'\*', '', inplace=True, regex=True)
# 'Refus' equal to 1 becomes the flag 'x', anything else becomes ''.
df['Refus'] = df['Refus'].apply(lambda x: 'x' if x == 1 else '')

In [None]:
name=['R_ID','ID','X','Y','Z','Refus','Date_for','Long_for','Z_fond','RB','ALL', 'S_A', 'S_S', 
      'Rb_base','cote_rb','All_top', 'Soc_alt_top','Soc_sn_top']
df=col_ren(df, mode=1, name=name)
df=df[['ID','X','Y','Z','Refus','Date_for','Long_for','Z_fond','RB','ALL', 'S_A', 'S_S', 
      'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top']]

In [None]:
# Expand each row of df into one row per layer whose flag ('RB', 'ALL', 'S_A', 'S_S')
# equals 1. The 'RB' layer is written on row i itself; the others are added under the
# fractional labels i+.2, i+.5 and i+.7 so that the final sort_index() places them
# right after row i, in that order.
# Columns copied from row i onto every added row:
cols=['ID','Date_for','X','Y','Z','Z_fond','Refus','Long_for']

# range(len(df)) is evaluated once, before any row is added, so only the rows
# present at the start are visited.
# NOTE(review): df.loc[i, ...] assumes the index labels are 0..n-1 at this point — confirm.
for i in range(len(df)):    
    # 'RB' == 1: interval starts at 0 and ends at 'Rb_base', or at 'Long_for' if 'Rb_base' is missing
    if df.loc[i, 'RB']==1: 
        df.loc[i, 'Nappe']='Remblais'
        df.loc[i, 'Litho_top']=0
        
        if not pd.isnull(df.loc[i, 'Rb_base']): df.loc[i, 'Litho_base']=df.loc[i, 'Rb_base']
        else: df.loc[i, 'Litho_base']=df.loc[i, 'Long_for']
    
    # Fallback top depth for the layers below when their own top is missing.
    # NOTE(review): 'Litho_base' is only created by the 'RB' branch above; if no row
    # had RB == 1 yet, this lookup presumably raises KeyError — confirm against the data.
    val_def=df.loc[i, 'Litho_base'] # temporary value of litho_base if nan
    
    # 'ALL' == 1: top is 'All_top' (or val_def), base is 'Soc_alt_top' when 'S_A' == 1, else 'Long_for'
    if df.loc[i, 'ALL']==1:
        df.loc[i+.2,cols]=df.loc[i,cols]
        df.loc[i+.2, 'Nappe']='Alluvions'
        
        if not pd.isnull(df.loc[i, 'All_top']): df.loc[i+.2, 'Litho_top']=df.loc[i, 'All_top']
        else: df.loc[i+.2, 'Litho_top']=val_def #df.loc[i, 'litho_base']
            
        if df.loc[i, 'S_A']==1: df.loc[i+.2, 'Litho_base']=df.loc[i, 'Soc_alt_top']
        else: df.loc[i+.2, 'Litho_base']=df.loc[i, 'Long_for']
    
    # 'S_A' == 1: top is 'Soc_alt_top' (or val_def), base is 'Soc_sn_top' when 'S_S' == 1, else 'Long_for'
    if df.loc[i, 'S_A']==1:
        df.loc[i+.5,cols]=df.loc[i,cols]
        df.loc[i+.5, 'Nappe']='Socle altéré'
        
        if not pd.isnull(df.loc[i, 'Soc_alt_top']): df.loc[i+.5, 'Litho_top']=df.loc[i, 'Soc_alt_top']
        else: df.loc[i+.5, 'Litho_top']=val_def #df.loc[i+.2, 'litho_base']
        
        if df.loc[i, 'S_S']==1: df.loc[i+.5, 'Litho_base']=df.loc[i, 'Soc_sn_top']
        else: df.loc[i+.5, 'Litho_base']=df.loc[i, 'Long_for']
            
    # 'S_S' == 1: interval from 'Soc_sn_top' down to 'Long_for'
    if df.loc[i, 'S_S']==1:
        df.loc[i+.7,cols]=df.loc[i,cols]
        df.loc[i+.7, 'Nappe']='Socle sain'
        df.loc[i+.7, 'Litho_top']=df.loc[i, 'Soc_sn_top']
        df.loc[i+.7, 'Litho_base']=df.loc[i, 'Long_for']

# The per-layer flag/depth columns are no longer needed once the rows are expanded.
df.drop(columns=['RB', 'ALL', 'S_A', 'S_S','Rb_base','All_top', 'Soc_alt_top','Soc_sn_top'], inplace=True)
# Sorting on the (now fractional) labels interleaves the added rows, then the index is renumbered.
df.sort_index(inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df=df[:-1]

In [None]:
dataframe_viewer(df, rows=5, cols=15)

In [None]:
# Borehole table for this sheet. .copy() detaches the selection from df, so adding
# 'Type' writes to an independent frame (no SettingWithCopyWarning).
bh = df[['ID','X','Y','Z','Z_fond','Date_for','Long_for','Refus']].copy()
bh['Type'] = 'Forage'

litho = df[['ID','X','Y','Z','Litho_top','Litho_base','Nappe']]
# Accumulate the lithology table only: merging the whole df would also pull the
# borehole-level columns ('Z_fond', 'Date_for', 'Long_for', 'Refus') into source_litho.
source_litho = source_litho.merge(litho, 'outer')

In [None]:
# Ensure the output folder and its 'source_merge/' subfolder both exist.
# Creating the deepest path with exist_ok=True also covers the case where
# tmp_dir is already there but 'source_merge/' is not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

# Processing for new data added - April 2021

## 15-Profils de sol et données de terrain 2019.xlsx
* **Sheet : 'Log'**

In [None]:
# New input file: reset every source_* accumulator so nothing from the previous file leaks in.
# Each name gets its OWN empty DataFrame; binding them all to one object would make an
# in-place change to one accumulator show up in all the others.
_df = pd.DataFrame()  # kept for cells that may still refer to it
(source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol,
 source_ukw, source_an, source_litho, source_bh) = (pd.DataFrame() for _ in range(9))

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'
sheet='Log'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Log', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name = ['ID','Litho_top', 'Litho_base', 'Keyword', 'Description']
df = col_ren(df, name=name, mode=1, )
# Skip the first row; .copy() so the column added below is written to an
# independent frame rather than to a slice (no SettingWithCopyWarning).
df = df[1:].copy()
# Same drilling date for every row of this sheet.
df['Date_for'] = dtm.datetime(2019,12,18)

In [None]:
df.drop(index=df.query('Litho_base.isnull() or Litho_top.isnull()').index, inplace=True)

In [None]:
compute_BH_length(df)

In [None]:
df.query('Litho_base.isnull() or Litho_top.isnull()')

In [None]:
dataframe_viewer(df, rows=5, cols=15)

In [None]:
# Split the sheet by row position into its three groups of structures.
# The following cells modify each part in place (reset_index, insert), so take
# independent copies instead of slices of df.
bh = df[1:62].copy()
pza = df[65:80].copy() #piezair
pz = df[83:].copy()

In [None]:
bh.reset_index(drop=True, inplace=True)
pza.reset_index(drop=True, inplace=True)
pz.reset_index(drop=True, inplace=True)

In [None]:
bh.insert(1,'Type', 'Forage')
bh.insert(1,'Zone', 'Extension Pilote')
pza.insert(1,'Type', 'Piezair')
pza.insert(1,'Zone', 'Extension Pilote')
pz.insert(1,'Type', 'Piezo')
pz.insert(1,'Zone', 'Mini-Pilote')

In [None]:
# Stack boreholes, piezairs and piezometers (in that order) into one lithology table.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
litho = pd.concat([bh, pza, pz])
litho = litho[['ID','Type','Zone','Litho_top','Litho_base','Description','Keyword']]
litho.reset_index(inplace=True, drop=True)

In [None]:
# Structure 50 is a piezometer: override the type assigned by the positional split.
is_structure_50 = litho['ID'] == 50
litho.loc[is_structure_50, 'Type'] = 'Piezo'

In [None]:
dataframe_viewer(litho, rows=3)

In [None]:
source_litho=litho

In [None]:
# Ensure the output folder and its 'source_merge/' subfolder both exist.
# Creating the deepest path with exist_ok=True also covers the case where
# tmp_dir is already there but 'source_merge/' is not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#pza.to_csv(tmp_dir+sheet+'_Piezairs.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Echantillon'+'Organoleptique'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'
sheet='Echantillon'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Echantillon', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID','Ech_top', 'Ech_base', 'ID_ech']
df=col_ren(df, name=name, mode=1)
df.insert(1,'Type_ech','Sol')

In [None]:
df.drop(index=[43,44,55,56,66], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
ech=df.copy()

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Organoleptique', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,4)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID','Pol_top', 'Pol_base','Polluant','Intensite']
df=col_ren(df, name=name, mode=1)

In [None]:
df.drop(index=[10,11,14,15], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
mdf, conflict_df =data_merger(ech, df, on='ID', how='outer')

In [None]:
dataframe_viewer(mdf)

In [None]:
prv_sol=mdf
source_prv_sol=prv_sol

In [None]:
# Ensure the output folder and its 'source_merge/' subfolder both exist.
# Creating the deepest path with exist_ok=True also covers the case where
# tmp_dir is already there but 'source_merge/' is not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Données de forage'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'
sheet='Donnees_forage'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Données de forage', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID', 'X', 'Y', 'Z', 'Date_for', 'Long_for', 'Methode', 'Diam_for','Rmq', 'Long_pz', 'Diam_pz', 
      'Crep_long','Societe', 'Resp_chantier']
df=col_ren(df, name=name, mode=1)
df.drop(index=[16,23], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.insert(5, 'Type', '')
df.loc[:15,'Type']='Forage'
df.loc[16:21,'Type']='Piezair'
df.loc[22:,'Type']='Piezo'

In [None]:
df.loc[9,'ID']='224 bis'

In [None]:
# Derive the refusal flag and refusal type from the free-text remark column.
df['Refus'] = ''
df['Type_refus'] = ''

# (regex searched in the remark, value stored in 'Type_refus'); the first match wins.
refusal_types = (
    ('[lL]aitier', 'Laitier'),
    ('[Bb]éton', 'Béton'),
    ('[Mm]atériaux', 'Matériaux indurés'),
)

# Iterate over the real index labels rather than assuming they are 0..n-1.
for idx, remark in df['Rmq'].items():
    remark = str(remark)  # missing remarks become the string 'nan' and never match
    if not re.search('[Bb]loqué', remark):
        continue  # 'Refus' and 'Type_refus' keep their '' default

    df.loc[idx, 'Refus'] = 'x'
    for pattern, label in refusal_types:
        if re.search(pattern, remark):
            df.loc[idx, 'Type_refus'] = label
            break

df['Diam_int_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace('mm','').split('x')[1]) if not pd.isnull(x) else x)
df['Diam_ext_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace('mm','').split('x')[0]) if not pd.isnull(x) else x)
df['Diam_for'] = df['Diam_for'].apply(lambda x: pd.to_numeric(x) if not pd.isnull(x) else x)

df.insert(10, 'Diam_ext_pz', df.pop('Diam_ext_pz')) # move to a specified position
df.insert(11, 'Diam_int_pz', df.pop('Diam_int_pz'))
df.drop(columns=['Rmq', 'Diam_pz'], axis=1, inplace=True)
df.drop(df.query("ID!=ID").index, inplace=True) # delete all ID='NaN' lines
df.reset_index(drop=True, inplace=True)

gen_id_dated(df,'ID','Date_for')  

In [None]:
pz = df.query("Type=='Piezo'")
pza=df.query("Type=='Piezair'")
bh = df.query("Type=='Forage'")

pz.reset_index(inplace=True, drop=True)
pza.reset_index(inplace=True, drop=True)
bh.reset_index(inplace=True, drop=True)

In [None]:
dataframe_viewer(df, rows=3)

In [None]:
source_pz = pz
source_pza = pza
source_bh = bh

In [None]:
# Ensure the output folder and its 'source_merge/' subfolder both exist.
# Creating the deepest path with exist_ok=True also covers the case where
# tmp_dir is already there but 'source_merge/' is not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
pza.to_csv(tmp_dir+sheet+'_Piezairs.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
source_pza.to_csv(tmp_dir+'source_merge/source_Piezairs.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)}; source_pza:{len(source_pza)} ;'
      f'source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Equipement'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'
sheet='Equipement'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Equipement', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.drop(columns=['Déplacement'], inplace=True)
name=['ID','Equip_top', 'Equip_base', 'Diam_for', 'Diam_ext_pz', 'Legende']
df=col_ren(df, mode=1, name=name)

In [None]:
df.drop(index=[24,25], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
compute_BH_length(df, top_col='Equip_top', base_col='Equip_base')

In [None]:
coi = ['ID', 'Profondeur', 'Diam_for', 'Diam_ext_pz']
pz=df[coi].drop_duplicates(['ID'])
pz['Type'] = 'Piezo'

In [None]:
dataframe_viewer(df)

In [None]:
equip=df
source_eqp=equip

In [None]:
# Ensure the output folder and its 'source_merge/' subfolder both exist.
# Creating the deepest path with exist_ok=True also covers the case where
# tmp_dir is already there but 'source_merge/' is not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
equip.to_csv(tmp_dir+sheet+'_Equipment.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
source_eqp.to_csv(tmp_dir+'source_merge/source_eqpment.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Piézométrie'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'
sheet='piezometrie'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Piézométrie', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID','Niv_pz_sol', 'Type_ech', 'Date_mes']
df=col_ren(df, name=name, mode=1)

In [None]:
mes_pz=df
source_mes_pz=mes_pz

In [None]:
# Ensure the output folder and its 'source_merge/' subfolder both exist.
# Creating the deepest path with exist_ok=True also covers the case where
# tmp_dir is already there but 'source_merge/' is not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

In [None]:
dataframe_viewer(source_prv_sol, rows=5)

In [None]:
dataframe_viewer(excel_bhs, rows=5)

### $\color{red}{\textbf{Excel source data merging}}$

In [None]:
excel_bhs, conflict_df = data_merger(source_bh, source_pz, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
excel_bhs, conflict_df = data_merger(excel_bhs, source_pza, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
excel_bh_litho, conflict_df = data_merger(excel_bhs, source_litho, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
excel_bh_soil_sp, conflict_df = data_merger(excel_bhs, source_prv_sol, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
excel_bh_equip, conflict_df = data_merger(excel_bhs, source_eqp, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
data_validation(overall_data=excel_bh_equip, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Diam_ext_pz_x':list(conflict_df.index)})

In [None]:
excel_bh_mes, conflict_df = data_merger(excel_bhs, source_mes_pz, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
# 'final_' is a file-name prefix (files are written as save_dir + 'Xxx.csv'), not a folder:
# only the parent directory has to exist. exist_ok=True makes the call a no-op when it does.
save_dir = tmp_dir + 'source_merge/final_'
os.makedirs(os.path.dirname(save_dir), exist_ok=True)

excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
#excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
#excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

## 16-Résultats SOL extension pilote et piézairs.xlsx
* **Sheet : 'Résult SOL'**

In [None]:
# New input file: reset every source_* accumulator so nothing from the previous file leaks in.
# Each name gets its OWN empty DataFrame; binding them all to one object would make an
# in-place change to one accumulator show up in all the others.
_df = pd.DataFrame()  # kept for cells that may still refer to it
(source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol,
 source_ukw, source_an, source_litho, source_bh) = (pd.DataFrame() for _ in range(9))

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/result_sol_ext_pilote/'
sheet='Result_Sol'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Resultats SOL extension pilote et piezairs.xlsx', 
                   sheet_name='Résult SOL', skiprows=5)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Split the sheet by row label: rows up to 35 describe the samples, rows from 36 on
# hold the analysis results. Independent copies are taken because the next cells
# add a row to `an` and reshape both tables, which must not act on slices of df.
prv_sol = df.loc[:35].copy()
an = df.loc[36:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [None]:
prv_sol=dble_col_drop(prv_sol)

In [None]:
prv_sol.drop(list(range(3)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,3)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)

In [None]:
# Drop the last row and the 'broyage' column. drop() without inplace returns a new
# frame, so nothing is modified on the intermediate slice (no SettingWithCopyWarning).
prv_sol = prv_sol[:-1].drop(columns=['broyage'])

In [None]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_prv','Long_for','Refus','Description','MO','COT','pH_KCl', 
      'Temp_pH_mes','pH_H20','Fract_2','Fract_2+', 'Fract_min_2µ','Fract_min_50µ','Fract_min_2']
prv_sol=col_ren(prv_sol, name=name, mode=1)

In [None]:
set(prv_sol.Description)

In [None]:
# Spell out the abbreviated soil descriptions (with or without a trailing space);
# any other value is left as it is.
description_labels = {
    'R': 'Remblais',
    'R ': 'Remblais',
    'TN': 'Terrain naturel',
    'TN ': 'Terrain naturel',
}
for row in range(len(prv_sol['Description'])):
    code = prv_sol.loc[row, 'Description']
    if code in description_labels:
        prv_sol.loc[row, 'Description'] = description_labels[code]

# Any non-null 'Refus' value becomes the flag 'x', a null becomes ''.
refusal_flags = prv_sol['Refus'].apply(lambda value: '' if pd.isnull(value) else 'x')
prv_sol['Refus'] = refusal_flags
prv_sol.insert(1, 'Type_ech', 'Sol')

In [None]:
# A raw label is "<token><line break><rest>": keep the token that precedes the line
# break and prefix it with the borehole number. The pattern is compiled once outside
# the loop and written as a raw string so that \w and \d are regex escapes.
sample_label = re.compile(r'([\w|\d]+)\n.+$')
for i in range(len(prv_sol)):
    x = prv_sol.loc[i, 'ID_ech']
    if not isinstance(x, str):
        continue  # missing or numeric label: nothing to parse
    r = sample_label.search(x)
    if r:
        prv_sol.loc[i, 'ID_ech'] = '226/' + r.group(1) # Rename borehole 304 to 226 because of conflict with piezair

In [None]:
dataframe_viewer(prv_sol, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an.columns

In [None]:
# Drop the last 17 columns and give the auto-named column 'col_35' its real name.
# rename() without inplace returns a new frame, so nothing is modified on the
# intermediate column selection (no SettingWithCopyWarning).
an = an[an.columns[:-17]].rename(columns={'col_35': 'Phénanthrène'})

In [None]:
an=col_ren(an, name=pol_field_model, mode=1)

In [None]:
an=dble_col_drop(an)

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an = na_line_drop(an, 1)
an.insert(1,'Type_ech','Sol')

In [None]:
# A raw label is "<token><line break><rest>": keep the token that precedes the line
# break and prefix it with the borehole number. The pattern is compiled once outside
# the loop and written as a raw string so that \w and \d are regex escapes.
data = an  # same object as `an`: the labels are rewritten in place
sample_label = re.compile(r'([\w|\d]+)\n.+$')
for i in range(len(data)):
    x = data.loc[i, 'ID_ech']
    if not isinstance(x, str):
        continue  # missing or numeric label: nothing to parse
    r = sample_label.search(x)
    if r:
        data.loc[i, 'ID_ech'] = '226/' + r.group(1) # Rename borehole 304 to 226 because of conflict with piezair

In [None]:
dataframe_viewer(an, rows=5) 

In [None]:
source_prv_sol=prv_sol
source_an=an

In [None]:
# Ensure the output folder and its 'source_merge/' subfolder both exist.
# Creating the deepest path with exist_ok=True also covers the case where
# tmp_dir is already there but 'source_merge/' is not.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'inorganiques et composés majeur'**

In [None]:
# Output folder (note the trailing slash: later cells append file names directly).
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'
# Prefix used for the per-sheet CSV file names.
sheet='Inorg_comp_majeur'

In [None]:
# Load the 'inorganiques et composés majeur' sheet, skipping its first row.
workbook_path = (ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                 'Resultats SOL extension pilote et piezairs.xlsx')
df = pd.read_excel(workbook_path, sheet_name='inorganiques et composés majeur', skiprows=1)

# Project NA clean-up helpers: lines first, then columns.
df = na_col_drop(na_line_drop(df, 0), 1)

# Strip the '<' / '>' markers, then replace cells matching the trailing-dash
# pattern with NaN.
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Split the sheet by row label: up to 20 is the sample description part, from
# 21 onwards the analysis results. Explicit copies are taken because `an` is
# modified right afterwards (a row is added with `.loc`); writing into a bare
# slice of `df` triggers SettingWithCopyWarning and may or may not touch `df`.
prv_sol = df.loc[:20].copy()  # not really interesting here!
an = df.loc[21:].copy()

In [None]:
# Copy row 0 of `df` into `an` under the fractional label 0.5, so that sorting
# the index places it before the existing rows (whose labels start at 21).
an.loc[0.5] = df.loc[0] # put data on first line
# Sort by label to bring that row to the top, then renumber from 0.
an = an.sort_index().reset_index(drop=True)

In [None]:
# Swap rows and columns, then discard the former column labels that became
# the index.
an = an.T.reset_index(drop=True)

In [None]:
# NOTE(review): col_ren is a project helper; presumably the positional 1
# selects the row used as column names — confirm against utils.io.
an=col_ren(an, 1)

In [None]:
# Give the first column the sample-identifier name used by the other tables.
first_column = an.columns[0]
an = an.rename(columns={first_column: 'ID_ech'})

In [None]:
# Drop the last 7 columns by position. Selecting by label
# (`an[an.columns[:-7]]`) returns every column that shares a selected label,
# so duplicated headers would be multiplied; `iloc` is immune to that.
# `.copy()` keeps later in-place edits from touching a view of the old frame.
an = an.iloc[:, :-7].copy()

In [None]:
# Rename the analysis columns using pol_field_model from utils.config.
# NOTE(review): the exact matching rule of col_ren(mode=1) is not visible here — confirm.
an=col_ren(an, name=pol_field_model, mode=1)

In [None]:
# NOTE(review): dble_col_drop is a project helper; presumably it removes
# duplicated columns — confirm which duplicate is kept.
an=dble_col_drop(an)

In [None]:
# Remove the rows labelled 0, 1 and 2, renumber the remaining rows, then run
# the project NA clean-up helpers on columns and lines.
an = an.drop(index=[0, 1, 2]).reset_index(drop=True)
an = na_line_drop(na_col_drop(an, 2), 1)
# Tag every row as a soil sample, right after the first column.
an.insert(1, 'Type_ech', 'Sol')

In [None]:
# Rewrite multi-line sample IDs: keep the token located just before the line
# break and prefix it with '226/'. `data` is an alias of `an`, so the edits
# land in `an` as well.
# The pattern is now a raw string (the plain literal relied on the invalid
# escapes '\w' and '\d'), rows are visited by index label rather than by a
# positional counter (the NA helpers above may leave gaps in the index), and
# non-string cells are skipped instead of making re.search raise TypeError.
data = an
id_pattern = re.compile(r'([\w|\d]+)\n.+$')
for row_label in data.index:
    sample_id = data.loc[row_label, 'ID_ech']
    if not isinstance(sample_id, str):
        continue
    match = id_pattern.search(sample_id)
    if match:
        # Rename borehole 304 to 226 because of conflict with piezair
        data.loc[row_label, 'ID_ech'] = '226/' + match.group(1)

In [None]:
#source_prv_sol=prv_sol
# Outer-merge this sheet's analyses into the accumulated table on 'ID_ech'.
# Only the first element returned by data_merger is kept.
source_an=data_merger(source_an, an, how='outer', on='ID_ech')[0]

In [None]:
# Inspect the accumulated analysis table with the project viewer (rows=5).
dataframe_viewer(source_an, rows=5) 

In [None]:
# Make sure both output folders exist before writing. The original created
# 'source_merge/' only when `tmp_dir` itself was missing, so an existing
# `tmp_dir` without that sub-folder made the source_* export below fail.
# makedirs creates the intermediate `tmp_dir` as well.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

# Per-sheet exports (only the analysis table is produced from this sheet).
#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


# Accumulated per-source exports.
#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


# Row-count summary of every source_* table.
# NOTE(review): several of these frames are not built in the nearby cells —
# they must already exist from earlier cells of the notebook.
print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

### $\color{red}{\textbf{merging of 'Donnees_2019' and 'result_sol' data}}$

In [None]:
# Outer-merge the soil-sample table with the accumulated analyses on 'ID_ech'.
# data_merger returns the merged frame and a frame of conflicts.
# NOTE(review): the meaning of dist_max=1. and drop_skip_col is defined in utils.io — confirm.
excel_bh_soil_an, conflict_df = data_merger(excel_bh_soil_sp, source_an, how='outer', on='ID_ech', dist_max=1., drop_skip_col=['index'])

In [None]:
# Output folder (note the trailing slash: later cells append names directly).
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'

In [None]:
# `save_dir` is a file-name prefix ('.../source_merge/final_'), not a folder:
# the exports below are written as '.../source_merge/final_<Table>.csv'.
# The original called os.makedirs(save_dir), which created a stray empty
# 'final_' directory; only the parent folder needs to exist.
save_dir = tmp_dir + 'source_merge/final_'
os.makedirs(os.path.dirname(save_dir), exist_ok=True)

#excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
#excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
#excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
#excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
#excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
#excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

In [None]:
#dataframe_viewer(test.query('ID=="F16M"'), rows=5)
# View `mdf` and the conflict table with the project viewer (rows=5 each).
# NOTE(review): `mdf` is not assigned in the nearby cells (the merge above
# produces `excel_bh_soil_an`) — confirm it is defined earlier in the notebook.
dataframe_viewer(mdf, rows=5), dataframe_viewer(conflict_df, rows=5)
#dataframe_viewer(source_bh, rows=5), dataframe_viewer(source_mes, rows=5)
#dataframe_viewer(bh, rows=5), dataframe_viewer(eqp, rows=5)