# ORGANISATION DES DONNEES

In [None]:
import re, os
import numpy as np
import pandas as pd
from difflib import get_close_matches

from utils.config import DEFAULT_POL_LEXICON, POL_NAMES_MODEL
from definitions import ROOT_DIR
from utils.io import dataframe_viewer, data_merger, data_validation, data_slicer, \
collect_time_data, replicate_values, gen_id_from_ech, na_col_drop, na_line_drop, col_ren, \
dble_col_drop, find_borehole_by_position, compute_borehole_length

### Création du répertoire de sauvegarde

In [None]:
save_dir = ROOT_DIR + '/CF_data/Result_traitem/organisation/'

In [None]:
# Create the output directory together with any missing parents.
# exist_ok=True makes the call a no-op when the directory is already there,
# so no separate (race-prone) existence check is needed.
os.makedirs(save_dir, exist_ok=True)

### Définition d'en-têtes usuelles

In [None]:
# Lookup from verbose measurement labels to short column names.
# The values are also appended to the measure column/criteria lists defined
# further down, so the insertion order is kept as it was.
MEAS_NAMES_MODEL = {
    'Fraction   2000 µm': 'Fract_2000µ',
    'Fraction   63 µm': 'Fract_63µ',
    'Fraction   45 µm': 'Fract_45µ',
    'Fraction   16 µm': 'Fract_16µ',
    'Fraction   2 µm': 'Fract_2µ',
    'Fraction 2 mm': 'Fract_2',
    'Fraction +2 mm': 'Fract_2+',
    # short names map onto themselves
    'Fract_2': 'Fract_2',
    'Fract_2+': 'Fract_2+',
    'Mat. organique': 'MO',
    'Mat. sèche': 'MS',
    'Argile': 'Fract_arg',
    'Fraction argileuse': 'Fract_arg',
}

In [None]:
# Keyword lists and regex fragments reused by the cells below.
# Patterns containing backslashes are raw strings: in a normal literal
# `\d`, `\s`, `\w` and `\(` are invalid escape sequences (a warning today,
# an error in a future Python). The resulting string values are unchanged.
params_kw = ['O_diss', 'Niv_eau', 'temp', '^T$', '^CE$', 'pH$', 'ORP']
meas_kw_col = ['O_diss', 'pH', 'CE', 'ORP', 'Niv_eau_pz', 'Niv_eau_sol', 'Temp']

# Suffixes / prefixes and capture pattern passed to gen_id_from_ech
sufx = ['sup', 'prof', 'inf', r'/\dM(\*)?']
prefx = ['eau forage ']
id_reg = r'\s*(?P<id>(?:^canne |Piezair )*\w*\d+\w*)\s*'

# Unique pollutant names: lexicon abbreviation keys plus POL_NAMES_MODEL values
pollutants_names = list(set(list(DEFAULT_POL_LEXICON.abbreviations.keys()) + list(POL_NAMES_MODEL.values())))

In [None]:
# Column names expected for each kind of table. Every list goes through
# set() to drop duplicates, so the resulting order is not significant.
bh_cols = list(set(['Date_for','ID','X','Y','Z','Zsol','Type','Long_for','Long_pz','Sect_crep','Long_pz_sol','Ht_pz_sol',
           'Diam_for','Diam_int_pz','Diam_ext_pz','Ht_chbre','Refus','Societe','Zone','Sous_zone','Etude','Method','Resp_chantier',
           'Emplacement','Rmq']))

# 'sulfures_tot' and 'N_Kjdl' are two separate names: without the comma
# between them Python concatenates the adjacent literals into one bogus
# column name. Literals that were listed twice have been removed (set()
# discarded them anyway).
# NOTE(review): both 'pH_H2O' (letter O) and 'pH_H20' (digit zero) are kept,
# as before — confirm whether the second spelling is a typo.
mes_cols = list(set(['Date_mes','ID','ID_ech','X','Y','Z','Zsol','pH_H2O', 'Temp_pH_H2O', 'Temp_pH_CaCl2','pH_CaCl2','Temp_pH_KCl',
            'pH_KCl','Residu_perte_feu','Fract_arg','Fract_min_2µ','Fract_min_50µ','Fract_min_2','Temp_pH_mes',
            'pH_H20', 'sulfures_tot', 'N_Kjdl','Temp_CE','Temp_pH','Nappe','Rmq','Fract_2000µ','Fract_63µ','Fract_45µ','Fract_16µ',
            'Fract_2µ','Temp_ech', 'Periode'] + meas_kw_col + list(MEAS_NAMES_MODEL.values())))

eqp_cols = list(set(['Date_for','ID','X','Y','Z','Zsol','Type_equip','Equip_base','Equip_top',
                     'Equip_epais','Rmq']))

litho_cols = list(set(['Date_for','ID','ID_ech','X','Y','Z','Zsol','Long_for','Litho_top','Litho_base','Intv_top',
                       'Intv_base','Litho_epais','Intv_epais','Keyword','Description','Rmq']))

an_cols = list(set(['ID','X','Y','Z','Zsol','Date_ech','ID_ech','Type_ech','Ech_top','Ech_base','Ech_epais',
                    'Intv_top','Intv_base','Description','Nappe','Organo','Intensite', 'Min_organo', 'Max_organo',
                    'Polluant','Surnageant','Sousnageant','Caractere','Opacite','Rmq'] + pollutants_names))

ukw_cols = list(set(['Date_for','ID','X','Y','Z','Zsol','Type','Long_for','Method','Societe','Rmq']))

# Table kind -> column list, passed to data_slicer together with crit_dict
cols_dict = {'borehole': bh_cols, 'measure': mes_cols, 'lithology': litho_cols, 'analysis': an_cols, 
 'equipement': eqp_cols, 'unknown': ukw_cols}

In [None]:
# Per-table criteria columns, passed to data_slicer alongside cols_dict.
bh_crit = [
    'ID', 'X', 'Y', 'Z', 'Zsol', 'Type', 'Long_for', 'Long_pz',
    'Diam_for', 'Diam_int_pz', 'Diam_ext_pz',
]
mes_crit = ['Date_mes'] + meas_kw_col + list(MEAS_NAMES_MODEL.values())
eqp_crit = ['Type_equip', 'Equip_base', 'Equip_top']
litho_crit = ['Litho_top', 'Litho_base', 'Intv_top', 'Intv_base', 'Description']
an_crit = (
    ['ID_ech', 'Type_ech', 'Organo', 'Surnageant', 'Sousnageant']
    + list(DEFAULT_POL_LEXICON.abbreviations.keys())
)
ukw_crit = ['ID', 'X', 'Y', 'Z', 'Zsol', 'Long_for', 'Type']

# Table kind -> criteria list (same keys as cols_dict)
crit_dict = {
    'borehole': bh_crit,
    'measure': mes_crit,
    'lithology': litho_crit,
    'analysis': an_crit,
    'equipement': eqp_crit,
    'unknown': ukw_crit,
}

Variables utilisées par jeu de données
================================
- bh 	: 	forages (simples ou piézomètres)
- eqp	:	équipements d'un forage (outils, méthodes utilisés, ...)
- ukw	:	objets physiques indéterminés
- litho :	descriptions lithologiques
- an 	: 	analyses de contaminants sur des échantillons (sol, eau)
- mes	:	mesures de propriétés sur des échantillons (sol, eau), de paramètres hydrochimiques, ...


# ---------------------------------------------------------

In [None]:
# Reset the per-source accumulators before processing the next workbook.
# Each name gets its own empty DataFrame: binding all six names to one shared
# object would let an in-place edit of one accumulator show up in the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (pd.DataFrame() for _ in range(3))
source_litho, source_an, source_mes = (pd.DataFrame() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 4-profondeur de contact campagne de forages octobre 2019.xlsx

* **Sheet : 'Feuil1'**

In [None]:
tmp_dir= save_dir + 'Prof_contact_sol_forage/'
sheet='Feuil1'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/profondeur de contact campagne de forages octobre 2019.xlsx', 
                   sheet_name='Feuil1', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df)

In [None]:
df.rename(columns={'n°forage ':'ID','profondeur(m)':'Long_for','x':'X', 'y':'Y', 'z':'Z'}, inplace=True)
df['Type']='Forage' # type is not defined clearly in data

In [None]:
# Normalise the identifier columns: every value that is neither a string nor
# missing is converted with str(); strings and missing values are kept.
id_cols = ['ID', 'ID_ech']
dtf = df  # alias, so the assignment below modifies `df` itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(
        lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
bh = df
source_bh=bh

In [None]:
# Make sure the output folder and its 'source_merge/' sub-folder both exist.
# makedirs creates missing parents, and exist_ok=True keeps re-runs safe even
# when only one of the two folders is already present.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Reset the per-source accumulators before processing the next workbook.
# Each name gets its own empty DataFrame: binding all six names to one shared
# object would let an in-place edit of one accumulator show up in the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (pd.DataFrame() for _ in range(3))
source_litho, source_an, source_mes = (pd.DataFrame() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 5-Forages_Pilote_Decoupe.xlsx

* **Sheet : 'leve'**

In [None]:
tmp_dir= save_dir + 'Forage_Pilote/'
sheet='leve_Z_elect_pos'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/geometrie_electrodes_et_sondes/Forages_Pilote_Decoupe.xlsx', 
                   sheet_name='leve')#, skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.rename(columns={'Ref_puits':'ID','Niveau mesuré':'Z_mes', 'Niveau corrigé':'Z','Z_diff [m] repere_local':'Diff_Z_local',
                   'long_fin [m]':'Long_for','Pos_Inox_#1 [m]':'Pos_Inox_#1', 'Unnamed: 11':'Rmq',
                   'Pos_Inox_#6 [m]':'Pos_Inox_#6', 'Pos_Impol_#3 [m]':'Pos_Impol_#3'}, inplace=True)

In [None]:
df['Type']='Forage' # type is not defined clearly in data
#df['ID']=df['ID'].apply(lambda x: 'F'+str(x).replace('.0',''))

In [None]:
# Normalise the identifier columns: every value that is neither a string nor
# missing is converted with str(); strings and missing values are kept.
id_cols = ['ID', 'ID_ech']
dtf = df  # alias, so the assignment below modifies `df` itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(
        lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
elc = df[['ID','Pos_Inox_#6', 'Pos_Impol_#3']] # 'ID' is for boreholes
bh = df[['ID','Long_for', 'Type']]# Z_local origin = 145.5 [m] # incoherence !??

In [None]:
source_bh = bh
source_elc = elc

In [None]:
# Make sure the output folder and its 'source_merge/' sub-folder both exist.
# makedirs creates missing parents, and exist_ok=True keeps re-runs safe even
# when only one of the two folders is already present.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)

bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
elc.to_csv(tmp_dir+sheet+'_Electrodes.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_elc.to_csv(tmp_dir+'source_merge/source_Electrodes.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_elect:{len(source_elc)} ;')

#### ======================================================================================

In [None]:
# Reset the per-source accumulators before processing the next workbook.
# Each name gets its own empty DataFrame: binding all six names to one shared
# object would let an in-place edit of one accumulator show up in the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (pd.DataFrame() for _ in range(3))
source_litho, source_an, source_mes = (pd.DataFrame() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 6-Liste XY investigations.xlsx
* **Sheet : 'SOL_EAU'**

In [None]:
tmp_dir= save_dir + 'Liste_XY/'
sheet='Sol_Eau'

In [None]:
# The four sheets read below all live in the same workbook.
xy_workbook = (ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
               'supplémentaires/Liste XY investigations.xlsx')
id_header = {'N°': 'ID_ech'}  # the 'N°' column is renamed to 'ID_ech' in every sheet

# Sheet 'SOL': tagged as sample type 'Sol'
df = pd.read_excel(xy_workbook, sheet_name='SOL')
df['Type_ech'] = 'Sol'
df.rename(columns=id_header, inplace=True)

# Sheet 'EAU PR': sample type 'Eau', Nappe 'Socle'
df1 = pd.read_excel(xy_workbook, sheet_name='EAU PR')
df1['Type_ech'] = 'Eau'
df1['Nappe'] = 'Socle'
df1.rename(columns=id_header, inplace=True)

# Sheet 'EAU RB': sample type 'Eau', Nappe 'remblais'
df2 = pd.read_excel(xy_workbook, sheet_name='EAU RB')
df2['Type_ech'] = 'Eau'
df2['Nappe'] = 'remblais'
df2.rename(columns=id_header, inplace=True)

# Sheet 'EAU ALL': sample type 'Eau', Nappe 'Alluvions'
df3 = pd.read_excel(xy_workbook, sheet_name='EAU ALL')
df3['Type_ech'] = 'Eau'
df3['Nappe'] = 'Alluvions'
df3.rename(columns=id_header, inplace=True)

In [None]:
mdf, conflict_df=data_merger(df1, df, 'outer', 'ID_ech')

In [None]:
mdf, conflict_df=data_merger(mdf, df2, 'outer', 'ID_ech')

In [None]:
df, conflict_df=data_merger(mdf, df3, 'outer', 'ID_ech')

In [None]:
dataset = df
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Nappe_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
df = gen_id_from_ech(df, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
# Normalise the identifier columns: every value that is neither a string nor
# missing is converted with str(); strings and missing values are kept.
id_cols = ['ID', 'ID_ech']
dtf = df  # alias, so the assignment below modifies `df` itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(
        lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
an = df
source_an = an
#source_an.insert(0,'ID', source_an.pop('ID_ech'))

In [None]:
# Make sure the output folder and its 'source_merge/' sub-folder both exist.
# makedirs creates missing parents, and exist_ok=True keeps re-runs safe even
# when only one of the two folders is already present.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Reset the per-source accumulators before processing the next workbook.
# Each name gets its own empty DataFrame: binding all six names to one shared
# object would let an in-place edit of one accumulator show up in the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (pd.DataFrame() for _ in range(3))
source_litho, source_an, source_mes = (pd.DataFrame() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 7-Résultats phase 1_MEMORIS.xls
* **Sheet : 'Résult SOL'**

In [None]:
tmp_dir= save_dir + 'Phase_1_Memoris/'
sheet='Result_sol'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 1/'
                   'Résultats phase 1_MEMORIS.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Split the sheet by row label (.loc slices are label-based and inclusive):
# labels up to 35 go to `ech_df`, labels from 36 onward go to `an`.
# Explicit copies are taken because `an` is enlarged in place right after;
# writing into a slice of `df` is ambiguous and triggers pandas warnings.
ech_df = df.loc[:35].copy()
an = df.loc[36:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
ech_df=col_ren(ech_df, name=name, mode=1)

In [None]:
# Expand the one-letter description codes; any other value is left as is.
# A mapping replaces the former row-by-row loop, which relied on the index
# being exactly 0..n-1.
description_codes = {'R': 'Remblais', 'L': 'Limons', 'A': 'Argiles', 'S': 'Sables'}
ech_df['Description'] = ech_df['Description'].replace(description_codes)

# 'Refus' becomes 'x' when the cell does NOT contain x/X, '' otherwise.
# NOTE(review): this reads as inverted (a marked cell ends up empty) —
# confirm against the meaning of the source column before changing it.
ech_df['Refus']=ech_df['Refus'].apply(lambda x: 'x' if not re.search('x|X', str(x)) else '')
ech_df.insert(1,'Type_ech','Sol')

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={'col_0':'ID_ech', 'col_34':'phénanthrène', 'col_63':'EOX'}, inplace=True)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')

In [None]:
an = col_ren(an, name=POL_NAMES_MODEL, mode=1)#, verbose=True)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Derive the 'ID' column from the sample identifiers.
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)
# Append 'M' to every ID. Missing IDs are left untouched: NaN + 'M' would
# raise a TypeError (the 'Résult EAU' cell guards its ID rewrite the same way).
df['ID'] = df['ID'].apply(lambda x: x + 'M' if not pd.isnull(x) else x)

In [None]:
# Normalise the identifier columns: every value that is neither a string nor
# missing is converted with str(); strings and missing values are kept.
id_cols = ['ID', 'ID_ech']
dtf = df  # alias, so the assignment below modifies `df` itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(
        lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Unpack the tables produced by data_slicer.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Rows classified as 'unknown' are removed from the borehole table (by index label)
bh = bh.drop(index=ukw.index)
# Keep one row per ID in both tables
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# `col == col` is False for NaN, so the query keeps rows where ID and X are both set
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# Separator fixed: the first fragment now ends with '; ' so the output reads
# '... ; equipement: ...' like the same summary in the other sections.
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh = bh
source_an = an
source_mes = mes

In [None]:
# Make sure the output folder and its 'source_merge/' sub-folder both exist.
# makedirs creates missing parents, and exist_ok=True keeps re-runs safe even
# when only one of the two folders is already present.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Résult EAU'**

In [None]:
tmp_dir= save_dir + 'Phase_1_Memoris/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 1/'
                   'Résultats phase 1_MEMORIS.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)
dataframe_viewer(df, rows=5)

In [None]:
# Split the sheet by row label (.loc slices are label-based and inclusive):
# labels up to 32 go to `ech_df`, labels from 33 onward go to `an`.
# Explicit copies are taken because `an` is enlarged in place right after;
# writing into a slice of `df` is ambiguous and triggers pandas warnings.
ech_df = df.loc[:32].copy()
an = df.loc[33:].copy()

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
# Divide 'CE' readings by 1000; values that are missing or do not start with
# a digit become NaN. The pattern is a raw string so that `\d` is not treated
# as an (invalid) string escape.
ech_df['CE'] = ech_df['CE'].apply(
    lambda x: pd.to_numeric(x) / 1000
    if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech','Date_ech','Num_maille','Affectation','X','Y','Zsol','Long_for','Prof_crep','Long_pz',
      'Niv_eau_sol','pH','CE','Temp']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df.insert(1,'Type_ech','Eau')

In [None]:
# Strip the square brackets from 'Prof_crep'. The result is assigned back:
# an in-place replace on a column selection is not guaranteed to reach the
# parent frame (it does not under pandas copy-on-write).
ech_df['Prof_crep'] = ech_df['Prof_crep'].replace(r'\[|\]', '', regex=True)

# Split 'top-base' on '-' for the whole column at once. Missing values stay
# missing instead of raising, and reindex guarantees that both pieces exist
# even when no row contains a '-'.
crep_bounds = ech_df['Prof_crep'].str.split('-', expand=True).reindex(columns=[0, 1])
ech_df['Equip_top'] = crep_bounds[0]
ech_df['Equip_base'] = crep_bounds[1]

ech_df['Type_equip'] = 'Crepine'
ech_df.drop(columns=['Prof_crep'], inplace=True)

In [None]:
#ech_df['ID_ech'].replace('Canne ', 'Can', inplace=True, regex=True)
# Replace line breaks inside the sample IDs with a space. The result is
# assigned back: an in-place replace on a column selection is not guaranteed
# to reach the parent frame (it does not under pandas copy-on-write).
ech_df['ID_ech'] = ech_df['ID_ech'].replace('\n', ' ', regex=True)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech', 'col_43':'phénanthrène'}, inplace=True)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [None]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
df['ID'] = df['ID'].apply(lambda x: re.sub('^P', 'F', str(x)) if not pd.isnull(x) else x)

In [None]:
df['Type'] = 'Piezo'
df['Date_mes'] = df['Date_ech']

In [None]:
# Normalise the identifier columns: every value that is neither a string nor
# missing is converted with str(); strings and missing values are kept.
id_cols = ['ID', 'ID_ech']
dtf = df  # alias, so the assignment below modifies `df` itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(
        lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Unpack the tables produced by data_slicer.
mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']
ukw, bh = df_dict['unknown'], df_dict['borehole']

# Rows classified as 'unknown' are removed from the borehole table (by index label)
bh = bh.drop(index=ukw.index)

# Keep one row per ID in both tables
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)

# `col == col` is False for NaN, so the query keeps rows where ID and X are both set
if 'X' in bh.columns:
    bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_eqp = eqp

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Type_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = source_bh
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Long_for_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Make sure the output folder and its 'source_merge/' sub-folder both exist.
# makedirs creates missing parents, and exist_ok=True keeps re-runs safe even
# when only one of the two folders is already present.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Reset the per-source accumulators before processing the next workbook.
# Each name gets its own empty DataFrame: binding all six names to one shared
# object would let an in-place edit of one accumulator show up in the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (pd.DataFrame() for _ in range(3))
source_litho, source_an, source_mes = (pd.DataFrame() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 8-Résultats phase 2_MEMORIS.xls
* **Sheet : 'Résult SOL'**

In [None]:
tmp_dir= save_dir + 'Phase_2_Memoris/'
sheet='Result_SOL'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'Résultats phase 2_MEMORIS.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)
dataframe_viewer(df, rows=5)

In [None]:
# Split the sheet by row label (.loc slices are label-based and inclusive):
# labels up to 35 go to `ech_df`, labels from 36 onward go to `an`.
# Explicit copies are taken because `an` is enlarged in place right after;
# writing into a slice of `df` is ambiguous and triggers pandas warnings.
ech_df = df.loc[:35].copy()
an = df.loc[36:].copy()

In [None]:
an.loc[0.5] = df.loc[1] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech', 'Date_ech', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
ech_df=col_ren(ech_df, name=name, mode=1)

In [None]:
set(ech_df['Description'])

In [None]:
# Expand the short description codes; any other value is left as is.
# A mapping replaces the former row-by-row loop, which relied on the index
# being exactly 0..n-1.
description_codes = {'R': 'Remblais', 'L': 'Limons',
                     'LA': 'Limons et argiles', 'LS': 'Limons et sables'}
ech_df['Description'] = ech_df['Description'].replace(description_codes)

# 'Refus' becomes 'x' when the cell does NOT contain '#', '' otherwise.
# NOTE(review): confirm what '#' stands for in the source sheet — the
# condition may be inverted.
ech_df['Refus']=ech_df['Refus'].apply(lambda x: 'x' if not re.search('#', str(x)) else '')
ech_df.insert(1,'Type_ech','Sol')

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={'col_0':'ID_ech', 'col_34':'phénanthrène'}, inplace=True)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
df['Type'] = 'Forage'

In [None]:
# Normalise the identifier columns: every value that is neither a string nor
# missing is converted with str(); strings and missing values are kept.
id_cols = ['ID', 'ID_ech']
dtf = df  # alias, so the assignment below modifies `df` itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(
        lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Unpack the tables produced by data_slicer.
mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']
ukw, bh = df_dict['unknown'], df_dict['borehole']

# Rows classified as 'unknown' are removed from the borehole table (by index label)
bh = bh.drop(index=ukw.index)

# Keep one row per ID in both tables
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)

# `col == col` is False for NaN, so the query keeps rows where ID and X are both set
if 'X' in bh.columns:
    bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_bh = bh
source_an = an
source_mes = mes

In [None]:
# Make sure the output folder and its 'source_merge/' sub-folder both exist.
# makedirs creates missing parents, and exist_ok=True keeps re-runs safe even
# when only one of the two folders is already present.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Résult EAU'**

In [None]:
tmp_dir= save_dir + 'Phase_2_Memoris/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'Résultats phase 2_MEMORIS.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Split the sheet by row label (.loc slices are label-based and inclusive):
# labels up to 32 go to `ech_df`, labels from 33 onward go to `an`.
# Explicit copies are taken because `an` is enlarged in place right after;
# writing into a slice of `df` is ambiguous and triggers pandas warnings.
ech_df = df.loc[:32].copy()
an = df.loc[33:].copy()

In [None]:
an.loc[0.5] = df.loc[1] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
# Divide 'CE' readings by 1000; values that are missing or do not start with
# a digit become NaN. The pattern is a raw string so that `\d` is not treated
# as an (invalid) string escape.
ech_df['CE'] = ech_df['CE'].apply(
    lambda x: pd.to_numeric(x) / 1000
    if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech', 'Date_ech','Affectation','X', 'Y','Zsol', 'Long_for','Prof_crep','Long_pz_sol',
      'Niv_eau_sol','pH', 'CE', 'Temp']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df.insert(1,'Type_ech','Eau')

In [None]:
# Replace line breaks inside the sample IDs with a space. The result is
# assigned back: an in-place replace on a column selection is not guaranteed
# to reach the parent frame (it does not under pandas copy-on-write).
ech_df['ID_ech'] = ech_df['ID_ech'].replace('\n', ' ', regex=True)

In [None]:
# Strip the square brackets from 'Prof_crep'. The result is assigned back:
# an in-place replace on a column selection is not guaranteed to reach the
# parent frame (it does not under pandas copy-on-write).
ech_df['Prof_crep'] = ech_df['Prof_crep'].replace(r'\[|\]', '', regex=True)

# Split 'top-base' on '-' for the whole column at once. Missing values stay
# missing instead of raising, and reindex guarantees that both pieces exist
# even when no row contains a '-'.
crep_bounds = ech_df['Prof_crep'].str.split('-', expand=True).reindex(columns=[0, 1])
ech_df['Equip_top'] = crep_bounds[0]
ech_df['Equip_base'] = crep_bounds[1]

ech_df.drop(columns=['Prof_crep'], inplace=True)
ech_df['Type_equip'] = 'Crepine'
ech_df['Type']='Piezo'

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={'col_0':'ID_ech', 'col_43':'phénanthrène'}, inplace=True)

In [None]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [None]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']

In [None]:
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx)

In [None]:
# Manual override: force the ID of the row labelled 6 to 'P23'.
# NOTE(review): the row label is hard-coded, so this silently patches the
# wrong row if the sheet content or row order changes — confirm it still
# targets the intended sample.
df.loc[6, 'ID'] = 'P23'
# Replace a leading 'P' with 'F' in every non-null ID (null IDs are kept).
df['ID'] = df['ID'].apply(lambda x: re.sub('^P', 'F', str(x)) if not pd.isnull(x) else x)

In [None]:
# Normalise the identifier columns: every value that is neither a string nor
# missing is converted with str(); strings and missing values are kept.
id_cols = ['ID', 'ID_ech']
dtf = df  # alias, so the assignment below modifies `df` itself
for id_col in id_cols:
    if id_col not in dtf.columns:
        continue
    dtf[id_col] = dtf[id_col].apply(
        lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Unpack the tables produced by data_slicer.
mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']
ukw, bh = df_dict['unknown'], df_dict['borehole']

# Rows classified as 'unknown' are removed from the borehole table (by index label)
bh = bh.drop(index=ukw.index)

# Keep one row per ID in both tables
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)

# `col == col` is False for NaN, so the query keeps rows where ID and X are both set
if 'X' in bh.columns:
    bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
source_eqp = eqp

In [None]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Type_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
dataset = source_bh
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Type_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Make sure the output folder and its 'source_merge/' sub-folder both exist.
# makedirs creates missing parents, and exist_ok=True keeps re-runs safe even
# when only one of the two folders is already present.
os.makedirs(tmp_dir + 'source_merge/', exist_ok=True)
    
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Initialise the running source tables.
# Each name gets its own empty DataFrame: binding all six to one shared object would let an
# in-place operation on one table silently modify the others.
_df = pd.DataFrame()  # kept in case other cells still refer to it
source_bh, source_eqp, source_ukw = (pd.DataFrame() for _ in range(3))
source_litho, source_an, source_mes = (pd.DataFrame() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 9-Ensemble des résultats Memoris version Seafile.xls
* **Sheet : 'Résult SOL'**

In [None]:
# Output directory and file-name prefix used when saving this sheet.
tmp_dir = f'{save_dir}Memoris_seafile/'
sheet = 'Result_SOL'

In [None]:
# Read sheet 'Résult SOL' (skiprows=4), run it through na_line_drop then na_col_drop,
# and hand the result to dataframe_viewer with rows=5.
xls_file = (ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
            'Ensemble des résultats Memoris version Seafile.xls')
df = pd.read_excel(xls_file, sheet_name='Résult SOL', skiprows=4)
df = na_col_drop(na_line_drop(df, 0), 1)
dataframe_viewer(df, rows=5)

In [None]:
# Label-based split of the sheet: rows up to 37 go to ech_df, rows from 38 on go to an
# (presumably sample description vs. analytical results - check against the workbook).
ech_df, an = df.loc[:37], df.loc[38:]

In [None]:
# Copy row 0 of df into an under the label 0.5; after sort_index that row comes before the
# labels inherited from the slice (38 and above), and the index is then renumbered from 0.
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
# Transpose, renumber the rows from 0, then let col_ren rename the columns.
ech_df = col_ren(ech_df.T.reset_index(drop=True), 1)

In [None]:
# Discard the first five rows (labels 0-4), then apply na_col_drop / na_line_drop,
# renumbering the rows after each step that removes lines.
ech_df = ech_df.drop(index=list(range(5))).reset_index(drop=True)
ech_df = na_line_drop(na_col_drop(ech_df, 1), 3).reset_index(drop=True)

In [None]:
# Remove the third- and fourth-from-last columns.
unwanted = ech_df.columns[[-3, -4]]
ech_df = ech_df.drop(columns=unwanted)

In [None]:
# Column names handed to col_ren (mode=1) for the sample table.
name = [
    'ID_ech', 'Date_ech', 'Description', 'Organo',
    'Long_for', 'Refus', 'Ech_top', 'Ech_base',
    'MS', 'Fract_2', 'Fract_2+',
]
ech_df = col_ren(ech_df, name=name, mode=1)

In [None]:
# Distinct values found in 'Description' (shown as the cell's output).
set(ech_df['Description'])

In [None]:
# Expand the abbreviated 'Description' codes; any value not listed here is left unchanged.
desc_labels = {
    'R': 'Remblais',
    'R ': 'Remblais',
    'L': 'Limons',
    'LA': 'Limons et argiles',
    'LS': 'Limons et sables',
}
ech_df['Description'] = ech_df['Description'].map(lambda code: desc_labels.get(code, code))

# 'Refus' becomes '' when the cell text contains '#', and 'x' otherwise.
ech_df['Refus'] = ech_df['Refus'].map(lambda v: '' if '#' in str(v) else 'x')
ech_df.insert(1, 'Type_ech', 'Sol')

In [None]:
# Transpose the analysis table and renumber its rows from 0.
an = an.T.reset_index(drop=True)

In [None]:
# Rename the columns of the transposed table with col_ren (second argument 1);
# later cells refer to resulting names such as 'col_0' and 'col_34'.
an=col_ren(an, 1)

In [None]:
# Strip '<' / '>' from text cells, then set to NaN every text cell that contains '-'.
# NOTE(review): the sheet-loading cells elsewhere use the pattern '-$' rather than '-'; confirm which one is meant here.
an = an.replace(r'<|>', '', regex=True).replace(r'-', np.nan, regex=True)

In [None]:
# Pass the table through dble_col_drop (presumably removes duplicated columns - confirm in utils.io).
an=dble_col_drop(an)

In [None]:
# Discard the first five rows (labels 0-4), renumber, then apply na_col_drop.
an = na_col_drop(an.drop(index=list(range(5))).reset_index(drop=True), 1)

In [None]:
# Give the two placeholder columns their real names and tag every row as a soil sample.
placeholder_names = {'col_0': 'ID_ech', 'col_34': 'phénanthrène'}
an = an.rename(columns=placeholder_names)
an.insert(1, 'Type_ech', 'Sol')

In [None]:
# Rename the columns through col_ren using the POL_NAMES_MODEL mapping (mode=1).
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
# Combine the sample table and the analysis table (how='outer', keyed on ID_ech);
# the second return value is kept as conflict_df.
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Every row of this sheet gets Type 'Forage'.
mdf['Type'] = 'Forage'

In [None]:
# Call gen_id_from_ech with the suffix/prefix lists and the capture regex defined at the top of the file
# (presumably fills an 'ID' column from 'ID_ech' - confirm in utils.io).
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
# Convert every non-null, non-string value of the ID columns to str (columns absent from df are skipped).
id_cols = ['ID', 'ID_ech']
dtf = df
for id_col in (c for c in id_cols if c in dtf.columns):
    dtf[id_col] = dtf[id_col].map(lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
# Split df with data_slicer; the result is indexed by category name
# ('unknown', 'borehole', 'measure', 'analysis', 'lithology', 'equipement').
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Pull the 'unknown' and 'borehole' tables out of the sliced data.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Remove from the borehole table every row whose index label also appears in the unknown table.
bh = bh.drop(index=ukw.index)
# Keep one row per ID; the in-place calls on ukw act on the very object stored in df_dict['unknown'].
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# 'ID==ID and X==X' is False wherever the value is NaN, so rows lacking an ID or an X value are discarded.
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

# The remaining categories are taken as returned by data_slicer.
mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# Row count of each table.
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
# First sheet of this source: the running tables start as this sheet's tables.
source_bh, source_an, source_mes = bh, an, mes

In [None]:
# Make sure both tmp_dir and its 'source_merge/' sub-directory exist. exist_ok=True also covers
# the case where tmp_dir is already present but 'source_merge/' is not, which a guard on
# tmp_dir alone would skip and leave the to_csv calls below failing.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Tables extracted from the current sheet.
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Running tables accumulated over the sheets of this source.
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

# Row count of each running table.
print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Résult EAU'**

In [None]:
# Output directory and file-name prefix used when saving this sheet.
tmp_dir = f'{save_dir}Memoris_seafile/'
sheet = 'Result_eau'

In [None]:
# Read sheet 'Résult EAU' (skiprows=4) and run it through na_line_drop then na_col_drop.
xls_file = (ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
            'Ensemble des résultats Memoris version Seafile.xls')
df = pd.read_excel(xls_file, sheet_name='Résult EAU', skiprows=4)
df = na_col_drop(na_line_drop(df, 0), 1)
# Strip '<' / '>' from text cells, then set to NaN the text cells that end with '-'.
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Label-based split of the sheet: rows up to 32 go to ech_df, rows from 33 on go to an
# (presumably sample description vs. analytical results - check against the workbook).
ech_df, an = df.loc[:32], df.loc[33:]

In [None]:
# Copy row 0 of df into an under the label 0.5; after sort_index that row comes before the
# labels inherited from the slice (33 and above), and the index is then renumbered from 0.
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
# Transpose, renumber the rows from 0, then let col_ren rename the columns.
ech_df = col_ren(ech_df.T.reset_index(drop=True), 1)

In [None]:
# Divide by 1000 every 'CE' value whose text starts with a digit; every other value
# (including NaN) becomes NaN.
# The pattern is a raw string so that '\d' is not read as an (invalid) string escape.
ech_df['CE'] = ech_df['CE'].apply(
    lambda x: pd.to_numeric(x) / 1000 if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
# Discard the first five rows (labels 0-4) and renumber the remaining ones from 0.
ech_df = ech_df.drop(index=list(range(5))).reset_index(drop=True)

In [None]:
# Pass the table through dble_col_drop (presumably removes duplicated columns - confirm in utils.io).
ech_df=dble_col_drop(ech_df)

In [None]:
# Apply na_col_drop then na_line_drop, and renumber the rows from 0.
ech_df = na_line_drop(na_col_drop(ech_df, 1), 3).reset_index(drop=True)

In [None]:
# Remove the third column (position 2). The former axis=2 argument was dropped: a DataFrame
# has no axis 2, and `columns=` already states which axis the labels belong to.
ech_df.drop(columns=ech_df.columns[[2]], inplace=True)

In [None]:
# Column names handed to col_ren (mode=1) for the water-sample table; every row is then tagged 'Eau'.
name = [
    'ID_ech', 'Date_ech', 'Affectation',
    'X', 'Y', 'Zsol',
    'Long_for', 'Prof_crep', 'Long_pz_sol',
    'Niv_eau_sol', 'pH', 'CE', 'Temp',
]
ech_df = col_ren(ech_df, name=name, mode=1)
ech_df.insert(1, 'Type_ech', 'Eau')

In [None]:
# Strip the square brackets from 'Prof_crep'. The result is assigned back instead of using
# inplace=True on a column selection, which does not reach ech_df under pandas Copy-on-Write.
ech_df['Prof_crep'] = ech_df['Prof_crep'].replace(r'\[|\]', '', regex=True)

# Split on '-': first part -> 'Equip_top', second part -> 'Equip_base'.
# Missing values or values without '-' yield NaN instead of raising.
bounds = ech_df['Prof_crep'].str.split('-')
ech_df['Equip_top'] = bounds.str[0]
ech_df['Equip_base'] = bounds.str[1]

ech_df.drop(columns=['Prof_crep'], inplace=True)
ech_df['Type_equip'] = 'Crepine'

In [None]:
# Replace line breaks inside sample IDs by a space. Assigned back rather than using inplace=True
# on a column selection, which does not reach ech_df under pandas Copy-on-Write.
ech_df['ID_ech'] = ech_df['ID_ech'].replace('\n', ' ', regex=True)

In [None]:
# Transpose the analysis table and renumber its rows from 0.
an = an.T.reset_index(drop=True)

In [None]:
# Rename the columns of the transposed table with col_ren (second argument 1);
# a later cell refers to the resulting name 'col_43'.
an=col_ren(an, 1)

In [None]:
# Strip '<' / '>' from text cells, then set to NaN every text cell that contains '-'.
an = an.replace(r'<|>', '', regex=True).replace(r'-', np.nan, regex=True)
# Name the first column 'ID_ech' and fix two further column names.
new_names = {an.columns[0]: 'ID_ech', 'Température pour mes. pH': 'Temp_pH', 'col_43': 'phénanthrène'}
an = an.rename(columns=new_names)

In [None]:
# Discard the first five rows (labels 0-4) and renumber the remaining ones from 0.
an = an.drop(index=list(range(5))).reset_index(drop=True)

In [None]:
# Pass the table through dble_col_drop (presumably removes duplicated columns - confirm in utils.io).
an=dble_col_drop(an)

In [None]:
# Remove the sixth- and fifth-from-last columns, apply na_col_drop, then tag every row as 'Eau'.
an = na_col_drop(an.drop(columns=an.columns[[-6, -5]]), 1)
an.insert(1, 'Type_ech', 'Eau')

In [None]:
# Replace line breaks inside sample IDs by a space. Assigned back rather than using inplace=True
# on a column selection, which does not reach `an` under pandas Copy-on-Write.
an['ID_ech'] = an['ID_ech'].replace('\n', ' ', regex=True)

In [None]:
# Rename the columns through col_ren using the POL_NAMES_MODEL mapping (mode=1).
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
# Combine the sample table and the analysis table (how='outer', keyed on ID_ech);
# the second return value is kept as conflict_df.
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Every row of this sheet gets Type 'Piezo', and the measurement date is the sampling date.
mdf = mdf.assign(Type='Piezo', Date_mes=mdf['Date_ech'])

In [None]:
# Call gen_id_from_ech with the suffix/prefix lists and the capture regex defined at the top of the file
# (the next cell reads the 'ID' column of the result).
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
# Swap a leading 'P' for 'F' in every non-null ID; null IDs are passed through unchanged.
df['ID'] = df['ID'].map(lambda v: v if pd.isnull(v) else re.sub('^P', 'F', str(v)))

In [None]:
# Convert every non-null, non-string value of the ID columns to str (columns absent from df are skipped).
id_cols = ['ID', 'ID_ech']
dtf = df
for id_col in (c for c in id_cols if c in dtf.columns):
    dtf[id_col] = dtf[id_col].map(lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
# Split df with data_slicer; the result is indexed by category name
# ('unknown', 'borehole', 'measure', 'analysis', 'lithology', 'equipement').
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Pull the 'unknown' and 'borehole' tables out of the sliced data.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Remove from the borehole table every row whose index label also appears in the unknown table.
bh = bh.drop(index=ukw.index)
# Keep one row per ID; the in-place calls on ukw act on the very object stored in df_dict['unknown'].
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# 'ID==ID and X==X' is False wherever the value is NaN, so rows lacking an ID or an X value are discarded.
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

# The remaining categories are taken as returned by data_slicer.
mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# Row count of each table.
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
# The running equipment table is simply this sheet's equipment table (no merge).
source_eqp = eqp

In [None]:
# Merge this sheet's borehole table into the running source_bh (how='outer', keyed on ID);
# the second return value is kept as conflict_df.
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Run data_validation on the merged borehole table, passing every conflict row for
# columns 'Long_for_x' and 'Type_y'.
dataset = source_bh
conflict_rows = {col: list(conflict_df.index) for col in ('Long_for_x', 'Type_y')}
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',
                valid_dict=conflict_rows)

# When a 'level_0' column is present it replaces any existing 'index' column.
has_level0 = 'level_0' in dataset.columns
if has_level0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level0:
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
# Merge this sheet's analysis table into the running source_an (how='outer', keyed on ID_ech and Type_ech);
# the second return value is kept as conflict_df.
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Type_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Merge this sheet's measure table into the running source_mes (how='outer', keyed on ID_ech);
# the second return value is kept as conflict_df.
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Make sure both tmp_dir and its 'source_merge/' sub-directory exist. exist_ok=True also covers
# the case where tmp_dir is already present but 'source_merge/' is not, which a guard on
# tmp_dir alone would skip and leave the to_csv calls below failing.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Tables extracted from the current sheet.
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Running tables accumulated over the sheets of this source.
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

# Row count of each running table.
print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

#### ======================================================================================

In [None]:
# Initialise the running source tables.
# Each name gets its own empty DataFrame: binding all six to one shared object would let an
# in-place operation on one table silently modify the others.
_df = pd.DataFrame()  # kept in case other cells still refer to it
source_bh, source_eqp, source_ukw = (pd.DataFrame() for _ in range(3))
source_litho, source_an, source_mes = (pd.DataFrame() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 10-Résultats SOL container phyto t=0_décret sol.xls
* **Sheet : 'Résult SOL'**

In [None]:
# Output directory and file-name prefix used when saving this sheet.
tmp_dir = f'{save_dir}Container_phyto/'
sheet = 'Result_SOL'

In [None]:
# Read sheet 'Résult SOL' (skiprows=4) and run it through na_line_drop then na_col_drop.
xls_file = (ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
            'Résultats SOL container phyto t=0_décret sol.xls')
df = pd.read_excel(xls_file, sheet_name='Résult SOL', skiprows=4)
df = na_col_drop(na_line_drop(df, 0), 1)
# Strip '<' / '>' from text cells, then set to NaN the text cells that end with '-'.
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Label-based split of the sheet: rows up to 21 go to ech_df, rows from 22 on go to an
# (presumably sample description vs. analytical results - check against the workbook).
ech_df, an = df.loc[:21], df.loc[22:]

In [None]:
# Copy row 0 of df into an under the label 0.5; after sort_index that row comes before the
# labels inherited from the slice (22 and above), and the index is then renumbered from 0.
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
# Transpose, renumber the rows from 0, then let col_ren rename the columns.
ech_df = col_ren(ech_df.T.reset_index(drop=True), 1)

In [None]:
# Pass the table through dble_col_drop (presumably removes duplicated columns - confirm in utils.io).
ech_df=dble_col_drop(ech_df)

In [None]:
# Discard the first five rows (labels 0-4), then apply na_col_drop / na_line_drop,
# renumbering the rows after each step that removes lines.
ech_df = ech_df.drop(index=list(range(5))).reset_index(drop=True)
ech_df = na_line_drop(na_col_drop(ech_df, 2), 3).reset_index(drop=True)

In [None]:
# Remove the third-from-last column.
unwanted = ech_df.columns[[-3]]
ech_df = ech_df.drop(columns=unwanted)

In [None]:
# Column names handed to col_ren (mode=1); rows whose ID_ech is NaN are then discarded
# and every remaining row is tagged 'Sol'.
name = [
    'ID_ech', 'Ech_top', 'Ech_base', 'MS',
    'Date_ech', 'Fract_2', 'Fract_2+',
]
ech_df = col_ren(ech_df, name=name, mode=1).query('ID_ech==ID_ech')
ech_df.insert(1, 'Type_ech', 'Sol')

In [None]:
# Transpose the analysis table and renumber its rows from 0.
an = an.T.reset_index(drop=True)

In [None]:
# Rename the columns of the transposed table with col_ren (second argument 1);
# a later cell refers to the resulting name 'col_35'.
an=col_ren(an, 1)

In [None]:
# Strip '<' / '>' from text cells, then set to NaN every text cell that contains '-'.
an = an.replace(r'<|>', '', regex=True).replace(r'-', np.nan, regex=True)

In [None]:
# Pass the table through dble_col_drop (presumably removes duplicated columns - confirm in utils.io).
an=dble_col_drop(an)

In [None]:
# Discard the first five rows (labels 0-4), renumber, then apply na_col_drop.
an = na_col_drop(an.drop(index=list(range(5))).reset_index(drop=True), 1)
# Name the first column 'ID_ech', fix the 'col_35' placeholder and tag every row as 'Sol'.
an = an.rename(columns={an.columns[0]: 'ID_ech', 'col_35': 'phénanthrène'})
an.insert(1, 'Type_ech', 'Sol')

In [None]:
# Rename the columns through col_ren using the POL_NAMES_MODEL mapping (mode=1).
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
# Combine the sample table and the analysis table (how='outer', keyed on ID_ech);
# the second return value is kept as conflict_df.
df, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Convert every non-null, non-string value of the ID columns to str (columns absent from df are skipped).
id_cols = ['ID', 'ID_ech']
dtf = df
for id_col in (c for c in id_cols if c in dtf.columns):
    dtf[id_col] = dtf[id_col].map(lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
# Build the ID as 'bh_' + sample ID (null sample IDs stay null); the measurement date is the sampling date.
df['ID'] = df['ID_ech'].map(lambda v: v if pd.isnull(v) else 'bh_' + v)
df['Date_mes'] = df['Date_ech']

In [None]:
# Split df with data_slicer; the result is indexed by category name
# ('unknown', 'borehole', 'measure', 'analysis', 'lithology', 'equipement').
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Pull the 'unknown' and 'borehole' tables out of the sliced data.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Remove from the borehole table every row whose index label also appears in the unknown table.
bh = bh.drop(index=ukw.index)
# Keep one row per ID; the in-place calls on ukw act on the very object stored in df_dict['unknown'].
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# 'ID==ID and X==X' is False wherever the value is NaN, so rows lacking an ID or an X value are discarded.
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

# The remaining categories are taken as returned by data_slicer.
mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# Row count of each table.
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
# First sheet of this source: the running analysis and measure tables start as this sheet's tables.
source_an, source_mes = an, mes

In [None]:
# Make sure both tmp_dir and its 'source_merge/' sub-directory exist. exist_ok=True also covers
# the case where tmp_dir is already present but 'source_merge/' is not, which a guard on
# tmp_dir alone would skip and leave the to_csv calls below failing.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Tables extracted from the current sheet.
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Running tables accumulated over the sheets of this source.
#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

# Row count of each running table.
print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

* **Sheet : 'Paramètres agro.'**

In [None]:
# Output directory and file-name prefix used when saving this sheet.
tmp_dir = f'{save_dir}Container_phyto/'
sheet = 'Param_agro'

In [None]:
# Read sheet 'Paramètres agro.' (skiprows=4) and run it through na_line_drop then na_col_drop.
xls_file = (ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
            'Résultats SOL container phyto t=0_décret sol.xls')
df = pd.read_excel(xls_file, sheet_name='Paramètres agro.', skiprows=4)
df = na_col_drop(na_line_drop(df, 0), 1)
# Strip '<' / '>' from text cells, then set to NaN the text cells that end with '-'.
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
# Transpose, renumber the rows from 0, then let col_ren rename the columns (second argument 0).
df = col_ren(df.T.reset_index(drop=True), 0)

In [None]:
# Discard the first row (label 0) and renumber the remaining ones from 0.
df = df.drop(index=[0]).reset_index(drop=True)

In [None]:
# Pass the table through dble_col_drop (presumably removes duplicated columns - confirm in utils.io).
df=dble_col_drop(df)

In [None]:
# Apply na_col_drop then na_line_drop, and renumber the rows from 0.
df = na_line_drop(na_col_drop(df, 1), 3).reset_index(drop=True)

In [None]:
# Remove the columns at positions 5 and 6. The former axis=2 argument was dropped: a DataFrame
# has no axis 2, and `columns=` already states which axis the labels belong to.
df.drop(columns=df.columns[[5,6]], inplace=True)

In [None]:
# Column names handed to col_ren (mode=1); every row is then tagged 'Sol'.
# 'pH_H2O' is spelled with the letter O (it previously ended in the digit 0), so that it matches
# the 'pH_H2O' entry of mes_cols defined at the top of the file.
name=['ID_ech','Ech_top','Ech_base','MS','Date_ech','MO','Residu_perte_feu','COT','Fract_arg','Fract_min_2µ', 
      'Fract_min_50µ', 'Fract_min_2', 'Fract_2', 'Fract_2+', 'pH_KCl','Temp_pH_mes', 'pH_H2O', 'Sulfure_tot', 
      'Chlorure', 'N_Kjdl']
df=col_ren(df, name=name, mode=1)
df.insert(1,'Type_ech','Sol')

In [None]:
#mdf['Type'] = 'Forage'
# The measurement date is taken to be the sampling date.
df['Date_mes'] = df['Date_ech']

In [None]:
# Convert every non-null, non-string value of the ID columns to str (columns absent from df are skipped).
id_cols = ['ID', 'ID_ech']
dtf = df
for id_col in (c for c in id_cols if c in dtf.columns):
    dtf[id_col] = dtf[id_col].map(lambda v: v if isinstance(v, str) or pd.isnull(v) else str(v))

In [None]:
# Build the ID as 'bh_' + sample ID; null sample IDs stay null.
df['ID'] = df['ID_ech'].map(lambda v: v if pd.isnull(v) else 'bh_' + v)

In [None]:
# Split df with data_slicer; the result is indexed by category name
# ('unknown', 'borehole', 'measure', 'analysis', 'lithology', 'equipement').
df_dict = data_slicer(df, cols_dict, crit_dict)

In [None]:
# Pull the 'unknown' and 'borehole' tables out of the sliced data.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Remove from the borehole table every row whose index label also appears in the unknown table.
bh = bh.drop(index=ukw.index)
# Keep one row per ID; the in-place calls on ukw act on the very object stored in df_dict['unknown'].
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# 'ID==ID and X==X' is False wherever the value is NaN, so rows lacking an ID or an X value are discarded.
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

# The remaining categories are taken as returned by data_slicer.
mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# Row count of each table.
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

In [None]:
# Merge this sheet's measure table into the running source_mes (how='outer', keyed on ID_ech and Date_mes);
# the second return value is kept as conflict_df.
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Run data_validation on the merged measure table, passing every conflict row for
# columns 'Fract_2_x', 'Fract_2+_x' and 'MS_x'.
dataset = source_mes
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Fract_2_x':list(conflict_df.index), 'Fract_2+_x':list(conflict_df.index),
                           'MS_x':list(conflict_df.index)})

# When a 'level_0' column is present it replaces any existing 'index' column.
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
# The validated table is the measure table. Binding it to source_an instead would discard the
# analysis rows accumulated from the previous sheet and feed measure rows into the analysis merge.
# NOTE(review): confirm that source_an was not meant to be replaced here.
source_mes = dataset

In [None]:
# Merge this sheet's analysis table into the running source_an (how='outer', keyed on ID_ech);
# the second return value is kept as conflict_df.
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [None]:
# Make sure both tmp_dir and its 'source_merge/' sub-directory exist. exist_ok=True also covers
# the case where tmp_dir is already present but 'source_merge/' is not, which a guard on
# tmp_dir alone would skip and leave the to_csv calls below failing.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Tables extracted from the current sheet.
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Running tables accumulated over the sheets of this source.
#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

# Row count of each running table.
print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')