# DATA ORGANIZATION

In [12]:
from utils.io import update_dict, gen_dated_id, dataframe_viewer, gen_geodf_geom, data_merger, data_validation, \
data_slicer, replicate_values, collect_measure, collect_time_data, gen_id_from_ech, na_col_drop, na_line_drop, col_ren, \
dble_col_drop, dict_viewer

from utils.config import DEFAULT_POL_LEXICON, POL_NAMES_MODEL 
from difflib import get_close_matches

import re, os
import numpy as np
import geopandas as gpd
import pandas as pd
import datetime as dtm
import matplotlib.pyplot as plt
from definitions import ROOT_DIR

In [13]:
def compute_BH_length(df, id_col='ID', length_col_name='Long_for', top_col='Intv_top', base_col='Intv_base', verbose=False):
    
    if length_col_name in df.columns:
        raise(NameError(f'{length_col_name} is already in columns. Give another name'))
    
    for i in df.index:
        try:
            float(df.loc[i, top_col])
        except ValueError:
            df.loc[i, top_col] = np.nan

        try:
            float(df.loc[i, base_col])
        except ValueError:
            df.loc[i, base_col] = np.nan

    df[top_col] = df[top_col].astype('float64')
    df[base_col] = df[base_col].astype('float64')

    # compute length based on litho_top and litho_base
    id_list = []

    for i in df.index:
        id_ = df.loc[i,id_col]
        
        if verbose : print(i, id_, df.loc[i, top_col], df.loc[i, base_col])
        if id_ not in id_list:
            id_list.append(id_)
            if isinstance(id_, str):
                sql_id = f"{id_}"
            elif isinstance(id_, float) or isinstance(id_, int):
                sql_id = id_
                
            tmp = df[df[id_col] == sql_id]
            
            if verbose : print(len(tmp))
            #if len(tmp) > 0:
            df.loc[tmp.index, length_col_name] = float(max(tmp[base_col])) - float(min(tmp[top_col]))
    
    df.drop(index=df.query(f'{base_col}.isnull() and {top_col}.isnull()').index, inplace=True)
    df.insert(df.columns.to_list().index(id_col)+1, length_col_name, df.pop(length_col_name))
    #df.reset_index(drop=True, inplace=True)
    

In [14]:
POL_NAMES_MODEL = {'Arsenic': 'As', 'Cobalt': 'Co', 'Cadmium': 'Cd', 'Chrome': 'Cr', 'Chrome VI': 'Cr_VI', 'Chrome (VI)': 'Cr_VI', 'Chrome_total': 'Cr_tot', 'Cuivre': 'Cu', 'Mercure': 'Hg', 'Plomb': 'Pb', 'Nickel': 'Ni', 'Zinc': 'Zn', 'Cyanure(?:s)? (?libre(?:s)?)?': 'CN_libre', 'Cyanures (totaux)': 'CN_tot', 'CN_totaux': 'CN_tot', 'Cyanures (APE)': 'CN_tot_APE', 'Cyanures totaux APE':'CN_tot_APE', 'cyanure complex': 'CN_cplx', "Cyanures (libres) - NEN-EN-ISO 14403": 'CN_libre', 'Cyanures (libres)': 'CN_libre', 'CN_libres': 'CN_libre', 'thiocyanate': 'ThioCN', 'Benzène': 'Bnz', 'Toluène': 'Toln', 'Éthylbenzène': 'EthBnz', 'Orthoxylène': 'O-Xyl', 'O-xylènes': 'O-Xyl', 'mp-xylènes': 'P-M-Xyl', 'Para- et métaxylène': 'P-M-Xyl', 'Xylènes': 'Xyl', 'Styrène': 'Styr', 'BTEX totaux': 'BTEX_tot', 'Phénol': 'Phenol', 'Indice phénol': 'IPh', 'Naphtalène': 'Naphta', 'Acénaphtylène': 'Acenaphtyl', 'Acénaphtène': 'Acenaphtn', 'Fluorène': 'Flrn', 'Phénanthrène': 'Phenanthr', 'Anthracène': 'Anthrc', 'Fluoranthène': 'Flranth', 'Pyrène': 'Pyr', 'Benzo(a)anthracène': 'Bnz(a)anthrc', 'Chrysène': 'Chrys', 'Benzo(b)fluoranthène': 'Bnz(b)flranth', 'Benzo(k)fluoranthène': 'Bnz(k)flranth', 'Benzo(a)pyrène': 'Bnz(a)pyr', 'Dibenzo(ah)anthracène': 'Dibnz(ah)anthrc', 'Benzo(ghi)pérylène': 'Bnz(ghi)peryl', 'Indéno(1,2,3-cd)pyrène': 'Indeno(1.2.3-cd)pyr', 'HAP Totaux (16) - EPA': 'HAP_tot_EPA', '1,1-Dichloroéthane': '1.1-DCE', '1,2-Dichloroéthane': '1.2-DCE', '1,1-dichloroéthène': '1.1-DCEn', 'Cis-1,2-dichloroéthène': 'Cis-1.2-DCEn', 'Trans 1,2-dichloroéthylène': 'Trans-1.2-DCEyl', 'Dichlorométhane': 'DCM', 'dibromochlorométhane': 'DiBCM', 'bromodichlorométhane': 'BromoDCM', 'Totaux (cis,trans) 1,2-dichloroéthènes': '(cis.trans)-1.2-DCEn_tot', '1,2-dichloropropane': '1.2-DCP', 'Tétrachloroéthylène': 'TetraCEyn', 'Tétrachlorométhane': 'TCM', '1,1,1-Trichloroéthane': '1.1.1-TCE', '1,1,2-Trichloroéthane': '1.1.2-TCE', 'Trichloroéthylène': 'TCEyn', 'Chlorure de vinyle': 'CVinyl', '3-éthylphénol': '3-EthPhn', 'métacrésol': 'M-cresol', 'o-crésol': 'O-cresol', 'p-crésol': 'P-cresol', 'crésols (total)': 'Cresol_tot', '2,4-dimethylphénol': '2.4-DMetPhn', '2,5-dimethylphénol': '2.5-DMetPhn', '3,5+2,3-dimethylphénol+4-ethylphénol': 'DMetPhn_4-EthPhn', '2,6-dimethylphénol': '2.6-DMetPhn', '3,4-dimethylphénol': '3.4-DMetPhn', 'alkylphénols C2 total': 'AlkPhn_C2_tot', '2-éthylphénol': '2-EthPhn', 'para(tert)butylphénol': 'P(T)ButPhn', 'alkylphénols C4 total': 'AlkPhn_C4_tot', '2,3,5-triméthylphénol': '2.3.5-TMPethn', '3,4,5-triméthylphénol': '3.4.5-TMetPhn', '2-isopropylphénol': '2-IsoPropPhn', 'alkylphénols C3 total': 'AlkPhn_C3_tot', 'HAP totaux (10) VROM': 'HAP_tot_vrom', 'monochlorobenzène': 'MonoCBzn', '1,2-dichlorobenzène': '1.2-DCBzn', '1,3-dichlorobenzène': '1.3-DCBzn', '1,4-Dichlorobenzène': '1.4-DCBzn', '1,2,3-trichlorobenzène': '1.2.3-TCBzn', '1,2,4-trichlorobenzène': '1.2.4-TCBzn', '1,3,5-trichlorobenzène': '1.3.5-TCBzn', '1,2,4,5- et 1,2,3,5-tétrachlorobenzènes': '1.2.3.4_5-TCBzn', '1,2,3,4-tétrachlorobenzène': '1.2.3.4-TCBzn', 'hexachlorobenzène': 'HCBzn', '2-chlorophénol': '2-CPhn', '4-chlorophénol': '4-CPhn', '3-chlorophénol': '3-CPhn', 'monochlorophénol total': 'MonoCPhn_tot', '2,3-dichlorophénol': '2.3-DCPhn', '2,4+2,5-dichlorophénol': '2.4_5-DCPhn', '2,6-dichlorophénol': '2.6-DCPhn', '3,4-dichlorophénol': '3.4-DCPhn', '3,5-dichlorophénol': '3.5-DCPhn', 'dichlorophénol total': 'DCPhn_tot', '2,3,4-trichlorophénol': '2.3.4-TCPhn', '2,3,5-trichlorophénol': '2.3.5-TCPhn', '2,3,6-trichlorophénol': '2.3.6-TCPhn', '2,4,5-trichlorophénol': '2.4.5-TCPhn', '2,4,6-trichlorophénol': '2.4.6-TCPhn', '3,4,5-trichlorophénol': '3.4.5-TCPhn', 'trichlorophénol total': 'TriCPhn_tot', '2,3,5,6-tétrachlorophénol': '2.3.5.6-TCPhn', '2,3,4,6- tétrachlorophénol': '2.3.4.6-TCPhn', '2,3,4,5- tétrachlorophénol': '2.3.4.5-TCPhn', 'tétrachlorophénol total': 'TCPhn_tot', 'pentachlorobenzène': 'PCBzn', 'pentachlorophénol': 'PCPhn', 'chlorophénol total': 'CPhn_tot', 'EOX': 'EOX', 'fraction aromat. >C6-C7': 'Ar_C6-C7', 'fraction aromat. >C7-C8': 'Ar_C7-C8', 'fraction aromat. >C8-C10': 'Ar_C8-C10', 'fraction aliphat. C5-C6': 'Alp_C5-C6', 'fraction aliphat. >C6-C8': 'Alp_C6-C8', 'fraction aliphat. >C8-C10': 'Alp_C8-C10', 'Fraction C5-C8': 'C5-C8', 'Fraction C8-C10': 'C8-C10', 'Fraction C10-C12': 'C10-C12', 'Fraction C12-C16': 'C12-C16', 'Fraction C16-C21': 'C16-C21', 'Fraction C21 - C35': 'C21-C35', 'Fraction C35 - C40': 'C35-C40', 'C16 - C21': 'C16-C21', 'C21 - C35': 'C21-C35', 'C30 - C40': 'C30-C40', 'C35 - C40': 'C35-C40', 'aromat.>C6-C7': 'Ar_C6-C7', 'aromat.>C7-C8': 'Ar_C7-C8', 'aromat.>C8-C10': 'Ar_C8-C10', 'aromat.>C10-C12': 'Ar_C10-C12', 'aromat.>C12-C16': 'Ar_C12-C16', 'aromat.>C16-C21': 'Ar_C16-C21', 'aromat.>C21-C35': 'Ar_C21-C35', 'aliphat.>C5-C6': 'Alp_C5-C6', 'aliphat.>C6-C8': 'Alp_C6-C8', 'aliphat.>C8-C10': 'Alp_C8-C10', 'aliphat.>C10-C12': 'Alp_C10-C12', 'aliphat.>C12-C16': 'Alp_C12-C16', 'aliphat.>C16-C35': 'Alp_C16-C35', 'Hydrocarbures totaux C10-C35': 'HC_tot_C10-C35', 'totaux C10-C35': 'HC_tot_C10-C35', 'Totaux C10-C40': 'HC_tot_C10-C40', 'Hydrocarbures totaux C10-C40': 'HC_tot_C10-C40', 'MTBE': 'MTBE', 'PCB 28': 'PCB_28', 'PCB 52': 'PCB_52', 'PCB 101': 'PCB_101', 'PCB 118': 'PCB_118', 'PCB 138': 'PCB_138', 'PCB 153': 'PCB_153', 'PCB 180': 'PCB_180', 'PCB totaux (7)?': 'PCB_tot', 'Chlorure(?:s)?': 'Chlorure', 'Soufre Total': 'S_tot', 'sulfite(?:s)?': 'sulfite', 'sulfate(?:s)?': 'sulfate', 'COT': 'COT', 'DBO (5 jours)': 'DBO_5j', 'DCO': 'DCO', 'Ammonium': 'NH4', 'ammoniaque libre': 'NH3_libre', 'Nitrate': 'HNO3', 'Nitrite': 'HNO2', 'azote Kjeldahl': 'N_Kjdl', 'sulfures totaux': 'Sulfure_tot', 'sulfure(?:s)? (libre(?:s)?)': 'Sulfure_libre', 'calcium': 'Ca', 'potassium': 'K', 'magnésium': 'Mg', 'manganèse': 'Mn', 'sodium': "Na", 'fer': 'Fe', 'phosphore (total)': 'P_tot', 'phosphates (totaux)': 'Phosphate_tot', 'carbonate': 'CaCO3', 'bicarbonate': 'Bicarb', 'Phoshore': 'P', 'fer ((Fe))? total': 'Fe_tot', 'fer (2\+)': 'Fe2', 'fluorure(?:s)?': 'Fluorure', 'chlorures': 'Chlorure', 'chloroformes': 'Chloroforme', 'bromoformes': 'Bromoforme', 'bromure (libre)': 'Br_libre', 'Iph.': 'IPh', 'CN_NCl': 'CN_NCl', '2-naphtol': '2-Naphtol', 'thymol': 'Thymol', 'chloroforme': 'Chloroforme', 'bromoforme': 'Bromoforme', 'C12-C20': 'C12-C20', 'C20-C30': 'C20-C30', 'Non chloro destruct.':'Non_chloro_destr', 'SOM VROM 10':'HAP_tot_vrom','SOM EPA 16':'HAP_tot_EPA', 'SOM_C5_C35':'HC_tot_C15-C35', 'SOM_C10_C40':'HC_tot_C10-C40', 'SOM BTEX':'BTEX_tot','C5_C8':'C5-C8', 'C8_C10':'C8-C10', 'C10_C12':'C10-C12', 'C12_C16':'C12-C16', 'C30_C35':'C30-C35','Fraction   2000 µm':'Fract_2000µ','Fraction   63 µm':'Fract_63µ', 'Fraction   45 µm':'Fract_45µ','Fraction   16 µm':'Fract_16µ','Fraction   2 µm':'Fract_2µ', 'Fraction 2 mm':'Fract_2', 'Fraction +2 mm':'Fract_2+', 'Fract_2':'Fract_2', 'Fract_2+':'Fract_2+', 'Mat. organique':'MO', 'Mat. sèche':'MS', 'Argile':'Argile'}

In [15]:
params_kw = ['O_diss','Niv_eau', 'temp', '^T$', '^CE$', 'pH$', 'ORP']
meas_kw_col = ['O_diss','pH','CE','ORP','Niv_eau_pz','Niv_eau_sol','Temp', 'T']
sufx = ['sup', 'prof', 'inf', '/\dM(\*)?']
prefx = ['eau forage ']
id_reg = '\s*(?P<id>(?:^canne )*\w*\d+\w*)\s*'
pollutants_names = list(set(list(DEFAULT_POL_LEXICON.abbreviations.keys()) + list(POL_NAMES_MODEL.values())))

In [16]:
bh_cols = ['Date_for','ID','X','Y','Z','Zsol','Type','Long_for','Long_pz','Sect_crep','Long_pz_sol','Ht_pz_sol',
           'Diam_for','Diam_int_pz','Diam_ext_pz','Ht_chbre','Refus','Societe','Zone','Sous_zone','Etude','Method','Resp_chantier',
           'ID_date','Rmq']

mes_cols = ['Date_mes','ID','X','Y','Z','Zsol','pH_H2O', 'Temp_pH_H2O', 'Temp_pH_CaCl2','pH_CaCl2','Temp_pH_KCl',
            'pH_KCl', 'Temp_CE', 'Temp_pH','Nappe','Rmq'] + meas_kw_col

eqp_cols = ['Date_for','ID','X','Y','Z','Zsol','Type_equip','Equip_base','Equip_top','Rmq']

litho_cols = ['Date_for','ID','X','Y','Z','Zsol','Long_for','Litho_top','Litho_base','Intv_top','Intv_base',
              'Description','Rmq']

an_cols = ['ID','X','Y','Z','Zsol','Date_ech','ID_ech','Type_ech','Ech_top','Ech_base','Intv_top','Intv_base',
           'Description','Nappe','Organo','Intensite', 'Min_organo', 'Max_organo', 'Polluant',
           'Surnageant','Sousnageant','Caractere','Opacite','Rmq'] + pollutants_names

ukw_cols = ['Date_for','ID','X','Y','Z','Zsol','Type','Long_for','Method','Societe','Rmq']

cols_dict = {'borehole': bh_cols, 'measure': mes_cols, 'lithology': litho_cols, 'analysis': an_cols, 
 'equipement': eqp_cols, 'unknown': ukw_cols}

In [17]:
bh_crit = ['ID','X','Y','Z','Zsol','Type','Long_for','Long_pz','Diam_for','Diam_int_pz','Diam_ext_pz']

mes_crit = ['ID','Date_mes'] + meas_kw_col

eqp_crit = ['Type_equip','Equip_base','Equip_top']

litho_crit = ['Litho_top','Litho_base','Intv_top','Intv_base','Description']

an_crit = ['ID_ech','Type_ech','Organo','Surnageant','Sousnageant'] + list(DEFAULT_POL_LEXICON.abbreviations.keys()) 

ukw_crit = ['ID','X','Y','Z','Zsol','Long_for','Type']

crit_dict = {'borehole': bh_crit, 'measure': mes_crit, 'lithology': litho_crit, 'analysis': an_crit, 
 'equipement': eqp_crit, 'unknown': ukw_crit}

variables utilisées 
==========================
- bh 	: 	forages (simple ou piezo)
- equip	:	equipements d'un forage (outils, méthodes utilisés, ...)
- ukw	:	objets physiques indéterminés
- litho :	descriptions lithologiques
- an 	: 	analyses de contaminants sur des échantillons (sol, eau)
- mes	:	mesures de propriétés sur des échantillons (sol, eau), de paramètres hydrochimiques, ...


# ---------------------------------------------------------

source dataframes initialization

In [7]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 0 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 0


## 1- Profils sols et données forages.xls
* **Sheet : 'Données de forage'**

In [8]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/profils_sols_donnees_forages/'
sheet='donnees_forage'

In [9]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', 
                   sheet_name='Données de forage')#, skiprows=2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

Rows : 25, columns : 15


interactive(children=(IntSlider(value=5, description='rows', max=25, min=5, readout=False), IntSlider(value=12…

In [10]:
df.rename(columns={'Date':'Date_for','Profondeur':'Long_for', 'Méthode':'Method', 
                        'Diamètre forage':'Diam_for','Niv. Eau p/r sol':'Niv_eau_sol',
                        'PZ Prof.':'Long_pz', 'PZ Diamètre':'Diam_pz','PZ L.crépinée':'Sect_crep', 
                        'Société forage':'Societe', 'Resp. chantier':'Resp_chantier'}, inplace=True)

In [11]:
df['Type'] = df['Long_pz'].apply(lambda x: 'Forage' if pd.isnull(x) else 'Piezo')
df['Refus'] = ''

for i in range(len(df['Remarque'])):
    val = str(df.loc[i,'Remarque'])
    if re.search('[Bb]loqué', val) :        
        if re.search('[lL]aitier', val):
            df.loc[i,'Refus'] = 'Laitier'
        elif re.search('[Bb]éton', val):
            df.loc[i,'Refus'] = 'Béton'
        elif re.search('[Mm]atériaux', val):
            df.loc[i,'Refus'] = 'Matériaux indurés' 
    else: 
        df.loc[i,'Refus'] = np.nan

# convert diameter values unit from mm to m
df['Diam_int_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace(' mm','').split('x')[1].strip(' m'))/1000 
                                        if not pd.isnull(x) else x)
df['Diam_ext_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace(' mm','').split('x')[0].strip(' m'))/1000 
                                        if not pd.isnull(x) else x)
df['Diam_for'] = df['Diam_for'].apply(lambda x: pd.to_numeric(x)/1000 if not pd.isnull(x) else x)

df.insert(7, 'Diam_ext_pz', df.pop('Diam_ext_pz')) # move to a specified position
df.insert(8, 'Diam_int_pz', df.pop('Diam_int_pz'))
df.drop(columns=['Remarque', 'Diam_pz'], axis=1, inplace=True)
df.drop(df.query("ID!=ID").index, inplace=True) # delete all ID='NaN' lines
df['Date_mes'] = df['Date_for']

gen_dated_id(df,'ID','Date_for')  

Generation of ID-dated...
Using column ' Date_for ' in the (geo)dataframe !
Process ended, check the (geo)dataframe


In [12]:
if 'Date_for' in df.columns:
    df['Date_for'] = df['Date_for'].astype('datetime64')
if 'Date_mes' in df.columns:
    df['Date_mes'] = df['Date_mes'].astype('datetime64')

In [13]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 25 ; measure: 25 ; lithology: 0 ; analysis: 0 ; equipement: 0 ; unknown: 0 ; 


In [14]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 25 ; measure: 25 ; lithology: 0 ; analysis: 0 ;equipement: 0 ; unknown: 0


In [15]:
source_mes = mes
source_bh = bh

In [16]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 
    
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 25 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 25


* **Sheet : 'Piézométrie'**

In [17]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/profils_sols_donnees_forages/'
sheet='piezometrie'

In [18]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Piézométrie', skiprows=1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

Rows : 37, columns : 21


interactive(children=(IntSlider(value=5, description='rows', max=37, min=5, readout=False), IntSlider(value=12…

In [19]:
sdf = na_col_drop(df[:12], 3)
sdf.rename(columns={'z':'Z'}, inplace=True)

Columns dropped :['Label', 'Commentaires ', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15', 'Unnamed: 16', 'Unnamed: 17', 'Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20']



In [20]:
a=0
for x in df.columns:
    if pd.isnull(df.loc[16,x]):
        df.loc[16,x]='col'+str(a)
    a+=1

In [21]:
if not 'tmp_df' in vars().keys():
    tmp_df = df.copy()
    
df = tmp_df.copy()
df.loc[16]=df.loc[16].apply(lambda x : x if not pd.isnull(x) else '')
df.columns = df.loc[16]

In [22]:
df=df[17:]
df.reset_index(inplace=True, drop=True)

#df.drop(columns=[df.columns.to_list()[x] for x in range(0,8)
#                      if re.compile(r"col|unnamed").match(df.columns.to_list()[x])], axis=1, inplace=True) 

In [23]:
df.rename(columns={'col8':'Date_mes', 'col9':'Nappe', 'col10':'ID', 'NP/piézo [m]':'Niv_eau_pz', 
                        'dim. piezo hors sol [m]':'Ht_pz_sol', 'NP/sol [m]':'Niv_eau_sol', 
                        'Prof. piézo/piézo [m]':'Long_pz', 'Prof. piézo/sol [m]':'Long_pz_sol', 
                        't° [°C]':'Temp', 'Observations':'Rmq'}, inplace=True)

In [24]:
df = na_col_drop(df, 3)
df.reset_index(drop=True, inplace=True)

Columns dropped :['col0', 'col1', 'col2', 'col3', 'col4', 'col5', 'col6', 'col7']



In [25]:
df['CE'] = df[['CE [µS/cm]', 'CE [mS/cm]']].apply(lambda x: x[0]/1000 if pd.isnull(x[1]) else x[1], axis=1) # mS/cm
df.drop(columns=['CE [µS/cm]', 'CE [mS/cm]'], inplace=True)
df['ID'] = df['ID'].apply(lambda x: re.sub('P','F',x) if not pd.isnull(x) else x)
df.insert(0, 'ID', df.pop('ID')) # move to first column
df['Type'] = 'Piezo'

In [26]:
df.rename_axis(None, inplace=True, axis=1)
df.drop(df.query("ID!=ID").index, inplace=True) # supprimer les lignes avec ID='NaN'
df.reset_index(inplace=True, drop=True)

In [27]:
if 'Date_for' in df.columns:
    df['Date_for'] = df['Date_for'].astype('datetime64')
if 'Date_mes' in df.columns:
    df['Date_mes'] = df['Date_mes'].astype('datetime64')

In [28]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 17 ; measure: 17 ; lithology: 0 ; analysis: 0 ; equipement: 0 ; unknown: 0 ; 


In [29]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 17 ; measure: 17 ; lithology: 0 ; analysis: 0 ;equipement: 0 ; unknown: 0


##### Data merging

In [30]:
bh, conflict_df = data_merger(bh, sdf[['ID', 'Z']], how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [31]:
mdf, conflict_df = data_merger(source_bh, bh, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

Conflict values present. Please resolve this manually !


In [32]:
dataset = mdf
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Long_pz_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

all conflicts have been fixed!


In [33]:
source_bh = dataset.copy()

In [34]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

In [35]:
cols_rep = ['X', 'Y', 'Z']
source_bh = replicate_values(source_bh, id_col='ID', cols_to_replicate=cols_rep, suffix=['sup', 'inf'], replace_id=True)
source_mes = replicate_values(source_mes, id_col='ID', cols_to_replicate=cols_rep, suffix=['sup', 'inf'], replace_id=False)

In [36]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 
    
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 30 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 42


* **Sheet : 'Equipement'**

In [37]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/profils_sols_donnees_forages/'
sheet='Equipement'

In [38]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', 
                   sheet_name='Equipement')#, skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

Rows : 36, columns : 7


interactive(children=(IntSlider(value=5, description='rows', max=36, min=5, readout=False), IntSlider(value=7,…

In [39]:
df.drop(columns=['Déplacement'], inplace=True)
name=['ID', 'Equip_top', 'Equip_base', 'Diam_for','Diam_int_pz', 'Type_equip']
df=col_ren(df, mode=1, name=name)

In [40]:
compute_BH_length(df, id_col='ID', length_col_name='Long_pz', top_col='Equip_top', base_col='Equip_base')

In [41]:
df['Diam_for'] = df['Diam_for'].apply(lambda x: pd.to_numeric(x)/1000 if not pd.isnull(x) else x)
df['Diam_int_pz'] = df['Diam_int_pz'].apply(lambda x: pd.to_numeric(x)/1000 if not pd.isnull(x) else x)

In [42]:
bh_ = source_bh[['ID', 'X', 'Y', 'Z']]
df, conflict_df = data_merger(bh_, df, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [43]:
df = na_line_drop(df, 3, 2)

15 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [44]:
if 'Date_for' in df.columns:
    df['Date_for'] = df['Date_for'].astype('datetime64')

In [45]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 36 ; measure: 0 ; lithology: 0 ; analysis: 0 ; equipement: 36 ; unknown: 0 ; 


In [46]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 12 ; measure: 0 ; lithology: 0 ; analysis: 0 ;equipement: 36 ; unknown: 0


##### Data merging

In [47]:
mdf, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

Conflict values present. Please resolve this manually !


In [48]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Long_pz_x':list(conflict_df.index), 'Diam_for_y':list(conflict_df.index), 
                           'Diam_int_pz_y':list(conflict_df.index)})

all conflicts have been fixed!


In [49]:
dataset = mdf
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [50]:
source_bh = mdf.copy()

In [51]:
data = source_bh
source_bh = replicate_values(data, 'ID', list(data.columns)).drop_duplicates(list(data.columns))
source_bh.reset_index(drop=True, inplace=True)

In [52]:
source_eqp = eqp

In [53]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 
    
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 27 ; source_eqp: 36 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 42


* **Sheets: 'Echantillon' + 'Organoleptique'**

In [54]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/profils_sols_donnees_forages/'
sheet='Echant-organo'

In [55]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Echantillon')#, skiprows=1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

Rows : 29, columns : 4


interactive(children=(IntSlider(value=5, description='rows', max=29, min=5, readout=False), IntSlider(value=4,…

In [56]:
df.rename(columns={'De':'Ech_top', 'A':'Ech_base', 'Numéro':'ID_ech'}, inplace=True)

In [57]:
# df, conflict_df = data_merger(df, sdf, 'outer', ['ID', 'Ech_top', 'Ech_base'])
df['Type_ech']='Sol'

In [58]:
if 'Date_for' in df.columns:
    df['Date_for'] = df['Date_for'].astype('datetime64')
if 'Date_mes' in df.columns:
    df['Date_mes'] = df['Date_mes'].astype('datetime64')

##### Data merging

In [59]:
bh_ = source_bh[['ID', 'X', 'Y', 'Z']]
df, conflict_df = data_merger(bh_, df, how='inner', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [60]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 29 ; measure: 0 ; lithology: 0 ; analysis: 29 ; equipement: 0 ; unknown: 0 ; 


In [61]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 15 ; measure: 0 ; lithology: 0 ; analysis: 29 ;equipement: 0 ; unknown: 0


In [62]:
source_an = an

In [63]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 27 ; source_eqp: 36 ; source_uknw: 0 ; source_litho: 0 ; source_an: 29 ; source_mes: 42


* **Sheet : 'Log'**

In [64]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/profils_sols_donnees_forages/'
sheet='Log'

In [65]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Log')#, skiprows=1)
dataframe_viewer(df, rows=5)

Rows : 55, columns : 5


interactive(children=(IntSlider(value=5, description='rows', max=55, min=5, readout=False), IntSlider(value=5,…

In [66]:
df.rename(columns={'De':'Litho_top', 'A':'Litho_base'}, inplace=True)

In [67]:
compute_BH_length(df, id_col='ID', length_col_name='Long_for', top_col='Litho_top', base_col='Litho_base')

In [68]:
bh_ = source_bh[['ID', 'X', 'Y', 'Z','Long_for']]
df, conflict_df = data_merger(bh_, df, how='inner', on=['ID'], dist_max=1., drop_skip_col=['index'])

Conflict values present. Please resolve this manually !


In [69]:
data_validation(overall_data=df, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Long_for_x':list(conflict_df.index)})

all conflicts have been fixed!


In [70]:
dataset = df
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [71]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 55 ; measure: 0 ; lithology: 55 ; analysis: 0 ; equipement: 0 ; unknown: 0 ; 

[1;32mNot used columns:[0;0m
 ['Keyword']


In [72]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 25 ; measure: 0 ; lithology: 55 ; analysis: 0 ;equipement: 0 ; unknown: 0


In [73]:
source_litho=litho

In [74]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/')
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 27 ; source_eqp: 36 ; source_uknw: 0 ; source_litho: 55 ; source_an: 29 ; source_mes: 42


### $\color{red}{\textbf{Excel data final merge}}$

In [75]:
bh_coords = source_bh[['ID', 'X', 'Y', 'Z','Date_for']].copy()

In [76]:
source_eqp, conflict_df = data_merger(bh_coords, source_eqp, how='inner', on='ID', dist_max=1., drop_skip_col=['index'])

In [77]:
source_litho, conflict_df = data_merger(bh_coords, source_litho, how='inner', on='ID', dist_max=1., drop_skip_col=['index'])

In [78]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/')

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 27 ; source_eqp: 36 ; source_uknw: 0 ; source_litho: 55 ; source_an: 29 ; source_mes: 42


#### ======================================================================================

In [79]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 0 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 0


## 2-Database MEMORIS3.xlsx
* **Sheet : 'PROFILS_SOL'**

In [80]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/database_Memoris3/'
sheet='Profils_sol'

In [81]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. '+
                        'Siterem - 2017/Database MEMORIS3.xlsx', sheet_name='PROFILS_SOL')#, skiprows=2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=3)

Rows : 2041, columns : 16


interactive(children=(IntSlider(value=3, description='rows', max=2041, min=3, readout=False), IntSlider(value=…

In [82]:
df = na_col_drop(df, 3)

Columns dropped :['Unnamed: 12', 1927, 'Unnamed: 14', 'Unnamed: 15']



In [83]:
df.rename({'Date':'Date_for', 'N°':'Ref', 'Id':'idx', 'Piézo':'Type', 'Unnamed: 6':'Societe',
                'MFT Ø145':'MFT_145', 'Gouge Ø75':'Gouge_75', 'Liner Ø60': 'Liner_60'}, axis=1, inplace=True)

In [84]:
print(list(set(df['Date_for'].apply(lambda x: x.year if not pd.isnull(x) else x))))

[NaT, 2009, 2010, 2015]


In [85]:
df.loc[df.fillna('').query("Societe.str.contains('x|X')").index, 'Type']='X'

In [86]:
df.loc[df.fillna('').query("Gouge_75.str.contains('SBS|SITER')").index, 'Societe']='SBS Environnement'
df.loc[df.fillna('').query("Gouge_75.str.contains('SBS|SITER')").index, 'Gouge_75']=''

In [87]:
for i in range(len(df['Date_for'])-1):
    if not pd.isnull(df.loc[i, 'Date_for']) and pd.isnull(df.loc[i+1, 'Date_for']):
        df.loc[i+1, 'Date_for']=df.loc[i, 'Date_for']
        
    if not pd.isnull(df.loc[i, 'Societe']) and pd.isnull(df.loc[i+1, 'Societe']):
        df.loc[i+1, 'Societe']=df.loc[i, 'Societe']
        
    if not pd.isnull(df.loc[i, 'Type']) and pd.isnull(df.loc[i+1, 'Type']) and \
       df.loc[i, 'Ref']==df.loc[i+1, 'Ref']:
        df.loc[i+1, 'Type']=df.loc[i, 'Type']

In [88]:
for i in range(len(df['idx'])-1):    
    if df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur'])\
    and re.findall('Forage',df.loc[i, 'Profondeur']):
        df.loc[i+1,'idx']=df.loc[i, 'Profondeur'][0]+str(df.loc[i, 'Ref'])
        w=df.loc[i, 'Profondeur'][0]
    elif df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur']):
        df.loc[i+1,'idx']=w+str(df.loc[i, 'Ref'])
    
    if df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur'])\
    and re.findall('Tranch',df.loc[i, 'Profondeur']):
        df.loc[i+1,'idx']=df.loc[i, 'Profondeur'][0]+str(df.loc[i, 'Ref'])
        w=df.loc[i, 'Profondeur'][0]
    elif df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur']):
        df.loc[i+1,'idx']=w+str(df.loc[i, 'Ref'])
     
   # if df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur'])\
   # and re.findall('Moni',df.loc[i, 'Profondeur']):
   #     df.loc[i+1,'idx']=df.loc[i, 'Profondeur'][0]+str(df.loc[i, 'Ref'])
   #     w=df.loc[i, 'Profondeur'][0]
   # elif df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur']):
   #     df.loc[i+1,'idx']=w+str(df.loc[i, 'Ref'])

In [89]:
df['Ref']=df['idx'].apply(lambda x : x if re.findall('F|T', str(x)) else '')
df['Ref']=df['idx'].apply(lambda x : x.replace('Monito ', 'Mon') if re.findall('Monit', str(x)) else x)

In [90]:
df['Type']=df['Type'].apply(lambda x: 'Piezo' if not pd.isnull(x) else '')

In [91]:
df['Ref']=df['Ref'].apply(lambda x: re.sub("\.1","a",str(x)) if re.search(r"\.1", str(x)) else x)
df['Ref']=df['Ref'].apply(lambda x: re.sub("\.2","b",str(x)) if re.search(r"\.2", str(x)) else x)
df['Ref']=df['Ref'].apply(lambda x: re.sub("\.3","c",str(x)) if re.search(r"\.3", str(x)) else x)
df['Ref']=df['Ref'].apply(lambda x: re.sub("\.4","d",str(x)) if re.search(r"\.4", str(x)) else x)

In [92]:
gen_dated_id(df, ref_col='Ref', date_col='Date_for')

Generation of ID-dated...
Using column ' Date_for ' in the (geo)dataframe !
Process ended, check the (geo)dataframe


In [93]:
df.loc[df.query('Profondeur!=Profondeur' ).index,'Profondeur']=''

In [94]:
df['Method']=''
            
for i in range(len(df['Method'])):
    if not pd.isnull(df.loc[i, 'Gouge_75']) : df.loc[i, 'Method']='Gouge_75'
    if not pd.isnull(df.loc[i, 'MFT_145']) : df.loc[i, 'Method']='MFT_145'
    if not pd.isnull(df.loc[i, 'Liner_60']) : df.loc[i, 'Method']='Liner_60'
    if not pd.isnull(df.loc[i, 'carottier']) : df.loc[i, 'Method']='carrotier'
    if not pd.isnull(df.loc[i, 'tarrière']) : df.loc[i, 'Method']='tarrière'

In [95]:
df.drop(df.query('Profondeur.str.contains("Forage") and Profondeur!="Forage bloqué"', engine='python').index, inplace=True)
df.drop(df.query('Profondeur.str.contains("Tranc") and Profondeur!="Tranchée bloqué"', engine='python').index, inplace=True)
df.drop(df.query('Profondeur.str.contains(".orage|..ranch", regex=True)', engine='python').index, inplace=True)
df.drop(df.fillna('').query('Description.str.contains("^.orage bloq|^.ranc.* bloq|^.*efus", regex=True)', engine='python').index, inplace=True)
df.drop(df.query('Ref!=Ref').index, inplace=True)
df.drop(columns=['MFT_145','Gouge_75','Liner_60', 'carottier', 'tarrière', 'idx'], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

In [96]:
df['Litho_top'] = df['Profondeur'].apply(lambda x: x.replace(',','.').split('-')[0].strip(' m'))
df['Litho_base'] = df['Profondeur'].apply(lambda x: x.replace(',','.').split('-')[-1].strip(' m'))

In [97]:
df.rename({'Ref':'ID'}, axis=1, inplace=True)
if 'Profondeur' in df.columns: df.drop(columns=['Profondeur'], axis=1, inplace=True)

In [98]:
set([x[0] for x in list(set(df.ID)) if isinstance(x,str)])

{'F', 'T'}

In [99]:
df.loc[df.query('ID_date.str.contains("T")', engine='python').index, 'Type'] = 'Tranchee'
df.loc[df.query('Type==""', engine='python').index, 'Type'] = 'Forage'

In [100]:
df.loc[1268, ['ID_date','ID']] = df.loc[1267, ['ID_date','ID']]
df.loc[df.query('Description.isnull() or Description.str.len()<1').index, 'Description'] = ''

In [101]:
df.drop(index=df.query('Litho_base.isnull() or Litho_base.str.len()<1').index, inplace=True)
df.reset_index(drop=True, inplace=True)

In [102]:
compute_BH_length(df, id_col='ID', length_col_name='Long_for', top_col='Litho_top', base_col='Litho_base')

In [103]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 1628 ; measure: 0 ; lithology: 1628 ; analysis: 0 ; equipement: 0 ; unknown: 0 ; 


In [104]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 298 ; measure: 0 ; lithology: 1628 ; analysis: 0 ;equipement: 0 ; unknown: 0


In [105]:
ukw = bh.loc[bh.query('Type=="Tranchee"', engine='python')[list(ukw.columns)].index] # trenches
ukw['Type'] = 'Inconnu'
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

bh = bh.drop(index=ukw.index).reset_index(drop=True)

In [106]:
source_litho = litho
source_bh = bh
source_ukw = ukw

In [107]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)
ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 281 ; source_eqp: 0 ; source_uknw: 17 ; source_litho: 1628 ; source_an: 0 ; source_mes: 0


* **Sheet : 'DONNEES PIEZOS'**

In [108]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/database_Memoris3/'
sheet='Donnees_piezos'

In [109]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. '+
                        'Siterem - 2017/Database MEMORIS3.xlsx', sheet_name='DONNEES PIEZOS', skiprows=2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=3)

Rows : 130, columns : 22


interactive(children=(IntSlider(value=3, description='rows', max=130, min=3, readout=False), IntSlider(value=1…

In [110]:
names = ['Ref_id','ID','Societe','Zone','Sous_zone','X','Y','Zsol','Z','Nappe','Long_pz','Sect_crep',
         'Diam_int_pz','Niv_eau_pz_27/04/2010','Niv_eau_pz_08/09/2010','Niv_eau_sol_27/04/2010',
         'Niv_eau_sol_08/09/2010','Surnageant','Sousnageant','Caractere','Opacite','Rmq']
df = col_ren(df, mode=1, name=names)
df = na_col_drop(df, 3)

In [111]:
df=df.query("ID==ID")
df.replace('-',np.nan, inplace=True)

In [112]:
df['Sousnageant']=df['Sousnageant'].apply(lambda x: x/100 if not pd.isnull(x) else x) #convert unit in [m]
df['Surnageant']=df['Surnageant'].apply(lambda x: x/100 if not pd.isnull(x) else x)
df['Type']=df['Sect_crep'].apply(lambda x: 'Piezo' if not pd.isnull(x) else 'Inconnu')

In [113]:
df = df[['ID','X','Y','Z','Zsol','Type','Long_pz','Diam_int_pz','Sect_crep','Nappe','Societe','Zone','Sous_zone',
         'Niv_eau_pz_27/04/2010','Niv_eau_pz_08/09/2010','Niv_eau_sol_27/04/2010','Niv_eau_sol_08/09/2010',
         'Surnageant','Sousnageant','Caractere',
      'Opacite','Rmq']]

In [114]:
df = collect_time_data(df)

dates found: ['27/04/2010', '08/09/2010']


In [115]:
df = gen_id_from_ech(df, id_ech_col='ID', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [116]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 260 ; measure: 260 ; lithology: 0 ; analysis: 260 ; equipement: 0 ; unknown: 26 ; 


In [117]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 111 ; measure: 260 ; lithology: 0 ; analysis: 260 ;equipement: 0 ; unknown: 13


In [118]:
an['Type_ech'] = 'Eau'
an = an.drop_duplicates('ID_ech').reset_index(drop=True)

##### Data merging

In [119]:
source_an = an

In [120]:
source_mes = mes

In [121]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [122]:
source_ukw, conflict_df = data_merger(source_ukw, ukw, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [123]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)
ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 392 ; source_eqp: 0 ; source_uknw: 30 ; source_litho: 1628 ; source_an: 130 ; source_mes: 260


* **Sheet : 'DRAINS ET PIEZOS ENEL'**

In [124]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/database_Memoris3/'
sheet='Drains_Pz_ENEL'

In [125]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. Siterem - 2017/Database MEMORIS3.xlsx', 
                        sheet_name='DRAINS ET PIEZOS ENEL', skiprows=1)

df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=3)

Rows : 21, columns : 65


interactive(children=(IntSlider(value=3, description='rows', max=21, min=3, readout=False), IntSlider(value=12…

In [126]:
df.insert(5, 'Z', df.pop('PZ absolue (m)'))
df.rename(columns={'N°':'ID', 'Date ':'Date_ech','Hauteur de la chambre ':'Ht_chbre','T':'Temp', 'ETUDE':'Etude',
                   'Niv_EAU_SOL (m)': 'Niv_eau_sol_01/10/2013', 'Niv_EAU_SOL (m).1':'Niv_eau_sol_14/12/2016', 
                   'Prof_PZ':'Long_pz','Section_crépinée':'Sect_crep', 'Diamètre_int':'Diam_int_pz', 'Odiss':'O_diss',
                   '\nC5-C8':'C5-C8'}, inplace=True)
df = df.query('ID==ID')

In [127]:
df = collect_time_data(df)

dates found: ['01/10/2013', '14/12/2016']


In [128]:
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan) # -> in mS/cm

In [129]:
df.drop(index=df.query('ID.str.contains("nan", regex=True)', engine='python').index, inplace=True)

In [130]:
df = gen_id_from_ech(df, id_ech_col='ID', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [131]:
df = col_ren(df, name=POL_NAMES_MODEL, mode=1)


[1;34mPossible new pollutants names:[0;0m
['ID', 'ID_ech', 'Date_ech', 'Etude', 'X', 'Y', 'Z', 'Zsol', 'Ht_chbre', 'Long_pz', 'Sect_crep', 'Diam_int_pz', 'PZ relative (m)', 'pH', 'CE', 'Temp', 'ORP', 'O_diss', 'CN_libre', 'CN_totaux.1', 'CN_totaux.2', 'BTEX total', 'C5-C8', 'C8-C10', 'C10-C12', 'C12-C16', ' C16 - C21', 'C10-C12.1', 'C12-C22', 'C22-C30', 'C30-C40', 'Date_mes', 'Niv_eau_sol']


In [132]:
data = df.copy()
drop = []
for c in data.columns:
    c = re.sub('\s+$|\\n','', c)
    if re.match('\s*\w+\s*-\s*\w+\s*', c):
        c_mod = c.replace(' ','')
        data.rename(columns={c:c_mod}, inplace=True)
        c = c_mod
    if re.search('\w+_<\d*>', c):
        drop.append(c)
data.drop(columns=drop, inplace=True)
df = data.copy()

In [133]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 38 ; measure: 38 ; lithology: 0 ; analysis: 38 ; equipement: 0 ; unknown: 0 ; 

[1;32mNot used columns:[0;0m
 ['PZ relative (m)']


In [134]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 15 ; measure: 38 ; lithology: 0 ; analysis: 38 ;equipement: 0 ; unknown: 0


In [135]:
bh.insert(0, 'Type', 'Piezo')
an.insert(0, 'Type_ech', 'Eau')

In [136]:
source_an

Unnamed: 0,ID,X,Y,Z,Zsol,ID_ech,Nappe,Surnageant,Sousnageant,Caractere,Opacite,Rmq,Type_ech
0,1,153124.0710,122653.3820,102.622,101.983,1,Remblai_All,0.0,0.0,coupant,Translucide,très légère irisation et odeur de mazout et HAP,Eau
1,523,152701.3000,122778.5100,104.69,104.24,523,Remblais,0.0,0.0,coupant,Translucide,,Eau
2,522,152650.1700,122765.7700,103.59,103.07,522,Remblai_All,0.0,0.0,coupant,Chargé,,Eau
3,517,152576.5800,122884.3400,106,105.42,517,Remblais,0.0,0.0,coupant,Translucide,,Eau
4,528,152555.3200,122706.5300,103.83,103.5,528,Remblai_All,0.0,0.0,coupant,Translucide,,Eau
...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,405,152553.0230,122875.1550,106.253,105.53,405,Remblais,0.0,0.0,coupant,Translucide,,Eau
126,406,152671.2450,122785.6580,103.858,103.207,406,Remblais,0.0,0.0,coupant,Translucide,forte odeur H2S,Eau
127,410,153094.7422,122552.6908,102.32,102.35,410,Remblais,0.0,0.0,coupant,Translucide,,Eau
128,411,153066.7005,122556.2908,102.1762,102.24,411,All_limoneuse,0.0,0.0,non coupant,Translucide,,Eau


##### Data merging

In [137]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [138]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [139]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID','Date_mes'], dist_max=1., drop_skip_col=['index'])

In [140]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)
ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 407 ; source_eqp: 0 ; source_uknw: 30 ; source_litho: 1628 ; source_an: 149 ; source_mes: 298


* **Sheet : 'RESULTS_EAU' (F)**

In [141]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/database_Memoris3/'
sheet='Result_eau'

In [142]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. Siterem - 2017/Database MEMORIS3.xlsx', 
                        sheet_name='RESULTS_EAU', skiprows=1)

df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

Rows : 131, columns : 185


interactive(children=(IntSlider(value=5, description='rows', max=131, min=5, readout=False), IntSlider(value=1…

In [143]:
df.rename(columns={'Campagne':'Societe','N_piezo.':'ID','Z tête PZ':'Z', 'Prof_PZ':'Long_pz',
                   'Niv_EAU_TETE (m)':'Niv_eau_pz_27/04/2010','Niv_EAU_SOL (m)':'Niv_eau_sol_27/04/2010',
                   'Unnamed: 13':'Niv_eau_pz_08/09/2010','Unnamed: 15':'Niv_eau_sol_08/09/2010','T':'Temp',
                   'Section_crépinée':'Sect_crep','Diamètre_int':'Diam_int_pz','Description éch.':'Opacite',
                   'O_diss':'O_diss','Remarques':'Rmq','Aquifère_échantillonné':'Nappe', 
                   'Caractéristique':'Caractere'}, inplace=True)

df=df.query("ID ==ID")
df.replace('-',np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [144]:
df['Type']=df['Sect_crep'].apply(lambda x: 'Piezo' if not pd.isnull(x) else 'Inconnu')
df.insert(8, 'Type', df.pop('Type'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [145]:
# to express value in [m]
df['Surnageant']=df['Surnageant'].apply(lambda x: x/100)
df['Sousnageant']=df['Sousnageant'].apply(lambda x: x/100)
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                        if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [146]:
df = collect_time_data(df)

dates found: ['27/04/2010', '08/09/2010']


In [147]:
data = df.copy()
drop = []
for c in data.columns:
    c_mod = re.sub('\s+$|\n','', c)
    if re.match('\s*\w+\s*-\s*\w+\s*', c_mod):
        c_mod = c_mod.replace(' ','')
    if re.search('\w+_<\d*>', c_mod):
        drop.append(c)
    data.rename(columns={c:c_mod}, inplace=True)
data.drop(columns=drop, inplace=True)

In [148]:
df = data.copy()

In [149]:
df = col_ren(df, name=POL_NAMES_MODEL, mode=1, cutoff=0.7)#, verbose=True)


[1;34mPossible new pollutants names:[0;0m
['ID', 'Societe', 'Zone', 'Sous_zone', 'X', 'Y', 'Zsol', 'Z', 'Type', 'Long_pz', 'Sect_crep', 'Diam_int_pz', 'Nappe', 'Surnageant', 'Sousnageant', 'Caractere', 'Opacite', 'Rmq', 'pH', 'CE', 'Temp', 'ORP', 'Odiss', 'CN_libre', 'para-etmétaxylène', 'BTEX total', 'PCB totaux (7)', 'C5-C8', 'C8-C10', 'C10-C12', 'C12-C16', 'C16-C21', 'C21-C35', 'C35-C40', 'C30-C40', 'sulfites', 'sulfate', 'Date_mes', 'Niv_eau_pz', 'Niv_eau_sol']


In [150]:
df.rename(columns={'3,5+2,3-dimethylphénol+4-ethylphénol' : 'DMetPhn_4-EthPhn', 'chrome (VI)': 'Cr_VI',
                   '2,4+2,5-dichlorophénol' : '2.4_5-DCPhn', 'sulfites':'Sulfite'}, inplace=True)

In [151]:
df = gen_id_from_ech(df, id_ech_col='ID', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [152]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 260 ; measure: 260 ; lithology: 0 ; analysis: 260 ; equipement: 0 ; unknown: 26 ; 

[1;32mNot used columns:[0;0m
 ['Odiss']


In [153]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 111 ; measure: 260 ; lithology: 0 ; analysis: 260 ;equipement: 0 ; unknown: 13


In [154]:
data = an
data.drop_duplicates(list(data.columns), inplace=True)
data.reset_index(drop=True, inplace=True)
data['Type_ech'] = 'eau'

In [155]:
an = data.copy()

##### data merging

In [156]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [157]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

  exec(code_obj, self.user_global_ns, self.user_ns)


In [158]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID', 'Z', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

Conflict values present. Please resolve this manually !


In [159]:
dataframe_viewer(source_mes.query('ID==522'), rows=10, un_val=['ID','ID_ech', 'Date_mes'])

Rows : 0, columns : 16, Unique values on cols: {'ID': 0, 'ID_ech': 'NA', 'Date_mes': 0}


interactive(children=(IntSlider(value=0, description='rows', max=0, readout=False), IntSlider(value=12, descri…

In [160]:
source_ukw, conflict_df = data_merger(source_ukw, ukw, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [161]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)
ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 409 ; source_eqp: 0 ; source_uknw: 30 ; source_litho: 1628 ; source_an: 149 ; source_mes: 300


* **Sheet : 'RESULTS_SOL'**

In [162]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/database_Memoris3/'
sheet='Result_sol'

In [163]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. Siterem - 2017/'
                   'Database MEMORIS3.xlsx', sheet_name='RESULTS_SOL', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df)

  warn(msg)


Rows : 1423, columns : 94


interactive(children=(IntSlider(value=10, description='rows', max=1423, min=10, readout=False), IntSlider(valu…

In [164]:
df.rename(columns={'Unnamed: 92':'EOX', 'Unnamed: 93':'Idc_phenol','Campagne':'Societe','N_forage':'ID','refus':'Refus',
                   'Prof.\nforage':'Long_for', 'N_ech':'ID_ech', 'Min_Ech':'Ech_top','Max_Ech':'Ech_base',
                   'Terrain':'Nappe','Epaisseur remblais':'Ep_remb', 'Epaisseur alluvions':'Ep_alluv',
                   'pH H2O':'pH_H2O','T° pH H2O':'Temp_pH_H2O','T° pH CaCl2':'Temp_pH_CaCl2','pH CaCl2':'pH_CaCl2', 
                   'T° pH KCl':'Temp_pH_KCl', 'pH KCl':'pH_KCl', 'T° CE':'Temp_CE', 'Argile ':'Argile', 
                   'Résidus chauffage':'Residu_chauf','Nature':'Polluant', 'Intensité':'Intensite',
                   'Libres':'CN_libre','Fraction   2000 µm':'Fract_2000µ','Fraction   63 µm':'Fract_63µ', 
                   'Fraction   45 µm':'Fract_45µ','Fraction   16 µm':'Fract_16µ','Fraction   2 µm':'Fract_2µ',
                   'Totaux':'CN_tot'
                  }, inplace=True)

In [165]:
df.drop(columns=[df.columns.to_list()[x] for x in range(len(df.columns))
                      if re.search(r"Unnamed",df.columns.to_list()[x])], axis=1, inplace=True) 
df.replace(r'<|>','', inplace=True, regex=True)
df=df.query('ID==ID')
df['ID']=df['ID'].apply(lambda x : x.replace('Monito ', 'Mon') if re.findall('Monit', str(x)) else x)
df['ID_ech']=df['ID_ech'].apply(lambda x : x.replace('Monito ', 'Mon') if re.findall('Monit', str(x)) else x)
df.replace('-',np.nan, inplace=True)
df.insert(5, 'Type', 'Piezo')
df.insert(6, 'Type_ech', 'Sol')

In [166]:
for i in df.index:
    #r=re.search('(\w+)/.+',str(df.loc[i, 'ID_ech']))
    #if r : df.loc[i, 'ID']=r.group(1)
    r=re.search('^\d+',str(df.loc[i, 'ID']))
    if r : df.loc[i, 'ID']='F'+str(df.loc[i, 'ID'])

In [167]:
df['Refus']=df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
df.replace('#',np.nan, inplace=True)

In [168]:
for i in df.index:
    x=df.loc[i,'Nappe']
    if not re.search('^F|^Mo', str(df.loc[i,'ID'])) : df.loc[i,'Type']='Inconnu'
        
    if re.search('[R|r]em', str(x)) : df.loc[i,'Nappe']='Remblais'
    elif re.search('[A|a]ll', str(x)) : df.loc[i,'Nappe']='Alluvions'
    elif re.search('[S|s]oc', str(x)) : df.loc[i,'Nappe']='Socle'
    elif re.search('[A|a]rg', str(x)) : df.loc[i,'Nappe']='Argile'
    else : df.loc[i,'Nappe']=''

In [169]:
df['Date_mes'] = '2050-01-01'
#df['Date_mes'] = df['Date_mes'].astype('datetime64')

In [170]:
POL_NAMES_MODEL = {'Arsenic': 'As', 'Cobalt': 'Co', 'Cadmium': 'Cd', 'Chrome': 'Cr', 'Chrome VI': 'Cr_VI', 'Chrome (VI)': 'Cr_VI', 'Chrome_total': 'Cr_tot', 'Cuivre': 'Cu', 'Mercure': 'Hg', 'Plomb': 'Pb', 'Nickel': 'Ni', 'Zinc': 'Zn', 'Cyanure(?:s)? (?libre(?:s)?)?': 'CN_libre', 'Cyanures (totaux)': 'CN_tot', 'CN_totaux': 'CN_tot', 'Cyanures (APE)': 'CN_tot_APE', 'Cyanures totaux APE':'CN_tot_APE', 'cyanure complex': 'CN_cplx', "Cyanures (libres) - NEN-EN-ISO 14403": 'CN_libre', 'Cyanures (libres)': 'CN_libre', 'CN_libres': 'CN_libre', 'thiocyanate': 'ThioCN', 'Benzène': 'Bnz', 'Toluène': 'Toln', 'Éthylbenzène': 'EthBnz', 'Orthoxylène': 'O-Xyl', 'O-xylènes': 'O-Xyl', 'mp-xylènes': 'P-M-Xyl', 'Para- et métaxylène': 'P-M-Xyl', 'Xylènes': 'Xyl', 'Styrène': 'Styr', 'BTEX totaux': 'BTEX_tot', 'Phénol': 'Phenol', 'Indice phénol': 'IPh', 'Naphtalène': 'Naphta', 'Acénaphtylène': 'Acenaphtyl', 'Acénaphtène': 'Acenaphtn', 'Fluorène': 'Flrn', 'Phénanthrène': 'Phenanthr', 'Anthracène': 'Anthrc', 'Fluoranthène': 'Flranth', 'Pyrène': 'Pyr', 'Benzo(a)anthracène': 'Bnz(a)anthrc', 'Chrysène': 'Chrys', 'Benzo(b)fluoranthène': 'Bnz(b)flranth', 'Benzo(k)fluoranthène': 'Bnz(k)flranth', 'Benzo(a)pyrène': 'Bnz(a)pyr', 'Dibenzo(ah)anthracène': 'Dibnz(ah)anthrc', 'Benzo(ghi)pérylène': 'Bnz(ghi)peryl', 'Indéno(1,2,3-cd)pyrène': 'Indeno(1.2.3-cd)pyr', 'HAP Totaux (16) - EPA': 'HAP_tot_EPA', '1,1-Dichloroéthane': '1.1-DCE', '1,2-Dichloroéthane': '1.2-DCE', '1,1-dichloroéthène': '1.1-DCEn', 'Cis-1,2-dichloroéthène': 'Cis-1.2-DCEn', 'Trans 1,2-dichloroéthylène': 'Trans-1.2-DCEyl', 'Dichlorométhane': 'DCM', 'dibromochlorométhane': 'DiBCM', 'bromodichlorométhane': 'BromoDCM', 'Totaux (cis,trans) 1,2-dichloroéthènes': '(cis.trans)-1.2-DCEn_tot', '1,2-dichloropropane': '1.2-DCP', 'Tétrachloroéthylène': 'TetraCEyn', 'Tétrachlorométhane': 'TCM', '1,1,1-Trichloroéthane': '1.1.1-TCE', '1,1,2-Trichloroéthane': '1.1.2-TCE', 'Trichloroéthylène': 'TCEyn', 'Chlorure de vinyle': 'CVinyl', '3-éthylphénol': '3-EthPhn', 'métacrésol': 'M-cresol', 'o-crésol': 'O-cresol', 'p-crésol': 'P-cresol', 'crésols (total)': 'Cresol_tot', '2,4-dimethylphénol': '2.4-DMetPhn', '2,5-dimethylphénol': '2.5-DMetPhn', '3,5+2,3-dimethylphénol+4-ethylphénol': 'DMetPhn_4-EthPhn', '2,6-dimethylphénol': '2.6-DMetPhn', '3,4-dimethylphénol': '3.4-DMetPhn', 'alkylphénols C2 total': 'AlkPhn_C2_tot', '2-éthylphénol': '2-EthPhn', 'para(tert)butylphénol': 'P(T)ButPhn', 'alkylphénols C4 total': 'AlkPhn_C4_tot', '2,3,5-triméthylphénol': '2.3.5-TMPethn', '3,4,5-triméthylphénol': '3.4.5-TMetPhn', '2-isopropylphénol': '2-IsoPropPhn', 'alkylphénols C3 total': 'AlkPhn_C3_tot', 'HAP totaux (10) VROM': 'HAP_tot_vrom', 'monochlorobenzène': 'MonoCBzn', '1,2-dichlorobenzène': '1.2-DCBzn', '1,3-dichlorobenzène': '1.3-DCBzn', '1,4-Dichlorobenzène': '1.4-DCBzn', '1,2,3-trichlorobenzène': '1.2.3-TCBzn', '1,2,4-trichlorobenzène': '1.2.4-TCBzn', '1,3,5-trichlorobenzène': '1.3.5-TCBzn', '1,2,4,5- et 1,2,3,5-tétrachlorobenzènes': '1.2.3.4_5-TCBzn', '1,2,3,4-tétrachlorobenzène': '1.2.3.4-TCBzn', 'hexachlorobenzène': 'HCBzn', '2-chlorophénol': '2-CPhn', '4-chlorophénol': '4-CPhn', '3-chlorophénol': '3-CPhn', 'monochlorophénol total': 'MonoCPhn_tot', '2,3-dichlorophénol': '2.3-DCPhn', '2,4+2,5-dichlorophénol': '2.4_5-DCPhn', '2,6-dichlorophénol': '2.6-DCPhn', '3,4-dichlorophénol': '3.4-DCPhn', '3,5-dichlorophénol': '3.5-DCPhn', 'dichlorophénol total': 'DCPhn_tot', '2,3,4-trichlorophénol': '2.3.4-TCPhn', '2,3,5-trichlorophénol': '2.3.5-TCPhn', '2,3,6-trichlorophénol': '2.3.6-TCPhn', '2,4,5-trichlorophénol': '2.4.5-TCPhn', '2,4,6-trichlorophénol': '2.4.6-TCPhn', '3,4,5-trichlorophénol': '3.4.5-TCPhn', 'trichlorophénol total': 'TriCPhn_tot', '2,3,5,6-tétrachlorophénol': '2.3.5.6-TCPhn', '2,3,4,6- tétrachlorophénol': '2.3.4.6-TCPhn', '2,3,4,5- tétrachlorophénol': '2.3.4.5-TCPhn', 'tétrachlorophénol total': 'TCPhn_tot', 'pentachlorobenzène': 'PCBzn', 'pentachlorophénol': 'PCPhn', 'chlorophénol total': 'CPhn_tot', 'EOX': 'EOX', 'fraction aromat. >C6-C7': 'Ar_C6-C7', 'fraction aromat. >C7-C8': 'Ar_C7-C8', 'fraction aromat. >C8-C10': 'Ar_C8-C10', 'fraction aliphat. C5-C6': 'Aliphat_C5-C6', 'fraction aliphat. >C6-C8': 'Aliphat_C6-C8', 'fraction aliphat. >C8-C10': 'Aliphat_C8-C10', 'Fraction C5-C8': 'C5-C8', 'Fraction C8-C10': 'C8-C10', 'Fraction C10-C12': 'C10-C12', 'Fraction C12-C16': 'C12-C16', 'Fraction C16-C21': 'C16-C21', 'Fraction C21 - C35': 'C21-C35', 'Fraction C35 - C40': 'C35-C40', 'C16 - C21': 'C16-C21', 'C21 - C35': 'C21-C35', 'C30 - C40': 'C30-C40', 'C35 - C40': 'C35-C40', 'aromat.>C6-C7': 'Ar_C6-C7', 'aromat.>C7-C8': 'Ar_C7-C8', 'aromat.>C8-C10': 'Ar_C8-C10', 'aromat.>C10-C12': 'Ar_C10-C12', 'aromat.>C12-C16': 'Ar_C12-C16', 'aromat.>C16-C21': 'Ar_C16-C21', 'aromat.>C21-C35': 'Ar_C21-C35', 'aliphat.>C5-C6': 'Alp_C5-C6', 'aliphat.>C6-C8': 'Alp_C6-C8', 'aliphat.>C8-C10': 'Alp_C8-C10', 'aliphat.>C10-C12': 'Alp_C10-C12', 'aliphat.>C12-C16': 'Alp_C12-C16', 'aliphat.>C16-C35': 'Alp_C16-C35', 'Hydrocarbures totaux C10-C35': 'HC_tot_C10-C35', 'totaux C10-C35': 'HC_tot_C10-C35', 'Totaux C10-C40': 'HC_tot_C10-C40', 'Hydrocarbures totaux C10-C40': 'HC_tot_C10-C40', 'MTBE': 'MTBE', 'PCB 28': 'PCB_28', 'PCB 52': 'PCB_52', 'PCB 101': 'PCB_101', 'PCB 118': 'PCB_118', 'PCB 138': 'PCB_138', 'PCB 153': 'PCB_153', 'PCB 180': 'PCB_180', 'PCB totaux (7)?': 'PCB_tot', 'Chlorure(?:s)?': 'Chlorure', 'Soufre Total': 'S_tot', 'sulfite(?:s)?': 'sulfite', 'sulfate(?:s)?': 'sulfate', 'COT': 'COT', 'DBO (5 jours)': 'DBO_5j', 'DCO': 'DCO', 'Ammonium': 'NH4', 'ammoniaque libre': 'NH3_libre', 'Nitrate': 'HNO3', 'Nitrite': 'HNO2', 'azote Kjeldahl': 'N_Kjdl', 'sulfures totaux': 'Sulfure_tot', 'sulfure(?:s)? (libre(?:s)?)': 'Sulfure_libre', 'calcium': 'Ca', 'potassium': 'K', 'magnésium': 'Mg', 'manganèse': 'Mn', 'sodium': "Na", 'fer': 'Fe', 'phosphore (total)': 'P_tot', 'phosphates (totaux)': 'Phosphate_tot', 'carbonate': 'CaCO3', 'bicarbonate': 'Bicarb', 'Phoshore': 'P', 'fer ((Fe))? total': 'Fe_tot', 'fer (2\+)': 'Fe2', 'fluorure(?:s)?': 'Fluorure', 'chlorures': 'Chlorure', 'chloroformes': 'Chloroforme', 'bromoformes': 'Bromoforme', 'bromure (libre)': 'Br_libre', 'Iph.': 'IPh', 'CN_NCl': 'CN_NCl', '2-naphtol': '2-Naphtol', 'thymol': 'Thymol', 'chloroforme': 'Chloroforme', 'bromoforme': 'Bromoforme', 'C12-C20': 'C12-C20', 'C20-C30': 'C20-C30', 'Non chloro destruct.':'Non_chloro_destr', 'SOM VROM 10':'HAP_tot_vrom','SOM EPA 16':'HAP_tot_EPA', 'SOM_C5_C35':'HC_tot_C15-C35', 'SOM_C10_C40':'HC_tot_C10-C40', 'SOM BTEX':'BTEX_tot','C5_C8':'C5-C8', 'C8_C10':'C8-C10', 'C10_C12':'C10-C12', 'C12_C16':'C12-C16', 'C30_C35':'C30-C35'}

In [171]:
df = col_ren(df, name=POL_NAMES_MODEL, mode=1, cutoff=0.7)#, verbose=True)


[1;34mPossible new pollutants names:[0;0m
['Societe', 'Zone', 'Sous_zone', 'Numéro_zone', 'ID', 'Type', 'Type_ech', 'Affectation', 'X', 'Y', 'Z', 'Long_for', 'Refus', 'ID_ech', 'Ech_top', 'Ech_base', 'Soumis', 'Nappe', 'Description', 'Ep_remb', 'Ep_alluv', 'Intensite', 'Min_organo', 'Max_organo', 'Polluant', 'MS', 'pH_H2O', 'Temp_pH_H2O', 'Temp_pH_CaCl2', 'pH_CaCl2', 'Temp_pH_KCl', 'pH_KCl', 'Temp_CE', 'CE', 'MO', 'Residu_chauf', 'Argile', 'Fract_2000µ', 'Fract_63µ', 'Fract_45µ', 'Fract_16µ', 'Fract_2µ', 'Chrome_VI', 'CN_libre', 'CN_tot', 'Thiocyantes', 'Cyanures totaux EPA', 'Ethylbenzène', 'Anthracene', 'Benzoaanthracène', 'Benzo(a)pyrene', 'Indéno[123cd]pyrène', 'Acenaphtylene', 'Acenaphthene', 'Benzo_b_fluoranthene', 'Dibenzo[ah]anthracène', 'C16_C21', 'C21_C35', 'C35_C40', 'C21_C30', 'SOM C10_C40', 'Idc_phenol', 'Date_mes']


In [172]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 1423 ; measure: 1423 ; lithology: 0 ; analysis: 1423 ; equipement: 0 ; unknown: 49 ; 

[1;32mNot used columns:[0;0m
 ['Numéro_zone', 'Affectation', 'Soumis', 'Ep_remb', 'Ep_alluv', 'Residu_chauf']


In [173]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 304 ; measure: 1423 ; lithology: 0 ; analysis: 1423 ;equipement: 0 ; unknown: 17


##### data merging

In [174]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

Conflict values present. Please resolve this manually !


In [175]:
dataset = source_bh
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID',
                valid_dict={'Societe_x':list(conflict_df.index), 'Type_x':list(conflict_df.index),
                            'Long_for_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

all conflicts have been fixed!


In [176]:
source_bh = dataset.copy()

In [177]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [178]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID','Date_mes'], dist_max=1., drop_skip_col=['index'])

In [179]:
dataset = source_mes
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_mes = dataset.copy()

In [180]:
source_ukw, conflict_df = data_merger(source_ukw, ukw, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [181]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/')
    
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
ukw.to_csv(tmp_dir+sheet+'_Unknow.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknow.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 520 ; source_eqp: 0 ; source_uknw: 32 ; source_litho: 1628 ; source_an: 1572 ; source_mes: 1723


### $\color{red}{\textbf{Excel data final merge}}$

In [182]:
bh_coords = source_bh[['ID', 'X', 'Y', 'Z','Date_for']].copy()

In [183]:
source_an, conflict_df = data_merger(source_an, bh_coords, how='left', on='ID', dist_max=1., drop_skip_col=['index'])

In [184]:
source_litho, conflict_df = data_merger(source_litho, bh_coords, how='left', on='ID', dist_max=1., drop_skip_col=['index'])

Conflict values present. Please resolve this manually !


In [185]:
dataset = source_litho
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID', 
                valid_dict={'Date_for_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_litho = dataset.copy()

all conflicts have been fixed!


In [186]:
source_mes, conflict_df = data_merger(source_mes, bh_coords, how='left', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [187]:
source_ukw, conflict_df = data_merger(source_ukw, bh_coords, how='left', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [188]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/')

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)
source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 520 ; source_eqp: 0 ; source_uknw: 32 ; source_litho: 1627 ; source_an: 1583 ; source_mes: 1740


#### ======================================================================================

In [189]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 0 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 0


## 3-obsrevations terrain et mesures piézos phase 2.xlsx

* **Sheet : 'Piézométrie'**

In [190]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/observ_terrain/'
sheet='Piezometrie'

In [191]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'obsrevations terrain et mesures piézos phase 2.xlsx', sheet_name='Piézométrie', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df)

Columns dropped :['Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6']

Rows : 31, columns : 19


interactive(children=(IntSlider(value=10, description='rows', max=31, min=10, readout=False), IntSlider(value=…

In [192]:
sdf=df[df.columns.to_list()[:3]]
sdf=na_line_drop(sdf,0)
sdf.rename(columns={'Niveau \npiézométrique':'Niv_eau_sol', 'Commentaires ':'Date_ech'}, inplace=True)

9 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [193]:
sdf2=df.loc[:11, df.columns.to_list()[3:-1]]
sdf2.rename(columns={'Unnamed: 7':'Date_mes', 'Unnamed: 8':'Nappe', 'Unnamed: 9':'ID', 'NP/piézo [m]':'Niv_eau_pz',
       'dim. piezo hors sol [m]':'Ht_pz_sol', 'NP/sol [m]':'Niv_eau_sol', 'Prof. piézo/piézo [m]':'Long_pz',
       'Prof. piézo/sol [m]':'Long_pz_sol', 'CE [mS/cm]':'CE','t° [°C]':'Temp','O2 dissous\n[%]':'O_diss', 
        'Observations':'Rmq'}, 
           inplace=True)

In [194]:
for i in range(len(sdf2['ID'])):
    sdf2.loc[i,'ID']=re.sub(r'^P','F', sdf2.loc[i,'ID'])
    
    if pd.isnull(sdf2.loc[i,'CE']) and not pd.isnull(sdf2.loc[i,'CE [µS/cm]']):
        sdf2.loc[i,'CE']=sdf2.loc[i,'CE [µS/cm]']/1000

sdf2.drop(['CE [µS/cm]'], axis=1, inplace=True)

In [195]:
df=df.loc[14:, df.columns.to_list()[3:-1]]
df.rename(columns={'Unnamed: 7':'Date_mes', 'Unnamed: 8':'Nappe', 'Unnamed: 9':'ID', 'NP/piézo [m]':'Niv_eau_pz',
       'dim. piezo hors sol [m]':'Ht_pz_sol', 'NP/sol [m]':'Niv_eau_sol', 'Prof. piézo/piézo [m]':'Long_pz',
       'Prof. piézo/sol [m]':'Long_pz_sol', 'CE [mS/cm]':'CE','t° [°C]':'Temp','O2 dissous\n[%]':'O_diss', 
        'Observations':'Rmq'}, 
           inplace=True)
df.drop([19,20], inplace=True)
df.reset_index(drop=True, inplace=True)

In [196]:
for i in range(len(df['ID'])):
    df.loc[i,'ID']=re.sub(r'^P','F', df.loc[i,'ID'])
    
    if pd.isnull(df.loc[i,'CE']) and not pd.isnull(df.loc[i,'CE [µS/cm]']):
        df.loc[i,'CE']=df.loc[i,'CE [µS/cm]']/1000
        
df.drop(['CE [µS/cm]', 'O_diss'], axis=1, inplace=True)

In [197]:
df, conflict_df=data_merger(sdf2, df, how='outer', on='ID')

In [198]:
df = na_col_drop(df, 5)
df['Type'] = 'Piezo'

Columns dropped :['O_diss']



In [199]:
dataframe_viewer(df, rows=3, un_val=['ID','ID_ech'])

Rows : 27, columns : 14, Unique values on cols: {'ID': 27, 'ID_ech': 'NA'}


interactive(children=(IntSlider(value=3, description='rows', max=27, min=3, readout=False), IntSlider(value=12…

In [200]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 27 ; measure: 27 ; lithology: 0 ; analysis: 0 ; equipement: 0 ; unknown: 0 ; 


In [201]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ;' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 27 ; measure: 27 ; lithology: 0 ; analysis: 0 ;equipement: 0 ; unknown: 0


In [202]:
source_bh = bh
source_mes = mes

In [203]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/')
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknow.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknow.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 27 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 27


#### ======================================================================================

In [204]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 0 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 0


## 4-profondeur de contact campagne de forages octobre 2019.xlsx

* **Sheet : 'Feuil1'**

In [205]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Prof_contact_sol_forage/'
sheet='Feuil1'

In [206]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/ouvrages/profondeur de contact campagne de forages octobre 2019.xlsx', 
                   sheet_name='Feuil1', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df)

Rows : 8, columns : 5


interactive(children=(IntSlider(value=8, description='rows', max=8, min=8, readout=False), IntSlider(value=5, …

In [207]:
df.rename(columns={'n°forage ':'ID','profondeur(m)':'Long_for','x':'X', 'y':'Y', 'z':'Z'}, inplace=True)
df['Type']='Forage' # type is not defined clearly in data
df['ID']=df['ID'].apply(lambda x: 'F'+str(x).replace('.0',''))

bh=df

In [208]:
source_bh=bh

In [209]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/')
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknow.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknow.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 8 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 0


#### ======================================================================================

In [210]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 0 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 0


## 5-Forages_Pilote_Decoupe.xlsx

* **Sheet : 'leve'**

In [211]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Forage_Pilote/'
sheet='leve_Z_elect_pos'

In [212]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/geometrie_electrodes_et_sondes/Forages_Pilote_Decoupe.xlsx', 
                   sheet_name='leve')#, skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

Columns dropped :['Unnamed: 10']

Rows : 72, columns : 11


interactive(children=(IntSlider(value=5, description='rows', max=72, min=5, readout=False), IntSlider(value=11…

In [213]:
df.rename(columns={'Ref_puits':'ID','Niveau mesuré':'Z_mes', 'Niveau corrigé':'Z','Z_diff [m] repere_local':'Diff_Z_local',
                   'long_fin [m]':'Long_for','Pos_Inox_#1 [m]':'Pos_Inox_#1', 'Unnamed: 11':'Rmq',
                   'Pos_Inox_#6 [m]':'Pos_Inox_#6', 'Pos_Impol_#3 [m]':'Pos_Impol_#3'}, inplace=True)

In [214]:
df['Type']='Forage' # type is not defined clearly in data
df['ID']=df['ID'].apply(lambda x: 'F'+str(x).replace('.0',''))

elc = df[['ID','Pos_Inox_#6', 'Pos_Impol_#3']] # 'ID' is for boreholes
bh = df[['ID','Z','Diff_Z_local','Long_for', 'Type']]# Z_local origin = 145.5 [m]

In [215]:
dataframe_viewer(df, rows=3, un_val=['ID','ID_ech'])

Rows : 72, columns : 12, Unique values on cols: {'ID': 72, 'ID_ech': 'NA'}


interactive(children=(IntSlider(value=3, description='rows', max=72, min=3, readout=False), IntSlider(value=12…

In [216]:
source_bh = bh
source_elc = elc

In [217]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
elc.to_csv(tmp_dir+sheet+'_Electrodes.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_elc.to_csv(tmp_dir+'source_merge/source_Electrodes.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_elect:{len(source_elc)} ;')

source_bh:72 ; source_elect:72 ;


#### ======================================================================================

In [11]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

NameError: name 'pd' is not defined

## 6-Liste XY investigations.xlsx
* **Sheet : 'SOL_EAU'**

In [219]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Liste_XY/'
sheet='Sol_Eau'

In [220]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
                   'supplémentaires/Liste XY investigations.xlsx', sheet_name='SOL')#, skiprows=4)
df['Type_ech']='Sol'
df.rename(columns={'N°':'ID_ech'}, inplace=True)

df1 = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
                   'supplémentaires/Liste XY investigations.xlsx', sheet_name='EAU PR')#, skiprows=4)
df1['Type_ech']='Eau'
df1['Nappe']='Socle'
df1.rename(columns={'N°':'ID_ech'}, inplace=True)

df2 = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
                   'supplémentaires/Liste XY investigations.xlsx', sheet_name='EAU RB')#, skiprows=4)
df2['Type_ech']='Eau'
df2['Nappe']='remblais'
df2.rename(columns={'N°':'ID_ech'}, inplace=True)

df3 = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
                   'supplémentaires/Liste XY investigations.xlsx', sheet_name='EAU ALL')#, skiprows=4)
df3['Type_ech']='Eau'
df3['Nappe']='Alluvions'
df3.rename(columns={'N°':'ID_ech'}, inplace=True)

In [221]:
dataframe_viewer(df, rows=3, un_val=['ID','ID_ech'])

Rows : 134, columns : 4, Unique values on cols: {'ID': 'NA', 'ID_ech': 134}


interactive(children=(IntSlider(value=3, description='rows', max=134, min=3, readout=False), IntSlider(value=4…

In [222]:
mdf, conflict_df=data_merger(df1, df, 'outer', 'ID_ech')

In [223]:
mdf, conflict_df=data_merger(mdf, df2, 'outer', 'ID_ech')

In [224]:
mdf, conflict_df=data_merger(mdf, df3, 'outer', 'ID_ech')

Conflict values present. Please resolve this manually !


In [225]:
dataset = mdf
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID', 
                valid_dict={'Nappe_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
mdf = dataset.copy()

all conflicts have been fixed!


In [226]:
source_an = mdf
#source_an.insert(0,'ID', source_an.pop('ID_ech'))

In [227]:
source_an = gen_id_from_ech(source_an, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [228]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/')
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknow.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknow.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 0 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 254 ; source_mes: 0


#### ======================================================================================

## 7-Résultats phase 1_MEMORIS.xls
* **Sheet : 'Résult SOL'**

In [229]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 0 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 0


In [230]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Phase_1_Memoris/'
sheet='Result_sol'

In [231]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 1/'
                   'Résultats phase 1_MEMORIS.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

1 NaN lines dropped
Columns dropped :['Unnamed: 0', 'Unnamed: 2']

Rows : 135, columns : 35


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=135, min=5, readout=False), IntSlider(value=1…

In [232]:
ech_df=df.loc[:35]
an=df.loc[36:]

In [233]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [234]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [235]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

Columns dropped :['col_1', "Nom / description d'échantillon", 'Date de prélèvement', "Nature de l'étude (*)", 'Terrain', 'Epaisseur de remblais', 'Epaisseur alluvions', "Nature de l'observation organoleptique", 'LOCALISATION/AFFECTATION(S) - USAGE(S)', 'zone', 'parcelle (selon dénom.)', "Type d'usage de fait actuel", "Type d'usage de fait futur", "Type d'usage utilisé pour comparer les résultats", 'Lieux de prélèvement', 'X Lambert', 'Y Lambert', 'DESCRIPTION SOMMAIRE', 'Type de recouvrement de surface (***)', 'Sol en place (S : souillé, NS : non souillé)', 'Remblais de terre (S : souillé, NS : non souillé)', 'Remblais technique (S : souillé, NS : non souillé)', 'Déchet', 'Matières organiques', 'GRANULOMETRIE', 'Fraction argileuse']



In [236]:
name=['ID_ech', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
ech_df=col_ren(ech_df, name=name, mode=1)

In [237]:
for i in range(len(ech_df['Description'])):
    x = ech_df.loc[i,'Description']
    if x=='R': ech_df.loc[i,'Description']='Remblais'
    elif x=='L': ech_df.loc[i,'Description']='Limons'
    elif x=='A': ech_df.loc[i,'Description']='Argiles'
    elif x=='S': ech_df.loc[i,'Description']='Sables'

ech_df['Refus']=ech_df['Refus'].apply(lambda x: 'x' if not re.search('x|X', str(x)) else '')
ech_df.insert(1,'Type_ech','Sol')

In [238]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [239]:
an=col_ren(an, 1)

In [240]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={'col_0':'ID_ech', 'col_34':'phénanthrène', 'col_63':'EOX'}, inplace=True)

In [241]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')

Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'BTEX totaux', 'PHENOLS', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOHALOGENES VOLATILS', 'EOX', 'EOX (****)', 'HYDROCARBURES TOTAUX', 'fraction aromat. C6-C7', 'fraction aromat. C7-C8', 'fraction aromat. C8-C10', 'fraction aliphat. C5-C6', 'fraction aliphat. C6-C8', 'fraction aliphat. C8-C10', 'Fraction C35 - C40', 'Hydrocarbures totaux C10-C40', 'METHYL-TERT-BUTYL-ETHER', 'POLYCHLOROBIPHENYL (PCB)', 'PCB 28', 'PCB 52', 'PCB 101', 'PCB 118', 'PCB 138', 'PCB 153', 'PCB 180', 'PCB totaux (7)', 'F4/2M*', 'Teneur mesurée', 'Teneur mesurée (souligne/gras)', 'Teneur mesurée (gras/grisé)', "(*) CP = Campagne de prélèvements; Ex = Expertise; ES = Etude de sol; EO = Etude d'orientation; EC = Etude de caractérisation; SA = Suivi d'assainissement", '(**) L: limon, A: Argile, S: Sable, R: Remblai', '(***) ib : imperméable (béton) ; ih : imperméable hydrocarboné ; p : perméable (gravier

In [242]:
an = col_ren(an, name=POL_NAMES_MODEL, mode=1)#, verbose=True)


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'Cyanure (libre)', 'Cyanure (totaux)', 'cyanure (APE)', 'Fraction C5 - C8', 'Fraction C8 - C10', 'Fraction C16 - C21']


In [243]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [244]:
an = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [245]:
source_an=an

In [246]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/')
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknow.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknow.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 0 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 29 ; source_mes: 0


* **Sheet : 'Résult EAU'**

In [247]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Phase_1_Memoris/'
sheet='Result_eau'

In [248]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 1/'
                   'Résultats phase 1_MEMORIS.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)
dataframe_viewer(df, rows=5)

1 NaN lines dropped
Columns dropped :['Unnamed: 0', 'Unnamed: 2']

Rows : 136, columns : 23


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=136, min=5, readout=False), IntSlider(value=1…

In [249]:
ech_df=df.loc[:32]
an=df.loc[33:]

In [250]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [251]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

[1;31mDouble columns' name found :[0;0m
[('DESCRIPTION SOMMAIRE', 2), ('Prof. arrêt du forage', 2)]


In [252]:
ech_df['CE']=ech_df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [253]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)

In [254]:
ech_df=dble_col_drop(ech_df)

column(s) dropped: ['21:DESCRIPTION SOMMAIRE', '27:Prof. arrêt du forage']


In [255]:
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

Columns dropped :['col_1', "Nature de l'étude (*)", 'Observations organoleptiques', 'LOCALISATION/AFFECTATION(S) - USAGE(S)', 'parcelle (selon dénom.)', "Type d'usage de fait actuel", "Type d'usage de fait futur", "Type d'usage utilisé pour comparer les résultats", 'Lieux de prélèvement', 'DESCRIPTION SOMMAIRE', 'Type de recouvrement de surface (**)', 'Sol en place (S : souillé, NS : non souillé)', 'Remblais de terre (S : souillé, NS : non souillé)', 'Remblais technique (S : souillé, NS : non souillé)', 'Déchet', 'PARAMETRES PHYSICO-CHIMIQUES ', 'ORP']



In [256]:
name=['ID_ech','Date_ech','Num_maille','Affectation','X','Y','Zsol','Long_for','Prof_crep','Long_pz',
      'Niv_eau_sol','pH','CE','T']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df.insert(1,'Type_ech','Eau')

In [257]:
ech_df['Prof_crep'].replace('\[|\]','', regex=True, inplace=True)
for i in range(len(ech_df)):
    c=ech_df.loc[i,'Prof_crep']
    ech_df.loc[i,'Equip_top']=c.split('-')[0]
    ech_df.loc[i,'Equip_base']=c.split('-')[1]

ech_df['Type_equip'] = 'Crepine'
ech_df.drop(columns=['Prof_crep'], inplace=True)

In [258]:
#ech_df['ID_ech'].replace('Canne ', 'Can', inplace=True, regex=True)
ech_df['ID_ech'].replace('\n', ' ', inplace=True, regex=True)

In [259]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [260]:
an=col_ren(an, 1)

[1;31mDouble columns' name found :[0;0m
[('Teneur mesurée', 2)]


In [261]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech', 'col_43':'phénanthrène'}, inplace=True)

In [262]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [263]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'PHENOLS', 'crésols (total)', 'CHLOROPHENOLS', '2-chlorophénol', 'monochlorophénol total', 'dichlorophénol total', '2,4,5-trichlorophénol', '2,4,6-trichlorophénol', 'trichlorophénol total', '2,3,4,6- tétrachlorophénol', 'tétrachlorophénol total', 'pentachlorophénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOCHLORES VOLATILS', 'CHLOROBENZENES', 'monochlorobenzène', '1,3-dichlorobenzène', '1,2-dichlorobenzène', '1,4-Dichlorobenzène', '1,2,4,5- et 1,2,3,5-tétrachlorobenzènes', 'pentachlorobenzène', 'hexachlorobenzène', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER', 'POLYCHLOROBIPHENYLS (PCB)', 'PCB totaux (7)', 'Teneur mesurée (souligne/gras)', 'Teneur mesurée (gras/grisé)', "(*) CP = Campagne de prélèvements; Ex = Expertise; ES = Etude de sol; EO = Etude d'orientation; EC = Etude de caractérisation; SA = Suivi d'assainissement", '(**) ib : imperméable (béton) ; ih : imperméable hydrocarb

In [264]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'cyanure (APE)', 'Tétrachloroéthylène ', 'fraction aromat. C6-C7', 'fraction aromat. C7-C8', 'fraction aromat. C8-C10', 'fraction aliphat. C6-C8', 'fraction aliphat. C8-C10', 'Fraction C5 - C8', 'Fraction C8 - C10', 'Fraction C16 - C21']


In [265]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [266]:
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [267]:
df['Type'] = 'Piezo'
df['Date_mes'] = df['Date_ech']

In [268]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 17 ; measure: 17 ; lithology: 0 ; analysis: 17 ; equipement: 17 ; unknown: 0 ; 

[1;32mNot used columns:[0;0m
 ['Aliphat_C8-C10', 'Affectation', 'Aliphat_C5-C6', 'Num_maille', 'Aliphat_C6-C8']


In [269]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 14 ; measure: 17 ; lithology: 0 ; analysis: 17 ; equipement: 17 ; unknown: 0


In [270]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Type_ech'], dist_max=1., drop_skip_col=['index'])

In [271]:
source_bh = bh
source_mes = mes
source_eqp = eqp

In [272]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/')
    
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknow.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknow.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 14 ; source_eqp: 17 ; source_uknw: 0 ; source_litho: 0 ; source_an: 46 ; source_mes: 17


#### ======================================================================================

In [18]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 0 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 0


## 8-Résultats phase 2_MEMORIS.xls
* **Sheet : 'Résult SOL'**

In [19]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Phase_2_Memoris/'
sheet='Result_SOL'

In [20]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'Résultats phase 2_MEMORIS.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)
dataframe_viewer(df, rows=5)

1 NaN lines dropped
Columns dropped :['Unnamed: 0', 'Unnamed: 2']

Rows : 135, columns : 31


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=135, min=5, readout=False), IntSlider(value=1…

In [21]:
ech_df=df.loc[:35]
an=df.loc[36:]

In [22]:
an.loc[0.5] = df.loc[1] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [23]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [24]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

Columns dropped :['col_0', "Nom / description d'échantillon", "Nature de l'étude (*)", 'Terrain', 'Epaisseur de remblais', 'Epaisseur alluvions', "Nature de l'observation organoleptique", 'LOCALISATION/AFFECTATION(S) - USAGE(S)', 'zone', 'parcelle (selon dénom.)', "Type d'usage de fait actuel", "Type d'usage de fait futur", "Type d'usage utilisé pour comparer les résultats", 'Lieux de prélèvement', 'X Lambert', 'Y Lambert', 'DESCRIPTION SOMMAIRE', 'Type de recouvrement de surface (***)', 'Sol en place (S : souillé, NS : non souillé)', 'Remblais de terre (S : souillé, NS : non souillé)', 'Remblais technique (S : souillé, NS : non souillé)', 'Déchet', 'Matières organiques', 'GRANULOMETRIE', 'Fraction argileuse']



In [25]:
name=['ID_ech', 'Date_ech', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
ech_df=col_ren(ech_df, name=name, mode=1)

In [26]:
set(ech_df['Description'])

{'L', 'LA', 'LS', 'R'}

In [27]:
for i in range(len(ech_df['Description'])):
    x = ech_df.loc[i,'Description']
    if x=='R': ech_df.loc[i,'Description']='Remblais'
    elif x=='L': ech_df.loc[i,'Description']='Limons'
    elif x=='LA': ech_df.loc[i,'Description']='Limons et argiles'
    elif x=='LS': ech_df.loc[i,'Description']='Limons et sables'

ech_df['Refus']=ech_df['Refus'].apply(lambda x: 'x' if not re.search('#', str(x)) else '')
ech_df.insert(1,'Type_ech','Sol')

In [28]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [29]:
an=col_ren(an, 1)

In [30]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={'col_0':'ID_ech', 'col_34':'phénanthrène'}, inplace=True)

In [31]:
an.drop(list(range(5)), axis=0, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')

Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'BTEX totaux', 'PHENOLS', 'Phénol', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOHALOGENES VOLATILS', '1,1-Dichloroéthane', '1,2-Dichloroéthane', '1,1-dichloroéthène', 'Cis-1,2-dichloroéthène', 'Trans 1,2-dichloroéthylène', 'Dichlorométhane', 'Totaux (cis,trans) 1,2-dichloroéthènes', '1,2-dichloropropane', 'Tétrachloroéthylène', 'Tétrachlorométhane', '1,1,1-Trichloroéthane', '1,1,2-Trichloroéthane', 'Trichloroéthylène', 'Chloroforme', 'Chlorure de vinyle', 'col_63', 'EOX (****)', 'HYDROCARBURES TOTAUX', 'Fraction C35 - C40', 'Hydrocarbures totaux C10-C40', 'METHYL-TERT-BUTYL-ETHER', 'MTBE', 'POLYCHLOROBIPHENYL (PCB)', 'PCB 28', 'PCB 52', 'PCB 101', 'PCB 118', 'PCB 138', 'PCB 153', 'PCB 180', 'PCB totaux (7)', 'F4/2M*', 'Teneur mesurée', 'Teneur mesurée (souligne/gras)', 'Teneur mesurée (gras/grisé)', "(*) CP = Campagne de prélèvements; Ex = Expertise; ES = Etude de sol; EO = E

In [32]:
POL_NAMES_MODEL = {'Arsenic': 'As', 'Cobalt': 'Co', 'Cadmium': 'Cd', 'Chrome': 'Cr', 'Chrome VI': 'Cr_VI', 'Chrome (VI)': 'Cr_VI', 'Chrome_total': 'Cr_tot', 'Cuivre': 'Cu', 'Mercure': 'Hg', 'Plomb': 'Pb', 'Nickel': 'Ni', 'Zinc': 'Zn', 'Cyanure(?:s)? (?libre(?:s)?)?': 'CN_libre', 'Cyanures (totaux)': 'CN_tot', 'CN_totaux': 'CN_tot', 'Cyanures (APE)': 'CN_tot_APE', 'Cyanures totaux APE':'CN_tot_APE', 'cyanure complex': 'CN_cplx', "Cyanures (libres) - NEN-EN-ISO 14403": 'CN_libre', 'Cyanures (libres)': 'CN_libre', 'CN_libres': 'CN_libre', 'thiocyanate': 'ThioCN', 'Benzène': 'Bnz', 'Toluène': 'Toln', 'Éthylbenzène': 'EthBnz', 'Orthoxylène': 'O-Xyl', 'O-xylènes': 'O-Xyl', 'mp-xylènes': 'P-M-Xyl', 'Para- et métaxylène': 'P-M-Xyl', 'Xylènes': 'Xyl', 'Styrène': 'Styr', 'BTEX totaux': 'BTEX_tot', 'Phénol': 'Phenol', 'Indice phénol': 'IPh', 'Naphtalène': 'Naphta', 'Acénaphtylène': 'Acenaphtyl', 'Acénaphtène': 'Acenaphtn', 'Fluorène': 'Flrn', 'Phénanthrène': 'Phenanthr', 'Anthracène': 'Anthrc', 'Fluoranthène': 'Flranth', 'Pyrène': 'Pyr', 'Benzo(a)anthracène': 'Bnz(a)anthrc', 'Chrysène': 'Chrys', 'Benzo(b)fluoranthène': 'Bnz(b)flranth', 'Benzo(k)fluoranthène': 'Bnz(k)flranth', 'Benzo(a)pyrène': 'Bnz(a)pyr', 'Dibenzo(ah)anthracène': 'Dibnz(ah)anthrc', 'Benzo(ghi)pérylène': 'Bnz(ghi)peryl', 'Indéno(1,2,3-cd)pyrène': 'Indeno(1.2.3-cd)pyr', 'HAP Totaux (16) - EPA': 'HAP_tot_EPA', '1,1-Dichloroéthane': '1.1-DCE', '1,2-Dichloroéthane': '1.2-DCE', '1,1-dichloroéthène': '1.1-DCEn', 'Cis-1,2-dichloroéthène': 'Cis-1.2-DCEn', 'Trans 1,2-dichloroéthylène': 'Trans-1.2-DCEyl', 'Dichlorométhane': 'DCM', 'dibromochlorométhane': 'DiBCM', 'bromodichlorométhane': 'BromoDCM', 'Totaux (cis,trans) 1,2-dichloroéthènes': '(cis.trans)-1.2-DCEn_tot', '1,2-dichloropropane': '1.2-DCP', 'Tétrachloroéthylène': 'TetraCEyn', 'Tétrachlorométhane': 'TCM', '1,1,1-Trichloroéthane': '1.1.1-TCE', '1,1,2-Trichloroéthane': '1.1.2-TCE', 'Trichloroéthylène': 'TCEyn', 'Chlorure de vinyle': 'CVinyl', '3-éthylphénol': '3-EthPhn', 'métacrésol': 'M-cresol', 'o-crésol': 'O-cresol', 'p-crésol': 'P-cresol', 'crésols (total)': 'Cresol_tot', '2,4-dimethylphénol': '2.4-DMetPhn', '2,5-dimethylphénol': '2.5-DMetPhn', '3,5+2,3-dimethylphénol+4-ethylphénol': 'DMetPhn_4-EthPhn', '2,6-dimethylphénol': '2.6-DMetPhn', '3,4-dimethylphénol': '3.4-DMetPhn', 'alkylphénols C2 total': 'AlkPhn_C2_tot', '2-éthylphénol': '2-EthPhn', 'para(tert)butylphénol': 'P(T)ButPhn', 'alkylphénols C4 total': 'AlkPhn_C4_tot', '2,3,5-triméthylphénol': '2.3.5-TMPethn', '3,4,5-triméthylphénol': '3.4.5-TMetPhn', '2-isopropylphénol': '2-IsoPropPhn', 'alkylphénols C3 total': 'AlkPhn_C3_tot', 'HAP totaux (10) VROM': 'HAP_tot_vrom', 'monochlorobenzène': 'MonoCBzn', '1,2-dichlorobenzène': '1.2-DCBzn', '1,3-dichlorobenzène': '1.3-DCBzn', '1,4-Dichlorobenzène': '1.4-DCBzn', '1,2,3-trichlorobenzène': '1.2.3-TCBzn', '1,2,4-trichlorobenzène': '1.2.4-TCBzn', '1,3,5-trichlorobenzène': '1.3.5-TCBzn', '1,2,4,5- et 1,2,3,5-tétrachlorobenzènes': '1.2.3.4_5-TCBzn', '1,2,3,4-tétrachlorobenzène': '1.2.3.4-TCBzn', 'hexachlorobenzène': 'HCBzn', '2-chlorophénol': '2-CPhn', '4-chlorophénol': '4-CPhn', '3-chlorophénol': '3-CPhn', 'monochlorophénol total': 'MonoCPhn_tot', '2,3-dichlorophénol': '2.3-DCPhn', '2,4+2,5-dichlorophénol': '2.4_5-DCPhn', '2,6-dichlorophénol': '2.6-DCPhn', '3,4-dichlorophénol': '3.4-DCPhn', '3,5-dichlorophénol': '3.5-DCPhn', 'dichlorophénol total': 'DCPhn_tot', '2,3,4-trichlorophénol': '2.3.4-TCPhn', '2,3,5-trichlorophénol': '2.3.5-TCPhn', '2,3,6-trichlorophénol': '2.3.6-TCPhn', '2,4,5-trichlorophénol': '2.4.5-TCPhn', '2,4,6-trichlorophénol': '2.4.6-TCPhn', '3,4,5-trichlorophénol': '3.4.5-TCPhn', 'trichlorophénol total': 'TriCPhn_tot', '2,3,5,6-tétrachlorophénol': '2.3.5.6-TCPhn', '2,3,4,6- tétrachlorophénol': '2.3.4.6-TCPhn', '2,3,4,5- tétrachlorophénol': '2.3.4.5-TCPhn', 'tétrachlorophénol total': 'TCPhn_tot', 'pentachlorobenzène': 'PCBzn', 'pentachlorophénol': 'PCPhn', 'chlorophénol total': 'CPhn_tot', 'EOX': 'EOX', 'fraction aromat. >C6-C7': 'Ar_C6-C7', 'fraction aromat. >C7-C8': 'Ar_C7-C8', 'fraction aromat. >C8-C10': 'Ar_C8-C10', 'fraction aliphat. C5-C6': 'Alp_C5-C6', 'fraction aliphat. >C6-C8': 'Alp_C6-C8', 'fraction aliphat. >C8-C10': 'Alp_C8-C10', 'Fraction C5-C8': 'C5-C8', 'Fraction C8-C10': 'C8-C10', 'Fraction C10-C12': 'C10-C12', 'Fraction C12-C16': 'C12-C16', 'Fraction C16-C21': 'C16-C21', 'Fraction C21 - C35': 'C21-C35', 'Fraction C35 - C40': 'C35-C40', 'C16 - C21': 'C16-C21', 'C21 - C35': 'C21-C35', 'C30 - C40': 'C30-C40', 'C35 - C40': 'C35-C40', 'aromat.>C6-C7': 'Ar_C6-C7', 'aromat.>C7-C8': 'Ar_C7-C8', 'aromat.>C8-C10': 'Ar_C8-C10', 'aromat.>C10-C12': 'Ar_C10-C12', 'aromat.>C12-C16': 'Ar_C12-C16', 'aromat.>C16-C21': 'Ar_C16-C21', 'aromat.>C21-C35': 'Ar_C21-C35', 'aliphat.>C5-C6': 'Alp_C5-C6', 'aliphat.>C6-C8': 'Alp_C6-C8', 'aliphat.>C8-C10': 'Alp_C8-C10', 'aliphat.>C10-C12': 'Alp_C10-C12', 'aliphat.>C12-C16': 'Alp_C12-C16', 'aliphat.>C16-C35': 'Alp_C16-C35', 'Hydrocarbures totaux C10-C35': 'HC_tot_C10-C35', 'totaux C10-C35': 'HC_tot_C10-C35', 'Totaux C10-C40': 'HC_tot_C10-C40', 'Hydrocarbures totaux C10-C40': 'HC_tot_C10-C40', 'MTBE': 'MTBE', 'PCB 28': 'PCB_28', 'PCB 52': 'PCB_52', 'PCB 101': 'PCB_101', 'PCB 118': 'PCB_118', 'PCB 138': 'PCB_138', 'PCB 153': 'PCB_153', 'PCB 180': 'PCB_180', 'PCB totaux (7)?': 'PCB_tot', 'Chlorure(?:s)?': 'Chlorure', 'Soufre Total': 'S_tot', 'sulfite(?:s)?': 'sulfite', 'sulfate(?:s)?': 'sulfate', 'COT': 'COT', 'DBO (5 jours)': 'DBO_5j', 'DCO': 'DCO', 'Ammonium': 'NH4', 'ammoniaque libre': 'NH3_libre', 'Nitrate': 'HNO3', 'Nitrite': 'HNO2', 'azote Kjeldahl': 'N_Kjdl', 'sulfures totaux': 'Sulfure_tot', 'sulfure(?:s)? (libre(?:s)?)': 'Sulfure_libre', 'calcium': 'Ca', 'potassium': 'K', 'magnésium': 'Mg', 'manganèse': 'Mn', 'sodium': "Na", 'fer': 'Fe', 'phosphore (total)': 'P_tot', 'phosphates (totaux)': 'Phosphate_tot', 'carbonate': 'CaCO3', 'bicarbonate': 'Bicarb', 'Phoshore': 'P', 'fer ((Fe))? total': 'Fe_tot', 'fer (2\+)': 'Fe2', 'fluorure(?:s)?': 'Fluorure', 'chlorures': 'Chlorure', 'chloroformes': 'Chloroforme', 'bromoformes': 'Bromoforme', 'bromure (libre)': 'Br_libre', 'Iph.': 'IPh', 'CN_NCl': 'CN_NCl', '2-naphtol': '2-Naphtol', 'thymol': 'Thymol', 'chloroforme': 'Chloroforme', 'bromoforme': 'Bromoforme', 'C12-C20': 'C12-C20', 'C20-C30': 'C20-C30', 'Non chloro destruct.':'Non_chloro_destr', 'SOM VROM 10':'HAP_tot_vrom','SOM EPA 16':'HAP_tot_EPA', 'SOM_C5_C35':'HC_tot_C15-C35', 'SOM_C10_C40':'HC_tot_C10-C40', 'SOM BTEX':'BTEX_tot','C5_C8':'C5-C8', 'C8_C10':'C8-C10', 'C10_C12':'C10-C12', 'C12_C16':'C12-C16', 'C30_C35':'C30-C35','Fraction   2000 µm':'Fract_2000µ','Fraction   63 µm':'Fract_63µ', 'Fraction   45 µm':'Fract_45µ','Fraction   16 µm':'Fract_16µ','Fraction   2 µm':'Fract_2µ', 'Fraction 2 mm':'Fract_2', 'Fraction +2 mm':'Fract_2+', 'Fract_2':'Fract_2', 'Fract_2+':'Fract_2+', 'Mat. organique':'MO', 'Mat. sèche':'MS', 'Argile':'Argile'}

In [33]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'Cyanure (libre)', 'Cyanure (totaux)', 'cyanure (APE)', 'fraction aromat. C6-C7', 'fraction aromat. C7-C8', 'fraction aromat. C8-C10', 'fraction aliphat. C6-C8', 'fraction aliphat. C8-C10', 'Fraction C5 - C8', 'Fraction C8 - C10', 'Fraction C16 - C21']


In [34]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [35]:
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [36]:
df['Type'] = 'Forage'

In [37]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 25 ; measure: 0 ; lithology: 0 ; analysis: 25 ; equipement: 0 ; unknown: 0 ; 


In [38]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 12 ; measure: 0 ; lithology: 0 ; analysis: 25 ; equipement: 0 ; unknown: 0


In [39]:
source_bh = bh
source_an = an

In [40]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/')
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknow.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknow.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 12 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 25 ; source_mes: 0


* **Sheet : 'Résult EAU'**

In [41]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Phase_2_Memoris/'
sheet='Result_eau'

In [42]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'Résultats phase 2_MEMORIS.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

Columns dropped :['Unnamed: 0', 'Unnamed: 2']

Rows : 138, columns : 17


interactive(children=(IntSlider(value=5, description='rows', max=138, min=5, readout=False), IntSlider(value=1…

In [43]:
ech_df=df.loc[:32]
an=df.loc[33:]

In [44]:
an.loc[0.5] = df.loc[1] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [45]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

[1;31mDouble columns' name found :[0;0m
[('DESCRIPTION SOMMAIRE', 2), ('Prof. arrêt du forage', 2)]


In [46]:
ech_df['CE']=ech_df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [47]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)

In [48]:
ech_df=dble_col_drop(ech_df)

column(s) dropped: ['21:DESCRIPTION SOMMAIRE', '27:Prof. arrêt du forage']


In [49]:
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

Columns dropped :['Nom du piézomètre', "Nature de l'étude (*)", 'Observations organoleptiques', 'LOCALISATION/AFFECTATION(S) - USAGE(S)', 'Numéro de maille', 'parcelle (selon dénom.)', "Type d'usage de fait actuel", "Type d'usage de fait futur", "Type d'usage utilisé pour comparer les résultats", 'Lieux de prélèvement', 'DESCRIPTION SOMMAIRE', 'Type de recouvrement de surface (**)', 'Sol en place (S : souillé, NS : non souillé)', 'Remblais de terre (S : souillé, NS : non souillé)', 'Remblais technique (S : souillé, NS : non souillé)', 'Déchet', 'PARAMETRES PHYSICO-CHIMIQUES ', 'ORP']



In [50]:
name=['ID_ech', 'Date_ech','Affectation','X', 'Y','Zsol', 'Long_for','Prof_crep','Long_pz_sol',
      'Niv_eau_sol','pH', 'CE', 'T']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df.insert(1,'Type_ech','Eau')

In [51]:
ech_df['ID_ech'].replace('\n', ' ', inplace=True, regex=True)

In [52]:
ech_df['Prof_crep'].replace('\[|\]','', regex=True, inplace=True)
for i in range(len(ech_df)):
    c=ech_df.loc[i,'Prof_crep']
    ech_df.loc[i,'Equip_top']=c.split('-')[0]
    ech_df.loc[i,'Equip_base']=c.split('-')[1]
    
ech_df.drop(columns=['Prof_crep'], inplace=True)
ech_df['Type_equip'] = 'Crepine'
ech_df['Type']='Piezo'

In [53]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [54]:
an=col_ren(an, 1)

[1;31mDouble columns' name found :[0;0m
[('Teneur mesurée', 2)]


In [55]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={'col_0':'ID_ech', 'col_43':'phénanthrène'}, inplace=True)

In [56]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [57]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

Columns dropped :['METAUX LOURDS', 'Chrome VI', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'PHENOLS', 'crésols (total)', 'CHLOROPHENOLS', '2-chlorophénol', 'monochlorophénol total', 'dichlorophénol total', '2,4,5-trichlorophénol', '2,4,6-trichlorophénol', 'trichlorophénol total', '2,3,4,6- tétrachlorophénol', 'tétrachlorophénol total', 'pentachlorophénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOCHLORES VOLATILS', 'CHLOROBENZENES', 'monochlorobenzène', '1,3-dichlorobenzène', '1,2-dichlorobenzène', '1,4-Dichlorobenzène', '1,2,4,5- et 1,2,3,5-tétrachlorobenzènes', 'pentachlorobenzène', 'hexachlorobenzène', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER', 'POLYCHLOROBIPHENYLS (PCB)', 'PCB totaux (7)', 'AUTRES ANALYSES ', 'Teneur mesurée (souligne/gras)', 'Teneur mesurée (gras/grisé)', "(*) CP = Campagne de prélèvements; Ex = Expertise; ES = Etude de sol; EO = Etude d'orientation; EC = Etude de caractérisation; SA = Suivi d'assainissement", '(**) ib : imperméable (bé

In [58]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'cyanure (APE)', 'Tétrachloroéthylène ', 'fraction aromat. C6-C7', 'fraction aromat. C7-C8', 'fraction aromat. C8-C10', 'fraction aliphat. C6-C8', 'fraction aliphat. C8-C10', 'Fraction C5 - C8', 'Fraction C8 - C10', 'Fraction C16 - C21']


In [59]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [60]:
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']

In [61]:
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx)

In [62]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 11 ; measure: 11 ; lithology: 0 ; analysis: 11 ; equipement: 11 ; unknown: 0 ; 

[1;32mNot used columns:[0;0m
 ['Affectation']


In [63]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 10 ; measure: 11 ; lithology: 0 ; analysis: 11 ; equipement: 11 ; unknown: 0


In [68]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [69]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Type_ech'], dist_max=1., drop_skip_col=['index'])

In [70]:
source_mes = mes
source_eqp = eqp

In [71]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/')
    
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknow.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknow.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 22 ; source_eqp: 11 ; source_uknw: 0 ; source_litho: 0 ; source_an: 36 ; source_mes: 11


#### ======================================================================================

In [72]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 0 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 0


## 9-Ensemble des résultats Memoris version Seafile.xls
* **Sheet : 'Résult SOL'**

In [73]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Memoris_seafile/'
sheet='Result_SOL'

In [74]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Ensemble des résultats Memoris version Seafile.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
dataframe_viewer(df, rows=5)

2 NaN lines dropped
Columns dropped :['Unnamed: 0', 'Unnamed: 2']

Rows : 138, columns : 66


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=138, min=5, readout=False), IntSlider(value=1…

In [75]:
ech_df=df.loc[:37]
an=df.loc[38:]

In [76]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [77]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [78]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

Columns dropped :['col_1', "Nom / description d'échantillon", "Nature de l'étude (*)", 'Terrain', 'Epaisseur de remblais', 'Epaisseur alluvions', "Nature de l'observation organoleptique", 'LOCALISATION/AFFECTATION(S) - USAGE(S)', 'zone', 'parcelle (selon dénom.)', "Type d'usage de fait actuel", "Type d'usage de fait futur", "Type d'usage utilisé pour comparer les résultats", 'Lieux de prélèvement', 'X Lambert', 'Y Lambert', 'DESCRIPTION SOMMAIRE', 'Type de recouvrement de surface (***)', 'Sol en place (S : souillé, NS : non souillé)', 'Remblais de terre (S : souillé, NS : non souillé)', 'Remblais technique (S : souillé, NS : non souillé)', 'Déchet', 'Matières organiques', 'GRANULOMETRIE', 'Fraction argileuse']



In [79]:
ech_df.drop(columns=ech_df.columns[[-3,-4]], axis=1, inplace=True)

In [80]:
name=['ID_ech', 'Date_ech', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
ech_df=col_ren(ech_df, name=name, mode=1)

In [81]:
set(ech_df['Description'])

{'L', 'LA', 'LS', 'R', 'R '}

In [82]:
for i in range(len(ech_df['Description'])):
    x = ech_df.loc[i,'Description']
    if x=='R' or x=='R ': ech_df.loc[i,'Description']='Remblais'
    elif x=='L': ech_df.loc[i,'Description']='Limons'
    elif x=='LA': ech_df.loc[i,'Description']='Limons et argiles'
    elif x=='LS': ech_df.loc[i,'Description']='Limons et sables'

ech_df['Refus']=ech_df['Refus'].apply(lambda x: 'x' if not re.search('#', str(x)) else '')
ech_df.insert(1,'Type_ech','Sol')

In [83]:
dataframe_viewer(ech_df, rows=3)

Rows : 60, columns : 12


interactive(children=(IntSlider(value=3, description='rows', max=60, min=3, readout=False), IntSlider(value=12…

In [84]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [85]:
an=col_ren(an, 1)

In [87]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)

In [88]:
an=dble_col_drop(an)

column(s) dropped: []


In [89]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)

Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'BTEX totaux', 'PHENOLS', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOHALOGENES VOLATILS', 'col_63', 'EOX (****)', 'HYDROCARBURES TOTAUX', 'Hydrocarbures totaux C10-C40', 'METHYL-TERT-BUTYL-ETHER', 'POLYCHLOROBIPHENYL (PCB)', 'PCB 28', 'PCB 52', 'PCB 101', 'PCB 118', 'PCB 138', 'PCB 153', 'PCB 180', 'PCB totaux (7)', 'F4/2M*', 'Teneur mesurée', 'Teneur mesurée (souligne/gras)', 'Teneur mesurée (gras/grisé)', "(*) CP = Campagne de prélèvements; Ex = Expertise; ES = Etude de sol; EO = Etude d'orientation; EC = Etude de caractérisation; SA = Suivi d'assainissement", '(**) L: limon, A: Argile, S: Sable, R: Remblai', '(***) ib : imperméable (béton) ; ih : imperméable hydrocarboné ; p : perméable (gravier, fissuré,…) ; tvh : terres végétation haute ; tvb : terres végétation basse ', '(****) 3 mg/kg = Seuil limite défini dans le GREO ', "(1) l'échantillon n'a pas pu être extrait ni 

In [90]:
an.columns

Index(['col_0', 'Arsenic', 'Cadmium', 'Chrome', 'Chrome VI', 'Cuivre',
       'Mercure', 'Plomb', 'Nickel', 'Zinc', 'Cyanure (libre)',
       'Cyanure (totaux)', 'cyanure (APE)', 'cyanure complex', 'thiocyanate',
       'Benzène', 'Toluène', 'Éthylbenzène', 'Orthoxylène',
       'Para- et métaxylène', 'Xylènes', 'Styrène', 'Phénol', 'Naphtalène',
       'Acénaphtylène', 'Acénaphtène', 'Fluorène', 'col_34', 'Anthracène',
       'Fluoranthène', 'Pyrène', 'Benzo(a)anthracène', 'Chrysène',
       'Benzo(b)fluoranthène', 'Benzo(k)fluoranthène', 'Benzo(a)pyrène',
       'Dibenzo(ah)anthracène', 'Benzo(ghi)pérylène', 'Indéno(1,2,3-cd)pyrène',
       'HAP Totaux (16) - EPA', '1,1-Dichloroéthane', '1,2-Dichloroéthane',
       '1,1-dichloroéthène', 'Cis-1,2-dichloroéthène',
       'Trans 1,2-dichloroéthylène', 'Dichlorométhane',
       'Totaux (cis,trans) 1,2-dichloroéthènes', '1,2-dichloropropane',
       'Tétrachloroéthylène', 'Tétrachlorométhane', '1,1,1-Trichloroéthane',
       '1,1,2-Trichl

In [91]:
an.rename(columns={'col_0':'ID_ech', 'col_34':'phénanthrène'}, inplace=True)
an.insert(1,'Type_ech','Sol')

In [92]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'Cyanure (libre)', 'Cyanure (totaux)', 'cyanure (APE)', 'Fraction C5 - C8', 'Fraction C8 - C10', 'Fraction C16 - C21']


In [95]:
dataframe_viewer(ech_df, rows=5) 

Rows : 60, columns : 12


interactive(children=(IntSlider(value=5, description='rows', max=60, min=5, readout=False), IntSlider(value=12…

In [96]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [97]:
mdf['Type'] = 'Forage'

In [98]:
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx)

In [99]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 60 ; measure: 0 ; lithology: 0 ; analysis: 60 ; equipement: 0 ; unknown: 0 ; 


In [100]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 46 ; measure: 0 ; lithology: 0 ; analysis: 60 ; equipement: 0 ; unknown: 0


In [101]:
source_bh = bh
source_an = an

In [102]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/')
    
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknow.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
#source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknow.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 46 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 60 ; source_mes: 0


* **Sheet : 'Résult EAU'**

In [148]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Memoris_seafile/'
sheet='Result_eau'

In [149]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Ensemble des résultats Memoris version Seafile.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

4 NaN lines dropped
Columns dropped :['Unnamed: 0', 'Unnamed: 2']

Rows : 154, columns : 51


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=154, min=5, readout=False), IntSlider(value=1…

In [150]:
ech_df=df.loc[:32]
an=df.loc[33:]

In [151]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [152]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

[1;31mDouble columns' name found :[0;0m
[('DESCRIPTION SOMMAIRE', 2), ('Prof. arrêt du forage', 2)]


In [153]:
ech_df['CE']=ech_df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [154]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)

In [155]:
ech_df=dble_col_drop(ech_df)

column(s) dropped: ['21:DESCRIPTION SOMMAIRE', '27:Prof. arrêt du forage']


In [156]:
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

Columns dropped :['col_1', "Nature de l'étude (*)", 'Observations organoleptiques', 'LOCALISATION/AFFECTATION(S) - USAGE(S)', 'parcelle (selon dénom.)', "Type d'usage de fait actuel", "Type d'usage de fait futur", "Type d'usage utilisé pour comparer les résultats", 'Lieux de prélèvement', 'DESCRIPTION SOMMAIRE', 'Type de recouvrement de surface (**)', 'Sol en place (S : souillé, NS : non souillé)', 'Remblais de terre (S : souillé, NS : non souillé)', 'Remblais technique (S : souillé, NS : non souillé)', 'Déchet', 'PARAMETRES PHYSICO-CHIMIQUES ', 'ORP']



In [157]:
ech_df.drop(columns=ech_df.columns[[2]], axis=2, inplace=True)

In [158]:
name=['ID_ech', 'Date_ech','Affectation','X', 'Y','Zsol', 'Long_for','Prof_crep','Long_pz_sol', 
      'Niv_eau_sol','pH', 'CE', 'T']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df.insert(1,'Type_ech','Eau')

In [159]:
ech_df['Prof_crep'].replace('\[|\]','', regex=True, inplace=True)
for i in range(len(ech_df)):
    c=ech_df.loc[i,'Prof_crep']
    ech_df.loc[i,'Equip_top']=c.split('-')[0]
    ech_df.loc[i,'Equip_base']=c.split('-')[1]
    
ech_df.drop(columns=['Prof_crep'], inplace=True)
ech_df['Type_equip'] = 'Crepine'

In [160]:
ech_df['ID_ech'].replace('\n', ' ', inplace=True, regex=True)

In [162]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [163]:
an=col_ren(an, 1)

[1;31mDouble columns' name found :[0;0m
[('nitrite', 2), ('Teneur mesurée', 2), ('nitrate', 2), ('ammonium', 2)]


In [164]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech','Température pour mes. pH':'Temp_pH' 'col_43':'phénanthrène'}, inplace=True)

In [165]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [166]:
an=dble_col_drop(an)

column(s) dropped: ['104:nitrite', '106:nitrate', '112:ammonium', '117:Teneur mesurée']


In [167]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'PHENOLS', 'crésols (total)', 'CHLOROPHENOLS', '2-chlorophénol', 'monochlorophénol total', 'dichlorophénol total', '2,4,5-trichlorophénol', '2,4,6-trichlorophénol', 'trichlorophénol total', '2,3,4,6- tétrachlorophénol', 'tétrachlorophénol total', 'pentachlorophénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOCHLORES VOLATILS', 'CHLOROBENZENES', 'monochlorobenzène', '1,3-dichlorobenzène', '1,2-dichlorobenzène', '1,4-Dichlorobenzène', '1,2,4,5- et 1,2,3,5-tétrachlorobenzènes', 'pentachlorobenzène', 'hexachlorobenzène', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER', 'POLYCHLOROBIPHENYLS (PCB)', 'PCB totaux (7)', 'AUTRES ANALYSES ', 'azote Kjeldahl', 'COMPOSES INORGANIQUES ', 'sulfures totaux', 'Teneur mesurée (souligne/gras)', 'Teneur mesurée (gras/grisé)', "(*) CP = Campagne de prélèvements; Ex = Expertise; ES = Etude de sol; EO = Etude d'orientation; EC = Etude de caractérisation; SA = S

In [168]:
an['ID_ech'].replace('\n', ' ', inplace=True, regex=True)

In [170]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'cyanure (APE)', 'Tétrachloroéthylène ', 'fraction aromat. C6-C7', 'fraction aromat. C7-C8', 'fraction aromat. C8-C10', 'fraction aliphat. C6-C8', 'fraction aliphat. C8-C10', 'Fraction C5 - C8', 'Fraction C8 - C10', 'Fraction C16 - C21', 'pH', 'Température pour mes. pH', 'sulfites', 'sulfate', 'sulfures (libre)']


In [172]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [173]:
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']

In [211]:
df = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [212]:
dataframe_viewer(df, rows=3, un_val=['ID','ID_ech'])

Rows : 45, columns : 96, Unique values on cols: {'ID': 30, 'ID_ech': 45}


interactive(children=(IntSlider(value=3, description='rows', max=45, min=3, readout=False), IntSlider(value=12…

In [213]:
df_dict = data_slicer(df, cols_dict, crit_dict)

borehole: 45 ; measure: 45 ; lithology: 0 ; analysis: 45 ; equipement: 45 ; unknown: 0 ; 

[1;32mNot used columns:[0;0m
 ['Température pour mes. pH', 'Affectation']


In [214]:
ukw = df_dict['unknown']
bh = df_dict['borehole']

bh = bh.drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
if 'X' in bh.columns: bh = bh.query('ID==ID and X==X')
bh.reset_index(drop=True, inplace=True)

mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 24 ; measure: 45 ; lithology: 0 ; analysis: 45 ; equipement: 45 ; unknown: 0


In [216]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [217]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Type_ech'], dist_max=1., drop_skip_col=['index'])

In [218]:
source_mes = mes
source_eqp = eqp

In [219]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/')
    
eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknow.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknow.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 70 ; source_eqp: 45 ; source_uknw: 0 ; source_litho: 0 ; source_an: 105 ; source_mes: 45


#### ======================================================================================

In [None]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 10-Résultats SOL container phyto t=0_décret sol.xls
* **Sheet : 'Résult SOL'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Container_phyto/'
sheet='Result_SOL'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Résultats SOL container phyto t=0_décret sol.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:21]
an=df.loc[22:]

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,2)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df.drop(columns=ech_df.columns[[-3]], axis=1, inplace=True)

In [None]:
name=['ID_ech', 'Ech_top', 'Ech_base','MS','Date_ech','Fract_2','Fract_2+']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df=ech_df.query('ID_ech==ID_ech')
ech_df.insert(1,'Type_ech','Sol')

In [None]:
dataframe_viewer(ech_df, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [587]:
an=col_ren(an, 1)

[1;31mDouble columns' name found :[0;0m
[('0.1', 6), ('1', 2)]


In [588]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [589]:
an=dble_col_drop(an)

column(s) dropped: ['3:ID_ech', '4:0.1', '5:0.1', '6:0.1', '7:0.1', '8:0.1']


In [590]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')

In [591]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'Eau', '0.1']


In [592]:
dataframe_viewer(an, rows=5) 

Rows : 63, columns : 4


interactive(children=(IntSlider(value=5, description='rows', max=63, min=5, readout=False), IntSlider(value=4,…

In [593]:
source_ech_df=ech_df
source_an=an

In [594]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

NameError: name 'source_mes_pz' is not defined

* **Sheet : 'Paramètres agro.'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Container_phyto/'
sheet='Param_agro'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Résultats SOL container phyto t=0_décret sol.xls', sheet_name='Paramètres agro.', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df=df.transpose()
df.reset_index(drop=True, inplace=True)
df=col_ren(df, 0)

In [None]:
df.drop(list(range(1)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df=dble_col_drop(df)

In [None]:
df=na_col_drop(df,1)
df=na_line_drop(df,3)
df.reset_index(drop=True, inplace=True)

In [None]:
df.drop(columns=df.columns[[5,6]], axis=2, inplace=True)

In [None]:
df.columns

In [None]:
name=['ID_ech','Ech_top','Ech_base','MS','Date_ech','MO','Residu_perte_feu','COT','Fract_arg','Fract_min_2µ', 
      'Fract_min_50µ', 'Fract_min_2', 'Fract_2', 'Fract_2+', 'pH_KCl','Tem_pH_mes', 'pH_H20', 'sulfures_tot', 
      'chlorures', 'azote_Kjeldahl']
df=col_ren(df, name=name, mode=1)
df.insert(1,'Type_ech','Sol')

In [None]:
ech_df=df

In [None]:
dataframe_viewer(ech_df, rows=5)

In [None]:
data_merger(source_ech_df, ech_df, on='ID_ech', how='outer', )[0]

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

#### ======================================================================================

In [None]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 11-Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx
* **Sheet : 'Résult EAU'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Ext_Pilote/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='Résult EAU', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:31]
an=df.loc[list(range(0,4))+list(range(32, len(df)))]

In [None]:
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,2)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
name=['ID_ech', 'Periode', 'Emplacement','Date_ech','pH','Temp_prv','Temp_pH']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df=ech_df.query('ID_ech==ID_ech')
ech_df.insert(1,'Type_ech','Eau')

In [None]:
dataframe_viewer(ech_df, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an=dble_col_drop(an)

In [None]:
# rename in a first time before dropping columns (because of names like 'col_xx' in columns)
name=['ID_ech', 'Periode', 'Emplacement','Date_ech', 'METAUX LOURDS', 'Arsenic', 'Cadmium', 'Chrome', 'Cuivre',
      'Mercure', 'Plomb', 'Nickel', 'Zinc', 
      'CYANURES',"Cyanures (libres)  -  NEN-EN-ISO 14403", "CN_totaux - NEN-EN-ISO 14403", 
      "cyanure (APE) - méthode basée sur EPA 335.3", "cyanure complex - méthode interne ", 
      "thiocyanate - méthode interne", "COMPOSES AROMATIQUES VOLATILS", "Benzène", "Toluène", "Éthylbenzène",
      "Orthoxylène", "Para- et métaxylène", "Xylènes", "Styrène", "PHENOLS", "Phénol", "Indice phénol", 
      "HYDROCARBURES AROMATIQUES POLYCYCLIQUES", "Naphtalène", "Acénaphtylène", "Acénaphtène", "Fluorène", 
      "Phénanthrène", "Anthracène", "Fluoranthène", "Pyrène", "Benzo(a)anthracène", "Chrysène", 
      "Benzo(b)fluoranthène", "Benzo(k)fluoranthène", "Benzo(a)pyrène", "Dibenzo(ah)anthracène", 
      "Benzo(ghi)pérylène", "Indéno(1,2,3-cd)pyrène", "HAP Totaux (16) - EPA", 
      "COMPOSES ORGANOCHLORES VOLATILS", "1,1-Dichloroéthane", "1,2-Dichloroéthane", 
      "1,1-dichloroéthène", "Cis-1,2-dichloroéthène", "Totaux (cis,trans) 1,2-dichloroéthènes", 
      "Trans 1,2-dichloroéthylène", "Dichlorométhane", "1,2-dichloropropane", "Tétrachloroéthylène ", 
      "Tétrachlorométhane", "1,1,1-Trichloroéthane", "1,1,2-Trichloroéthane", "Trichloroéthylène",
      "Chloroforme", "Chlorure de vinyle", "HYDROCARBURES TOTAUX", "fraction aromat. >C6-C7", 
      "fraction aromat. >C7-C8", "fraction aromat. >C8-C10", "fraction aliphat. C5-C6", "fraction aliphat. >C6-C8",
      "fraction aliphat. >C8-C10", "Fraction C5 - C8", "Fraction C8 - C10", "Fraction C10-C12", "Fraction C12-C16",
      "Fraction C16 - C21", "Fraction C21 - C35", "Hydrocarbures totaux C10-C35", "METHYL-TERT-BUTYL-ETHER", "MTBE",
     "a","b","c","d","e","f","g"]

an=col_ren(an, name=name, mode=1)
an=an.iloc[:,:-7]

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
dataframe_viewer(an, rows=3)

In [None]:
source_ech_df=ech_df
source_an=an

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Param physico'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Ext_Pilote/'
sheet='Param_physico'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='param. physico', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df=df.transpose()
df.reset_index(drop=True, inplace=True)

In [None]:
df=col_ren(df, 1)

In [None]:
df.drop(list(range(2)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
sdf=df.iloc[:,:33]
df=df.iloc[:,34:]

In [None]:
df=dble_col_drop(df)
sdf=dble_col_drop(sdf)

In [None]:
df=na_line_drop(df,1)
sdf=na_line_drop(sdf,1)

In [None]:
df=na_col_drop(df,1)
sdf=na_col_drop(sdf,1)

In [None]:
name=['ID_ech','Periode','Emplacement','Date_ech','Niv_eau_pz','Long_pz','Temp_prv ','pH', 'CE', 'ORP','O_diss']
df=col_ren(df, mode=1, name=name)

In [None]:
sdf=sdf.iloc[:,:-1]
name=['ID_ech','Periode','Emplacement','Date_ech','Niv_eau_pz','Niv_eau_chbre','pH','Niv_eau_sol','Long_pz',
      'Temp_prv ','CE','ORP','O_diss']
sdf=col_ren(sdf, mode=1, name=name)

In [None]:
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)
sdf['CE']=sdf['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
sdf['Periode'].replace('\n',' ', regex=True, inplace=True)
sdf.replace('\n','', regex=True, inplace=True)

In [None]:
data=[df, sdf]
for d in data:
    d['Rmq']=''
    for i in range(len(d['ID_ech'])):
        e=str(d.loc[i, 'Emplacement'])
        n=str(d.loc[i, 'ID_ech'])
        d.loc[i,'ID_ech']=n.replace('*', '')
        
        if re.match('S',e, re.I): 
            d.loc[i,'Emplacement']='Simulateur'
        elif re.match('HZS',e, re.I): 
            d.loc[i,'Emplacement']='Hors simulateur'
        else:
            d.loc[i,'Emplacement']=np.nan
        
        if re.match('\d+\*{1}$',n, re.I): 
            d.loc[i,'Rmq']="mesures faites dans un seau (débit non continu ou peu de débit)"
        elif re.match('\d+\*{2}$',n, re.I): 
            d.loc[i,'Rmq']="mésures faites dans une eau quasi-stagnante (Piezo rempli de sédiment et débit très faible)"

In [None]:
df.insert(1, 'Type_ech', 'Eau')
sdf.insert(1, 'Type_ech', 'Eau')

In [None]:
ech_df=data_merger(sdf, df, 'outer', 'ID_ech')[0]

In [None]:
ech_df=na_col_drop(df,2)
ech_df=na_line_drop(df,1)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
for i in ech_df.index:
    if not pd.isnull(ech_df.loc[i, 'Emplacement']):
        val = ech_df.loc[i, 'Emplacement']
    else:
        ech_df.loc[i, 'Emplacement'] = val

In [None]:
dataframe_viewer(ech_df, rows=3)

In [None]:
source_ech_df, conflict_df = data_merger(source_ech_df, ech_df, on=['ID_ech', 'Date_ech'], how='outer')

In [None]:
dataframe_viewer(conflict_df, rows=3)

In [None]:
data_validation(overall_data=source_ech_df, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Emplacement_x':list(conflict_df.index), 'pH_y':list(conflict_df.index), 
                           'Periode_x':list(conflict_df.index)})

In [595]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

NameError: name 'source_mes_pz' is not defined

* **Sheet : 'Inorganiques et composés majeurs'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Ext_Pilote/'
sheet='Inorganic_major'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='inorganiques et composés majeur', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:21]
an=df.loc[list(range(0,4))+list(range(22, len(df)))]

In [None]:
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(2)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,2)
ech_df=na_line_drop(ech_df,2)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df.columns

In [596]:
name=['ID_ech', 'Periode', 'Emplacement','Date_ech','Temp_prv']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df=ech_df.query('ID_ech==ID_ech')
ech_df.insert(1,'Type_ech','Eau')

TypeError: Error! names list length and columns length are not the same.

In [None]:
dataframe_viewer(ech_df, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [597]:
an=col_ren(an, 1)

In [598]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [599]:
an=dble_col_drop(an)

column(s) dropped: []


In [600]:
an=na_col_drop(an,3)

In [601]:
an.columns

Index(['ID_ech', 'Sol', 'Eau', '0.06'], dtype='object')

In [602]:
an.rename(columns={'ammoniaque - libre':'ammoniaque libre'}, inplace=True)

In [603]:
an.drop(list(range(2)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [604]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)
an.rename(columns={'Période ':'Periode', 'Date de prélèvement':'Date_ech'}, inplace=True)
#an=an.iloc[:,:-7]


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'Sol', 'Eau', '0.06']


In [605]:
dataframe_viewer(an, rows=3)

Rows : 60, columns : 5


interactive(children=(IntSlider(value=3, description='rows', max=60, min=3, readout=False), IntSlider(value=5,…

In [606]:
dataframe_viewer(source_ech_df, rows=3)

Rows : 11, columns : 10


interactive(children=(IntSlider(value=3, description='rows', max=11, min=3, readout=False), IntSlider(value=10…

In [607]:
source_ech_df.Date_ech = source_ech_df.Date_ech.astype(object)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [608]:
source_ech_df, conflict_df=data_merger(source_ech_df, ech_df, 'outer', ['ID_ech', 'Date_ech'])

In [609]:
data_validation(overall_data=source_ech_df, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Periode_y':list(conflict_df.index), 'Emplacement_y':list(conflict_df.index)})

KeyError: 'Check_col'

In [None]:
source_an, conflict_df=data_merger(source_an, an, 'outer', ['ID_ech', 'Date_ech'])

In [None]:
source_an.Date_ech = source_an.Date_ech.astype(object)

In [None]:
dataframe_viewer(source_an, rows=5, cols=13), dataframe_viewer(source_ech_df, rows=5, cols=13)

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

### $\color{red}{\textbf{Excel source data merging}}$

In [None]:
excel_bh_water_an, conflict_df = data_merger(source_an, source_ech_df, how='outer', on=['ID_ech', 'Date_ech'], drop_skip_col=['index'])

In [None]:
save_dir = tmp_dir + 'source_merge/final_'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

#excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
#excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
#excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
#excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
#excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
#excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

#### ======================================================================================

In [None]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 12-Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx
* **Sheet : 'Résult EAU'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Pilote/'
sheet='Result_eau'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='Résult EAU', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:32]
an=df.loc[list(range(0,4))+list(range(33, len(df)))]

In [None]:
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(3)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,2)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df.columns

In [None]:
name=['ID_ech', 'Periode', 'Emplacement','Date_ech','Niv_eau_pz','Niv_eau_chbre','pH','Temp_prv','CE','ORP',
      'O_diss','col_29','Temp_pH']
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df=ech_df.query('ID_ech==ID_ech')
ech_df.insert(1,'Type_ech','Eau')

In [None]:
ech_df.drop(columns=['col_29'], inplace=True)
ech_df['CE']=ech_df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
ech_df['Periode'].replace('\n',' ', regex=True, inplace=True)
ech_df.replace('\n','', regex=True, inplace=True)

In [None]:
data=[ech_df]
for d in data:
    for i in range(len(d['ID_ech'])):
        e=str(d.loc[i, 'Emplacement'])        
        if re.match('P',e, re.I): 
            d.loc[i,'Emplacement']='Pilote'
        elif re.match('HZP',e, re.I): 
            d.loc[i,'Emplacement']='Hors Pilote'
        else:
            d.loc[i,'Emplacement']=np.nan

In [610]:
dataframe_viewer(ech_df, rows=3)

Rows : 11, columns : 10


interactive(children=(IntSlider(value=3, description='rows', max=11, min=3, readout=False), IntSlider(value=10…

In [611]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [612]:
an=col_ren(an, 1)

[1;31mDouble columns' name found :[0;0m
[('Eau', 60)]


In [613]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [614]:
an=dble_col_drop(an)

column(s) dropped: ['1:ID_ech', '2:ID_ech', '3:ID_ech', '4:ID_ech', '5:ID_ech', '6:ID_ech', '7:ID_ech', '8:ID_ech', '9:ID_ech', '10:ID_ech', '11:ID_ech', '12:ID_ech', '13:ID_ech', '14:ID_ech', '15:ID_ech', '16:ID_ech', '17:ID_ech', '18:ID_ech', '19:ID_ech', '20:ID_ech', '21:ID_ech', '22:ID_ech', '23:ID_ech', '24:ID_ech', '25:ID_ech', '26:ID_ech', '27:ID_ech', '28:ID_ech', '29:ID_ech', '30:ID_ech', '31:ID_ech', '32:ID_ech', '33:ID_ech', '34:ID_ech', '35:ID_ech', '36:ID_ech', '37:ID_ech', '38:ID_ech', '39:ID_ech', '40:ID_ech', '41:ID_ech', '42:ID_ech', '43:ID_ech', '44:ID_ech', '45:ID_ech', '46:ID_ech', '47:ID_ech', '48:ID_ech', '49:ID_ech', '50:ID_ech', '51:ID_ech', '52:ID_ech', '53:ID_ech', '54:ID_ech', '55:ID_ech', '56:ID_ech', '57:ID_ech', '58:ID_ech', '59:ID_ech']


In [615]:
an.rename(columns={'Période ':'Periode', 'Emplacement \n- P : Pilote \n- HZP : Hors zone pilote':'Emplacement',
                  'Date de prélèvement':'Date_ech'}, inplace=True)

In [616]:
# rename in a first time before dropping columns (because of names like 'col_xx' in columns)
name=['ID_ech', 'Periode', 'Emplacement', 'Date_ech', 'METAUX LOURDS', 'Arsenic', 'Cadmium', 'Chrome', 'Cuivre','Mercure', 'Plomb', 'Nickel', 'Zinc', 
      'CYANURES',"Cyanures (libres)  -  NEN-EN-ISO 14403", "CN_totaux - NEN-EN-ISO 14403", 
      "cyanure (APE) - méthode basée sur EPA 335.3", "cyanure complex - méthode interne ", 
      "thiocyanate - méthode interne", "COMPOSES AROMATIQUES VOLATILS", "Benzène", "Toluène", "Éthylbenzène",
      "Orthoxylène", "Para- et métaxylène", "Xylènes", "Styrène", "PHENOLS", "Phénol", "Indice phénol", 
      "HYDROCARBURES AROMATIQUES POLYCYCLIQUES", "Naphtalène", "Acénaphtylène", "Acénaphtène", "Fluorène", 
      "Phénanthrène", "Anthracène", "Fluoranthène", "Pyrène", "Benzo(a)anthracène", "Chrysène", 
      "Benzo(b)fluoranthène", "Benzo(k)fluoranthène", "Benzo(a)pyrène", "Dibenzo(ah)anthracène", 
      "Benzo(ghi)pérylène", "Indéno(1,2,3-cd)pyrène", "HAP Totaux (16) - EPA", 
      "COMPOSES ORGANOCHLORES VOLATILS", "1,1-Dichloroéthane", "1,2-Dichloroéthane", 
      "1,1-dichloroéthène", "Cis-1,2-dichloroéthène", "Totaux (cis,trans) 1,2-dichloroéthènes", 
      "Trans 1,2-dichloroéthylène", "Dichlorométhane", "1,2-dichloropropane", "Tétrachloroéthylène ", 
      "Tétrachlorométhane", "1,1,1-Trichloroéthane", "1,1,2-Trichloroéthane", "Trichloroéthylène",
      "Chloroforme", "Chlorure de vinyle", "HYDROCARBURES TOTAUX", "fraction aromat. >C6-C7", 
      "fraction aromat. >C7-C8", "fraction aromat. >C8-C10", "fraction aliphat. C5-C6", "fraction aliphat. >C6-C8",
      "fraction aliphat. >C8-C10", "Fraction C5 - C8", "Fraction C8 - C10", "Fraction C10-C12", "Fraction C12-C16",
      "Fraction C16 - C21", "Fraction C21 - C35", "Hydrocarbures totaux C10-C35", "METHYL-TERT-BUTYL-ETHER", "MTBE",
     "a","b","c","d","e","f","g","h"]

In [617]:
an=col_ren(an, name=name, mode=1)
an=an.iloc[:,:-8]

TypeError: Error! names list length and columns length are not the same.

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
an['Periode'].replace('\n',' ', regex=True, inplace=True)
an.replace('\n','', regex=True, inplace=True)

In [None]:
data=[an]
for d in data:
    for i in range(len(d['ID_ech'])):
        e=str(d.loc[i, 'Emplacement'])        
        if re.match('P',e, re.I): 
            d.loc[i,'Emplacement']='Pilote'
        elif re.match('HZP',e, re.I): 
            d.loc[i,'Emplacement']='Hors Pilote'
        else:
            d.loc[i,'Emplacement']=np.nan

In [None]:
dataframe_viewer(an, rows=3)

In [None]:
data = [ech_df, an]
for d in data:
    print('-------------')
    for i, r in d.iterrows():
        for c in d.columns:
            if c not in ['ID_ech', 'Type_ech', 'Periode', 'Emplacement', 'Date_ech'] and \
            str(type(r[c])) not in ["<class 'float'>", "<class 'int'>"]:
                d.loc[i, c] = np.nan
                #print(f'{i}- {str(type(r[c]))}- {c} : {r[c]}')

In [None]:
source_ech_df=ech_df
source_an=an

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Param physico'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Pilote/'
sheet='Param_physico'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='param. physico', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df=df.transpose()
df.reset_index(drop=True, inplace=True)

In [None]:
df=col_ren(df, 1)

In [None]:
df.drop(list(range(2)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
sdf=df.iloc[:,:33]
df=df.iloc[:,34:]

In [None]:
df=dble_col_drop(df)
sdf=dble_col_drop(sdf)

In [None]:
df=na_line_drop(df,1)
sdf=na_line_drop(sdf,1)

In [None]:
df=na_col_drop(df,1)
sdf=na_col_drop(sdf,1)

In [None]:
df=df.iloc[:,:-1]
name=['ID_ech','Periode','Emplacement','Date_ech','Niv_eau_pz','Long_pz','Temp_prv ','pH', 'CE', 'ORP','O_diss']
df=col_ren(df, mode=1, name=name)

In [None]:
sdf.drop(columns=['col_29'], inplace=True)
name=['ID_ech','Periode','Emplacement','Date_ech','Niv_eau_pz','Long_pz','pH','Niv_eau_sol','Temp_prv ','CE',
      'ORP','O_diss','Temp_pH']
sdf=col_ren(sdf, mode=1, name=name)

In [None]:
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)
sdf['CE']=sdf['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
sdf['Periode'].replace('\n',' ', regex=True, inplace=True)
sdf.replace('\n','', regex=True, inplace=True)
sdf.drop(columns=["Niv_eau_sol"], inplace=True)

In [None]:
set(sdf['Emplacement'])

In [None]:
data=[df, sdf]
for d in data:
    for i in range(len(d['ID_ech'])):
        e=str(d.loc[i, 'Emplacement'])
        
        if re.match('P',e, re.I): 
            d.loc[i,'Emplacement']='Pilote'
        elif re.match('HZP',e, re.I): 
            d.loc[i,'Emplacement']='Hors Pilote'
        else:
            d.loc[i,'Emplacement']=np.nan

In [None]:
df.insert(1, 'Type_ech', 'Eau')
sdf.insert(1, 'Type_ech', 'Eau')

In [None]:
df.replace('\*|à compléter',np.nan, inplace=True, regex=True)

In [None]:
ech_df, conflict_df = data_merger(sdf, df, 'outer', 'ID_ech')

In [None]:
source_ech_df, conflict_df=data_merger(source_ech_df, ech_df, on=['ID_ech', 'Date_ech', 'Periode'], how='outer')

In [None]:
dataframe_viewer(source_ech_df, rows=3)

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Inorganiques et composés majeurs'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Pilote/'
sheet='Inorganic_major'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='inorganiques et composés majeur', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:21]
an=df.loc[list(range(0,4))+list(range(22, len(df)))]

In [None]:
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(2)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,2)
ech_df=na_line_drop(ech_df,2)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df.columns

In [None]:
name=['ID_ech', 'Periode', 'Emplacement','Date_ech','Temp_prv']
ech_df.replace(r'\n',' ', inplace=True, regex=True)
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df=ech_df.query('ID_ech==ID_ech')
ech_df.insert(1,'Type_ech','Eau')

In [None]:
dataframe_viewer(ech_df, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an=dble_col_drop(an)

In [None]:
an=na_col_drop(an,3)

In [None]:
an.rename(columns={'Période ':'Periode', 'Emplacement \n- S : Simulateur \n- HZS : Hors zone simulateur':'Emplacement',
                  'Date de prélèvement':'Date_ech', 'col_9':'ammoniaque libre'}, inplace=True)

In [None]:
an.drop(list(range(2)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)
#an=an.iloc[:,:-7]

In [None]:
data=[ech_df, an]
for d in data:
    for i in range(len(d['ID_ech'])):
        e=str(d.loc[i, 'Emplacement'])
        
        if re.match('S',e, re.I): 
            d.loc[i,'Emplacement']='Simulateur'
        elif re.match('HZS',e, re.I): 
            d.loc[i,'Emplacement']='Hors simulateur'
        else:
            d.loc[i,'Emplacement']=np.nan

In [None]:
data = [ech_df, an]
for d in data:
    print('-------------')
    for i, r in d.iterrows():
        for c in d.columns:
            if c not in ['ID', 'ID_ech', 'Type', 'Type_ech', 'Periode', 'Emplacement', 'Date_ech'] and \
            str(type(r[c])) not in ["<class 'float'>", "<class 'int'>"]:
                d.loc[i, c] = np.nan
                #print(f'{i}- {str(type(r[c]))}- {c} : {r[c]}')

In [None]:
dataframe_viewer(an, rows=3)

In [None]:
source_ech_df.Date_ech = source_ech_df.Date_ech.astype(object)

In [None]:
source_ech_df, conflict_df=data_merger(source_ech_df, ech_df, 'outer', ['ID_ech', 'Date_ech'] )

In [None]:
data_validation(overall_data=source_ech_df, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Periode_y':list(conflict_df.index), 'Temp_prv_y':list(conflict_df.index), 
                            'Emplacement_y':list(conflict_df.index)})

In [None]:
source_an, conflict_df=data_merger(source_an, an, 'outer', ['ID_ech', 'Date_ech', 'Periode'])

In [None]:
data_validation(overall_data=source_an, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Emplacement_y':list(conflict_df.index)})

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

### $\color{red}{\textbf{Excel source data merging}}$

In [None]:
excel_bh_water_an, conflict_df = data_merger(source_an, source_ech_df, how='outer', on=['ID_ech', 'Date_ech', 'Periode'],
                                             drop_skip_col=['index'])

In [None]:
dataframe_viewer(excel_bh_water_an, rows=5)

In [None]:
save_dir = tmp_dir + 'source_merge/final_'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

#excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
#excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
#excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
#excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
#excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
#excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

#### ======================================================================================

In [None]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 13-Resultats_Siterem_SOL.xlsx
* **Sheet : 'Résult SOL ext. pilote'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Result_Sol/'
sheet='Result_sol_ExtP'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_SOL.xlsx', 
                   sheet_name='Résult SOL ext. pilote', skiprows=5)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:22]
an=df.loc[23:]

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(3)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
dataframe_viewer(ech_df, rows=3)

In [None]:
ech_df=ech_df[:-1]
ech_df.drop(columns=['broyage'], inplace=True)

In [None]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_ech','Long_for','Refus','Description','MO','COT','pH_KCl', 
      'Temp_pH','pH_H20','Fract_2','Fract_2+', 'Fract_min_2µ','Fract_min_50µ','Fract_min_2']
ech_df=col_ren(ech_df, name=name, mode=1)

In [None]:
set(ech_df.Description)

In [None]:
for i in range(len(ech_df['Description'])):
    x = ech_df.loc[i,'Description']
    if x in ['R','R ']: ech_df.loc[i,'Description']='Remblais'
    elif x in ['TN','TN ']: ech_df.loc[i,'Description']='Terrain naturel'

ech_df['Refus']=ech_df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
ech_df.insert(1,'Type_ech','Sol')#

In [None]:
dataframe_viewer(ech_df, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
name=['ID_ech','METAUX LOURDS','Arsenic','Cadmium','Chrome','Chrome VI','Cuivre','Mercure','Plomb','Nickel',
'Zinc','CYANURES','cyanure (libre)','cyanure (totaux)','cyanure (APE)','cyanure complex','thiocyanate',
'COMPOSES AROMATIQUES VOLATILS','Benzène','Toluène','Éthylbenzène','Orthoxylène','Para- et métaxylène','Xylènes',
'Styrène','BTEX totaux','PHENOLS','Phénol','HYDROCARBURES AROMATIQUES POLYCYCLIQUES','Naphtalène','Acénaphtylène',
'Acénaphtène','Fluorène','Phénanthrène','Anthracène','Fluoranthène','Pyrène','Benzo(a)anthracène','Chrysène',
'Benzo(b)fluoranthène','Benzo(k)fluoranthène','Benzo(a)pyrène','Dibenzo(ah)anthracène','Benzo(ghi)pérylène',
'Indéno(1,2,3-cd)pyrène','HAP Totaux (16) - EPA','COMPOSES ORGANOHALOGENES VOLATILS','Tétrachloroéthylène',
'Trichloroéthylène','1,1-dichloroéthène','Cis-1,2-dichloroéthène','Trans 1,2-dichloroéthylène',
'Totaux (cis,trans) 1,2-dichloroéthènes','Chlorure de vinyle','1,1,1-Trichloroéthane','1,1,2-Trichloroéthane',
'1,1-Dichloroéthane','1,2-Dichloroéthane','Tétrachlorométhane','Chloroforme','Dichlorométhane',
'1,2-dichloropropane','HYDROCARBURES TOTAUX','fraction aromat. >C6-C7','fraction aromat. >C7-C8',
'fraction aromat. >C8-C10','fraction aliphat. C5-C6','fraction aliphat. >C6-C8','fraction aliphat. >C8-C10',
'Fraction C5 - C8','Fraction C8 - C10','Fraction C10-C12','Fraction C12-C16','Fraction C16 - C21',
'Fraction C21 - C35','Fraction C35 - C40','Hydrocarbures totaux C10-C35','Hydrocarbures totaux C10-C40',
'Teneur mesurée','Teneur mesurée','VS : Valeur seuil']

an=col_ren(an, name=name, mode=1)

In [None]:
an=dble_col_drop(an)

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
an.rename(columns={'cyanure (totaux)':'CN_tot', 'cyanure (APE)':'CN_EPA'}, inplace=True)

In [None]:
dataframe_viewer(an, rows=5) 

In [None]:
dataframe_viewer(ech_df, rows=5)

In [None]:
source_ech_df=ech_df
source_an=an

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'SOL T1 pilote'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/Siterem_Result_Sol/'
sheet='SOL_T1_Pilote'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_SOL.xlsx', 
                   sheet_name='SOL T1 pilote', skiprows=5)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:35]
an=df.loc[36:]

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(3)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,1)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df=ech_df[:-1]
ech_df.drop(columns=['broyage'], inplace=True)

In [None]:
ech_df.columns

In [None]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_ech','Long_for','Refus','Nature_ech','Fract_2','Fract_2+']
ech_df=col_ren(ech_df, name=name, mode=1)

In [None]:
for i in range(len(ech_df['Nature_ech'])):
    x = ech_df.loc[i,'Nature_ech']
    if x in ['R','R ']: ech_df.loc[i,'Nature_ech']='Remblais'
    elif x in ['TN','TN ']: ech_df.loc[i,'Nature_ech']='Terrain naturel'

ech_df['Refus']=ech_df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
ech_df.insert(1,'Type_ech','Sol')#

In [None]:
dataframe_viewer(ech_df, rows=3)

In [None]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [None]:
an=col_ren(an, 1)

In [None]:
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
name=['ID_ech','METAUX LOURDS','Arsenic','Cadmium','Chrome','Chrome VI','Cobalt','Cuivre','Mercure','Plomb', 
'Nickel','Zinc','CYANURES','cyanure (libre)','cyanure (totaux)','cyanure (APE)','cyanure complex','thiocyanate',
'COMPOSES AROMATIQUES VOLATILS','Benzène','Toluène','Éthylbenzène', 'Orthoxylène','Para- et métaxylène','Xylènes',
'Styrène','BTEX totaux','PHENOLS','Phénol','Indice phénol','HYDROCARBURES AROMATIQUES POLYCYCLIQUES','Naphtalène',
'Acénaphtylène','Acénaphtène', 'Fluorène','Phénanthrène','Anthracène','Fluoranthène','Pyrène','Benzo(a)anthracène',
'Chrysène','Benzo(b)fluoranthène','Benzo(k)fluoranthène','Benzo(a)pyrène','Dibenzo(ah)anthracène',
'Benzo(ghi)pérylène','Indéno(1,2,3-cd)pyrène','HAP Totaux (16) - EPA','COMPOSES ORGANOHALOGENES VOLATILS',
'Tétrachloroéthylène','Trichloroéthylène','1,1-dichloroéthène','Cis-1,2-dichloroéthène',
'Trans 1,2-dichloroéthylène','Totaux (cis,trans) 1,2-dichloroéthènes','Chlorure de vinyle',
'1,1,1-Trichloroéthane','1,1,2-Trichloroéthane','1,1-Dichloroéthane','1,2-Dichloroéthane','Tétrachlorométhane',
'Chloroforme','Dichlorométhane','1,2-dichloropropane','EOX','HYDROCARBURES TOTAUX',
'fraction aromat. >C6-C7','fraction aromat. >C7-C8','fraction aromat. >C8-C10','fraction aliphat. C5-C6',
'fraction aliphat. >C6-C8','fraction aliphat. >C8-C10','Fraction C5 - C8','Fraction C8 - C10','Fraction C10-C12',
'Fraction C12-C16','Fraction C16 - C21','Fraction C21 - C35','Fraction C35 - C40','Hydrocarbures totaux C10-C35',
'Hydrocarbures totaux C10-C40','METHYL-TERT-BUTYL-ETHER','MTBE']

an=an.iloc[:,:-17]
an=col_ren(an, name=name, mode=1)

In [None]:
an=dble_col_drop(an)

In [None]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,3)
an.insert(1,'Type_ech','Sol')

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [None]:
an.rename(columns={'cyanure (totaux)':'CN_tot', 'cyanure (APE)':'CN_EPA'}, inplace=True)

In [None]:
dataframe_viewer(an, rows=5) 

In [None]:
dataframe_viewer(ech_df, rows=5) 

In [None]:
#source_ech_df.info()#, ech_df.info()

In [None]:
source_ech_df=source_ech_df[['ID_ech', 'Type_ech', 'Date_ech','Long_for', 'Refus', 'Description', 
                               'Ech_top', 'Ech_base', 'MS', 'Fract_2', 'Fract_2+']]

In [None]:
source_ech_df=data_merger(source_ech_df, ech_df, on='ID_ech', how='outer')[0]

In [None]:
source_an, conflict=data_merger(source_an,an, on='ID_ech', how='outer')

In [None]:
source_an=source_an.query('ID_ech==ID_ech')

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

### $\color{red}{\textbf{Excel source data merging}}$

In [None]:
excel_bh_soil_an, conflict_df = data_merger(source_ech_df, source_an, how='outer', on='ID_ech', drop_skip_col=['index'])

In [None]:
save_dir = tmp_dir + 'source_merge/final_'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

#excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
#excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
#excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
#excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
#excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
#excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

#### ======================================================================================

In [None]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 14-Logs_forages_vUmons_2018-03-20.xlsx
* **Sheet : 'Analyse_eau_Phases1&2'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/vUmons_logsFor/'
sheet='Analyse_eau_Phases1&2'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Analyse_eau_Phases1&2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.drop(list(range(4)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.replace(9999,np.nan, inplace=True, regex=True) #int
df.replace(f'[{9999}|9999].',np.nan, inplace=True, regex=True) #float, str

In [None]:
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [None]:
df=col_ren(df,mode=1,name=[re.sub('9999','-',x) for x in df.columns])
df=col_ren(df,mode=1, name=POL_NAMES_MODEL)

In [None]:
name=['ID', 'ID_ech', 'Date_ech', 'X', 'Y', 'Z', 'Long_for','Long_pz_sol', 'Niv_eau_sol', 'pH', 'CE', 'T', 
      'As', 'Cd', 'Cr', 'Cr_VI', 'Cu', 'Hg','Pb', 'Ni', 'Zn', 'CN_libre', 'CN_tot', 'CN_APE', 'CN_comp',
      'thioCN', 'Bnz_vn', 'Bnz', 'Toln_vn', 'Toln', 'EthylBnz','O-Xyl', 'P-M-Xyl', 'Xyl_vn', 'Xyl', 'Styr', 
      'Phenol','Naphta_vn', 'Naphta', 'Acenaphtyl', 'Acenaphtn', 'Fluorene',
       'Phenanthr', 'Anthrc', 'Flranth', 'Pyr', 'Bnz(a)anthrc', 'Chrys',
       'Bnz(b)flranth', 'Bnz(k)flranth', 'Bnz(a)pyr', 'Dibnz(ah)anthrc',
       'Bnz(ghi)peryl', 'Indeno(1,2,3-cd)pyr', 'HAP_tot_EPA',
       '1,1-DCE', '1,2-DCE', '1,1-DCEn', 'Cis-1,2-DCEn',
       '(cis,trans) 1,2-DCE_tot', 'Trans 1,2-DCEyl', 'DCM', '1,2-DCP',
       'TetraCEyn', 'TCM', '1,1,1-TCE', '1,1,2-TCE', 'TCEyn', 'Chloroforme',
       'CVinyl', 'Arom_C6C7', 'Arom_C7C8', 'Arom_C8C10', 'Aliphat_C5C6',
       'Aliphat_C6C8', 'Aliphat_C8C10', 'Fract_C5C8', 'Fract_C8C10',
       'Fract_C10C12', 'Fract_C12C16', 'Fract_C16C21', 'Fract_C21C35',
       'HC_tot_C10C35', 'MTBE', 'Chlorure']
df=col_ren(df, mode=1,name=name)

In [None]:
df['ID_ech'].replace('\n', ' ', inplace=True, regex=True)
df.insert(1,'Type_ech','Eau')

In [None]:
df.drop([20,39], axis=0,inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.loc[38:,'Date_ech']=df.loc[38:,'Date_ech'].apply(lambda x : dtm.datetime.fromordinal(dtm.datetime(1900, 1, 1).toordinal() + x - 2))

In [None]:
for i in range(len(df['ID_ech'])):
    if pd.isnull(df.loc[i,'ID_ech']): 
        df.loc[i,'ID_ech']=df.loc[i,'ID']

In [None]:
pz=df[['ID', 'X', 'Y', 'Z', 'Long_for','Long_pz_sol']]
pz['Type'] = 'Piezo'

ech_df=df[['ID','ID_ech','Type_ech','Date_ech', 'X', 'Y', 'Z','Niv_eau_sol', 'pH', 'CE', 'T']]
an=df[['ID','ID_ech','Type_ech','Date_ech', 'X', 'Y', 'Z','As', 'Cd', 'Cr', 'Cr_VI', 'Cu', 'Hg','Pb', 'Ni', 'Zn', 'CN_libre', 'CN_tot', 'CN_APE', 
       'CN_comp','thioCN', 'Bnz_vn', 'Bnz', 'Toln_vn', 'Toln', 'EthylBnz','O-Xyl', 'P-M-Xyl', 'Xyl_vn', 'Xyl',
       'Styr', 'Phenol','Naphta_vn', 'Naphta', 'Acenaphtyl', 'Acenaphtn', 'Fluorene',
       'Phenanthr', 'Anthrc', 'Flranth', 'Pyr', 'Bnz(a)anthrc', 'Chrys',
       'Bnz(b)flranth', 'Bnz(k)flranth', 'Bnz(a)pyr', 'Dibnz(ah)anthrc',
       'Bnz(ghi)peryl', 'Indeno(1,2,3-cd)pyr', 'HAP_tot_EPA',
       '1,1-DCE', '1,2-DCE', '1,1-DCEn', 'Cis-1,2-DCEn',
       '(cis,trans) 1,2-DCE_tot', 'Trans 1,2-DCEyl', 'DCM', '1,2-DCP',
       'TetraCEyn', 'TCM', '1,1,1-TCE', '1,1,2-TCE', 'TCEyn', 'Chloroforme',
       'CVinyl', 'Arom_C6C7', 'Arom_C7C8', 'Arom_C8C10', 'Aliphat_C5C6',
       'Aliphat_C6C8', 'Aliphat_C8C10', 'Fract_C5C8', 'Fract_C8C10',
       'Fract_C10C12', 'Fract_C12C16', 'Fract_C16C21', 'Fract_C21C35',
       'HC_tot_C10C35', 'MTBE', 'Chlorure']]

In [None]:
pz.drop_duplicates(subset=['ID'], keep='first', ignore_index=True, inplace=True)

In [None]:
dataframe_viewer(an, rows=5)

In [None]:
source_an=an
source_pz=pz
source_ech_df=ech_df

In [None]:
for i in range(len(excel_water_an.ID_ech)):
    c=excel_water_an.loc[i, 'ID_ech']
    excel_water_an.loc[i, 'ID']=re.search("(\w+\d+(?:\w)?)",c).group(1)

In [None]:
save_dir = tmp_dir + 'source_merge/final_'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

#excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
#excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
#excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
#excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
#excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
#excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
#excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

an.to_csv(save_dir+'Water_analysis.csv', index=False)

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Analyse_sol_Phases1&2'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/vUmons_logsFor/'
sheet='Analyse_sol_Phases1&2'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Analyse_sol_Phases1&2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df=col_ren(df, mode=1, name=POL_NAMES_MODEL)

In [None]:
name=['ID_ech','Date_ech','ID','X','Y','Z','Nature_ech','Organo','Long_for','Refus','Ech_top','Ech_base',
      'MS','Broyage < 150 µm','Broyage ','Fract_2','Fract_2+','As','Cd','Cr','Cr_VI','Cu',
       'Hg','Pb','Ni','Zn','CN_libre','CN_tot','CN_APE',
       'CN_comp','thioCN','Bnz','Toln','EthylBnz','O-Xyl','P-M-Xyl',
       'Xyl','Styr','Phenol','Naphta','Acenaphtyl','Acenaphtn',
       'Fluorene','Phenanthr','Anthrc','Flranth','Pyr','Bnz(a)anthrc',
       'Chrys','Bnz(b)flranth','Bnz(k)flranth','Bnz(a)pyr',
       'Dibnz(ah)anthrc','Bnz(ghi)peryl','Indeno(1,2,3-cd)pyr',
       'HAP_tot_EPA','1,1-DCE','1,2-DCE','1,1-DCEn',
       'Cis-1,2-DCEn','Trans 1,2-DCEyl','DCM',
       '(cis,trans) 1,2-DCE_tot','1,2-DCP','TetraCEyn','TCM',
       '1,1,1-TCE','1,1,2-TCE','TCEyn','Chloroforme','CVinyl','Arom_C6C7',
       'Arom_C7C8','Arom_C8C10','Aliphat_C5C6','Aliphat_C6C8',
       'Aliphat_C8C10','Fract_C5C8','Fract_C8C10','Fract_C10C12',
       'Fract_C12C16','Fract_C16C21','Fract_C21C35','HC_tot_C10C35']
df=col_ren(df, mode=1, name=name)

In [None]:
df.drop(list(range(4)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.replace(9999,np.nan, inplace=True, regex=True) #int
df.replace(f'[{9999}|9999].',np.nan, inplace=True, regex=True) #float, str

In [None]:
for i in range(len(df['Nature_ech'])):
    x = df.loc[i,'Nature_ech']
    if x in ['R','R ']: df.loc[i,'Nature_ech']='Remblais'
    elif x in ['L']: df.loc[i,'Nature_ech']='Limons'
    elif x in ['LA']: df.loc[i,'Nature_ech']='Limons et argiles'
    elif x in ['LS']: df.loc[i,'Nature_ech']='Limons et sables'

df['Refus']=df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
df.insert(1,'Type_ech','Sol')

In [None]:
df.drop(14, axis=0, inplace=True)
df.drop(['Broyage < 150 µm', 'Broyage '], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.loc[8, 'ID_ech']='F4/2M'
df.loc[31, 'ID_ech']='P19/1'
df.loc[32, 'ID_ech']='P19/2'

In [None]:
pz=df[['ID', 'X', 'Y', 'Z', 'Long_for','Refus']]
pz['Type'] = 'Piezo'

ech_df=df[['ID_ech', 'Type_ech', 'Date_ech', 'X', 'Y', 'Z', 'Nature_ech','Organo', 
            'Ech_top', 'Ech_base', 'MS', 'Fract_2','Fract_2+']]
an=df[['ID','ID_ech', 'Date_ech', 'X', 'Y', 'Z', 'Type_ech','As', 'Cd', 'Cr', 'Cr_VI', 'Cu', 'Hg', 'Pb', 'Ni', 'Zn',
       'CN_libre', 'CN_tot', 'CN_APE', 'CN_comp', 'thioCN', 'Bnz', 'Toln',
       'EthylBnz', 'O-Xyl', 'P-M-Xyl', 'Xyl', 'Styr', 'Phenol', 'Naphta',
       'Acenaphtyl', 'Acenaphtn', 'Fluorene', 'Phenanthr', 'Anthrc', 'Flranth',
       'Pyr', 'Bnz(a)anthrc', 'Chrys', 'Bnz(b)flranth', 'Bnz(k)flranth',
       'Bnz(a)pyr', 'Dibnz(ah)anthrc', 'Bnz(ghi)peryl', 'Indeno(1,2,3-cd)pyr',
       'HAP_tot_EPA', '1,1-DCE', '1,2-DCE', '1,1-DCEn', 'Cis-1,2-DCEn',
       'Trans 1,2-DCEyl', 'DCM', '(cis,trans) 1,2-DCE_tot', '1,2-DCP',
       'TetraCEyn', 'TCM', '1,1,1-TCE', '1,1,2-TCE', 'TCEyn', 'Chloroforme',
       'CVinyl', 'Arom_C6C7', 'Arom_C7C8', 'Arom_C8C10', 'Aliphat_C5C6',
       'Aliphat_C6C8', 'Aliphat_C8C10', 'Fract_C5C8', 'Fract_C8C10',
       'Fract_C10C12', 'Fract_C12C16', 'Fract_C16C21', 'Fract_C21C35',
       'HC_tot_C10C35']]

In [None]:
an['ID'] = an['ID'].apply(lambda x: re.sub('^P', 'F', x)) 

In [None]:
dataframe_viewer(an, rows=5)

In [None]:
save_dir = tmp_dir + 'source_merge/final_'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

#excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
#excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
#excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
#excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
#excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
#excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
#excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

an.to_csv(save_dir+'Soil_analysis.csv', index=False)

In [None]:
source_an=source_an.append(an, ignore_index=True)
source_pz=pz
source_ech_df=ech_df

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Synthèse'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/vUmons_logsFor/'
sheet='Synthese'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Synthèse', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df=df[:29]
df.replace('\*','', inplace=True, regex=True)
df['Refus']=df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')

In [None]:
name=['ID','X','Y','Z', 'Refus','Long_for', 'RB', 'ALL', 'S_A', 'S_S', 
      'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top']
df=col_ren(df, mode=1, name=name)

In [None]:
cols=['ID','X','Y','Z', 'Refus','Long_for']

for i in range(len(df)):
    if not pd.isnull(df.loc[i, 'RB']): 
        df.loc[i, 'Nappe']='Remblais'
        df.loc[i, 'Litho_top']=0
        if not pd.isnull(df.loc[i, 'Rb_base']):
            df.loc[i, 'Litho_base']=df.loc[i, 'Rb_base']
        else:
            df.loc[i, 'Litho_base']=df.loc[i, 'Long_for']
    
    if not pd.isnull(df.loc[i, 'ALL']):
        df.loc[i+.2,cols]=df.loc[i,cols]
        df.loc[i+.2, 'Nappe']='Alluvions'
        df.loc[i+.2, 'Litho_top']=df.loc[i, 'All_top']
        if not pd.isnull(df.loc[i, 'S_A']):
            df.loc[i+.2, 'Litho_base']=df.loc[i, 'Soc_alt_top']
        else:
            df.loc[i+.2, 'Litho_base']=df.loc[i, 'Long_for']
    
    if not pd.isnull(df.loc[i, 'S_A']):
        df.loc[i+.5,cols]=df.loc[i,cols]
        df.loc[i+.5, 'Nappe']='Socle altéré'
        df.loc[i+.5, 'Litho_top']=df.loc[i, 'Soc_alt_top']
        if not pd.isnull(df.loc[i, 'S_S']):
            df.loc[i+.5, 'Litho_base']=df.loc[i, 'Soc_sn_top']
        else:
            df.loc[i+.5, 'Litho_base']=df.loc[i, 'Long_for']
            
    if not pd.isnull(df.loc[i, 'S_S']):
        df.loc[i+.7,cols]=df.loc[i,cols]
        df.loc[i+.7, 'Nappe']='Socle sain'
        df.loc[i+.7, 'Litho_top']=df.loc[i, 'Soc_sn_top']
        df.loc[i+.7, 'Litho_base']=df.loc[i, 'Long_for']

df.drop(columns=['RB', 'ALL', 'S_A', 'S_S', 'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top'], inplace=True)
df.sort_index(inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
dataframe_viewer(df, rows=5, cols=15)

In [None]:
df.columns

In [None]:
bh=df[['ID','X','Y','Z','Long_for','Refus']]
bh['Type']='Forage'

litho=df[['ID','X','Y','Z','Litho_top','Litho_base','Nappe']]
source_litho=litho

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Sond2017v2'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/vUmons_logsFor/'
sheet='Sond2017v2'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Sond2017v2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.replace('\*','', inplace=True, regex=True)
df['Refus']=df['Refus'].apply(lambda x: 'x' if x==1 else '')

In [None]:
name=['R_ID','ID','X','Y','Z','Refus','Date_for','Long_for','Z_fond','RB','ALL', 'S_A', 'S_S', 
      'Rb_base','cote_rb','All_top', 'Soc_alt_top','Soc_sn_top']
df=col_ren(df, mode=1, name=name)
df=df[['ID','X','Y','Z','Refus','Date_for','Long_for','Z_fond','RB','ALL', 'S_A', 'S_S', 
      'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top']]

In [None]:
cols=['ID','Date_for','X','Y','Z','Z_fond','Refus','Long_for']

for i in range(len(df)):    
    if df.loc[i, 'RB']==1: 
        df.loc[i, 'Nappe']='Remblais'
        df.loc[i, 'Litho_top']=0
        
        if not pd.isnull(df.loc[i, 'Rb_base']): df.loc[i, 'Litho_base']=df.loc[i, 'Rb_base']
        else: df.loc[i, 'Litho_base']=df.loc[i, 'Long_for']
    
    val_def=df.loc[i, 'Litho_base'] # temporary value of litho_base if nan
    
    if df.loc[i, 'ALL']==1:
        df.loc[i+.2,cols]=df.loc[i,cols]
        df.loc[i+.2, 'Nappe']='Alluvions'
        
        if not pd.isnull(df.loc[i, 'All_top']): df.loc[i+.2, 'Litho_top']=df.loc[i, 'All_top']
        else: df.loc[i+.2, 'Litho_top']=val_def #df.loc[i, 'litho_base']
            
        if df.loc[i, 'S_A']==1: df.loc[i+.2, 'Litho_base']=df.loc[i, 'Soc_alt_top']
        else: df.loc[i+.2, 'Litho_base']=df.loc[i, 'Long_for']
    
    if df.loc[i, 'S_A']==1:
        df.loc[i+.5,cols]=df.loc[i,cols]
        df.loc[i+.5, 'Nappe']='Socle altéré'
        
        if not pd.isnull(df.loc[i, 'Soc_alt_top']): df.loc[i+.5, 'Litho_top']=df.loc[i, 'Soc_alt_top']
        else: df.loc[i+.5, 'Litho_top']=val_def #df.loc[i+.2, 'litho_base']
        
        if df.loc[i, 'S_S']==1: df.loc[i+.5, 'Litho_base']=df.loc[i, 'Soc_sn_top']
        else: df.loc[i+.5, 'Litho_base']=df.loc[i, 'Long_for']
            
    if df.loc[i, 'S_S']==1:
        df.loc[i+.7,cols]=df.loc[i,cols]
        df.loc[i+.7, 'Nappe']='Socle sain'
        df.loc[i+.7, 'Litho_top']=df.loc[i, 'Soc_sn_top']
        df.loc[i+.7, 'Litho_base']=df.loc[i, 'Long_for']

df.drop(columns=['RB', 'ALL', 'S_A', 'S_S','Rb_base','All_top', 'Soc_alt_top','Soc_sn_top'], inplace=True)
df.sort_index(inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df=df[:-1]

In [None]:
dataframe_viewer(df, rows=5, cols=15)

In [None]:
bh=df[['ID','X','Y','Z','Z_fond','Date_for','Long_for','Refus']]
bh['Type']='Forage'

litho=df[['ID','X','Y','Z','Litho_top','Litho_base','Nappe']]
source_litho = source_litho.merge(df, 'outer')

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

# Processing for new data added - April 2021

#### ======================================================================================

In [None]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 15-Profils de sol et données de terrain 2019.xlsx
* **Sheet : 'Log'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'
sheet='Log'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Log', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name = ['ID','Litho_top', 'Litho_base', 'Keyword', 'Description']
df = col_ren(df, name=name, mode=1, )
df = df[1:]
df['Date_for'] = dtm.datetime(2019,12,18)

In [None]:
df.drop(index=df.query('Litho_base.isnull() or Litho_top.isnull()').index, inplace=True)

In [None]:
compute_BH_length(df)

In [None]:
df.query('Litho_base.isnull() or Litho_top.isnull()')

In [None]:
dataframe_viewer(df, rows=5, cols=15)

In [None]:
bh = df[1:62]
pza = df[65:80] #piezair
pz = df[83:]

In [None]:
bh.reset_index(drop=True, inplace=True)
pza.reset_index(drop=True, inplace=True)
pz.reset_index(drop=True, inplace=True)

In [None]:
bh.insert(1,'Type', 'Forage')
bh.insert(1,'Zone', 'Extension Pilote')
pza.insert(1,'Type', 'Piezair')
pza.insert(1,'Zone', 'Extension Pilote')
pz.insert(1,'Type', 'Piezo')
pz.insert(1,'Zone', 'Mini-Pilote')

In [None]:
litho=bh.append(pza)
litho=litho.append(pz)
litho=litho[['ID','Type','Zone','Litho_top','Litho_base','Description','Keyword']]
litho.reset_index(inplace=True, drop=True)

In [None]:
litho.loc[litho[litho['ID']==50].index, 'Type'] = 'Piezo'

In [None]:
dataframe_viewer(litho, rows=3)

In [None]:
source_litho=litho

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#pza.to_csv(tmp_dir+sheet+'_Piezairs.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Echantillon'+'Organoleptique**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'
sheet='Echantillon'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Echantillon', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID','Ech_top', 'Ech_base', 'ID_ech']
df=col_ren(df, name=name, mode=1)
df.insert(1,'Type_ech','Sol')

In [None]:
df.drop(index=[43,44,55,56,66], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
ech=df.copy()

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Organoleptique', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,4)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID','Pol_top', 'Pol_base','Polluant','Intensite']
df=col_ren(df, name=name, mode=1)

In [None]:
df.drop(index=[10,11,14,15], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
mdf, conflict_df =data_merger(ech, df, on='ID', how='outer')

In [None]:
dataframe_viewer(mdf)

In [None]:
ech_df=mdf
source_ech_df=ech_df

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Données de forage'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'
sheet='Donnees_forage'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Données de forage', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
name=['ID', 'X', 'Y', 'Z', 'Date_for', 'Long_for', 'Methode', 'Diam_for','Rmq', 'Long_pz', 'Diam_pz', 
      'Crep_long','Societe', 'Resp_chantier']
df=col_ren(df, name=name, mode=1)
df.drop(index=[16,23], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
df.insert(5, 'Type', '')
df.loc[:15,'Type']='Forage'
df.loc[16:21,'Type']='Piezair'
df.loc[22:,'Type']='Piezo'

In [None]:
df.loc[9,'ID']='224 bis'

In [None]:
df['Refus'] = ''
df['Type_refus']=''

for i in range(len(df['Rmq'])):
    val = str(df.loc[i,'Rmq'])
    if re.search('[Bb]loqué', val) :
        df.loc[i,'Refus'] = 'x'
        
        if re.search('[lL]aitier', val):
            df.loc[i,'Type_refus'] = 'Laitier'
        elif re.search('[Bb]éton', val):
            df.loc[i,'Type_refus'] = 'Béton'
        elif re.search('[Mm]atériaux', val):
            df.loc[i,'Type_refus'] = 'Matériaux indurés' 
    else: 
        df.loc[i,'Refus'] = '' 

df['Diam_int_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace('mm','').split('x')[1]) if not pd.isnull(x) else x)
df['Diam_ext_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace('mm','').split('x')[0]) if not pd.isnull(x) else x)
df['Diam_for'] = df['Diam_for'].apply(lambda x: pd.to_numeric(x) if not pd.isnull(x) else x)

df.insert(10, 'Diam_ext_pz', df.pop('Diam_ext_pz')) # move to a specified position
df.insert(11, 'Diam_int_pz', df.pop('Diam_int_pz'))
df.drop(columns=['Rmq', 'Diam_pz'], axis=1, inplace=True)
df.drop(df.query("ID!=ID").index, inplace=True) # delete all ID='NaN' lines
df.reset_index(drop=True, inplace=True)

gen_dated_id(df,'ID','Date_for')  

In [None]:
pz = df.query("Type=='Piezo'")
pza=df.query("Type=='Piezair'")
bh = df.query("Type=='Forage'")

pz.reset_index(inplace=True, drop=True)
pza.reset_index(inplace=True, drop=True)
bh.reset_index(inplace=True, drop=True)

In [None]:
dataframe_viewer(df, rows=3)

In [None]:
source_pz = pz
source_pza = pza
source_bh = bh

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
pza.to_csv(tmp_dir+sheet+'_Piezairs.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
source_pza.to_csv(tmp_dir+'source_merge/source_Piezairs.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)}; source_pza:{len(source_pza)} ;'
      f'source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Equipement'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'
sheet='Equipement'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Equipement', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
df.drop(columns=['Déplacement'], inplace=True)
name=['ID','Equip_top', 'Equip_base', 'Diam_for', 'Diam_ext_pz', 'Legende']
df=col_ren(df, mode=1, name=name)

In [None]:
df.drop(index=[24,25], inplace=True)
df.reset_index(drop=True, inplace=True)

In [None]:
compute_BH_length(df, top_col='Equip_top', base_col='Equip_base')

In [None]:
coi = ['ID', 'Profondeur', 'Diam_for', 'Diam_ext_pz']
pz=df[coi].drop_duplicates(['ID'])
pz['Type'] = 'Piezo'

In [None]:
dataframe_viewer(df)

In [None]:
equip=df
source_eqp=equip

In [None]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
equip.to_csv(tmp_dir+sheet+'_Equipment.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)
source_eqp.to_csv(tmp_dir+'source_merge/source_eqpment.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

* **Sheet : 'Piézométrie'**

In [618]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'
sheet='piezometrie'

In [619]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et donnees de terrain 2019.xlsx', 
                   sheet_name='Piézométrie', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

Rows : 3, columns : 4


interactive(children=(IntSlider(value=3, description='rows', max=3, min=3, readout=False), IntSlider(value=4, …

In [620]:
name=['ID','Niv_pz_sol', 'Type_ech', 'Date_mes']
df=col_ren(df, name=name, mode=1)

In [621]:
mes_pz=df
source_mes_pz=mes_pz

In [622]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

NameError: name 'source_mes_sol' is not defined

In [None]:
dataframe_viewer(source_ech_df, rows=5)

In [None]:
dataframe_viewer(excel_bhs, rows=5)

### $\color{red}{\textbf{Excel source data merging}}$

In [None]:
excel_bhs, conflict_df = data_merger(source_bh, source_pz, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
excel_bhs, conflict_df = data_merger(excel_bhs, source_pza, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
excel_bh_litho, conflict_df = data_merger(excel_bhs, source_litho, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
excel_bh_soil_sp, conflict_df = data_merger(excel_bhs, source_ech_df, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
excel_bh_equip, conflict_df = data_merger(excel_bhs, source_eqp, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
data_validation(overall_data=excel_bh_equip, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Diam_ext_pz_x':list(conflict_df.index)})

In [None]:
excel_bh_mes, conflict_df = data_merger(excel_bhs, source_mes_pz, how='outer', on='ID', dist_max=1., drop_skip_col=['index'])

In [None]:
save_dir = tmp_dir + 'source_merge/final_'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
#excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
#excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

#### ======================================================================================

In [None]:
# initialisation des variables
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df, _df, _df
source_litho, source_an, source_mes = _df, _df, _df

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

## 16-Résultats SOL extension pilote et piézairs.xlsx
* **Sheet : 'Résult SOL'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/result_sol_ext_pilote/'
sheet='Result_Sol'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Resultats SOL extension pilote et piezairs.xlsx', 
                   sheet_name='Résult SOL', skiprows=5)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [None]:
ech_df=df.loc[:35]
an=df.loc[36:]

In [None]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

In [None]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

In [None]:
ech_df=dble_col_drop(ech_df)

In [None]:
ech_df.drop(list(range(3)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
ech_df=na_col_drop(ech_df,3)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

In [None]:
ech_df=ech_df[:-1]
ech_df.drop(columns=['broyage'], inplace=True)

In [None]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_ech','Long_for','Refus','Description','MO','COT','pH_KCl', 
      'Temp_pH','pH_H20','Fract_2','Fract_2+', 'Fract_min_2µ','Fract_min_50µ','Fract_min_2']
ech_df=col_ren(ech_df, name=name, mode=1)

In [None]:
set(ech_df.Description)

In [None]:
for i in range(len(ech_df['Description'])):
    x = ech_df.loc[i,'Description']
    if x in ['R','R ']: ech_df.loc[i,'Description']='Remblais'
    elif x in ['TN','TN ']: ech_df.loc[i,'Description']='Terrain naturel'

ech_df['Refus']=ech_df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
ech_df.insert(1,'Type_ech','Sol')#

In [None]:
for i in range(len(ech_df)):
    x=ech_df.loc[i,'ID_ech']
    r=re.search('([\w|\d]+)\n.+$', x)
    if r: 
        ech_df.loc[i,'ID_ech']='226/'+r.group(1) # Rename borehole 304 to 226 because of conflict with piezair

In [None]:
dataframe_viewer(ech_df, rows=3)

In [623]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [624]:
an=col_ren(an, 1)

IndexError: single positional indexer is out-of-bounds

In [None]:
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [None]:
an.columns

In [None]:
an=an[an.columns[:-17]]
an.rename(columns={'col_35':'Phénanthrène'}, inplace=True)

In [None]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [625]:
an=dble_col_drop(an)

column(s) dropped: []


In [626]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an = na_line_drop(an, 1)
an.insert(1,'Type_ech','Sol')

KeyError: '[1 2] not found in axis'

In [None]:
data = an
for i in range(len(data)):
    x=data.loc[i,'ID_ech']
    r=re.search('([\w|\d]+)\n.+$', x)
    if r: 
        data.loc[i,'ID_ech']='226/'+r.group(1) # Rename borehole 304 to 226 because of conflict with piezair

In [None]:
dataframe_viewer(an, rows=5) 

In [627]:
source_ech_df=ech_df
source_an=an

In [628]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

NameError: name 'source_mes_sol' is not defined

* **Sheet : 'inorganiques et composés majeur'**

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'
sheet='Inorg_comp_majeur'

In [None]:
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Resultats SOL extension pilote et piezairs.xlsx', 
                   sheet_name='inorganiques et composés majeur', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

In [629]:
ech_df=df.loc[:20] # not really interesting here!
an=df.loc[21:]

In [630]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [631]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [632]:
an=col_ren(an, 1)

In [633]:
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [634]:
an=an[an.columns[:-7]]

In [635]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)

In [636]:
an=dble_col_drop(an)

column(s) dropped: []


In [637]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,2)
an = na_line_drop(an,1)
an.insert(1,'Type_ech','Sol')

IndexError: index 1 is out of bounds for axis 0 with size 0

In [None]:
data = an
for i in range(len(data)):
    x=data.loc[i,'ID_ech']
    r=re.search('([\w|\d]+)\n.+$', x)
    if r: 
        data.loc[i,'ID_ech']='226/'+r.group(1) # Rename borehole 304 to 226 because of conflict with piezair

In [None]:
#source_ech_df=ech_df
source_an=data_merger(source_an, an, how='outer', on='ID_ech')[0]

In [None]:
dataframe_viewer(source_an, rows=5) 

In [638]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir), os.makedirs(tmp_dir+'source_merge/') 

#bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Unkown-facility.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#ech_df.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_merge/source_Piezometers.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unkown-facility.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_merge/source_Measures-soil.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-water.csv', index=False)
#source_ech_df.to_csv(tmp_dir+'source_merge/source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ukw)} ; source_an:{len(source_an)} ;\nsource_ech_df:{len(source_ech_df)} ;'
     f'source_ech_df:{len(source_ech_df)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

NameError: name 'source_mes_sol' is not defined

### $\color{red}{\textbf{merging of 'Donnees_2019' and 'result_sol' data}}$

In [639]:
excel_bh_soil_an, conflict_df = data_merger(excel_bh_soil_sp, source_an, how='outer', on='ID_ech', dist_max=1., drop_skip_col=['index'])

NameError: name 'excel_bh_soil_sp' is not defined

In [None]:
tmp_dir=ROOT_DIR + '/CF_data/Result_traitem/donnees_terrain_2019/'

In [640]:
save_dir = tmp_dir + 'source_merge/final_'
if not os.path.exists(save_dir):
    os.makedirs(save_dir)

#excel_bhs.to_csv(save_dir + 'Boreholes.csv', index=False)
#excel_bh_litho.to_csv(save_dir+'Lithologies.csv', index=False)
#excel_bh_equip.to_csv(save_dir+'Equipments.csv', index=False)
#excel_bh_soil_sp.to_csv(save_dir+'Soil_samples.csv', index=False)
#excel_bh_water_sp.to_csv(save_dir+'Water_samples.csv', index=False)
excel_bh_soil_an.to_csv(save_dir+'Soil_analysis.csv', index=False)
#excel_bh_water_an.to_csv(save_dir+'water_analysis.csv', index=False)
#excel_bh_mes.to_csv(save_dir+'Measures.csv', index=False)
#excel_bh_fac.to_csv(save_dir+'Unknow_facilities.csv', index=False)

In [641]:
#dataframe_viewer(test.query('ID=="F16M"'), rows=5)
dataframe_viewer(mdf, rows=5), dataframe_viewer(conflict_df, rows=5)
#dataframe_viewer(source_bh, rows=5), dataframe_viewer(source_mes, rows=5)
#dataframe_viewer(bh, rows=5), dataframe_viewer(eqp, rows=5)

Rows : 11, columns : 86


interactive(children=(IntSlider(value=5, description='rows', max=11, min=5, readout=False), IntSlider(value=12…

Rows : 0, columns : 0


interactive(children=(IntSlider(value=0, description='rows', max=0, readout=False), IntSlider(value=0, descrip…

(None, None)