# ORGANISATION DES DONNEES

In [1]:
from utils.io import update_dict, gen_dated_id, dataframe_viewer, gen_geodf_geom, data_merger, data_validation, \
data_slicer, replicate_values, collect_measure, collect_time_data, gen_id_from_ech, na_col_drop, na_line_drop, col_ren, \
dble_col_drop, dict_viewer

from utils.config import DEFAULT_POL_LEXICON, POL_NAMES_MODEL 
from difflib import get_close_matches

import re, os
import numpy as np
import geopandas as gpd
import pandas as pd
import datetime as dtm
import matplotlib.pyplot as plt
from definitions import ROOT_DIR

In [2]:
def compute_BH_length(df, id_col='ID', length_col_name='Long_for', top_col='Intv_top', base_col='Intv_base', verbose=False):
    """Add a per-borehole length column to ``df`` (modified in place).

    For every distinct value of ``id_col`` the length is
    ``max(base_col) - min(top_col)`` over the rows sharing that ID, ignoring
    missing values. The result is written on every row of the borehole, in a
    new column inserted right after ``id_col``.

    Parameters
    ----------
    df : pandas.DataFrame
        Interval table; it is mutated in place and nothing is returned.
    id_col : str
        Column holding the borehole identifier.
    length_col_name : str
        Name of the column to create.
    top_col, base_col : str
        Columns holding the top and base of each interval. Values that cannot
        be read as numbers are replaced by NaN and both columns end up float64.
    verbose : bool
        When True, print one summary line per borehole.

    Raises
    ------
    NameError
        If ``length_col_name`` already exists in ``df``.
    """
    if length_col_name in df.columns:
        raise NameError(f'{length_col_name} is already in columns. Give another name')

    # Coerce both depth columns to float; anything non-numeric (text, None) becomes NaN.
    for col in (top_col, base_col):
        df[col] = pd.to_numeric(df[col], errors='coerce').astype('float64')

    # One length per ID, whatever the ID type (str, Python or numpy numbers).
    # sort=False avoids comparing IDs of mixed types; pandas max/min skip NaN.
    by_id = df.groupby(id_col, sort=False)
    lengths = by_id[base_col].max() - by_id[top_col].min()

    if verbose:
        for id_, n_rows in by_id.size().items():
            print(id_, n_rows, lengths.loc[id_])

    # Rows with a missing ID are not part of any group and simply get NaN.
    df[length_col_name] = df[id_col].map(lengths).astype('float64')

    # Intervals with neither top nor base carry no information: remove them.
    both_missing = df[top_col].isna() & df[base_col].isna()
    df.drop(index=df.index[both_missing], inplace=True)

    # Move the new column right after the ID column.
    df.insert(df.columns.to_list().index(id_col)+1, length_col_name, df.pop(length_col_name))
    

### Création du répertoire de sauvegarde

In [3]:
# Output folder for every file written by this notebook (relative to the project root).
save_dir = f'{ROOT_DIR}' + '/CF_data/Result_traitem/organisation/'

In [4]:
# Create the output folder if needed; exist_ok avoids the check-then-create race.
os.makedirs(save_dir, exist_ok=True)

### Définition de variables usuelles

In [5]:
# Mapping from measurement labels found in the source sheets to the short
# column names used in this notebook (several labels may share one target).
MEAS_NAMES_MODEL = dict([
    ('Fraction   2000 µm', 'Fract_2000µ'),
    ('Fraction   63 µm', 'Fract_63µ'),
    ('Fraction   45 µm', 'Fract_45µ'),
    ('Fraction   16 µm', 'Fract_16µ'),
    ('Fraction   2 µm', 'Fract_2µ'),
    ('Fraction 2 mm', 'Fract_2'),
    ('Fraction +2 mm', 'Fract_2+'),
    ('Fract_2', 'Fract_2'),
    ('Fract_2+', 'Fract_2+'),
    ('Mat. organique', 'MO'),
    ('Mat. sèche', 'MS'),
    ('Argile', 'Fract_arg'),
    ('Fraction argileuse', 'Fract_arg'),
])

In [6]:
# Extend the imported pollutant-name mapping with the measurement names;
# on duplicate keys the MEAS_NAMES_MODEL entry wins (it is unpacked last).
POL_NAMES_MODEL = {**POL_NAMES_MODEL, **MEAS_NAMES_MODEL}

In [7]:
# Keyword / pattern constants. Regex fragments are raw strings so that
# backslashes reach the regex engine unchanged (non-raw '\d', '\s', '\w'
# are invalid escape sequences and emit warnings on recent Python versions).
params_kw = ['O_diss','Niv_eau', 'temp', '^T$', '^CE$', 'pH$', 'ORP']
meas_kw_col = ['O_diss','pH','CE','ORP','Niv_eau_pz','Niv_eau_sol','Temp']
sufx = ['sup', 'prof', 'inf', r'/\dM(\*)?']
prefx = ['eau forage ']
# Named group 'id' captures an identifier that contains at least one digit,
# optionally preceded by 'canne ' (at string start) or 'Piezair '.
id_reg = r'\s*(?P<id>(?:^canne |Piezair )*\w*\d+\w*)\s*'
# Union (without duplicates, order not guaranteed) of the lexicon abbreviations
# and the normalised names in POL_NAMES_MODEL.
pollutants_names = list(set(list(DEFAULT_POL_LEXICON.abbreviations.keys()) + list(POL_NAMES_MODEL.values())))

In [8]:
# Column schema of each output table. Every list is de-duplicated through a
# set, so the order of the resulting names is not guaranteed.
bh_cols = list(set(['Date_for','ID','X','Y','Z','Zsol','Type','Long_for','Long_pz','Sect_crep','Long_pz_sol','Ht_pz_sol',
           'Diam_for','Diam_int_pz','Diam_ext_pz','Ht_chbre','Refus','Societe','Zone','Sous_zone','Etude','Method','Resp_chantier',
           'Emplacement','ID_date','Rmq']))

# 'sulfures_tot' and 'N_Kjdl' were fused into one name by a missing comma
# (implicit string concatenation); they are now two separate columns.
# NOTE(review): both 'pH_H2O' (letter O) and 'pH_H20' (digit 0) are listed - confirm which spelling the data uses.
mes_cols = list(set(['Date_mes','ID','ID_ech','X','Y','Z','Zsol','pH_H2O', 'Temp_pH_H2O', 'Temp_pH_CaCl2','pH_CaCl2','Temp_pH_KCl',
            'pH_KCl','Residu_perte_feu','Fract_arg','Fract_min_2µ','Fract_min_50µ','Fract_min_2','Temp_pH_mes',
            'pH_H20', 'sulfures_tot', 'N_Kjdl','Temp_CE','Temp_pH','Nappe','Rmq','Fract_2000µ','Fract_63µ','Fract_45µ','Fract_16µ',
            'Fract_2µ','Temp_ech', 'Periode'] + meas_kw_col + list(MEAS_NAMES_MODEL.values())))

eqp_cols = list(set(['Date_for','ID','X','Y','Z','Zsol','Type_equip','Equip_base','Equip_top','Rmq']))

litho_cols = list(set(['Date_for','ID','ID_ech','X','Y','Z','Zsol','Long_for','Litho_top','Litho_base','Intv_top','Intv_base',
              'Description','Rmq']))

an_cols = list(set(['ID','X','Y','Z','Zsol','Date_ech','ID_ech','Type_ech','Ech_top','Ech_base','Intv_top','Intv_base',
           'Description','Nappe','Organo','Intensite', 'Min_organo', 'Max_organo', 'Polluant',
           'Surnageant','Sousnageant','Caractere','Opacite','Rmq'] + pollutants_names))

ukw_cols = list(set(['Date_for','ID','X','Y','Z','Zsol','Type','Long_for','Method','Societe','Rmq']))

# Table name -> list of column names, passed to data_slicer further down.
cols_dict = {'borehole': bh_cols, 'measure': mes_cols, 'lithology': litho_cols, 'analysis': an_cols,
 'equipement': eqp_cols, 'unknown': ukw_cols}

In [9]:
# Per-table criteria columns, passed to data_slicer together with cols_dict.
# NOTE(review): how data_slicer uses these criteria is not visible in this file.
bh_crit = ['ID','X','Y','Z','Zsol','Type','Long_for','Long_pz','Diam_for','Diam_int_pz','Diam_ext_pz']

mes_crit = ['ID','ID_ech','Date_mes'] + meas_kw_col

eqp_crit = ['Type_equip','Equip_base','Equip_top']

litho_crit = ['Litho_top','Litho_base','Intv_top','Intv_base','Description']

an_crit = ['ID_ech','Type_ech','Organo','Surnageant','Sousnageant'] + list(DEFAULT_POL_LEXICON.abbreviations.keys()) 

ukw_crit = ['ID','X','Y','Z','Zsol','Long_for','Type']

# Table name -> criteria list (same keys as cols_dict).
crit_dict = {'borehole': bh_crit, 'measure': mes_crit, 'lithology': litho_crit, 'analysis': an_crit, 
 'equipement': eqp_crit, 'unknown': ukw_crit}

variables utilisées par jeu de données
================================
- bh 	: 	forages (simple ou piezo)
- eqp	:	équipements d'un forage (outils, méthodes utilisés, ...)
- ukw	:	objets physiques indéterminés
- litho :	descriptions lithologiques
- an 	: 	analyses de contaminants sur des échantillons (sol, eau)
- mes	:	mesures de propriétés sur des échantillons (sol, eau), de paramètres hydrochimiques, ...


# ---------------------------------------------------------

In [10]:
# Initialise the per-source accumulators. Each one gets its OWN empty
# DataFrame: binding all six names to a single object would let an in-place
# operation on one accumulator silently change the others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = (_df.copy() for _ in range(3))
source_litho, source_an, source_mes = (_df.copy() for _ in range(3))

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 0 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 0


## 11-Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx
* **Sheet : 'Résult EAU'**

In [11]:
# Output sub-folder for this workbook and prefix used in the exported CSV file names.
tmp_dir= save_dir + 'Siterem_Ext_Pilote/'
sheet='Result_eau'

In [12]:
# Read the 'Résult EAU' sheet, skipping the first two rows of the sheet.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='Résult EAU', skiprows=2)
# Remove NaN lines / columns with the project helpers
# (the meaning of their numeric argument is not visible in this file).
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Remove '<' and '>' characters from text cells.
df.replace(r'<|>','', inplace=True, regex=True)
# Text cells ending with '-' are replaced by NaN.
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

5 NaN lines dropped
Columns dropped :['Unnamed: 0', 'Unnamed: 2']

Rows : 115, columns : 37


  warn(msg)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=115, min=5, readout=False), IntSlider(value=1…

In [13]:
# Sample-description block: rows up to label 31 (.loc slices by label, end included).
ech_df=df.loc[:31]
# Analysis block: rows labelled 0-3 plus rows labelled 32 and above.
# assumes row labels run from 0 to len(df)-1 - TODO confirm after na_line_drop
an=df.loc[list(range(0,4))+list(range(32, len(df)))]

In [14]:
# Order rows by label, then renumber them from 0.
an = an.sort_index().reset_index(drop=True)

In [15]:
# Swap rows and columns, then renumber the rows from 0.
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
# Rename columns with the project helper; its output reports duplicated column names.
# NOTE(review): the meaning of the positional argument 1 is not visible here.
ech_df=col_ren(ech_df, 1)

[1;31mDouble columns' name found :[0;0m
[('Autre zone suspecte investiguée', 2), ('pH', 3)]


In [16]:
# Drop the duplicated columns reported by the previous cell (project helper).
ech_df=dble_col_drop(ech_df)

column(s) dropped: ['6:Autre zone suspecte investiguée', '25:pH', '30:pH']


In [17]:
# Remove the rows labelled 0-4, then renumber.
ech_df.drop(list(range(5)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
# Remove NaN columns / lines with the project helpers (argument semantics not visible here).
ech_df=na_col_drop(ech_df,2)
ech_df=na_line_drop(ech_df,3)
ech_df.reset_index(drop=True, inplace=True)

Columns dropped :['SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Profondeur minimum de la crépine', 'Profondeur maximum de la crépine', 'Profondeur du fond du piézomètre', 'Profondeur de la nappe/piezo', 'Profondeur de la nappe/chambre visite', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Limpidité (***)', 'Température', 'Conductivité électrique', 'Profondeur de la nappe/sol ', 'PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', 'CE', 'ORP', 'Oxygène dissous', 'PARAMETRES PHYSICO-CHIMIQUES \n(mesures au labo)']



In [18]:
# New names for the remaining columns, applied through the project helper.
name=['ID_ech', 'Periode', 'Emplacement','Date_ech','pH','Temp_ech','Temp_pH']
ech_df=col_ren(ech_df, name=name, mode=1)
# Keep rows with a non-null ID_ech (a NaN is never equal to itself).
ech_df=ech_df.query('ID_ech==ID_ech')
# All samples of this sheet are tagged 'Eau'; the column is inserted at position 1.
ech_df.insert(1,'Type_ech','Eau')

In [19]:
# Swap rows and columns, then renumber the rows from 0.
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [20]:
# Rename columns with the project helper (meaning of the argument 1 not visible here).
an=col_ren(an, 1)

In [21]:
# Remove '<' and '>' characters from text cells.
an.replace(r'<|>','', inplace=True, regex=True)
# Any text cell containing a '-' ANYWHERE is replaced by NaN.
# NOTE(review): the loading cell uses r'-$'; this broader pattern also blanks
# hyphenated text (e.g. textual dates or IDs) - confirm this is intended.
an.replace(r'-',np.nan, inplace=True, regex=True)

In [22]:
# Give stable names to the first column and to a few known headers.
an.rename(columns={an.columns[0]:'ID_ech', 'col_35':'phénanthrène', 'Période ':'Periode', 
                   'Date de prélèvement':'Date_ech'}, inplace=True)

In [23]:
# Rename first, drop afterwards: some columns still carry placeholder names like 'col_xx'.
# The trailing "a".."g" entries are placeholders for the last seven columns,
# which are removed right after the renaming.
name=['ID_ech', 'Periode', 'Emplacement','Date_ech', 'METAUX LOURDS', 'Arsenic', 'Cadmium', 'Chrome', 'Cuivre',
      'Mercure', 'Plomb', 'Nickel', 'Zinc', 
      'CYANURES',"Cyanures (libres)  -  NEN-EN-ISO 14403", "CN_totaux - NEN-EN-ISO 14403", 
      "cyanure (APE)", "cyanure complex - méthode interne ", 
      "thiocyanate - méthode interne", "COMPOSES AROMATIQUES VOLATILS", "Benzène", "Toluène", "Éthylbenzène",
      "Orthoxylène", "Para- et métaxylène", "Xylènes", "Styrène", "PHENOLS", "Phénol", "Indice phénol", 
      "HYDROCARBURES AROMATIQUES POLYCYCLIQUES", "Naphtalène", "Acénaphtylène", "Acénaphtène", "Fluorène", 
      "Phénanthrène", "Anthracène", "Fluoranthène", "Pyrène", "Benzo(a)anthracène", "Chrysène", 
      "Benzo(b)fluoranthène", "Benzo(k)fluoranthène", "Benzo(a)pyrène", "Dibenzo(ah)anthracène", 
      "Benzo(ghi)pérylène", "Indéno(1,2,3-cd)pyrène", "HAP Totaux (16) - EPA", 
      "COMPOSES ORGANOCHLORES VOLATILS", "1,1-Dichloroéthane", "1,2-Dichloroéthane", 
      "1,1-dichloroéthène", "Cis-1,2-dichloroéthène", "Totaux (cis,trans) 1,2-dichloroéthènes", 
      "Trans 1,2-dichloroéthylène", "Dichlorométhane", "1,2-dichloropropane", "Tétrachloroéthylène ", 
      "Tétrachlorométhane", "1,1,1-Trichloroéthane", "1,1,2-Trichloroéthane", "Trichloroéthylène",
      "Chloroforme", "Chlorure de vinyle", "HYDROCARBURES TOTAUX", "fraction aromat. >C6-C7", 
      "fraction aromat. >C7-C8", "fraction aromat. >C8-C10", "fraction aliphat. C5-C6", "fraction aliphat. >C6-C8",
      "fraction aliphat. >C8-C10", "Fraction C5 - C8", "Fraction C8 - C10", "Fraction C10-C12", "Fraction C12-C16",
      "Fraction C16 - C21", "Fraction C21 - C35", "Hydrocarbures totaux C10-C35", "METHYL-TERT-BUTYL-ETHER", "MTBE",
     "a","b","c","d","e","f","g"]

an=col_ren(an, name=name, mode=1)
# Drop the last seven columns (the placeholder-named ones).
an=an.iloc[:,:-7]

In [24]:
# Remove the rows labelled 0-2, then renumber.
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
# Remove NaN columns with the project helper (argument semantics not visible here).
an=na_col_drop(an,1)
# All samples of this sheet are tagged 'Eau'; the column is inserted at position 1.
an.insert(1,'Type_ech','Eau')

Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'Styrène', 'PHENOLS', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOCHLORES VOLATILS', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER', 'MTBE']



In [25]:
# Normalise pollutant column names with the POL_NAMES_MODEL mapping;
# the helper's output lists the names it flags as possible new pollutants.
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'Periode', 'Emplacement', 'Date_ech', 'Cyanures (libres)  -  NEN-EN-ISO 14403', 'CN_totaux - NEN-EN-ISO 14403', 'cyanure (APE)', 'cyanure complex - méthode interne ', 'thiocyanate - méthode interne', 'Tétrachloroéthylène ', 'Fraction C5 - C8', 'Fraction C8 - C10', 'Fraction C16 - C21']


In [26]:
# Outer-merge sample descriptions and analyses on (ID_ech, Date_ech).
# The project helper returns the merged table and a table of conflicting values.
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [27]:
# Every row is tagged as a piezometer; the measure date and the borehole ID
# are copied from the sample date and sample ID.
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']
mdf['ID'] = mdf['ID_ech']

In [28]:
# Split the merged table into one table per kind (keys as in cols_dict) with the project helper.
df_dict = data_slicer(mdf, cols_dict, crit_dict)

borehole: 33 ; measure: 33 ; lithology: 0 ; analysis: 33 ; equipement: 0 ; unknown: 0 ; 


In [29]:
ukw = df_dict['unknown']

# Boreholes: leave out the rows classified as unknown, keep one row per ID,
# then keep rows with a non-null ID (and a non-null X when that column exists).
bh = df_dict['borehole'].drop(index=ukw.index).drop_duplicates(['ID'])
bh_filter = 'ID==ID and X==X' if 'X' in bh.columns else 'ID==ID'
bh = bh.query(bh_filter).reset_index(drop=True)

# Unknown objects: one row per ID, renumbered (done in place on the sliced table).
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']

summary = (f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
           f'equipement: {len(eqp)} ; unknown: {len(ukw)}')
print(summary)

borehole: 10 ; measure: 33 ; lithology: 0 ; analysis: 33 ; equipement: 0 ; unknown: 0


In [30]:
# First sheet of this workbook: its tables become the initial accumulators.
# These are plain name bindings, not copies.
source_an = an
source_bh = bh
source_mes = mes

In [31]:
# Make sure both output folders exist. Creating 'source_merge/' with
# exist_ok=True also creates tmp_dir, and still works when tmp_dir is already
# there but 'source_merge/' is not (previously the sub-folder was only created
# together with tmp_dir, so the source_* exports could fail).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Tables of the current sheet.
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated tables of the whole source (overwritten after each sheet).
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 10 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 33 ; source_mes: 33


* **Sheet : 'Param physico'**

In [32]:
# Output sub-folder for this workbook and prefix used in the exported CSV file names.
tmp_dir= save_dir + 'Siterem_Ext_Pilote/'
sheet='Param_physico'

In [33]:
# Read the 'param. physico' sheet, skipping the first two rows of the sheet.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='param. physico', skiprows=2)
# Remove NaN lines / columns with the project helpers
# (the meaning of their numeric argument is not visible in this file).
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Remove '<' and '>' characters from text cells.
df.replace(r'<|>','', inplace=True, regex=True)
# Text cells ending with '-' are replaced by NaN.
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

8 NaN lines dropped
Columns dropped :['Unnamed: 0', 'Unnamed: 2']

Rows : 53, columns : 77


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=53, min=5, readout=False), IntSlider(value=12…

In [34]:
# Swap rows and columns, then renumber the rows from 0.
df=df.transpose()
df.reset_index(drop=True, inplace=True)

In [35]:
# Rename columns with the project helper; its output reports duplicated column names.
df=col_ren(df, 1)

[1;31mDouble columns' name found :[0;0m
[('Température de prélèvement ', 2), ('Date de prélèvement', 2), ('Autre zone suspecte investiguée', 2), ('Nom échantillon', 2), ('Organoleptique odeur type', 2), ('Organoleptique odeur intensité (**)', 2), ('Oxygène dissous', 2), ('Profondeur de la nappe/chambre visite', 2), ('Période ', 3), ('PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', 2), ('CE', 2), ('ORP', 2), ('Emplacement \n- S : Simulateur \n- HZS : Hors zone simulateur', 2), ('pH', 4), ('Profondeur de la nappe/piezo', 2), ('Profondeur de la nappe/sol ', 2)]


In [36]:
# Remove the rows labelled 0 and 1, then renumber.
df.drop(list(range(2)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [37]:
# Split the sheet by column position: the first 33 columns go to sdf,
# the columns from position 34 on stay in df (position 33 belongs to neither).
# This cell rebinds df, so re-running it alone slices the already-sliced table.
sdf=df.iloc[:,:33]
df=df.iloc[:,34:]

In [38]:
# Drop duplicated columns in both halves (project helper).
df=dble_col_drop(df)
sdf=dble_col_drop(sdf)

column(s) dropped: ['3:Période ']
column(s) dropped: ['6:Autre zone suspecte investiguée', '26:pH', '32:pH']


In [39]:
# Remove NaN lines in both halves (project helper; argument semantics not visible here).
df=na_line_drop(df,1)
sdf=na_line_drop(sdf,1)

41 NaN lines dropped
27 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [40]:
# Remove NaN columns in both halves (project helper; argument semantics not visible here).
df=na_col_drop(df,1)
sdf=na_col_drop(sdf,1)

Columns dropped :['Profondeur de la nappe/chambre visite', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Profondeur de la nappe/sol ', 'PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', "*paramètres n'ont pas été pris en débit continu (dans seau) - peu de débit", 'col_52']

Columns dropped :['SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Profondeur minimum de la crépine', 'Profondeur maximum de la crépine', 'Profondeur du fond du piézomètre', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Limpidité (***)', 'Température', 'Conductivité électrique', 'PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', 'PARAMETRES PHYSICO-CHIMIQUES \n(mesures au labo)']



In [41]:
# New names for the columns of df, applied through the project helper.
name=['ID_ech','Periode','Emplacement','Date_ech','Niv_eau_pz','Long_pz','Temp_ech','pH', 'CE', 'ORP','O_diss']
df=col_ren(df, mode=1, name=name)

In [42]:
# Drop the last column, then rename the remaining ones.
# NOTE: this cell rebinds sdf, so re-running it alone drops one more column.
sdf=sdf.iloc[:,:-1]
# 'Temp_ech' was previously written 'Temp_ech ' (trailing space), which does
# not match the 'Temp_ech' column used by df and by the measure schema.
name=['ID_ech','Periode','Emplacement','Date_ech','Niv_eau_pz','Niv_eau_chbre','pH','Niv_eau_sol','Long_pz',
      'Temp_ech','CE','ORP','O_diss']
sdf=col_ren(sdf, mode=1, name=name)

In [43]:
def _scale_ce(value):
    """Return ``value / 1000`` when ``value`` starts with a digit, otherwise NaN."""
    if pd.isnull(value) or not re.search(r'^\d+', str(value)):
        return np.nan
    # errors='coerce' turns text such as '12 abc' into NaN instead of raising.
    return pd.to_numeric(value, errors='coerce')/1000

# Divide the CE values by 1000 in both tables.
# NOTE: not idempotent - re-running this cell divides the values again.
df['CE']=df['CE'].apply(_scale_ce)
sdf['CE']=sdf['CE'].apply(_scale_ce)

In [44]:
# Line breaks become a space in 'Periode' and are removed everywhere else.
# The column is assigned back explicitly: an in-place replace on the result of
# sdf['Periode'] is a chained operation that may act on a temporary object.
sdf['Periode'] = sdf['Periode'].replace('\n',' ', regex=True)
sdf = sdf.replace('\n','', regex=True)

In [45]:
# Normalise 'Emplacement', strip the '*' markers from 'ID_ech' and turn those
# markers into a remark. Rows are visited through their actual index labels
# (not range(len)), so the loop also works when the index is not 0..n-1.
data=[df, sdf]
for d in data:
    d['Rmq']=''
    for i in d.index:
        e=str(d.loc[i, 'Emplacement'])
        n=str(d.loc[i, 'ID_ech'])   # original ID, still carrying its '*' markers
        d.loc[i,'ID_ech']=n.replace('*', '')

        # re.match anchors at the start: 'S...' -> simulator, 'HZS...' -> outside, anything else -> NaN.
        if re.match('S',e, re.I):
            d.loc[i,'Emplacement']='Simulateur'
        elif re.match('HZS',e, re.I):
            d.loc[i,'Emplacement']='Hors simulateur'
        else:
            d.loc[i,'Emplacement']=np.nan

        # Digits followed by exactly one or exactly two '*' select the remark.
        if re.match(r'\d+\*{1}$',n, re.I):
            d.loc[i,'Rmq']="mesures faites dans un seau (débit non continu ou peu de débit)"
        elif re.match(r'\d+\*{2}$',n, re.I):
            d.loc[i,'Rmq']="mésures faites dans une eau quasi-stagnante (Piezo rempli de sédiment et débit très faible)"

In [46]:
# Tag every sample of both tables as 'Eau'; the column is inserted at position 1.
# insert() raises if the column already exists, so this cell cannot be run twice on the same tables.
df.insert(1, 'Type_ech', 'Eau')
sdf.insert(1, 'Type_ech', 'Eau')

In [47]:
# Outer-merge both halves on ID_ech and keep only the merged table (first returned item);
# the conflict table returned by the helper is ignored here.
ech_df=data_merger(sdf, df, 'outer', 'ID_ech')[0]

In [48]:
# NOTE(review): both helper calls take `df`, not `ech_df`, so the merge result of
# the previous cell is discarded and the first assignment is overwritten by the
# second; only side effects the helpers may have on `df` survive - confirm
# whether `ech_df` was meant to be passed instead. The hard-coded conflict
# resolutions further down were produced with the current behaviour.
ech_df=na_col_drop(df,2)
ech_df=na_line_drop(df,1)
ech_df.reset_index(drop=True, inplace=True)

In [49]:
# Propagate the last known 'Emplacement' downwards into the missing cells.
# ffill() does what the former row loop did, without depending on a `val`
# variable that was undefined (or left over from an earlier run) when the
# first row is empty; leading missing values simply stay missing.
ech_df['Emplacement'] = ech_df['Emplacement'].ffill()

In [50]:
# mdf is another name for the same table as ech_df (no copy is made).
mdf = ech_df

In [51]:
# Convert to datetime. pd.to_datetime replaces astype('datetime64'): casting to
# a unit-less 'datetime64' dtype is rejected by pandas >= 2.0.
mdf['Date_ech'] = pd.to_datetime(mdf['Date_ech'])

In [52]:
# Every row is tagged as a piezometer; the measure date and the borehole ID
# are copied from the sample date and sample ID.
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']
mdf['ID'] = mdf['ID_ech']

In [53]:
# Split the merged table into one table per kind (keys as in cols_dict) with the project helper.
df_dict = data_slicer(mdf, cols_dict, crit_dict)

borehole: 33 ; measure: 33 ; lithology: 0 ; analysis: 33 ; equipement: 0 ; unknown: 0 ; 


In [54]:
ukw = df_dict['unknown']

# Boreholes: leave out the rows classified as unknown, keep one row per ID,
# then keep rows with a non-null ID (and a non-null X when that column exists).
bh = df_dict['borehole'].drop(index=ukw.index).drop_duplicates(['ID'])
bh_filter = 'ID==ID and X==X' if 'X' in bh.columns else 'ID==ID'
bh = bh.query(bh_filter).reset_index(drop=True)

# Unknown objects: one row per ID, renumbered (done in place on the sliced table).
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']

summary = (f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
           f'equipement: {len(eqp)} ; unknown: {len(ukw)}')
print(summary)

borehole: 6 ; measure: 33 ; lithology: 0 ; analysis: 33 ; equipement: 0 ; unknown: 0


In [55]:
# Merge this sheet's boreholes into the accumulated table on ID.
# Known conflict on 'Emplacement' (ext_pilote | simulateur), resolved in the next cell.
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

Conflict values present. Please resolve this manually !


In [56]:
# Resolve the merge conflicts with the project helper: for every conflicting
# row, the 'Emplacement_x' value is selected.
# NOTE(review): presumably the '_x' suffix designates the left table of the merge - confirm in data_validation.
dataset = source_bh
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID', 
                valid_dict={'Emplacement_x':list(conflict_df.index)})

# If a 'level_0' column is present, it replaces any existing 'index' column.
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_bh = dataset

all conflicts have been fixed!


In [57]:
# Convert to datetime before merging on this key. pd.to_datetime replaces
# astype('datetime64'): casting to a unit-less 'datetime64' dtype is rejected
# by pandas >= 2.0.
source_an['Date_ech'] = pd.to_datetime(source_an['Date_ech'])

In [58]:
# Merge this sheet's analyses into the accumulated table on (ID_ech, Date_ech).
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [59]:
# Inspect column dtypes before merging (Date_mes is still an object column at this point).
source_mes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33 entries, 0 to 32
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Periode   33 non-null     object
 1   pH        5 non-null      object
 2   Temp_pH   5 non-null      object
 3   ID        33 non-null     object
 4   ID_ech    33 non-null     object
 5   Date_mes  33 non-null     object
 6   Temp_ech  31 non-null     object
dtypes: object(7)
memory usage: 1.9+ KB


In [60]:
# Convert to datetime before merging on this key. pd.to_datetime replaces
# astype('datetime64'): casting to a unit-less 'datetime64' dtype is rejected
# by pandas >= 2.0.
source_mes['Date_mes'] = pd.to_datetime(source_mes['Date_mes'])

In [61]:
# Merge this sheet's measures into the accumulated table on (ID_ech, Date_mes).
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

Conflict values present. Please resolve this manually !


In [62]:
# Resolve the merge conflicts with the project helper: for every conflicting
# row, the 'Periode_y' and 'pH_y' values are selected.
# NOTE(review): presumably the '_y' suffix designates the right table of the merge - confirm in data_validation.
dataset = source_mes
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID', 
                valid_dict={'Periode_y':list(conflict_df.index), 'pH_y':list(conflict_df.index)})

# If a 'level_0' column is present, it replaces any existing 'index' column.
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_mes = dataset

all conflicts have been fixed!


In [63]:
# Make sure both output folders exist. Creating 'source_merge/' with
# exist_ok=True also creates tmp_dir, and still works when tmp_dir is already
# there but 'source_merge/' is not (previously the sub-folder was only created
# together with tmp_dir, so the source_* exports could fail).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Tables of the current sheet.
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated tables of the whole source (overwritten after each sheet).
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 10 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 43 ; source_mes: 43


* **Sheet : 'Inorganiques et composés majeurs'**

In [64]:
# Output sub-folder for this workbook and prefix used in the exported CSV file names.
tmp_dir= save_dir + 'Siterem_Ext_Pilote/'
sheet='Inorganic_major'

In [65]:
# Read the 'inorganiques et composés majeur' sheet, skipping the first two rows of the sheet.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='inorganiques et composés majeur', skiprows=2)
# Remove NaN lines / columns with the project helpers
# (the meaning of their numeric argument is not visible in this file).
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Remove '<' and '>' characters from text cells.
df.replace(r'<|>','', inplace=True, regex=True)
# Text cells ending with '-' are replaced by NaN.
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

3 NaN lines dropped
Columns dropped :['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34']

Rows : 68, columns : 28


interactive(children=(IntSlider(value=5, description='rows', max=68, min=5, readout=False), IntSlider(value=12…

In [66]:
# Sample-description block: rows up to label 21 (.loc slices by label, end included).
ech_df=df.loc[:21]
# Analysis block: rows labelled 0-3 plus rows labelled 22 and above.
# assumes row labels run from 0 to len(df)-1 - TODO confirm after na_line_drop
an=df.loc[list(range(0,4))+list(range(22, len(df)))]

In [67]:
# Order rows by label, then renumber them from 0.
an = an.sort_index().reset_index(drop=True)

In [68]:
# Swap rows and columns, then renumber the rows from 0.
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
# Rename columns with the project helper; its output reports duplicated column names.
ech_df=col_ren(ech_df, 1)

[1;31mDouble columns' name found :[0;0m
[('Autre zone suspecte investiguée', 2)]


In [69]:
# Drop the duplicated columns reported by the previous cell (project helper).
ech_df=dble_col_drop(ech_df)

column(s) dropped: ['6:Autre zone suspecte investiguée']


In [70]:
# Remove the rows labelled 0 and 1, then renumber.
ech_df.drop(list(range(2)), axis=0, inplace=True)
ech_df.reset_index(drop=True, inplace=True)
# Remove NaN columns / lines with the project helpers (argument semantics not visible here).
ech_df=na_col_drop(ech_df,2)
ech_df=na_line_drop(ech_df,2)
ech_df.reset_index(drop=True, inplace=True)

Columns dropped :['SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Profondeur minimum de la crépine', 'Profondeur maximum de la crépine', 'Profondeur du fond du piézomètre', 'Profondeur de la nappe', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Limpidité (***)', 'Température', 'Conductivité électrique', 'pH']

1 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [71]:
# New names for the remaining columns, applied through the project helper.
name=['ID_ech', 'Periode', 'Emplacement','Date_ech','Temp_ech']
ech_df=col_ren(ech_df, name=name, mode=1)
# Keep rows with a non-null ID_ech (a NaN is never equal to itself).
ech_df=ech_df.query('ID_ech==ID_ech')
# All samples of this sheet are tagged 'Eau'; the column is inserted at position 1.
ech_df.insert(1,'Type_ech','Eau')

In [72]:
# Swap rows and columns, then renumber the rows from 0.
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [73]:
# Rename columns with the project helper; its output reports duplicated column names.
an=col_ren(an, 1)

[1;31mDouble columns' name found :[0;0m
[('nitrate', 2), ('nitrite', 2), ('ammonium', 2)]


In [74]:
# Remove '<' and '>' characters from text cells.
an.replace(r'<|>','', inplace=True, regex=True)
# Any text cell containing a '-' ANYWHERE is replaced by NaN.
# NOTE(review): the loading cell uses r'-$'; this broader pattern also blanks
# hyphenated text (e.g. textual dates or IDs) - confirm this is intended.
an.replace(r'-',np.nan, inplace=True, regex=True)

In [75]:
# Drop the duplicated columns (nitrate, nitrite, ammonium) with the project helper.
an=dble_col_drop(an)

column(s) dropped: ['11:ammonium', '15:nitrite', '17:nitrate']


In [76]:
# Remove NaN columns with the project helper (meaning of the argument 3 not visible here).
an=na_col_drop(an,3)

Columns dropped :['CARBONE ORGANIQUE', 'DEMANDE EN O2', 'COMPOSES AZOTES', 'COMPOSES SOUFRES ', 'ELEMENTS MAJEURS', 'AUTRES ANALYSES', 'cyanure (libre)', 'METHYL-TERT-BUTYL-ETHER', 'MTBE', 'Teneur mesurée ', 'Teneur mesurée', 'VS : Valeur seuil', "(*) RP : Rapport de prélèvements; ES : Etude de sol; EO : Etude d'orientation; EC : Etude de caractérisation; SA : Suivi d'assainissement; EF : Evaluation finale", "(**) - : Pas d'impression organoleptique; + : Impression organoleptique faible; ++ : Impression organoleptique forte", '(***) + : Limpide; - : Trouble; -- : Opaque', "Le contenu des tableaux est conforme au modèle repris à l'annexe IX du GREO V03. Le formalisme a été adapté par SITEREM tout en garantissant la lisibilité du document imprimé. "]



In [77]:
# Give stable names to the first column and to a few known headers
# (two source headers carry a trailing space).
an.rename(columns={an.columns[0]:'ID_ech','ammoniaque - libre':'ammoniaque libre','Période ':'Periode', 
                   'Date de prélèvement':'Date_ech', 'Emplacement ':'Emplacement'}, inplace=True)

In [78]:
# Remove the rows labelled 0 and 1, then renumber.
an.drop(list(range(2)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
# Remove NaN columns with the project helper (argument semantics not visible here).
an=na_col_drop(an,1)
# All samples of this sheet are tagged 'Eau'; the column is inserted at position 1.
an.insert(1,'Type_ech','Eau')

In [79]:
# Normalise pollutant column names with the POL_NAMES_MODEL mapping;
# the helper's output lists the names it flags as possible new pollutants.
an = col_ren(an, name=POL_NAMES_MODEL, mode=1)#,verbose=True)


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'Periode', 'Emplacement', 'Date_ech', 'sulfures (libre)', 'sulfites', 'sulfate', 'fer (Fe) total', 'fluorures']


In [80]:
# Outer-merge sample descriptions and analyses on (ID_ech, Date_ech).
# The project helper returns the merged table and a table of conflicting values.
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [81]:
# Convert to datetime. pd.to_datetime replaces astype('datetime64'): casting to
# a unit-less 'datetime64' dtype is rejected by pandas >= 2.0.
mdf['Date_ech'] = pd.to_datetime(mdf['Date_ech'])

In [82]:
# Every row is tagged as a piezometer; the measure date and the borehole ID
# are copied from the sample date and sample ID.
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']
mdf['ID'] = mdf['ID_ech']

In [83]:
# Split the merged table into one table per kind (keys as in cols_dict) with the project helper.
df_dict = data_slicer(mdf, cols_dict, crit_dict)

borehole: 25 ; measure: 25 ; lithology: 0 ; analysis: 25 ; equipement: 0 ; unknown: 0 ; 


In [84]:
ukw = df_dict['unknown']

# Boreholes: leave out the rows classified as unknown, keep one row per ID,
# then keep rows with a non-null ID (and a non-null X when that column exists).
bh = df_dict['borehole'].drop(index=ukw.index).drop_duplicates(['ID'])
bh_filter = 'ID==ID and X==X' if 'X' in bh.columns else 'ID==ID'
bh = bh.query(bh_filter).reset_index(drop=True)

# Unknown objects: one row per ID, renumbered (done in place on the sliced table).
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

mes, an = df_dict['measure'], df_dict['analysis']
litho, eqp = df_dict['lithology'], df_dict['equipement']

summary = (f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; '
           f'equipement: {len(eqp)} ; unknown: {len(ukw)}')
print(summary)

borehole: 7 ; measure: 25 ; lithology: 0 ; analysis: 25 ; equipement: 0 ; unknown: 0


In [85]:
# Merge this sheet's boreholes into the accumulated table on ID.
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [86]:
# Merge this sheet's analyses into the accumulated table on (ID_ech, Date_ech).
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [87]:
# Merge this sheet's measures into the accumulated table on (ID_ech, Date_mes).
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

Conflict values present. Please resolve this manually !


In [88]:
# Resolve the merge conflicts with the project helper: for every conflicting
# row, the 'Periode_x' value is selected.
# NOTE(review): presumably the '_x' suffix designates the left table of the merge - confirm in data_validation.
dataset = source_mes
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', pass_col='ID', 
                valid_dict={'Periode_x':list(conflict_df.index)})

# If a 'level_0' column is present, it replaces any existing 'index' column.
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)
source_mes = dataset

all conflicts have been fixed!


In [89]:
# Make sure both output folders exist. Creating 'source_merge/' with
# exist_ok=True also creates tmp_dir, and still works when tmp_dir is already
# there but 'source_merge/' is not (previously the sub-folder was only created
# together with tmp_dir, so the source_* exports could fail).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

# Tables of the current sheet.
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Accumulated tables of the whole source (overwritten after each sheet).
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 10 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 44 ; source_mes: 44


#### ======================================================================================

In [90]:
# Initialise the per-source accumulators before processing a new source file.
# Every name gets its own empty DataFrame: binding all six names to one shared object would let an
# in-place operation on one accumulator silently show up in the five others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df.copy(), _df.copy(), _df.copy()
source_litho, source_an, source_mes = _df.copy(), _df.copy(), _df.copy()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 0 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 0


## 12-Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx
* **Sheet : 'Résult EAU'**

In [91]:
tmp_dir= save_dir + 'Siterem_Pilote/'
sheet='Result_eau'

In [92]:
# Load the 'Résult EAU' sheet, skipping the first two spreadsheet rows.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='Résult EAU', skiprows=2)
# Project helpers; judging by their printed output they drop NaN lines / columns
# (the meaning of the numeric argument is not visible here — TODO confirm).
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Remove the '<' and '>' characters from string cells.
df.replace(r'<|>','', inplace=True, regex=True)
# Set string cells matching r'-$' to NaN.
# NOTE(review): the pattern is only anchored at the end, so any string ending in '-' is blanked,
# not just a lone '-' — confirm this is intended.
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

3 NaN lines dropped
Columns dropped :['Unnamed: 0', 'Unnamed: 2']

Rows : 117, columns : 91


  warn(msg)


interactive(children=(IntSlider(value=5, description='rows', max=117, min=5, readout=False), IntSlider(value=1…

In [93]:
# Split the sheet by row label: rows up to and including 32 go to ech_df; rows 0-3 plus rows 33
# onwards go to `an`.
# NOTE(review): 32/33 are hard-coded for this workbook's layout; presumably the sample-description
# rows come first and the analysis rows follow — confirm if the workbook changes.
ech_df=df.loc[:32]
an=df.loc[list(range(0,4))+list(range(33, len(df)))]

In [94]:
an = an.sort_index().reset_index(drop=True)

In [95]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

[1;31mDouble columns' name found :[0;0m
[('Autre zone suspecte investiguée', 2), ('pH', 3)]


In [96]:
ech_df=dble_col_drop(ech_df)

column(s) dropped: ['6:Autre zone suspecte investiguée', '25:pH', '31:pH']


In [97]:
# Remove the first three rows (labels 0-2), renumber, then let the project helpers discard
# empty columns and lines before renumbering once more.
ech_df = ech_df.drop(index=[0, 1, 2]).reset_index(drop=True)
ech_df = na_line_drop(na_col_drop(ech_df, 2), 3)
ech_df = ech_df.reset_index(drop=True)

Columns dropped :['SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Profondeur minimum de la crépine', 'Profondeur maximum de la crépine', 'Profondeur du fond du piézomètre', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Limpidité (***)', 'Température', 'Conductivité électrique', 'Profondeur de la nappe/sol ', 'PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', 'PARAMETRES PHYSICO-CHIMIQUES \n(mesures au labo)']



In [98]:
ech_df.columns

Index(['Nom échantillon', 'Période ',
       'Emplacement \n- P : Pilote \n- HZP : Hors zone pilote',
       'Date de prélèvement', 'Profondeur de la nappe/piezo',
       'Profondeur de la nappe/chambre visite', 'pH',
       'Température de prélèvement ', 'CE', 'ORP', 'Oxygène dissous', 'col_29',
       'température pour mes. pH'],
      dtype='object')

In [99]:
# Target column names handed to the project helper col_ren (same order as ech_df.columns shown above).
name=['ID_ech', 'Periode', 'Emplacement','Date_ech','Niv_eau_pz','Niv_eau_chbre','pH','Temp_ech','CE','ORP',
      'O_diss','col_29','Temp_pH']
ech_df=col_ren(ech_df, name=name, mode=1)
# NaN never equals itself, so this keeps only the rows whose ID_ech is not NaN.
ech_df=ech_df.query('ID_ech==ID_ech')
# Add a constant 'Type_ech' column (value 'Eau') at position 1.
ech_df.insert(1,'Type_ech','Eau')

In [100]:
# Discard the placeholder column 'col_29'.
ech_df.drop(columns=['col_29'], inplace=True)
# Divide CE by 1000 for non-null values whose text starts with a digit; everything else becomes NaN
# (presumably a unit conversion — TODO confirm the units).
# The pattern is a raw string: '\d' inside a plain literal is an invalid escape sequence.
ech_df['CE']=ech_df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [101]:
# In 'Periode' a line break becomes a space; in every other column line breaks are removed.
# The result is assigned back to the column: an in-place replace on the selected Series
# (`ech_df['Periode'].replace(..., inplace=True)`) is a chained operation that is not guaranteed
# to write through to ech_df.
ech_df['Periode'] = ech_df['Periode'].replace('\n',' ', regex=True)
ech_df.replace('\n','', regex=True, inplace=True)

In [102]:
# Normalise the location codes (case-insensitive, tested in this order):
#   value starts with 'P'   -> 'Pilote'
#   value starts with 'HZP' -> 'Hors Pilote'
#   anything else           -> NaN
# The column is rewritten from its values instead of using `.loc[i]` with i in range(len(...)):
# those integers are labels, so the old loop raised KeyError (or appended rows) as soon as the
# index was not exactly 0..n-1, e.g. after rows were filtered out without a reset_index.
# NOTE: re-running the cell turns 'Hors Pilote' into NaN (it matches neither pattern).
emplacement_rules = [('P', 'Pilote'), ('HZP', 'Hors Pilote')]
data=[ech_df]
for d in data:
    d['Emplacement'] = pd.Series(
        [next((label for pattern, label in emplacement_rules if re.match(pattern, str(e), re.I)), np.nan)
         for e in d['Emplacement']],
        index=d.index, dtype=object)

In [103]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [104]:
an=col_ren(an, 1)

In [105]:
# Remove the '<' and '>' characters from string cells.
an.replace(r'<|>','', inplace=True, regex=True)
# NOTE(review): unanchored '-' — any string cell that merely contains a hyphen matches and is set
# to NaN (the loading cell uses r'-$'). Confirm that no identifier or value containing a hyphen
# has to survive this step.
an.replace(r'-',np.nan, inplace=True, regex=True)

In [106]:
an=dble_col_drop(an)

column(s) dropped: []


In [107]:
# Give the first column and the sample-description columns their short names.
# Labels that are absent from an.columns are ignored by rename (default errors='ignore').
an.rename(columns={an.columns[0]:'ID_ech','Période ':'Periode', 'Emplacement \n- P : Pilote \n- HZP : Hors zone pilote':'Emplacement',
                  'Date de prélèvement':'Date_ech'}, inplace=True)

In [108]:
# Rename first, drop columns afterwards (because of names like 'col_xx' in columns).
# The list mixes sample-description names, section headers (upper-case entries) and pollutant names.
# The eight trailing single-letter entries ("a".."h") are placeholders for the last eight columns,
# which the following cell removes by position (an.iloc[:,:-8]).
name=['ID_ech', 'Periode', 'Emplacement', 'Date_ech', 'METAUX LOURDS', 'Arsenic', 'Cadmium', 'Chrome', 'Cuivre','Mercure', 'Plomb', 'Nickel', 'Zinc', 
      'CYANURES',"Cyanures (libres)  -  NEN-EN-ISO 14403", "CN_totaux - NEN-EN-ISO 14403", 
      "cyanure (APE)", "cyanure complex", 
      "thiocyanate - méthode interne", "COMPOSES AROMATIQUES VOLATILS", "Benzène", "Toluène", "Éthylbenzène",
      "Orthoxylène", "Para- et métaxylène", "Xylènes", "Styrène", "PHENOLS", "Phénol", "Indice phénol", 
      "HYDROCARBURES AROMATIQUES POLYCYCLIQUES", "Naphtalène", "Acénaphtylène", "Acénaphtène", "Fluorène", 
      "Phénanthrène", "Anthracène", "Fluoranthène", "Pyrène", "Benzo(a)anthracène", "Chrysène", 
      "Benzo(b)fluoranthène", "Benzo(k)fluoranthène", "Benzo(a)pyrène", "Dibenzo(ah)anthracène", 
      "Benzo(ghi)pérylène", "Indéno(1,2,3-cd)pyrène", "HAP Totaux (16) - EPA", 
      "COMPOSES ORGANOCHLORES VOLATILS", "1,1-Dichloroéthane", "1,2-Dichloroéthane", 
      "1,1-dichloroéthène", "Cis-1,2-dichloroéthène", "Totaux (cis,trans) 1,2-dichloroéthènes", 
      "Trans 1,2-dichloroéthylène", "Dichlorométhane", "1,2-dichloropropane", "Tétrachloroéthylène ", 
      "Tétrachlorométhane", "1,1,1-Trichloroéthane", "1,1,2-Trichloroéthane", "Trichloroéthylène",
      "Chloroforme", "Chlorure de vinyle", "HYDROCARBURES TOTAUX", "fraction aromat. >C6-C7", 
      "fraction aromat. >C7-C8", "fraction aromat. >C8-C10", "fraction aliphat. C5-C6", "fraction aliphat. >C6-C8",
      "fraction aliphat. >C8-C10", "Fraction C5 - C8", "Fraction C8 - C10", "Fraction C10-C12", "Fraction C12-C16",
      "Fraction C16 - C21", "Fraction C21 - C35", "Hydrocarbures totaux C10-C35", "METHYL-TERT-BUTYL-ETHER", "MTBE",
     "a","b","c","d","e","f","g","h"]

In [109]:
# Apply the name list through the project helper col_ren, then drop the last eight columns
# by position (the ones labelled "a".."h" in the name list).
an=col_ren(an, name=name, mode=1)
an=an.iloc[:,:-8]

In [110]:
# Remove the first three rows (labels 0-2), renumber, let the project helper drop empty columns,
# then add a constant 'Type_ech' column ('Eau') at position 1.
an = an.drop(index=[0, 1, 2]).reset_index(drop=True)
an = na_col_drop(an, 1)
an.insert(1, 'Type_ech', 'Eau')

Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'PHENOLS', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOCHLORES VOLATILS', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER', 'MTBE']



In [111]:
# In 'Periode' a line break becomes a space; in every other column line breaks are removed.
# The result is assigned back to the column: an in-place replace on the selected Series is a
# chained operation that is not guaranteed to write through to `an`.
an['Periode'] = an['Periode'].replace('\n',' ', regex=True)
an.replace('\n','', regex=True, inplace=True)

In [112]:
# Normalise the location codes (case-insensitive, tested in this order):
#   value starts with 'P'   -> 'Pilote'
#   value starts with 'HZP' -> 'Hors Pilote'
#   anything else           -> NaN
# The column is rewritten from its values instead of using `.loc[i]` with i in range(len(...)):
# those integers are labels, so the old loop only worked while the index was exactly 0..n-1.
# NOTE: re-running the cell turns 'Hors Pilote' into NaN (it matches neither pattern).
emplacement_rules = [('P', 'Pilote'), ('HZP', 'Hors Pilote')]
data=[an]
for d in data:
    d['Emplacement'] = pd.Series(
        [next((label for pattern, label in emplacement_rules if re.match(pattern, str(e), re.I)), np.nan)
         for e in d['Emplacement']],
        index=d.index, dtype=object)

In [113]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'Periode', 'Emplacement', 'Date_ech', 'Cyanures (libres)  -  NEN-EN-ISO 14403', 'CN_totaux - NEN-EN-ISO 14403', 'cyanure (APE)', 'thiocyanate - méthode interne', 'Tétrachloroéthylène ', 'Fraction C5 - C8', 'Fraction C8 - C10', 'Fraction C16 - C21']


In [114]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [115]:
# Parse the sampling dates. pd.to_datetime replaces astype('datetime64'): casting to the
# unit-less 'datetime64' dtype is rejected by pandas 2.x.
mdf['Date_ech'] = pd.to_datetime(mdf['Date_ech'])

In [116]:
# Add the constant 'Type' column and duplicate the sample date / sample id under the
# 'Date_mes' / 'ID' names.
mdf = mdf.assign(Type='Piezo', Date_mes=mdf['Date_ech'], ID=mdf['ID_ech'])

In [117]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

borehole: 87 ; measure: 87 ; lithology: 0 ; analysis: 87 ; equipement: 0 ; unknown: 0 ; 

[1;32mNot used columns:[0;0m
 ['Niv_eau_chbre']


In [118]:
# 'unknown' rows are removed from the borehole table, then de-duplicated on ID and renumbered
# (in place, so df_dict['unknown'] itself is modified).
ukw = df_dict['unknown']
bh = df_dict['borehole'].drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

# Boreholes: first row per ID, ID not NaN (and X not NaN when that column exists).
bh = bh.drop_duplicates(['ID'])
bh_filter = 'ID==ID and X==X' if 'X' in bh.columns else 'ID==ID'
bh = bh.query(bh_filter).reset_index(drop=True)

# The remaining tables are used exactly as data_slicer returned them.
mes, an, litho, eqp = (df_dict[key] for key in ('measure', 'analysis', 'lithology', 'equipement'))

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 16 ; measure: 87 ; lithology: 0 ; analysis: 87 ; equipement: 0 ; unknown: 0


In [119]:
source_bh = bh
source_mes = mes
source_an = an

In [120]:
# Create the output folders for this sheet.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/' when tmp_dir already
# exists; the former `if not os.path.exists(tmp_dir)` guard skipped it in that case and the
# source_* writes below then failed.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)  # creates tmp_dir as well

# Per-sheet tables.
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Tables accumulated over all sheets of the current source file.
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 16 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 87 ; source_mes: 87


* **Sheet : 'Param physico'**

In [121]:
tmp_dir= save_dir + 'Siterem_Pilote/'
sheet='Param_physico'

In [122]:
# Load the 'param. physico' sheet, skipping the first two spreadsheet rows.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='param. physico', skiprows=2)
# Project helpers; judging by their printed output they drop NaN lines / columns
# (the meaning of the numeric argument is not visible here — TODO confirm).
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Remove the '<' and '>' characters from string cells.
df.replace(r'<|>','', inplace=True, regex=True)
# Set string cells matching r'-$' to NaN.
# NOTE(review): the pattern is only anchored at the end, so any string ending in '-' is blanked,
# not just a lone '-' — confirm this is intended.
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

7 NaN lines dropped
Columns dropped :['Unnamed: 0', 'Unnamed: 2']

Rows : 52, columns : 92


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=52, min=5, readout=False), IntSlider(value=12…

In [123]:
df=df.transpose()
df.reset_index(drop=True, inplace=True)

In [124]:
df=col_ren(df, 1)

[1;31mDouble columns' name found :[0;0m
[('Température de prélèvement ', 2), ('PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', 2), ('Date de prélèvement', 2), ('Autre zone suspecte investiguée', 2), ('Nom échantillon', 2), ('Période ', 2), ('Organoleptique odeur type', 2), ('Organoleptique odeur intensité (**)', 2), ('Oxygène dissous', 2), ('Profondeur piezo', 2), ('CE', 2), ('ORP', 2), ('pH', 4), ('Profondeur de la nappe/piezo', 2), ('Profondeur de la nappe/sol ', 2)]


In [125]:
df.drop(list(range(2)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [126]:
# Split by column position: columns 0-32 go to sdf, columns 34 onwards stay in df
# (column 33 is left out of both — NOTE(review): presumably a separator column, confirm).
# .copy() gives each part its own data, so the in-place drops performed by the later helper
# calls no longer act on a slice of the original frame (SettingWithCopyWarning in the output).
sdf = df.iloc[:, :33].copy()
df = df.iloc[:, 34:].copy()

In [127]:
df=dble_col_drop(df)
sdf=dble_col_drop(sdf)

column(s) dropped: []
column(s) dropped: ['6:Autre zone suspecte investiguée', '25:pH', '31:pH']


In [128]:
df=na_line_drop(df,1)
sdf=na_line_drop(sdf,1)

83 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [129]:
df=na_col_drop(df,1)
sdf=na_col_drop(sdf,1)

Columns dropped :['Profondeur de la nappe/chambre visite', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Profondeur de la nappe/sol ', 'PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', 'col_51']

Columns dropped :['SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Profondeur minimum de la crépine', 'Profondeur maximum de la crépine', 'Profondeur du fond du piézomètre', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Limpidité (***)', 'Température', 'Conductivité électrique', 'PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', 'PARAMETRES PHYSICO-CHIMIQUES \n(mesures au labo)']



In [130]:
df=df.iloc[:,:-1]
name=['ID_ech','Periode','Emplacement','Date_ech','Niv_eau_pz','Long_pz','Temp_ech','pH', 'CE', 'ORP','O_diss']
df=col_ren(df, mode=1, name=name)

In [131]:
sdf.drop(columns=['col_29'], inplace=True)
name=['ID_ech','Periode','Emplacement','Date_ech','Niv_eau_pz','Long_pz','pH','Niv_eau_sol','Temp_ech','CE',
      'ORP','O_diss','Temp_pH']
sdf=col_ren(sdf, mode=1, name=name)

In [132]:
# Divide CE by 1000 for non-null values whose text starts with a digit; everything else becomes NaN
# (presumably a unit conversion — TODO confirm the units).
# The patterns are raw strings: '\d' inside a plain literal is an invalid escape sequence.
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)
sdf['CE']=sdf['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search(r'^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [133]:
# In 'Periode' a line break becomes a space; in every other column line breaks are removed.
# The result is assigned back to the column: an in-place replace on the selected Series is a
# chained operation that is not guaranteed to write through to sdf.
sdf['Periode'] = sdf['Periode'].replace('\n',' ', regex=True)
sdf.replace('\n','', regex=True, inplace=True)
# 'Niv_eau_sol' is not kept for the merge.
sdf.drop(columns=["Niv_eau_sol"], inplace=True)

In [134]:
set(sdf['Emplacement'])

{'HZP', 'P'}

In [135]:
# Normalise the location codes of both tables (case-insensitive, tested in this order):
#   value starts with 'P'   -> 'Pilote'
#   value starts with 'HZP' -> 'Hors Pilote'
#   anything else           -> NaN
# The column is rewritten from its values instead of using `.loc[i]` with i in range(len(...)):
# those integers are labels, so the old loop only worked while the index was exactly 0..n-1.
# NOTE: re-running the cell turns 'Hors Pilote' into NaN (it matches neither pattern).
emplacement_rules = [('P', 'Pilote'), ('HZP', 'Hors Pilote')]
data=[df, sdf]
for d in data:
    d['Emplacement'] = pd.Series(
        [next((label for pattern, label in emplacement_rules if re.match(pattern, str(e), re.I)), np.nan)
         for e in d['Emplacement']],
        index=d.index, dtype=object)

In [136]:
# Set string cells containing '*' or 'à compléter' to NaN.
# Raw string: '\*' inside a plain literal is an invalid escape sequence.
df.replace(r'\*|à compléter',np.nan, inplace=True, regex=True)

In [137]:
mdf, conflict_df = data_merger(sdf, df, 'outer', 'ID_ech')

In [138]:
# Parse the sampling dates. pd.to_datetime replaces astype('datetime64'): casting to the
# unit-less 'datetime64' dtype is rejected by pandas 2.x.
mdf['Date_ech'] = pd.to_datetime(mdf['Date_ech'])

In [139]:
mdf = na_col_drop(mdf, 3)

Columns dropped :['ID']



In [140]:
# Add the constant 'Type' column, then expose the sample date / sample id under the
# 'Date_mes' / 'ID' names.
mdf = mdf.assign(Type='Piezo').rename(columns={'Date_ech':'Date_mes', 'ID_ech':'ID'})

In [141]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

borehole: 95 ; measure: 95 ; lithology: 0 ; analysis: 0 ; equipement: 0 ; unknown: 0 ; 


In [142]:
# 'unknown' rows are removed from the borehole table, then de-duplicated on ID and renumbered
# (in place, so df_dict['unknown'] itself is modified).
ukw = df_dict['unknown']
bh = df_dict['borehole'].drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

# Boreholes: first row per ID, ID not NaN (and X not NaN when that column exists).
bh = bh.drop_duplicates(['ID'])
bh_filter = 'ID==ID and X==X' if 'X' in bh.columns else 'ID==ID'
bh = bh.query(bh_filter).reset_index(drop=True)

# The remaining tables are used exactly as data_slicer returned them.
mes, an, litho, eqp = (df_dict[key] for key in ('measure', 'analysis', 'lithology', 'equipement'))

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 17 ; measure: 95 ; lithology: 0 ; analysis: 0 ; equipement: 0 ; unknown: 0


In [143]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [144]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

Conflict values present. Please resolve this manually !


In [145]:
# Resolve the conflicts reported by the previous merge of the measures table.
# data_validation is a project helper; valid_dict presumably names, per column, the conflicting
# rows for which that column's value is accepted — TODO confirm against utils.io.
dataset = source_mes
data_validation(
    overall_data=dataset,
    conflict_data=conflict_df,
    index_col='index',
    pass_col='ID',
    valid_dict={'Periode_y': list(conflict_df.index)},
)

# When a 'level_0' column is present it takes over the 'index' name (any existing 'index' is dropped).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)
source_mes = dataset

all conflicts have been fixed!


In [146]:
# Create the output folders for this sheet.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/' when tmp_dir already
# exists; the former `if not os.path.exists(tmp_dir)` guard skipped it in that case and the
# source_* writes below then failed.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)  # creates tmp_dir as well

# Per-sheet tables (this sheet yields no analysis table, so the Samples export stays disabled).
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Tables accumulated over all sheets of the current source file.
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
#source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 21 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 87 ; source_mes: 127


* **Sheet : 'Inorganiques et composés majeurs'**

In [147]:
tmp_dir= save_dir + 'Siterem_Pilote/'
sheet='Inorganic_major'

In [148]:
# Load the 'inorganiques et composés majeur' sheet, skipping the first two spreadsheet rows.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='inorganiques et composés majeur', skiprows=2)
# Project helpers; judging by their printed output they drop NaN lines / columns
# (the meaning of the numeric argument is not visible here — TODO confirm).
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Remove the '<' and '>' characters from string cells.
df.replace(r'<|>','', inplace=True, regex=True)
# Set string cells matching r'-$' to NaN.
# NOTE(review): the pattern is only anchored at the end, so any string ending in '-' is blanked,
# not just a lone '-' — confirm this is intended.
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

4 NaN lines dropped
Columns dropped :['Unnamed: 0', 'Unnamed: 2']

Rows : 68, columns : 54


interactive(children=(IntSlider(value=5, description='rows', max=68, min=5, readout=False), IntSlider(value=12…

In [149]:
ech_df=df.loc[:21]
an=df.loc[list(range(0,4))+list(range(22, len(df)))]

In [150]:
an = an.sort_index().reset_index(drop=True)

In [151]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

[1;31mDouble columns' name found :[0;0m
[('Autre zone suspecte investiguée', 2)]


In [152]:
ech_df=dble_col_drop(ech_df)

column(s) dropped: ['6:Autre zone suspecte investiguée']


In [153]:
# Remove the first two rows (labels 0-1), renumber, then let the project helpers discard
# empty columns and lines before renumbering once more.
ech_df = ech_df.drop(index=[0, 1]).reset_index(drop=True)
ech_df = na_line_drop(na_col_drop(ech_df, 2), 2)
ech_df = ech_df.reset_index(drop=True)

Columns dropped :['SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Profondeur minimum de la crépine', 'Profondeur maximum de la crépine', 'Profondeur du fond du piézomètre', 'Profondeur de la nappe', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Limpidité (***)', 'Température', 'Conductivité électrique', 'pH']



In [154]:
name=['ID_ech', 'Periode', 'Emplacement','Date_ech','Temp_ech']
ech_df.replace(r'\n',' ', inplace=True, regex=True)
ech_df=col_ren(ech_df, name=name, mode=1)
ech_df=ech_df.query('ID_ech==ID_ech')
ech_df.insert(1,'Type_ech','Eau')

In [155]:
dataframe_viewer(ech_df, rows=3)

Rows : 51, columns : 6


interactive(children=(IntSlider(value=3, description='rows', max=51, min=3, readout=False), IntSlider(value=6,…

In [156]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [157]:
an=col_ren(an, 1)

[1;31mDouble columns' name found :[0;0m
[('nitrate', 2), ('nitrite', 2), ('ammonium', 2)]


In [158]:
# Remove the '<' and '>' characters from string cells.
an.replace(r'<|>','', inplace=True, regex=True)
# NOTE(review): unanchored '-' — any string cell that merely contains a hyphen matches and is set
# to NaN (the loading cell uses r'-$'). Confirm that no identifier or value containing a hyphen
# has to survive this step.
an.replace(r'-',np.nan, inplace=True, regex=True)
# The first column holds the sample identifiers.
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [159]:
an=dble_col_drop(an)

column(s) dropped: ['11:ammonium', '15:nitrite', '17:nitrate']


In [160]:
an=na_col_drop(an,3)

Columns dropped :['CARBONE ORGANIQUE', 'DEMANDE EN O2', 'COMPOSES AZOTES', 'COMPOSES SOUFRES ', 'ELEMENTS MAJEURS', 'AUTRES ANALYSES', 'cyanure (libre)', 'METHYL-TERT-BUTYL-ETHER', 'MTBE', 'Teneur mesurée ', 'Teneur mesurée', 'VS : Valeur seuil', "(*) RP : Rapport de prélèvements; ES : Etude de sol; EO : Etude d'orientation; EC : Etude de caractérisation; SA : Suivi d'assainissement; EF : Evaluation finale", "(**) - : Pas d'impression organoleptique; + : Impression organoleptique faible; ++ : Impression organoleptique forte", '(***) + : Limpide; - : Trouble; -- : Opaque', "Le contenu des tableaux est conforme au modèle repris à l'annexe IX du GREO V03. Le formalisme a été adapté par SITEREM tout en garantissant la lisibilité du document imprimé. "]



In [161]:
an.rename(columns={'Période ':'Periode', 'Emplacement \n- S : Simulateur \n- HZS : Hors zone simulateur':'Emplacement',
                  'Date de prélèvement':'Date_ech', 'col_9':'ammoniaque libre'}, inplace=True)

In [162]:
# Remove the first two rows (labels 0-1), renumber, let the project helper drop empty columns,
# then add a constant 'Type_ech' column ('Eau') at position 1.
an = an.drop(index=[0, 1]).reset_index(drop=True)
an = na_col_drop(an, 1)
an.insert(1, 'Type_ech', 'Eau')

In [163]:
# Normalise the location codes of both tables (case-insensitive, tested in this order):
#   value starts with 'S'   -> 'Simulateur'
#   value starts with 'HZS' -> 'Hors simulateur'
#   anything else           -> NaN
# The column is rewritten from its values instead of using `.loc[i]` with i in range(len(...)):
# those integers are labels, so the old loop raised KeyError (or appended rows) as soon as the
# index was not exactly 0..n-1, e.g. after rows were filtered out without a reset_index.
# NOTE: re-running the cell turns 'Hors simulateur' into NaN (it matches neither pattern).
emplacement_rules = [('S', 'Simulateur'), ('HZS', 'Hors simulateur')]
data=[ech_df, an]
for d in data:
    d['Emplacement'] = pd.Series(
        [next((label for pattern, label in emplacement_rules if re.match(pattern, str(e), re.I)), np.nan)
         for e in d['Emplacement']],
        index=d.index, dtype=object)

In [164]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)
#an=an.iloc[:,:-7]


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'Periode', 'Emplacement', 'Date_ech', 'ammoniaque - libre', 'sulfures (libre)', 'sulfites', 'sulfate', 'fer (Fe) total', 'fluorures']


In [165]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

Conflict values present. Please resolve this manually !


In [166]:
# Resolve the conflicts reported by the merge of the sample and analysis tables.
# data_validation is a project helper; valid_dict presumably names, per column, the conflicting
# rows for which that column's value is accepted — TODO confirm against utils.io.
dataset = mdf
data_validation(
    overall_data=dataset,
    conflict_data=conflict_df,
    index_col='index',
    pass_col='ID',
    valid_dict={'Periode_x': list(conflict_df.index)},
)

# When a 'level_0' column is present it takes over the 'index' name (any existing 'index' is dropped).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)
mdf = dataset

all conflicts have been fixed!


In [167]:
# Parse the sampling dates. pd.to_datetime replaces astype('datetime64'): casting to the
# unit-less 'datetime64' dtype is rejected by pandas 2.x.
mdf['Date_ech'] = pd.to_datetime(mdf['Date_ech'])

In [168]:
dataframe_viewer(mdf, rows=3)

Rows : 51, columns : 33


interactive(children=(IntSlider(value=3, description='rows', max=51, min=3, readout=False), IntSlider(value=12…

In [169]:
mdf['Type'] = 'Piezo'
mdf['Date_mes'] = mdf['Date_ech']
mdf['ID'] = mdf['ID_ech']

In [170]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

borehole: 51 ; measure: 51 ; lithology: 0 ; analysis: 51 ; equipement: 0 ; unknown: 0 ; 


In [171]:
# 'unknown' rows are removed from the borehole table, then de-duplicated on ID and renumbered
# (in place, so df_dict['unknown'] itself is modified).
ukw = df_dict['unknown']
bh = df_dict['borehole'].drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

# Boreholes: first row per ID, ID not NaN (and X not NaN when that column exists).
bh = bh.drop_duplicates(['ID'])
bh_filter = 'ID==ID and X==X' if 'X' in bh.columns else 'ID==ID'
bh = bh.query(bh_filter).reset_index(drop=True)

# The remaining tables are used exactly as data_slicer returned them.
mes, an, litho, eqp = (df_dict[key] for key in ('measure', 'analysis', 'lithology', 'equipement'))

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 10 ; measure: 51 ; lithology: 0 ; analysis: 51 ; equipement: 0 ; unknown: 0


In [172]:
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

Conflict values present. Please resolve this manually !


In [173]:
# Resolve the conflicts reported by the previous merge of the borehole table.
# data_validation is a project helper; valid_dict presumably names, per column, the conflicting
# rows for which that column's value is accepted — TODO confirm against utils.io.
dataset = source_bh
data_validation(
    overall_data=dataset,
    conflict_data=conflict_df,
    index_col='index',
    pass_col='ID',
    valid_dict={'Emplacement_x': list(conflict_df.index)},
)

# When a 'level_0' column is present it takes over the 'index' name (any existing 'index' is dropped).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)
source_bh = dataset

all conflicts have been fixed!


In [174]:
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [175]:
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

Conflict values present. Please resolve this manually !


In [176]:
# Resolve the conflicts reported by the previous merge of the measures table.
# data_validation is a project helper; valid_dict presumably names, per column, the conflicting
# rows for which that column's value is accepted — TODO confirm against utils.io.
dataset = source_mes
data_validation(
    overall_data=dataset,
    conflict_data=conflict_df,
    index_col='index',
    pass_col='ID',
    valid_dict={
        'Temp_ech_x': list(conflict_df.index),
        'Periode_x': list(conflict_df.index),
    },
)

# When a 'level_0' column is present it takes over the 'index' name (any existing 'index' is dropped).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)
source_mes = dataset

all conflicts have been fixed!


In [177]:
# Create the output folders for this sheet.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/' when tmp_dir already
# exists; the former `if not os.path.exists(tmp_dir)` guard skipped it in that case and the
# source_* writes below then failed.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)  # creates tmp_dir as well

# Per-sheet tables.
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Tables accumulated over all sheets of the current source file.
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 22 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 95 ; source_mes: 135


#### ======================================================================================

In [178]:
# Initialise the per-source accumulators before processing a new source file.
# Every name gets its own empty DataFrame: binding all six names to one shared object would let an
# in-place operation on one accumulator silently show up in the five others.
_df = pd.DataFrame()
source_bh, source_eqp, source_ukw = _df.copy(), _df.copy(), _df.copy()
source_litho, source_an, source_mes = _df.copy(), _df.copy(), _df.copy()

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 0 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 0 ; source_mes: 0


## 13-Resultats_Siterem_SOL.xlsx
* **Sheet : 'Résult SOL ext. pilote'**

In [179]:
tmp_dir= save_dir + 'Siterem_Result_Sol/'
sheet='Result_sol_ExtP'

In [180]:
# Load the 'Résult SOL ext. pilote' sheet, skipping the first five spreadsheet rows.
df = pd.read_excel(ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_SOL.xlsx', 
                   sheet_name='Résult SOL ext. pilote', skiprows=5)
# Project helpers; judging by their printed output they drop NaN lines / columns
# (the meaning of the numeric argument is not visible here — TODO confirm).
df=na_line_drop(df,0)
df=na_col_drop(df,1)
# Remove the '<' and '>' characters from string cells.
df.replace(r'<|>','', inplace=True, regex=True)
# Set string cells matching r'-$' to NaN.
# NOTE(review): the pattern is only anchored at the end, so any string ending in '-' is blanked,
# not just a lone '-' — confirm this is intended.
df.replace(r'-$',np.nan, inplace=True, regex=True)

dataframe_viewer(df, rows=5)

2 NaN lines dropped
Columns dropped :['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 49']

Rows : 103, columns : 48


  warn(msg)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=103, min=5, readout=False), IntSlider(value=1…

In [181]:
# Split the sheet by row label: rows up to and including 22 go to ech_df, rows 23 onwards to `an`.
# .copy() makes both parts independent of df: the next cell adds a row to `an`, which on a plain
# slice triggers SettingWithCopyWarning (visible in the original output).
# NOTE(review): 22/23 are hard-coded for this workbook's layout.
ech_df = df.loc[:22].copy()
an = df.loc[23:].copy()

In [182]:
# Copy row 0 of df into `an` under the label 0.5. `an` holds rows 23 onwards of df, so after
# sort_index the new row comes first; reset_index then renumbers the rows from 0.
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [183]:
ech_df=ech_df.transpose()
ech_df.reset_index(drop=True, inplace=True)
ech_df=col_ren(ech_df, 1)

[1;31mDouble columns' name found :[0;0m
[('température pour mes. pH', 2)]


In [184]:
ech_df=dble_col_drop(ech_df)

column(s) dropped: ['16:température pour mes. pH']


In [185]:
# Remove the first three rows (labels 0-2), renumber, then let the project helpers discard
# empty columns and lines before renumbering once more.
ech_df = ech_df.drop(index=[0, 1, 2]).reset_index(drop=True)
ech_df = na_line_drop(na_col_drop(ech_df, 1), 3)
ech_df = ech_df.reset_index(drop=True)

Columns dropped :['MO et COT', 'pH', 'GRANULOMETRIE']



In [186]:
# Drop the last row and the 'broyage' column.
# Done as one chained expression that returns a new frame: dropping in place on the `[:-1]` slice
# acted on a slice of the previous frame (SettingWithCopyWarning).
ech_df = ech_df.iloc[:-1].drop(columns=['broyage'])

In [187]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_ech','Long_for','Refus','Description','MO','COT','pH_KCl', 
      'Temp_pH','pH_H20','Fract_2','Fract_2+', 'Fract_min_2µ','Fract_min_50µ','Fract_min_2']
ech_df=col_ren(ech_df, name=name, mode=1)

In [188]:
set(ech_df.Description)

{'R', 'R ', 'TN', 'TN '}

In [189]:
# Spell out the description codes; the trailing-space variants seen in the data are covered,
# any other value is left unchanged.
description_labels = {
    'R': 'Remblais', 'R ': 'Remblais',
    'TN': 'Terrain naturel', 'TN ': 'Terrain naturel',
}
ech_df['Description'] = ech_df['Description'].map(lambda code: description_labels.get(code, code))

# 'Refus' becomes a flag: 'x' where a value is present, empty string otherwise.
ech_df['Refus'] = ech_df['Refus'].notna().map({True: 'x', False: ''})
# Constant 'Type_ech' column ('Sol') at position 1.
ech_df.insert(1, 'Type_ech', 'Sol')

In [190]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [191]:
an=col_ren(an, 1)

[1;31mDouble columns' name found :[0;0m
[('Teneur mesurée', 2)]


In [192]:
an.rename(columns={an.columns[0]:'ID_ech', 'col_33':'Phénanthrène'}, inplace=True)

In [193]:
an=dble_col_drop(an)

column(s) dropped: ['79:Teneur mesurée']


In [194]:
# Remove the first three rows (labels 0-2), renumber, let the project helper drop empty columns,
# then add a constant 'Type_ech' column ('Sol') at position 1.
an = an.drop(index=[0, 1, 2]).reset_index(drop=True)
an = na_col_drop(an, 1)
an.insert(1, 'Type_ech', 'Sol')

Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'BTEX totaux', 'PHENOLS', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOHALOGENES VOLATILS', 'HYDROCARBURES TOTAUX', 'Teneur mesurée', 'VS : Valeur seuil']



In [195]:
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'cyanure (libre)', 'cyanure (totaux)', 'cyanure (APE)', 'fraction aromat. C6-C7', 'fraction aromat. C7-C8', 'fraction aromat. C8-C10', 'fraction aliphat. C6-C8', 'fraction aliphat. C8-C10', 'Fraction C5 - C8', 'Fraction C8 - C10', 'Fraction C16 - C21']


In [196]:
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [197]:
# Parse the sampling dates and reuse them as measurement dates. pd.to_datetime replaces
# astype('datetime64'): casting to the unit-less 'datetime64' dtype is rejected by pandas 2.x.
mdf['Date_ech'] = pd.to_datetime(mdf['Date_ech'])
mdf['Date_mes'] = mdf['Date_ech']

In [198]:
mdf = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [199]:
# Constant 'Type' column for every row.
# NOTE(review): this sheet holds soil results (Type_ech is 'Sol'); 'Piezo' is the same value as in
# the water sheets — confirm it is the intended type here.
mdf['Type'] = 'Piezo'

In [200]:
df_dict = data_slicer(mdf, cols_dict, crit_dict)

borehole: 44 ; measure: 44 ; lithology: 0 ; analysis: 44 ; equipement: 0 ; unknown: 0 ; 


In [201]:
# 'unknown' rows are removed from the borehole table, then de-duplicated on ID and renumbered
# (in place, so df_dict['unknown'] itself is modified).
ukw = df_dict['unknown']
bh = df_dict['borehole'].drop(index=ukw.index)
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)

# Boreholes: first row per ID, ID not NaN (and X not NaN when that column exists).
bh = bh.drop_duplicates(['ID'])
bh_filter = 'ID==ID and X==X' if 'X' in bh.columns else 'ID==ID'
bh = bh.query(bh_filter).reset_index(drop=True)

# The remaining tables are used exactly as data_slicer returned them.
mes, an, litho, eqp = (df_dict[key] for key in ('measure', 'analysis', 'lithology', 'equipement'))

print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 14 ; measure: 44 ; lithology: 0 ; analysis: 44 ; equipement: 0 ; unknown: 0


In [202]:
source_bh = bh
source_mes = mes
source_an = an

In [203]:
# Create the output folders for this sheet.
# exist_ok=True keeps the cell re-runnable and also creates 'source_merge/' when tmp_dir already
# exists; the former `if not os.path.exists(tmp_dir)` guard skipped it in that case and the
# source_* writes below then failed.
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)  # creates tmp_dir as well

# Per-sheet tables.
#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

# Tables accumulated over all sheets of the current source file.
source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 14 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 44 ; source_mes: 44


* **Sheet : 'SOL T1 pilote'**

In [204]:
# Output directory and file-name prefix used by the CSV export cell for this sheet.
# save_dir is defined in an earlier cell outside this view.
tmp_dir= save_dir + 'Siterem_Result_Sol/'
sheet='SOL_T1_Pilote'

In [205]:
# Read the 'SOL T1 pilote' sheet, skipping the first 5 rows of the worksheet.
df = pd.read_excel(
    ROOT_DIR + '/CF_data/Data_SITEREM/Resultats_Siterem_SOL.xlsx',
    sheet_name='SOL T1 pilote',
    skiprows=5,
)
# Drop empty lines, then empty columns (helper arguments kept as in the other sheets).
df = na_col_drop(na_line_drop(df, 0), 1)
# Remove the '<' / '>' signs from the values, then turn every cell matching the regex '-$' into NaN.
# NOTE(review): '-$' also matches any text that merely ends with '-'; '^-$' may be what is meant — confirm.
df = df.replace(r'<|>', '', regex=True).replace(r'-$', np.nan, regex=True)

dataframe_viewer(df, rows=5)

3 NaN lines dropped
Columns dropped :['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36']

Rows : 135, columns : 29


  warn(msg)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=135, min=5, readout=False), IntSlider(value=1…

In [206]:
# Split the sheet by row label: labels up to and including 35 go to ech_df
# (presumably the sample descriptions), labels from 36 onwards go to an (presumably the analyses).
# .copy() makes both parts independent frames, so the later in-place edits (e.g. an.loc[0.5] = ...)
# no longer act on a slice of df and no longer raise SettingWithCopyWarning.
ech_df = df.loc[:35].copy()
an = df.loc[36:].copy()

In [207]:
# Copy row 0 of df into the analysis block under the new label 0.5. Because an was taken
# from df.loc[36:], 0.5 is expected to sort before every existing label (assuming those
# labels are all >= 36), so after sort_index the copied row becomes the first line.
an.loc[0.5] = df.loc[0] # put data on first line
# Put the rows in label order and renumber them from 0.
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [208]:
# Turn the description rows into columns, renumber the rows from 0 and let col_ren set the column names.
# NOTE(review): the second argument (1) is interpreted by utils.io.col_ren — not visible here.
ech_df = col_ren(ech_df.transpose().reset_index(drop=True), 1)

[1;31mDouble columns' name found :[0;0m
[('Autre zone suspecte investiguée', 2), ('pH (H20)', 2), ('température pour mes. pH', 2)]


In [209]:
# Handle the duplicated column names reported by the previous cell; the helper prints which columns it dropped.
# NOTE(review): the rule used to pick the column to drop lives in utils.io.dble_col_drop — confirm.
ech_df=dble_col_drop(ech_df)

column(s) dropped: ['9:Autre zone suspecte investiguée', '27:température pour mes. pH', '30:pH (H20)']


In [210]:
# Discard the rows labelled 0, 1 and 2 and renumber the remaining rows from 0.
ech_df = ech_df.drop(index=list(range(3))).reset_index(drop=True)
# Drop empty columns first, then empty lines (same helper arguments as before), and renumber again.
ech_df = na_line_drop(na_col_drop(ech_df, 1), 3).reset_index(drop=True)

Columns dropped :['Matières organiques', 'SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Organoleptique couleur suspecte', 'Organoleptique odeur intensité (***)', 'Organoleptique odeur type', 'MO et COT', 'matières organiques', 'COT', 'pH', 'pH (KCl)', 'température pour mes. pH', 'pH (H20)', 'GRANULOMETRIE', 'Fraction argileuse', 'parties min. 2µm', 'parties min. 50µm', 'parties min. 2mm']

9 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [211]:
# Remove the last row and the 'broyage' column in one non-in-place chain: the result is a new
# frame, so nothing is mutated in place on a slice (which is what triggers SettingWithCopyWarning).
ech_df = ech_df[:-1].drop(columns=['broyage'])

In [212]:
# New column names for the sample-description table, passed to col_ren as the name model.
# NOTE(review): how mode=1 maps these names onto the existing columns is defined in utils.io.col_ren — confirm.
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_ech','Long_for','Refus','Description','Fract_2','Fract_2+']
ech_df=col_ren(ech_df, name=name, mode=1)

In [213]:
# Expand the abbreviated descriptions: 'R' / 'R ' become 'Remblais', 'TN' / 'TN ' become 'Terrain naturel'.
description_labels = {'Remblais': ['R', 'R '], 'Terrain naturel': ['TN', 'TN ']}
for label, codes in description_labels.items():
    ech_df.loc[ech_df['Description'].isin(codes), 'Description'] = label

# Flag 'Refus' with 'x' wherever a value is present, and with an empty string otherwise.
ech_df['Refus'] = ech_df['Refus'].notnull().map({True: 'x', False: ''})
# Every sample of this sheet gets the sample type 'Sol', stored as the second column.
ech_df.insert(1, 'Type_ech', 'Sol')

In [214]:
# Turn the analysis rows into columns and renumber the resulting rows from 0.
an = an.T.reset_index(drop=True)

In [215]:
# Let col_ren set the column names of the transposed analysis table; the helper reports duplicated names.
# NOTE(review): the second argument (1) is interpreted by utils.io.col_ren — not visible here.
an=col_ren(an, 1)

[1;31mDouble columns' name found :[0;0m
[('Teneur mesurée', 2)]


In [216]:
# Keep every column except the last 17, then handle duplicated column names.
# NOTE(review): 17 is a hard-coded count tied to this sheet's layout — confirm it still matches the workbook.
an = dble_col_drop(an.iloc[:, :-17])

column(s) dropped: []


In [217]:
# Discard the rows labelled 0, 1 and 2, renumber from 0, then drop the empty columns.
an = na_col_drop(an.drop(index=list(range(3))).reset_index(drop=True), 3)
# Every analysis of this sheet gets the sample type 'Sol', stored as the second column.
an.insert(1, 'Type_ech', 'Sol')

Columns dropped :['METAUX LOURDS', 'Chrome VI', 'Cobalt', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'BTEX totaux', 'PHENOLS', 'Phénol', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOHALOGENES VOLATILS', 'EOX (****)', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER']



In [218]:
# Name the first column 'ID_ech' and give the placeholder column 'col_35' its pollutant name.
an = an.rename(columns={an.columns[0]: 'ID_ech', 'col_35': 'Phénanthrène'})

In [219]:
# Rename the columns of the analysis table through col_ren, passing POL_NAMES_MODEL as the name model.
# NOTE(review): the meaning of mode=1 is defined in utils.io.col_ren and is not visible here — confirm.
an=col_ren(an, name=POL_NAMES_MODEL, mode=1)


[1;34mPossible new pollutants names:[0;0m
['ID_ech', 'Type_ech', 'cyanure (libre)', 'cyanure (totaux)', 'cyanure (APE)', 'fraction aromat. C6-C7', 'fraction aromat. C7-C8', 'fraction aromat. C8-C10', 'fraction aliphat. C6-C8', 'fraction aliphat. C8-C10', 'Fraction C5 - C8', 'Fraction C8 - C10', 'Fraction C16 - C21']


In [220]:
# Combine the sample-description table (ech_df) with the analysis table (an), called with how='outer' on 'ID_ech'.
# data_merger returns two objects: the merged frame (mdf) and a second frame bound to conflict_df
# (presumably the rows whose values disagree between both inputs — confirm in utils.io.data_merger).
mdf, conflict_df = data_merger(ech_df, an, how='outer', on=['ID_ech'], dist_max=1., drop_skip_col=['index'])

In [221]:
# Convert the sampling date to a datetime column. The unit is spelled out ('datetime64[ns]')
# because casting to the unit-less 'datetime64' dtype is rejected by pandas >= 2.0.
mdf['Date_ech'] = mdf['Date_ech'].astype('datetime64[ns]')
# The measurement date is taken to be the sampling date.
mdf['Date_mes'] = mdf['Date_ech']

In [222]:
# Run gen_id_from_ech on the merged table; sufx, prefx and id_reg come from earlier cells outside this view.
# NOTE(review): presumably this derives the 'ID' column from 'ID_ech' (the following cells read mdf['ID']) — confirm.
mdf = gen_id_from_ech(mdf, suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [223]:
# Default type for every row; rows whose 'ID' matches the pattern tested in the next cell are re-labelled 'Forage'.
mdf['Type'] = 'Piezo'

In [224]:
# Re-label as 'Forage' every row whose 'ID' contains word characters, whitespace, then digits.
# The pattern is a raw string: the former plain literal relied on invalid escape sequences
# ('\w', '\s', '\d'), which Python reports as a DeprecationWarning / SyntaxWarning.
forage_id_pattern = re.compile(r'\w+\s+\d+')
# Only string IDs are tested, so missing values and non-text IDs are skipped instead of
# making re.search raise a TypeError.
is_forage = mdf['ID'].map(
    lambda bh_id: isinstance(bh_id, str) and forage_id_pattern.search(bh_id) is not None
).astype(bool)
mdf.loc[is_forage, 'Type'] = 'Forage'

In [225]:
# Split the merged table into per-category frames; the printed summary below shows the keys
# 'borehole', 'measure', 'lithology', 'analysis', 'equipement' and 'unknown'.
# cols_dict and crit_dict are defined in earlier cells outside this view.
df_dict = data_slicer(mdf, cols_dict, crit_dict)

borehole: 17 ; measure: 17 ; lithology: 0 ; analysis: 17 ; equipement: 0 ; unknown: 0 ; 


In [226]:
# Pull the per-category tables out of df_dict and tidy the borehole / unknown tables.
ukw = df_dict['unknown']
bh = df_dict['borehole']

# Remove from the borehole table every row whose index label also appears in the unknown table.
bh = bh.drop(index=ukw.index)
# Keep a single row per 'ID'. This is done in place, so the frame stored in df_dict['unknown'] is modified too.
ukw.drop_duplicates(['ID'], inplace=True)
ukw.reset_index(drop=True, inplace=True)
bh.drop_duplicates(['ID'], inplace=True)
# 'col==col' is False only for missing values, so these queries keep the rows
# where 'ID' (and 'X', when that column exists) is not missing.
if 'X' in bh.columns: 
    bh = bh.query('ID==ID and X==X')
else:
    bh = bh.query('ID==ID')
bh.reset_index(drop=True, inplace=True)

# The remaining categories are used as returned by data_slicer.
mes = df_dict['measure']
an = df_dict['analysis']
litho = df_dict['lithology']
eqp = df_dict['equipement']

# Row counts per table after the clean-up above.
print(f'borehole: {len(bh)} ; measure: {len(mes)} ; lithology: {len(litho)} ; analysis: {len(an)} ; ' 
      f'equipement: {len(eqp)} ; unknown: {len(ukw)}')

borehole: 10 ; measure: 17 ; lithology: 0 ; analysis: 17 ; equipement: 0 ; unknown: 0


In [227]:
# Fold this sheet's boreholes into the running source-level borehole table (how='outer', keyed on 'ID').
# NOTE(review): conflict_df is reassigned here and not inspected in this cell — confirm conflicts can be ignored.
source_bh, conflict_df = data_merger(source_bh, bh, how='outer', on=['ID'], dist_max=1., drop_skip_col=['index'])

In [228]:
# Fold this sheet's analyses into the running source-level analysis table (how='outer', keyed on 'ID_ech' and 'Date_ech').
# NOTE(review): conflict_df is reassigned here and not inspected in this cell — confirm conflicts can be ignored.
source_an, conflict_df = data_merger(source_an, an, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1., drop_skip_col=['index'])

In [229]:
# Fold this sheet's measures into the running source-level measure table (how='outer', keyed on 'ID_ech' and 'Date_mes').
# NOTE(review): conflict_df is reassigned here and not inspected in this cell — confirm conflicts can be ignored.
source_mes, conflict_df = data_merger(source_mes, mes, how='outer', on=['ID_ech', 'Date_mes'], dist_max=1., drop_skip_col=['index'])

In [230]:
# Write this sheet's tables and the running source-level tables to CSV.
# Creating the deepest directory with exist_ok=True also creates tmp_dir, and it still works
# when tmp_dir already exists but 'source_merge/' does not (the former existence check on
# tmp_dir skipped both makedirs calls in that situation, making the writes below fail).
os.makedirs(tmp_dir+'source_merge/', exist_ok=True)

#eqp.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
mes.to_csv(tmp_dir+sheet+'_Measures.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Samples.csv', index=False)
#ukw.to_csv(tmp_dir+sheet+'_Unknown.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithologies.csv', index=False)

source_bh.to_csv(tmp_dir+'source_merge/source_Boreholes.csv', index=False) #all Boreholes data in the source
source_mes.to_csv(tmp_dir+'source_merge/source_Measures.csv', index=False) #all Measures data in the source
#source_eqp.to_csv(tmp_dir+'source_merge/source_Equipments.csv', index=False)
source_an.to_csv(tmp_dir+'source_merge/source_Samples.csv', index=False)
#source_ukw.to_csv(tmp_dir+'source_merge/source_Unknown.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_merge/source_Lithologies.csv', index=False)

# NOTE(review): source_eqp, source_ukw and source_litho are not assigned for this sheet in the
# visible cells; they appear to be left over from an earlier section — confirm they are current.
print(f'source_bh: {len(source_bh)} ; source_eqp: {len(source_eqp)} ; source_uknw: {len(source_ukw)} ; '
      f'source_litho: {len(source_litho)} ; source_an: {len(source_an)} ; source_mes: {len(source_mes)}')

source_bh: 23 ; source_eqp: 0 ; source_uknw: 0 ; source_litho: 0 ; source_an: 60 ; source_mes: 59
