# DATA ORGANIZATION

In [1]:
%matplotlib widget

In [2]:
from utils.io import gen_id_dated, gdf_viewer, gdf_geom, gdf_merger
import re, os
import numpy as np
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import datetime as dtm
import matplotlib.pyplot as plt

Data format (excel files)

In [3]:
def na_line_drop(data, col_n=3):
    l1=len(data)
    data['line_na']=False

    for i in range(len(data)):
        verif=True
        for j in data.columns.to_list()[col_n:-1]:
            if not pd.isnull(data.loc[i,j]): verif=False

        data.loc[i,'line_na']=verif

    data=data.query('line_na==False')
    data.reset_index(drop=True, inplace=True)
    data.drop('line_na', axis=1, inplace=True)
    l2=len(data)
    print(f'{l1-l2} NaN lines dropped')
    
    return data

In [4]:
# drop columns if not enough data
def na_col_drop(data, crit=10, drop=True, verbose=False):
    """
    delete NaN columns in the dataframe based on a minimum number of non-NaN values 
    """

    drop_cols=[]
    if verbose: print('Non-NaN values\n----------------')
    for c in data.columns:
        v=len(data.iloc[:,0])-data[c].isnull().sum()
        if verbose: print(f'{c} --> val: {v} | Nan: {data[c].isnull().sum()}')
        if v<crit:
            drop_cols.append(c)

    if drop:
        print(f'\nColumns dropped :{drop_cols}')
        data.drop(drop_cols, axis=1, inplace=True)

    return data

In [5]:
def dble_col_drop(data, drop=True):
    twins={}
    idx_drop={}
    for i in range(len(data.columns)): # locate double columns
        c=data.columns[i]
        if c not in twins.keys(): 
            twins.update({c:i})
        #elif data.iloc[:,i].isnull().sum()<data.iloc[:,twins[c]].isnull().sum(): 
        #    idx_drop.update({twins[c]:i})
        #    twins.update({c:i})
        else:
            idx_drop.update({i:c})
    
    for i in range(len(data.columns)): # attempt to collect data if exist in double columns
        for k,v in idx_drop.items():
            if data.columns[i]==v:
                for j in range(len(data)):
                    if pd.isnull(data.iloc[j, i]):
                        data.iloc[j, i] = data.iloc[j, k]
                    elif not isinstance(data.iloc[j, i],str) and not isinstance(data.iloc[j, k], str):
                        data.iloc[j, i] = max(data.iloc[j, i],data.iloc[j, k])
    
    print(f"column(s) dropped: {[f'{x}:{y}' for x,y in idx_drop.items()]}")
    new_col=list(set(range(len(data.columns)))-set(idx_drop.keys()))
    if drop : data=data.iloc[:, new_col] 
    
    return data

In [6]:
def col_ren(data, line_to_col=1, mode=0, name=[]):
    """
    mode: int
        set 0 to rename columns with a line, set 1 if provide name list or dict
    """
    new_name={}
    
    if mode!=0 and mode!=1:
        print("Error! Parameter \'Mode\' must be 0 or 1 (if 1, colums length must be equal to name length)")
        
    elif mode==0:
        for i in data.columns:
            col = str(data.iloc[line_to_col, i])
            if re.search('nan',col, flags=re.IGNORECASE):
                new_name.update({i:f'col_{i}'})
            else:
                new_name.update({i:col})
        
        data.drop([line_to_col], axis=0, inplace=True)
        data.reset_index(drop=True, inplace=True)

    elif mode==1 :
        if isinstance(name, list) and len(name)==len(data.columns):
            for i in range(len(name)):
                new_name.update({data.columns[i]:name[i]})
                
        elif isinstance(name, dict):
            strp=',| |>|<|-|\n|_|\(|\.|\)'
            
            for i in range(len(data.columns)):
                keys=list(pol_field_model.keys())
                old=data.columns[i]
                
                for k in keys:
                    if re.match(f"{re.sub(strp,'',k)}", re.sub(strp,'',old), flags=re.I):
                        new_name.update({old:pol_field_model[k]})

        elif isinstance(name, list) and len(name)!=len(data.columns):
            print('Error! names list length and columns length are not the same.')
    
    data.rename(columns=new_name, inplace=True)
    
    return data 

In [7]:
pol_field_model={'Arsenic': 'As', 'Cadmium': 'Cd', 'Chrome': 'Cr', 'Chrome VI': 'Cr_VI', 'Cuivre': 'Cu', 
'Mercure': 'Hg', 'Plomb': 'Pb', 'Nickel': 'Ni', 'Zinc': 'Zn', 'Cyanure(?:s)? (?libre(?:s)?)?': 'CN_libre', 
'Cyanures (totaux)': 'CN_tot','Cyanure (totaux)': 'CN_tot','CN_totaux':'CN_tot','Cyanures (APE)': 'CN_APE',
'cyanure (totaux)':'CN_tot', 'cyanure (APE)':'CN_APE', 'cyanure complex': 'CN_comp','Cyanure (APE)': 'CN_APE', 
'thiocyanate': 'thioCN',
'Benzène': 'Bnz', 'Toluène': 'Toln', 'Éthylbenzène': 'EthylBnz', 'Orthoxylène': 'O-Xyl', 
'Para- et métaxylène': 'P-M-Xyl', 'Xylènes': 'Xyl', 'Styrène': 'Styr', 'BTEX totaux': 'BTEX_tot', 
'Phénol': 'Phenol', 'Indice phénol': 'Idc_Phenol', 'Naphtalène': 'Naphta', 'Acénaphtylène': 'Acenaphtyl', 
'Acénaphtène': 'Acenaphtn', 'Fluorène': 'Fluorene', 'Phénanthrène': 'Phenanthr', 'Anthracène': 'Anthrc', 
'Fluoranthène': 'Flranth', 'Pyrène': 'Pyr', 'Benzo(a)anthracène': 'Bnz(a)anthrc', 'Chrysène': 'Chrys', 
'Benzo(b)fluoranthène': 'Bnz(b)flranth', 'Benzo(k)fluoranthène': 'Bnz(k)flranth', 
'Benzo(a)pyrène': 'Bnz(a)pyr','Dibenzo(ah)anthracène': 'Dibnz(ah)anthrc',
'Benzo(ghi)pérylène': 'Bnz(ghi)peryl', 
'Indéno(1,2,3-cd)pyrène': 'Indeno(1,2,3-cd)pyr', 'HAP Totaux (16) - EPA': 'HAP_tot_EPA', 
'1,1-Dichloroéthane': '1,1-DCE', '1,2-Dichloroéthane': '1,2-DCE', '1,1-dichloroéthène': '1,1-DCEn', 
'Cis-1,2-dichloroéthène': 'Cis-1,2-DCEn', 'Trans 1,2-dichloroéthylène': 'Trans 1,2-DCEyl', 
'Dichlorométhane': 'DCM', 'Totaux (cis,trans) 1,2-dichloroéthène(?:s)?': '(cis,trans) 1,2-DCE_tot', 
'1,2-dichloropropane': '1,2-DCP', 'Tétrachloroéthylène': 'TetraCEyn', 'Tétrachlorométhane': 'TCM', 
'1,1,1-Trichloroéthane': '1,1,1-TCE', '1,1,2-Trichloroéthane': '1,1,2-TCE', 'Trichloroéthylène': 'TCEyn', 
'Chloroforme': 'Chloroforme', 'Chlorure de vinyle': 'CVinyl', 'EOX': 'EOX', 
'fraction aromat. >C6-C7': 'Arom_C6C7', 'fraction aromat. >C7-C8': 'Arom_C7C8', 
'fraction aromat. >C8-C10': 'Arom_C8C10', 'fraction aliphat. C5-C6': 'Aliphat_C5C6', 
'fraction aliphat. >C6-C8': 'Aliphat_C6C8', 'fraction aliphat. >C8-C10': 'Aliphat_C8C10', 
'Fraction C5 - C8': 'Fract_C5C8', 'Fraction C8-C10': 'Fract_C8C10', 'Fraction C10-C12': 'Fract_C10C12', 
'Fraction C12-C16': 'Fract_C12C16', 'Fraction C16 - C21': 'Fract_C16C21', 'Fraction C21 - C35': 'Fract_C21C35', 
'Fraction C35 - C40': 'Fract_C35C40', 'Hydrocarbures totaux C10-C35': 'HC_tot_C10C35','C5-C8':'Fract_C5C8', 
'C8-C10':'Fract_C8C10','C10-C12':'Fract_C10C12','C12-C16':'Fract_C12C16','C16-C21':'Fract_C16C21', 
'C21-C35':'Fract_C21C35','C35-C40':'Fract_C35C40', 'totaux C10-C35':'HC_tot_C10C35','C12-C22':'Fract_C12C22', 
'C22-C30':'Fract_C22C30','C30-C40':'Fract_C30C40', 'Totaux C10-C40':'HC_tot_C10C40',
'Hydrocarbures totaux C10-C40':'HC_tot_C10C40', 'MTBE': 'MTBE', 'PCB 28': 'PCB_28', 'PCB 52': 'PCB_52', 
'PCB 101': 'PCB_101', 'PCB 118': 'PCB_118', 'PCB 138': 'PCB_138', 'PCB 153': 'PCB_153', 'PCB 180': 'PCB_180', 
'PCB totaux (7)?': 'PCB_tot', 'Chlorure(?:s)?': 'Chlorure', 'Soufre Total': 'S_tot', 'sulfite(?:s)?': 'sulfite', 
'sulfate(?:s)?': 'sulfate', 'COT':'COT','DBO (5 jours)':'DBO_5j','DCO':'DCO', 
'Ammonium':'NH4','ammoniaque libre':'NH3_libre','Nitrate':'HNO3', 'Nitrite':'HNO2','azote Kjeldahl':'N_Kjdl','sulfures totaux':'Sulfure_tot', 
'sulfure(?:s)? (libre(?:s)?)':'Sulfure_libre','calcium':'Ca','potassium':'K', 'magnésium':'Mg', 'manganèse':'Mn', 
'sodium':"Na", 'fer':'Fe','phosphore (total)':'P_tot','carbonate':'CaCO3', 'bicarbonate':'Bicarb','Phoshore':'P',
'fer ((Fe))? total':'Fe_tot', 'fer (2\+)':'Fe2','fluorure(?:s)?':'Fluorure','bromure (libre)':'B_libre'}


source_dfs initialization

In [8]:
_df = pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau = _df, _df, _df, _df
source_prv_sol, source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df, _df

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:0 ;
source_prv_sol:0 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


## 1- Profils sols et données forages.xls
* **Sheet : 'Données de forage'**

In [9]:
tmp_dir='../../CF_data/synthese/Result_traitem/profils_sols_donnees_forages/'
sheet='donnees_forage'

In [10]:
df = pd.read_excel('../../CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', 
                   sheet_name='Données de forage')#, skiprows=2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df)

Rows : 25, columns : 15


interactive(children=(IntSlider(value=10, description='rows', max=25, min=10, readout=False), IntSlider(value=…

In [11]:
df.rename(columns={'Date':'Date_ouv','Profondeur':'Long_for', 'Méthode':'Method', 
                        'Diamètre forage':'Diam_for','Niv. Eau p/r sol':'Niv_eau_sol',
                        'PZ Prof.':'Long_pz', 'PZ Diamètre':'Diam_pz','PZ L.crépinée':'Long_crep', 
                        'Société forage':'Societe'}, inplace=True)

df=df[['ID', 'X', 'Y', 'Z', 'Date_ouv', 'Long_for', 'Diam_for', 'Long_pz', 'Diam_pz', 'Long_crep',
                 'Remarque','Niv_eau_sol','Method', 'Societe']]

In [12]:
df['Type'] = df['Long_pz'].apply(lambda x: 'Forage' if pd.isnull(x) else 'Piezo')
df['Refus'] = ''
df['Type_refus']=''

for i in range(len(df['Remarque'])):
    val = str(df.loc[i,'Remarque'])
    if re.search('[Bb]loqué', val) :
        df.loc[i,'Type_refus'] = 'x'
        
        if re.search('[lL]aitier', val):
            df.loc[i,'Type_refus'] = 'Laitier'
        elif re.search('[Bb]éton', val):
            df.loc[i,'Type_refus'] = 'Béton'
        elif re.search('[Mm]atériaux', val):
            df.loc[i,'Type_refus'] = 'Matériaux indurés' 
    else: 
        df.loc[i,'Type_refus'] = '' 

df['Diam_int_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace(' mm','').split('x')[1].strip(' m'))/1000 
                                                  if not pd.isnull(x) else x)
df['Diam_ext_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace(' mm','').split('x')[0].strip(' m'))/1000 
                                                  if not pd.isnull(x) else x)
df['Diam_for'] = df['Diam_for'].apply(lambda x: pd.to_numeric(x)/1000 if not pd.isnull(x) else x)

df.insert(7, 'Diam_ext_pz', df.pop('Diam_ext_pz')) # move to a specified position
df.insert(8, 'Diam_int_pz', df.pop('Diam_int_pz'))
df.drop(columns=['Remarque', 'Diam_pz'], axis=1, inplace=True)
df.drop(df.query("ID!=ID").index, inplace=True) # delete all ID='NaN' lines

gen_id_dated(df,'ID','Date_ouv')  

Generation of ID-dated...
Using column ' Date_ouv ' in the (geo)dataframe !
Process ended, check you (geo)dataframe


In [13]:
pz = df.query("Type=='Piezo'")
bh = df.query("Type!='Piezo'")

pz.reset_index(inplace=True, drop=True)
bh.reset_index(inplace=True, drop=True)

In [14]:
bh.drop(columns=['Diam_ext_pz', 'Diam_int_pz', 'Long_pz', 'Long_crep', 'Niv_eau_sol',], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [15]:
gdf_viewer(bh, rows=3), gdf_viewer(pz, rows=3)

Rows : 13, columns : 13


interactive(children=(IntSlider(value=3, description='rows', max=13, min=3, readout=False), IntSlider(value=12…

Rows : 12, columns : 18


interactive(children=(IntSlider(value=3, description='rows', max=12, min=3, readout=False), IntSlider(value=12…

(None, None)

In [16]:
source_pz = pz
source_bh = bh

In [17]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)
    
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
bh.to_csv(tmp_dir+sheet+'_Boreholes.csv', index=False)
source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False) #all Boreholes data in the source
source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False) #all Piezometers data in the source
print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)}')

source_bh:13 ; source_pz:12


* **Sheet : 'Piézométrie'**

In [18]:
tmp_dir='../../CF_data/synthese/Result_traitem/profils_sols_donnees_forages/'
sheet='piezometrie'

In [19]:
df = pd.read_excel('../../CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Piézométrie', skiprows=1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df)

Rows : 37, columns : 21


interactive(children=(IntSlider(value=10, description='rows', max=37, min=10, readout=False), IntSlider(value=…

In [20]:
df=na_col_drop(df, 3)
sdf=df[:11].copy()
sdf.reset_index(inplace=True, drop=True)
sdf.rename(columns={'z':'Z',}, inplace=True)
sdf=sdf[['ID', 'Z']]
sdf['Type']='Piezo'


Columns dropped :['Label', 'Commentaires ', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7']


In [21]:
a=0
for x in df.columns:
    if pd.isnull(df.loc[16,x]):
        df.loc[16,x]='col'+str(a)
    a+=1

In [22]:
df.loc[16]=df.loc[16].apply(lambda x : x if not pd.isnull(x) else '')
df.columns = df.loc[16]
df=df[17:]
#df.rename_axis(None, inplace=True)
df.reset_index(inplace=True, drop=True)

#df.drop(columns=[df.columns.to_list()[x] for x in range(0,8)
#                      if re.compile(r"col|unnamed").match(df.columns.to_list()[x])], axis=1, inplace=True) 

In [23]:
df.rename(columns={'col3':'Date_prv', 'col4':'Terrain', 'col5':'ID', 'NP/piézo [m]':'Niv_eau_pz', 
                        'dim. piezo hors sol [m]':'haut_pz-sol', 'NP/sol [m]':'Niv_eau_sol', 
                        'Prof. piézo/piézo [m]':'Long_pz', 'Prof. piézo/sol [m]':'Long_pz-sol', 
                        'CE [mS/cm]':'CE','t° [°C]':'Temp', 'Observations':'Organo'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [24]:
df.columns

Index(['col0', 'col1', 'col2', 'Date_prv', 'Terrain', 'ID', 'Niv_eau_pz',
       'haut_pz-sol', 'Niv_eau_sol', 'Long_pz', 'Long_pz-sol', 'pH', 'CE',
       'CE [µS/cm]', 'Temp', 'Organo'],
      dtype='object', name=16)

In [25]:
df.insert(0, 'ID', df.pop('ID')) # move to first column
df.replace('-', np.nan, inplace=True)
df['CE']=df['CE [µS/cm]'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)
df.drop('CE [µS/cm]', axis=1, inplace=True)
df.loc[18, 'Niv_eau_pz']=np.nan
#df['Date_prv']=df['Date_prv'].apply(lambda x : str(x.year) if not pd.isnull(x) else '')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CE']=df['CE [µS/cm]'].apply(lambda x: pd.to_numeric(x)/1000
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/use

In [26]:
df.rename_axis(None, inplace=True, axis=1)
df=na_col_drop(df,2)
df.drop(df.query("ID!=ID").index, inplace=True) # supprimer les lignes avec ID='NaN'
df.reset_index(inplace=True, drop=True)


Columns dropped :['col0', 'col1', 'col2']


In [27]:
gdf_viewer(df, rows=3)

Rows : 17, columns : 12


interactive(children=(IntSlider(value=3, description='rows', max=17, min=3, readout=False), IntSlider(value=12…

In [28]:
mes_pz=df # piezometry and phys-chem measures
pz=sdf # piezometers

In [29]:
source_mes_pz = mes_pz
source_pz, error_df = gdf_merger(source_pz, pz, how='outer', col='ID')

In [30]:
gdf_viewer(mes_pz, rows=3)#, gdf_viewer(error_df)

Rows : 17, columns : 12


interactive(children=(IntSlider(value=3, description='rows', max=17, min=3, readout=False), IntSlider(value=12…

In [31]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)
    
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False) #all Boreholes data in the source
source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False) #all Piezometers data in the source
print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_mes_pz:{len(source_mes_pz)}')

source_bh:13 ; source_pz:12 ; source_mes_pz:17


* **Sheet : 'Equipement'**

In [32]:
tmp_dir='../../CF_data/synthese/Result_traitem/profils_sols_donnees_forages/'
sheet='Equipement'

In [33]:
df = pd.read_excel('../../CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Equipement')#, skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df)

0 NaN lines dropped

Columns dropped :[]
Rows : 36, columns : 7


interactive(children=(IntSlider(value=10, description='rows', max=36, min=10, readout=False), IntSlider(value=…

In [34]:
df.drop(columns=['Déplacement'], inplace=True)
name=['ID', 'Equip_top', 'Equip_base', 'Diam_for','Dim_ext_pz', 'Legende']
df=col_ren(df, mode=1, name=name)

In [35]:
gdf_viewer(df, un_val='ID', rows=3)

Rows : 36, columns : 6, Unique on 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=36, min=3, readout=False), IntSlider(value=6,…

In [36]:
equip=df
source_equip=equip

In [37]:
gdf_viewer(source_equip, rows=3)

Rows : 36, columns : 6


interactive(children=(IntSlider(value=3, description='rows', max=36, min=3, readout=False), IntSlider(value=6,…

In [38]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)
    
equip.to_csv(tmp_dir+sheet+'_Equipments.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
source_equip.to_csv(tmp_dir+'source_Equipments.csv', index=False) #all Piezometers data in the source
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False) #all Boreholes data in the source
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False) #all Piezometers data in the source
print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_mes_pz:{len(source_mes_pz)}')

source_bh:13 ; source_pz:12 ; source_mes_pz:17


* **Sheets: 'Echantillon' + 'Organoleptique'**

In [39]:
tmp_dir='../../CF_data/synthese/Result_traitem/profils_sols_donnees_forages/'
sheet='Echant-organo'

In [40]:
df = pd.read_excel('../../CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Echantillon')#, skiprows=1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df)

Rows : 29, columns : 4


interactive(children=(IntSlider(value=10, description='rows', max=29, min=10, readout=False), IntSlider(value=…

In [41]:
df.rename(columns={'De':'Ech_top', 'A':'Ech_base', 'Numéro':'ID_ech'}, inplace=True)

In [42]:
sdf = pd.read_excel('../../CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Organoleptique')#, skiprows=1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)
gdf_viewer(sdf)

Rows : 7, columns : 5


interactive(children=(IntSlider(value=7, description='rows', max=7, min=7, readout=False), IntSlider(value=5, …

In [43]:
list(sdf.columns)

['ID', 'De', 'A', 'Polluant', 'Intensité']

In [44]:
sdf.rename(columns={'De':'Pol_top', 'A':'Pol_base'}, inplace=True)

In [45]:
mdf, error_df = gdf_merger(df, sdf, 'outer', 'ID')
mdf['Type_ech']='Sol'
mdf.insert(4, 'Type_ech', mdf.pop('Type_ech'))

In [46]:
gdf_viewer(mdf, rows=3)

Rows : 32, columns : 9


interactive(children=(IntSlider(value=3, description='rows', max=32, min=3, readout=False), IntSlider(value=9,…

In [47]:
prv_sol = mdf
source_prv_sol=prv_sol

In [48]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)
    
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'Source_Samples-soil.csv', index=False) #all Samples and organoleptic data in the source
print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_mes_pz:{len(source_mes_pz)} ; '
     f'source_prv_sol:{len(source_prv_sol)} ;')

source_bh:13 ; source_pz:12 ; source_mes_pz:17 ; source_prv_sol:32 ;


* **Sheet : 'Log'**

In [49]:
tmp_dir='../../CF_data/synthese/Result_traitem/profils_sols_donnees_forages/'
sheet='Log'

In [50]:
df = pd.read_excel('../../CF_data/Data_UMONS/ouvrages/Profils sols et données forages.xls', sheet_name='Log')#, skiprows=1)
gdf_viewer(df)

Rows : 55, columns : 5


interactive(children=(IntSlider(value=10, description='rows', max=55, min=10, readout=False), IntSlider(value=…

In [51]:
df.rename(columns={'De':'Litho_top', 'A':'Litho_base'}, inplace=True)

In [52]:
q=df.query('Keyword.str.contains(".ointe", regex=True)', engine='python').index
df.drop(q, inplace=True)
df.reset_index(drop=True, inplace=True)

In [53]:
gdf_viewer(df, rows=3)

Rows : 54, columns : 5


interactive(children=(IntSlider(value=3, description='rows', max=54, min=3, readout=False), IntSlider(value=5,…

In [54]:
litho=df
source_litho=litho

In [55]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)
    
litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False) #all lithologies or descriptions data in the source

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_mes_pz:{len(source_mes_pz)} ; '
     f'source_prv_sol:{len(source_prv_sol)} ; source_litho:{len(source_litho)} ;')

source_bh:13 ; source_pz:12 ; source_mes_pz:17 ; source_prv_sol:32 ; source_litho:54 ;


## 2-Database MEMORIS3.xlsx
* **Sheet : 'PROFILS_SOL'**

In [56]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_bh, source_pz, source_litho, source_prv_sol, source_mes_pz= _df, _df, _df, _df, _df

In [57]:
tmp_dir='../../CF_data/synthese/Result_traitem/database_Memoris3/'
sheet='Profils_sol'

In [58]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. '+
                        'Siterem - 2017/Database MEMORIS3.xlsx', sheet_name='PROFILS_SOL')#, skiprows=2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df)

Rows : 2041, columns : 16


interactive(children=(IntSlider(value=10, description='rows', max=2041, min=10, readout=False), IntSlider(valu…

In [59]:
df = df[['Date', 'N°', 'Id', 'Profondeur', 'Description', 'Piézo', 'Unnamed: 6',
                             'Gouge Ø75', 'MFT Ø145', 'carottier', 'tarrière', 'Liner Ø60']] 

In [60]:
df.rename({'Date':'Date_ouv', 'N°':'Ref', 'Id':'idx', 'Piézo':'Type', 'Unnamed: 6':'Societe',
                'MFT Ø145':'MFT_145', 'Gouge Ø75':'Gouge_75', 'Liner Ø60': 'Liner_60'}, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [61]:
print(list(set(df['Date_ouv'].apply(lambda x: x.year if not pd.isnull(x) else x))))

[NaT, 2009, 2010, 2015]


In [62]:
df.loc[df.fillna('').query("Societe.str.contains('x|X')").index, 'Type']='X'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [63]:
df.loc[df.fillna('').query("Gouge_75.str.contains('SBS|SITER')").index, 'Societe']='SBS Environnement'
df.loc[df.fillna('').query("Gouge_75.str.contains('SBS|SITER')").index, 'Gouge_75']=''

In [64]:
for i in range(len(df['Date_ouv'])-1):
    if not pd.isnull(df.loc[i, 'Date_ouv']) and pd.isnull(df.loc[i+1, 'Date_ouv']):
        df.loc[i+1, 'Date_ouv']=df.loc[i, 'Date_ouv']
        
    if not pd.isnull(df.loc[i, 'Societe']) and pd.isnull(df.loc[i+1, 'Societe']):
        df.loc[i+1, 'Societe']=df.loc[i, 'Societe']
        
    if not pd.isnull(df.loc[i, 'Type']) and pd.isnull(df.loc[i+1, 'Type']) and \
       df.loc[i, 'Ref']==df.loc[i+1, 'Ref']:
        df.loc[i+1, 'Type']=df.loc[i, 'Type']

In [65]:
for i in range(len(df['idx'])-1):    
    if df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur'])\
    and re.findall('Forage',df.loc[i, 'Profondeur']):
        df.loc[i+1,'idx']=df.loc[i, 'Profondeur'][0]+str(df.loc[i, 'Ref'])
        w=df.loc[i, 'Profondeur'][0]
    elif df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur']):
        df.loc[i+1,'idx']=w+str(df.loc[i, 'Ref'])
    
    if df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur'])\
    and re.findall('Tranch',df.loc[i, 'Profondeur']):
        df.loc[i+1,'idx']=df.loc[i, 'Profondeur'][0]+str(df.loc[i, 'Ref'])
        w=df.loc[i, 'Profondeur'][0]
    elif df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur']):
        df.loc[i+1,'idx']=w+str(df.loc[i, 'Ref'])
     
   # if df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur'])\
   # and re.findall('Moni',df.loc[i, 'Profondeur']):
   #     df.loc[i+1,'idx']=df.loc[i, 'Profondeur'][0]+str(df.loc[i, 'Ref'])
   #     w=df.loc[i, 'Profondeur'][0]
   # elif df.loc[i,'Ref']==df.loc[i+1,'Ref'] and not pd.isnull(df.loc[i, 'Profondeur']):
   #     df.loc[i+1,'idx']=w+str(df.loc[i, 'Ref'])

In [66]:
df['Ref']=df['idx'].apply(lambda x : x if re.findall('F|T', str(x)) else '')
df['Ref']=df['idx'].apply(lambda x : x.replace('Monito ', 'Mon') if re.findall('Monit', str(x)) else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Ref']=df['idx'].apply(lambda x : x if re.findall('F|T', str(x)) else '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Ref']=df['idx'].apply(lambda x : x.replace('Monito ', 'Mon') if re.findall('Monit', str(x)) else x)


In [67]:
df['Type']=df['Type'].apply(lambda x: 'Piezo' if not pd.isnull(x) else '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Type']=df['Type'].apply(lambda x: 'Piezo' if not pd.isnull(x) else '')


In [68]:
df['Ref']=df['Ref'].apply(lambda x: re.sub("\.1","a",str(x)) if re.search(r"\.1", str(x)) else x)
df['Ref']=df['Ref'].apply(lambda x: re.sub("\.2","b",str(x)) if re.search(r"\.2", str(x)) else x)
df['Ref']=df['Ref'].apply(lambda x: re.sub("\.3","c",str(x)) if re.search(r"\.3", str(x)) else x)
df['Ref']=df['Ref'].apply(lambda x: re.sub("\.4","d",str(x)) if re.search(r"\.4", str(x)) else x)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Ref']=df['Ref'].apply(lambda x: re.sub("\.1","a",str(x)) if re.search(r"\.1", str(x)) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Ref']=df['Ref'].apply(lambda x: re.sub("\.2","b",str(x)) if re.search(r"\.2", str(x)) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Ref']

In [69]:
gen_id_dated(df, ref_col='Ref', date_col='Date_ouv')

Generation of ID-dated...
Using column ' Date_ouv ' in the (geo)dataframe !
Process ended, check you (geo)dataframe


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gdf['ID_date'] = gdf[date_col].apply(lambda x: str(x.year) + '-' if not pd.isnull(x) else '') + gdf[


In [70]:
df.loc[df.query('Profondeur!=Profondeur' ).index,'Profondeur']=''

In [71]:
df['Method']=''
            
for i in range(len(df['Method'])):
    if not pd.isnull(df.loc[i, 'Gouge_75']) : df.loc[i, 'Method']='Gouge_75'
    if not pd.isnull(df.loc[i, 'MFT_145']) : df.loc[i, 'Method']='MFT_145'
    if not pd.isnull(df.loc[i, 'Liner_60']) : df.loc[i, 'Method']='Liner_60'
    if not pd.isnull(df.loc[i, 'carottier']) : df.loc[i, 'Method']='carrotier'
    if not pd.isnull(df.loc[i, 'tarrière']) : df.loc[i, 'Method']='tarrière'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Method']=''


In [72]:
df.drop(df.query('Profondeur.str.contains("Forage") and Profondeur!="Forage bloqué"', engine='python').index, inplace=True)
df.drop(df.query('Profondeur.str.contains("Tranc") and Profondeur!="Tranchée bloqué"', engine='python').index, inplace=True)
df.drop(df.query('Profondeur.str.contains(".orage|..ranch", regex=True)', engine='python').index, inplace=True)
df.drop(df.fillna('').query('Description.str.contains("^.orage bloq|^.ranc.* bloq|^.*efus", regex=True)', engine='python').index, inplace=True)
df.drop(df.query('Ref!=Ref').index, inplace=True)
df.drop(columns=['MFT_145','Gouge_75','Liner_60', 'carottier', 'tarrière', 'idx'], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [73]:
df['Litho_top'] = df['Profondeur'].apply(lambda x: x.replace(',','.').split('-')[0].strip(' m'))
df['Litho_base'] = df['Profondeur'].apply(lambda x: x.replace(',','.').split('-')[-1].strip(' m'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Litho_top'] = df['Profondeur'].apply(lambda x: x.replace(',','.').split('-')[0].strip(' m'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Litho_base'] = df['Profondeur'].apply(lambda x: x.replace(',','.').split('-')[-1].strip(' m'))


In [74]:
df.rename({'Ref':'ID'}, axis=1, inplace=True)
if 'Profondeur' in df.columns: df.drop(columns=['Profondeur'], axis=1, inplace=True)

In [75]:
set([x[0] for x in list(set(df.ID)) if isinstance(x,str)])

{'F', 'T'}

In [76]:
# we have only lithologies here
trch=df.loc[df.query('ID_date.str.contains("T")', engine='python').index] # trenches
trch['Type']='Tranchee'

bh=df.loc[df.query('ID_date.str.contains("F")', engine='python').index] # boreholes
pz=bh.query("Type=='Piezo'")
bh=bh.query("Type!='Piezo'")
bh['Type']='Forage'

In [77]:
len(bh), len(pz), len(trch)

(826, 762, 40)

In [78]:
gdf_viewer(bh, un_val='ID', rows=3)

Rows : 826, columns : 9, Unique on 'ID': 172


interactive(children=(IntSlider(value=3, description='rows', max=826, min=3, readout=False), IntSlider(value=9…

In [79]:
litho=df
source_litho=df.copy()

In [80]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)
    
litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#bh.to_csv(tmp_dir+sheet+'_BH.csv', index=False)
#trch.to_csv(tmp_dir+sheet+'_TRCH.csv', index=False)
source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False) #all lithologies or descriptions data in the source

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_mes_pz:{len(source_mes_pz)} ; '
     f'source_prv_sol:{len(source_prv_sol)} ; source_litho:{len(source_litho)} ;')

source_bh:0 ; source_pz:0 ; source_mes_pz:0 ; source_prv_sol:0 ; source_litho:1629 ;


* **Sheet : 'DONNEES PIEZOS'**

In [81]:
tmp_dir='../../CF_data/synthese/Result_traitem/database_Memoris3/'
sheet='Donnees_piezos'

In [82]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. '+
                        'Siterem - 2017/Database MEMORIS3.xlsx', sheet_name='DONNEES PIEZOS', skiprows=2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df)

Rows : 147, columns : 186


interactive(children=(IntSlider(value=10, description='rows', max=147, min=10, readout=False), IntSlider(value…

In [83]:
sdf=df[[df.columns.to_list()[1]]+df.columns.to_list()[13:17]]
df=df[df.columns.to_list()[1:13]+df.columns.to_list()[17:22]]

In [84]:
df.rename(columns={'Campagne':'Societe','N_piezo.':'ID','X [m]':'X','Y [m]':'Y','Z tête PZ [m]':'Z',
                        'Zsol [m]':'Zsol', 'Prof_PZ [m]':'Long_pz','Section_crépinée [m]':'Long_crep',
                        'Aquifère':'Terrain', 'Caractéristique':'Caractere',
                        'Diamètre_int [m]':'Diam_int_pz','Surnageant [cm]':'Surnageant','Sousnageant [cm]':'Sousnageant',
                        'Description éch. \nOd/turb.':'Opacite_eau','Remarques':'Rmq'}, inplace=True)

#df=df[['ID','X', 'Y', 'Z', 'Zsol', 'Long_pz', 'Long_crep', 'Diam_int_pz', 'Societe']]#[:130]
df=df.query("ID ==ID")
df.replace('-',np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [85]:
df['Type']=df['Long_crep'].apply(lambda x: 'Piezo' if not pd.isnull(x) else '')

In [86]:
df=df[['ID','X','Y','Z','Zsol','Type','Societe','Terrain','Long_pz','Long_crep','Diam_int_pz', 'Surnageant', 
         'Sousnageant','Caractere', 'Rmq', 'Opacite_eau','Zone', 'Sous_zone']]
df['Sousnageant']=df['Sousnageant'].apply(lambda x: x/100 if not pd.isnull(x) else x) #convert unit in [m]
df['Surnageant']=df['Surnageant'].apply(lambda x: x/100 if not pd.isnull(x) else x)

In [87]:
gdf_viewer(df, un_val='ID', rows=3) # all units in [m]

Rows : 130, columns : 18, Unique on 'ID': 130


interactive(children=(IntSlider(value=3, description='rows', max=130, min=3, readout=False), IntSlider(value=1…

In [88]:
prv_eau=df[['ID','Surnageant', 'Sousnageant', 'Caractere','Rmq', 'Opacite_eau']]
prv_eau['Type_ech']='Eau'

pz=df.query("Type=='Piezo'")
pz=pz[['ID','X','Y','Z','Zsol','Type','Societe','Terrain','Long_pz','Long_crep','Diam_int_pz','Zone','Sous_zone']]

ouv=df.query("Type!='Piezo'") # unknown facilities' type (it seems like they are not boreholes)
ouv=ouv[['ID','X','Y','Z','Type','Societe']]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prv_eau['Type_ech']='Eau'


In [89]:
# data in the second part of the initial dataframe
sdf.rename(columns={'N_piezo.':'ID'}, inplace=True)
sdf=sdf.query("ID==ID")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [90]:
df_tmp=pd.DataFrame()
col=sdf.columns.to_list()
ID_mes=0

for i in range(len(sdf)):
    for j in range(1,len(col)):
        d=col[j].strip('\n|.1').split('/')
        df_tmp.loc[ID_mes,'Date_mes']=dtm.date(int(d[2]), int(d[1]), int(d[0]))
        df_tmp.loc[ID_mes,'ID']=str(sdf.loc[i,'ID'])
        
        if j<=2: 
            df_tmp.loc[ID_mes,'Niv_eau_pz']=sdf.iloc[i,j]
            df_tmp.loc[ID_mes,'Niv_eau_sol']=sdf.iloc[i,j+2]
            ID_mes+=1

df_tmp=df_tmp.sort_values('Date_mes').reset_index(drop=True)
df_tmp['ID_mes']=df_tmp['ID'].apply(lambda x: 'Mes_'+str(x))
df_tmp.insert(0, 'ID_mes', df_tmp.pop('ID_mes'))

mes_pz = df_tmp[:-1]

In [91]:
source_pz = pz
source_prv_eau = prv_eau
source_ouv = ouv
source_mes_pz = mes_pz

In [92]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)


source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False) #all lithologies or descriptions data in the source
source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_prv_sol:{len(source_prv_sol)} ; source_prv_eau:{len(source_prv_eau)} ;\n'
     f'source_mes_pz:{len(source_mes_pz)} ; ')

source_bh:0 ; source_pz:117 ; source_litho:1629 ; source_Fac-uknw:13 ; source_prv_sol:0 ; source_prv_eau:130 ;
source_mes_pz:260 ; 


* **Sheet : 'DRAINS ET PIEZOS ENEL'**

In [93]:
tmp_dir='../../CF_data/synthese/Result_traitem/database_Memoris3/'
sheet='Drains_Pz_ENEL'

In [94]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. Siterem - 2017/Database MEMORIS3.xlsx', 
                        sheet_name='DRAINS ET PIEZOS ENEL', skiprows=1)

df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df)

Rows : 147, columns : 68


interactive(children=(IntSlider(value=10, description='rows', max=147, min=10, readout=False), IntSlider(value…

In [95]:
df.rename(columns={'N°':'ID', 'Date ':'Date_prv','Hauteur de la chambre ':'Ht_Chbre','T':'Temp'}, inplace=True)

df_tmp=df[df.columns.to_list()[:2]+df.columns.to_list()[10:-2]]
df=df[df.columns.to_list()[:7]]

In [96]:
df.columns.to_list()[:12]

['ID', 'Date_prv', 'ETUDE', 'X', 'Y', 'Zsol', 'Ht_Chbre']

In [97]:
df=df[['ID', 'X', 'Y', 'Zsol', 'Ht_Chbre']][2:21]
df.replace('-',np.nan, inplace=True)
df.drop(index=[5], inplace=True)
df.loc[:14,'Type']=''
df.loc[15:,'Type']='Piezo'
#df['Zsol']=df['Zsol'].apply(lambda x: x if not pd.isnull(x) else np.nan)

#gdf_viewer(df, un_val='ID', rows=3)

In [98]:
pz=df.query("Type=='Piezo'")
ouv=df.query("Type!='Piezo'")

In [99]:
sdf=df_tmp.query('ID==ID').reset_index(drop=True)

In [100]:
df_tmp=pd.DataFrame()
cols=[sdf.columns.to_list()[2]]+sdf.columns.to_list()[4:6]
ID_mes=0
d=['01/10/2013','01/11/2015','01/12/2016'] # 01/11/2015 added by me (according to data observation)

for i in range(len(sdf)):
    k=0
    for j in cols:
        #df_tmp.loc[ID_mes,'ID_mes']='Mes_'+str(ID_mes)
        df_tmp.loc[ID_mes,'Date_mes']=dtm.date(int(d[k].split('/')[2]), int(d[k].split('/')[1]), 
                                                             int(d[k].split('/')[0]))
        df_tmp.loc[ID_mes,'ID']=str(sdf.loc[i,'ID'])
        df_tmp.loc[ID_mes,'Niv_eau_sol']=sdf.loc[i,j]
                
        if df_tmp.loc[ID_mes, 'Date_mes']==sdf.loc[i, 'Date_prv'] and \
        df_tmp.loc[ID_mes, 'ID']==sdf.loc[i, 'ID']:
            df_tmp.loc[ID_mes, sdf.columns.to_list()[6:-1]]=sdf.iloc[i, 6:-1]
        
        ID_mes+=1
        k+=1
df_tmp.replace('-', np.nan, inplace=True)

In [101]:
df_tmp=na_line_drop(df_tmp, 3)

39 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [102]:
df_tmp['ID_mes']=df_tmp['ID'].apply(lambda x: 'Mes_'+x)
df_tmp['CE']=df_tmp['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [103]:
mes_pz=df_tmp[['ID_mes', 'Date_mes', 'ID', 'Niv_eau_sol', 'pH', 'CE', 'Temp', 'ORP','Odiss']]

an=df_tmp[['ID','arsenic', 'cadmium', 'chrome', 'cobalt', 'cuivre', 'mercure',
       'plomb', 'nickel', 'zinc', 'CN_libre', 'CN_totaux', 'CN_totaux.1',
       'CN_totaux.2', 'thiocyanate', 'benzène', 'toluène', 'éthylbenzène',
       'orthoxylène', 'para- et métaxylène', 'xylènes', 'BTEX total',
       'styrène', 'Iph.', 'naphtalène', 'anthracène', 'phénanthrène',
       'fluoranthène', 'benzo(a)anthracène', 'chrysène', 'benzo(a)pyrène',
       'benzo(ghi)pérylène', 'benzo(k)fluoranthène', 'indéno(1,2,3-cd)pyrène',
       '\nC5-C8', 'C8-C10', 'C10-C12', 'C12-C16', ' C16 - C21', 'C21 - C35',
       'C35 - C40', 'totaux C10-C35', 'C10-C12.1', 'C12-C22', 'C22-C30',
       'C30-C40', 'Totaux C10-C40']]

#another way to do this
#an=df_tmp[df_tmp.columns.to_list()[1:3]+df_tmp.columns.to_list()[9:]]

In [104]:
an=na_line_drop(an, 2)
an.insert(1, 'Type_ech', 'Eau')
an.rename(columns={'ID':'ID_ech'}, inplace=True)
#an['Anl_ID']=an['ID'].apply(lambda x: 'Anl_'+str(x))
#an.insert(0, 'Anl_ID', an.pop('Anl_ID'))
# or #an.insert(an.columns.to_list().index('ID')+1, 'Type_ech', an.pop('Type_ech'))

1 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['line_na']=False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [105]:
an=col_ren(an, name=pol_field_model, mode=1)

In [106]:
gdf_viewer(an)

Rows : 17, columns : 48


interactive(children=(IntSlider(value=10, description='rows', max=17, min=10, readout=False), IntSlider(value=…

In [107]:
an=dble_col_drop(an)

column(s) dropped: ['13:CN_tot', '14:CN_tot', '43:Fract_C10C12']


Data merging

In [108]:
source_pz, error_df=gdf_merger(source_pz, pz, 'outer', 'ID')

In [109]:
source_mes_pz, error_df=gdf_merger(source_mes_pz, mes_pz, 'outer', 'ID', fcol='ID_mes')

In [110]:
source_ouv, error_df=gdf_merger(source_ouv, ouv, 'outer', 'ID')

In [111]:
source_an=an

In [112]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False) #all lithologies or descriptions data in the source
source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_prv_sol:{len(source_prv_sol)} ; source_prv_eau:{len(source_prv_eau)} ;\n'
     f'source_mes_pz:{len(source_mes_pz)} ; source_an:{len(source_an)} ;')

source_bh:0 ; source_pz:123 ; source_litho:1629 ; source_Fac-uknw:25 ; source_prv_sol:0 ; source_prv_eau:130 ;
source_mes_pz:278 ; source_an:17 ;


* **Sheet : 'RESULTS_EAU' (F)**

In [113]:
tmp_dir='../../CF_data/synthese/Result_traitem/database_Memoris3/'
sheet='Result_eau'

In [114]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. Siterem - 2017/Database MEMORIS3.xlsx', 
                        sheet_name='RESULTS_EAU', skiprows=1)

df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

Rows : 204, columns : 185


interactive(children=(IntSlider(value=5, description='rows', max=204, min=5, readout=False), IntSlider(value=1…

In [115]:
df.drop(0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [116]:
df.rename(columns={'Campagne':'Societe','N_piezo.':'ID','Z tête PZ':'Z','Zsol':'Zsol', 'Prof_PZ':'Long_pz',
                        'Section_crépinée':'Long_crep','Diamètre_int':'Diam_int_pz','Surnageant':'Surnageant',
                        'Sousnageant':'Sousnageant','Description éch.':'Opacite_eau','Remarques':'Rmq',
                        'Aquifère_échantillonné':'Terrain', 'Caractéristique':'Caractere'}, inplace=True)

#df=df[['ID','X', 'Y', 'Z', 'Zsol', 'Long_pz', 'Long_crep', 'Diam_int_pz', 'Societe']]#[:130]
df=df.query("ID ==ID")
df.replace('-',np.nan, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(


In [117]:
# split
sdf=df[[df.columns.to_list()[0]]+df.columns.to_list()[12:16]+df.columns.to_list()[21:26]]
an=df[[df.columns.to_list()[0]]+df.columns.to_list()[26:]]
prv_eau=df[df.columns.to_list()[:3]+df.columns.to_list()[16:21]+['Terrain']]
df=df[df.columns.to_list()[:12]]

In [118]:
df['Type']=df['Long_crep'].apply(lambda x: 'Piezo' if not pd.isnull(x) else '')
df.insert(8, 'Type', df.pop('Type'))

In [119]:
pz=df.query("Type=='Piezo'")
ouv=df.query("Type!='Piezo'")[['ID', 'Societe', 'Zone', 'Sous_zone', 'X', 'Y', 'Z', 'Type']]

In [120]:
prv_eau['Surnageant']=prv_eau['Surnageant'].apply(lambda x: x/100) # to express value in [m]
prv_eau['Sousnageant']=prv_eau['Sousnageant'].apply(lambda x: x/100)

In [121]:
df_tmp=pd.DataFrame()
cols=sdf.columns.to_list()[5:]
ID_mes=0
d=['27/04/2010', '08/09/2010']

for i in range(len(sdf)):
    for k in [0,1]:
        df_tmp.loc[ID_mes,'ID_mes']='Mes_'+str(ID_mes)
        df_tmp.loc[ID_mes,'ID']=str(sdf.loc[i,'ID'])
        
        dt=d[k].split('/')
        df_tmp.loc[ID_mes,'Date_Mes']=dtm.date(int(dt[2]), int(dt[1]),int(dt[0]))
        df_tmp.loc[ID_mes,'Niv_eau_pz']=sdf.iloc[i,k+1]
        df_tmp.loc[ID_mes,'Niv_eau_sol']=sdf.iloc[i,k+3]
        df_tmp.loc[ID_mes, cols]=list(sdf.loc[i,cols])
        ID_mes+=1          
        
df_tmp.replace('-', np.nan, inplace=True)
#df_tmp=df_tmp.sort_values('Date_Mes').reset_index(drop=True)
df_tmp['ID_mes']=df_tmp['ID'].apply(lambda x: 'Mes_'+str(x))
df_tmp.insert(0, 'ID_mes', df_tmp.pop('ID_mes'))

In [122]:
q=df_tmp.query('Niv_eau_pz.isnull() and Niv_eau_sol.isnull()').index
df_tmp.drop(q, inplace=True)
df_tmp['CE']=df_tmp['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)
mes_pz=df_tmp

In [123]:
an=na_line_drop(an,col_n=3)

11 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [124]:
an=dble_col_drop(an)

column(s) dropped: []


In [125]:
an.insert(1, 'Type_ech', 'Eau')
an.rename(columns={'ID':'ID_ech'}, inplace=True)
#an['Anl_ID']=an['ID'].apply(lambda x: 'Anl_'+str(x))
#an.insert(0, 'Anl_ID', an.pop('Anl_ID'))
# or #an.insert(an.columns.to_list().index('ID')+1, 'Type_ech', 'Eau')

In [126]:
an=col_ren(an, name=pol_field_model, mode=1)

data merging

In [127]:
source_pz, error_df=gdf_merger(source_pz, pz, 'outer', 'ID')

In [128]:
source_mes_pz, error_df=gdf_merger(source_mes_pz, mes_pz, 'outer', 'ID', fcol='ID_mes', scope=globals())
if len(error_df)>0 :gdf_viewer(error_df, rows=3)

Ambiguous values in both columns compared, change it manually !
Columns ['Niv_eau_pz_x', 'Niv_eau_pz_y', 'Niv_eau_sol_x', 'Niv_eau_sol_y'] must be dropped manually !
error file created in 'tmp_files/merging_error_log(source_mes_pz-mes_pz).csv'
Rows : 134, columns : 5


interactive(children=(IntSlider(value=3, description='rows', max=134, min=3, readout=False), IntSlider(value=5…

In [129]:
source_ouv, error_df=gdf_merger(source_ouv, ouv, 'outer', 'ID')
if len(error_df)>0 :gdf_viewer(error_df, rows=3)

In [130]:
len(source_an.columns),len(set(source_an.columns))
source_an.columns

Index(['ID_ech', 'Type_ech', 'As', 'Cd', 'Cr', 'cobalt', 'Cu', 'Hg', 'Pb',
       'Ni', 'Zn', 'CN_libre', 'CN_tot', 'thioCN', 'Bnz', 'Toln', 'EthylBnz',
       'O-Xyl', 'P-M-Xyl', 'Xyl', 'BTEX total', 'Styr', 'Iph.', 'Naphta',
       'Anthrc', 'Phenanthr', 'Flranth', 'Bnz(a)anthrc', 'Chrys', 'Bnz(a)pyr',
       'Bnz(ghi)peryl', 'Bnz(k)flranth', 'Indeno(1,2,3-cd)pyr', 'Fract_C5C8',
       'Fract_C8C10', 'Fract_C10C12', 'Fract_C12C16', 'Fract_C16C21',
       'Fract_C21C35', 'Fract_C35C40', 'HC_tot_C10C35', 'Fract_C12C22',
       'Fract_C22C30', 'Fract_C30C40', 'HC_tot_C10C40'],
      dtype='object')

In [131]:
source_an, error_df=gdf_merger(source_an, an, 'outer', 'ID_ech')
if len(error_df)>0 :gdf_viewer(error_df, rows=3)

In [132]:
gdf_viewer(source_an)

Rows : 136, columns : 165


interactive(children=(IntSlider(value=10, description='rows', max=136, min=10, readout=False), IntSlider(value…

In [133]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False) #all lithologies or descriptions data in the source
source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_prv_sol:{len(source_prv_sol)} ; source_prv_eau:{len(source_prv_eau)} ;\n'
     f'source_mes_pz:{len(source_mes_pz)} ; source_an:{len(source_an)} ;')

source_bh:0 ; source_pz:123 ; source_litho:1629 ; source_Fac-uknw:25 ; source_prv_sol:0 ; source_prv_eau:130 ;
source_mes_pz:412 ; source_an:136 ;


* **Sheet : 'RESULTS_SOL'**

In [134]:
tmp_dir='../../CF_data/synthese/Result_traitem/database_Memoris3/'
sheet='Result_sol'

In [135]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Doc_SITEREM/Rapport de synthèse des études de sol et des eaux souterraines. Siterem - 2017/'
                   'Database MEMORIS3.xlsx', sheet_name='RESULTS_SOL', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df)

  warn(msg)


508 NaN lines dropped

Columns dropped :['Unnamed: 94', 'Unnamed: 95', 'Unnamed: 96', 'Unnamed: 97', 'Unnamed: 98', 'Unnamed: 99', 'Unnamed: 100', 'Unnamed: 101', 'Unnamed: 102', 'Unnamed: 103', 'Unnamed: 104', 'Unnamed: 105', 'Unnamed: 106', 'Unnamed: 107', 'Unnamed: 108', 'Unnamed: 109', 'Unnamed: 110', 'Unnamed: 111', 'Unnamed: 112', 'Unnamed: 113', 'Unnamed: 114', 'Unnamed: 115', 'Unnamed: 116', 'Unnamed: 117', 'Unnamed: 118', 'Unnamed: 119', 'Unnamed: 120', 'Unnamed: 121', 'Unnamed: 122', 'Unnamed: 123', 'Unnamed: 124', 'Unnamed: 125', 'Unnamed: 126', 'Unnamed: 127', 'Unnamed: 128', 'Unnamed: 129', 'Unnamed: 130', 'Unnamed: 131', 'Unnamed: 132', 'Unnamed: 133', 'Unnamed: 134', 'Unnamed: 135', 'Unnamed: 136', 'Unnamed: 137', 'Unnamed: 138', 'Unnamed: 139', 'Unnamed: 140', 'Unnamed: 141', 'Unnamed: 142', 'Unnamed: 143', 'Unnamed: 144', 'Unnamed: 145', 'Unnamed: 146', 'Unnamed: 147', 'Unnamed: 148', 'Unnamed: 149', 'Unnamed: 150', 'Unnamed: 151', 'Unnamed: 152', 'Unnamed: 153', 'Unna

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=10, description='rows', max=1423, min=10, readout=False), IntSlider(valu…

In [136]:
df.rename(columns={'Unnamed: 92':'EOX', 'Unnamed: 93':'Idc_phenol','Campagne':'Societe','N_forage':'ID','refus':'Refus',
                   'Prof.\nforage':'Long', 'N_ech':'ID_ech', 'Min_Ech':'Ech_top','Max_Ech':'Ech_base',
                  'Epaisseur remblais':'Ep_remb', 'Epaisseur alluvions':'Ep_alluv', 'Nature':'Polluant',
                  'Min_organo':'Pol_top', 'Max_organo':'Pol_base', 'Fraction   2000 µm':'Fract_2000µ',
                   'Fraction   63 µm':'Fract_63µ', 'Fraction   45 µm':'Fract_45µ','Fraction   16 µm':'Fract_16µ',
                   'Fraction   2 µm':'Fract_2µ'}, inplace=True)

In [137]:
df.drop(columns=[df.columns.to_list()[x] for x in range(len(df.columns))
                      if re.search(r"Unnamed",df.columns.to_list()[x])], axis=1, inplace=True) 
df.replace(r'<|>','', inplace=True, regex=True)
df=df.query('ID==ID')
df['ID']=df['ID'].apply(lambda x : x.replace('Monito ', 'Mon') if re.findall('Monit', str(x)) else x)
df['ID_ech']=df['ID_ech'].apply(lambda x : x.replace('Monito ', 'Mon') if re.findall('Monit', str(x)) else x)
df.replace('-',np.nan, inplace=True)

In [138]:
for i in range(len(df['ID'])):
    #r=re.search('(\w+)/.+',str(df.loc[i, 'ID_ech']))
    #if r : df.loc[i, 'ID']=r.group(1)
    r=re.search('^\d+',str(df.loc[i, 'ID']))
    if r : df.loc[i, 'ID']='F'+str(df.loc[i, 'ID'])

In [139]:
df.columns

Index(['Societe', 'Zone', 'Sous_zone', 'Numéro_zone', 'ID', 'Affectation', 'X',
       'Y', 'Z', 'Long', 'Refus', 'ID_ech', 'Ech_top', 'Ech_base', 'Soumis',
       'Terrain', 'Description', 'Ep_remb', 'Ep_alluv', 'Intensité', 'Pol_top',
       'Pol_base', 'Polluant', 'MS', 'pH H2O', 'T° pH H2O', 'T° pH CaCl2',
       'pH CaCl2', 'T° pH KCl', 'pH KCl', 'T° CE', 'CE', 'MO',
       'Résidus chauffage', 'Argile ', 'Fract_2000µ', 'Fract_63µ', 'Fract_45µ',
       'Fract_16µ', 'Fract_2µ', 'Arsenic', 'Cadmium', 'Chrome_total',
       'Chrome_VI', 'Cobalt', 'Cuivre', 'Mercure', 'Plomb', 'Nickel', 'Zinc',
       'Libres', 'Totaux', 'Non chloro destruct.', 'Thiocyantes',
       'Cyanures totaux EPA', 'Benzène', 'Toluène', 'Ethylbenzène',
       'o-Xylènes', 'mp-Xylènes', 'Xylènes', 'SOM BTEX', 'Styrène',
       'Naphtalène', 'Anthracene', 'Phénanthrène', 'Fluoranthène',
       'Benzoaanthracène', 'Chrysène', 'Benzo(a)pyrene', 'Benzo(ghi)pérylène',
       'Benzo(k)fluoranthène', 'Indéno[123cd]pyrè

In [140]:
sdf=df[['ID','X','Y','Z','Long','Description','Terrain','Ep_remb','Ep_alluv','Refus','Societe','Zone','Sous_zone']]
sdf.insert(5, 'Type', '')

prv_sol=df[['ID','ID_ech', 'Ech_top', 'Ech_base','Polluant','Intensité', 'Pol_top','Pol_base','MS','pH H2O',
            'T° pH H2O', 'T° pH CaCl2','pH CaCl2','T° pH KCl','pH KCl','T° CE','CE','MO',
       'Résidus chauffage', 'Argile ', 'Fract_2000µ','Fract_63µ','Fract_45µ','Fract_16µ','Fract_2µ']]
#mes_sol=df[['ID','ID_ech','MS','pH H2O', 'T° pH H2O', 'T° pH CaCl2','pH CaCl2','T° pH KCl','pH KCl','T° CE','CE','MO',
#       'Résidus chauffage', 'Argile ', 'Fract_2000','Fract_63','Fract_45','Fract_16','Fract_2']]
prv_sol.insert(2, 'Type_ech', 'Sol')

an=df[['ID','ID_ech','Arsenic','Cadmium','Chrome_total','Chrome_VI','Cobalt','Cuivre','Mercure','Plomb','Nickel','Zinc','Libres',
       'Totaux', 'Non chloro destruct.', 'Thiocyantes', 'Cyanures totaux EPA','Benzène', 'Toluène', 'Ethylbenzène',
       'o-Xylènes','mp-Xylènes','Xylènes','SOM BTEX','Styrène','Naphtalène','Anthracene','Phénanthrène',
       'Fluoranthène', 'Benzoaanthracène', 'Chrysène','Benzo(a)pyrene','Benzo(ghi)pérylène','Benzo(k)fluoranthène',
       'Indéno[123cd]pyrène', 'Acenaphtylene', 'Acenaphthene', 'Fluorène','Pyrène', 'Benzo_b_fluoranthene', 
       'Dibenzo[ah]anthracène','SOM VROM 10', 'SOM EPA 16', 'C5_C8', 'C8_C10', 'C10_C12', 'C12_C16','C16_C21', 
       'C21_C35', 'C35_C40', 'SOM_C5_C35', 'C21_C30', 'C30_C35','SOM C10_C40', 'EOX', 'Idc_phenol']]

dfs processing

In [141]:
sdf['Refus']=sdf['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sdf['Refus']=sdf['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')


In [142]:
for i in range(len(sdf['Terrain'])):
    x=sdf.loc[i,'Terrain']
    if re.search('[R|r]em', str(x)) : sdf.loc[i,'Terrain']='RMB'
    elif re.search('[A|a]ll', str(x)) : sdf.loc[i,'Terrain']='ALL'
    elif re.search('[S|s]oc', str(x)) : sdf.loc[i,'Terrain']='SOC'
    elif re.search('[A|a]rg', str(x)) : sdf.loc[i,'Terrain']='ARG'
    else : sdf.loc[i,'Terrain']=''

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [143]:
litho=sdf #lithologies and all facilities without distinction here (because type of facility not defined clearly !)

In [144]:
prv_sol=na_line_drop(prv_sol, 3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['line_na']=False


72 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [145]:
prv_sol=na_col_drop(prv_sol, crit=5, verbose=False)


Columns dropped :['Résidus chauffage']


In [146]:
an.replace('#',np.nan, inplace=True)
an=na_line_drop(an, 2)
an.insert(1, 'Type_ech', 'Sol')
#an['Anl_ID']=an['ID'].apply(lambda x: 'Anl_'+str(x))
#an.insert(0, 'Anl_ID', an.pop('Anl_ID'))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['line_na']=False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


662 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


data merging

In [147]:
#source_mes_sol=mes_sol
source_prv_sol=prv_sol

In [148]:
source_litho, error_df=gdf_merger(source_litho, litho, 'outer', 'ID', scope=globals())

Ambiguous values in both columns compared, change it manually !
Columns ['Societe_x', 'Societe_y', 'Description_x', 'Description_y', 'Type_x', 'Type_y'] must be dropped manually !
error file created in 'tmp_files/merging_error_log(source_litho-sdf).csv'


In [149]:
source_an.columns
an.columns

Index(['ID', 'Type_ech', 'ID_ech', 'Arsenic', 'Cadmium', 'Chrome_total',
       'Chrome_VI', 'Cobalt', 'Cuivre', 'Mercure', 'Plomb', 'Nickel', 'Zinc',
       'Libres', 'Totaux', 'Non chloro destruct.', 'Thiocyantes',
       'Cyanures totaux EPA', 'Benzène', 'Toluène', 'Ethylbenzène',
       'o-Xylènes', 'mp-Xylènes', 'Xylènes', 'SOM BTEX', 'Styrène',
       'Naphtalène', 'Anthracene', 'Phénanthrène', 'Fluoranthène',
       'Benzoaanthracène', 'Chrysène', 'Benzo(a)pyrene', 'Benzo(ghi)pérylène',
       'Benzo(k)fluoranthène', 'Indéno[123cd]pyrène', 'Acenaphtylene',
       'Acenaphthene', 'Fluorène', 'Pyrène', 'Benzo_b_fluoranthene',
       'Dibenzo[ah]anthracène', 'SOM VROM 10', 'SOM EPA 16', 'C5_C8', 'C8_C10',
       'C10_C12', 'C12_C16', 'C16_C21', 'C21_C35', 'C35_C40', 'SOM_C5_C35',
       'C21_C30', 'C30_C35', 'SOM C10_C40', 'EOX', 'Idc_phenol'],
      dtype='object')

In [150]:
source_an, error_df=gdf_merger(source_an, an, 'outer', 'ID_ech', scope=globals()) 

In [151]:
gdf_viewer(prv_sol, un_val='ID', rows=3), gdf_viewer(an, un_val='ID', rows=3) 

Rows : 1351, columns : 25, Unique on 'ID': 259


interactive(children=(IntSlider(value=3, description='rows', max=1351, min=3, readout=False), IntSlider(value=…

Rows : 761, columns : 57, Unique on 'ID': 242


interactive(children=(IntSlider(value=3, description='rows', max=761, min=3, readout=False), IntSlider(value=1…

(None, None)

In [152]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)
source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:123 ; source_litho:7511 ; source_Fac-uknw:25 ; source_an:897 ;
source_prv_sol:1351 ;source_prv_eau:130 ; source_mes_pz:412 ; source_mes_sol:0 ;


--------------------------------------------------------------------------------------------------------

## 3-obsrevations terrain et mesures piézos phase 2.xlsx

* **Sheet : 'Piézométrie'**

In [153]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [154]:
tmp_dir='../../CF_data/synthese/Result_traitem/observ_terrain/'
sheet='Piezometrie'

In [155]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'obsrevations terrain et mesures piézos phase 2.xlsx', sheet_name='Piézométrie', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df)

18 NaN lines dropped

Columns dropped :['Unnamed: 3', 'Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26']
Rows : 31, columns : 19


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=10, description='rows', max=31, min=10, readout=False), IntSlider(value=…

In [156]:
sdf=df[df.columns.to_list()[:3]]
sdf=na_line_drop(sdf,0)
sdf.rename(columns={'Niveau \npiézométrique':'Niv_eau_sol', 'Commentaires ':'Date_prv'}, inplace=True)

9 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['line_na']=False
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [157]:
sdf2=df.loc[:11, df.columns.to_list()[3:-1]]
sdf2.rename(columns={'Unnamed: 7':'Date_prv', 'Unnamed: 8':'Terrain', 'Unnamed: 9':'ID', 'NP/piézo [m]':'Niv_eau_pz',
       'dim. piezo hors sol [m]':'Dim_pz_sol', 'NP/sol [m]':'Niv_eau_sol', 'Prof. piézo/piézo [m]':'Long_pz',
       'Prof. piézo/sol [m]':'Long_pz_sol', 'CE [mS/cm]':'CE','t° [°C]':'Temp','O2 dissous\n[%]':'O_diss', 
        'Observations':'Rmq'}, 
           inplace=True)

In [158]:
for i in range(len(sdf2['ID'])):
    sdf2.loc[i,'ID']=re.sub(r'^P','F', sdf2.loc[i,'ID'])
    
    if pd.isnull(sdf2.loc[i,'CE']) and not pd.isnull(sdf2.loc[i,'CE [µS/cm]']):
        sdf2.loc[i,'CE']=sdf2.loc[i,'CE [µS/cm]']/1000

sdf2.drop(['CE [µS/cm]'], axis=1, inplace=True)

In [159]:
df=df.loc[14:, df.columns.to_list()[3:-1]]
df.rename(columns={'Unnamed: 7':'Date_prv', 'Unnamed: 8':'Terrain', 'Unnamed: 9':'ID', 'NP/piézo [m]':'Niv_eau_pz',
       'dim. piezo hors sol [m]':'Dim_pz_sol', 'NP/sol [m]':'Niv_eau_sol', 'Prof. piézo/piézo [m]':'Long_pz',
       'Prof. piézo/sol [m]':'Long_pz_sol', 'CE [mS/cm]':'CE','t° [°C]':'Temp','O2 dissous\n[%]':'O_diss', 
        'Observations':'Rmq'}, 
           inplace=True)
df.drop([19,20], inplace=True)
df.reset_index(drop=True, inplace=True)

In [160]:
df['ID']

0      P2Msup
1      P4Msup
2     P12Msup
3        P537
4         P99
5        P18c
6         P19
7         P20
8         P21
9         P22
10        P23
11       P24b
12        P25
13        P26
14       P27d
Name: ID, dtype: object

In [161]:
for i in range(len(df['ID'])):
    df.loc[i,'ID']=re.sub(r'^P','F', df.loc[i,'ID'])
    
    if pd.isnull(df.loc[i,'CE']) and not pd.isnull(df.loc[i,'CE [µS/cm]']):
        df.loc[i,'CE']=df.loc[i,'CE [µS/cm]']/1000
        
df.drop(['CE [µS/cm]', 'O_diss'], axis=1, inplace=True)

In [162]:
df=na_col_drop(df, 5)
sdf2=na_col_drop(sdf2, 5,)


Columns dropped :[]

Columns dropped :['ORP', 'O_diss']


In [163]:
prv_eau, error_df=gdf_merger(sdf2, df, how='outer', col='ID', fcol='ID')

In [164]:
gdf_viewer(prv_eau, rows=5, un_val='ID')

Rows : 27, columns : 13, Unique on 'ID': 27


interactive(children=(IntSlider(value=5, description='rows', max=27, min=5, readout=False), IntSlider(value=12…

In [165]:
prv_eau=prv_eau[['ID','Date_prv','Long_pz', 'Long_pz_sol','Dim_pz_sol','Terrain','Niv_eau_sol', 'Niv_eau_pz',
                 'pH', 'Temp', 'CE', 'ORP','Rmq']]
prv_eau.insert(1,'Type_ech','Eau')

source_prv_eau=prv_eau

In [166]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)

source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:0 ;
source_prv_sol:0 ;source_prv_eau:27 ; source_mes_pz:0 ; source_mes_sol:0 ;


--------------------------------------------------------------------------------------------------------

## 4-profondeur de contact campagne de forages octobre 2019.xlsx

* **Sheet : 'Feuil1'**

In [167]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [168]:
tmp_dir='../../CF_data/synthese/Result_traitem/Prof_contact_sol_forage/'
sheet='Feuil1'

In [169]:
df = pd.read_excel('../../CF_data/Data_UMONS/ouvrages/profondeur de contact campagne de forages octobre 2019.xlsx', 
                   sheet_name='Feuil1', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df)

2 NaN lines dropped

Columns dropped :[]
Rows : 8, columns : 5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=8, description='rows', max=8, min=8, readout=False), IntSlider(value=5, …

In [170]:
df.rename(columns={'n°forage ':'ID','profondeur(m)':'Long_for','x':'X', 'y':'Y', 'z':'Z'}, inplace=True)
df['Type']='Forage' # type is not defined clearly in data
df['ID']=df['ID'].apply(lambda x: 'F'+str(x).replace('.0',''))

source_bh=df

In [171]:
source_bh

Unnamed: 0,ID,Long_for,X,Y,Z,Type
0,F205,3.2,152887.693,122594.62,101.804,Forage
1,F208,3.4,152885.296,122592.986,101.848,Forage
2,F212,3.4,152882.85,122591.453,101.93,Forage
3,F207,3.4,152892.925,122592.662,101.889,Forage
4,F214,3.6,152888.082,122588.486,101.854,Forage
5,F217,4.2,152886.185,122587.152,101.815,Forage
6,F225,4.0,152881.112,122580.962,101.669,Forage
7,F304,3.6,152882.735,122586.452,101.824,Forage


In [172]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:8 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:0 ;
source_prv_sol:0 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


--------------------------------------------------------------------------------------------------------

## 5-Forages_Pilote_Decoupe.xlsx

* **Sheet : 'leve'**

In [173]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [174]:
tmp_dir='../../CF_data/synthese/Result_traitem/Forage_Pilote/'
sheet='leve_Z_elect_pos'

In [175]:
df = pd.read_excel('../../CF_data/Data_UMONS/geometrie_electrodes_et_sondes/Forages_Pilote_Decoupe.xlsx', 
                   sheet_name='leve')#, skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

0 NaN lines dropped

Columns dropped :['Unnamed: 10']
Rows : 72, columns : 11


interactive(children=(IntSlider(value=5, description='rows', max=72, min=5, readout=False), IntSlider(value=11…

In [176]:
df.columns

Index(['Ref_puits', 'Niveau mesuré', 'Niveau corrigé', 'Bouteille',
       'decoupage [m]', 'Z_diff [m] repere_local', 'long_fin [m]',
       'Pos_Inox_#1 [m]', 'Pos_Inox_#6 [m]', 'Pos_Impol_#3 [m]',
       'Unnamed: 11'],
      dtype='object')

In [177]:
df.rename(columns={'Ref_puits':'ID','Niveau mesuré':'Z_mes', 'Niveau corrigé':'Z','Z_diff [m] repere_local':'Diff_Z_local',
                   'long_fin [m]':'Long_for','Pos_Inox_#1 [m]':'Pos_Inox_#1', 
                   'Pos_Inox_#6 [m]':'Pos_Inox_#6', 'Pos_Impol_#3 [m]':'Pos_Impol_#3'}, inplace=True)

In [178]:
df['Type']='Forage' # type is not defined clearly in data
df['ID']=df['ID'].apply(lambda x: 'F'+str(x).replace('.0',''))

source_elc = df[['ID','Pos_Inox_#6', 'Pos_Impol_#3']] # 'ID' is for boreholes
source_bh = df[['ID','Z','Diff_Z_local','Long_for', 'Type']]# Z_local origin = 145.5 [m]

In [179]:
source_bh

Unnamed: 0,ID,Z,Diff_Z_local,Long_for,Type
0,F10,153.5,-8.0,,Forage
1,F11,164.5,-19.0,,Forage
2,F12,156.5,-11.0,,Forage
3,F13,155.0,-9.5,,Forage
4,F14,149.7,-4.2,,Forage
...,...,...,...,...,...
67,F223,177.0,-31.5,5.680,Forage
68,F224,175.5,-30.0,5.665,Forage
69,F225,181.5,-36.0,5.740,Forage
70,F226,179.5,-34.0,5.765,Forage


In [180]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
source_elc.to_csv(tmp_dir+'source_Electrodes.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:72 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:0 ;
source_prv_sol:0 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


--------------------------------------------------------------------------------------------------------

## 6-Liste XY investigations.xlsx
* **Sheet : 'SOL_EAU'**

In [181]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [182]:
tmp_dir='../../CF_data/synthese/Result_traitem/Liste_XY/'
sheet='Sol_Eau'

In [183]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
                   'supplémentaires/Liste XY investigations.xlsx', sheet_name='SOL')#, skiprows=4)
df['Type_ech']='Sol'

df1 = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
                   'supplémentaires/Liste XY investigations.xlsx', sheet_name='EAU PR')#, skiprows=4)
df1['Type_ech']='Eau'
df1['Terrain']='SOC'

df2 = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
                   'supplémentaires/Liste XY investigations.xlsx', sheet_name='EAU RB')#, skiprows=4)
df2['Type_ech']='Eau'
df2['Terrain']='RMB'

df3 = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Doc_SITEREM/Etude de caracterisation. SITEREM - 2011/Documents '
                   'supplémentaires/Liste XY investigations.xlsx', sheet_name='EAU ALL')#, skiprows=4)
df3['Type_ech']='Eau'
df3['Terrain']='ALL'

In [184]:
df2=na_line_drop(df2,0)
df2=na_col_drop(df2,1)

0 NaN lines dropped

Columns dropped :['Unnamed: 3', 'Unnamed: 4']


In [185]:
mdf, error_df=gdf_merger(df1, df2, 'outer', 'N°')

In [186]:
mdf=mdf.append(df3)
mdf=mdf.dropna(how='any', subset=['N°'])

In [187]:
mdf, error_df=gdf_merger(mdf, df, 'outer', 'N°') 

In [188]:
mdf.rename(columns={'N°':'ID'}, inplace=True)
source_bh = mdf # we only have boreholes 'ID' here, no Z, no date

In [189]:
gdf_viewer(source_bh, rows=5)

Rows : 257, columns : 5


interactive(children=(IntSlider(value=5, description='rows', max=257, min=5, readout=False), IntSlider(value=5…

In [190]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:257 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:0 ;
source_prv_sol:0 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


--------------------------------------------------------------------------------------------------------

## 7-Résultats phase 1_MEMORIS.xls
* **Sheet : 'Résult SOL'**

In [191]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [192]:
tmp_dir='../../CF_data/synthese/Result_traitem/Phase_1_Memoris/'
sheet='Result_sol'

In [193]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 1/'
                   'Résultats phase 1_MEMORIS.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

1 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2']
Rows : 135, columns : 35


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=135, min=5, readout=False), IntSlider(value=1…

In [194]:
prv_sol=df.loc[:35]
an=df.loc[36:]

In [195]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [196]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [197]:
prv_sol.drop(list(range(5)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,1)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)


Columns dropped :['col_1', "Nom / description d'échantillon", 'Date de prélèvement', "Nature de l'étude (*)", 'Terrain', 'Epaisseur de remblais', 'Epaisseur alluvions', "Nature de l'observation organoleptique", 'LOCALISATION/AFFECTATION(S) - USAGE(S)', 'zone', 'parcelle (selon dénom.)', "Type d'usage de fait actuel", "Type d'usage de fait futur", "Type d'usage utilisé pour comparer les résultats", 'Lieux de prélèvement', 'X Lambert', 'Y Lambert', 'DESCRIPTION SOMMAIRE', 'Type de recouvrement de surface (***)', 'Sol en place (S : souillé, NS : non souillé)', 'Remblais de terre (S : souillé, NS : non souillé)', 'Remblais technique (S : souillé, NS : non souillé)', 'Déchet', 'Matières organiques', 'GRANULOMETRIE', 'Fraction argileuse']
0 NaN lines dropped


In [198]:
name=['ID_ech', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
prv_sol=col_ren(prv_sol, name=name, mode=1)

In [199]:
for i in range(len(prv_sol['Description'])):
    x = prv_sol.loc[i,'Description']
    if x=='R': prv_sol.loc[i,'Description']='Remblais'
    elif x=='L': prv_sol.loc[i,'Description']='Limons'

prv_sol['Refus']=prv_sol['Refus'].apply(lambda x: 'x' if not re.search('x|X', str(x)) else '')
prv_sol.insert(1,'Type_ech','Sol')

In [200]:
gdf_viewer(prv_sol, rows=3)

Rows : 29, columns : 11


interactive(children=(IntSlider(value=3, description='rows', max=29, min=3, readout=False), IntSlider(value=11…

In [201]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [202]:
an=col_ren(an, 1)

In [203]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [204]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')


Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'BTEX totaux', 'PHENOLS', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOHALOGENES VOLATILS', 'col_63', 'EOX (****)', 'HYDROCARBURES TOTAUX', 'fraction aromat. C6-C7', 'fraction aromat. C7-C8', 'fraction aromat. C8-C10', 'fraction aliphat. C5-C6', 'fraction aliphat. C6-C8', 'fraction aliphat. C8-C10', 'Fraction C35 - C40', 'Hydrocarbures totaux C10-C40', 'METHYL-TERT-BUTYL-ETHER', 'POLYCHLOROBIPHENYL (PCB)', 'PCB 28', 'PCB 52', 'PCB 101', 'PCB 118', 'PCB 138', 'PCB 153', 'PCB 180', 'PCB totaux (7)', 'F4/2M*', 'Teneur mesurée', 'Teneur mesurée (souligne/gras)', 'Teneur mesurée (gras/grisé)', "(*) CP = Campagne de prélèvements; Ex = Expertise; ES = Etude de sol; EO = Etude d'orientation; EC = Etude de caractérisation; SA = Suivi d'assainissement", '(**) L: limon, A: Argile, S: Sable, R: Remblai', '(***) ib : imperméable (béton) ; ih : imperméable hydrocarboné ; p : perméable (gra

In [205]:
an=col_ren(an, name=pol_field_model, mode=1)

In [206]:
gdf_viewer(an, rows=5) 

Rows : 29, columns : 64


interactive(children=(IntSlider(value=5, description='rows', max=29, min=5, readout=False), IntSlider(value=12…

In [207]:
source_prv_sol=prv_sol
source_an=an

In [208]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:29 ;
source_prv_sol:29 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Résult EAU'**

In [209]:
tmp_dir='../../CF_data/synthese/Result_traitem/Phase_1_Memoris/'
sheet='Result_eau'

In [210]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 1/'
                   'Résultats phase 1_MEMORIS.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)
gdf_viewer(df, rows=5)

1 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2']
Rows : 136, columns : 23


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=136, min=5, readout=False), IntSlider(value=1…

In [211]:
prv_eau=df.loc[:32]
an=df.loc[33:]

In [212]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [213]:
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=col_ren(prv_eau, 1)

In [214]:
prv_eau['CE']=prv_eau['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [215]:
prv_eau.drop(list(range(5)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)

In [216]:
prv_eau=dble_col_drop(prv_eau)

column(s) dropped: ['21:DESCRIPTION SOMMAIRE', '27:Prof. arrêt du forage']


In [217]:
prv_eau=na_col_drop(prv_eau,1)
prv_eau=na_line_drop(prv_eau,3)
prv_eau.reset_index(drop=True, inplace=True)


Columns dropped :['col_1', "Nature de l'étude (*)", 'Observations organoleptiques', 'LOCALISATION/AFFECTATION(S) - USAGE(S)', 'parcelle (selon dénom.)', "Type d'usage de fait actuel", "Type d'usage de fait futur", "Type d'usage utilisé pour comparer les résultats", 'Lieux de prélèvement', 'DESCRIPTION SOMMAIRE', 'Type de recouvrement de surface (**)', 'Sol en place (S : souillé, NS : non souillé)', 'Remblais de terre (S : souillé, NS : non souillé)', 'Remblais technique (S : souillé, NS : non souillé)', 'Déchet', 'PARAMETRES PHYSICO-CHIMIQUES ', 'ORP']
0 NaN lines dropped


In [218]:
name=['ID_ech','Date_prv','Num_maille','Affectation','X','Y','Zsol','Long_for','Prof_crep','Long_pz',
      'Niv_eau_sol','pH','CE','T']
prv_eau=col_ren(prv_eau, name=name, mode=1)
prv_eau.insert(1,'Type_ech','Eau')

In [219]:
prv_eau.columns

Index(['ID_ech', 'Type_ech', 'Date_prv', 'Num_maille', 'Affectation', 'X', 'Y',
       'Zsol', 'Long_for', 'Prof_crep', 'Long_pz', 'Niv_eau_sol', 'pH', 'CE',
       'T'],
      dtype='object')

In [220]:
prv_eau['Prof_crep'].replace('\[|\]','', regex=True, inplace=True)
for i in range(len(prv_eau)):
    c=prv_eau.loc[i,'Prof_crep']
    prv_eau.loc[i,'crep_top']=c.split('-')[0]
    prv_eau.loc[i,'crep_base']=c.split('-')[1]
    
prv_eau.drop(columns=['Prof_crep'], inplace=True)

In [221]:
prv_eau['ID_ech'].replace('Canne ', 'Can', inplace=True, regex=True)
prv_eau['ID_ech'].replace('\n', ' ', inplace=True, regex=True)

In [222]:
pz=prv_eau[['ID_ech', 'Affectation', 'X', 'Y', 'Zsol', 'Long_for','Long_pz', 'crep_top', 'crep_base']]
pz.rename(columns={'ID_ech':'ID'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [223]:
for i in range(len(pz.ID)):
    c=pz.loc[i, 'ID']
    pz.loc[i, 'ID']=re.search("(\w+\d+(?:\w)?)",c).group(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [224]:
pz.drop_duplicates(subset=['ID'], keep='first', ignore_index=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pz.drop_duplicates(subset=['ID'], keep='first', ignore_index=True, inplace=True)


In [225]:
gdf_viewer(prv_eau, rows=5)

Rows : 17, columns : 16


interactive(children=(IntSlider(value=5, description='rows', max=17, min=5, readout=False), IntSlider(value=12…

In [226]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [227]:
an=col_ren(an, 1)

In [228]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [229]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [230]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')


Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'PHENOLS', 'crésols (total)', 'CHLOROPHENOLS', '2-chlorophénol', 'monochlorophénol total', 'dichlorophénol total', '2,4,5-trichlorophénol', '2,4,6-trichlorophénol', 'trichlorophénol total', '2,3,4,6- tétrachlorophénol', 'tétrachlorophénol total', 'pentachlorophénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOCHLORES VOLATILS', 'CHLOROBENZENES', 'monochlorobenzène', '1,3-dichlorobenzène', '1,2-dichlorobenzène', '1,4-Dichlorobenzène', '1,2,4,5- et 1,2,3,5-tétrachlorobenzènes', 'pentachlorobenzène', 'hexachlorobenzène', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER', 'POLYCHLOROBIPHENYLS (PCB)', 'PCB totaux (7)', 'Teneur mesurée (souligne/gras)', 'Teneur mesurée (gras/grisé)', "(*) CP = Campagne de prélèvements; Ex = Expertise; ES = Etude de sol; EO = Etude d'orientation; EC = Etude de caractérisation; SA = Suivi d'assainissement", '(**) ib : imperméable (béton) ; ih : imperméable hydrocar

In [231]:
an=col_ren(an, name=pol_field_model, mode=1)

In [232]:
gdf_viewer(an, rows=5) 

Rows : 17, columns : 70


interactive(children=(IntSlider(value=5, description='rows', max=17, min=5, readout=False), IntSlider(value=12…

In [233]:
source_pz=pz
source_prv_eau=prv_eau
source_an=source_an.append(an)

In [234]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:14 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:46 ;
source_prv_sol:29 ;source_prv_eau:17 ; source_mes_pz:0 ; source_mes_sol:0 ;


## 8-Résultats phase 2_MEMORIS.xls
* **Sheet : 'Résult SOL'**

In [235]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [236]:
tmp_dir='../../CF_data/synthese/Result_traitem/Phase_2_Memoris/'
sheet='Result_SOL'

In [237]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'Résultats phase 2_MEMORIS.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)
gdf_viewer(df, rows=5)

1 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2']
Rows : 135, columns : 31


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=135, min=5, readout=False), IntSlider(value=1…

In [238]:
prv_sol=df.loc[:35]
an=df.loc[36:]

In [239]:
an.loc[0.5] = df.loc[1] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [240]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [241]:
prv_sol.drop(list(range(5)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,1)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)


Columns dropped :['col_0', "Nom / description d'échantillon", "Nature de l'étude (*)", 'Terrain', 'Epaisseur de remblais', 'Epaisseur alluvions', "Nature de l'observation organoleptique", 'LOCALISATION/AFFECTATION(S) - USAGE(S)', 'zone', 'parcelle (selon dénom.)', "Type d'usage de fait actuel", "Type d'usage de fait futur", "Type d'usage utilisé pour comparer les résultats", 'Lieux de prélèvement', 'X Lambert', 'Y Lambert', 'DESCRIPTION SOMMAIRE', 'Type de recouvrement de surface (***)', 'Sol en place (S : souillé, NS : non souillé)', 'Remblais de terre (S : souillé, NS : non souillé)', 'Remblais technique (S : souillé, NS : non souillé)', 'Déchet', 'Matières organiques', 'GRANULOMETRIE', 'Fraction argileuse']
0 NaN lines dropped


In [242]:
name=['ID_ech', 'Date_prv', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
prv_sol=col_ren(prv_sol, name=name, mode=1)

In [243]:
set(prv_sol['Description'])

{'L', 'LA', 'LS', 'R'}

In [244]:
for i in range(len(prv_sol['Description'])):
    x = prv_sol.loc[i,'Description']
    if x=='R': prv_sol.loc[i,'Description']='Remblais'
    elif x=='L': prv_sol.loc[i,'Description']='Limons'
    elif x=='LA': prv_sol.loc[i,'Description']='Limons et argiles'
    elif x=='LS': prv_sol.loc[i,'Description']='Limons et sables'

prv_sol['Refus']=prv_sol['Refus'].apply(lambda x: 'x' if not re.search('#', str(x)) else '')
prv_sol.insert(1,'Type_ech','Sol')

In [245]:
gdf_viewer(prv_sol, rows=3)

Rows : 25, columns : 12


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=12…

In [246]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [247]:
an=col_ren(an, 1)

In [248]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [249]:
an.drop(list(range(5)), axis=0, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')


Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'BTEX totaux', 'PHENOLS', 'Phénol', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOHALOGENES VOLATILS', '1,1-Dichloroéthane', '1,2-Dichloroéthane', '1,1-dichloroéthène', 'Cis-1,2-dichloroéthène', 'Trans 1,2-dichloroéthylène', 'Dichlorométhane', 'Totaux (cis,trans) 1,2-dichloroéthènes', '1,2-dichloropropane', 'Tétrachloroéthylène', 'Tétrachlorométhane', '1,1,1-Trichloroéthane', '1,1,2-Trichloroéthane', 'Trichloroéthylène', 'Chloroforme', 'Chlorure de vinyle', 'col_63', 'EOX (****)', 'HYDROCARBURES TOTAUX', 'Fraction C35 - C40', 'Hydrocarbures totaux C10-C40', 'METHYL-TERT-BUTYL-ETHER', 'MTBE', 'POLYCHLOROBIPHENYL (PCB)', 'PCB 28', 'PCB 52', 'PCB 101', 'PCB 118', 'PCB 138', 'PCB 153', 'PCB 180', 'PCB totaux (7)', 'F4/2M*', 'Teneur mesurée', 'Teneur mesurée (souligne/gras)', 'Teneur mesurée (gras/grisé)', "(*) CP = Campagne de prélèvements; Ex = Expertise; ES = Etude de sol; EO = 

In [250]:
an=col_ren(an, name=pol_field_model, mode=1)

In [251]:
gdf_viewer(an, rows=5, cols=20) 

Rows : 25, columns : 53


interactive(children=(IntSlider(value=5, description='rows', max=25, min=5, readout=False), IntSlider(value=20…

In [252]:
source_prv_sol=prv_sol
source_an=an

In [253]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:25 ;
source_prv_sol:25 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Résult EAU'**

In [254]:
tmp_dir='../../CF_data/synthese/Result_traitem/Phase_2_Memoris/'
sheet='Result_eau'

In [255]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'Résultats phase 2_MEMORIS.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

0 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2']
Rows : 138, columns : 17


interactive(children=(IntSlider(value=5, description='rows', max=138, min=5, readout=False), IntSlider(value=1…

In [256]:
prv_eau=df.loc[:32]
an=df.loc[33:]

In [257]:
an.loc[0.5] = df.loc[1] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [258]:
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=col_ren(prv_eau, 1)

In [259]:
prv_eau['CE']=prv_eau['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [260]:
prv_eau.drop(list(range(5)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)

In [261]:
prv_eau=dble_col_drop(prv_eau)

column(s) dropped: ['21:DESCRIPTION SOMMAIRE', '27:Prof. arrêt du forage']


In [262]:
prv_eau=na_col_drop(prv_eau,1)
prv_eau=na_line_drop(prv_eau,3)
prv_eau.reset_index(drop=True, inplace=True)


Columns dropped :['Nom du piézomètre', "Nature de l'étude (*)", 'Observations organoleptiques', 'LOCALISATION/AFFECTATION(S) - USAGE(S)', 'Numéro de maille', 'parcelle (selon dénom.)', "Type d'usage de fait actuel", "Type d'usage de fait futur", "Type d'usage utilisé pour comparer les résultats", 'Lieux de prélèvement', 'DESCRIPTION SOMMAIRE', 'Type de recouvrement de surface (**)', 'Sol en place (S : souillé, NS : non souillé)', 'Remblais de terre (S : souillé, NS : non souillé)', 'Remblais technique (S : souillé, NS : non souillé)', 'Déchet', 'PARAMETRES PHYSICO-CHIMIQUES ', 'ORP']
0 NaN lines dropped


In [263]:
prv_eau.columns

Index(['col_1', 'Date de prélèvement', 'Type d'affectation (Plan de secteur)',
       'X Lambert', 'Y Lambert', 'Z Sol', 'Prof. arrêt du forage',
       'Profondeur crépine ', 'Prof. piézo/sol mesurée sur site',
       'Niveau de la nappe/sol', 'pH', 'CE', 'T'],
      dtype='object')

In [264]:
name=['ID_ech', 'Date_prv','Affectation','X', 'Y','Zsol', 'Long_for','Prof_crep','Long_pz_sol',
      'Niv_eau_sol','pH', 'CE', 'T']
prv_eau=col_ren(prv_eau, name=name, mode=1)
prv_eau.insert(1,'Type_ech','Eau')

In [265]:
prv_eau['ID_ech'].replace('Canne ', 'Can', inplace=True, regex=True)
prv_eau['ID_ech'].replace('\n', ' ', inplace=True, regex=True)

In [266]:
prv_eau['Prof_crep'].replace('\[|\]','', regex=True, inplace=True)
for i in range(len(prv_eau)):
    c=prv_eau.loc[i,'Prof_crep']
    prv_eau.loc[i,'crep_top']=c.split('-')[0]
    prv_eau.loc[i,'crep_base']=c.split('-')[1]
    
prv_eau.drop(columns=['Prof_crep'], inplace=True)

In [267]:
prv_eau['ID_ech'].replace('Canne ', 'Can', inplace=True, regex=True)
prv_eau['ID_ech'].replace('\n', ' ', inplace=True, regex=True)

In [268]:
pz=prv_eau[['ID_ech', 'Affectation', 'X', 'Y', 'Zsol', 'Long_for','Long_pz_sol', 'crep_top', 'crep_base']]
pz.rename(columns={'ID_ech':'ID'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [269]:
for i in range(len(pz.ID)):
    c=pz.loc[i, 'ID']
    pz.loc[i, 'ID']=re.search("(\w+\d+(?:\w)?)",c).group(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [270]:
pz.drop_duplicates(subset=['ID'], keep='first', ignore_index=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pz.drop_duplicates(subset=['ID'], keep='first', ignore_index=True, inplace=True)


In [271]:
prv_eau=prv_eau[['ID_ech', 'Date_prv', 'X', 'Y', 'Zsol','Niv_eau_sol', 'pH', 'CE', 'T']]

In [272]:
gdf_viewer(prv_eau, rows=5)

Rows : 11, columns : 9


interactive(children=(IntSlider(value=5, description='rows', max=11, min=5, readout=False), IntSlider(value=9,…

In [273]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [274]:
an=col_ren(an, 1)

In [275]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [276]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [277]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')


Columns dropped :['METAUX LOURDS', 'Chrome VI', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'PHENOLS', 'crésols (total)', 'CHLOROPHENOLS', '2-chlorophénol', 'monochlorophénol total', 'dichlorophénol total', '2,4,5-trichlorophénol', '2,4,6-trichlorophénol', 'trichlorophénol total', '2,3,4,6- tétrachlorophénol', 'tétrachlorophénol total', 'pentachlorophénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOCHLORES VOLATILS', 'CHLOROBENZENES', 'monochlorobenzène', '1,3-dichlorobenzène', '1,2-dichlorobenzène', '1,4-Dichlorobenzène', '1,2,4,5- et 1,2,3,5-tétrachlorobenzènes', 'pentachlorobenzène', 'hexachlorobenzène', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER', 'POLYCHLOROBIPHENYLS (PCB)', 'PCB totaux (7)', 'AUTRES ANALYSES ', 'Teneur mesurée (souligne/gras)', 'Teneur mesurée (gras/grisé)', "(*) CP = Campagne de prélèvements; Ex = Expertise; ES = Etude de sol; EO = Etude d'orientation; EC = Etude de caractérisation; SA = Suivi d'assainissement", '(**) ib : imperméable (b

In [278]:
an=col_ren(an, name=pol_field_model, mode=1)

In [279]:
gdf_viewer(an, rows=5) 

Rows : 11, columns : 70


interactive(children=(IntSlider(value=5, description='rows', max=11, min=5, readout=False), IntSlider(value=12…

In [280]:
source_prv_eau=prv_eau
source_an=source_an.append(an)

In [281]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:36 ;
source_prv_sol:25 ;source_prv_eau:11 ; source_mes_pz:0 ; source_mes_sol:0 ;


## 9-Ensemble des résultats Memoris version Seafile.xls
* **Sheet : 'Résult SOL'**

In [282]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [283]:
tmp_dir='../../CF_data/synthese/Result_traitem/Memoris_seafile/'
sheet='Result_SOL'

In [284]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Ensemble des résultats Memoris version Seafile.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
gdf_viewer(df, rows=5)

2 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2']
Rows : 138, columns : 66


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=138, min=5, readout=False), IntSlider(value=1…

In [285]:
prv_sol=df.loc[:37]
an=df.loc[38:]

In [286]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [287]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [288]:
prv_sol.drop(list(range(5)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,1)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)


Columns dropped :['col_1', "Nom / description d'échantillon", "Nature de l'étude (*)", 'Terrain', 'Epaisseur de remblais', 'Epaisseur alluvions', "Nature de l'observation organoleptique", 'LOCALISATION/AFFECTATION(S) - USAGE(S)', 'zone', 'parcelle (selon dénom.)', "Type d'usage de fait actuel", "Type d'usage de fait futur", "Type d'usage utilisé pour comparer les résultats", 'Lieux de prélèvement', 'X Lambert', 'Y Lambert', 'DESCRIPTION SOMMAIRE', 'Type de recouvrement de surface (***)', 'Sol en place (S : souillé, NS : non souillé)', 'Remblais de terre (S : souillé, NS : non souillé)', 'Remblais technique (S : souillé, NS : non souillé)', 'Déchet', 'Matières organiques', 'GRANULOMETRIE', 'Fraction argileuse']
0 NaN lines dropped


In [289]:
prv_sol.drop(columns=prv_sol.columns[[-3,-4]], axis=1, inplace=True)

In [290]:
name=['ID_ech', 'Date_prv', 'Description','Organo', 'Long_for', 'Refus','Ech_top', 'Ech_base', 'MS','Fract_2','Fract_2+']
prv_sol=col_ren(prv_sol, name=name, mode=1)

In [291]:
set(prv_sol['Description'])

{'L', 'LA', 'LS', 'R', 'R '}

In [292]:
for i in range(len(prv_sol['Description'])):
    x = prv_sol.loc[i,'Description']
    if x=='R' or x=='R ': prv_sol.loc[i,'Description']='Remblais'
    elif x=='L': prv_sol.loc[i,'Description']='Limons'
    elif x=='LA': prv_sol.loc[i,'Description']='Limons et argiles'
    elif x=='LS': prv_sol.loc[i,'Description']='Limons et sables'

prv_sol['Refus']=prv_sol['Refus'].apply(lambda x: 'x' if not re.search('#', str(x)) else '')
prv_sol.insert(1,'Type_ech','Sol')

In [293]:
gdf_viewer(prv_sol, rows=3)

Rows : 60, columns : 12


interactive(children=(IntSlider(value=3, description='rows', max=60, min=3, readout=False), IntSlider(value=12…

In [294]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [295]:
an=col_ren(an, 1)

In [296]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [297]:
an=dble_col_drop(an)

column(s) dropped: []


In [298]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')


Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'BTEX totaux', 'PHENOLS', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOHALOGENES VOLATILS', 'col_63', 'EOX (****)', 'HYDROCARBURES TOTAUX', 'Hydrocarbures totaux C10-C40', 'METHYL-TERT-BUTYL-ETHER', 'POLYCHLOROBIPHENYL (PCB)', 'PCB 28', 'PCB 52', 'PCB 101', 'PCB 118', 'PCB 138', 'PCB 153', 'PCB 180', 'PCB totaux (7)', 'F4/2M*', 'Teneur mesurée', 'Teneur mesurée (souligne/gras)', 'Teneur mesurée (gras/grisé)', "(*) CP = Campagne de prélèvements; Ex = Expertise; ES = Etude de sol; EO = Etude d'orientation; EC = Etude de caractérisation; SA = Suivi d'assainissement", '(**) L: limon, A: Argile, S: Sable, R: Remblai', '(***) ib : imperméable (béton) ; ih : imperméable hydrocarboné ; p : perméable (gravier, fissuré,…) ; tvh : terres végétation haute ; tvb : terres végétation basse ', '(****) 3 mg/kg = Seuil limite défini dans le GREO ', "(1) l'échantillon n'a pas pu être extrait ni

In [299]:
an=col_ren(an, name=pol_field_model, mode=1)

In [300]:
gdf_viewer(an, rows=5) 

Rows : 60, columns : 71


interactive(children=(IntSlider(value=5, description='rows', max=60, min=5, readout=False), IntSlider(value=12…

In [301]:
source_prv_sol=prv_sol
source_an=an

In [302]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:60 ;
source_prv_sol:60 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Résult EAU'**

In [303]:
tmp_dir='../../CF_data/synthese/Result_traitem/Memoris_seafile/'
sheet='Result_eau'

In [304]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Ensemble des résultats Memoris version Seafile.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

4 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2']
Rows : 154, columns : 51


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=154, min=5, readout=False), IntSlider(value=1…

In [305]:
prv_eau=df.loc[:32]
an=df.loc[33:]

In [306]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [307]:
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=col_ren(prv_eau, 1)

In [308]:
prv_eau['CE']=prv_eau['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [309]:
prv_eau.drop(list(range(5)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)

In [310]:
prv_eau=dble_col_drop(prv_eau)

column(s) dropped: ['21:DESCRIPTION SOMMAIRE', '27:Prof. arrêt du forage']


In [311]:
prv_eau=na_col_drop(prv_eau,1)
prv_eau=na_line_drop(prv_eau,3)
prv_eau.reset_index(drop=True, inplace=True)


Columns dropped :['col_1', "Nature de l'étude (*)", 'Observations organoleptiques', 'LOCALISATION/AFFECTATION(S) - USAGE(S)', 'parcelle (selon dénom.)', "Type d'usage de fait actuel", "Type d'usage de fait futur", "Type d'usage utilisé pour comparer les résultats", 'Lieux de prélèvement', 'DESCRIPTION SOMMAIRE', 'Type de recouvrement de surface (**)', 'Sol en place (S : souillé, NS : non souillé)', 'Remblais de terre (S : souillé, NS : non souillé)', 'Remblais technique (S : souillé, NS : non souillé)', 'Déchet', 'PARAMETRES PHYSICO-CHIMIQUES ', 'ORP']
0 NaN lines dropped


In [312]:
prv_eau.drop(columns=prv_eau.columns[[2]], axis=2, inplace=True)

In [313]:
name=['ID_ech', 'Date_prv','Affectation','X', 'Y','Zsol', 'Long_for','Prof_crep','Long_pz_sol', 
      'Niv_eau_sol','pH', 'CE', 'T']
prv_eau=col_ren(prv_eau, name=name, mode=1)
prv_eau.insert(1,'Type_ech','Eau')

In [314]:
prv_eau['Prof_crep'].replace('\[|\]','', regex=True, inplace=True)
for i in range(len(prv_eau)):
    c=prv_eau.loc[i,'Prof_crep']
    prv_eau.loc[i,'crep_top']=c.split('-')[0]
    prv_eau.loc[i,'crep_base']=c.split('-')[1]
    
prv_eau.drop(columns=['Prof_crep'], inplace=True)

In [315]:
prv_eau['ID_ech'].replace('Canne ', 'Can', inplace=True, regex=True)
prv_eau['ID_ech'].replace('\n', ' ', inplace=True, regex=True)

In [316]:
pz=prv_eau[['ID_ech', 'Affectation', 'X', 'Y', 'Zsol', 'Long_for','Long_pz_sol', 'crep_top', 'crep_base']]
pz.rename(columns={'ID_ech':'ID'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [317]:
for i in range(len(pz.ID)):
    c=pz.loc[i, 'ID']
    pz.loc[i, 'ID']=re.search("(\w+\d+)",c).group(1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [318]:
pz.drop_duplicates(subset=['ID'], keep='first', ignore_index=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pz.drop_duplicates(subset=['ID'], keep='first', ignore_index=True, inplace=True)


In [319]:
prv_eau=prv_eau[['ID_ech', 'Date_prv', 'X', 'Y', 'Zsol','Niv_eau_sol', 'pH', 'CE', 'T']]

In [320]:
gdf_viewer(prv_eau, rows=3)

Rows : 45, columns : 9


interactive(children=(IntSlider(value=3, description='rows', max=45, min=3, readout=False), IntSlider(value=9,…

In [321]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [322]:
an=col_ren(an, 1)

In [323]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [324]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [325]:
an=dble_col_drop(an)

column(s) dropped: ['104:nitrite', '106:nitrate', '112:ammonium', '117:Teneur mesurée']


In [326]:
an.drop(columns=an.columns[[-6,-5]], axis=1, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')


Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'PHENOLS', 'crésols (total)', 'CHLOROPHENOLS', '2-chlorophénol', 'monochlorophénol total', 'dichlorophénol total', '2,4,5-trichlorophénol', '2,4,6-trichlorophénol', 'trichlorophénol total', '2,3,4,6- tétrachlorophénol', 'tétrachlorophénol total', 'pentachlorophénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOCHLORES VOLATILS', 'CHLOROBENZENES', 'monochlorobenzène', '1,3-dichlorobenzène', '1,2-dichlorobenzène', '1,4-Dichlorobenzène', '1,2,4,5- et 1,2,3,5-tétrachlorobenzènes', 'pentachlorobenzène', 'hexachlorobenzène', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER', 'POLYCHLOROBIPHENYLS (PCB)', 'PCB totaux (7)', 'AUTRES ANALYSES ', 'azote Kjeldahl', 'COMPOSES INORGANIQUES ', 'sulfures totaux', 'Teneur mesurée (souligne/gras)', 'Teneur mesurée (gras/grisé)', "(*) CP = Campagne de prélèvements; Ex = Expertise; ES = Etude de sol; EO = Etude d'orientation; EC = Etude de caractérisation; SA = 

In [327]:
an['ID_ech'].replace('Canne ', 'Can', inplace=True, regex=True)
an['ID_ech'].replace('\n', ' ', inplace=True, regex=True)

In [328]:
an=col_ren(an, name=pol_field_model, mode=1)

In [329]:
gdf_viewer(an, rows=5) 

Rows : 45, columns : 80


interactive(children=(IntSlider(value=5, description='rows', max=45, min=5, readout=False), IntSlider(value=12…

In [330]:
source_pz=pz
source_prv_eau=prv_eau
source_an=source_an.append(an)

In [331]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:30 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:105 ;
source_prv_sol:60 ;source_prv_eau:45 ; source_mes_pz:0 ; source_mes_sol:0 ;


## 10-Résultats SOL container phyto t=0_décret sol.xls
* **Sheet : 'Résult SOL'**

In [332]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [333]:
tmp_dir='../../CF_data/synthese/Result_traitem/Container_phyto/'
sheet='Result_SOL'

In [334]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Résultats SOL container phyto t=0_décret sol.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

2 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 16', 'Unnamed: 17']
Rows : 121, columns : 15


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=121, min=5, readout=False), IntSlider(value=1…

In [335]:
prv_sol=df.loc[:21]
an=df.loc[22:]

In [336]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [337]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [338]:
prv_sol=dble_col_drop(prv_sol)

column(s) dropped: ['8:Autre zone suspecte investiguée']


In [339]:
prv_sol.drop(list(range(5)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,2)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)


Columns dropped :['Matières organiques', 'SPP/zone suspecte invetiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Refus de forage', 'Terrain naturel/Remblai (**)', 'Organoleptique couleur suspecte', 'Organoleptique odeur intensité (***)', 'Organoleptique odeur type', 'GRANULOMETRIE']
4 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [340]:
prv_sol.drop(columns=prv_sol.columns[[-3]], axis=1, inplace=True)

In [341]:
name=['ID_ech', 'Ech_top', 'Ech_base','MS','Date_prv','Fract_2','Fract_2+']
prv_sol=col_ren(prv_sol, name=name, mode=1)
prv_sol=prv_sol.query('ID_ech==ID_ech')
prv_sol.insert(1,'Type_ech','Sol')

In [342]:
gdf_viewer(prv_sol, rows=3)

Rows : 5, columns : 8


interactive(children=(IntSlider(value=3, description='rows', max=5, min=3, readout=False), IntSlider(value=8, …

In [343]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [344]:
an=col_ren(an, 1)

In [345]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [346]:
an=dble_col_drop(an)

column(s) dropped: ['92:Teneur mesurée', '93:Teneur mesurée', '94:Teneur mesurée']


In [347]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')


Columns dropped :['METAUX LOURDS', 'Cobalt', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'BTEX totaux', 'PHENOLS', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOHALOGENES VOLATILS', 'col_64', 'EOX (****)', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER', 'POLYCHLOROBIPHENYL (PCB)', 'PCB 28', 'PCB 52', 'PCB 101', 'PCB 118', 'PCB 138', 'PCB 153', 'PCB 180', 'PCB totaux (7)', 'Teneur mesurée', "VR : Valeur de référence; VS : Valeur seuil; VI : Valeur d'intervention", "(*) RP : Rapport de prélèvements; ES : Etude de sol; EO : Etude d'orientation; EC : Etude de caractérisation; SA : Suivi d'assainissement; EF : Evaluation finale", '(**) TN : Terrain naturel; R : Remblai', "(***) - : Pas d'impression organoleptique; + : Impression organoleptique faible; ++ : Impression organoleptique forte", '(****) 3 mg/kg = Seuil limite défini dans le GREO ']


In [348]:
an=col_ren(an, name=pol_field_model, mode=1)

In [349]:
gdf_viewer(an, rows=5) 

Rows : 9, columns : 70


interactive(children=(IntSlider(value=5, description='rows', max=9, min=5, readout=False), IntSlider(value=12,…

In [350]:
source_prv_sol=prv_sol
source_an=an

In [351]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:9 ;
source_prv_sol:5 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Paramètres agro.'**

In [352]:
tmp_dir='../../CF_data/synthese/Result_traitem/Container_phyto/'
sheet='Param_agro'

In [353]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/'
                   'Résultats SOL container phyto t=0_décret sol.xls', sheet_name='Paramètres agro.', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

1 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 1', 'Unnamed: 2']
Rows : 28, columns : 10


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=28, min=5, readout=False), IntSlider(value=10…

In [354]:
df=df.transpose()
df.reset_index(drop=True, inplace=True)
df=col_ren(df, 0)

In [355]:
df.drop(list(range(1)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [356]:
df=dble_col_drop(df)

column(s) dropped: ['22:température pour mes. pH']


In [357]:
df=na_col_drop(df,1)
df=na_line_drop(df,3)
df.reset_index(drop=True, inplace=True)


Columns dropped :['Matières organiques', 'GRANULOMETRIE', 'pH', 'Composés inorganiques ', 'Autres analyses chimiques ']
3 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [358]:
df.drop(columns=df.columns[[5,6]], axis=2, inplace=True)

In [359]:
df.columns

Index(['Nom de l'échantillon', 'Profondeur échantillon dans container de',
       '                                           à', 'Matière sèche',
       'Date de prélèvement', 'Matières organiques ',
       'résidu après perte au feu', 'COT', 'fraction argileuse',
       'parties min. 2µm', 'parties min. 50µm', 'parties min. 2mm',
       'fraction  2 mm (prép. séché à 40°C) ',
       'fraction 2 mm (prép. séché à 40°C) ', 'pH (KCl)',
       'température pour mes. pH', 'pH (H20)', 'sulfures totaux', 'chlorures',
       'azote Kjeldahl'],
      dtype='object')

In [360]:
name=['ID_ech','Ech_top','Ech_base','MS','Date_prv','MO','Residu_perte_feu','COT','Fract_arg','Fract_min_2µ', 
      'Fract_min_50µ', 'Fract_min_2', 'Fract_2', 'Fract_2+', 'pH_KCl','Tem_pH_mes', 'pH_H20', 'sulfures_tot', 
      'chlorures', 'azote_Kjeldahl']
df=col_ren(df, name=name, mode=1)
df.insert(1,'Type_ech','Sol')

In [361]:
prv_sol=df

In [362]:
gdf_viewer(prv_sol, rows=5)

Rows : 5, columns : 21


interactive(children=(IntSlider(value=5, description='rows', max=5, min=5, readout=False), IntSlider(value=12,…

In [363]:
gdf_merger(source_prv_sol, prv_sol, col='ID_ech', how='outer', scope=globals())[0]

Ambiguous values in both columns compared, change it manually !
Columns ['Fract_2_x', 'Fract_2_y', 'Fract_2+_x', 'Fract_2+_y', 'MS_x', 'MS_y'] must be dropped manually !
error file created in 'tmp_files/merging_error_log(source_prv_sol-df).csv'


Unnamed: 0,ID_ech,MS_x,Fract_2_x,Fract_2+_x,MS_y,MO,Residu_perte_feu,COT,Fract_arg,Fract_min_2µ,...,pH_H20,sulfures_tot,chlorures,azote_Kjeldahl,Ech_base,Date_prv,Fract_2,Fract_2+,Ech_top,Type_ech
0,Ech. 1,88.8,33,67,89.8,6.3,93.3,160000,1.0,1,...,11.3,130,34,1320,15,2017-12-14,,,12,Sol
1,Ech. 2,80.0,47,53,75.9,11.5,88.2,400000,1.9,1,...,8.6,78,18,3810,12,2017-12-14,47.0,53.0,9,Sol
2,Ech. 3,87.9,40,60,80.1,10.2,89.4,200000,1.0,1,...,11.0,86,36,3040,9,2017-12-14,,,6,Sol
3,Ech. 4,90.5,45,55,85.8,9.0,90.6,300000,1.0,1,...,11.1,60,36,2550,6,2017-12-14,,,3,Sol
4,Ech. 5,76.1,29,71,84.9,8.4,91.2,210000,1.0,1,...,11.4,80,43,2300,3,2017-12-14,,,0,Sol


In [364]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:9 ;
source_prv_sol:5 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


--------------------------------------------------------------------------------------------------------

## 11-Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx
* **Sheet : 'Résult EAU'**

In [365]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [366]:
tmp_dir='../../CF_data/synthese/Result_traitem/Siterem_Ext_Pilote/'
sheet='Result_eau'

In [367]:
df = pd.read_excel('../../CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='Résult EAU', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

  warn(msg)


11 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65', 'Unnamed: 66', 'Unnamed: 67', 'Unnamed: 68', 'Unnamed: 69']
Rows : 115, columns : 37


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=115, min=5, readout=False), IntSlider(value=1…

In [368]:
prv_eau=df.loc[:31]
an=df.loc[32:]

In [369]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [370]:
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=col_ren(prv_eau, 1)

In [371]:
prv_eau=dble_col_drop(prv_eau)

column(s) dropped: ['6:Autre zone suspecte investiguée', '25:pH', '30:pH']


In [372]:
prv_eau.drop(list(range(5)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=na_col_drop(prv_eau,2)
prv_eau=na_line_drop(prv_eau,3)
prv_eau.reset_index(drop=True, inplace=True)


Columns dropped :['SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Profondeur minimum de la crépine', 'Profondeur maximum de la crépine', 'Profondeur du fond du piézomètre', 'Profondeur de la nappe/piezo', 'Profondeur de la nappe/chambre visite', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Limpidité (***)', 'Température', 'Conductivité électrique', 'Profondeur de la nappe/sol ', 'PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', 'CE', 'ORP', 'Oxygène dissous', 'PARAMETRES PHYSICO-CHIMIQUES \n(mesures au labo)']
0 NaN lines dropped


In [373]:
prv_eau.columns

Index(['Nom échantillon', 'Période ', 'Emplacement ', 'Date de prélèvement',
       'pH', 'Température de prélèvement ', 'température pour mes. pH'],
      dtype='object')

In [374]:
name=['ID_ech', 'Periode', 'Emplacement','Date_prv','pH','Temp_prv','Temp_pH_mes']
prv_eau=col_ren(prv_eau, name=name, mode=1)
prv_eau=prv_eau.query('ID_ech==ID_ech')
prv_eau.insert(1,'Type_ech','Eau')

In [375]:
gdf_viewer(prv_eau, rows=3)

Rows : 31, columns : 8


interactive(children=(IntSlider(value=3, description='rows', max=31, min=3, readout=False), IntSlider(value=8,…

In [376]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [377]:
an=col_ren(an, 1)

In [378]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [379]:
an=dble_col_drop(an)

column(s) dropped: []


In [380]:
# rename in a first time before dropping columns (because of names like 'col_xx' in columns)
name=['ID_ech', 'METAUX LOURDS', 'Arsenic', 'Cadmium', 'Chrome', 'Cuivre','Mercure', 'Plomb', 'Nickel', 'Zinc', 
      'CYANURES',"Cyanures (libres)  -  NEN-EN-ISO 14403", "CN_totaux - NEN-EN-ISO 14403", 
      "cyanure (APE) - méthode basée sur EPA 335.3", "cyanure complex - méthode interne ", 
      "thiocyanate - méthode interne", "COMPOSES AROMATIQUES VOLATILS", "Benzène", "Toluène", "Éthylbenzène",
      "Orthoxylène", "Para- et métaxylène", "Xylènes", "Styrène", "PHENOLS", "Phénol", "Indice phénol", 
      "HYDROCARBURES AROMATIQUES POLYCYCLIQUES", "Naphtalène", "Acénaphtylène", "Acénaphtène", "Fluorène", 
      "Phénanthrène", "Anthracène", "Fluoranthène", "Pyrène", "Benzo(a)anthracène", "Chrysène", 
      "Benzo(b)fluoranthène", "Benzo(k)fluoranthène", "Benzo(a)pyrène", "Dibenzo(ah)anthracène", 
      "Benzo(ghi)pérylène", "Indéno(1,2,3-cd)pyrène", "HAP Totaux (16) - EPA", 
      "COMPOSES ORGANOCHLORES VOLATILS", "1,1-Dichloroéthane", "1,2-Dichloroéthane", 
      "1,1-dichloroéthène", "Cis-1,2-dichloroéthène", "Totaux (cis,trans) 1,2-dichloroéthènes", 
      "Trans 1,2-dichloroéthylène", "Dichlorométhane", "1,2-dichloropropane", "Tétrachloroéthylène ", 
      "Tétrachlorométhane", "1,1,1-Trichloroéthane", "1,1,2-Trichloroéthane", "Trichloroéthylène",
      "Chloroforme", "Chlorure de vinyle", "HYDROCARBURES TOTAUX", "fraction aromat. >C6-C7", 
      "fraction aromat. >C7-C8", "fraction aromat. >C8-C10", "fraction aliphat. C5-C6", "fraction aliphat. >C6-C8",
      "fraction aliphat. >C8-C10", "Fraction C5 - C8", "Fraction C8 - C10", "Fraction C10-C12", "Fraction C12-C16",
      "Fraction C16 - C21", "Fraction C21 - C35", "Hydrocarbures totaux C10-C35", "METHYL-TERT-BUTYL-ETHER", "MTBE",
     "a","b","c","d","e","f","g"]

an=col_ren(an, name=name, mode=1)
an=an.iloc[:,:-7]

In [381]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')


Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'Styrène', 'PHENOLS', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOCHLORES VOLATILS', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER', 'MTBE']


In [382]:
an=col_ren(an, name=pol_field_model, mode=1)

In [383]:
gdf_viewer(an, rows=3)

Rows : 33, columns : 67


interactive(children=(IntSlider(value=3, description='rows', max=33, min=3, readout=False), IntSlider(value=12…

In [384]:
source_prv_eau=prv_eau
source_an=an

In [385]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:33 ;
source_prv_sol:0 ;source_prv_eau:31 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Param physico'**

In [386]:
tmp_dir='../../CF_data/synthese/Result_traitem/Siterem_Ext_Pilote/'
sheet='Param_physico'

In [387]:
df = pd.read_excel('../../CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='param. physico', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

8 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 79', 'Unnamed: 80', 'Unnamed: 81', 'Unnamed: 82', 'Unnamed: 83', 'Unnamed: 84', 'Unnamed: 85', 'Unnamed: 86', 'Unnamed: 87', 'Unnamed: 88', 'Unnamed: 89', 'Unnamed: 90', 'Unnamed: 91', 'Unnamed: 92', 'Unnamed: 93', 'Unnamed: 94', 'Unnamed: 95', 'Unnamed: 96', 'Unnamed: 97', 'Unnamed: 98', 'Unnamed: 99', 'Unnamed: 100', 'Unnamed: 101', 'Unnamed: 102']
Rows : 53, columns : 77


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=53, min=5, readout=False), IntSlider(value=12…

In [388]:
df=df.transpose()
df.reset_index(drop=True, inplace=True)

In [389]:
df=col_ren(df, 1)

In [390]:
df.drop(list(range(2)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [391]:
sdf=df.iloc[:,:33]
df=df.iloc[:,34:]

In [392]:
df=dble_col_drop(df)
sdf=dble_col_drop(sdf)

column(s) dropped: ['3:Période ']
column(s) dropped: ['6:Autre zone suspecte investiguée', '26:pH', '32:pH']


In [393]:
df=na_line_drop(df,1)
sdf=na_line_drop(sdf,1)

41 NaN lines dropped
27 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [394]:
df=na_col_drop(df,1)
sdf=na_col_drop(sdf,1)


Columns dropped :['Profondeur de la nappe/chambre visite', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Profondeur de la nappe/sol ', 'PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', "*paramètres n'ont pas été pris en débit continu (dans seau) - peu de débit", 'col_52']

Columns dropped :['SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Profondeur minimum de la crépine', 'Profondeur maximum de la crépine', 'Profondeur du fond du piézomètre', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Limpidité (***)', 'Température', 'Conductivité électrique', 'PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', 'PARAMETRES PHYSICO-CHIMIQUES \n(mesures au labo)']


In [395]:
df.columns

Index(['Nom échantillon', 'Période ',
       'Emplacement \n- S : Simulateur \n- HZS : Hors zone simulateur',
       'Date de prélèvement', 'Profondeur de la nappe/piezo',
       'Profondeur piezo/piezo', 'Température de prélèvement ', 'pH', 'CE',
       'ORP', 'Oxygène dissous'],
      dtype='object')

In [396]:
name=['ID_ech','Periode','Emplacement','Date_prv','Niv_eau_pz','Long_pz','Temp_prv ','pH', 'CE', 'ORP','O_diss']
df=col_ren(df, mode=1, name=name)

In [397]:
sdf=sdf.iloc[:,:-1]
name=['ID_ech','Periode','Emplacement','Date_prv','Niv_eau_pz','Niv_eau_chbre','pH','Niv_eau_sol','Long_pz',
      'Temp_prv ','CE','ORP','O_diss']
sdf=col_ren(sdf, mode=1, name=name)

In [398]:
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)
sdf['CE']=sdf['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [399]:
sdf['Periode'].replace('\n',' ', regex=True, inplace=True)
sdf.replace('\n','', regex=True, inplace=True)

In [400]:
data=[df, sdf]
for d in data:
    d['Rmq']=''
    for i in range(len(d['ID_ech'])):
        e=str(d.loc[i, 'Emplacement'])
        n=str(d.loc[i, 'ID_ech'])
        d.loc[i,'ID_ech']=n.replace('*', '')
        
        if re.match('S',e, re.I): 
            d.loc[i,'Emplacement']='Simulateur'
        elif re.match('HZS',e, re.I): 
            d.loc[i,'Emplacement']='Hors Simulateur'
        else:
            d.loc[i,'Emplacement']=np.nan
        
        if re.match('\d+\*{1}$',n, re.I): 
            d.loc[i,'Rmq']="mesures faites dans un seau (débit non continu ou peu de débit)"
        elif re.match('\d+\*{2}$',n, re.I): 
            d.loc[i,'Rmq']="mésures faites dans une eau quasi-stagnante (Piezo rempli de sédiment et débit très faible)"

In [401]:
df.insert(1, 'Type_ech', 'Eau')
sdf.insert(1, 'Type_ech', 'Eau')

In [402]:
prv_eau=gdf_merger(sdf, df, 'outer', 'ID_ech')[0]

In [403]:
prv_eau=na_col_drop(df,2)
prv_eau=na_line_drop(df,1)
prv_eau.reset_index(drop=True, inplace=True)


Columns dropped :[]
0 NaN lines dropped


In [404]:
gdf_viewer(prv_eau, rows=3)

Rows : 33, columns : 13


interactive(children=(IntSlider(value=3, description='rows', max=33, min=3, readout=False), IntSlider(value=12…

In [405]:
source_prv_eau=gdf_merger(source_prv_eau, prv_eau, col='ID_ech', how='outer', scope=globals())[0]

Ambiguous values in both columns compared, change it manually !
Columns ['Periode_x', 'Periode_y', 'Date_prv_x', 'Date_prv_y', 'pH_x', 'pH_y', 'Emplacement_x', 'Emplacement_y'] must be dropped manually !
error file created in 'tmp_files/merging_error_log(source_prv_eau-prv_eau).csv'


In [406]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:33 ;
source_prv_sol:0 ;source_prv_eau:162 ; source_mes_pz:0 ; source_mes_sol:0 ;


In [407]:
# continue here

* **Sheet : 'Inorganiques et composés majeurs'**

In [408]:
tmp_dir='../../CF_data/synthese/Result_traitem/Siterem_Ext_Pilote/'
sheet='Inorganic_major'

In [409]:
df = pd.read_excel('../../CF_data/Data_SITEREM/Resultats_Siterem_eau_extension_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='inorganiques et composés majeur', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

9 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 36', 'Unnamed: 37', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65', 'Unnamed: 66', 'Unnamed: 67', 'Unnamed: 68', 'Unnamed: 69', 'Unnamed: 70', 'Unnamed: 71', 'Unnamed: 72', 'Unnamed: 73', 'Unnamed: 74', 'Unnamed: 75', 'Unnamed: 76', 'Unnamed: 77', 'Unnamed: 78', 'Unnamed: 79', 'Unnamed: 80', 'Unnamed: 81', 'Unnamed: 82', 'Unnamed: 83', 'Unnamed: 84', 'Unnamed: 85', 'Unnamed: 86', 'Unnamed: 87', 'Unnamed: 88', 'Unnamed: 89', 'Unnamed: 90', 'Unnamed: 91', 'Un

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=68, min=5, readout=False), IntSlider(value=12…

In [410]:
prv_eau=df.loc[:21]
an=df.loc[22:]

In [411]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [412]:
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=col_ren(prv_eau, 1)

In [413]:
prv_eau=dble_col_drop(prv_eau)

column(s) dropped: ['6:Autre zone suspecte investiguée']


In [414]:
prv_eau.drop(list(range(2)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=na_col_drop(prv_eau,2)
prv_eau=na_line_drop(prv_eau,2)
prv_eau.reset_index(drop=True, inplace=True)


Columns dropped :['SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Profondeur minimum de la crépine', 'Profondeur maximum de la crépine', 'Profondeur du fond du piézomètre', 'Profondeur de la nappe', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Limpidité (***)', 'Température', 'Conductivité électrique', 'pH']
1 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [415]:
prv_eau.columns

Index(['Nom échantillon', 'Période ', 'Emplacement ', 'Date de prélèvement',
       'Température de prélèvement '],
      dtype='object')

In [416]:
name=['ID_ech', 'Periode', 'Emplacement','Date_prv','Temp_prv']
prv_eau=col_ren(prv_eau, name=name, mode=1)
prv_eau=prv_eau.query('ID_ech==ID_ech')
prv_eau.insert(1,'Type_ech','Eau')

In [417]:
gdf_viewer(prv_eau, rows=3)

Rows : 24, columns : 6


interactive(children=(IntSlider(value=3, description='rows', max=24, min=3, readout=False), IntSlider(value=6,…

In [418]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [419]:
an=col_ren(an, 1)

In [420]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [421]:
an=dble_col_drop(an)

column(s) dropped: ['8:ammonium', '12:nitrite', '14:nitrate']


In [422]:
an=na_col_drop(an,3)


Columns dropped :['CARBONE ORGANIQUE', 'DEMANDE EN O2', 'COMPOSES AZOTES', 'COMPOSES SOUFRES ', 'ELEMENTS MAJEURS', 'AUTRES ANALYSES', 'cyanure (libre)', 'METHYL-TERT-BUTYL-ETHER', 'MTBE', 'Teneur mesurée ', 'Teneur mesurée', 'VS : Valeur seuil', "(*) RP : Rapport de prélèvements; ES : Etude de sol; EO : Etude d'orientation; EC : Etude de caractérisation; SA : Suivi d'assainissement; EF : Evaluation finale", "(**) - : Pas d'impression organoleptique; + : Impression organoleptique faible; ++ : Impression organoleptique forte", '(***) + : Limpide; - : Trouble; -- : Opaque', "Le contenu des tableaux est conforme au modèle repris à l'annexe IX du GREO V03. Le formalisme a été adapté par SITEREM tout en garantissant la lisibilité du document imprimé. "]


In [423]:
an.columns

Index(['ID_ech', 'COT', 'DBO (5 jours)', 'DCO', 'ammonium',
       'ammoniaque - libre', 'azote Kjeldahl', 'nitrite', 'nitrate',
       'sulfures totaux', 'sulfures (libre)', 'Soufre Total', 'sulfites',
       'sulfate', 'calcium', 'potassium', 'magnésium', 'manganèse', 'sodium',
       'fer', 'fer (Fe) total', 'fer (2+)', 'chlorures', 'fluorures',
       'bromure (libre)', 'phosphore (total)', 'carbonate', 'bicarbonate'],
      dtype='object')

In [424]:
an.rename(columns={'col_9':'ammoniaque libre'}, inplace=True)

In [425]:
an.drop(list(range(2)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')


Columns dropped :[]


In [426]:
an=col_ren(an, name=pol_field_model, mode=1)
#an=an.iloc[:,:-7]

In [427]:
gdf_viewer(an, rows=3)

Rows : 25, columns : 29


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=12…

In [428]:
test1=prv_eau[:5]
test2=prv_eau[:15]
test1, test2

(  ID_ech Type_ech Periode       Emplacement             Date_prv Temp_prv
 0    201      Eau      T0  Extension pilote  2019-12-09 00:00:00     9.32
 1    201      Eau      T2  Extension pilote  2020-03-09 00:00:00     11.6
 2    201      Eau      T3  Extension pilote  2020-04-21 00:00:00    15.65
 3    201      Eau      T4  Extension pilote  2020-06-02 00:00:00    22.52
 4    201      Eau      T5  Extension pilote  2020-08-04 00:00:00    24.54,
    ID_ech Type_ech Periode       Emplacement             Date_prv Temp_prv
 0     201      Eau      T0  Extension pilote  2019-12-09 00:00:00     9.32
 1     201      Eau      T2  Extension pilote  2020-03-09 00:00:00     11.6
 2     201      Eau      T3  Extension pilote  2020-04-21 00:00:00    15.65
 3     201      Eau      T4  Extension pilote  2020-06-02 00:00:00    22.52
 4     201      Eau      T5  Extension pilote  2020-08-04 00:00:00    24.54
 5     201      Eau      T6  Extension pilote  2020-09-29 00:00:00    25.39
 6     207      E

In [429]:
gdf_viewer(test1.merge(test2, on='Date_prv'))#, how='outer'))

Rows : 13, columns : 11


interactive(children=(IntSlider(value=10, description='rows', max=13, min=10, readout=False), IntSlider(value=…

In [430]:
gdf_viewer(source_prv_eau)

Rows : 162, columns : 23


interactive(children=(IntSlider(value=10, description='rows', max=162, min=10, readout=False), IntSlider(value…

$\color{red}{\text{error on merge (because of temporal data). I must check it after}}$

## 12-Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx
* **Sheet : 'Résult EAU'**

In [431]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [432]:
tmp_dir='../../CF_data/synthese/Result_traitem/Siterem_Pilote/'
sheet='Result_eau'

In [433]:
df = pd.read_excel('../../CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='Résult EAU', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

  warn(msg)


9 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 93', 'Unnamed: 94', 'Unnamed: 95', 'Unnamed: 96', 'Unnamed: 97', 'Unnamed: 98', 'Unnamed: 99', 'Unnamed: 100', 'Unnamed: 101', 'Unnamed: 102', 'Unnamed: 103', 'Unnamed: 104', 'Unnamed: 105', 'Unnamed: 106', 'Unnamed: 107', 'Unnamed: 108', 'Unnamed: 109', 'Unnamed: 110', 'Unnamed: 111', 'Unnamed: 112', 'Unnamed: 113']
Rows : 117, columns : 91


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=117, min=5, readout=False), IntSlider(value=1…

In [434]:
prv_eau=df.loc[:32]
an=df.loc[33:]

In [435]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [436]:
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=col_ren(prv_eau, 1)

In [437]:
prv_eau=dble_col_drop(prv_eau)

column(s) dropped: ['6:Autre zone suspecte investiguée', '25:pH', '31:pH']


In [438]:
prv_eau.drop(list(range(3)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=na_col_drop(prv_eau,2)
prv_eau=na_line_drop(prv_eau,3)
prv_eau.reset_index(drop=True, inplace=True)


Columns dropped :['SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Profondeur minimum de la crépine', 'Profondeur maximum de la crépine', 'Profondeur du fond du piézomètre', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Limpidité (***)', 'Température', 'Conductivité électrique', 'Profondeur de la nappe/sol ', 'PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', 'PARAMETRES PHYSICO-CHIMIQUES \n(mesures au labo)']
0 NaN lines dropped


In [439]:
prv_eau.columns

Index(['Nom échantillon', 'Période ',
       'Emplacement \n- P : Pilote \n- HZP : Hors zone pilote',
       'Date de prélèvement', 'Profondeur de la nappe/piezo',
       'Profondeur de la nappe/chambre visite', 'pH',
       'Température de prélèvement ', 'CE', 'ORP', 'Oxygène dissous', 'col_29',
       'température pour mes. pH'],
      dtype='object')

In [440]:
name=['ID_ech', 'Periode', 'Emplacement','Date_prv','Niv_eau_pz','Niv_eau_chbre','pH','Temp_prv','CE','ORP',
      'O_diss','col_29','Temp_pH_mes']
prv_eau=col_ren(prv_eau, name=name, mode=1)
prv_eau=prv_eau.query('ID_ech==ID_ech')
prv_eau.insert(1,'Type_ech','Eau')

In [441]:
prv_eau.drop(columns=['col_29'], inplace=True)
prv_eau['CE']=prv_eau['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [442]:
prv_eau['Periode'].replace('\n',' ', regex=True, inplace=True)
prv_eau.replace('\n','', regex=True, inplace=True)

In [443]:
data=[prv_eau]
for d in data:
    for i in range(len(d['ID_ech'])):
        e=str(d.loc[i, 'Emplacement'])        
        if re.match('P',e, re.I): 
            d.loc[i,'Emplacement']='Pilote'
        elif re.match('HZP',e, re.I): 
            d.loc[i,'Emplacement']='Hors Pilote'
        else:
            d.loc[i,'Emplacement']=np.nan

In [444]:
gdf_viewer(prv_eau, rows=3)

Rows : 87, columns : 13


interactive(children=(IntSlider(value=3, description='rows', max=87, min=3, readout=False), IntSlider(value=12…

In [445]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [446]:
an=col_ren(an, 1)

In [447]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [448]:
an=dble_col_drop(an)

column(s) dropped: []


In [449]:
# rename in a first time before dropping columns (because of names like 'col_xx' in columns)
name=['ID_ech', 'METAUX LOURDS', 'Arsenic', 'Cadmium', 'Chrome', 'Cuivre','Mercure', 'Plomb', 'Nickel', 'Zinc', 
      'CYANURES',"Cyanures (libres)  -  NEN-EN-ISO 14403", "CN_totaux - NEN-EN-ISO 14403", 
      "cyanure (APE) - méthode basée sur EPA 335.3", "cyanure complex - méthode interne ", 
      "thiocyanate - méthode interne", "COMPOSES AROMATIQUES VOLATILS", "Benzène", "Toluène", "Éthylbenzène",
      "Orthoxylène", "Para- et métaxylène", "Xylènes", "Styrène", "PHENOLS", "Phénol", "Indice phénol", 
      "HYDROCARBURES AROMATIQUES POLYCYCLIQUES", "Naphtalène", "Acénaphtylène", "Acénaphtène", "Fluorène", 
      "Phénanthrène", "Anthracène", "Fluoranthène", "Pyrène", "Benzo(a)anthracène", "Chrysène", 
      "Benzo(b)fluoranthène", "Benzo(k)fluoranthène", "Benzo(a)pyrène", "Dibenzo(ah)anthracène", 
      "Benzo(ghi)pérylène", "Indéno(1,2,3-cd)pyrène", "HAP Totaux (16) - EPA", 
      "COMPOSES ORGANOCHLORES VOLATILS", "1,1-Dichloroéthane", "1,2-Dichloroéthane", 
      "1,1-dichloroéthène", "Cis-1,2-dichloroéthène", "Totaux (cis,trans) 1,2-dichloroéthènes", 
      "Trans 1,2-dichloroéthylène", "Dichlorométhane", "1,2-dichloropropane", "Tétrachloroéthylène ", 
      "Tétrachlorométhane", "1,1,1-Trichloroéthane", "1,1,2-Trichloroéthane", "Trichloroéthylène",
      "Chloroforme", "Chlorure de vinyle", "HYDROCARBURES TOTAUX", "fraction aromat. >C6-C7", 
      "fraction aromat. >C7-C8", "fraction aromat. >C8-C10", "fraction aliphat. C5-C6", "fraction aliphat. >C6-C8",
      "fraction aliphat. >C8-C10", "Fraction C5 - C8", "Fraction C8 - C10", "Fraction C10-C12", "Fraction C12-C16",
      "Fraction C16 - C21", "Fraction C21 - C35", "Hydrocarbures totaux C10-C35", "METHYL-TERT-BUTYL-ETHER", "MTBE",
     "a","b","c","d","e","f","g","h"]

In [450]:
an=col_ren(an, name=name, mode=1)
an=an.iloc[:,:-8]

In [451]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')


Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'PHENOLS', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOCHLORES VOLATILS', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER', 'MTBE']


In [452]:
pol_field_model={'Arsenic': 'As', 'Cadmium': 'Cd', 'Chrome': 'Cr', 'Chrome VI': 'Cr_VI', 'Cuivre': 'Cu', 
'Mercure': 'Hg', 'Plomb': 'Pb', 'Nickel': 'Ni', 'Zinc': 'Zn', 'Cyanure(s) (?libre(s))?': 'CN_libre', 
'Cyanures (libres)  -  NEN-EN-ISO 14403':'CN_libre', 'cyanure (APE) - méthode basée sur EPA 335.3':'CN_EPA', 
'Cyanure(s) (totaux)': 'CN_tot','CN_totaux':'CN_tot','cyanure(s) (APE)': 'CN_EPA', 'cyanure complex': 'CN_comp',
'thiocyanate': 'thioCN','Benzène': 'Bnz', 'Toluène': 'Toln', 'Éthylbenzène': 'EthylBnz', 'Orthoxylène': 'O-Xyl', 
'Para- et métaxylène': 'P-M-Xyl', 'Xylènes': 'Xyl', 'Styrène': 'Styr', 'BTEX totaux': 'BTEX_tot', 
'Phénol': 'Phenol', 'Indice phénol': 'Idc_Phenol', 'Naphtalène': 'Naphta', 'Acénaphtylène': 'Acenaphtyl', 
'Acénaphtène': 'Acenaphtn', 'Fluorène': 'Fluorene', 'Phénanthrène': 'Phenanthr', 'Anthracène': 'Anthrc', 
'Fluoranthène': 'Flranth', 'Pyrène': 'Pyr', 'Benzo(a)anthracène': 'Bnz(a)anthrc', 'Chrysène': 'Chrys', 
'Benzo(b)fluoranthène': 'Bnz(b)flranth', 'Benzo(k)fluoranthène': 'Bnz(k)flranth', 
'Benzo(a)pyrène': 'Bnz(a)pyr','Dibenzo(ah)anthracène': 'Dibnz(ah)anthrc',
'Benzo(ghi)pérylène': 'Bnz(ghi)peryl', 
'Indéno(1,2,3-cd)pyrène': 'Indeno(1,2,3-cd)pyr', 'HAP Totaux (?:(16) - EPA)?': 'HAP_tot_EPA', 
'1,1-Dichloroéthane': '1,1-DCE', '1,2-Dichloroéthane': '1,2-DCE', '1,1-dichloroéthène': '1,1-DCEn', 
'Cis-1,2-dichloroéthène': 'Cis-1,2-DCEn', 'Trans 1,2-dichloroéthylène': 'Trans 1,2-DCEyl', 
'Dichlorométhane': 'DCM', 'Totaux (cis,trans) 1,2-dichloroéthène(s)': '(cis,trans) 1,2-DCE_tot', 
'1,2-dichloropropane': '1,2-DCP', 'Tétrachloroéthylène': 'TetraCEyn', 'Tétrachlorométhane': 'TCM', 
'1,1,1-Trichloroéthane': '1,1,1-TCE', '1,1,2-Trichloroéthane': '1,1,2-TCE', 'Trichloroéthylène': 'TCEyn', 
'Chloroforme': 'Chloroforme', 'Chlorure de vinyle': 'CVinyl', 'EOX': 'EOX', 
'fraction aromat. >C6-C7': 'Arom_C6C7', 'fraction aromat. >C7-C8': 'Arom_C7C8', 
'fraction aromat. >C8-C10': 'Arom_C8C10', 'fraction aliphat. C5-C6': 'Aliphat_C5C6', 
'fraction aliphat. >C6-C8': 'Aliphat_C6C8', 'fraction aliphat. >C8-C10': 'Aliphat_C8C10', 
'Fraction C5 - C8': 'Fract_C5C8', 'Fraction C8-C10': 'Fract_C8C10', 'Fraction C10-C12': 'Fract_C10C12', 
'Fraction C12-C16': 'Fract_C12C16', 'Fraction C16 - C21': 'Fract_C16C21', 'Fraction C21 - C35': 'Fract_C21C35', 
'Fraction C35 - C40': 'Fract_C35C40', 'Hydrocarbures totaux C10-C35': 'HC_tot_C10C35','C5-C8':'Fract_C5C8', 
'C8-C10':'Fract_C8C10','C10-C12':'Fract_C10C12','C12-C16':'Fract_C12C16','C16-C21':'Fract_C16C21', 
'C21-C35':'Fract_C21C35','C35-C40':'Fract_C35C40', 'totaux C10-C35':'HC_tot_C10C35','C12-C22':'Fract_C12C22', 
'C22-C30':'Fract_C22C30','C30-C40':'Fract_C30C40', 'Totaux C10-C40':'HC_tot_C10C40',
'Hydrocarbures totaux C10-C40':'HC_tot_C10C40', 'MTBE': 'MTBE', 'PCB 28': 'PCB_28', 'PCB 52': 'PCB_52', 
'PCB 101': 'PCB_101', 'PCB 118': 'PCB_118', 'PCB 138': 'PCB_138', 'PCB 153': 'PCB_153', 'PCB 180': 'PCB_180', 
'PCB totaux (7)?': 'PCB_tot', 'Chlorure(s)': 'Chlorure', 'Soufre Total': 'S_tot', 'sulfite(s)': 'sulfite', 
'sulfate(s)': 'sulfate', 'COT':'COT','DBO (5 jours)':'DBO_5j','DCO':'DCO', 
'Ammonium':'NH4','ammoniaque libre':'NH3_libre','azote Kjeldahl':'N_Kjdl','sulfures totaux':'Sulfure_tot', 
'sulfure(s) (libre)':'Sulfure_libre','sulfure(s) (libre(s))':'Sulfure_libre','calcium':'Ca','potassium':'K', 'magnésium':'Mg', 'manganèse':'Mn', 
'sodium':"Na", 'fer':'Fe','phosphore (total)':'P_tot','carbonate':'CaCO3', 'bicarbonate':'Bicarb','Phoshore':'P',
'fer ((Fe))? total':'Fe_tot', 'fer (2\+)':'Fe2','fluorure(s)':'Fluorure','bromure (libre)':'B_libre'}


In [453]:
an=col_ren(an, name=pol_field_model, mode=1)

In [454]:
gdf_viewer(an, rows=3)

Rows : 87, columns : 68


interactive(children=(IntSlider(value=3, description='rows', max=87, min=3, readout=False), IntSlider(value=12…

In [455]:
source_prv_eau=prv_eau
source_an=an

In [456]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:87 ;
source_prv_sol:0 ;source_prv_eau:87 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Param physico'**

In [457]:
tmp_dir='../../CF_data/synthese/Result_traitem/Siterem_Pilote/'
sheet='Param_physico'

In [458]:
df = pd.read_excel('../../CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='param. physico', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

7 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2']
Rows : 52, columns : 92


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=52, min=5, readout=False), IntSlider(value=12…

In [459]:
df=df.transpose()
df.reset_index(drop=True, inplace=True)

In [460]:
df=col_ren(df, 1)

In [461]:
df.drop(list(range(2)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [462]:
sdf=df.iloc[:,:33]
df=df.iloc[:,34:]

In [463]:
df=dble_col_drop(df)
sdf=dble_col_drop(sdf)

column(s) dropped: []
column(s) dropped: ['6:Autre zone suspecte investiguée', '25:pH', '31:pH']


In [464]:
df=na_line_drop(df,1)
sdf=na_line_drop(sdf,1)

83 NaN lines dropped
0 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [465]:
df=na_col_drop(df,1)
sdf=na_col_drop(sdf,1)


Columns dropped :['Profondeur de la nappe/chambre visite', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Profondeur de la nappe/sol ', 'PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', 'col_51']

Columns dropped :['SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Profondeur minimum de la crépine', 'Profondeur maximum de la crépine', 'Profondeur du fond du piézomètre', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Limpidité (***)', 'Température', 'Conductivité électrique', 'PARAMETRES PHYSICO-CHIMIQUES \n(lors du prélèvement)', 'PARAMETRES PHYSICO-CHIMIQUES \n(mesures au labo)']


In [466]:
df=df.iloc[:,:-1]
name=['ID_ech','Periode','Emplacement','Date_prv','Niv_eau_pz','Long_pz','Temp_prv ','pH', 'CE', 'ORP','O_diss']
df=col_ren(df, mode=1, name=name)

In [467]:
sdf.drop(columns=['col_29'], inplace=True)
name=['ID_ech','Periode','Emplacement','Date_prv','Niv_eau_pz','Long_pz','pH','Niv_eau_sol','Temp_prv ','CE',
      'ORP','O_diss','Temp_pH_mes']
sdf=col_ren(sdf, mode=1, name=name)

In [468]:
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)
sdf['CE']=sdf['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [469]:
sdf['Periode'].replace('\n',' ', regex=True, inplace=True)
sdf.replace('\n','', regex=True, inplace=True)
sdf.drop(columns=["Niv_eau_sol"], inplace=True)

In [470]:
set(sdf['Emplacement'])

{'HZP', 'P'}

In [471]:
data=[df, sdf]
for d in data:
    for i in range(len(d['ID_ech'])):
        e=str(d.loc[i, 'Emplacement'])
        
        if re.match('P',e, re.I): 
            d.loc[i,'Emplacement']='Pilote'
        elif re.match('HZP',e, re.I): 
            d.loc[i,'Emplacement']='Hors Pilote'
        else:
            d.loc[i,'Emplacement']=np.nan

In [472]:
df.insert(1, 'Type_ech', 'Eau')
sdf.insert(1, 'Type_ech', 'Eau')

In [473]:
prv_eau=gdf_merger(sdf, df, 'outer', 'ID_ech')[0]

In [474]:
prv_eau.replace('\*|à compléter','', inplace=True, regex=True)

In [475]:
gdf_viewer(prv_eau, rows=3)

Rows : 95, columns : 13


interactive(children=(IntSlider(value=3, description='rows', max=95, min=3, readout=False), IntSlider(value=12…

$\color{red}{\text{error on merge (because of temporal data). I must check it after}}$

In [476]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:87 ;
source_prv_sol:0 ;source_prv_eau:87 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Inorganiques et composés majeurs'**

In [477]:
tmp_dir='../../CF_data/synthese/Result_traitem/Siterem_Pilote/'
sheet='Inorganic_major'

In [478]:
df = pd.read_excel('../../CF_data/Data_SITEREM/Resultats_Siterem_eau_pilote_jusque_decembre_2020.xlsx', 
                   sheet_name='inorganiques et composés majeur', skiprows=2)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

10 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65', 'Unnamed: 66', 'Unnamed: 67', 'Unnamed: 68', 'Unnamed: 69', 'Unnamed: 70', 'Unnamed: 71', 'Unnamed: 72', 'Unnamed: 73', 'Unnamed: 74', 'Unnamed: 75', 'Unnamed: 76', 'Unnamed: 77', 'Unnamed: 78', 'Unnamed: 79', 'Unnamed: 80', 'Unnamed: 81']
Rows : 68, columns : 54


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=68, min=5, readout=False), IntSlider(value=12…

In [479]:
prv_eau=df.loc[:21]
an=df.loc[22:]

In [480]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [481]:
prv_eau=prv_eau.transpose()
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=col_ren(prv_eau, 1)

In [482]:
prv_eau=dble_col_drop(prv_eau)

column(s) dropped: ['6:Autre zone suspecte investiguée']


In [483]:
prv_eau.drop(list(range(2)), axis=0, inplace=True)
prv_eau.reset_index(drop=True, inplace=True)
prv_eau=na_col_drop(prv_eau,2)
prv_eau=na_line_drop(prv_eau,2)
prv_eau.reset_index(drop=True, inplace=True)


Columns dropped :['SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Profondeur minimum de la crépine', 'Profondeur maximum de la crépine', 'Profondeur du fond du piézomètre', 'Profondeur de la nappe', 'Organoleptique odeur intensité (**)', 'Organoleptique odeur type', 'Limpidité (***)', 'Température', 'Conductivité électrique', 'pH']
0 NaN lines dropped


In [484]:
prv_eau.columns

Index(['Nom échantillon', 'Période ',
       'Emplacement \n- S : Simulateur \n- HZS : Hors zone simulateur',
       'Date de prélèvement', 'Température de prélèvement '],
      dtype='object')

In [485]:
name=['ID_ech', 'Periode', 'Emplacement','Date_prv','Temp_prv']
prv_eau.replace(r'\n',' ', inplace=True, regex=True)
prv_eau=col_ren(prv_eau, name=name, mode=1)
prv_eau=prv_eau.query('ID_ech==ID_ech')
prv_eau.insert(1,'Type_ech','Eau')

In [486]:
gdf_viewer(prv_eau, rows=3)

Rows : 51, columns : 6


interactive(children=(IntSlider(value=3, description='rows', max=51, min=3, readout=False), IntSlider(value=6,…

In [487]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [488]:
an=col_ren(an, 1)

In [489]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [490]:
an=dble_col_drop(an)

column(s) dropped: ['8:ammonium', '12:nitrite', '14:nitrate']


In [491]:
an=na_col_drop(an,3)


Columns dropped :['CARBONE ORGANIQUE', 'DEMANDE EN O2', 'COMPOSES AZOTES', 'COMPOSES SOUFRES ', 'ELEMENTS MAJEURS', 'AUTRES ANALYSES', 'cyanure (libre)', 'METHYL-TERT-BUTYL-ETHER', 'MTBE', 'Teneur mesurée ', 'Teneur mesurée', 'VS : Valeur seuil', "(*) RP : Rapport de prélèvements; ES : Etude de sol; EO : Etude d'orientation; EC : Etude de caractérisation; SA : Suivi d'assainissement; EF : Evaluation finale", "(**) - : Pas d'impression organoleptique; + : Impression organoleptique faible; ++ : Impression organoleptique forte", '(***) + : Limpide; - : Trouble; -- : Opaque', "Le contenu des tableaux est conforme au modèle repris à l'annexe IX du GREO V03. Le formalisme a été adapté par SITEREM tout en garantissant la lisibilité du document imprimé. "]


In [492]:
an.rename(columns={'col_9':'ammoniaque libre'}, inplace=True)

In [493]:
an.drop(list(range(2)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Eau')


Columns dropped :[]


In [494]:
an=col_ren(an, name=pol_field_model, mode=1)
#an=an.iloc[:,:-7]

In [495]:
gdf_viewer(an, rows=3)

Rows : 51, columns : 29


interactive(children=(IntSlider(value=3, description='rows', max=51, min=3, readout=False), IntSlider(value=12…

In [496]:
gdf_viewer(source_prv_eau)

Rows : 87, columns : 13


interactive(children=(IntSlider(value=10, description='rows', max=87, min=10, readout=False), IntSlider(value=…

$\color{red}{\text{error on merge (mainly because of temporal data, can't compare col ..._x, ...y). I must check it after}}$

## 13-Resultats_Siterem_SOL.xlsx
* **Sheet : 'Résult SOL ext. pilote'**

In [497]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [498]:
tmp_dir='../../CF_data/synthese/Result_traitem/Siterem_Result_Sol/'
sheet='Result_sol_ExtP'

In [499]:
df = pd.read_excel('../../CF_data/Data_SITEREM/Resultats_Siterem_SOL.xlsx', 
                   sheet_name='Résult SOL ext. pilote', skiprows=5)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

  warn(msg)


7 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 49', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65', 'Unnamed: 66', 'Unnamed: 67', 'Unnamed: 68', 'Unnamed: 69', 'Unnamed: 70', 'Unnamed: 71', 'Unnamed: 72', 'Unnamed: 73', 'Unnamed: 74', 'Unnamed: 75', 'Unnamed: 76', 'Unnamed: 77', 'Unnamed: 78', 'Unnamed: 79', 'Unnamed: 80', 'Unnamed: 81', 'Unnamed: 82', 'Unnamed: 83', 'Unnamed: 84', 'Unnamed: 85', 'Unnamed: 86', 'Unnamed: 87', 'Unnamed: 88', 'Unnamed: 89', 'Unnamed: 90', 'Unnamed: 91', 'Unnamed: 92', 'Unnamed: 93', 'Unnamed: 94', 'Unnamed: 95', 'Unnamed: 96', 'Unnamed: 97', 'Unnamed: 98', 'Unnamed: 99', 'Unnamed: 100', 'Unnamed: 101', 'Unnamed: 102', 'Unnamed: 103', 'Unnamed: 104', 'Unnamed: 105', 'Unnamed: 106', 'Unnamed: 107', 'Unnamed: 108', 'Unnamed: 109', 'Unnamed: 110', 'Unname

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=103, min=5, readout=False), IntSlider(value=1…

In [500]:
prv_sol=df.loc[:22]
an=df.loc[23:]

In [501]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [502]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [503]:
prv_sol=dble_col_drop(prv_sol)

column(s) dropped: ['16:température pour mes. pH']


In [504]:
prv_sol.drop(list(range(3)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,1)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)


Columns dropped :['MO et COT', 'pH', 'GRANULOMETRIE']
0 NaN lines dropped


In [505]:
gdf_viewer(prv_sol, rows=3)

Rows : 44, columns : 19


interactive(children=(IntSlider(value=3, description='rows', max=44, min=3, readout=False), IntSlider(value=12…

In [506]:
prv_sol=prv_sol[:-1]
prv_sol.drop(columns=['broyage'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [507]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_prv','Long_for','Refus','Description','MO','COT','pH_KCl', 
      'Temp_pH_mes','pH_H20','Fract_2','Fract_2+', 'Fract_min_2µ','Fract_min_50µ','Fract_min_2']
prv_sol=col_ren(prv_sol, name=name, mode=1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [508]:
set(prv_sol.Description)

{'R', 'R ', 'TN', 'TN '}

In [509]:
for i in range(len(prv_sol['Description'])):
    x = prv_sol.loc[i,'Description']
    if x in ['R','R ']: prv_sol.loc[i,'Description']='Remblais'
    elif x in ['TN','TN ']: prv_sol.loc[i,'Description']='Terrain naturel'

prv_sol['Refus']=prv_sol['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
prv_sol.insert(1,'Type_ech','Sol')#

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prv_sol['Refus']=prv_sol['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')


In [510]:
gdf_viewer(prv_sol, rows=3)

Rows : 43, columns : 19


interactive(children=(IntSlider(value=3, description='rows', max=43, min=3, readout=False), IntSlider(value=12…

In [511]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [512]:
an=col_ren(an, 1)

In [513]:
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [514]:
name=['ID_ech','METAUX LOURDS','Arsenic','Cadmium','Chrome','Chrome VI','Cuivre','Mercure','Plomb','Nickel',
'Zinc','CYANURES','cyanure (libre)','cyanure (totaux)','cyanure (APE)','cyanure complex','thiocyanate',
'COMPOSES AROMATIQUES VOLATILS','Benzène','Toluène','Éthylbenzène','Orthoxylène','Para- et métaxylène','Xylènes',
'Styrène','BTEX totaux','PHENOLS','Phénol','HYDROCARBURES AROMATIQUES POLYCYCLIQUES','Naphtalène','Acénaphtylène',
'Acénaphtène','Fluorène','Phénanthrène','Anthracène','Fluoranthène','Pyrène','Benzo(a)anthracène','Chrysène',
'Benzo(b)fluoranthène','Benzo(k)fluoranthène','Benzo(a)pyrène','Dibenzo(ah)anthracène','Benzo(ghi)pérylène',
'Indéno(1,2,3-cd)pyrène','HAP Totaux (16) - EPA','COMPOSES ORGANOHALOGENES VOLATILS','Tétrachloroéthylène',
'Trichloroéthylène','1,1-dichloroéthène','Cis-1,2-dichloroéthène','Trans 1,2-dichloroéthylène',
'Totaux (cis,trans) 1,2-dichloroéthènes','Chlorure de vinyle','1,1,1-Trichloroéthane','1,1,2-Trichloroéthane',
'1,1-Dichloroéthane','1,2-Dichloroéthane','Tétrachlorométhane','Chloroforme','Dichlorométhane',
'1,2-dichloropropane','HYDROCARBURES TOTAUX','fraction aromat. >C6-C7','fraction aromat. >C7-C8',
'fraction aromat. >C8-C10','fraction aliphat. C5-C6','fraction aliphat. >C6-C8','fraction aliphat. >C8-C10',
'Fraction C5 - C8','Fraction C8 - C10','Fraction C10-C12','Fraction C12-C16','Fraction C16 - C21',
'Fraction C21 - C35','Fraction C35 - C40','Hydrocarbures totaux C10-C35','Hydrocarbures totaux C10-C40',
'Teneur mesurée','Teneur mesurée','VS : Valeur seuil']

an=col_ren(an, name=name, mode=1)

In [515]:
an=dble_col_drop(an)

column(s) dropped: ['79:Teneur mesurée']


In [516]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')


Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'BTEX totaux', 'PHENOLS', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOHALOGENES VOLATILS', 'HYDROCARBURES TOTAUX', 'Teneur mesurée', 'VS : Valeur seuil']


In [517]:
an=col_ren(an, name=pol_field_model, mode=1)

In [518]:
an.rename(columns={'cyanure (totaux)':'CN_tot', 'cyanure (APE)':'CN_EPA'}, inplace=True)

In [519]:
gdf_viewer(an, rows=5) 

Rows : 44, columns : 71


interactive(children=(IntSlider(value=5, description='rows', max=44, min=5, readout=False), IntSlider(value=12…

In [520]:
source_prv_sol=prv_sol
source_an=an

In [521]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:44 ;
source_prv_sol:43 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'SOL T1 pilote'**

In [522]:
tmp_dir='../../CF_data/synthese/Result_traitem/Siterem_Result_Sol/'
sheet='SOL_T1_Pilote'

In [523]:
df = pd.read_excel('../../CF_data/Data_SITEREM/Resultats_Siterem_SOL.xlsx', 
                   sheet_name='SOL T1 pilote', skiprows=5)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

  warn(msg)


4 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 30', 'Unnamed: 31', 'Unnamed: 32', 'Unnamed: 33', 'Unnamed: 34', 'Unnamed: 35', 'Unnamed: 36', 'Unnamed: 38', 'Unnamed: 39', 'Unnamed: 40', 'Unnamed: 41', 'Unnamed: 42', 'Unnamed: 43', 'Unnamed: 44', 'Unnamed: 45', 'Unnamed: 46', 'Unnamed: 47', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65', 'Unnamed: 66', 'Unnamed: 67', 'Unnamed: 68', 'Unnamed: 69', 'Unnamed: 70', 'Unnamed: 71', 'Unnamed: 72', 'Unnamed: 73', 'Unnamed: 74', 'Unnamed: 75', 'Unnamed: 76', 'Unnamed: 77', 'Unnamed: 78', 'Unnamed: 79', 'Unnamed: 80', 'Unnamed: 81', 'Unnamed: 82', 'Unnamed: 83', 'Unnamed: 84', 'Unnamed: 85', 'Unnamed: 86', 'Unnamed: 87', 'Unnamed: 88', 'Unnamed: 89', 'Unnamed: 90', 'Unnamed: 91', 'Unnamed: 92', 'Un

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=135, min=5, readout=False), IntSlider(value=1…

In [524]:
prv_sol=df.loc[:35]
an=df.loc[36:]

In [525]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [526]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [527]:
prv_sol=dble_col_drop(prv_sol)

column(s) dropped: ['9:Autre zone suspecte investiguée', '27:température pour mes. pH', '30:pH (H20)']


In [528]:
prv_sol.drop(list(range(3)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,1)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)


Columns dropped :['Matières organiques', 'SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Organoleptique couleur suspecte', 'Organoleptique odeur intensité (***)', 'Organoleptique odeur type', 'MO et COT', 'matières organiques', 'COT', 'pH', 'pH (KCl)', 'température pour mes. pH', 'pH (H20)', 'GRANULOMETRIE', 'Fraction argileuse', 'parties min. 2µm', 'parties min. 50µm', 'parties min. 2mm']
9 NaN lines dropped


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [529]:
prv_sol=prv_sol[:-1]
prv_sol.drop(columns=['broyage'], inplace=True)

In [530]:
prv_sol.columns

Index(['Nom de l'échantillon', 'Profondeur échantillon de', 'à',
       'Matière sèche', 'Date de prélèvement', 'Profondeur d'arrêt du forage',
       'Refus de forage (seulement si oui)', 'Terrain naturel/Remblai (**)',
       'fraction  2 mm (prép. séché à 40°C) ',
       'fraction 2 mm (prép. séché à 40°C) '],
      dtype='object')

In [531]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_prv','Long_for','Refus','Nature_ech','Fract_2','Fract_2+']
prv_sol=col_ren(prv_sol, name=name, mode=1)

In [532]:
for i in range(len(prv_sol['Nature_ech'])):
    x = prv_sol.loc[i,'Nature_ech']
    if x in ['R','R ']: prv_sol.loc[i,'Nature_ech']='Remblais'
    elif x in ['TN','TN ']: prv_sol.loc[i,'Nature_ech']='Terrain naturel'

prv_sol['Refus']=prv_sol['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
prv_sol.insert(1,'Type_ech','Sol')#

In [533]:
gdf_viewer(prv_sol, rows=3)

Rows : 15, columns : 11


interactive(children=(IntSlider(value=3, description='rows', max=15, min=3, readout=False), IntSlider(value=11…

In [534]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [535]:
an=col_ren(an, 1)

In [536]:
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [537]:
name=['ID_ech','METAUX LOURDS','Arsenic','Cadmium','Chrome','Chrome VI','Cobalt','Cuivre','Mercure','Plomb', 
'Nickel','Zinc','CYANURES','cyanure (libre)','cyanure (totaux)','cyanure (APE)','cyanure complex','thiocyanate',
'COMPOSES AROMATIQUES VOLATILS','Benzène','Toluène','Éthylbenzène', 'Orthoxylène','Para- et métaxylène','Xylènes',
'Styrène','BTEX totaux','PHENOLS','Phénol','Indice phénol','HYDROCARBURES AROMATIQUES POLYCYCLIQUES','Naphtalène',
'Acénaphtylène','Acénaphtène', 'Fluorène','Phénanthrène','Anthracène','Fluoranthène','Pyrène','Benzo(a)anthracène',
'Chrysène','Benzo(b)fluoranthène','Benzo(k)fluoranthène','Benzo(a)pyrène','Dibenzo(ah)anthracène',
'Benzo(ghi)pérylène','Indéno(1,2,3-cd)pyrène','HAP Totaux (16) - EPA','COMPOSES ORGANOHALOGENES VOLATILS',
'Tétrachloroéthylène','Trichloroéthylène','1,1-dichloroéthène','Cis-1,2-dichloroéthène',
'Trans 1,2-dichloroéthylène','Totaux (cis,trans) 1,2-dichloroéthènes','Chlorure de vinyle',
'1,1,1-Trichloroéthane','1,1,2-Trichloroéthane','1,1-Dichloroéthane','1,2-Dichloroéthane','Tétrachlorométhane',
'Chloroforme','Dichlorométhane','1,2-dichloropropane','EOX','HYDROCARBURES TOTAUX',
'fraction aromat. >C6-C7','fraction aromat. >C7-C8','fraction aromat. >C8-C10','fraction aliphat. C5-C6',
'fraction aliphat. >C6-C8','fraction aliphat. >C8-C10','Fraction C5 - C8','Fraction C8 - C10','Fraction C10-C12',
'Fraction C12-C16','Fraction C16 - C21','Fraction C21 - C35','Fraction C35 - C40','Hydrocarbures totaux C10-C35',
'Hydrocarbures totaux C10-C40','METHYL-TERT-BUTYL-ETHER','MTBE']

an=an.iloc[:,:-17]
an=col_ren(an, name=name, mode=1)

In [538]:
an=dble_col_drop(an)

column(s) dropped: []


In [539]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,3)
an.insert(1,'Type_ech','Sol')


Columns dropped :['METAUX LOURDS', 'Chrome VI', 'Cobalt', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'BTEX totaux', 'PHENOLS', 'Phénol', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOHALOGENES VOLATILS', 'EOX', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER']


In [540]:
an=col_ren(an, name=pol_field_model, mode=1)

In [541]:
an.rename(columns={'cyanure (totaux)':'CN_tot', 'cyanure (APE)':'CN_EPA'}, inplace=True)

In [542]:
gdf_viewer(an, rows=5) 

Rows : 25, columns : 70


interactive(children=(IntSlider(value=5, description='rows', max=25, min=5, readout=False), IntSlider(value=12…

In [543]:
#source_prv_sol.info()#, prv_sol.info()

In [544]:
source_prv_sol=source_prv_sol[['ID_ech', 'Type_ech', 'Ech_top', 'Ech_base', 'MS', 'Date_prv','Long_for', 
                               'Refus', 'Description', 'Fract_2', 'Fract_2+']]

In [545]:
source_prv_sol=gdf_merger(source_prv_sol, prv_sol, col='ID_ech', how='outer')[0]

In [546]:
source_an=gdf_merger(source_an,an,col='ID_ech', how='outer')[0]
source_an=source_an.query('ID_ech==ID_ech')

In [547]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:58 ;
source_prv_sol:58 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


## 14-Logs_forages_vUmons_2018-03-20.xlsx
* **Sheet : 'Analyse_eau_Phases1&2'**

In [548]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [549]:
tmp_dir='../../CF_data/synthese/Result_traitem/vUmons_logsFor/'
sheet='Analyse_eau_Phases1&2'

In [550]:
df = pd.read_excel('../../CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Analyse_eau_Phases1&2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

1 NaN lines dropped

Columns dropped :[]
Rows : 51, columns : 85


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=51, min=5, readout=False), IntSlider(value=12…

In [551]:
df.drop(list(range(4)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [552]:
df.replace(9999,np.nan, inplace=True, regex=True) #int
df.replace(f'[{9999}|9999].',np.nan, inplace=True, regex=True) #float, str

In [553]:
df['CE']=df['CE'].apply(lambda x: pd.to_numeric(x)/1000 
                                  if re.search('^\d+', str(x)) and not pd.isnull(x) else np.nan)

In [554]:
df=col_ren(df,mode=1,name=[re.sub('9999','-',x) for x in df.columns])
df=col_ren(df,mode=1, name=pol_field_model)

In [555]:
name=['ID', 'ID_ech', 'Date_prv', 'X', 'Y', 'Z', 'Long_for','Long_pz_sol', 'Niv_eau_sol', 'pH', 'CE', 'T', 
      'As', 'Cd', 'Cr', 'Cr_VI', 'Cu', 'Hg','Pb', 'Ni', 'Zn', 'CN_libre', 'CN_tot', 'CN_APE', 'CN_comp',
      'thioCN', 'Bnz_vn', 'Bnz', 'Toln_vn', 'Toln', 'EthylBnz','O-Xyl', 'P-M-Xyl', 'Xyl_vn', 'Xyl', 'Styr', 
      'Phenol','Naphta_vn', 'Naphta', 'Acenaphtyl', 'Acenaphtn', 'Fluorene',
       'Phenanthr', 'Anthrc', 'Flranth', 'Pyr', 'Bnz(a)anthrc', 'Chrys',
       'Bnz(b)flranth', 'Bnz(k)flranth', 'Bnz(a)pyr', 'Dibnz(ah)anthrc',
       'Bnz(ghi)peryl', 'Indeno(1,2,3-cd)pyr', 'HAP_tot_EPA',
       '1,1-DCE', '1,2-DCE', '1,1-DCEn', 'Cis-1,2-DCEn',
       '(cis,trans) 1,2-DCE_tot', 'Trans 1,2-DCEyl', 'DCM', '1,2-DCP',
       'TetraCEyn', 'TCM', '1,1,1-TCE', '1,1,2-TCE', 'TCEyn', 'Chloroforme',
       'CVinyl', 'Arom_C6C7', 'Arom_C7C8', 'Arom_C8C10', 'Aliphat_C5C6',
       'Aliphat_C6C8', 'Aliphat_C8C10', 'Fract_C5C8', 'Fract_C8C10',
       'Fract_C10C12', 'Fract_C12C16', 'Fract_C16C21', 'Fract_C21C35',
       'HC_tot_C10C35', 'MTBE', 'Chlorure']
df=col_ren(df, mode=1,name=name)

In [556]:
df['ID_ech'].replace('Canne ', 'Can', inplace=True, regex=True)
df['ID_ech'].replace('\n', ' ', inplace=True, regex=True)
df.insert(1,'Type_ech','Eau')

In [557]:
df.drop([20,39], axis=0,inplace=True)
df.reset_index(drop=True, inplace=True)

In [558]:
df.loc[38:,'Date_prv']=df.loc[38:,'Date_prv'].apply(lambda x : dtm.datetime.fromordinal(dtm.datetime(1900, 1, 1).toordinal() + x - 2))

In [559]:
for i in range(len(df['ID_ech'])):
    if pd.isnull(df.loc[i,'ID_ech']): 
        df.loc[i,'ID_ech']=df.loc[i,'ID']

In [560]:
pz=df[['ID', 'X', 'Y', 'Z', 'Long_for','Long_pz_sol']]
prv_eau=df[['ID_ech','Type_ech','Date_prv', 'X', 'Y', 'Z','Niv_eau_sol', 'pH', 'CE', 'T']]
an=df[['ID_ech','Type_ech','Date_prv','As', 'Cd', 'Cr', 'Cr_VI', 'Cu', 'Hg','Pb', 'Ni', 'Zn', 'CN_libre', 'CN_tot', 'CN_APE', 
       'CN_comp','thioCN', 'Bnz_vn', 'Bnz', 'Toln_vn', 'Toln', 'EthylBnz','O-Xyl', 'P-M-Xyl', 'Xyl_vn', 'Xyl',
       'Styr', 'Phenol','Naphta_vn', 'Naphta', 'Acenaphtyl', 'Acenaphtn', 'Fluorene',
       'Phenanthr', 'Anthrc', 'Flranth', 'Pyr', 'Bnz(a)anthrc', 'Chrys',
       'Bnz(b)flranth', 'Bnz(k)flranth', 'Bnz(a)pyr', 'Dibnz(ah)anthrc',
       'Bnz(ghi)peryl', 'Indeno(1,2,3-cd)pyr', 'HAP_tot_EPA',
       '1,1-DCE', '1,2-DCE', '1,1-DCEn', 'Cis-1,2-DCEn',
       '(cis,trans) 1,2-DCE_tot', 'Trans 1,2-DCEyl', 'DCM', '1,2-DCP',
       'TetraCEyn', 'TCM', '1,1,1-TCE', '1,1,2-TCE', 'TCEyn', 'Chloroforme',
       'CVinyl', 'Arom_C6C7', 'Arom_C7C8', 'Arom_C8C10', 'Aliphat_C5C6',
       'Aliphat_C6C8', 'Aliphat_C8C10', 'Fract_C5C8', 'Fract_C8C10',
       'Fract_C10C12', 'Fract_C12C16', 'Fract_C16C21', 'Fract_C21C35',
       'HC_tot_C10C35', 'MTBE', 'Chlorure']]

In [561]:
pz.drop_duplicates(subset=['ID'], keep='first', ignore_index=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pz.drop_duplicates(subset=['ID'], keep='first', ignore_index=True, inplace=True)


In [562]:
gdf_viewer(an, rows=5)

Rows : 45, columns : 76


interactive(children=(IntSlider(value=5, description='rows', max=45, min=5, readout=False), IntSlider(value=12…

In [563]:
source_an=an
source_pz=pz
source_prv_eau=prv_eau

In [564]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:29 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:45 ;
source_prv_sol:0 ;source_prv_eau:45 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Analyse_sol_Phases1&2'**

In [565]:
tmp_dir='../../CF_data/synthese/Result_traitem/vUmons_logsFor/'
sheet='Analyse_sol_Phases1&2'

In [566]:
df = pd.read_excel('../../CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Analyse_sol_Phases1&2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

0 NaN lines dropped

Columns dropped :[]
Rows : 64, columns : 84


interactive(children=(IntSlider(value=5, description='rows', max=64, min=5, readout=False), IntSlider(value=12…

In [567]:
df=col_ren(df, mode=1, name=pol_field_model)

In [568]:
name=['ID_ech','Date_prv','ID','X','Y','Z','Nature_ech','Organo','Long_for','Refus','Ech_top','Ech_base',
      'MS','Broyage < 150 µm','Broyage ','Fract_2','Fract_2+','As','Cd','Cr','Cr_VI','Cu',
       'Hg','Pb','Ni','Zn','CN_libre','CN_tot','CN_APE',
       'CN_comp','thioCN','Bnz','Toln','EthylBnz','O-Xyl','P-M-Xyl',
       'Xyl','Styr','Phenol','Naphta','Acenaphtyl','Acenaphtn',
       'Fluorene','Phenanthr','Anthrc','Flranth','Pyr','Bnz(a)anthrc',
       'Chrys','Bnz(b)flranth','Bnz(k)flranth','Bnz(a)pyr',
       'Dibnz(ah)anthrc','Bnz(ghi)peryl','Indeno(1,2,3-cd)pyr',
       'HAP_tot_EPA','1,1-DCE','1,2-DCE','1,1-DCEn',
       'Cis-1,2-DCEn','Trans 1,2-DCEyl','DCM',
       '(cis,trans) 1,2-DCE_tot','1,2-DCP','TetraCEyn','TCM',
       '1,1,1-TCE','1,1,2-TCE','TCEyn','Chloroforme','CVinyl','Arom_C6C7',
       'Arom_C7C8','Arom_C8C10','Aliphat_C5C6','Aliphat_C6C8',
       'Aliphat_C8C10','Fract_C5C8','Fract_C8C10','Fract_C10C12',
       'Fract_C12C16','Fract_C16C21','Fract_C21C35','HC_tot_C10C35']
df=col_ren(df, mode=1, name=name)

In [569]:
df.drop(list(range(4)), axis=0, inplace=True)
df.reset_index(drop=True, inplace=True)

In [570]:
df.replace(9999,np.nan, inplace=True, regex=True) #int
df.replace(f'[{9999}|9999].',np.nan, inplace=True, regex=True) #float, str

In [571]:
for i in range(len(df['Nature_ech'])):
    x = df.loc[i,'Nature_ech']
    if x in ['R','R ']: df.loc[i,'Nature_ech']='Remblais'
    elif x in ['L']: df.loc[i,'Nature_ech']='Limons'
    elif x in ['LA']: df.loc[i,'Nature_ech']='Limons et argiles'
    elif x in ['LS']: df.loc[i,'Nature_ech']='Limons et sables'

df['Refus']=df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
df.insert(1,'Type_ech','Sol')

In [572]:
df.drop(14, axis=0, inplace=True)
df.drop(['Broyage < 150 µm', 'Broyage '], axis=1, inplace=True)
df.reset_index(drop=True, inplace=True)

In [573]:
df.loc[8, 'ID_ech']='F4/2M'
df.loc[31, 'ID_ech']='P19/1'
df.loc[32, 'ID_ech']='P19/2'

In [574]:
pz=df[['ID', 'X', 'Y', 'Z', 'Long_for','Refus']]
prv_sol=df[['ID_ech', 'Type_ech', 'Date_prv', 'X', 'Y', 'Z', 'Nature_ech','Organo', 
            'Ech_top', 'Ech_base', 'MS', 'Fract_2','Fract_2+']]
an=df[['ID_ech', 'Type_ech','As', 'Cd', 'Cr', 'Cr_VI', 'Cu', 'Hg', 'Pb', 'Ni', 'Zn',
       'CN_libre', 'CN_tot', 'CN_APE', 'CN_comp', 'thioCN', 'Bnz', 'Toln',
       'EthylBnz', 'O-Xyl', 'P-M-Xyl', 'Xyl', 'Styr', 'Phenol', 'Naphta',
       'Acenaphtyl', 'Acenaphtn', 'Fluorene', 'Phenanthr', 'Anthrc', 'Flranth',
       'Pyr', 'Bnz(a)anthrc', 'Chrys', 'Bnz(b)flranth', 'Bnz(k)flranth',
       'Bnz(a)pyr', 'Dibnz(ah)anthrc', 'Bnz(ghi)peryl', 'Indeno(1,2,3-cd)pyr',
       'HAP_tot_EPA', '1,1-DCE', '1,2-DCE', '1,1-DCEn', 'Cis-1,2-DCEn',
       'Trans 1,2-DCEyl', 'DCM', '(cis,trans) 1,2-DCE_tot', '1,2-DCP',
       'TetraCEyn', 'TCM', '1,1,1-TCE', '1,1,2-TCE', 'TCEyn', 'Chloroforme',
       'CVinyl', 'Arom_C6C7', 'Arom_C7C8', 'Arom_C8C10', 'Aliphat_C5C6',
       'Aliphat_C6C8', 'Aliphat_C8C10', 'Fract_C5C8', 'Fract_C8C10',
       'Fract_C10C12', 'Fract_C12C16', 'Fract_C16C21', 'Fract_C21C35',
       'HC_tot_C10C35']]

In [575]:
gdf_viewer(df, rows=5)

Rows : 59, columns : 83


interactive(children=(IntSlider(value=5, description='rows', max=59, min=5, readout=False), IntSlider(value=12…

In [576]:
source_an=source_an.append(an, ignore_index=True)
source_pz=pz
source_prv_sol=prv_sol

In [577]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:59 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:104 ;
source_prv_sol:59 ;source_prv_eau:45 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Synthèse'**

In [578]:
tmp_dir='../../CF_data/synthese/Result_traitem/vUmons_logsFor/'
sheet='Synthese'

In [579]:
df = pd.read_excel('../../CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Synthèse', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

4 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 15', 'Unnamed: 16']
Rows : 33, columns : 14


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=33, min=5, readout=False), IntSlider(value=12…

In [580]:
df=df[:29]
df.replace('\*','', inplace=True, regex=True)
df['Refus']=df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().replace(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Refus']=df['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')


In [581]:
name=['ID','X','Y','Z', 'Refus','Long_for', 'RB', 'ALL', 'S_A', 'S_S', 
      'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top']
df=col_ren(df, mode=1, name=name)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [582]:
cols=['ID','X','Y','Z', 'Refus','Long_for']

for i in range(len(df)):
    if not pd.isnull(df.loc[i, 'RB']): 
        df.loc[i, 'Terrain']='Remblais'
        df.loc[i, 'Litho_top']=0
        if not pd.isnull(df.loc[i, 'Rb_base']):
            df.loc[i, 'Litho_base']=df.loc[i, 'Rb_base']
        else:
            df.loc[i, 'Litho_base']=df.loc[i, 'Long_for']
    
    if not pd.isnull(df.loc[i, 'ALL']):
        df.loc[i+.2,cols]=df.loc[i,cols]
        df.loc[i+.2, 'Terrain']='Alluvions'
        df.loc[i+.2, 'Litho_top']=df.loc[i, 'All_top']
        if not pd.isnull(df.loc[i, 'S_A']):
            df.loc[i+.2, 'Litho_base']=df.loc[i, 'Soc_alt_top']
        else:
            df.loc[i+.2, 'Litho_base']=df.loc[i, 'Long_for']
    
    if not pd.isnull(df.loc[i, 'S_A']):
        df.loc[i+.5,cols]=df.loc[i,cols]
        df.loc[i+.5, 'Terrain']='Socle altéré'
        df.loc[i+.5, 'Litho_top']=df.loc[i, 'Soc_alt_top']
        if not pd.isnull(df.loc[i, 'S_S']):
            df.loc[i+.5, 'Litho_base']=df.loc[i, 'Soc_sn_top']
        else:
            df.loc[i+.5, 'Litho_base']=df.loc[i, 'Long_for']
            
    if not pd.isnull(df.loc[i, 'S_S']):
        df.loc[i+.7,cols]=df.loc[i,cols]
        df.loc[i+.7, 'Terrain']='Socle sain'
        df.loc[i+.7, 'Litho_top']=df.loc[i, 'Soc_sn_top']
        df.loc[i+.7, 'Litho_base']=df.loc[i, 'Long_for']

df.drop(columns=['RB', 'ALL', 'S_A', 'S_S', 'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top'], inplace=True)
df.sort_index(inplace=True)
df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[i+.2,cols]=df.loc[i,cols]


In [583]:
gdf_viewer(df, rows=5, cols=15)

Rows : 51, columns : 9


interactive(children=(IntSlider(value=5, description='rows', max=51, min=5, readout=False), IntSlider(value=9,…

In [584]:
litho=df
source_litho=df

In [585]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:59 ; source_litho:51 ; source_Fac-uknw:0 ; source_an:104 ;
source_prv_sol:59 ;source_prv_eau:45 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Sond2017v2'**

In [586]:
tmp_dir='../../CF_data/synthese/Result_traitem/vUmons_logsFor/'
sheet='Sond2017v2'

In [587]:
df = pd.read_excel('../../CF_data/Data_UMONS/Logs_forages_vUmons_2018-03-20.xlsx', 
                   sheet_name='Sond2017v2', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

0 NaN lines dropped

Columns dropped :['Unnamed: 18', 'Unnamed: 19', 'Unnamed: 20', 'Unnamed: 21', 'Unnamed: 22', 'Unnamed: 23', 'Unnamed: 24', 'Unnamed: 25', 'Unnamed: 26', 'Unnamed: 27', 'Unnamed: 28', 'Unnamed: 29', 'Unnamed: 30', 'Unnamed: 31']
Rows : 71, columns : 18


interactive(children=(IntSlider(value=5, description='rows', max=71, min=5, readout=False), IntSlider(value=12…

In [588]:
df.replace('\*','', inplace=True, regex=True)
df['Refus']=df['Refus'].apply(lambda x: 'x' if x==1 else '')

In [589]:
name=['R_ID','ID','X','Y','Z','Refus','Date_ouv','Long_for','Z_fond','RB','ALL', 'S_A', 'S_S', 
      'Rb_base','cote_rb','All_top', 'Soc_alt_top','Soc_sn_top']
df=col_ren(df, mode=1, name=name)
df=df[['ID','X','Y','Z','Refus','Date_ouv','Long_for','Z_fond','RB','ALL', 'S_A', 'S_S', 
      'Rb_base','All_top', 'Soc_alt_top','Soc_sn_top']]

In [590]:
cols=['ID','Date_ouv','X','Y','Z','Z_fond','Refus','Long_for']

for i in range(len(df)):    
    if df.loc[i, 'RB']==1: 
        df.loc[i, 'Terrain']='Remblais'
        df.loc[i, 'Litho_top']=0
        
        if not pd.isnull(df.loc[i, 'Rb_base']): df.loc[i, 'Litho_base']=df.loc[i, 'Rb_base']
        else: df.loc[i, 'Litho_base']=df.loc[i, 'Long_for']
    
    val_def=df.loc[i, 'Litho_base'] # temporary value of litho_base if nan
    
    if df.loc[i, 'ALL']==1:
        df.loc[i+.2,cols]=df.loc[i,cols]
        df.loc[i+.2, 'Terrain']='Alluvions'
        
        if not pd.isnull(df.loc[i, 'All_top']): df.loc[i+.2, 'Litho_top']=df.loc[i, 'All_top']
        else: df.loc[i+.2, 'Litho_top']=val_def #df.loc[i, 'litho_base']
            
        if df.loc[i, 'S_A']==1: df.loc[i+.2, 'Litho_base']=df.loc[i, 'Soc_alt_top']
        else: df.loc[i+.2, 'Litho_base']=df.loc[i, 'Long_for']
    
    if df.loc[i, 'S_A']==1:
        df.loc[i+.5,cols]=df.loc[i,cols]
        df.loc[i+.5, 'Terrain']='Socle altéré'
        
        if not pd.isnull(df.loc[i, 'Soc_alt_top']): df.loc[i+.5, 'Litho_top']=df.loc[i, 'Soc_alt_top']
        else: df.loc[i+.5, 'Litho_top']=val_def #df.loc[i+.2, 'litho_base']
        
        if df.loc[i, 'S_S']==1: df.loc[i+.5, 'Litho_base']=df.loc[i, 'Soc_sn_top']
        else: df.loc[i+.5, 'Litho_base']=df.loc[i, 'Long_for']
            
    if df.loc[i, 'S_S']==1:
        df.loc[i+.7,cols]=df.loc[i,cols]
        df.loc[i+.7, 'Terrain']='Socle sain'
        df.loc[i+.7, 'Litho_top']=df.loc[i, 'Soc_sn_top']
        df.loc[i+.7, 'Litho_base']=df.loc[i, 'Long_for']

df.drop(columns=['RB', 'ALL', 'S_A', 'S_S','Rb_base','All_top', 'Soc_alt_top','Soc_sn_top'], inplace=True)
df.sort_index(inplace=True)
df.reset_index(drop=True, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value, pi)


In [591]:
df=df[:-1]

In [592]:
gdf_viewer(df, rows=5, cols=15)

Rows : 109, columns : 11


interactive(children=(IntSlider(value=5, description='rows', max=109, min=5, readout=False), IntSlider(value=1…

In [593]:
litho=df
source_litho = source_litho.merge(df, 'outer')

In [594]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:59 ; source_litho:119 ; source_Fac-uknw:0 ; source_an:104 ;
source_prv_sol:59 ;source_prv_eau:45 ; source_mes_pz:0 ; source_mes_sol:0 ;


# Processing for new data added - April 2021

## 15-Profils de sol et données de terrain 2019.xlsx
* **Sheet : 'Log'**

In [595]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [596]:
tmp_dir='../../CF_data/synthese/Result_traitem/donnees_terrain_2019/'
sheet='Log'

In [597]:
df = pd.read_excel('../../CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et données de terrain 2019.xlsx', 
                   sheet_name='Log', skiprows=0)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

153 NaN lines dropped

Columns dropped :['Unnamed: 5', 'Unnamed: 6']
Rows : 98, columns : 5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=98, min=5, readout=False), IntSlider(value=5,…

In [598]:
name=['ID','Litho_top', 'Litho_base', 'Keyword', 'Description']
df=col_ren(df, name=name, mode=1)

In [599]:
bh = df[1:62]
pza = df[65:80] #piezair
pz = df[83:]

In [600]:
bh.reset_index(drop=True, inplace=True)
pza.reset_index(drop=True, inplace=True)
pz.reset_index(drop=True, inplace=True)

In [601]:
bh.insert(1,'Type', 'Forage')
bh.insert(1,'Zone', 'Extension Pilote')
pza.insert(1,'Type', 'Piezair')
pza.insert(1,'Zone', 'Extension Pilote')
pz.insert(1,'Type', 'Piezo')
pz.insert(1,'Zone', 'Mini-Pilote')

In [602]:
litho=gdf_merger(bh, pza, col='ID')[0]
litho=gdf_merger(litho, pz, col='ID')[0]
litho=litho[['ID','Type','Zone','Litho_top','Litho_base','Description','Keyword']]

In [603]:
gdf_viewer(litho, rows=3)

Rows : 91, columns : 7


interactive(children=(IntSlider(value=3, description='rows', max=91, min=3, readout=False), IntSlider(value=7,…

In [604]:
source_litho=litho

In [605]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:91 ; source_Fac-uknw:0 ; source_an:0 ;
source_prv_sol:0 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Echantillon'+'Organoleptique**

In [606]:
tmp_dir='../../CF_data/synthese/Result_traitem/donnees_terrain_2019/'
sheet='Echantillon'

In [607]:
df = pd.read_excel('../../CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et données de terrain 2019.xlsx', 
                   sheet_name='Echantillon', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

16 NaN lines dropped

Columns dropped :['Unnamed: 4', 'Unnamed: 5']
Rows : 67, columns : 4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=67, min=5, readout=False), IntSlider(value=4,…

In [608]:
name=['ID','Ech_top', 'Ech_base', 'ID_ech']
df=col_ren(df, name=name, mode=1)
df.insert(1,'Type_ech','Sol')

In [609]:
df.drop(index=[43,44,55,56,66], inplace=True)
df.reset_index(drop=True, inplace=True)

In [610]:
ech=df.copy()

In [611]:
df = pd.read_excel('../../CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et données de terrain 2019.xlsx', 
                   sheet_name='Organoleptique', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,4)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

20 NaN lines dropped

Columns dropped :['Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8', 'Unnamed: 9', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13', 'Unnamed: 14', 'Unnamed: 15']
Rows : 20, columns : 5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=20, min=5, readout=False), IntSlider(value=5,…

In [612]:
name=['ID','Pol_top', 'Pol_base','Polluant','Intensite']
df=col_ren(df, name=name, mode=1)

In [613]:
df.drop(index=[10,11,14,15], inplace=True)
df.reset_index(drop=True, inplace=True)

In [614]:
mdf=gdf_merger(ech, df, col='ID', how='outer')[0]

In [615]:
gdf_viewer(mdf)

Rows : 70, columns : 9


interactive(children=(IntSlider(value=10, description='rows', max=70, min=10, readout=False), IntSlider(value=…

In [616]:
prv_sol=mdf
source_prv_sol=prv_sol

In [617]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:91 ; source_Fac-uknw:0 ; source_an:0 ;
source_prv_sol:70 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Données de forage'**

In [618]:
tmp_dir='../../CF_data/synthese/Result_traitem/donnees_terrain_2019/'
sheet='Donnees_forage'

In [619]:
df = pd.read_excel('../../CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et données de terrain 2019.xlsx', 
                   sheet_name='Données de forage', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

25 NaN lines dropped

Columns dropped :['RAS', 'RAS.1', 'Niv. Eau p/r sol', 'RAS.2', 'Unnamed: 18', 'PZ Prof.\nmesurée']
Rows : 27, columns : 14


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=27, min=5, readout=False), IntSlider(value=12…

In [620]:
name=['ID', 'X', 'Y', 'Z', 'Date_ouv', 'Long_for', 'Methode', 'Diam_for','Rmq', 'Long_pz', 'Diam_pz', 
      'Crep_long','Societe', 'Resp_chantier']
df=col_ren(df, name=name, mode=1)
df.drop(index=[16,23], inplace=True)
df.reset_index(drop=True, inplace=True)

In [621]:
df.insert(5, 'Type', '')
df.loc[:15,'Type']='Forage'
df.loc[16:21,'Type']='Piezair'
df.loc[22:,'Type']='Piezo'

In [622]:
df.loc[9,'ID']='224 bis'

In [623]:
df['Refus'] = ''
df['Type_refus']=''

for i in range(len(df['Rmq'])):
    val = str(df.loc[i,'Rmq'])
    if re.search('[Bb]loqué', val) :
        df.loc[i,'Type_refus'] = 'x'
        
        if re.search('[lL]aitier', val):
            df.loc[i,'Type_refus'] = 'Laitier'
        elif re.search('[Bb]éton', val):
            df.loc[i,'Type_refus'] = 'Béton'
        elif re.search('[Mm]atériaux', val):
            df.loc[i,'Type_refus'] = 'Matériaux indurés' 
    else: 
        df.loc[i,'Type_refus'] = '' 

df['Diam_int_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace('mm','').split('x')[1])/1000 
                                                  if not pd.isnull(x) else x)
df['Diam_ext_pz'] = df['Diam_pz'].apply(lambda x: pd.to_numeric(x.replace('mm','').split('x')[0])/1000 
                                                  if not pd.isnull(x) else x)
df['Diam_for'] = df['Diam_for'].apply(lambda x: pd.to_numeric(x)/1000 if not pd.isnull(x) else x)

df.insert(10, 'Diam_ext_pz', df.pop('Diam_ext_pz')) # move to a specified position
df.insert(11, 'Diam_int_pz', df.pop('Diam_int_pz'))
df.drop(columns=['Rmq', 'Diam_pz'], axis=1, inplace=True)
df.drop(df.query("ID!=ID").index, inplace=True) # delete all ID='NaN' lines
df.reset_index(drop=True, inplace=True)

gen_id_dated(df,'ID','Date_ouv')  

Generation of ID-dated...
Using column ' Date_ouv ' in the (geo)dataframe !
Process ended, check you (geo)dataframe


In [624]:
df.columns

Index(['ID_date', 'ID', 'X', 'Y', 'Z', 'Date_ouv', 'Type', 'Long_for',
       'Methode', 'Diam_for', 'Diam_ext_pz', 'Diam_int_pz', 'Long_pz',
       'Crep_long', 'Societe', 'Resp_chantier', 'Refus', 'Type_refus'],
      dtype='object')

In [625]:
pz = df.query("Type=='Piezo'")
pza=df.query("Type=='Piezair'")
bh = df.query("Type=='Forage'")

pz.reset_index(inplace=True, drop=True)
pza.reset_index(inplace=True, drop=True)
bh.reset_index(inplace=True, drop=True)

In [626]:
gdf_viewer(df, rows=3)

Rows : 25, columns : 18


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=12…

In [627]:
source_pz = pz
source_pza = pza
source_bh = bh

In [628]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
pza.to_csv(tmp_dir+sheet+'_Piezairs.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
source_pza.to_csv(tmp_dir+'source_Piezairs.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)}; source_pza:{len(source_pza)} ;'
      f'source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:16 ; source_pz:3; source_pza:6 ;source_litho:91 ; source_Fac-uknw:0 ; source_an:0 ;
source_prv_sol:70 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Equipement'**

In [629]:
tmp_dir='../../CF_data/synthese/Result_traitem/donnees_terrain_2019/'
sheet='Equipement'

In [630]:
df = pd.read_excel('../../CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et données de terrain 2019.xlsx', 
                   sheet_name='Equipement', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


37 NaN lines dropped

Columns dropped :[]
Rows : 35, columns : 7


interactive(children=(IntSlider(value=5, description='rows', max=35, min=5, readout=False), IntSlider(value=7,…

In [631]:
df.drop(columns=['Déplacement'], inplace=True)
name=['ID','Equip_top', 'Equip_base', 'Diam_for', 'Diam_ext_pz', 'Legende']
df=col_ren(df, mode=1, name=name)

In [632]:
df.drop(index=[24,25], inplace=True)
df.reset_index(drop=True, inplace=True)

In [633]:
gdf_viewer(df)

Rows : 33, columns : 6


interactive(children=(IntSlider(value=10, description='rows', max=33, min=10, readout=False), IntSlider(value=…

In [634]:
equip=df
source_equip=equip

In [635]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)
equip.to_csv(tmp_dir+sheet+'_Equipment.csv', index=False)

#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)
source_equip.to_csv(tmp_dir+'source_Equipment.csv', index=False)

print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:16 ; source_pz:3 ; source_litho:91 ; source_Fac-uknw:0 ; source_an:0 ;
source_prv_sol:70 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'Piézométrie'**

In [636]:
tmp_dir='../../CF_data/synthese/Result_traitem/donnees_terrain_2019/'
sheet='piezometrie'

In [637]:
df = pd.read_excel('../../CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Profils de sol et données de terrain 2019.xlsx', 
                   sheet_name='Piézométrie', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,2)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

4 NaN lines dropped

Columns dropped :['Unnamed: 4', 'Unnamed: 5', 'Unnamed: 6', 'Unnamed: 7', 'Unnamed: 8']
Rows : 3, columns : 4


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=3, description='rows', max=3, min=3, readout=False), IntSlider(value=4, …

In [638]:
name=['ID','Niv_pz_sol', 'Type_ech', 'Date_mes']
df=col_ren(df, name=name, mode=1)

In [639]:
mes_pz=df
source_mes_pz=mes_pz

In [640]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
#an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
#source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:16 ; source_pz:3 ; source_litho:91 ; source_Fac-uknw:0 ; source_an:0 ;
source_prv_sol:70 ;source_prv_eau:0 ; source_mes_pz:3 ; source_mes_sol:0 ;


## 16-Résultats SOL extension pilote et piézairs.xlsx
* **Sheet : 'Résult SOL'**

In [641]:
# New file, so the source variables must be overwritten !!
_df=pd.DataFrame()
source_mes_pz, source_mes_sol, source_pz, source_prv_eau, source_prv_sol =  _df, _df, _df, _df, _df
source_ouv, source_an, source_litho, source_bh = _df, _df, _df, _df

In [642]:
tmp_dir='../../CF_data/synthese/Result_traitem/donnees_terrain_2019/'
sheet='Result_Sol'

In [643]:
df = pd.read_excel('../../CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Résultats SOL extension pilote et piézairs.xlsx', 
                   sheet_name='Résult SOL', skiprows=5)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

  warn(msg)


4 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65', 'Unnamed: 66', 'Unnamed: 67', 'Unnamed: 69', 'Unnamed: 70', 'Unnamed: 71', 'Unnamed: 72', 'Unnamed: 73', 'Unnamed: 74', 'Unnamed: 75', 'Unnamed: 76', 'Unnamed: 77', 'Unnamed: 78', 'Unnamed: 79', 'Unnamed: 80', 'Unnamed: 81', 'Unnamed: 82', 'Unnamed: 83', 'Unnamed: 84', 'Unnamed: 85', 'Unnamed: 86', 'Unnamed: 87', 'Unnamed: 88', 'Unnamed: 89', 'Unnamed: 90', 'Unnamed: 91', 'Unnamed: 92', 'Unnamed: 93', 'Unnamed: 94', 'Unnamed: 95', 'Unnamed: 96', 'Unnamed: 97', 'Unnamed: 98', 'Unnamed: 99', 'Unnamed: 100', 'Unnamed: 101', 'Unnamed: 102', 'Unnamed: 103', 'Unnamed: 104', 'Unnamed: 105', 'Unnamed: 106', 'Unnamed: 107', 'Unnamed: 108', 'Unnamed: 109', 'Unnamed: 110', 'Unnamed: 111', 'Unnamed: 112', 'Unnamed: 113', 'Unnamed: 114', 'Unnamed: 115', 'Unnamed: 116', 'Unnamed: 117', 'Unnamed: 118', 'Unnamed: 119', 'Unnamed: 120', 'Unnamed: 121', 'Unnamed: 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=135, min=5, readout=False), IntSlider(value=1…

In [644]:
prv_sol=df.loc[:35]
an=df.loc[36:]

In [645]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [646]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [647]:
prv_sol=dble_col_drop(prv_sol)

column(s) dropped: ['9:Autre zone suspecte investiguée', '27:température pour mes. pH', '30:pH (H20)']


In [648]:
prv_sol.drop(list(range(3)), axis=0, inplace=True)
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=na_col_drop(prv_sol,3)
prv_sol=na_line_drop(prv_sol,3)
prv_sol.reset_index(drop=True, inplace=True)


Columns dropped :['Matières organiques', 'SPP/zone suspecte investiguée', 'Autre zone suspecte investiguée', 'Parcelle', 'X Lambert', 'Y Lambert', "Type d'étude (*)", 'Organoleptique couleur suspecte', 'Organoleptique odeur intensité (***)', 'Organoleptique odeur type', 'MO et COT', 'pH', 'GRANULOMETRIE', 'Fraction argileuse']
0 NaN lines dropped


In [649]:
prv_sol=prv_sol[:-1]
prv_sol.drop(columns=['broyage'], inplace=True)

In [650]:
name=['ID_ech','Ech_top', 'Ech_base','MS','Date_prv','Long_for','Refus','Description','MO','COT','pH_KCl', 
      'Temp_pH_mes','pH_H20','Fract_2','Fract_2+', 'Fract_min_2µ','Fract_min_50µ','Fract_min_2']
prv_sol=col_ren(prv_sol, name=name, mode=1)

In [651]:
set(prv_sol.Description)

{'R', 'R ', 'TN', 'TN ', nan}

In [652]:
for i in range(len(prv_sol['Description'])):
    x = prv_sol.loc[i,'Description']
    if x in ['R','R ']: prv_sol.loc[i,'Description']='Remblais'
    elif x in ['TN','TN ']: prv_sol.loc[i,'Description']='Terrain naturel'

prv_sol['Refus']=prv_sol['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else '')
prv_sol.insert(1,'Type_ech','Sol')#

In [653]:
for i in range(len(prv_sol)):
    x=prv_sol.loc[i,'ID_ech']
    r=re.search('([\w|\d]+)\n.+$', x)
    if r: 
        prv_sol.loc[i,'ID_ech']='226/'+r.group(1) # Rename borehole 304 to 226 because of conflict with piezair

In [654]:
prv_sol.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 19 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   ID_ech         55 non-null     object
 1   Type_ech       55 non-null     object
 2   Ech_top        53 non-null     object
 3   Ech_base       53 non-null     object
 4   MS             55 non-null     object
 5   Date_prv       55 non-null     object
 6   Long_for       53 non-null     object
 7   Refus          55 non-null     object
 8   Description    53 non-null     object
 9   MO             5 non-null      object
 10  COT            4 non-null      object
 11  pH_KCl         4 non-null      object
 12  Temp_pH_mes    4 non-null      object
 13  pH_H20         4 non-null      object
 14  Fract_2        55 non-null     object
 15  Fract_2+       55 non-null     object
 16  Fract_min_2µ   5 non-null      object
 17  Fract_min_50µ  5 non-null      object
 18  Fract_min_2    5 non-null      o

In [655]:
gdf_viewer(prv_sol, rows=3)

Rows : 55, columns : 19


interactive(children=(IntSlider(value=3, description='rows', max=55, min=3, readout=False), IntSlider(value=12…

In [656]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [657]:
an=col_ren(an, 1)

In [658]:
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [659]:
an.columns

Index(['ID_ech', 'METAUX LOURDS', 'Arsenic', 'Cadmium', 'Chrome', 'Chrome VI',
       'Cobalt', 'Cuivre', 'Mercure', 'Plomb', 'Nickel', 'Zinc', 'CYANURES',
       'cyanure (libre)', 'cyanure (totaux)', 'cyanure (APE)',
       'cyanure complex', 'thiocyanate', 'COMPOSES AROMATIQUES VOLATILS',
       'Benzène', 'Toluène', 'Éthylbenzène', 'Orthoxylène',
       'Para- et métaxylène', 'Xylènes', 'Styrène', 'BTEX totaux', 'PHENOLS',
       'Phénol', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES',
       'Naphtalène', 'Acénaphtylène', 'Acénaphtène', 'Fluorène', 'col_35',
       'Anthracène', 'Fluoranthène', 'Pyrène', 'Benzo(a)anthracène',
       'Chrysène', 'Benzo(b)fluoranthène', 'Benzo(k)fluoranthène',
       'Benzo(a)pyrène', 'Dibenzo(ah)anthracène', 'Benzo(ghi)pérylène',
       'Indéno(1,2,3-cd)pyrène', 'HAP Totaux (16) - EPA',
       'COMPOSES ORGANOHALOGENES VOLATILS', 'Tétrachloroéthylène',
       'Trichloroéthylène', '1,1-dichloroéthène', 'Cis-1,2-dichloroéthène',
       '

In [660]:
an=an[an.columns[:-17]]
an.rename(columns={'col_35':'Phénanthrène'}, inplace=True)

In [661]:
an=col_ren(an, name=pol_field_model, mode=1)

In [662]:
an=dble_col_drop(an)

column(s) dropped: []


In [663]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')


Columns dropped :['METAUX LOURDS', 'CYANURES', 'COMPOSES AROMATIQUES VOLATILS', 'BTEX_tot', 'PHENOLS', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'COMPOSES ORGANOHALOGENES VOLATILS', 'HYDROCARBURES TOTAUX', 'METHYL-TERT-BUTYL-ETHER']


In [664]:
gdf_viewer(an, rows=5) 

Rows : 56, columns : 75


interactive(children=(IntSlider(value=5, description='rows', max=56, min=5, readout=False), IntSlider(value=12…

In [665]:
source_prv_sol=prv_sol
source_an=an

In [666]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:56 ;
source_prv_sol:55 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;


* **Sheet : 'inorganiques et composés majeur'**

In [667]:
tmp_dir='../../CF_data/synthese/Result_traitem/donnees_terrain_2019/'
sheet='Inorg_comp_majeur'

In [668]:
df = pd.read_excel('../../CF_data/Data_UMONS/Profils_de_sol_Siterem- extension_pilote_et_pilote/'
                   'Résultats SOL extension pilote et piézairs.xlsx', 
                   sheet_name='inorganiques et composés majeur', skiprows=1)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
df.replace(r'<|>','', inplace=True, regex=True)
df.replace(r'-$',np.nan, inplace=True, regex=True)

gdf_viewer(df, rows=5)

11 NaN lines dropped

Columns dropped :['Unnamed: 0', 'Unnamed: 2', 'Unnamed: 48', 'Unnamed: 49', 'Unnamed: 50', 'Unnamed: 51', 'Unnamed: 52', 'Unnamed: 53', 'Unnamed: 54', 'Unnamed: 55', 'Unnamed: 56', 'Unnamed: 57', 'Unnamed: 58', 'Unnamed: 59', 'Unnamed: 60', 'Unnamed: 61', 'Unnamed: 62', 'Unnamed: 63', 'Unnamed: 64', 'Unnamed: 65', 'Unnamed: 66', 'Unnamed: 67', 'Unnamed: 68', 'Unnamed: 69', 'Unnamed: 70', 'Unnamed: 71', 'Unnamed: 72', 'Unnamed: 73', 'Unnamed: 74', 'Unnamed: 75', 'Unnamed: 76', 'Unnamed: 77', 'Unnamed: 78', 'Unnamed: 79', 'Unnamed: 80', 'Unnamed: 81', 'Unnamed: 82', 'Unnamed: 83', 'Unnamed: 84', 'Unnamed: 85', 'Unnamed: 86', 'Unnamed: 87', 'Unnamed: 88', 'Unnamed: 89', 'Unnamed: 90', 'Unnamed: 91', 'Unnamed: 92', 'Unnamed: 93', 'Unnamed: 94']
Rows : 64, columns : 46


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=64, min=5, readout=False), IntSlider(value=12…

In [669]:
prv_sol=df.loc[:20]
an=df.loc[21:]

In [670]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [671]:
prv_sol=prv_sol.transpose()
prv_sol.reset_index(drop=True, inplace=True)
prv_sol=col_ren(prv_sol, 1)

In [672]:
prv_sol=dble_col_drop(prv_sol)

column(s) dropped: ['5:Autre zone suspecte investiguée']


In [673]:
prv_sol.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45 entries, 0 to 44
Data columns (total 20 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   Nom échantillon                      44 non-null     object
 1   Broyage                              15 non-null     object
 2   Date de prélèvement                  43 non-null     object
 3   SPP/zone suspecte investiguée        0 non-null      object
 4   Autre zone suspecte investiguée      0 non-null      object
 5   Parcelle                             0 non-null      object
 6   X Lambert                            1 non-null      object
 7   Y Lambert                            1 non-null      object
 8   Type d'étude (*)                     0 non-null      object
 9   Profondeur minimum de la crépine     1 non-null      object
 10  Profondeur maximum de la crépine     1 non-null      object
 11  Profondeur du fond du piézomètre     1 non-null

this part of the file is unusefull (look above)

In [674]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [675]:
an=col_ren(an, 1)

In [676]:
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [677]:
an.columns

Index(['ID_ech', 'COMPOSES AZOTES', 'ammonium', 'ammonium',
       'ammoniaque - libre', 'azote Kjeldahl', 'nitrite', 'nitrite', 'nitrate',
       'nitrate', 'COMPOSES SOUFRES ', 'sulfures totaux', 'sulfures (libre)',
       'Soufre Total', 'sulfites', 'sulfate', 'ELEMENTS MAJEURS', 'calcium',
       'potassium', 'magnésium', 'manganèse', 'sodium', 'fer',
       'fer (Fe) total', 'fer (2+)', 'AUTRES ANALYSES', 'aluminium',
       'chlorures', 'fluorures', 'cyanure (libre)', 'bromure (libre)',
       'phosphore (total)', 'phosphore', 'carbonate', 'bicarbonate',
       'METHYL-TERT-BUTYL-ETHER', 'MTBE', 'Teneur mesurée ', 'Teneur mesurée',
       'VS : Valeur seuil',
       '(*) RP : Rapport de prélèvements; ES : Etude de sol; EO : Etude d'orientation; EC : Etude de caractérisation; SA : Suivi d'assainissement; EF : Evaluation finale',
       '(**) - : Pas d'impression organoleptique; + : Impression organoleptique faible; ++ : Impression organoleptique forte',
       '(***) + : Limpide; 

In [678]:
an=an[an.columns[:-7]]

In [679]:
an=col_ren(an, name=pol_field_model, mode=1)

In [680]:
an=dble_col_drop(an)

column(s) dropped: ['3:NH4', '4:NH4', '5:NH4', '9:nitrite', '10:nitrite', '11:nitrite', '13:nitrate', '14:nitrate', '15:nitrate']


In [681]:
an.drop(list(range(3)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)
an=na_col_drop(an,1)
an.insert(1,'Type_ech','Sol')


Columns dropped :['COMPOSES AZOTES', 'NH3_libre', 'nitrite', 'nitrate', 'COMPOSES SOUFRES ', 'Sulfure_tot', 'Sulfure_libre', 'sulfite', 'ELEMENTS MAJEURS', 'Fe_tot', 'Fe2', 'AUTRES ANALYSES', 'Fluorure', 'CN_libre', 'B_libre', 'CaCO3', 'Bicarb', 'METHYL-TERT-BUTYL-ETHER', 'MTBE']


In [682]:
gdf_viewer(an, rows=5) 

Rows : 42, columns : 16


interactive(children=(IntSlider(value=5, description='rows', max=42, min=5, readout=False), IntSlider(value=12…

In [683]:
an.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 42 entries, 0 to 41
Data columns (total 16 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ID_ech     42 non-null     object
 1   Type_ech   42 non-null     object
 2   NH4        4 non-null      object
 3   N_Kjdl     4 non-null      object
 4   S_tot      1 non-null      object
 5   sulfate    4 non-null      object
 6   Ca         7 non-null      object
 7   K          7 non-null      object
 8   Mg         7 non-null      object
 9   Mn         3 non-null      object
 10  Na         7 non-null      object
 11  Fe         3 non-null      object
 12  aluminium  3 non-null      object
 13  Chlorure   4 non-null      object
 14  P_tot      4 non-null      object
 15  phosphore  3 non-null      object
dtypes: object(16)
memory usage: 5.4+ KB


In [684]:
#source_prv_sol=prv_sol
source_an=gdf_merger(source_an, an, how='outer', col='ID_ech')[0]

In [685]:
if not os.path.exists(tmp_dir):
    os.makedirs(tmp_dir)

#bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#pz.to_csv(tmp_dir+sheet+'_Piezometers.csv', index=False)
#ouv.to_csv(tmp_dir+sheet+'_Facility-uknw.csv', index=False)
#litho.to_csv(tmp_dir+sheet+'_Lithology.csv', index=False)
#mes_pz.to_csv(tmp_dir+sheet+'_Measures-piezo.csv', index=False)
#mes_sol.to_csv(tmp_dir+sheet+'_Measures-soil.csv', index=False)
#prv_eau.to_csv(tmp_dir+sheet+'_Samples-water.csv', index=False)
#prv_sol.to_csv(tmp_dir+sheet+'_Samples-soil.csv', index=False)
an.to_csv(tmp_dir+sheet+'_Analysis.csv', index=False)


#source_bh.to_csv(tmp_dir+'source_Boreholes.csv', index=False)
#source_pz.to_csv(tmp_dir+'source_Piezometers.csv', index=False)
#source_ouv.to_csv(tmp_dir+'source_Facility-uknw.csv', index=False)
#source_litho.to_csv(tmp_dir+'source_Lithology.csv', index=False)
#source_mes_pz.to_csv(tmp_dir+'source_Measures-piezo.csv', index=False)
#source_mes_sol.to_csv(tmp_dir+'source_Measures-soil.csv', index=False)
#source_prv_eau.to_csv(tmp_dir+'source_Samples-water.csv', index=False)
#source_prv_sol.to_csv(tmp_dir+'source_Samples-soil.csv', index=False)
source_an.to_csv(tmp_dir+'source_Analysis.csv', index=False)


print(f'source_bh:{len(source_bh)} ; source_pz:{len(source_pz)} ; source_litho:{len(source_litho)} ; '
     f'source_Fac-uknw:{len(source_ouv)} ; source_an:{len(source_an)} ;\nsource_prv_sol:{len(source_prv_sol)} ;'
     f'source_prv_eau:{len(source_prv_eau)} ; source_mes_pz:{len(source_mes_pz)} ; source_mes_sol:{len(source_mes_sol)} ;')

source_bh:0 ; source_pz:0 ; source_litho:0 ; source_Fac-uknw:0 ; source_an:61 ;
source_prv_sol:55 ;source_prv_eau:0 ; source_mes_pz:0 ; source_mes_sol:0 ;
