# Data Gathering

In [1]:
%matplotlib widget

In [2]:
from utils.io import gen_id_dated, gdf_viewer, gdf_geom, gdf_merger, gdf_filter, na_col_drop, na_line_drop
import re, os
import numpy as np
import geopandas as gpd
import pandas as pd
#from shapely.geometry import Point
import datetime as dtm
import matplotlib.pyplot as plt
from definitions import ROOT_DIR

In [3]:
def files_search(work_dir, files_dict, prefix='', skip=None, details=False):
    """Fill ``files_dict`` in place with the files found under ``work_dir``.

    Every key of ``files_dict`` is used as a case-insensitive regular
    expression that is searched in the full path (``<directory>/<file name>``)
    of each candidate file; the value of the key is replaced by the sorted
    list of matching paths.

    Parameters
    ----------
    work_dir : str
        Root directory, explored recursively with ``os.walk``.
    files_dict : dict
        Dictionary whose keys are the patterns to look for. Its values are
        overwritten with lists of paths.
    prefix : str, optional
        Case-insensitive regex that the file name must contain (searched
        anywhere in the name, not only at its start). The default ``''``
        accepts every file.
    skip : str or None, optional
        Case-insensitive regex; file names containing it are ignored.
        ``None`` disables the exclusion.
    details : bool, optional
        If True, also print the indexed list of paths found for each key.

    Returns
    -------
    None
        The result is stored in ``files_dict``; a summary is printed.
    """
    # Walk the tree only once: the candidate list does not depend on the key.
    candidates = []
    for dirpath, _dirnames, filenames in os.walk(work_dir):
        for fname in filenames:
            if not re.search(prefix, fname, re.I):
                continue
            if skip is not None and re.search(skip, fname, re.I):
                continue
            candidates.append(f'{dirpath}/{fname}')

    # list(...) freezes the keys so that the dict can be updated while looping.
    for key in list(files_dict):
        files_dict[key] = sorted(
            path for path in candidates if re.search(key, path, re.I)
        )

    for key, paths in files_dict.items():
        print(key, ' \t: ', len(paths))

    if details:  # Look filenames
        for key, paths in files_dict.items():
            print('\n+++++++++++++++++')
            print(f'+  {key.upper()}\t+ ')
            print('+++++++++++++++++')
            for pos, path in enumerate(paths):
                print(pos, '-', path)


In [4]:
def check_col(data):
    """Raise if ``data`` still has columns ending with a merge suffix.

    ``DataFrame.merge`` appends ``_x`` / ``_y`` to overlapping column names.
    A column whose name *ends* with one of these suffixes is considered a
    leftover of a merge that must be resolved by hand. Names that merely
    contain ``_x`` or ``_y`` somewhere else (e.g. ``Z_year``) are accepted.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to inspect.

    Raises
    ------
    DoubledColumns
        A locally defined ``Exception`` subclass, raised when at least one
        suffixed column is found. The message lists the positions of the
        offending columns.
    """

    class DoubledColumns(Exception):
        """Merging process doubled column(s) still remain. Check and drop them before continue"""

    # str(name) keeps the check usable with non-string column labels.
    cols_idx = [
        pos for pos, name in enumerate(data.columns)
        if re.search(r'(_x|_y)$', str(name))
    ]

    if cols_idx:
        raise DoubledColumns(f'Merging process doubled column(s) still remain.'
                             f'\nCheck and drop them before continue ! Doubled columns position {cols_idx}')

In [5]:
def distinct_obj_test(df1, df2, on='ID', how='outer', dist_max=1):
    """Merge two frames and flag the rows that look like distinct objects.

    After merging ``df1`` and ``df2``, a boolean column ``Distinct_obj`` is
    inserted at position 1 of the merged frame. A row is flagged True when
    both ``X_x`` and ``X_y`` are known and the planar distance between
    (``X_x``, ``Y_x``) and (``X_y``, ``Y_y``) is not within ``dist_max``.
    Rows with a missing ``X`` on either side are flagged False. The merged
    frame is then passed to ``gdf_viewer``.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Frames to compare; both need ``X`` and ``Y`` columns.
    on : str, optional
        Merge key, forwarded to ``DataFrame.merge``.
    how : str, optional
        Merge type, forwarded to ``DataFrame.merge``.
    dist_max : float, optional
        Largest distance (in the units of X/Y) for two positions to be
        considered the same object.

    Returns
    -------
    None
        A message is printed when one of the frames lacks position columns.
    """
    # Both coordinates are read below, so both must exist on both sides.
    has_position = all(
        coord in frame.columns for frame in (df1, df2) for coord in ('X', 'Y')
    )
    if not has_position:
        print('Cannot proceed ! No position data in one of the dataframe')
        return

    test_distinct = df1.merge(df2, on=on, how=how)

    both_located = test_distinct['X_x'].notna() & test_distinct['X_y'].notna()
    sq_dist = ((test_distinct['X_x'] - test_distinct['X_y']) ** 2
               + (test_distinct['Y_x'] - test_distinct['Y_y']) ** 2)
    # A NaN distance (missing Y) compares False, i.e. "not the same object",
    # which keeps the row flagged as distinct when both X are known.
    same_object = sq_dist <= dist_max ** 2

    test_distinct['Distinct_obj'] = both_located & ~same_object
    test_distinct.insert(1, 'Distinct_obj', test_distinct.pop('Distinct_obj'))

    gdf_viewer(test_distinct)

In [6]:
def create_df(file1, file2, base_dir=None): # find another name for this function
    """
    Create two dataframes from CSV files and report whether each one
    contains position information (an 'X' column).

    Parameters
    ----------
    file1, file2 : str
        Paths of the comma-separated files to read.
    base_dir : str or None, optional
        Leading part of the paths that is removed when the file names are
        printed. When None, the notebook-level ``work_dir`` variable is used
        if it is defined; otherwise the paths are printed unchanged.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df1, df2)`` read from ``file1`` and ``file2``.
    """
    if base_dir is None:
        # Do not fail with NameError when the cell defining `work_dir`
        # has not been executed yet.
        base_dir = globals().get('work_dir', '')

    df1 = pd.read_csv(file1, delimiter=',')
    df2 = pd.read_csv(file2, delimiter=',')

    print(f"df1 : {file1.replace(base_dir,'')} \ndf2 : {file2.replace(base_dir,'')}\n")

    for label, frame in (('df1', df1), ('df2', df2)):
        if 'X' in frame.columns:
            print(f'{label} - Position data')
        else:
            print(f'{label} - No position data')

    return df1, df2

In [7]:
def validate_data(data, data_to_validate, col, idx_list, valid_col, on='ID'):
    """Copy manually validated values from a conflict table into ``data``.

    For every row label in ``idx_list``, the value found in column
    ``valid_col`` of ``data_to_validate`` is written into column ``col`` of
    ``data``, at the row given by the ``Source_index`` column of
    ``data_to_validate``. The processed rows are then removed from
    ``data_to_validate`` and its index is reset. Both frames are modified
    in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame receiving the validated values.
    data_to_validate : pandas.DataFrame
        Conflict table; must contain a ``Source_index`` column.
    col : str
        Column of ``data`` to update.
    idx_list : list
        Row labels of ``data_to_validate`` to validate.
    valid_col : str
        Column of ``data_to_validate`` holding the value to keep.
    on : str, optional
        Unused; kept so that existing calls remain valid.

    Returns
    -------
    None
        If ``Source_index`` is missing, a message is printed and neither
        frame is modified.
    """
    old_idx_col = 'Source_index'

    # Check once, before touching anything: without the source index there is
    # nothing to validate and the conflict rows must not be discarded.
    if old_idx_col not in data_to_validate.columns:
        print(f"Parameter 'data_to_validate' must contain '{old_idx_col}' column !")
        return

    for i in idx_list:
        idx = data_to_validate.loc[i, old_idx_col]
        data.loc[idx, col] = data_to_validate.loc[i, valid_col]

    data_to_validate.drop(index=idx_list, inplace=True)
    data_to_validate.reset_index(drop=True, inplace=True)
    print(f"Operation done ")

In [8]:
def double_objects_check(data):
    """Return the rows of ``data`` whose 'ID' value appears more than once.

    The rows are grouped by ID. Groups are ordered by the position at which
    the ID is repeated for the first time; inside a group the rows keep the
    order (and index labels) they have in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with an 'ID' column.

    Returns
    -------
    pandas.DataFrame
        The duplicated rows, or an empty DataFrame when every ID is unique.
    """
    ids = data['ID']

    # duplicated(keep='first') marks every occurrence after the first one;
    # dropping duplicates of those keeps one entry per repeated ID, ordered
    # by the position of its first repetition. Missing IDs are ignored.
    repeated_ids = ids[ids.duplicated(keep='first')].dropna().drop_duplicates()

    # Direct comparison works for any ID type and needs no string quoting.
    groups = [data[ids == id_] for id_ in repeated_ids]

    if not groups:
        return pd.DataFrame()
    return pd.concat(groups)

## Files reading

In [9]:
work_dir = ROOT_DIR+'/CF_data/Result_traitem/'
save_dir = ROOT_DIR+'/CF_data/Donnees_fusionnees/'

In [10]:
# create my dictionary structure to retrieve good files (Keynames !!!)
files_dict={'Borehole':0,'Piezometer':0,'Piezair':0,'Trench':0,'Litho':0,'Equipm':0,
        'Measure':0,'Sample':0,'Analysis':0,'facility':0}

In [11]:
files_search(work_dir, files_dict, prefix='', skip='source')

Borehole  	:  7
Piezometer  	:  17
Piezair  	:  2
Trench  	:  1
Litho  	:  7
Equipm  	:  3
Measure  	:  6
Sample  	:  27
Analysis  	:  21
facility  	:  4


In [12]:
how=['inner', 'outer', 'left', 'right']

In [13]:
f = False
t = True

# Boreholes

Some corrections todo in 'data organization':
- correct extraction in the file 2 -> Samples
- file 4 and file 5 are the same in result (check it)
- try to concatenate file 1 with piezo (if possible because no position)
- check processing for 'refus' and 'type_refus' (every object)

In [14]:
# keys: Borehole','Piezometer','Litho', 'Trench','Equipm','Measure','Sample','Analysis','facility'
files_dict['Borehole']

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Forage_Pilote/leve_Z_elect_pos_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Prof_contact_sol_forage/Feuil1_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Profils_sol_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Donnees_forage_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Equipement_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Log_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/donnees_forage_Boreholes.csv']

In [15]:
key='Borehole'
save_file = f'Merged_Boreholes.csv'
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus', 'Societe'] #columns of interest
boreholes = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

7 files


In [16]:
file1= files_dict[key][1]
file2= files_dict[key][3]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : Prof_contact_sol_forage/Feuil1_Boreholes.csv 
df2 : donnees_terrain_2019/Donnees_forage_Boreholes.csv

df1 - Position data
df2 - Position data
Rows : 8, columns : 6, Unique col 'ID': 8


interactive(children=(IntSlider(value=3, description='rows', max=8, min=3, readout=False), IntSlider(value=6, …

Rows : 16, columns : 18, Unique col 'ID': 16


interactive(children=(IntSlider(value=3, description='rows', max=16, min=3, readout=False), IntSlider(value=12…

(None, None)

In [17]:
df2.ID=df2.ID.apply(lambda x: 'F'+x) # name recent (2019) boreholes

In [18]:
distinct_obj_test(df1, df2, dist_max=1)

Rows : 17, columns : 24


interactive(children=(IntSlider(value=10, description='rows', max=17, min=10, readout=False), IntSlider(value=…

In [19]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], col='ID', dist_max=1, col_non_na=1)
check_col(mdf) # check if columns with '..._x' or '..._y' are still present and raise an error


Columns dropped :['Crep_long', 'Diam_int_pz', 'Long_pz', 'Diam_ext_pz']

Ambiguous values in both columns compared, change it manually !
Columns ['Long_for_x', 'Long_for_y'] must be dropped manually !


In [20]:
err_df

Unnamed: 0,ID,Long_for_x,Long_for_y,Source_index
0,F205,3.2,4.8,0
1,F208,3.4,4.8,1
2,F212,3.4,4.8,2
3,F207,3.4,4.8,3
4,F214,3.6,4.8,4
5,F217,4.2,4.8,5
6,F225,4.0,4.8,6


In [21]:
gdf_viewer(mdf, rows=10, un_val='ID', view=t)

Rows : 17, columns : 14, Unique col 'ID': 17


interactive(children=(IntSlider(value=10, description='rows', max=17, min=10, readout=False), IntSlider(value=…

#### boreholes merging

In [22]:
boreholes = mdf.copy() #saving

In [23]:
file1= files_dict[key][2]
file2= files_dict[key][4]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : database_Memoris3/Profils_sol_Boreholes.csv 
df2 : profils_sols_donnees_forages/Equipement_Boreholes.csv

df1 - No position data
df2 - Position data
Rows : 172, columns : 6, Unique col 'ID': 172


interactive(children=(IntSlider(value=3, description='rows', max=172, min=3, readout=False), IntSlider(value=6…

Rows : 13, columns : 13, Unique col 'ID': 13


interactive(children=(IntSlider(value=3, description='rows', max=13, min=3, readout=False), IntSlider(value=12…

(None, None)

In [24]:
distinct_obj_test(df1, df2, dist_max=1)

Cannot proceed ! No position data in one of the dataframe


In [25]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], col='ID', col_non_na=2, line_non_na=2)
check_col(mdf)

In [26]:
mdf['ID_date'] = mdf['ID_date'].apply(lambda x: str(x).upper())

In [27]:
# Keep 'Long_for' where it is known, otherwise fall back on 'Profondeur'.
# fillna aligns on the index and avoids positional access (x[0], x[1]) on a
# label-indexed row, which is deprecated in recent pandas versions.
mdf['Long_for'] = mdf['Long_for'].fillna(mdf['Profondeur'])
mdf.drop(columns=['Profondeur'], inplace=True)

In [28]:
gdf_viewer(mdf, rows=3, cols=15, un_val='ID', view=t)

Rows : 185, columns : 13, Unique col 'ID': 185


interactive(children=(IntSlider(value=3, description='rows', max=185, min=3, readout=False), IntSlider(value=1…

#### boreholes merging

In [29]:
boreholes, err_df=gdf_merger(boreholes, mdf, how=how[1], col='ID', dist_max=2, col_non_na=1)
check_col(boreholes)

Ambiguous values in both columns compared, change it manually !
Columns ['Long_for_x', 'Long_for_y'] must be dropped manually !


In [30]:
err_df

Unnamed: 0,ID,Long_for_x,Long_for_y,Source_index
0,F205,99999.0,1.4,0
1,F212,99999.0,5.8,2
2,F217,99999.0,5.7,5
3,F219,1.5,1.8,13


In [31]:
validate_data(boreholes, err_df, on='ID', col='Long_for', idx_list=[0,1,2], valid_col='Long_for_y')

Operation done 


In [32]:
err_df

Unnamed: 0,ID,Long_for_x,Long_for_y,Source_index
0,F219,1.5,1.8,13


In [33]:
gdf_viewer(boreholes, rows=3, cols=15, un_val='ID', view=t)

Rows : 198, columns : 15, Unique col 'ID': 198


interactive(children=(IntSlider(value=3, description='rows', max=198, min=3, readout=False), IntSlider(value=1…

In [34]:
file1= files_dict[key][5]
file2= files_dict[key][0]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : profils_sols_donnees_forages/Log_Boreholes.csv 
df2 : Forage_Pilote/leve_Z_elect_pos_Boreholes.csv

df1 - No position data
df2 - No position data
Rows : 24, columns : 3, Unique col 'ID': 24


interactive(children=(IntSlider(value=3, description='rows', max=24, min=3, readout=False), IntSlider(value=3,…

Rows : 72, columns : 5, Unique col 'ID': 72


interactive(children=(IntSlider(value=3, description='rows', max=72, min=3, readout=False), IntSlider(value=5,…

(None, None)

In [35]:
distinct_obj_test(df1, df2, dist_max=1)

Cannot proceed ! No position data in one of the dataframe


In [36]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], col='ID', col_non_na=1, )
check_col(mdf) # check if columns with '..._x' or '..._y' are still present and raise an error

In [37]:
# Keep 'Long_for' where it is known, otherwise fall back on 'Profondeur'.
# fillna aligns on the index and avoids positional access (x[0], x[1]) on a
# label-indexed row, which is deprecated in recent pandas versions.
mdf['Long_for'] = mdf['Long_for'].fillna(mdf['Profondeur'])
mdf.drop(columns=['Profondeur'], inplace=True)

In [38]:
gdf_viewer(mdf, rows=10, un_val='ID', view=t)

Rows : 96, columns : 5, Unique col 'ID': 96


interactive(children=(IntSlider(value=10, description='rows', max=96, min=10, readout=False), IntSlider(value=…

#### Last boreholes merging

In [39]:
boreholes, err_df=gdf_merger(boreholes, mdf, how=how[1], col='ID', col_non_na=1)
check_col(boreholes)

Ambiguous values in both columns compared, change it manually !
Columns ['Long_for_x', 'Long_for_y'] must be dropped manually !


In [40]:
err_df # I think they are not the same objects, but there is no date or position to distinguish them!
# --> check the borehole sheets (pdf)

Unnamed: 0,ID,Long_for_x,Long_for_y,Source_index
0,F205,1.4,5.84,0
1,F208,99999.0,5.77,1
2,F212,5.8,5.675,2
3,F207,99999.0,5.79,3
4,F214,99999.0,5.685,4
5,F217,5.7,5.73,5
6,F225,99999.0,5.74,6
7,F201,2.4,5.835,8
8,F221,1.4,5.72,9
9,F223,1.3,5.68,10


In [41]:
gdf_viewer(boreholes, rows=3, un_val='ID', view=t)

Rows : 235, columns : 16, Unique col 'ID': 235


interactive(children=(IntSlider(value=3, description='rows', max=235, min=3, readout=False), IntSlider(value=1…

####  $\color{red}{\textbf{Save final Boreholes data}}$

In [42]:
# exist_ok=True creates the folder only when needed, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(save_dir, exist_ok=True)

boreholes.to_csv(os.path.join(save_dir, save_file), index=False)

# Piezometers

Some corrections todo in 'data organization':

In [43]:
key='Piezometer'
save_file = f'Merged_Piezometers.csv'
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus'] #columns of interest
piezometers = pd.DataFrame()
print(len(files_dict[key]), 'files')

17 files


In [44]:
file1= files_dict[key][0]
file2= files_dict[key][1]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : Liste_XY/Sol_Eau_Piezometers.csv 
df2 : Memoris_seafile/Result_eau_Piezometers.csv

df1 - Position data
df2 - Position data
Rows : 257, columns : 6, Unique col 'ID': 254


interactive(children=(IntSlider(value=3, description='rows', max=257, min=3, readout=False), IntSlider(value=6…

Rows : 30, columns : 9, Unique col 'ID': 30


interactive(children=(IntSlider(value=3, description='rows', max=30, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [45]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], col='ID', dist_max=1, col_non_na=1)
check_col(mdf)

##### check and validate duplicate objects
- The function "gdf_filter()" doesn't work in some cases, so we use the function "double_objects_check()"
- here we have objects with the same name but different positions

In [46]:
mdf, check = gdf_filter(mdf, position=True, id_col='ID', expression='sup|prof', dist_crit=1, drop=True, rapp_val=1)

same objects at indices:[1, 8, 10, 12, 113, 256], will be dropped if drop is set True!
Rows : 280 ; Columns : 12 ; Unique on 'ID' : 274 ; 


In [47]:
double_objects_check(mdf)

Unnamed: 0,ID,X,Y,Type_ech,Zsol,Nappe,Equip_top,Equip_base,Type,Long_pz_sol,Long_for,Type_equip
1,502,152365.0,122855.0,Eau,,Socle,,,Piezo,,,
2,502,152366.396,122857.132,Eau,,Alluvions,,,Piezo,,,
24,531,152958.0,122711.0,Eau,,remblais,,,Piezo,,,
25,531,152957.86,122709.637,Eau,,Alluvions,,,Piezo,,,
29,541,153138.0,122591.0,Eau,,remblais,,,Piezo,,,
30,541,153135.498,122590.856,Eau,,Alluvions,,,Piezo,,,
40,P12,153021.0,122640.0,Eau,102.227643,remblais,0.5,3.5,Piezo,4.03,4.8,Crepine
277,P12,152877.815915,122573.902564,Eau,102.227643,remblais,0.5,3.5,Piezo,4.03,4.8,Crepine
42,P22,152941.0,122615.0,Eau,102.349,remblais,2.5,4.5,Piezo,4.4,4.8,Crepine
278,P22,152881.59,122578.837,Eau,102.349,remblais,2.5,4.5,Piezo,4.4,4.8,Crepine


In [48]:
drop_id = [2,25,30] # objects are seemingly the same, but is it possible to get 2 objects so close (~ 1m)?
mdf.drop(index=drop_id, inplace=True)
mdf.reset_index(drop=True, inplace=True)

In [49]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

Rows : 277, columns : 12, Unique col 'ID': 274


interactive(children=(IntSlider(value=3, description='rows', max=277, min=3, readout=False), IntSlider(value=1…

##### Piezometers merging 

In [50]:
piezometers = mdf.copy() #saving

In [51]:
file1= files_dict[key][2]
file2= files_dict[key][3]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : Phase_1_Memoris/Result_eau_Piezometers.csv 
df2 : Phase_2_Memoris/Result_eau_Piezometers.csv

df1 - Position data
df2 - Position data
Rows : 14, columns : 10, Unique col 'ID': 14


interactive(children=(IntSlider(value=3, description='rows', max=14, min=3, readout=False), IntSlider(value=10…

Rows : 10, columns : 9, Unique col 'ID': 10


interactive(children=(IntSlider(value=3, description='rows', max=10, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [52]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], col='ID', dist_max=2, col_non_na=1)
check_col(mdf)

In [53]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=f)

Rows : 24, columns : 11, Unique col 'ID': 24


In [54]:
piezometers, err_df=gdf_merger(piezometers, mdf, how=how[1], col='ID', dist_max=1, col_non_na=1)
# Check the frame that was just produced by the merge (not the intermediate `mdf`).
check_col(piezometers)

In [55]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

Rows : 294, columns : 13, Unique col 'ID': 289


interactive(children=(IntSlider(value=3, description='rows', max=294, min=3, readout=False), IntSlider(value=1…

In [56]:
double_objects_check(piezometers)

Unnamed: 0,ID,X,Y,Type_ech,Zsol,Nappe,Long_pz,Equip_top,Equip_base,Type,Long_pz_sol,Long_for,Type_equip
37,P12,153021.0,122640.0,Eau,102.227643,remblais,,0.5,3.5,Piezo,4.03,4.8,Crepine
38,P12,152877.815915,122573.902564,Eau,102.227643,remblais,,0.5,3.5,Piezo,4.03,4.8,Crepine
40,P22,152941.0,122615.0,Eau,102.349,remblais,,2.5,4.5,Piezo,4.4,4.8,Crepine
41,P22,152881.59,122578.837,Eau,102.349,remblais,,2.5,4.5,Piezo,4.4,4.8,Crepine
292,P22,152881.59,122578.837,Eau,102.349,remblais,,2.5,4.5,Piezo,4.4,4.8,
42,P25,152997.0,122604.0,Eau,102.369,remblais,,2.8,4.8,Piezo,4.6,6.0,Crepine
43,P25,152872.429,122579.02,Eau,102.369,remblais,,2.8,4.8,Piezo,4.6,6.0,Crepine
293,P25,152872.429,122579.02,Eau,102.369,remblais,,2.8,4.8,Piezo,4.6,6.0,


In [57]:
drop_id = [292, 293]
piezometers.drop(index=drop_id, inplace=True)
gdf_viewer(piezometers, rows=5, un_val='ID', view=f)

Rows : 292, columns : 13, Unique col 'ID': 289


In [58]:
file1= files_dict[key][4]
file2= files_dict[key][5]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : database_Memoris3/Donnees_piezos_Piezometers.csv 
df2 : database_Memoris3/Drains_Pz_ENEL_Piezometers.csv

df1 - Position data
df2 - Position data
Rows : 117, columns : 13, Unique col 'ID': 117


interactive(children=(IntSlider(value=3, description='rows', max=117, min=3, readout=False), IntSlider(value=1…

Rows : 6, columns : 6, Unique col 'ID': 6


interactive(children=(IntSlider(value=3, description='rows', max=6, min=3, readout=False), IntSlider(value=6, …

(None, None)

In [59]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], col='ID', dist_max=2, col_non_na=1)
check_col(mdf)


Columns dropped :['Ht_Chbre']



In [60]:
piezometers, err_df=gdf_merger(piezometers, mdf, how=how[1], col='ID', dist_max=1, col_non_na=1)
# Check the frame that was just produced by the merge (not the intermediate `mdf`).
check_col(piezometers)

In [61]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

Rows : 390, columns : 19, Unique col 'ID': 381


interactive(children=(IntSlider(value=3, description='rows', max=390, min=3, readout=False), IntSlider(value=1…

In [62]:
piezometers, check = gdf_filter(piezometers, position=True, id_col='ID', expression='sup|prof', dist_crit=1, drop=True)
#gdf_viewer(piezometers, rows=5, un_val='ID', view=t)

same objects at indices:[355, 356, 360, 362, 365, 370, 372, 386], will be dropped if drop is set True!
Rows : 382 ; Columns : 19 ; Unique on 'ID' : 374 ; 


In [63]:
double_objects_check(piezometers)

Unnamed: 0,ID,X,Y,Z,Zsol,Type_ech,Equip_top,Societe,Zone,Diam_int_pz,Equip_base,Long_crep,Long_pz_sol,Long_for,Sous_zone,Type_equip,Nappe,Type,Long_pz
37,P12,153021.0,122640.0,,102.227643,Eau,0.5,,,,3.5,,4.03,4.8,,Crepine,remblais,Piezo,
38,P12,152877.815915,122573.902564,,102.227643,Eau,0.5,,,,3.5,,4.03,4.8,,Crepine,remblais,Piezo,
40,P22,152941.0,122615.0,,102.349,Eau,2.5,,,,4.5,,4.4,4.8,,Crepine,remblais,Piezo,
41,P22,152881.59,122578.837,,102.349,Eau,2.5,,,,4.5,,4.4,4.8,,Crepine,remblais,Piezo,
42,P25,152997.0,122604.0,,102.369,Eau,2.8,,,,4.8,,4.6,6.0,,Crepine,remblais,Piezo,
43,P25,152872.429,122579.02,,102.369,Eau,2.8,,,,4.8,,4.6,6.0,,Crepine,remblais,Piezo,
2,508,152467.0,122850.0,105.44,,Eau,,SITEREM,Dépôts sidérurgiques et de lagunage,51.0,,3.0,,,Dépôt de stériles,,Socle,Piezo,
377,508,152467.56,122850.9,105.44,104.93,Eau,,SITEREM,Dépôts sidérurgiques et de lagunage,51.0,,3.0,,,Dépôt de stériles,,Socle,Piezo,15.59
4,FP15,152961.0,122631.0,103.122,,Eau,,SITEREM,Cokerie de Marchienne,41.0,,4.0,,,"Reservoirs à fuel, usine à benzol",,Socle,Piezo,
378,FP15,152961.883,122631.939,103.122,102.35,Eau,,SITEREM,Cokerie de Marchienne,41.0,,4.0,,,"Reservoirs à fuel, usine à benzol",,Socle,Piezo,19.228


In [64]:
drop_id = [2,4,30,94,106]
piezometers.drop(index=drop_id, inplace=True)
gdf_viewer(piezometers, rows=5, un_val='ID', view=f)

Rows : 377, columns : 19, Unique col 'ID': 374


In [65]:
file1= files_dict[key][6]
file2= files_dict[key][9]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : database_Memoris3/Profils_sol_Piezometers.csv 
df2 : donnees_terrain_2019/Donnees_forage_Piezometers.csv

df1 - No position data
df2 - Position data
Rows : 111, columns : 6, Unique col 'ID': 111


interactive(children=(IntSlider(value=3, description='rows', max=111, min=3, readout=False), IntSlider(value=6…

Rows : 3, columns : 18, Unique col 'ID': 3


interactive(children=(IntSlider(value=3, description='rows', max=3, min=3, readout=False), IntSlider(value=12,…

(None, None)

In [66]:
df2['ID'] = df2.ID.astype('object')

In [67]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], col='ID', dist_max=2, col_non_na=1)
check_col(mdf)


Columns dropped :['Z', 'Type_refus', 'Refus']



In [68]:
piezometers, err_df=gdf_merger(piezometers, mdf, how=how[1], col='ID', dist_max=1, col_non_na=1)
# Check the frame that was just produced by the merge (not the intermediate `mdf`).
check_col(piezometers)

In [69]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

Rows : 489, columns : 27, Unique col 'ID': 486


interactive(children=(IntSlider(value=3, description='rows', max=489, min=3, readout=False), IntSlider(value=1…

In [70]:
file1= files_dict[key][10]
file2= files_dict[key][11]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : donnees_terrain_2019/Equipement_Piezometers.csv 
df2 : donnees_terrain_2019/Log_Piezometers.csv

df1 - No position data
df2 - No position data
Rows : 9, columns : 5, Unique col 'ID': 9


interactive(children=(IntSlider(value=3, description='rows', max=9, min=3, readout=False), IntSlider(value=5, …

Rows : 10, columns : 9, Unique col 'ID': 3


interactive(children=(IntSlider(value=3, description='rows', max=10, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [71]:
df1['ID'] = df1.ID.astype('object')

In [72]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], col='ID', dist_max=2, col_non_na=1)
check_col(mdf)

In [73]:
piezometers, err_df=gdf_merger(piezometers, mdf, how=how[1], col='ID', dist_max=1, col_non_na=1)
# Check the frame that was just produced by the merge (not the intermediate `mdf`).
check_col(piezometers)

Ambiguous values in both columns compared, change it manually !
Columns ['Diam_ext_pz_x', 'Diam_ext_pz_y'] must be dropped manually !


In [74]:
err_df

Unnamed: 0,ID,Diam_ext_pz_x,Diam_ext_pz_y,Source_index
0,50,25.0,45.0,486
1,51,25.0,45.0,487
2,52,25.0,45.0,488


In [75]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

Rows : 505, columns : 31, Unique col 'ID': 495


interactive(children=(IntSlider(value=3, description='rows', max=505, min=3, readout=False), IntSlider(value=1…

In [76]:
file1= files_dict[key][12]
file2= files_dict[key][13]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : profils_sols_donnees_forages/Equipement_Piezometers.csv 
df2 : profils_sols_donnees_forages/donnees_forage_Piezometers.csv

df1 - No position data
df2 - Position data
Rows : 12, columns : 5, Unique col 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=12, min=3, readout=False), IntSlider(value=5,…

Rows : 12, columns : 18, Unique col 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=12, min=3, readout=False), IntSlider(value=12…

(None, None)

In [77]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], col='ID', dist_max=2, col_non_na=1)
check_col(mdf)

Ambiguous values in both columns compared, change it manually !
Columns ['Diam_for_x', 'Diam_for_y', 'Diam_ext_pz_x', 'Diam_ext_pz_y'] must be dropped manually !


In [78]:
err_df

Unnamed: 0,ID,Diam_for_x,Diam_for_y,Diam_ext_pz_x,Diam_ext_pz_y,Source_index
0,F14M,250,75,50,53.0,8
1,F15bM,108,75,50,53.0,9
2,F16M,108,75,50,53.0,10
3,F17dM,108,75,50,53.0,11
4,F2M,75,75,45,53.0,0
5,F3M,75,75,45,53.0,1
6,F4M,75,75,45,53.0,2
7,F5M,75,75,45,53.0,3
8,F6M,75,75,45,53.0,4
9,F11M,75,75,45,53.0,5


In [79]:
piezometers, err_df=gdf_merger(piezometers, mdf, how=how[1], col='ID', dist_max=1, col_non_na=1)
# Check the frame that was just produced by the merge (not the intermediate `mdf`).
check_col(piezometers)

In [80]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

Rows : 517, columns : 35, Unique col 'ID': 507


interactive(children=(IntSlider(value=3, description='rows', max=517, min=3, readout=False), IntSlider(value=1…

In [81]:
file1= files_dict[key][14]
file2= files_dict[key][15]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : profils_sols_donnees_forages/piezometrie_Piezometers.csv 
df2 : vUmons_logsFor/Analyse_eau_Phases1&2_Piezometers.csv

df1 - No position data
df2 - Position data
Rows : 11, columns : 8, Unique col 'ID': 11


interactive(children=(IntSlider(value=3, description='rows', max=11, min=3, readout=False), IntSlider(value=8,…

Rows : 29, columns : 7, Unique col 'ID': 29


interactive(children=(IntSlider(value=3, description='rows', max=29, min=3, readout=False), IntSlider(value=7,…

(None, None)

In [82]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], col='ID', dist_max=2, col_non_na=1)
check_col(mdf)

In [83]:
piezometers, err_df=gdf_merger(piezometers, mdf, how=how[1], col='ID', dist_max=1, col_non_na=1)
# Check the frame that was just produced by the merge (not the intermediate `mdf`).
check_col(piezometers)

Ambiguous values in both columns compared, change it manually !
Columns ['Long_pz_x', 'Long_pz_y'] must be dropped manually !


In [84]:
err_df

Unnamed: 0,ID,Long_pz_x,Long_pz_y,Source_index
0,F3M,3.3,2.98,506
1,F13M,3.5,4.04,512
2,F15bM,4.0,4.67,514
3,F16M,4.8,4.85,515
4,F17dM,3.6,3.97,516


In [85]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

Rows : 525, columns : 38, Unique col 'ID': 513


interactive(children=(IntSlider(value=3, description='rows', max=525, min=3, readout=False), IntSlider(value=1…

In [86]:
file1= files_dict[key][16]
df1 = pd.read_csv(file1, delimiter=',')

print(f"df1 : {file1.replace(work_dir,'')}")
gdf_viewer(df1, rows=3, un_val='ID', view=t)

df1 : vUmons_logsFor/Analyse_sol_Phases1&2_Piezometers.csv
Rows : 59, columns : 7, Unique col 'ID': 32


interactive(children=(IntSlider(value=3, description='rows', max=59, min=3, readout=False), IntSlider(value=7,…

#### Last merging

In [87]:
piezometers, err_df=gdf_merger(piezometers, df1, how=how[1], col='ID', dist_max=1, col_non_na=1)
# Check the frame that was just produced by the merge (not the stale `mdf`).
check_col(piezometers)

Ambiguous values in both columns compared, change it manually !
Columns ['Long_for_x', 'Long_for_y'] must be dropped manually !


In [88]:
err_df

Unnamed: 0,ID,Long_for_x,Long_for_y,Source_index
0,P14M,3.2,2.8,311


In [89]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

Rows : 572, columns : 38, Unique col 'ID': 519


interactive(children=(IntSlider(value=3, description='rows', max=572, min=3, readout=False), IntSlider(value=1…

####  $\color{red}{\textbf{Save final Piezometers data}}$

In [90]:
# exist_ok=True creates the folder only when needed, without the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(save_dir, exist_ok=True)

piezometers.to_csv(os.path.join(save_dir, save_file), index=False)

==========================================================================================================

# Unknown facilities

In [146]:
# Settings for the "unknown facilities" section: file-group key and output file name.
key = 'facility'
save_file = 'Merged_Facilites_unknw.csv'
# coi = ['ID', 'X', 'Y', 'Z', 'Litho_top', 'Litho_base', 'Description']  # columns of interest

# Start from an empty table and report how many source files belong to this group.
facilities = pd.DataFrame()
print(f"{len(files_dict[key])} files")

4 files


In [147]:
# Pick the two source files of the current `key` group to combine first.
file1 = files_dict[key][0]
file2 = files_dict[key][3]

df1, df2 = create_df(file1, file2)

# One viewer call per statement: the original wrapped both calls in a tuple, which made
# the cell finish with a meaningless `(None, None)` result.
gdf_viewer(df1, rows=3, un_val='ID', view=t)
gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : database_Memoris3/Donnees_piezos_Unkown-facility.csv 
df2 : database_Memoris3/Result_sol_Unkown-facility.csv

df1 - Position data
df2 - Position data
Rows : 13, columns : 6, Unique col 'ID': 13


interactive(children=(IntSlider(value=3, description='rows', max=13, min=3, readout=False), IntSlider(value=6,…

Rows : 13, columns : 8, Unique col 'ID': 13


interactive(children=(IntSlider(value=3, description='rows', max=13, min=3, readout=False), IntSlider(value=8,…

(None, None)

In [148]:
# Merge the two loaded frames on 'ID'; `err_df` receives the second returned value.
mdf, err_df = gdf_merger(
    df1,
    df2,
    how=how[1],
    col='ID',
    col_non_na=1,
)
# Raises DoubledColumns if '_x' / '_y' suffixed columns are still in the result.
check_col(mdf)

In [149]:
# Keep an independent copy of the merged frame as the running `facilities` table.
facilities = mdf.copy() #saving

In [150]:
# Read file #1 of the current `key` group into `df1`.
file1 = files_dict[key][1]
df1 = pd.read_csv(file1, sep=',')

# Print the path relative to `work_dir`, then open the viewer on the new frame.
print("df1 : {}".format(file1.replace(work_dir, '')))
gdf_viewer(df1, rows=3, un_val='ID', view=t)

df1 : database_Memoris3/Drains_Pz_ENEL_Unkown-facility.csv
Rows : 12, columns : 6, Unique col 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=12, min=3, readout=False), IntSlider(value=6,…

In [151]:
# Fold `df1` into the running `facilities` table;
# `err_df` receives the second value returned by gdf_merger.
facilities, err_df = gdf_merger(facilities, df1, how=how[1], col='ID', dist_max=1, col_non_na=1)
# Validate the merge result itself (the original re-checked the older `mdf`):
# check_col raises DoubledColumns when '_x' / '_y' suffixed columns are still present.
check_col(facilities)

In [152]:
# Browse the merged `facilities` table (`t` is defined earlier in the notebook).
gdf_viewer(facilities, rows=3, un_val='ID', view=t)

Rows : 25, columns : 10, Unique col 'ID': 25


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=10…

####  $\color{red}{\textbf{Save final Unknown Facilities data}}$

In [153]:
# Make sure the output folder exists; exist_ok avoids the separate existence check.
os.makedirs(save_dir, exist_ok=True)

# Build the target path with os.path.join so the file lands inside `save_dir`
# whether or not `save_dir` ends with a path separator.
facilities.to_csv(os.path.join(save_dir, save_file), index=False)

# Lithologies

Do not pass the `dist_max` parameter when merging without considering position; otherwise, useless rows are added!

In [128]:
# Settings for the lithology section: file-group key, output file name, columns of interest.
key = 'Litho'
save_file = 'Merged_Lithologies.csv'
coi = ['ID', 'X', 'Y', 'Z', 'Litho_top', 'Litho_base', 'Description']

# Start from an empty table and report how many source files belong to this group.
lithologies = pd.DataFrame()
print(f"{len(files_dict[key])} files")

7 files


In [129]:
# Pick the two source files of the current `key` group to combine first.
file1 = files_dict[key][0]
file2 = files_dict[key][3]

df1, df2 = create_df(file1, file2)

# One viewer call per statement: the original wrapped both calls in a tuple, which made
# the cell finish with a meaningless `(None, None)` result.
gdf_viewer(df1, rows=3, un_val='ID', view=t)
gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : database_Memoris3/Drains_Pz_ENEL_Lithology.csv 
df2 : donnees_terrain_2019/Log_Lithology.csv

df1 - No position data
df2 - No position data
Rows : 1626, columns : 10, Unique col 'ID': 298


interactive(children=(IntSlider(value=3, description='rows', max=1626, min=3, readout=False), IntSlider(value=…

Rows : 86, columns : 7, Unique col 'ID': 24


interactive(children=(IntSlider(value=3, description='rows', max=86, min=3, readout=False), IntSlider(value=7,…

(None, None)

In [131]:
# Merge the two loaded frames on 'ID'; `err_df` receives the second returned value.
mdf, err_df = gdf_merger(
    df1,
    df2,
    how=how[1],
    col='ID',
    col_non_na=1,
)
# Raises DoubledColumns if '_x' / '_y' suffixed columns are still in the result.
check_col(mdf)

Ambiguous values in both columns compared, change it manually !
Columns ['Litho_top_x', 'Litho_top_y', 'Litho_base_x', 'Litho_base_y'] must be dropped manually !


In [133]:
# Browse the merged frame `mdf` (`t` is defined earlier in the notebook).
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

Rows : 1818, columns : 12, Unique col 'ID': 314


interactive(children=(IntSlider(value=3, description='rows', max=1818, min=3, readout=False), IntSlider(value=…

In [112]:
# Browse `err_df`, the second value returned by the last gdf_merger call.
# Alternative quick look at the affected IDs: err_df.ID.unique()
gdf_viewer(err_df, rows=5, un_val='ID', view=t) #err_df.ID.unique()

Rows : 180, columns : 6, Unique col 'ID': 8


interactive(children=(IntSlider(value=5, description='rows', max=180, min=5, readout=False), IntSlider(value=6…

In [132]:
# Plain-pandas merges of df1/df2, used to cross-check row counts.
common_cols = list(set(df1.columns) & set(df2.columns))

# IDs present in both frames.
test1 = df1.merge(df2, how='inner', on='ID')

# IDs present in only one of the two frames (per the merge indicator column).
test2 = (
    df1.merge(df2, how='outer', on='ID', indicator=True)
    .query('_merge =="right_only" or _merge=="left_only"')
)

# Recombine both subsets on 'ID', and separately merge on every shared column.
test3 = test1.merge(test2, how='outer', on='ID')
test4 = df1.merge(df2, how='outer', on=common_cols)

print(tuple(len(frame) for frame in (test1, test2, test3)))
gdf_viewer(test4)

(180, 1638, 1818)
Rows : 1712, columns : 12


interactive(children=(IntSlider(value=10, description='rows', max=1712, min=10, readout=False), IntSlider(valu…

In [113]:
# Keep an independent copy of the merged frame as the running `lithologies` table.
lithologies = mdf.copy() #saving

In [114]:
# Pick the next two source files of the current `key` group.
file1 = files_dict[key][2]
file2 = files_dict[key][4]

df1, df2 = create_df(file1, file2)

# One viewer call per statement: the original wrapped both calls in a tuple, which made
# the cell finish with a meaningless `(None, None)` result.
gdf_viewer(df1, rows=3, un_val='ID', view=t)
gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : database_Memoris3/Result_sol_Lithology.csv 
df2 : profils_sols_donnees_forages/Log_Lithology.csv

df1 - Position data
df2 - No position data
Rows : 1423, columns : 14, Unique col 'ID': 330


interactive(children=(IntSlider(value=3, description='rows', max=1423, min=3, readout=False), IntSlider(value=…

Rows : 54, columns : 6, Unique col 'ID': 24


interactive(children=(IntSlider(value=3, description='rows', max=54, min=3, readout=False), IntSlider(value=6,…

(None, None)

In [115]:
# Merge the two loaded frames on 'ID'; `err_df` receives the second returned value.
mdf, err_df = gdf_merger(
    df1,
    df2,
    how=how[1],
    col='ID',
    col_non_na=1,
)
# Raises DoubledColumns if '_x' / '_y' suffixed columns are still in the result.
check_col(mdf)


Columns dropped :['Type']



In [116]:
# Browse the merged frame `mdf` (`t` is defined earlier in the notebook).
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

Rows : 1477, columns : 17, Unique col 'ID': 354


interactive(children=(IntSlider(value=3, description='rows', max=1477, min=3, readout=False), IntSlider(value=…

##### Lithologies merging 

In [117]:
# Fold the intermediate frame `mdf` into the running `lithologies` table;
# `err_df` receives the second value returned by gdf_merger.
lithologies, err_df = gdf_merger(lithologies, mdf, how=how[1], col='ID', col_non_na=1)
# Validate the merge result itself (the original re-checked the input `mdf`):
# check_col raises DoubledColumns when '_x' / '_y' suffixed columns are still present.
check_col(lithologies)

In [118]:
# Browse the merged `lithologies` table (`t` is defined earlier in the notebook).
gdf_viewer(lithologies, rows=10, cols=15, un_val='ID', view=t)

Rows : 8289, columns : 21, Unique col 'ID': 444


interactive(children=(IntSlider(value=10, description='rows', max=8289, min=10, readout=False), IntSlider(valu…

In [135]:
# Pick the last two source files of the current `key` group.
file1 = files_dict[key][5]
file2 = files_dict[key][6]

df1, df2 = create_df(file1, file2)

# One viewer call per statement: the original wrapped both calls in a tuple, which made
# the cell finish with a meaningless `(None, None)` result.
gdf_viewer(df1, rows=3, un_val='ID', view=t)
gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : vUmons_logsFor/Sond2017v2_Lithology.csv 
df2 : vUmons_logsFor/Synthese_Lithology.csv

df1 - Position data
df2 - Position data
Rows : 109, columns : 7, Unique col 'ID': 71


interactive(children=(IntSlider(value=3, description='rows', max=109, min=3, readout=False), IntSlider(value=7…

Rows : 51, columns : 7, Unique col 'ID': 29


interactive(children=(IntSlider(value=3, description='rows', max=51, min=3, readout=False), IntSlider(value=7,…

(None, None)

In [113]:
# Inspect a plain pandas inner merge of df1/df2 on 'ID' (rows whose ID is in both frames).
gdf_viewer(df1.merge(df2, how='inner', on='ID'), rows=5, cols=15, un_val='ID', view=t)

Rows : 97, columns : 13, Unique col 'ID': 29


interactive(children=(IntSlider(value=5, description='rows', max=97, min=5, readout=False), IntSlider(value=13…

In [120]:
# Inspect a plain pandas outer merge of df1/df2 on 'ID'; indicator=True adds the
# `_merge` column telling which side each row came from.
gdf_viewer(df1.merge(df2, how = 'outer', on='ID',indicator=True), rows=5, cols=15, un_val='ID', view=t)

Rows : 155, columns : 14, Unique col 'ID': 71


interactive(children=(IntSlider(value=5, description='rows', max=155, min=5, readout=False), IntSlider(value=1…

In [115]:
# Browse the current `mdf` frame (`t` is defined earlier in the notebook).
# NOTE(review): in file order this cell comes before the cell that merges the current
# df1/df2 into `mdf` — confirm which `mdf` is meant to be shown here.
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

Rows : 155, columns : 7, Unique col 'ID': 71


interactive(children=(IntSlider(value=3, description='rows', max=155, min=3, readout=False), IntSlider(value=7…

In [104]:
# Merge the two loaded frames on 'ID' with dist_max=1; `err_df` receives the second returned value.
mdf, err_df = gdf_merger(
    df1,
    df2,
    how=how[1],
    col='ID',
    dist_max=1,
    col_non_na=1,
)
# Raises DoubledColumns if '_x' / '_y' suffixed columns are still in the result.
check_col(mdf)

Ambiguous values in both columns compared, change it manually !
Columns ['Litho_top_x', 'Litho_top_y', 'Litho_base_x', 'Litho_base_y'] must be dropped manually !


In [105]:
# Browse `err_df`, the second value returned by the last gdf_merger call.
# Alternative quick look at the affected IDs: err_df.ID.unique()
gdf_viewer(err_df, rows=5, un_val='ID', view=t) #err_df.ID.unique()

Rows : 97, columns : 6, Unique col 'ID': 29


interactive(children=(IntSlider(value=5, description='rows', max=97, min=5, readout=False), IntSlider(value=6,…

##### Lithologies merging 

In [101]:
# Fold the intermediate frame `mdf` into the running `lithologies` table;
# `err_df` receives the second value returned by gdf_merger.
lithologies, err_df = gdf_merger(lithologies, mdf, how=how[1], col='ID', dist_max=1, col_non_na=1)
# Validate the merge result itself (the original re-checked the input `mdf`):
# check_col raises DoubledColumns when '_x' / '_y' suffixed columns are still present.
check_col(lithologies)

In [102]:
# Browse the merged `lithologies` table (`t` is defined earlier in the notebook).
gdf_viewer(lithologies, rows=3, un_val='ID', view=t)

Rows : 8289, columns : 21, Unique col 'ID': 444


interactive(children=(IntSlider(value=3, description='rows', max=8289, min=3, readout=False), IntSlider(value=…

In [None]:
# Explicit halt so that "Run All" stops here with a clear message, instead of relying on
# a NameError raised by an undefined name.
raise RuntimeError("Stop here: the sections below are not ready to be executed.")

# Equipments

We must also retrieve equipment information from boreholes and piezometers.

In [None]:
# Settings for the equipment section: file-group key, output file name, columns of interest.
key = 'Equipm'
save_file = 'Merged_Equipments.csv'
coi = ['ID', 'ID_date', 'X', 'Y', 'Z', 'Type', 'Long_for', 'Diam_for', 'Refus']

# Start from an empty table and report how many source files belong to this group.
equipments = pd.DataFrame()
print(f"{len(files_dict[key])} files")

In [None]:
# Pick the first two source files of the current `key` group.
file1 = files_dict[key][0]
file2 = files_dict[key][1]

df1, df2 = create_df(file1, file2)

# One viewer call per statement: wrapping both calls in a tuple makes the cell finish
# with a meaningless tuple result instead of no output.
gdf_viewer(df1, rows=3, un_val='ID', view=t)
gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
# Merge the two loaded frames on 'ID' with dist_max=1; `err_df` receives the second returned value.
mdf, err_df = gdf_merger(
    df1,
    df2,
    how=how[1],
    col='ID',
    dist_max=1,
    col_non_na=1,
)
# Raises DoubledColumns if '_x' / '_y' suffixed columns are still in the result.
check_col(mdf)

##### check and validate duplicate objects
- The function "gdf_filter()" doesn't work in some cases, so we use the function "doubled_objects_check()"
- Here we have objects with the same name but different positions

In [None]:
# Run the project's gdf_filter helper on `mdf` and rebind `mdf` to its first result;
# `check` receives the second returned value.
# NOTE(review): `mdf` is overwritten, so re-running this cell filters an already
# filtered frame.
mdf, check = gdf_filter(mdf, position=True, id_col='ID', expression='sup|prof', dist_crit=1, drop=True, rapp_val=1)

In [None]:
# NOTE(review): the markdown above refers to "doubled_objects_check()" while this cell
# calls `double_objects_check` — confirm which name the helper is actually defined under.
double_objects_check(mdf)

In [None]:
# Row labels to remove: objects are seemingly the same, but is it possible to get
# 2 objects so close (~ 1m)?
drop_id = [2, 25, 30]

# Drop the rows and renumber the index in one chained expression instead of mutating the
# frame with inplace=True.
# NOTE: once the index is renumbered, running this cell a second time would remove
# different rows — run it exactly once per merge.
mdf = mdf.drop(index=drop_id).reset_index(drop=True)

In [None]:
# Browse `mdf` after the rows above were dropped (`t` is defined earlier in the notebook).
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

##### Equipments merging 

In [None]:
# Keep an independent copy of the merged frame as the running `equipments` table.
# The original assigned to `piezometers`, which overwrote the final piezometers table
# built earlier and left `equipments` empty.
equipments = mdf.copy()

# Samples

Some corrections still to do in 'data organization':
- file 0 and file 1 give the same result (to be checked)

# Measures

Some corrections still to do in 'data organization':
- file 0 and file 1 give the same result (to be checked)

# Analysis

Some corrections still to do in 'data organization':
- file 0 and file 1 give the same result (to be checked)