# Data Gathering

In [1]:
%matplotlib widget

In [2]:
from utils.io import gdf_viewer, gdf_merger, data_validation, gdf_filter, fix_duplicates
import re, os
import numpy as np
import geopandas as gpd
import pandas as pd
import datetime as dtm
import matplotlib.pyplot as plt
from definitions import ROOT_DIR

In [3]:
def files_search(work_dir, files_dict, prefix='', skip=None, details=False):   
    
    if skip is None:
        skip = "we don't want to skip a word"
        
    for k in files_dict.keys():
        tmp_list = []
        for p,d,f in os.walk(work_dir):
            for x in f:
                add = False
                if re.search(prefix,x,re.I) and not re.search(skip,x,re.I):
                    add = True
                    i = str(f'{p}/{x}')
                else:
                    add = False
                    i=''
                    
                if re.search(k,i,re.I) and add:
                    tmp_list.append(i)
        tmp_list.sort()
        files_dict.update({k:tmp_list})

    for k,v in files_dict.items():
        print(k,' \t: ',len(v))
    
    if details: # Look filenames
        which = files_dict.keys()

        for w in which:
            print('\n+++++++++++++++++')
            print(f'+  {w.upper()}\t+ ')
            print('+++++++++++++++++')
            [print(i, '-', x) for i, x in enumerate(files_dict[w], 0)]    


In [4]:
def create_df(file1, file2, verbose=True): # find another name for this function
    """
    create dataframes from files and test if they contain position informations
    """
    
    df1 = pd.read_csv(file1, delimiter=',')
    df2 = pd.read_csv(file2, delimiter=',')
    
    if verbose:
        if 'X' in list(df1.columns): msg1 = ' --> Coordinates'
        else: msg1 = ' --> No coordinates'

        if 'X' in list(df2.columns): msg2 = ' --> Coordinates'
        else: msg2 = ' --> No coordinates'

        print(f"df1 : {msg1} \ndf2 : {msg2}\n")
    return df1, df2

In [5]:
def dataset_overview(d, verbose=False): # check for same datasets in given files
    """d: dict
    """
    l = len(d)
    with_coord = []
    no_coord = []
    same = []
    
    for i in range(l-1):
        for j in range(i,l):
            a, b = create_df(d[i], d[j], verbose)
            if j != i:
                if a.equals(b):
                    same.append((i,j))
            
            if 'X' in list(b.columns) and j not in with_coord:
                with_coord.append(j)
            elif 'X' not in list(b.columns) and j not in no_coord:
                no_coord.append(j)
    
    print(f'Same files:{same}\nFiles with coordinates:{with_coord}\nFiles without coordinates:{no_coord}')

## Reading files

In [6]:
work_dir = ROOT_DIR+'/CF_data/Result_traitem/'
save_dir = ROOT_DIR+'/CF_data/Donnees_fusionnees/'

In [7]:
# create my dictionary structure to retrieve good files (Keynames !!!)
files_dict={'Borehole':0,'Piezometer':0,'Piezair':0,'Trench':0,'Litho':0,'Equipm':0,
        'Measure':0,'Sample':0,'Analysis':0,'Facility':0}

In [8]:
files_search(work_dir, files_dict, prefix='', skip='source')

Borehole  	:  7
Piezometer  	:  17
Piezair  	:  2
Trench  	:  1
Litho  	:  7
Equipm  	:  3
Measure  	:  6
Sample  	:  27
Analysis  	:  21
Facility  	:  4


In [9]:
how=['inner', 'outer', 'left', 'right']

In [10]:
f = False
t = True

# ================== PROCESSING ===================== 

# Boreholes

Some corrections todo in 'data organization':
- correct extraction in the file 2 -> Samples
- file 4 and file 5 are the same in result (check it)
- try to concatenate file 1 with piezo (if possible because no position)
- check processing for 'refus and 'type_refus' (every object)

In [11]:
key='Borehole'
save_file = f'Merged_Boreholes.csv'
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus', 'Societe'] #columns of interest
dataset = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

7 files


In [12]:
files_dict[key] #files_dict[key][0]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Forage_Pilote/leve_Z_elect_pos_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Prof_contact_sol_forage/Feuil1_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Profils_sol_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Donnees_forage_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Equipement_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Log_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/donnees_forage_Boreholes.csv']

In [13]:
dataset_overview(files_dict[key])

Same files:[(4, 6)]
Files with coordinates:[1, 3, 4, 6]
Files without coordinates:[0, 2, 5]


#### $\color{green}{\textbf{Read and merge}}$

In [14]:
file1= work_dir + 'profils_sols_donnees_forages/Log_Boreholes.csv' # 5
file2= work_dir + 'profils_sols_donnees_forages/Equipement_Boreholes.csv' # 4

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates 
df2 :  --> Coordinates

Rows : 24, columns : 3, Unique col 'ID': 24


interactive(children=(IntSlider(value=3, description='rows', max=24, min=3, readout=False), IntSlider(value=3,…

Rows : 13, columns : 13, Unique col 'ID': 13


interactive(children=(IntSlider(value=3, description='rows', max=13, min=3, readout=False), IntSlider(value=12…

(None, None)

In [15]:
df1.rename(columns={'Profondeur':'Long_for'}, inplace=True)
df2.rename(columns={'Profondeur':'Long_for'}, inplace=True)

In [16]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

In [17]:
gdf_viewer(mdf, rows=10, un_val='ID', view=t)

Rows : 25, columns : 13, Unique col 'ID': 25


interactive(children=(IntSlider(value=10, description='rows', max=25, min=10, readout=False), IntSlider(value=…

#### First object dataset save

In [18]:
dataset = mdf.copy() #saving

#### $\color{green}{\textbf{Read and merge}}$

In [19]:
file1= work_dir + 'database_Memoris3/Profils_sol_Boreholes.csv' # 2
file2= work_dir + 'Prof_contact_sol_forage/Feuil1_Boreholes.csv' # 1

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates 
df2 :  --> Coordinates

Rows : 172, columns : 6, Unique col 'ID': 172


interactive(children=(IntSlider(value=3, description='rows', max=172, min=3, readout=False), IntSlider(value=6…

Rows : 8, columns : 6, Unique col 'ID': 8


interactive(children=(IntSlider(value=3, description='rows', max=8, min=3, readout=False), IntSlider(value=6, …

(None, None)

In [20]:
df1.rename(columns={'Profondeur':'Long_for'}, inplace=True)
df2.rename(columns={'Profondeur':'Long_for'}, inplace=True)

In [21]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


In [22]:
gdf_viewer(mdf, rows=10, un_val='ID', view=t)

Rows : 177, columns : 10, Unique col 'ID': 177


interactive(children=(IntSlider(value=10, description='rows', max=177, min=10, readout=False), IntSlider(value…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [23]:
conflict_df

Unnamed: 0,Check_col,ID,Long_for_x,Long_for_y
136,Long_for,F205,1.4,3.2
138,Long_for,F212,5.8,3.4
142,Long_for,F217,5.7,4.2


#### Merge with object dataset

In [24]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [25]:
if 'level_0' in dataset.columns:
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [26]:
gdf_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 202, columns : 14, Unique col 'ID': 202


interactive(children=(IntSlider(value=10, description='rows', max=202, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [27]:
file1= work_dir + 'Forage_Pilote/leve_Z_elect_pos_Boreholes.csv' # 0
file2= work_dir + 'donnees_terrain_2019/Donnees_forage_Boreholes.csv' # 3

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates 
df2 :  --> Coordinates

Rows : 72, columns : 5, Unique col 'ID': 72


interactive(children=(IntSlider(value=3, description='rows', max=72, min=3, readout=False), IntSlider(value=5,…

Rows : 16, columns : 18, Unique col 'ID': 16


interactive(children=(IntSlider(value=3, description='rows', max=16, min=3, readout=False), IntSlider(value=12…

(None, None)

In [28]:
df2.ID=df2.ID.apply(lambda x: 'F'+x) # name recent (2019) boreholes

In [29]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


In [30]:
gdf_viewer(mdf, rows=10, un_val='ID', view=t)

Rows : 74, columns : 20, Unique col 'ID': 74


interactive(children=(IntSlider(value=10, description='rows', max=74, min=10, readout=False), IntSlider(value=…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [31]:
conflict_df

Unnamed: 0,Check_col,ID,Long_for_x,Long_for_y
45,Long_for,F201,5.835,2.4
49,Long_for,F205,5.84,4.8
51,Long_for,F207,5.79,4.8
52,Long_for,F208,5.77,4.8
56,Long_for,F212,5.675,4.8
58,Long_for,F214,5.685,4.8
61,Long_for,F217,5.73,4.8
63,Long_for,F219,5.63,1.5
64,Long_for,F220,5.655,0.5
65,Long_for,F221,5.72,1.4


#### Merge with object dataset

In [32]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


In [33]:
if 'level_0' in dataset.columns:
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [34]:
gdf_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 235, columns : 22, Unique col 'ID': 235


interactive(children=(IntSlider(value=10, description='rows', max=235, min=10, readout=False), IntSlider(value…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [35]:
conflict_df

Unnamed: 0,Check_col,ID,index_x,index_y,Long_for_x,Long_for_y,ID_date_x,ID_date_y,Date_ouv_x,Date_ouv_y,Societe_x,Societe_y
33,index,F10,33.0,0.0,7.0,,2010-F10,,2010-03-15,,SITEREM,
34,index,F11,34.0,1.0,6.0,,2010-F11,,2010-03-16,,SITEREM,
36,index,F13,36.0,3.0,8.4,,2010-F13,,2010-03-16,,SITEREM,
38,index,F23,38.0,10.0,6.8,,2010-F23,,2010-03-18,,SBS Environnement,
39,index,F24,39.0,11.0,6.8,,2010-F24,,2010-03-18,,SBS Environnement,
54,index,F41,54.0,13.0,6.8,,2010-F41,,2010-03-12,,SBS Environnement,
97,"Long_for, index",F100,97.0,14.0,5.5,5.905,2010-F100,,2010-02-11,,SBS Environnement,
98,"Long_for, index",F101,98.0,15.0,2.0,5.775,2010-F101,,2010-02-11,,SBS Environnement,
99,"Long_for, index",F102,99.0,16.0,6.9,5.945,2010-F102,,2010-02-23,,SBS Environnement,
100,"Long_for, index",F103,100.0,17.0,8.1,5.725,2010-F103,,2010-02-23,,SBS Environnement,


In [36]:
gdf_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 235, columns : 22, Unique col 'ID': 235


interactive(children=(IntSlider(value=3, description='rows', max=235, min=3, readout=False), IntSlider(value=1…

####  $\color{red}{\textbf{Save final object dataset}}$

In [37]:
if 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)

In [38]:
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
dataset.to_csv(save_dir + save_file, index=False)

# Piezometers

In [39]:
key='Piezometer'
save_file = f'Merged_Piezometers.csv'
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus'] #columns of interest
dataset = pd.DataFrame()
print(len(files_dict[key]), 'files')

17 files


In [40]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Liste_XY/Sol_Eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Memoris_seafile/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Donnees_piezos_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Drains_Pz_ENEL_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Profils_sol_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Result_sol_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Donnees_forage_

In [41]:
dataset_overview(files_dict[key])

Same files:[(7, 8)]
Files with coordinates:[0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 15, 16]
Files without coordinates:[6, 10, 11, 12, 14]


#### $\color{green}{\textbf{Read and merge}}$

In [42]:
file1= work_dir + 'Phase_1_Memoris/Result_eau_Piezometers.csv' # 2
file2= work_dir + 'Memoris_seafile/Result_eau_Piezometers.csv' # 1  


df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates 
df2 :  --> Coordinates

Rows : 14, columns : 10, Unique col 'ID': 14


interactive(children=(IntSlider(value=3, description='rows', max=14, min=3, readout=False), IntSlider(value=10…

Rows : 30, columns : 9, Unique col 'ID': 30


interactive(children=(IntSlider(value=3, description='rows', max=30, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [43]:
fix_duplicates(df1, df2, drop_old_id=True)

14 duplicate objects fixed!


In [44]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### First object dataset save

In [45]:
dataset = mdf.copy() #saving

#### $\color{green}{\textbf{Read and merge}}$

In [46]:
file1= work_dir + 'database_Memoris3/Donnees_piezos_Piezometers.csv' # 4
file2= work_dir + 'Liste_XY/Sol_Eau_Piezometers.csv' # 0  


df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates 
df2 :  --> Coordinates

Rows : 117, columns : 13, Unique col 'ID': 117


interactive(children=(IntSlider(value=3, description='rows', max=117, min=3, readout=False), IntSlider(value=1…

Rows : 257, columns : 6, Unique col 'ID': 254


interactive(children=(IntSlider(value=3, description='rows', max=257, min=3, readout=False), IntSlider(value=6…

(None, None)

In [47]:
df1, check = gdf_filter(df1, position=True, id_col='ID', expression='sup|prof', dist_crit=1, drop=True, drop_old_id=True)

same objects at indices:[65, 67, 72, 74, 94, 102, 108, 110, 113, 114, 116], will be dropped if drop is set True!
Rows : 106 ; Columns : 13 ; Unique on 'ID' : 106 ; 


In [48]:
df2, check = gdf_filter(df2, position=True, id_col='ID', expression='sup|prof', dist_crit=1, drop=True)

same objects at indices:[1, 8, 10, 12, 113, 256], will be dropped if drop is set True!
Rows : 251 ; Columns : 6 ; Unique on 'ID' : 245 ; 


In [49]:
fix_duplicates(df1, df2)

3 duplicate objects fixed!


In [50]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [51]:
conflict_df

Unnamed: 0,Check_col,ID,Nappe_x,Nappe_y
8,Nappe,59,Remblai_All,remblais
65,Nappe,FP14,All_limoneuses_graveleuses,Alluvions
72,Nappe,501,Remblai_All,remblais
74,Nappe,509,All_limoneuse,Alluvions
75,Nappe,510,All_limoneuse,Alluvions
76,Nappe,511,Remblai_All,remblais
77,Nappe,504,All_limoneuse,Alluvions
78,Nappe,513,Remblai_All,remblais
80,Nappe,528,Remblai_All,remblais
82,Nappe,522,Remblai_All,remblais


In [52]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Nappe_y':list(conflict_df.index)[:18]})

Validation done, but conflicts remain!


In [53]:
conflict_df

Unnamed: 0,Check_col,ID,Nappe_x,Nappe_y
103,Nappe,502,Socle,Alluvions
104,Nappe,512,Remblai_All,Socle
106,Nappe,595,Socle,Alluvions


#### Merge with object dataset

In [54]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [55]:
if 'level_0' in dataset.columns:
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [56]:
gdf_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 347, columns : 20, Unique col 'ID': 341


interactive(children=(IntSlider(value=10, description='rows', max=347, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [57]:
a, b = 3, 5
file1= files_dict[key][a]
file2= files_dict[key][b]
print(files_dict[key][a].replace(work_dir,""),'||', files_dict[key][b].replace(work_dir,""))

Phase_2_Memoris/Result_eau_Piezometers.csv || database_Memoris3/Drains_Pz_ENEL_Piezometers.csv


In [58]:
file1= work_dir + 'Phase_2_Memoris/Result_eau_Piezometers.csv' # 3
file2= work_dir + 'database_Memoris3/Drains_Pz_ENEL_Piezometers.csv' # 5  


df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates 
df2 :  --> Coordinates

Rows : 10, columns : 9, Unique col 'ID': 10


interactive(children=(IntSlider(value=3, description='rows', max=10, min=3, readout=False), IntSlider(value=9,…

Rows : 6, columns : 6, Unique col 'ID': 6


interactive(children=(IntSlider(value=3, description='rows', max=6, min=3, readout=False), IntSlider(value=6, …

(None, None)

In [59]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [60]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [61]:
if 'level_0' in dataset.columns:
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [62]:
gdf_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 356, columns : 21, Unique col 'ID': 350


interactive(children=(IntSlider(value=10, description='rows', max=356, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [63]:
a, b = 9, 7
file1= files_dict[key][a]
file2= files_dict[key][b]
print(files_dict[key][a].replace(work_dir,""),'||', files_dict[key][b].replace(work_dir,""))

donnees_terrain_2019/Donnees_forage_Piezometers.csv || database_Memoris3/Result_eau_Piezometers.csv


In [64]:
#file1= work_dir + 'database_Memoris3/Donnees_piezos_Piezometers.csv' # 4
#file2= work_dir + 'Liste_XY/Sol_Eau_Piezometers.csv' # 0  


df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates 
df2 :  --> Coordinates

Rows : 3, columns : 18, Unique col 'ID': 3


interactive(children=(IntSlider(value=3, description='rows', max=3, min=3, readout=False), IntSlider(value=12,…

Rows : 117, columns : 13, Unique col 'ID': 117


interactive(children=(IntSlider(value=3, description='rows', max=117, min=3, readout=False), IntSlider(value=1…

(None, None)

In [65]:
df2.ID = df2.ID.astype(str)

In [69]:
df2.query('ID != ID')

Unnamed: 0,ID,Societe,Zone,Sous_zone,X,Y,Zsol,Z,Type,Long_pz,Long_crep,Diam_int_pz,Nappe


In [66]:
df2, check = gdf_filter(df1, position=True, id_col='ID', expression='sup|prof', dist_crit=1, drop=True, drop_old_id=True)

AttributeError: Can only use .str accessor with string values!

In [None]:
df2

In [None]:
fix_duplicates(df1, df2)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
conflict_df

In [None]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Nappe_y':list(conflict_df.index)[:18]})

In [None]:
conflict_df

#### Merge with object dataset

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [None]:
if 'level_0' in dataset.columns:
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
gdf_viewer(dataset, rows=10, un_val='ID', view=t)

In [None]:
pause

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
conflict_df # i think there are not the same, but no date or postition to distinguish them !
# --> check boreholes sheets (pdf)

In [None]:
gdf_viewer(dataset, rows=3, un_val='ID', view=t)

####  $\color{red}{\textbf{Save final object dataset}}$

In [None]:
if 'index' in piezometers.columns:
    piezometers.drop(columns='index', inplace=True)

In [None]:
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
piezometers.to_csv(save_dir + save_file, index=False)

###  ------------------------------------- Testing area ------------------------------

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
a, b = 4, 0
file1= files_dict[key][a]
file2= files_dict[key][b]
print(files_dict[key][a].replace(work_dir,""),'||', files_dict[key][b].replace(work_dir,""))
#file1= work_dir + 'database_Memoris3/Donnees_piezos_Piezometers.csv' # 4
#file2= work_dir + 'Liste_XY/Sol_Eau_Piezometers.csv' # 0  


df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][3]

df1, df2 = create_df(file1, file2)
print(files_dict[key][1].replace(work_dir,""),'||', files_dict[key][2].replace(work_dir,""))
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
fix_duplicates(df1, df2)

In [None]:
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df1.rename(columns={'Profondeur':'Long_for'}, inplace=True)
df2.rename(columns={'Profondeur':'Long_for'}, inplace=True)

In [None]:
test, conf_test=gdf_merger(df1, df2, how=how[1], on='new_ID', dist_max=1)

In [None]:
if len(conf_test) > 0:
    gdf_viewer(conf_test, rows=10, un_val='new_ID', view=t)

In [None]:
gdf_viewer(test, rows=10, un_val='ID', view=t)

In [None]:
pause

###  °°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°

In [None]:
file1= files_dict[key][6]
file2= files_dict[key][4]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][1]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)
check_col(mdf)

##### check and validate duplicate objects
- The function "gdf_filter()" doesn't work in some cases, so we use function "doubled_objects_check()"
- we have same objects Names but differents by positions here

In [None]:
mdf, check = gdf_filter(mdf, position=True, id_on='ID', expression='sup|prof', dist_crit=1, drop=True, rapp_val=1)

In [None]:
double_objects_check(mdf)

In [None]:
drop_id = [2,25,30] # objects are seemingly the same, but is it possible to get 2 objects so close (~ 1m)?
mdf.drop(index=drop_id, inplace=True)
mdf.reset_index(drop=True, inplace=True)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

#### Merge with object type dataset

In [None]:
piezometers = mdf.copy() #saving

In [None]:
file1= files_dict[key][2]
file2= files_dict[key][3]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=f)

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
double_objects_check(piezometers)

In [None]:
drop_id = [292, 293]
piezometers.drop(index=drop_id, inplace=True)
gdf_viewer(dataset, rows=5, un_val='ID', view=f)

In [None]:
file1= files_dict[key][4]
file2= files_dict[key][5]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
dataset, check = gdf_filter(dataset, position=True, id_on='ID', expression='sup|prof', dist_crit=1, drop=True)
#gdf_viewer(dataset, rows=5, un_val='ID', view=t)

In [None]:
double_objects_check(piezometers)

In [None]:
drop_id = [2,4,30,94,106]
piezometers.drop(index=drop_id, inplace=True)
gdf_viewer(dataset, rows=5, un_val='ID', view=f)

In [None]:
file1= files_dict[key][6]
file2= files_dict[key][9]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df2['ID'] = df2.ID.astype('object')

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][10]
file2= files_dict[key][11]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df1['ID'] = df1.ID.astype('object')

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
conflict_df

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][12]
file2= files_dict[key][13]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
conflict_df

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][14]
file2= files_dict[key][15]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
conflict_df

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][16]
df1 = pd.read_csv(file1, delimiter=',')

print(f"df1 : {file1.replace(work_dir,'')}")
gdf_viewer(df1, rows=3, un_val='ID', view=t)

#### Last merging

In [None]:
dataset, conflict_df=gdf_merger(dataset, df1, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
conflict_df

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

####  $\color{red}{\textbf{Save final Piezometers data}}$

In [None]:
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
piezometers.to_csv(save_dir+save_file, index=False)

==========================================================================================================

# Unknown facilities

In [None]:
key='facility'
save_file = f'Merged_Facilites_unknw.csv'
#coi=['ID','X','Y','Z','Litho_top','Litho_base','Description']  #columns of interest
facilities = pd.DataFrame()
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][3]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID')#, step_merge
check_col(mdf)

In [None]:
facilities = mdf.copy() #saving

In [None]:
file1= files_dict[key][1]
df1 = pd.read_csv(file1, delimiter=',')

print(f"df1 : {file1.replace(work_dir,'')}")
gdf_viewer(df1, rows=3, un_val='ID', view=t)

In [None]:
facilities, conflict_df=gdf_merger(facilities, df1, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(facilities, rows=3, un_val='ID', view=t)

####  $\color{red}{\textbf{Save final Unknown Facilities data}}$

In [None]:
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
facilities.to_csv(save_dir+save_file, index=False)

# Lithologies

Do not add parameter 'dist_max' when merging without considering position !!! otherwise, unuseless rows added

In [None]:
key='Litho'
save_file = f'Merged_Lithologies.csv'
coi=['ID','X','Y','Z','Litho_top','Litho_base','Description']  #columns of interest
lithologies = pd.DataFrame()
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][3]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID')#, step_merge
check_col(mdf)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

In [None]:
gdf_viewer(conflict_df, rows=5, un_val='ID', view=t) #conflict_df.ID.unique()

In [None]:
common_cols = list(set(df1.columns) & set(df2.columns))
test1 = df1.merge(df2, how = 'inner', on='ID')
test2 = df1.merge(df2, how = 'outer', on='ID', indicator=True).loc[lambda x : x.query('_merge =="right_only" or _merge=="left_only"').index]
test3 = test1.merge(test2, how = 'outer', on='ID')
test4 = df1.merge(df2, how = 'outer', on=list(common_cols))
print((len(test1), len(test2), len(test3)))
gdf_viewer(test4)

In [None]:
lithologies = mdf.copy() #saving

In [None]:
file1= files_dict[key][2]
file2= files_dict[key][4]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID')
check_col(mdf)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

##### Lithologies merging 

In [None]:
lithologies, conflict_df=gdf_merger(lithologies, mdf, how=how[1], on='ID')
check_col(mdf)

In [None]:
gdf_viewer(lithologies, rows=10, cols=15, un_val='ID', view=t)

In [None]:
file1= files_dict[key][5]
file2= files_dict[key][6]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
gdf_viewer(df1.merge(df2, how='inner', on='ID'), rows=5, cols=15, un_val='ID', view=t)

In [None]:
gdf_viewer(df1.merge(df2, how = 'outer', on='ID',indicator=True), rows=5, cols=15, un_val='ID', view=t)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(conflict_df, rows=5, un_val='ID', view=t) #conflict_df.ID.unique()

##### Lithologies merging 

In [None]:
lithologies, conflict_df=gdf_merger(lithologies, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(lithologies, rows=3, un_val='ID', view=t)

In [None]:
stop

# Equipments

We must also retrieve equipments information from boreholes and piezometers

In [None]:
key='Equipm'
save_file = f'Merged_Equipments.csv'
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus'] #columns of interest
equipments = pd.DataFrame()
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][1]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)
check_col(mdf)

##### check and validate duplicate objects
- The function "gdf_filter()" doesn't work in some cases, so we use function "doubled_objects_check()"
- we have same objects Names but differents by positions here

In [None]:
mdf, check = gdf_filter(mdf, position=True, id_on='ID', expression='sup|prof', dist_crit=1, drop=True, rapp_val=1)

In [None]:
double_objects_check(mdf)

In [None]:
drop_id = [2,25,30] # objects are seemingly the same, but is it possible to get 2 objects so close (~ 1m)?
mdf.drop(index=drop_id, inplace=True)
mdf.reset_index(drop=True, inplace=True)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

#### Merge with object type dataset

In [None]:
piezometers = mdf.copy() #saving

# Samples

Some corrections todo in 'data organization':
- file 0 and file 1 are the same in result (check it)

# Measures

Some corrections todo in 'data organization':
- file 0 and file 1 are the same in result (check it)

In [None]:
files_dict[key]

# Analysis

Some corrections todo in 'data organization':
- file 0 and file 1 are the same in result (check it)

In [None]:
files_dict[key]