# Data Gathering

In [1]:
%matplotlib widget

In [2]:
from utils.io import gdf_viewer, gdf_merger, data_validation, gdf_filter, fix_duplicates
import re, os
import numpy as np
import geopandas as gpd
import pandas as pd
import datetime as dtm
import matplotlib.pyplot as plt
from definitions import ROOT_DIR



In [3]:
def files_search(work_dir, files_dict, prefix='', skip=None, details=False):
    """Fill ``files_dict`` with the files found under ``work_dir`` for each key.

    The directory tree is walked once. A file is a candidate when its name
    matches ``prefix`` and does not match ``skip``; a candidate is stored under
    every key of ``files_dict`` that matches its full path. All matches are
    case-insensitive regular-expression searches (``re.search`` with ``re.I``).

    Parameters
    ----------
    work_dir : str
        Root directory to scan recursively.
    files_dict : dict
        Keys are the patterns to look for; the value of every key is replaced
        in place by the sorted list of matching file paths.
    prefix : str, optional
        Pattern the file name must match (the default ``''`` matches any name).
    skip : str or None, optional
        Pattern that excludes a file name when it matches; ``None`` disables
        the exclusion.
    details : bool, optional
        If True, also print the numbered list of files found for each key.

    Returns
    -------
    None
        ``files_dict`` is updated in place and one count per key is printed.
    """
    # Walk the tree only once and keep the paths that pass the name filters.
    candidates = []
    for folder, _subdirs, names in os.walk(work_dir):
        for name in names:
            if not re.search(prefix, name, re.I):
                continue
            if skip is not None and re.search(skip, name, re.I):
                continue
            candidates.append(f'{folder}/{name}')
    candidates.sort()

    # The key is searched in the whole path, so a folder name can match too.
    for k in list(files_dict):
        files_dict[k] = [path for path in candidates if re.search(k, path, re.I)]

    for k, v in files_dict.items():
        print(k,' \t: ',len(v))

    if details:  # list the file names found for each key
        for w in files_dict:
            print('\n+++++++++++++++++')
            print(f'+  {w.upper()}\t+ ')
            print('+++++++++++++++++')
            for i, x in enumerate(files_dict[w], 0):
                print(i, '-', x)


In [4]:
def create_df(files, verbose=True): # find another name for this function
    """
    Read each CSV file into a DataFrame and optionally report whether it holds coordinates.

    files: list of file names (comma-separated CSV)
    verbose: if True, print one line per file telling whether a column 'X' is present
    returns: list of DataFrames, in the same order as ``files``
    """
    dfs = []
    # Number the report lines from 1 so that they line up with the usual
    # ``df1, df2 = create_df([file1, file2])`` unpacking.
    for n, f in enumerate(files, 1):
        df = pd.read_csv(f, delimiter=',')
        dfs.append(df)

        if verbose:
            msg = ' --> Coordinates' if 'X' in df.columns else ' --> No coordinates'
            print(f"df{n} : {msg}")

    return dfs

In [5]:
def dataset_overview(d, verbose=False): # check for same datasets in given files
    """Report identical files and which files contain coordinates.

    d: sequence of CSV file paths (anything indexable by 0..len(d)-1)
    verbose: forwarded to ``create_df``

    Prints the index pairs (i, j), i < j, of files whose tables are equal
    (``DataFrame.equals``), then the indices of the files with and without
    an 'X' column. Returns None.
    """
    # Read every file once instead of once per compared pair; all tables are
    # kept in memory for the duration of the comparison.
    frames = create_df([d[i] for i in range(len(d))], verbose)

    # Classify every file, including the case where only one file is given.
    with_coord = []
    no_coord = []
    for idx, frame in enumerate(frames):
        if 'X' in frame.columns:
            with_coord.append(idx)
        else:
            no_coord.append(idx)

    same = [(i, j)
            for i in range(len(frames))
            for j in range(i + 1, len(frames))
            if frames[i].equals(frames[j])]

    print(f'Same files:{same}\nFiles with coordinates:{with_coord}\nFiles without coordinates:{no_coord}')

## Reading files

In [6]:
# Input folder (scanned for the CSV files to merge) and output folder (merged CSV files).
# Both keep their trailing '/' because later cells build paths by plain string
# concatenation (work_dir + '<folder>/<file>', save_dir + save_file).
work_dir = ROOT_DIR+'/CF_data/Result_traitem/'
save_dir = ROOT_DIR+'/CF_data/Donnees_fusionnees/'

In [7]:
# Dictionary structure used to retrieve the right files. The key names matter:
# files_search uses each key as a case-insensitive pattern on the file path.
# The 0 values are placeholders that files_search replaces with lists of paths.
files_dict = dict.fromkeys(
    ['Borehole', 'Piezometer', 'Piezair', 'Trench', 'Litho',
     'Equipm', 'Measure', 'Sample', 'Analysis', 'Facility'],
    0,
)

In [8]:
files_search(work_dir, files_dict, prefix='', skip='source')

Borehole  	:  7
Piezometer  	:  17
Piezair  	:  2
Trench  	:  1
Litho  	:  7
Equipm  	:  3
Measure  	:  6
Sample  	:  27
Analysis  	:  21
Facility  	:  4


In [9]:
# Join types to choose from; the merge cells below pass how[1] ('outer') to gdf_merger.
how=['inner', 'outer', 'left', 'right']

In [10]:
# Short aliases for the booleans; `t` is passed as `view=` to gdf_viewer in the cells below.
f = False
t = True

# ================== PROCESSING ===================== 

# Boreholes

Some corrections to do in 'data organization':
- correct the extraction in file 2 -> Samples
- file 4 and file 5 give the same result (check it)
- try to concatenate file 1 with piezo (if possible, since there is no position)
- check the processing of 'refus' and 'type_refus' (every object)

In [11]:
# Borehole section: pattern key in files_dict, output file name, columns of
# interest, and an empty frame that the merge cells below overwrite.
key = 'Borehole'
save_file = 'Merged_Boreholes.csv'
coi = ['ID', 'ID_date', 'X', 'Y', 'Z', 'Type', 'Long_for', 'Diam_for', 'Refus', 'Societe']  # columns of interest
dataset = pd.DataFrame()  # holds the object info after the last merge
print(len(files_dict[key]), 'files')

7 files


In [12]:
files_dict[key] #files_dict[key][0]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Forage_Pilote/leve_Z_elect_pos_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Prof_contact_sol_forage/Feuil1_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Profils_sol_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Donnees_forage_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Equipement_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Log_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/donnees_forage_Boreholes.csv']

In [13]:
dataset_overview(files_dict[key])

Same files:[(4, 6)]
Files with coordinates:[1, 3, 4, 6]
Files without coordinates:[0, 2, 5]


#### $\color{green}{\textbf{Read and merge}}$

In [14]:
file1= work_dir + 'profils_sols_donnees_forages/Log_Boreholes.csv' # 5
file2= work_dir + 'profils_sols_donnees_forages/Equipement_Boreholes.csv' # 4

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> Coordinates
Rows : 24, columns : 3, Unique col 'ID': 24


interactive(children=(IntSlider(value=3, description='rows', max=24, min=3, readout=False), IntSlider(value=3,…

Rows : 13, columns : 13, Unique col 'ID': 13


interactive(children=(IntSlider(value=3, description='rows', max=13, min=3, readout=False), IntSlider(value=12…

(None, None)

In [15]:
df1.rename(columns={'Profondeur':'Long_for'}, inplace=True)
df2.rename(columns={'Profondeur':'Long_for'}, inplace=True)

In [16]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

In [17]:
gdf_viewer(mdf, rows=10, un_val='ID', view=t)

Rows : 25, columns : 13, Unique col 'ID': 25


interactive(children=(IntSlider(value=10, description='rows', max=25, min=10, readout=False), IntSlider(value=…

#### First object dataset save

In [18]:
dataset = mdf.copy() #saving

#### $\color{green}{\textbf{Read and merge}}$

In [19]:
file1= work_dir + 'database_Memoris3/Profils_sol_Boreholes.csv' # 2
file2= work_dir + 'Prof_contact_sol_forage/Feuil1_Boreholes.csv' # 1

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> Coordinates
Rows : 172, columns : 6, Unique col 'ID': 172


interactive(children=(IntSlider(value=3, description='rows', max=172, min=3, readout=False), IntSlider(value=6…

Rows : 8, columns : 6, Unique col 'ID': 8


interactive(children=(IntSlider(value=3, description='rows', max=8, min=3, readout=False), IntSlider(value=6, …

(None, None)

In [20]:
df1.rename(columns={'Profondeur':'Long_for'}, inplace=True)
df2.rename(columns={'Profondeur':'Long_for'}, inplace=True)

In [21]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


In [22]:
gdf_viewer(mdf, rows=10, un_val='ID', view=t)

Rows : 177, columns : 10, Unique col 'ID': 177


interactive(children=(IntSlider(value=10, description='rows', max=177, min=10, readout=False), IntSlider(value…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [23]:
conflict_df

Unnamed: 0,Check_col,ID,Long_for_x,Long_for_y
136,Long_for,F205,1.4,3.2
138,Long_for,F212,5.8,3.4
142,Long_for,F217,5.7,4.2


#### Merge with object dataset

In [24]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [25]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [26]:
gdf_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 202, columns : 14, Unique col 'ID': 202


interactive(children=(IntSlider(value=10, description='rows', max=202, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [27]:
file1= work_dir + 'Forage_Pilote/leve_Z_elect_pos_Boreholes.csv' # 0
file2= work_dir + 'donnees_terrain_2019/Donnees_forage_Boreholes.csv' # 3

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> Coordinates
Rows : 72, columns : 5, Unique col 'ID': 72


interactive(children=(IntSlider(value=3, description='rows', max=72, min=3, readout=False), IntSlider(value=5,…

Rows : 16, columns : 18, Unique col 'ID': 16


interactive(children=(IntSlider(value=3, description='rows', max=16, min=3, readout=False), IntSlider(value=12…

(None, None)

In [28]:
# Prefix every ID of the 2019 file with 'F' before merging on 'ID'.
# NOTE(review): not idempotent - running this cell twice yields 'FF...'; re-create df2 first.
df2.ID=df2.ID.apply(lambda x: 'F'+x) # name recent (2019) boreholes

In [29]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


In [30]:
gdf_viewer(mdf, rows=10, un_val='ID', view=t)

Rows : 74, columns : 20, Unique col 'ID': 74


interactive(children=(IntSlider(value=10, description='rows', max=74, min=10, readout=False), IntSlider(value=…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [31]:
conflict_df

Unnamed: 0,Check_col,ID,Long_for_x,Long_for_y
45,Long_for,F201,5.835,2.4
49,Long_for,F205,5.84,4.8
51,Long_for,F207,5.79,4.8
52,Long_for,F208,5.77,4.8
56,Long_for,F212,5.675,4.8
58,Long_for,F214,5.685,4.8
61,Long_for,F217,5.73,4.8
63,Long_for,F219,5.63,1.5
64,Long_for,F220,5.655,0.5
65,Long_for,F221,5.72,1.4


#### Merge with object dataset

In [32]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


In [33]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [34]:
gdf_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 235, columns : 21, Unique col 'ID': 235


interactive(children=(IntSlider(value=10, description='rows', max=235, min=10, readout=False), IntSlider(value…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [35]:
conflict_df

Unnamed: 0,Check_col,ID,index_x,index_y,Long_for_x,Long_for_y,Societe_x,Societe_y,ID_date_x,ID_date_y,Date_ouv_x,Date_ouv_y
33,index,F10,33.0,0.0,7.0,,SITEREM,,2010-F10,,2010-03-15,
34,index,F11,34.0,1.0,6.0,,SITEREM,,2010-F11,,2010-03-16,
36,index,F13,36.0,3.0,8.4,,SITEREM,,2010-F13,,2010-03-16,
38,index,F23,38.0,10.0,6.8,,SBS Environnement,,2010-F23,,2010-03-18,
39,index,F24,39.0,11.0,6.8,,SBS Environnement,,2010-F24,,2010-03-18,
54,index,F41,54.0,13.0,6.8,,SBS Environnement,,2010-F41,,2010-03-12,
97,"index, Long_for",F100,97.0,14.0,5.5,5.905,SBS Environnement,,2010-F100,,2010-02-11,
98,"index, Long_for",F101,98.0,15.0,2.0,5.775,SBS Environnement,,2010-F101,,2010-02-11,
99,"index, Long_for",F102,99.0,16.0,6.9,5.945,SBS Environnement,,2010-F102,,2010-02-23,
100,"index, Long_for",F103,100.0,17.0,8.1,5.725,SBS Environnement,,2010-F103,,2010-02-23,


In [36]:
gdf_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 235, columns : 21, Unique col 'ID': 235


interactive(children=(IntSlider(value=3, description='rows', max=235, min=3, readout=False), IntSlider(value=1…

####  $\color{red}{\textbf{Save final object dataset}}$

In [37]:
if 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)

In [38]:
# Create the output folder if it is missing (exist_ok replaces the separate
# existence check), then write the merged table without the pandas index column.
os.makedirs(save_dir, exist_ok=True)

dataset.to_csv(os.path.join(save_dir, save_file), index=False)

# Unknown facilities

In [39]:
# Unknown-facility section: pattern key in files_dict, output file name, and an
# empty frame that the merge cells below overwrite.
# NOTE(review): 'Facilites' in the file name looks like a typo - confirm before renaming,
# other code may read this file.
key = 'Facility'
save_file = 'Merged_Facilites_unknw.csv'
# coi = ['ID', 'X', 'Y', 'Z', 'Litho_top', 'Litho_base', 'Description']  # columns of interest
dataset = pd.DataFrame()
print(len(files_dict[key]), 'files')

4 files


In [40]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Donnees_piezos_Unkown-facility.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Drains_Pz_ENEL_Unkown-facility.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Result_eau_Unkown-facility.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Result_sol_Unkown-facility.csv']

In [41]:
dataset_overview(files_dict[key])

Same files:[(2, 3)]
Files with coordinates:[0, 1, 2, 3]
Files without coordinates:[]


#### $\color{green}{\textbf{Read and merge}}$

In [42]:
file1= work_dir + 'database_Memoris3/Donnees_piezos_Unkown-facility.csv' # 0
file2= work_dir + 'database_Memoris3/Drains_Pz_ENEL_Unkown-facility.csv' # 1  

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 13, columns : 6, Unique col 'ID': 13


interactive(children=(IntSlider(value=3, description='rows', max=13, min=3, readout=False), IntSlider(value=6,…

Rows : 12, columns : 6, Unique col 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=12, min=3, readout=False), IntSlider(value=6,…

(None, None)

In [43]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### First object dataset save

In [44]:
dataset = mdf.copy() #saving

In [45]:
gdf_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 25, columns : 8, Unique col 'ID': 25


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=8,…

#### $\color{green}{\textbf{Read and merge}}$

In [46]:
file1= work_dir + 'database_Memoris3/Result_eau_Unkown-facility.csv' # 2

df1 = create_df([file1])[0]
gdf_viewer(df1, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
Rows : 13, columns : 8, Unique col 'ID': 13


interactive(children=(IntSlider(value=3, description='rows', max=13, min=3, readout=False), IntSlider(value=8,…

#### Merge with object dataset

In [47]:
dataset, conflict_df=gdf_merger(dataset, df1, how=how[1], on='ID', dist_max=1)

In [48]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [49]:
gdf_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 25, columns : 10, Unique col 'ID': 25


interactive(children=(IntSlider(value=10, description='rows', max=25, min=10, readout=False), IntSlider(value=…

####  $\color{red}{\textbf{Save final object dataset}}$

In [50]:
if 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)

In [51]:
# Create the output folder if it is missing (exist_ok replaces the separate
# existence check), then write the merged table without the pandas index column.
os.makedirs(save_dir, exist_ok=True)

dataset.to_csv(os.path.join(save_dir, save_file), index=False)

# Equipments

In [52]:
# Equipment section: pattern key in files_dict, output file name, columns of
# interest, and an empty frame that the merge cells below overwrite.
key = 'Equipm'
save_file = 'Merged_Equipments.csv'
coi = ['ID', 'ID_date', 'X', 'Y', 'Z', 'Type', 'Long_for', 'Diam_for', 'Refus']  # columns of interest
dataset = pd.DataFrame()
print(len(files_dict[key]), 'files')

3 files


In [53]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Drains_Pz_ENEL_Equipment.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Equipement_Equipment.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Equipement_Equipments.csv']

In [54]:
dataset_overview(files_dict[key])

Same files:[]
Files with coordinates:[]
Files without coordinates:[0, 1, 2]


#### $\color{green}{\textbf{Read and merge}}$

In [55]:
file1= work_dir + 'donnees_terrain_2019/Equipement_Equipment.csv' # 1
file2= work_dir + 'profils_sols_donnees_forages/Equipement_Equipments.csv' # 2  


df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> No coordinates
Rows : 33, columns : 7, Unique col 'ID': 9


interactive(children=(IntSlider(value=3, description='rows', max=33, min=3, readout=False), IntSlider(value=7,…

Rows : 36, columns : 7, Unique col 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=36, min=3, readout=False), IntSlider(value=7,…

(None, None)

In [56]:
df1.ID = df1.ID.astype(str)
df2.ID = df2.ID.astype(str)

In [57]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### First object dataset save

In [58]:
dataset = mdf.copy() #saving

In [59]:
gdf_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 69, columns : 7, Unique col 'ID': 21


interactive(children=(IntSlider(value=3, description='rows', max=69, min=3, readout=False), IntSlider(value=7,…

####  $\color{red}{\textbf{Save final object dataset}}$

In [60]:
if 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)

In [61]:
# Create the output folder if it is missing (exist_ok replaces the separate
# existence check), then write the merged table without the pandas index column.
os.makedirs(save_dir, exist_ok=True)

dataset.to_csv(os.path.join(save_dir, save_file), index=False)

# Samples

In [62]:
# Sample section: pattern key in files_dict, output file name, columns of
# interest, and an empty frame that the merge cells below overwrite.
key = 'Sample'
save_file = 'Merged_Samples.csv'
coi = ['ID', 'ID_date', 'X', 'Y', 'Z', 'Type', 'Long_for', 'Diam_for', 'Refus']  # columns of interest
dataset = pd.DataFrame()
print(len(files_dict[key]), 'files')

27 files


In [63]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Container_phyto/Param_agro_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Container_phyto/Param_agro_Samples-water.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Container_phyto/Result_SOL_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Memoris_seafile/Result_SOL_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Memoris_seafile/Result_eau_Samples-water.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Result_eau_Samples-water.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Result_sol_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Result_SOL_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Result_eau_Samples-water.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Siterem_Ext_Pilote/Inorganic_maj

In [64]:
dataset_overview(files_dict[key])

Same files:[(1, 9), (18, 20)]
Files with coordinates:[4, 5, 8, 25, 26]
Files without coordinates:[0, 1, 2, 3, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]


#### $\color{green}{\textbf{Read and merge}}$

In [65]:
file1= work_dir + 'Memoris_seafile/Result_eau_Samples-water.csv' # 4
file2= work_dir + 'Phase_1_Memoris/Result_eau_Samples-water.csv' # 5

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID_ech', view=t), gdf_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 45, columns : 10, Unique col 'ID_ech': 45


interactive(children=(IntSlider(value=3, description='rows', max=45, min=3, readout=False), IntSlider(value=10…

Rows : 17, columns : 17, Unique col 'ID_ech': 17


interactive(children=(IntSlider(value=3, description='rows', max=17, min=3, readout=False), IntSlider(value=12…

(None, None)

In [66]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

#### First object dataset save

In [67]:
dataset = mdf.copy() #saving

In [68]:
gdf_viewer(dataset, rows=3, un_val='ID_ech', view=t)

Rows : 60, columns : 17, Unique col 'ID_ech': 60


interactive(children=(IntSlider(value=3, description='rows', max=60, min=3, readout=False), IntSlider(value=12…

#### $\color{green}{\textbf{Read and merge}}$

In [69]:
file1= work_dir + 'vUmons_logsFor/Analyse_eau_Phases1&2_Samples-water.csv' # 25
file2= work_dir + 'vUmons_logsFor/Analyse_sol_Phases1&2_Samples-soil.csv' # 26


df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID_ech', view=t), gdf_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 45, columns : 10, Unique col 'ID_ech': 41


interactive(children=(IntSlider(value=3, description='rows', max=45, min=3, readout=False), IntSlider(value=10…

Rows : 59, columns : 13, Unique col 'ID_ech': 59


interactive(children=(IntSlider(value=3, description='rows', max=59, min=3, readout=False), IntSlider(value=12…

(None, None)

In [70]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [71]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

In [72]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [73]:
gdf_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 132, columns : 25, Unique col 'ID_ech': 126


interactive(children=(IntSlider(value=10, description='rows', max=132, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [74]:
file1= work_dir + 'Container_phyto/Param_agro_Samples-soil.csv' # 0
file2= work_dir + 'Phase_2_Memoris/Result_eau_Samples-water.csv' # 8


df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID_ech', view=t), gdf_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df1 :  --> Coordinates
Rows : 5, columns : 21, Unique col 'ID_ech': 5


interactive(children=(IntSlider(value=3, description='rows', max=5, min=3, readout=False), IntSlider(value=12,…

Rows : 11, columns : 10, Unique col 'ID_ech': 11


interactive(children=(IntSlider(value=3, description='rows', max=11, min=3, readout=False), IntSlider(value=10…

(None, None)

In [75]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [76]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

In [77]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [78]:
gdf_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 140, columns : 38, Unique col 'ID_ech': 131


interactive(children=(IntSlider(value=10, description='rows', max=140, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [79]:
file1= work_dir + 'Container_phyto/Param_agro_Samples-water.csv' # 1
file2= work_dir + 'Container_phyto/Result_SOL_Samples-soil.csv' # 2

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID_ech', view=t), gdf_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df1 :  --> No coordinates
Rows : 24, columns : 6, Unique col 'ID_ech': 6


interactive(children=(IntSlider(value=3, description='rows', max=24, min=3, readout=False), IntSlider(value=6,…

Rows : 5, columns : 8, Unique col 'ID_ech': 5


interactive(children=(IntSlider(value=3, description='rows', max=5, min=3, readout=False), IntSlider(value=8, …

(None, None)

In [80]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [81]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [82]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


In [83]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [84]:
gdf_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 164, columns : 42, Unique col 'ID_ech': 137


interactive(children=(IntSlider(value=10, description='rows', max=164, min=10, readout=False), IntSlider(value…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [85]:
conflict_df # I think some columns are shifted (Fract_2+_y, Fract_2_y)

Unnamed: 0,Check_col,ID_ech,Fract_2_x,Fract_2_y,Fract_2+_x,Fract_2+_y,MS_x,MS_y
135,"Fract_2, Fract_2+, MS",Ech. 1,79.0,33.0,21.0,67.0,89.8,88.8
136,MS,Ech. 2,47.0,47.0,53.0,53.0,75.9,80.0
137,"Fract_2, Fract_2+, MS",Ech. 3,69.0,40.0,31.0,60.0,80.1,87.9
138,"Fract_2, Fract_2+, MS",Ech. 4,70.0,45.0,30.0,55.0,85.8,90.5
139,"Fract_2, Fract_2+, MS",Ech. 5,71.0,29.0,29.0,71.0,84.9,76.1


#### $\color{green}{\textbf{Read and merge}}$

In [86]:
file1= work_dir + 'Memoris_seafile/Result_SOL_Samples-soil.csv' # 3
file2= work_dir + 'Phase_1_Memoris/Result_sol_Samples-soil.csv' # 6

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID_ech', view=t), gdf_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df1 :  --> No coordinates
Rows : 60, columns : 12, Unique col 'ID_ech': 60


interactive(children=(IntSlider(value=3, description='rows', max=60, min=3, readout=False), IntSlider(value=12…

Rows : 29, columns : 11, Unique col 'ID_ech': 29


interactive(children=(IntSlider(value=3, description='rows', max=29, min=3, readout=False), IntSlider(value=11…

(None, None)

In [87]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [88]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

In [89]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [90]:
gdf_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 284, columns : 44, Unique col 'ID_ech': 207


interactive(children=(IntSlider(value=10, description='rows', max=284, min=10, readout=False), IntSlider(value…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [91]:
conflict_df

#### $\color{green}{\textbf{Read and merge}}$

In [92]:
file1= work_dir + 'Phase_2_Memoris/Result_SOL_Samples-soil.csv' # 7
file2= work_dir + 'Siterem_Ext_Pilote/Param_physico_Samples-water.csv' # 10

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID_ech', view=t), gdf_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df1 :  --> No coordinates
Rows : 25, columns : 12, Unique col 'ID_ech': 25


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=12…

Rows : 33, columns : 13, Unique col 'ID_ech': 6


interactive(children=(IntSlider(value=3, description='rows', max=33, min=3, readout=False), IntSlider(value=12…

(None, None)

In [93]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [94]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [95]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


In [96]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [97]:
gdf_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 580, columns : 49, Unique col 'ID_ech': 211


interactive(children=(IntSlider(value=10, description='rows', max=580, min=10, readout=False), IntSlider(value…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [98]:
conflict_df

Unnamed: 0,Check_col,ID_ech,Emplacement_x,Emplacement_y,Periode_x,Periode_y
90,"Emplacement, Periode",201,Extension pilote,Simulateur,T0,Avant chauffe
132,"Emplacement, Periode",207,Extension pilote,Simulateur,T0,Avant chauffe
172,"Emplacement, Periode",208,Extension pilote,Simulateur,T0,Chauffage actif
212,"Emplacement, Periode",221,Extension pilote,Simulateur,T0,Chauffage actif
260,"Emplacement, Periode",214,Extension pilote,Simulateur,T0,Arrêt PAC
261,"Emplacement, Periode",225,Extension pilote,Simulateur,T0,Chauffage actif


#### $\color{green}{\textbf{Read and merge}}$

In [99]:
file1= work_dir + 'Siterem_Ext_Pilote/Result_eau_Samples-water.csv' # 11
file2= work_dir + 'Siterem_Pilote/Inorganic_major_Samples-water.csv' # 12

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID_ech', view=t), gdf_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df1 :  --> No coordinates
Rows : 31, columns : 8, Unique col 'ID_ech': 6


interactive(children=(IntSlider(value=3, description='rows', max=31, min=3, readout=False), IntSlider(value=8,…

Rows : 51, columns : 6, Unique col 'ID_ech': 7


interactive(children=(IntSlider(value=3, description='rows', max=51, min=3, readout=False), IntSlider(value=6,…

(None, None)

In [100]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [101]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [102]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


In [103]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [104]:
gdf_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 2781, columns : 50, Unique col 'ID_ech': 218


interactive(children=(IntSlider(value=10, description='rows', max=2781, min=10, readout=False), IntSlider(valu…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [108]:
gdf_viewer(conflict_df, rows=5,)

Rows : 144, columns : 10


interactive(children=(IntSlider(value=5, description='rows', max=144, min=5, readout=False), IntSlider(value=1…

In [106]:
a, b = 4, 0
file1= files_dict[key][a]
file2= files_dict[key][b]
print(files_dict[key][a].replace(work_dir,""),'||', files_dict[key][b].replace(work_dir,""))

Memoris_seafile/Result_eau_Samples-water.csv || Container_phyto/Param_agro_Samples-soil.csv


In [107]:
# NOTE(review): `pause` is not defined in any visible cell, so this cell raises NameError;
# presumably a deliberate stop so that "Run All" halts before the unfinished cells below - confirm.
pause

NameError: name 'pause' is not defined

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
#file1= work_dir + 'Memoris_seafile/Result_SOL_Samples-soil.csv' # 3
#file2= work_dir + 'Phase_1_Memoris/Result_sol_Samples-soil.csv' # 6

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID_ech', view=t), gdf_viewer(df2, rows=3, un_val='ID_ech', view=t)

In [None]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID_ech', date_col='Date_prv', dist_max=1)

In [None]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
gdf_viewer(dataset, rows=10, un_val='ID_ech', view=t)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
conflict_df

####  $\color{red}{\textbf{Save final object dataset}}$

In [None]:
if 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)

In [None]:
# Create the output folder if it is missing (exist_ok replaces the separate
# existence check), then write the merged table without the pandas index column.
os.makedirs(save_dir, exist_ok=True)

dataset.to_csv(os.path.join(save_dir, save_file), index=False)

###  ------------------------------------- Testing area ------------------------------

In [None]:
a, b = 3, 0
file1= files_dict[key][a]
file2= files_dict[key][b]
print(files_dict[key][a].replace(work_dir,""),'||', files_dict[key][b].replace(work_dir,""))

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
#file1= work_dir + 'Memoris_seafile/Result_SOL_Samples-soil.csv' # 3
#file2= work_dir + 'Phase_1_Memoris/Result_sol_Samples-soil.csv' # 6

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
pause

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][3]

df1, df2 = create_df([file1, file2])
# Print the files that were actually loaded (the previous version printed entries 1 and 2).
print(file1.replace(work_dir,""),'||', file2.replace(work_dir,""))
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
fix_duplicates(df1, df2)

In [None]:
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df1.rename(columns={'Profondeur':'Long_for'}, inplace=True)
df2.rename(columns={'Profondeur':'Long_for'}, inplace=True)

In [None]:
test, conf_test=gdf_merger(df1, df2, how=how[1], on='new_ID', dist_max=1)

In [None]:
if len(conf_test) > 0:
    gdf_viewer(conf_test, rows=10, un_val='new_ID', view=t)

In [None]:
gdf_viewer(test, rows=10, un_val='ID', view=t)

In [None]:
pause

###  °°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°

# Measures

Corrections still to do in the 'data organization' step:
- files 0 and 1 appear to give the same result (to be checked)

In [None]:
files_dict[key]

# Analysis

Corrections still to do in the 'data organization' step:
- files 0 and 1 appear to give the same result (to be checked)

In [None]:
files_dict[key]

# Piezometers

In [None]:
# Settings for the piezometer section.
key = 'Piezometer'
save_file = 'Merged_Piezometers.csv'
# columns of interest
coi = ['ID', 'ID_date', 'X', 'Y', 'Z', 'Type', 'Long_for', 'Diam_for', 'Refus']
dataset = pd.DataFrame()  # empty placeholder; reassigned once the first merge is done
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

In [None]:
dataset_overview(files_dict[key])

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'Phase_1_Memoris/Result_eau_Piezometers.csv' # 2
file2= work_dir + 'Memoris_seafile/Result_eau_Piezometers.csv' # 1  


df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
fix_duplicates(df1, df2, drop_old_id=True)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### First object dataset save

In [None]:
dataset = mdf.copy() #saving

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'database_Memoris3/Donnees_piezos_Piezometers.csv' # 4
file2= work_dir + 'Liste_XY/Sol_Eau_Piezometers.csv' # 0  


df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df1, check = gdf_filter(df1, position=True, id_col='ID', expression='sup|prof', dist_max=1, drop=True, drop_old_id=True)

In [None]:
df2, check = gdf_filter(df2, position=True, id_col='ID', expression='sup|prof', dist_max=1, drop=True)

In [None]:
df2

In [None]:
fix_duplicates(df1, df2)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
conflict_df

In [None]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Nappe_y':list(conflict_df.index)[:18]})

In [None]:
conflict_df

#### Merge with object dataset

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [None]:
# When a 'level_0' column is present, it replaces any existing 'index' column
# and is then exposed under the name 'index'.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
gdf_viewer(dataset, rows=10, un_val='ID', view=t)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
# Pick two files of the current category by position and print their paths relative to work_dir.
a, b = 3, 5
file1, file2 = files_dict[key][a], files_dict[key][b]
print(file1.replace(work_dir,""),'||', file2.replace(work_dir,""))

In [None]:
file1= work_dir + 'Phase_2_Memoris/Result_eau_Piezometers.csv' # 3
file2= work_dir + 'database_Memoris3/Drains_Pz_ENEL_Piezometers.csv' # 5  


df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [None]:
# When a 'level_0' column is present, it replaces any existing 'index' column
# and is then exposed under the name 'index'.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
gdf_viewer(dataset, rows=10, un_val='ID', view=t)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'donnees_terrain_2019/Donnees_forage_Piezometers.csv' # 9
file2= work_dir + 'database_Memoris3/Result_eau_Piezometers.csv' # 7  

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df1.ID = df1.ID.astype(str)
df2.ID = df2.ID.astype(str)

In [None]:
df2, check = gdf_filter(df2, position=True, id_col='ID', expression='sup|prof', dist_max=1, val_max=1.1, drop=True, drop_old_id=True)

In [None]:
gdf_viewer(df2, rows=10, un_val='ID', view=t)

In [None]:
fix_duplicates(df1, df2)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
conflict_df

In [None]:
# conflict_df was produced by the merge into `dataset` in the preceding merge cell,
# so the validated values are applied to `dataset` rather than to the intermediate `mdf`.
# NOTE(review): the 'Nappe_y' column and the [:18] slice are identical to the earlier
# validation cell — confirm they still match the conflicts listed here.
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Nappe_y':list(conflict_df.index)[:18]})

In [None]:
conflict_df

In [None]:
# When a 'level_0' column is present, it replaces any existing 'index' column
# and is then exposed under the name 'index'.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
gdf_viewer(dataset, rows=10, un_val='ID', view=t)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
# Pick two files of the current category by position and print their paths relative to work_dir.
a, b = 13, 15
file1, file2 = files_dict[key][a], files_dict[key][b]
print(file1.replace(work_dir,""),'||', file2.replace(work_dir,""))

In [None]:
#file1= work_dir + 'donnees_terrain_2019/Donnees_forage_Piezometers.csv' # 9
#file2= work_dir + 'database_Memoris3/Result_eau_Piezometers.csv' # 7  

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
fix_duplicates(df1, df2)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
conflict_df # I think these are not the same objects, but there is no date or position to distinguish them!
# --> check the borehole sheets (pdf)

In [None]:
gdf_viewer(dataset, rows=3, un_val='ID', view=t)

####  $\color{red}{\textbf{Save final object dataset}}$

In [None]:
# The merges of this section accumulate into `dataset`; `piezometers` is only assigned
# further down in the testing area, so it cannot be relied on at this point.
if 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)

In [None]:
# Create the output directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(save_dir, exist_ok=True)

# Save `dataset`, the frame built by the merges of this section (`piezometers` is only
# assigned further down in the testing area).
dataset.to_csv(os.path.join(save_dir, save_file), index=False)

###  ------------------------------------- Testing area ------------------------------

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
a, b = 4, 0
file1= files_dict[key][a]
file2= files_dict[key][b]
print(files_dict[key][a].replace(work_dir,""),'||', files_dict[key][b].replace(work_dir,""))
#file1= work_dir + 'database_Memoris3/Donnees_piezos_Piezometers.csv' # 4
#file2= work_dir + 'Liste_XY/Sol_Eau_Piezometers.csv' # 0  


df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][3]

df1, df2 = create_df([file1, file2])
# Print the files that were actually loaded (the previous version printed entries 1 and 2).
print(file1.replace(work_dir,""),'||', file2.replace(work_dir,""))
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
fix_duplicates(df1, df2)

In [None]:
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df1.rename(columns={'Profondeur':'Long_for'}, inplace=True)
df2.rename(columns={'Profondeur':'Long_for'}, inplace=True)

In [None]:
test, conf_test=gdf_merger(df1, df2, how=how[1], on='new_ID', dist_max=1)

In [None]:
if len(conf_test) > 0:
    gdf_viewer(conf_test, rows=10, un_val='new_ID', view=t)

In [None]:
gdf_viewer(test, rows=10, un_val='ID', view=t)

In [None]:
pause

###  °°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°

In [None]:
file1= files_dict[key][6]
file2= files_dict[key][4]

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][1]

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)
check_col(mdf)

##### check and validate duplicate objects
- The function "gdf_filter()" doesn't work in some cases, so we use the function "double_objects_check()" instead
- here we have objects with the same name but different positions

In [None]:
# NOTE(review): the other gdf_filter calls in this notebook pass `id_col=` and `val_max=`;
# `id_on=` / `rapp_val=` here may come from an older signature — verify against utils.io.
mdf, check = gdf_filter(mdf, position=True, id_on='ID', expression='sup|prof', dist_max=1, drop=True, rapp_val=1)

In [None]:
double_objects_check(mdf)

In [None]:
drop_id = [2,25,30] # objects are seemingly the same, but is it possible to get 2 objects so close (~ 1m)?
mdf.drop(index=drop_id, inplace=True)
mdf.reset_index(drop=True, inplace=True)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

#### Merge with object type dataset

In [None]:
piezometers = mdf.copy() #saving

In [None]:
file1= files_dict[key][2]
file2= files_dict[key][3]

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=f)

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
# Check the frame this merge just produced; `mdf` was already checked right after its own merge.
check_col(dataset)

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
double_objects_check(piezometers)

In [None]:
# NOTE(review): rows are dropped from `piezometers`, but the frame displayed below is `dataset`
# — confirm which frame these index labels refer to.
drop_id = [292, 293]
piezometers.drop(index=drop_id, inplace=True)
gdf_viewer(dataset, rows=5, un_val='ID', view=f)

In [None]:
file1= files_dict[key][4]
file2= files_dict[key][5]

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
# Check the frame this merge just produced; `mdf` was already checked right after its own merge.
check_col(dataset)

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
# NOTE(review): the other gdf_filter calls in this notebook pass `id_col=`;
# `id_on=` here may come from an older signature — verify against utils.io.
dataset, check = gdf_filter(dataset, position=True, id_on='ID', expression='sup|prof', dist_max=1, drop=True)
#gdf_viewer(dataset, rows=5, un_val='ID', view=t)

In [None]:
double_objects_check(piezometers)

In [None]:
drop_id = [2,4,30,94,106]
piezometers.drop(index=drop_id, inplace=True)
gdf_viewer(dataset, rows=5, un_val='ID', view=f)

In [None]:
file1= files_dict[key][6]
file2= files_dict[key][9]

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df2['ID'] = df2.ID.astype('object')

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
# Check the frame this merge just produced; `mdf` was already checked right after its own merge.
check_col(dataset)

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][10]
file2= files_dict[key][11]

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df1['ID'] = df1.ID.astype('object')

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
# Check the frame this merge just produced; `mdf` was already checked right after its own merge.
check_col(dataset)

In [None]:
conflict_df

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][12]
file2= files_dict[key][13]

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
conflict_df

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
# Check the frame this merge just produced; `mdf` was already checked right after its own merge.
check_col(dataset)

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][14]
file2= files_dict[key][15]

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
dataset, conflict_df=gdf_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
# Check the frame this merge just produced; `mdf` was already checked right after its own merge.
check_col(dataset)

In [None]:
conflict_df

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][16]
df1 = pd.read_csv(file1, delimiter=',')

print(f"df1 : {file1.replace(work_dir,'')}")
gdf_viewer(df1, rows=3, un_val='ID', view=t)

#### Last merging

In [None]:
dataset, conflict_df=gdf_merger(dataset, df1, how=how[1], on='ID', dist_max=1)
# `mdf` takes no part in this merge (the inputs are `dataset` and `df1`), so check the merged result.
check_col(dataset)

In [None]:
conflict_df

In [None]:
gdf_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

####  $\color{red}{\textbf{Save final Piezometers data}}$

In [None]:
# Create the output directory if needed; exist_ok avoids the check-then-create race.
os.makedirs(save_dir, exist_ok=True)

# NOTE(review): the merges of this section accumulate into `dataset`, while `piezometers` holds the
# copy taken after the first merge (minus the dropped rows) — confirm this is the frame to save.
piezometers.to_csv(os.path.join(save_dir, save_file), index=False)

==========================================================================================================

# Lithologies

Do not pass the 'dist_max' parameter when merging without considering position, otherwise useless rows are added!

In [None]:
# Settings for the lithology section.
key = 'Litho'
save_file = 'Merged_Lithologies.csv'
# columns of interest
coi = ['ID', 'X', 'Y', 'Z', 'Litho_top', 'Litho_base', 'Description']
lithologies = pd.DataFrame()  # empty placeholder; reassigned once the first merge is done
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][3]

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID')#, step_merge
check_col(mdf)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

In [None]:
gdf_viewer(conflict_df, rows=5, un_val='ID', view=t) #conflict_df.ID.unique()

In [None]:
# Scratch comparison of plain pandas merges of df1 and df2.
# Shared columns, kept in df1's column order: a set intersection has no stable order
# between runs, which would make the multi-key merge below non-reproducible.
common_cols = [c for c in df1.columns if c in df2.columns]

test1 = df1.merge(df2, how='inner', on='ID')  # IDs present in both files
merged_all = df1.merge(df2, how='outer', on='ID', indicator=True)
# rows whose ID appears in only one of the two files
test2 = merged_all.loc[merged_all['_merge'].isin(['left_only', 'right_only'])]
test3 = test1.merge(test2, how='outer', on='ID')
test4 = df1.merge(df2, how='outer', on=common_cols)
print((len(test1), len(test2), len(test3)))
gdf_viewer(test4)

In [None]:
lithologies = mdf.copy() #saving

In [None]:
file1= files_dict[key][2]
file2= files_dict[key][4]

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID')
check_col(mdf)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

##### Lithologies merging 

In [None]:
lithologies, conflict_df=gdf_merger(lithologies, mdf, how=how[1], on='ID')
# Check the frame this merge just produced; `mdf` was already checked right after its own merge.
check_col(lithologies)

In [None]:
gdf_viewer(lithologies, rows=10, cols=15, un_val='ID', view=t)

In [None]:
file1= files_dict[key][5]
file2= files_dict[key][6]

df1, df2 = create_df([file1, file2])
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
gdf_viewer(df1.merge(df2, how='inner', on='ID'), rows=5, cols=15, un_val='ID', view=t)

In [None]:
gdf_viewer(df1.merge(df2, how = 'outer', on='ID',indicator=True), rows=5, cols=15, un_val='ID', view=t)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

In [None]:
mdf, conflict_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(conflict_df, rows=5, un_val='ID', view=t) #conflict_df.ID.unique()

##### Lithologies merging 

In [None]:
lithologies, conflict_df=gdf_merger(lithologies, mdf, how=how[1], on='ID', dist_max=1)
# Check the frame this merge just produced; `mdf` was already checked right after its own merge.
check_col(lithologies)

In [None]:
gdf_viewer(lithologies, rows=3, un_val='ID', view=t)

In [None]:
stop