# Data Gathering

In [1]:
from utils.io import dataframe_viewer, files_search, data_merger, data_validation, data_overview, \
data_filter, fix_duplicates

import re, os
import numpy as np
import pandas as pd
import datetime as dtm
from definitions import ROOT_DIR

In [2]:
def create_df(files, verbose=True): # find another name for this function
    """
    Read each CSV file into a DataFrame and optionally report whether it has coordinates.

    Parameters
    ----------
    files : list of str
        Paths of the comma-delimited files to read.
    verbose : bool, default True
        If True, print one line per file telling whether an 'X' column is present.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per entry of ``files``, in the same order.
    """
    dfs = []
    for i, path in enumerate(files, start=1):
        df = pd.read_csv(path, delimiter=',')
        dfs.append(df)

        if verbose:
            # The presence of an 'X' column is what marks a file as holding positions.
            msg = ' --> Coordinates' if 'X' in df.columns else ' --> No coordinates'
            # Number the label so each line matches the returned frame
            # (it was previously hard-coded to "df1" for every file).
            print(f"df{i} : {msg}")

    return dfs

## Reading files

In [3]:
# Input folder that is searched for files, and output folder for the merged result.
# Both keep a trailing slash because later cells append file names by plain string
# concatenation (e.g. `work_dir + '...csv'`, `save_dir + save_file`).
work_dir = ROOT_DIR+'/CF_data/Result_traitem/'
save_dir = ROOT_DIR+'/CF_data/Donnees_fusionnees/'

In [4]:
# Object categories used to retrieve the matching files (the key names matter!).
# Every category starts with the placeholder value 0.
files_dict = dict.fromkeys(
    ['Borehole', 'Piezometer', 'Piezair', 'Trench', 'Litho', 'Equipm',
     'Measure', 'Sample', 'Analysis', 'Facility'],
    0,
)

In [5]:
files_search(work_dir, files_dict, prefix='', skip='source')

Borehole  	:  7
Piezometer  	:  17
Piezair  	:  2
Trench  	:  1
Litho  	:  7
Equipm  	:  3
Measure  	:  6
Sample  	:  27
Analysis  	:  21
Facility  	:  4


In [6]:
# Join types passed as the `how` argument of data_merger; the merge cells below use how[1] ('outer').
how=['inner', 'outer', 'left', 'right']

In [7]:
# Short boolean aliases; `t` is passed as the `view` flag of dataframe_viewer below.
t, f = True, False

# PIEZOMETERS PROCESSING

In [8]:
# Object category handled in this section and the file name of its merged output.
key = 'Piezometer'
save_file = 'Merged_Piezometers.csv'
# Columns of interest for this object type.
coi = ['ID', 'ID_date', 'X', 'Y', 'Z', 'Type', 'Long_for', 'Diam_for', 'Refus']
# Empty placeholder; it is replaced by the first merged frame further down.
dataset = pd.DataFrame()
print(f"{len(files_dict[key])} files")

17 files


In [9]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Liste_XY/Sol_Eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Memoris_seafile/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Donnees_piezos_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Drains_Pz_ENEL_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Profils_sol_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Result_sol_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Donnees_forage_

In [10]:
data_overview(files_dict[key])

Same files:[(7, 8)]
Files with coordinates:[0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 15, 16]
Files without coordinates:[6, 10, 11, 12, 14]


#### $\color{green}{\textbf{Read and merge}}$

In [11]:
# Trailing numbers are the positions of the files in files_dict[key].
file1 = work_dir + 'Phase_1_Memoris/Result_eau_Piezometers.csv' # 2
file2 = work_dir + 'Memoris_seafile/Result_eau_Piezometers.csv' # 1

df1, df2 = create_df([file1, file2])
# One viewer call per statement: joining both calls in a tuple expression made the
# cell echo "(None, None)" as its result.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 14, columns : 10, Unique values on col 'ID': 14


interactive(children=(IntSlider(value=3, description='rows', max=14, min=3, readout=False), IntSlider(value=10…

Rows : 30, columns : 9, Unique values on col 'ID': 30


interactive(children=(IntSlider(value=3, description='rows', max=30, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [12]:
fix_duplicates(df1, df2, drop_old_id=True)

14 duplicate objects fixed!


In [13]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Save the first object dataset

In [14]:
dataset = mdf.copy() #saving

In [15]:
dataframe_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 30, columns : 11, Unique values on col 'ID': 30


interactive(children=(IntSlider(value=3, description='rows', max=30, min=3, readout=False), IntSlider(value=11…

#### $\color{green}{\textbf{Read and merge}}$

In [16]:
# Trailing numbers are the positions of the files in files_dict[key].
file1 = work_dir + 'database_Memoris3/Donnees_piezos_Piezometers.csv' # 4
file2 = work_dir + 'Liste_XY/Sol_Eau_Piezometers.csv' # 0

df1, df2 = create_df([file1, file2])
# One viewer call per statement: joining both calls in a tuple expression made the
# cell echo "(None, None)" as its result.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 117, columns : 13, Unique values on col 'ID': 117


interactive(children=(IntSlider(value=3, description='rows', max=117, min=3, readout=False), IntSlider(value=1…

Rows : 257, columns : 6, Unique values on col 'ID': 254


interactive(children=(IntSlider(value=3, description='rows', max=257, min=3, readout=False), IntSlider(value=6…

(None, None)

In [17]:
df1, check1 = data_filter(df1, position=True, id_col='ID', expression='sup|prof', dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
same objects at indices:[61, 64, 66, 69, 71, 73, 107, 114, 112, 115], will be dropped if drop is set True!
Rows : 107 ; Columns : 14 ; Unique on 'ID' : 104 ; 


In [18]:
df2, check2 = data_filter(df2, position=True, id_col='ID', expression='sup|prof', dist_max=1, drop=True)

some data must be checked !
same objects at indices:[0, 7, 9, 11, 32, 112], will be dropped if drop is set True!
Rows : 251 ; Columns : 7 ; Unique on 'ID' : 245 ; 


In [19]:
df2

Unnamed: 0,ID,Origin_ID,Nappe,X,Y,Type_ech,Type
0,160,160sup,Socle,152395.000,122839.000,Eau,Piezo
1,502,502,Socle,152365.000,122855.000,Eau,Piezo
2,502,502,Alluvions,152366.396,122857.132,Eau,Piezo
3,508,508,Socle,152467.000,122850.000,Eau,Piezo
4,512,512,Socle,152428.000,122766.000,Eau,Piezo
...,...,...,...,...,...,...,...
246,520,520,,152644.000,122791.000,Sol,Piezo
247,524,524,,152570.000,122789.000,Sol,Piezo
248,525,525,,152548.000,122783.000,Sol,Piezo
249,526,526,,152553.000,122757.000,Sol,Piezo


In [20]:
fix_duplicates(df1, df2)

3 duplicate objects fixed!


In [21]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [22]:
conflict_df

Unnamed: 0,Check_col,ID,Nappe_x,Nappe_y,Origin_ID_x,Origin_ID_y
8,"Nappe, Origin_ID",59,Remblai_All,remblais,59,P59
37,Origin_ID,186,Remblais,,186,503
61,"Nappe, Origin_ID",FP49,Remblai_All,Socle,FP 49 SUP,FP49sup
65,Nappe,FP14,Remblais,Alluvions,FP14sup,FP14sup
68,"Nappe, Origin_ID",FP63,Remblais,Socle,FP63 sup,FP63sup
69,"Nappe, Origin_ID",FP76,Remblais,Socle,FP76 sup,FP76sup
71,Nappe,501,Remblai_All,remblais,501,501
73,Nappe,509,All_limoneuse,Alluvions,509,509
74,Nappe,510,All_limoneuse,Alluvions,510,510
75,Nappe,511,Remblai_All,remblais,511,511


In [23]:
# valid_dict pairs a conflict column ('Nappe_y') with the conflict_df index labels it is
# applied to: here the first 18 conflict rows (the count is hard-coded).
# presumably data_validation keeps the 'Nappe_y' value for those rows — TODO confirm in utils.io
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Nappe_y':list(conflict_df.index)[:18]})

Validation done, but conflicts remain!


In [24]:
conflict_df

Unnamed: 0,Check_col,ID,Nappe_x,Nappe_y,Origin_ID_x,Origin_ID_y
8,Origin_ID,59,Done,Done,59,P59
61,Origin_ID,FP49,Done,Done,FP 49 SUP,FP49sup
68,Origin_ID,FP63,Done,Done,FP63 sup,FP63sup
69,Origin_ID,FP76,Done,Done,FP76 sup,FP76sup
103,Nappe,539,All_limoneuse,Alluvions,539,539
105,Nappe,533,All_limoneuses_graveleuses,Alluvions,533,533
106,"Nappe, Origin_ID",502,All_limoneuses_graveleuses,Socle,502sup,502
108,"Nappe, Origin_ID",512,Remblai_All,Socle,512sup,512
111,"Nappe, Origin_ID",595,All_limoneuses_graveleuses,Alluvions,595sup Al,595
112,"Nappe, Origin_ID",540,Remblais,Alluvions,540sup R,540


#### Merge with object dataset

In [25]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [26]:
# If a 'level_0' column is present, it becomes the new 'index' column;
# an older 'index' column, when there is one, is discarded first.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [27]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 363, columns : 21, Unique values on col 'ID': 346


interactive(children=(IntSlider(value=10, description='rows', max=363, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [28]:
# Trailing numbers are the positions of the files in files_dict[key].
file1 = work_dir + 'Phase_2_Memoris/Result_eau_Piezometers.csv' # 3
file2 = work_dir + 'database_Memoris3/Drains_Pz_ENEL_Piezometers.csv' # 5

df1, df2 = create_df([file1, file2])
# One viewer call per statement: joining both calls in a tuple expression made the
# cell echo "(None, None)" as its result.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 10, columns : 9, Unique values on col 'ID': 10


interactive(children=(IntSlider(value=3, description='rows', max=10, min=3, readout=False), IntSlider(value=9,…

Rows : 6, columns : 6, Unique values on col 'ID': 6


interactive(children=(IntSlider(value=3, description='rows', max=6, min=3, readout=False), IntSlider(value=6, …

(None, None)

In [29]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [30]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [31]:
# If a 'level_0' column is present, it becomes the new 'index' column;
# an older 'index' column, when there is one, is discarded first.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [32]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 374, columns : 22, Unique values on col 'ID': 355


interactive(children=(IntSlider(value=10, description='rows', max=374, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [33]:
# Trailing numbers are the positions of the files in files_dict[key].
file1 = work_dir + 'donnees_terrain_2019/Donnees_forage_Piezometers.csv' # 9
file2 = work_dir + 'database_Memoris3/Result_eau_Piezometers.csv' # 7

df1, df2 = create_df([file1, file2])
# One viewer call per statement: joining both calls in a tuple expression made the
# cell echo "(None, None)" as its result.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 3, columns : 18, Unique values on col 'ID': 3


interactive(children=(IntSlider(value=3, description='rows', max=3, min=3, readout=False), IntSlider(value=12,…

Rows : 117, columns : 13, Unique values on col 'ID': 117


interactive(children=(IntSlider(value=3, description='rows', max=117, min=3, readout=False), IntSlider(value=1…

(None, None)

In [34]:
df1.ID = df1.ID.astype(str)
df2.ID = df2.ID.astype(str)

In [35]:
df2, check = data_filter(df2, position=True, id_col='ID', expression='sup|prof', dist_max=1, drop=True, drop_old_id=True)

same objects at indices:[61, 64, 66, 69, 71, 73, 107, 114, 112, 115], will be dropped if drop is set True!
Rows : 107 ; Columns : 14 ; Unique on 'ID' : 104 ; 


In [36]:
dataframe_viewer(df2, rows=10, un_val='ID', view=t)

Rows : 107, columns : 14, Unique values on col 'ID': 104


interactive(children=(IntSlider(value=10, description='rows', max=107, min=10, readout=False), IntSlider(value…

In [37]:
fix_duplicates(df1, df2)

0 duplicate objects fixed!


In [38]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [39]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [40]:
conflict_df

Unnamed: 0,Check_col,ID,Nappe_x,Nappe_y,Origin_ID_x,Origin_ID_y,Long_crep_x,Long_crep_y,Long_pz_x,Long_pz_y,Zsol_x,Zsol_y,Diam_int_pz_x,Diam_int_pz_y
36,"Nappe, Origin_ID",59,remblais,Remblai_All,#conflict,59,5.8,5.8,6.944,6.944,101.96,101.96,25.0,25.0
65,Origin_ID,186,,Remblais,#conflict,186,9.0,9.0,9.051,9.051,109.881,109.881,25.0,25.0
89,"Nappe, Origin_ID",FP49,Socle,Remblai_All,#conflict,FP 49 SUP,5.0,5.0,,,,,41.0,41.0
93,"Nappe, Origin_ID",FP63,Alluvions,Remblais,#conflict,FP63 sup,3.0,3.0,3.357,3.357,102.803,102.803,41.0,41.0
94,Origin_ID,FP76,,Remblais,#conflict,FP76 sup,5.0,5.0,6.503,6.503,102.73,102.73,41.0,41.0
96,Nappe,501,Socle,Remblai_All,501,501,2.0,2.0,9.69,9.69,110.04,110.04,25.0,25.0
99,Nappe,510,remblais,All_limoneuse,510,510,3.0,3.0,5.4,5.4,104.05,104.05,41.0,41.0
101,Nappe,513,Alluvions,Remblai_All,513,513,3.0,3.0,7.1,7.1,104.69,104.69,41.0,41.0
103,Nappe,528,remblais,Remblai_All,528,528,2.0,2.0,3.67,3.67,103.5,103.5,41.0,41.0
105,Nappe,522,remblais,Remblai_All,522,522,2.0,2.0,2.98,2.98,103.07,103.07,41.0,41.0


In [41]:
print(list(conflict_df.index)[:18])

[36, 65, 89, 93, 94, 96, 99, 101, 103, 105, 117, 119, 125, 127, 128, 130, 132, 134]


In [42]:
# NOTE(review): conflict_df at this point comes from merging `dataset` with `mdf`
# (the "Merge with object dataset" cell above), yet validation is applied to `mdf`.
# Confirm whether `dataset` is the intended overall_data here.
# The [:18] slice selects the first 18 conflict rows; the count is hard-coded.
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Nappe_y':list(conflict_df.index)[:18]})

Validation done, but conflicts remain!


In [43]:
conflict_df

Unnamed: 0,Check_col,ID,Nappe_x,Nappe_y,Origin_ID_x,Origin_ID_y,Long_crep_x,Long_crep_y,Long_pz_x,Long_pz_y,Zsol_x,Zsol_y,Diam_int_pz_x,Diam_int_pz_y
36,Origin_ID,59,Done,Done,#conflict,59,5.8,5.8,6.944,6.944,101.96,101.96,25.0,25.0
89,Origin_ID,FP49,Done,Done,#conflict,FP 49 SUP,5.0,5.0,,,,,41.0,41.0
93,Origin_ID,FP63,Done,Done,#conflict,FP63 sup,3.0,3.0,3.357,3.357,102.803,102.803,41.0,41.0
119,"Long_crep, Long_pz, Origin_ID",541,Done,Done,541,541sup R,2.0,4.0,8.22,5.2,101.41,101.41,25.0,25.0
128,Origin_ID,502,Done,Done,#conflict,502sup,2.0,2.0,7.37,7.37,105.28,105.28,25.0,25.0
130,Origin_ID,512,Done,Done,#conflict,512sup,3.0,3.0,4.67,4.67,104.82,104.82,41.0,41.0
132,"Zsol, Diam_int_pz, Long_crep, Long_pz, Origin_ID",512,Done,Done,512,512sup,2.0,3.0,14.56,4.67,104.87,104.82,51.0,41.0
134,Origin_ID,595,Done,Done,#conflict,595sup Al,3.0,3.0,9.4,9.4,101.59,101.59,25.0,25.0
135,"Nappe, Origin_ID",540,#conflict,Remblais,#conflict,540sup R,5.0,5.0,7.76,7.76,106.91,106.91,25.0,25.0


In [44]:
# If a 'level_0' column is present, it becomes the new 'index' column;
# an older 'index' column, when there is one, is discarded first.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [45]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 411, columns : 31, Unique values on col 'ID': 358


interactive(children=(IntSlider(value=10, description='rows', max=411, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [46]:
# Trailing numbers are the positions of the files in files_dict[key].
file1 = work_dir + 'profils_sols_donnees_forages/donnees_forage_Piezometers.csv' # 13
file2 = work_dir + 'vUmons_logsFor/Analyse_eau_Phases1&2_Piezometers.csv' # 15

df1, df2 = create_df([file1, file2])
# One viewer call per statement: joining both calls in a tuple expression made the
# cell echo "(None, None)" as its result.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 12, columns : 18, Unique values on col 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=12, min=3, readout=False), IntSlider(value=12…

Rows : 29, columns : 7, Unique values on col 'ID': 29


interactive(children=(IntSlider(value=3, description='rows', max=29, min=3, readout=False), IntSlider(value=7,…

(None, None)

In [47]:
df1.ID = df1.ID.astype(str)
df2.ID = df2.ID.astype(str)

In [48]:
fix_duplicates(df1, df2, drop_old_id=True)

1 duplicate objects fixed!


In [49]:
mdf, conflict_df = data_merger(df1, df2, how=how[1], on='ID', dist_max=1, error_tol_dict={1:['X', 'Y'], 0.01:['Z']})

#### Merge with object dataset

In [50]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [51]:
# If a 'level_0' column is present, it becomes the new 'index' column;
# an older 'index' column, when there is one, is discarded first.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [52]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 431, columns : 33, Unique values on col 'ID': 378


interactive(children=(IntSlider(value=10, description='rows', max=431, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [53]:
file1= work_dir + 'vUmons_logsFor/Analyse_sol_Phases1&2_Piezometers.csv' # 16

df1 = pd.read_csv(file1)
dataframe_viewer(df1, rows=3, un_val='ID', view=t)

Rows : 59, columns : 7, Unique values on col 'ID': 32


interactive(children=(IntSlider(value=3, description='rows', max=59, min=3, readout=False), IntSlider(value=7,…

#### Merge with object dataset

In [54]:
dataset, conflict_df=data_merger(dataset, df1, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [55]:
conflict_df

Unnamed: 0,Check_col,ID,Long_for_x,Long_for_y
18,Long_for,P14M,3.2,2.8


In [56]:
# If a 'level_0' column is present, it becomes the new 'index' column;
# an older 'index' column, when there is one, is discarded first.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

## No position dataset

#### $\color{green}{\textbf{Read and merge}}$

In [57]:
# Positions, within files_dict[key], of the two files to pair up.
a, b = 14, 12
file1, file2 = files_dict[key][a], files_dict[key][b]
# Show both paths with the work_dir prefix stripped.
print(file1.replace(work_dir, ""), '||', file2.replace(work_dir, ""))

profils_sols_donnees_forages/piezometrie_Piezometers.csv || profils_sols_donnees_forages/Equipement_Piezometers.csv


In [58]:
# Trailing numbers are the positions of the files in files_dict[key].
file1 = work_dir + 'profils_sols_donnees_forages/piezometrie_Piezometers.csv' # 14
file2 = work_dir + 'profils_sols_donnees_forages/Equipement_Piezometers.csv' # 12

df1, df2 = create_df([file1, file2])
# One viewer call per statement: joining both calls in a tuple expression made the
# cell echo "(None, None)" as its result.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> No coordinates
Rows : 11, columns : 8, Unique values on col 'ID': 11


interactive(children=(IntSlider(value=3, description='rows', max=11, min=3, readout=False), IntSlider(value=8,…

Rows : 12, columns : 5, Unique values on col 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=12, min=3, readout=False), IntSlider(value=5,…

(None, None)

In [59]:
df1.ID = df1.ID.astype(str)
df2.ID = df2.ID.astype(str)

In [60]:
mdf, conflict_df = data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

In [61]:
no_pos_dataset = mdf.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [62]:
# Trailing numbers are the positions of the files in files_dict[key].
file1 = work_dir + 'donnees_terrain_2019/Equipement_Piezometers.csv' # 10
file2 = work_dir + 'donnees_terrain_2019/Log_Piezometers.csv' # 11

df1, df2 = create_df([file1, file2])
# One viewer call per statement: joining both calls in a tuple expression made the
# cell echo "(None, None)" as its result.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> No coordinates
Rows : 9, columns : 5, Unique values on col 'ID': 9


interactive(children=(IntSlider(value=3, description='rows', max=9, min=3, readout=False), IntSlider(value=5, …

Rows : 10, columns : 9, Unique values on col 'ID': 3


interactive(children=(IntSlider(value=3, description='rows', max=10, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [63]:
df1.ID = df1.ID.astype(str)
df2.ID = df2.ID.astype(str)

In [64]:
mdf, conflict_df = data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [65]:
no_pos_dataset, conflict_df=data_merger(no_pos_dataset, mdf, how=how[1], on='ID', dist_max=1)

#### $\color{green}{\textbf{Read and merge}}$

In [66]:
file1= work_dir + 'database_Memoris3/Profils_sol_Piezometers.csv' # 6

df1 = pd.read_csv(file1)
dataframe_viewer(df1, rows=3, un_val='ID', view=t)

Rows : 111, columns : 6, Unique values on col 'ID': 111


interactive(children=(IntSlider(value=3, description='rows', max=111, min=3, readout=False), IntSlider(value=6…

#### Merge with object dataset

In [67]:
no_pos_dataset, conflict_df=data_merger(no_pos_dataset, df1, how=how[1], on='ID', dist_max=1)

## Final merging

In [68]:
# Turn a leading 'P' in the IDs into 'F'. The result is assigned back to the column:
# an in-place replace on the Series returned by `dataset.ID` is a chained modification
# that does not reach the frame when pandas Copy-on-Write is active.
dataset['ID'] = dataset['ID'].replace('^P', 'F', regex=True)

In [69]:
# Remove the bookkeeping columns in a single call; errors='ignore' skips whichever
# of them is absent, so the frame is no longer modified while looping over its columns.
dataset.drop(columns=['index', 'Origin_ID'], errors='ignore', inplace=True)

In [70]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
same objects at indices:[4, 3, 12, 18, 457], will be dropped if drop is set True!
Rows : 460 ; Columns : 32 ; Unique on 'ID' : 364 ; 


In [71]:
dataframe_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 460, columns : 32, Unique values on col 'ID': 364


interactive(children=(IntSlider(value=3, description='rows', max=460, min=3, readout=False), IntSlider(value=1…

In [72]:
dataframe_viewer(no_pos_dataset, rows=3, un_val='ID', view=t)

Rows : 142, columns : 19, Unique values on col 'ID': 135


interactive(children=(IntSlider(value=3, description='rows', max=142, min=3, readout=False), IntSlider(value=1…

In [73]:
# Turn a leading 'P' in the IDs into 'F'. The result is assigned back to the column:
# an in-place replace on the Series returned by `no_pos_dataset.ID` is a chained
# modification that does not reach the frame when pandas Copy-on-Write is active.
no_pos_dataset['ID'] = no_pos_dataset['ID'].replace('^P', 'F', regex=True)

In [74]:
no_pos_dataset, check1 = data_filter(no_pos_dataset, position=False, id_col='ID', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
same objects at indices:[24, 25, 22, 28, 29, 30, 26], will be dropped if drop is set True!
Rows : 135 ; Columns : 20 ; Unique on 'ID' : 135 ; 


In [75]:
no_pos_dataset.drop(columns='Origin_ID', inplace=True)

#### Merge with object dataset

In [76]:
dataset, conflict_df=data_merger(dataset, no_pos_dataset, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [77]:
conflict_df

Unnamed: 0,Check_col,ID,Diam_ext_pz_x,Diam_ext_pz_y,Long_pz_x,Long_pz_y,Diam_for_x,Diam_for_y
3,Diam_ext_pz,F2M,53.0,45.0,6.0,,75.0,75.0
6,Diam_ext_pz,F4M,53.0,45.0,4.0,,75.0,75.0
10,Diam_ext_pz,F5M,53.0,45.0,6.0,,75.0,75.0
13,Diam_ext_pz,F6M,53.0,45.0,3.6,,75.0,75.0
16,Diam_ext_pz,F12M,53.0,45.0,3.5,,75.0,75.0
17,Long_pz,F13M,,45.0,3.36,4.04,,75.0
18,Long_pz,F13M,,45.0,3.36,4.04,,75.0
19,Long_pz,F13M,,45.0,3.36,4.04,,75.0
20,"Diam_ext_pz, Long_pz",F13M,53.0,45.0,3.5,4.04,75.0,75.0
22,Long_pz,F16M,,50.0,4.39,4.85,,108.0


In [78]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
same objects at indices:[1, 2, 0, 5, 4, 8, 9, 7, 12, 11, 15, 14, 18, 19, 17, 24, 22, 144, 150, 143, 155, 363, 373, 387, 420, 418, 424, 422, 430, 439, 441, 435], will be dropped if drop is set True!
Rows : 492 ; Columns : 42 ; Unique on 'ID' : 428 ; 


In [79]:
check1

Unnamed: 0,Check_col,Refus,Description,Keyword,Resp_chantier,Sous_zone,Type_ech,Type_refus,Method,Methode,Date_ouv,Zone,Societe,Type_equip,ID_2,Nappe,ID
1,"Refus, Description, Keyword, Resp_chantier, So...",,,,,,,,,,,,,Crepine,P2M,Remblais + limons,F2M
2,"Refus, Description, Keyword, Resp_chantier, So...",,,,,,,,,,,,,Crepine,P2M,Remblais + limons,F2M
3,"Refus, Description, Keyword, Resp_chantier, So...",,,,,,,,Dual tube,,2017-02-22,,ECOPLANNING sprl,,P2M,Remblais + limons,F2M
5,"Refus, Description, Keyword, Resp_chantier, So...",,,,,,,,,,,,,Crepine,P4M,Remblais,F4M
6,"Refus, Description, Keyword, Resp_chantier, So...",,,,,,,,Dual tube,,2017-02-22,,ECOPLANNING sprl,,P4M,Remblais,F4M
8,"Refus, Description, Keyword, Resp_chantier, So...",,,,,,,,,,,,,Crepine,P5M,Remblais + limons,F5M
9,"Refus, Description, Keyword, Resp_chantier, So...",,,,,,,,,,,,,Crepine,P5M,Remblais + limons,F5M
10,"Refus, Description, Keyword, Resp_chantier, So...",,,,,,,,Dual tube,,2017-02-22,,ECOPLANNING sprl,,P5M,Remblais + limons,F5M
12,"Refus, Description, Keyword, Resp_chantier, So...",,,,,,,,,,,,,Crepine,P6M,Remblais,F6M
13,"Refus, Description, Keyword, Resp_chantier, So...",,,,,,,,Dual tube,,2017-02-22,,ECOPLANNING sprl,,P6M,Remblais,F6M


In [80]:
dataframe_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 492, columns : 42, Unique values on col 'ID': 428


interactive(children=(IntSlider(value=3, description='rows', max=492, min=3, readout=False), IntSlider(value=1…

####  $\color{red}{\textbf{Save final object dataset}}$

In [82]:
# Remove the bookkeeping columns before saving; errors='ignore' skips whichever of
# them is absent, which replaces the loop and its redundant membership test.
dataset.drop(columns=['index', 'Origin_ID'], errors='ignore', inplace=True)

In [83]:
# exist_ok=True creates the folder only when it is missing, without a separate
# existence check beforehand.
os.makedirs(save_dir, exist_ok=True)

# Write the merged dataset without the DataFrame index column.
dataset.to_csv(os.path.join(save_dir, save_file), index=False)

## Querying

In [None]:
# Columns displayed by the query cell below, split into two lists that are concatenated there.
coi_1 = [
    'ID', 'Date_ouv', 'X', 'Y', 'Z', 'Zsol',
    'Type', 'Description', 'Diam_for', 'Long_for',
]
coi_2 = [
    'Nappe', 'Refus', 'Type_refus', 'Profondeur',
    'Long_pz', 'Diam_ext_pz', 'Diam_int_pz', 'Zone',
]

In [None]:
# Bounding box for the coordinate-based lookup (compared against the X / Y columns).
x_min, x_max = 152882, 152885
y_min, y_max =  122605, 122608
bh_name = 'F1.M' # regex
#bh_name = None

if bh_name is not None:
    # A name pattern takes precedence over the bounding box (same result as before, but
    # the spatial query is no longer computed and then thrown away). na=False treats a
    # missing ID as "no match" so it cannot break the boolean selection.
    q = dataset.query(f'ID.str.contains("{bh_name}", na=False)', engine='python')
else:
    q = dataset.query(f'X >= {x_min} and X <= {x_max} and Y >= {y_min} and Y <= {y_max}')
dataframe_viewer(q[coi_1 + coi_2], rows=10, cols=15, un_val='ID', view=t)

In [None]:
drop_dict = {1:10, 2:20, 3:30, 4:10}

In [None]:
# Key holding the smallest value (the first such key when several tie); min() with a
# key function avoids recomputing the minimum for every item.
min(drop_dict, key=drop_dict.get)