# Data Gathering

In [1]:
from utils.io import dataframe_viewer, files_search, data_merger, data_validation, data_overview, \
data_filter, fix_duplicates

import re, os
import numpy as np
import pandas as pd
import datetime as dtm
from definitions import ROOT_DIR

In [2]:
def create_df(files, verbose=True): # find another name for this function
    """
    Read each CSV file into a DataFrame and optionally report whether it
    contains position information.

    files: list of paths to comma-delimited CSV files
    verbose: if True, print one line per file telling whether an 'X' column
             is present (labelled df1, df2, ... in the order of `files`)
    return: list of DataFrames, in the same order as `files`
    """
    dfs = []
    for position, f in enumerate(files, start=1):
        df = pd.read_csv(f, delimiter=',')
        dfs.append(df)

        if verbose:
            # an 'X' column is the marker used here for "has coordinates"
            if 'X' in df.columns: msg = ' --> Coordinates'
            else: msg = ' --> No coordinates'

            # label by position so the lines for different files can be told apart
            print(f"df{position} : {msg}")

    return dfs

## Reading files

In [3]:
# Input folder scanned for the processed CSV files, and output folder for the merged result.
# Both keep their trailing slash because later cells concatenate file names directly onto them.
work_dir = f'{ROOT_DIR}/CF_data/Result_traitem/'
save_dir = f'{ROOT_DIR}/CF_data/Donnees_fusionnees/'

In [4]:
# Dictionary handed to files_search: one entry per object type, each starting at 0.
# The key names matter; the entries are later read as lists of file paths (see files_dict[key]).
files_dict = dict.fromkeys(
    ['Borehole', 'Piezometer', 'Piezair', 'Trench', 'Litho', 'Equipm',
     'Measure', 'Sample', 'Analysis', 'Facility'],
    0,
)

In [5]:
files_search(work_dir, files_dict, prefix='', skip='source')

Borehole  	:  7
Piezometer  	:  17
Piezair  	:  2
Trench  	:  1
Litho  	:  7
Equipm  	:  3
Measure  	:  6
Sample  	:  27
Analysis  	:  21
Facility  	:  4


In [6]:
# Join modes; later cells pick one by position (how[1]) and pass it as data_merger's `how` argument.
how = [
    'inner',
    'outer',
    'left',
    'right',
]

In [7]:
# Short boolean aliases; `t` is passed as the `view=` flag of dataframe_viewer in later cells.
f, t = False, True

# PIEZOMETERS PROCESSING

In [8]:
# Settings for the piezometer merge: the files_dict entry to process and the output file name.
key = 'Piezometer'
save_file = 'Merged_Piezometers.csv'
# columns of interest
coi = ['ID', 'ID_date', 'X', 'Y', 'Z', 'Type', 'Long_for', 'Diam_for', 'Refus']
# empty placeholder; replaced by the first merged frame further down
dataset = pd.DataFrame()
print(len(files_dict[key]), 'files')

17 files


In [9]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Liste_XY/Sol_Eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Memoris_seafile/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Donnees_piezos_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Drains_Pz_ENEL_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Profils_sol_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Result_sol_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Donnees_forage_

In [10]:
data_overview(files_dict[key])

Same files:[(7, 8)]
Files with coordinates:[0, 1, 2, 3, 4, 5, 7, 8, 9, 13, 15, 16]
Files without coordinates:[6, 10, 11, 12, 14]


#### $\color{green}{\textbf{Read and merge}}$

In [11]:
# Trailing numbers are the positions of the files in files_dict[key].
file1= work_dir + 'Phase_1_Memoris/Result_eau_Piezometers.csv' # 2
file2= work_dir + 'Memoris_seafile/Result_eau_Piezometers.csv' # 1  

df1, df2 = create_df([file1, file2])
# Call the viewer once per statement: wrapping both calls in a tuple expression
# made the cell display a meaningless `(None, None)` result.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 14, columns : 10, Unique values on col 'ID': 14


interactive(children=(IntSlider(value=3, description='rows', max=14, min=3, readout=False), IntSlider(value=10…

Rows : 30, columns : 9, Unique values on col 'ID': 30


interactive(children=(IntSlider(value=3, description='rows', max=30, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [12]:
fix_duplicates(df1, df2, drop_old_id=True)

14 duplicate objects fixed!


In [13]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Save the first merged object dataset

In [14]:
dataset = mdf.copy() #saving

In [15]:
dataframe_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 30, columns : 11, Unique values on col 'ID': 30


interactive(children=(IntSlider(value=3, description='rows', max=30, min=3, readout=False), IntSlider(value=11…

#### $\color{green}{\textbf{Read and merge}}$

In [16]:
file1= work_dir + 'database_Memoris3/Donnees_piezos_Piezometers.csv' # 4
file2= work_dir + 'Liste_XY/Sol_Eau_Piezometers.csv' # 0  


df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 117, columns : 13, Unique values on col 'ID': 117


interactive(children=(IntSlider(value=3, description='rows', max=117, min=3, readout=False), IntSlider(value=1…

Rows : 257, columns : 6, Unique values on col 'ID': 254


interactive(children=(IntSlider(value=3, description='rows', max=257, min=3, readout=False), IntSlider(value=6…

(None, None)

In [17]:
df1, check1 = data_filter(df1, position=True, id_col='ID', expression='sup|prof', dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
same objects at indices:[61, 64, 66, 69, 71, 73, 107, 114, 112, 115], will be dropped if drop is set True!
Rows : 107 ; Columns : 14 ; Unique on 'ID' : 104 ; 


In [18]:
df2, check2 = data_filter(df2, position=True, id_col='ID', expression='sup|prof', dist_max=1, drop=True)

some data must be checked !
same objects at indices:[0, 7, 9, 11, 32, 112], will be dropped if drop is set True!
Rows : 251 ; Columns : 7 ; Unique on 'ID' : 245 ; 


In [19]:
df2

Unnamed: 0,ID,Origin_ID,Nappe,X,Y,Type_ech,Type
0,160,160sup,Socle,152395.000,122839.000,Eau,Piezo
1,502,502,Socle,152365.000,122855.000,Eau,Piezo
2,502,502,Alluvions,152366.396,122857.132,Eau,Piezo
3,508,508,Socle,152467.000,122850.000,Eau,Piezo
4,512,512,Socle,152428.000,122766.000,Eau,Piezo
...,...,...,...,...,...,...,...
246,520,520,,152644.000,122791.000,Sol,Piezo
247,524,524,,152570.000,122789.000,Sol,Piezo
248,525,525,,152548.000,122783.000,Sol,Piezo
249,526,526,,152553.000,122757.000,Sol,Piezo


In [20]:
fix_duplicates(df1, df2)

3 duplicate objects fixed!


In [21]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [22]:
conflict_df

Unnamed: 0,Check_col,ID,Origin_ID_x,Origin_ID_y,Nappe_x,Nappe_y
8,"Origin_ID, Nappe",59,59,P59,Remblai_All,remblais
37,Origin_ID,186,186,503,Remblais,
61,"Origin_ID, Nappe",FP49,FP 49 SUP,FP49sup,Remblai_All,Socle
65,Nappe,FP14,FP14sup,FP14sup,Remblais,Alluvions
68,"Origin_ID, Nappe",FP63,FP63 sup,FP63sup,Remblais,Socle
69,"Origin_ID, Nappe",FP76,FP76 sup,FP76sup,Remblais,Socle
71,Nappe,501,501,501,Remblai_All,remblais
73,Nappe,509,509,509,All_limoneuse,Alluvions
74,Nappe,510,510,510,All_limoneuse,Alluvions
75,Nappe,511,511,511,Remblai_All,remblais


In [23]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Nappe_y':list(conflict_df.index)[:18]})

Validation done, but conflicts remain!


In [24]:
conflict_df

Unnamed: 0,Check_col,ID,Origin_ID_x,Origin_ID_y,Nappe_x,Nappe_y
8,Origin_ID,59,59,P59,Done,Done
61,Origin_ID,FP49,FP 49 SUP,FP49sup,Done,Done
68,Origin_ID,FP63,FP63 sup,FP63sup,Done,Done
69,Origin_ID,FP76,FP76 sup,FP76sup,Done,Done
103,Nappe,539,539,539,All_limoneuse,Alluvions
105,Nappe,533,533,533,All_limoneuses_graveleuses,Alluvions
106,"Origin_ID, Nappe",502,502sup,502,All_limoneuses_graveleuses,Socle
108,"Origin_ID, Nappe",512,512sup,512,Remblai_All,Socle
111,"Origin_ID, Nappe",595,595sup Al,595,All_limoneuses_graveleuses,Alluvions
112,"Origin_ID, Nappe",540,540sup R,540,Remblais,Alluvions


#### Merge with object dataset

In [25]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [26]:
# When the merge left a 'level_0' column, make it the 'index' column,
# discarding any previous 'index' column first.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [27]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 363, columns : 21, Unique values on col 'ID': 346


interactive(children=(IntSlider(value=10, description='rows', max=363, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [28]:
file1= work_dir + 'Phase_2_Memoris/Result_eau_Piezometers.csv' # 3
file2= work_dir + 'database_Memoris3/Drains_Pz_ENEL_Piezometers.csv' # 5  


df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 10, columns : 9, Unique values on col 'ID': 10


interactive(children=(IntSlider(value=3, description='rows', max=10, min=3, readout=False), IntSlider(value=9,…

Rows : 6, columns : 6, Unique values on col 'ID': 6


interactive(children=(IntSlider(value=3, description='rows', max=6, min=3, readout=False), IntSlider(value=6, …

(None, None)

In [29]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [30]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [31]:
# When the merge left a 'level_0' column, make it the 'index' column,
# discarding any previous 'index' column first.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [32]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 374, columns : 22, Unique values on col 'ID': 355


interactive(children=(IntSlider(value=10, description='rows', max=374, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [33]:
file1= work_dir + 'donnees_terrain_2019/Donnees_forage_Piezometers.csv' # 9
file2= work_dir + 'database_Memoris3/Result_eau_Piezometers.csv' # 7  

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 3, columns : 18, Unique values on col 'ID': 3


interactive(children=(IntSlider(value=3, description='rows', max=3, min=3, readout=False), IntSlider(value=12,…

Rows : 117, columns : 13, Unique values on col 'ID': 117


interactive(children=(IntSlider(value=3, description='rows', max=117, min=3, readout=False), IntSlider(value=1…

(None, None)

In [34]:
df1.ID = df1.ID.astype(str)
df2.ID = df2.ID.astype(str)

In [35]:
df2, check = data_filter(df2, position=True, id_col='ID', expression='sup|prof', dist_max=1, drop=True, drop_old_id=True)

same objects at indices:[61, 64, 66, 69, 71, 73, 107, 114, 112, 115], will be dropped if drop is set True!
Rows : 107 ; Columns : 14 ; Unique on 'ID' : 104 ; 


In [36]:
dataframe_viewer(df2, rows=10, un_val='ID', view=t)

Rows : 107, columns : 14, Unique values on col 'ID': 104


interactive(children=(IntSlider(value=10, description='rows', max=107, min=10, readout=False), IntSlider(value…

In [37]:
fix_duplicates(df1, df2)

0 duplicate objects fixed!


In [38]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [39]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [40]:
conflict_df

Unnamed: 0,Check_col,ID,Origin_ID_x,Origin_ID_y,Nappe_x,Nappe_y,Long_crep_x,Long_crep_y,Long_pz_x,Long_pz_y,Diam_int_pz_x,Diam_int_pz_y,Zsol_x,Zsol_y
36,"Origin_ID, Nappe",59,#conflict,59,remblais,Remblai_All,5.8,5.8,6.944,6.944,25.0,25.0,101.96,101.96
65,Origin_ID,186,#conflict,186,,Remblais,9.0,9.0,9.051,9.051,25.0,25.0,109.881,109.881
89,"Origin_ID, Nappe",FP49,#conflict,FP 49 SUP,Socle,Remblai_All,5.0,5.0,,,41.0,41.0,,
93,"Origin_ID, Nappe",FP63,#conflict,FP63 sup,Alluvions,Remblais,3.0,3.0,3.357,3.357,41.0,41.0,102.803,102.803
94,Origin_ID,FP76,#conflict,FP76 sup,,Remblais,5.0,5.0,6.503,6.503,41.0,41.0,102.73,102.73
96,Nappe,501,501,501,Socle,Remblai_All,2.0,2.0,9.69,9.69,25.0,25.0,110.04,110.04
99,Nappe,510,510,510,remblais,All_limoneuse,3.0,3.0,5.4,5.4,41.0,41.0,104.05,104.05
101,Nappe,513,513,513,Alluvions,Remblai_All,3.0,3.0,7.1,7.1,41.0,41.0,104.69,104.69
103,Nappe,528,528,528,remblais,Remblai_All,2.0,2.0,3.67,3.67,41.0,41.0,103.5,103.5
105,Nappe,522,522,522,remblais,Remblai_All,2.0,2.0,2.98,2.98,41.0,41.0,103.07,103.07


In [41]:
print(list(conflict_df.index)[:18])

[36, 65, 89, 93, 94, 96, 99, 101, 103, 105, 117, 119, 125, 127, 128, 130, 132, 134]


In [42]:
# Validate the 'Nappe_y' column for the first 18 rows of the conflict table.
# NOTE(review): this conflict_df was returned by data_merger(dataset, mdf, ...), yet
# overall_data is `mdf`, which is not merged again before being reassigned — confirm
# whether `dataset` is the frame that should receive the validated values.
# NOTE(review): the 18 is a hand-picked cut-off tied to the current conflict table.
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Nappe_y':list(conflict_df.index)[:18]})

Validation done, but conflicts remain!


In [43]:
conflict_df

Unnamed: 0,Check_col,ID,Origin_ID_x,Origin_ID_y,Nappe_x,Nappe_y,Long_crep_x,Long_crep_y,Long_pz_x,Long_pz_y,Diam_int_pz_x,Diam_int_pz_y,Zsol_x,Zsol_y
36,Origin_ID,59,#conflict,59,Done,Done,5.8,5.8,6.944,6.944,25.0,25.0,101.96,101.96
89,Origin_ID,FP49,#conflict,FP 49 SUP,Done,Done,5.0,5.0,,,41.0,41.0,,
93,Origin_ID,FP63,#conflict,FP63 sup,Done,Done,3.0,3.0,3.357,3.357,41.0,41.0,102.803,102.803
119,"Origin_ID, Long_crep, Long_pz",541,541,541sup R,Done,Done,2.0,4.0,8.22,5.2,25.0,25.0,101.41,101.41
128,Origin_ID,502,#conflict,502sup,Done,Done,2.0,2.0,7.37,7.37,25.0,25.0,105.28,105.28
130,Origin_ID,512,#conflict,512sup,Done,Done,3.0,3.0,4.67,4.67,41.0,41.0,104.82,104.82
132,"Origin_ID, Long_crep, Diam_int_pz, Zsol, Long_pz",512,512,512sup,Done,Done,2.0,3.0,14.56,4.67,51.0,41.0,104.87,104.82
134,Origin_ID,595,#conflict,595sup Al,Done,Done,3.0,3.0,9.4,9.4,25.0,25.0,101.59,101.59
135,"Origin_ID, Nappe",540,#conflict,540sup R,#conflict,Remblais,5.0,5.0,7.76,7.76,25.0,25.0,106.91,106.91


In [44]:
# When the merge left a 'level_0' column, make it the 'index' column,
# discarding any previous 'index' column first.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [45]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 411, columns : 31, Unique values on col 'ID': 358


interactive(children=(IntSlider(value=10, description='rows', max=411, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [46]:
file1= work_dir + 'profils_sols_donnees_forages/donnees_forage_Piezometers.csv' # 13
file2= work_dir + 'vUmons_logsFor/Analyse_eau_Phases1&2_Piezometers.csv' # 15  

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 12, columns : 18, Unique values on col 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=12, min=3, readout=False), IntSlider(value=12…

Rows : 29, columns : 7, Unique values on col 'ID': 29


interactive(children=(IntSlider(value=3, description='rows', max=29, min=3, readout=False), IntSlider(value=7,…

(None, None)

In [47]:
df1.ID = df1.ID.astype(str)
df2.ID = df2.ID.astype(str)

In [48]:
fix_duplicates(df1, df2, drop_old_id=True)

1 duplicate objects fixed!


In [49]:
mdf, conflict_df = data_merger(df1, df2, how=how[1], on='ID', dist_max=1, error_tol_dict={1:['X', 'Y'], 0.01:['Z']})

#### Merge with object dataset

In [50]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [51]:
# When the merge left a 'level_0' column, make it the 'index' column,
# discarding any previous 'index' column first.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [52]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 431, columns : 33, Unique values on col 'ID': 378


interactive(children=(IntSlider(value=10, description='rows', max=431, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [53]:
file1= work_dir + 'vUmons_logsFor/Analyse_sol_Phases1&2_Piezometers.csv' # 16

df1 = pd.read_csv(file1)
dataframe_viewer(df1, rows=3, un_val='ID', view=t)

Rows : 59, columns : 7, Unique values on col 'ID': 32


interactive(children=(IntSlider(value=3, description='rows', max=59, min=3, readout=False), IntSlider(value=7,…

#### Merge with object dataset

In [54]:
dataset, conflict_df=data_merger(dataset, df1, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [55]:
conflict_df

Unnamed: 0,Check_col,ID,Long_for_x,Long_for_y
18,Long_for,P14M,3.2,2.8


In [None]:
# Validate the 'Long_for_x' column for every row of the conflict table.
# NOTE(review): this cell has no execution count, so it was not run in the saved notebook.
# NOTE(review): conflict_df here was returned by data_merger(dataset, df1, ...), yet
# overall_data is `mdf` from an earlier merge — confirm whether `dataset` was intended.
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Long_for_x':list(conflict_df.index)})

In [56]:
# When the merge left a 'level_0' column, make it the 'index' column,
# discarding any previous 'index' column first.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

## Dataset without positions

#### $\color{green}{\textbf{Read and merge}}$

In [57]:
# Pick two files by their position in files_dict[key] and show their paths relative to work_dir.
a, b = 14, 12
file1, file2 = files_dict[key][a], files_dict[key][b]
print(file1.replace(work_dir,""),'||', file2.replace(work_dir,""))

profils_sols_donnees_forages/piezometrie_Piezometers.csv || profils_sols_donnees_forages/Equipement_Piezometers.csv


In [58]:
file1= work_dir + 'profils_sols_donnees_forages/piezometrie_Piezometers.csv' # 14
file2= work_dir + 'profils_sols_donnees_forages/Equipement_Piezometers.csv' # 12  

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> No coordinates
Rows : 11, columns : 8, Unique values on col 'ID': 11


interactive(children=(IntSlider(value=3, description='rows', max=11, min=3, readout=False), IntSlider(value=8,…

Rows : 12, columns : 5, Unique values on col 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=12, min=3, readout=False), IntSlider(value=5,…

(None, None)

In [59]:
df1.ID = df1.ID.astype(str)
df2.ID = df2.ID.astype(str)

In [60]:
mdf, conflict_df = data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

In [61]:
no_pos_dataset = mdf.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [62]:
file1= work_dir + 'donnees_terrain_2019/Equipement_Piezometers.csv' # 10
file2= work_dir + 'donnees_terrain_2019/Log_Piezometers.csv' # 11

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> No coordinates
Rows : 9, columns : 5, Unique values on col 'ID': 9


interactive(children=(IntSlider(value=3, description='rows', max=9, min=3, readout=False), IntSlider(value=5, …

Rows : 10, columns : 9, Unique values on col 'ID': 3


interactive(children=(IntSlider(value=3, description='rows', max=10, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [63]:
df1.ID = df1.ID.astype(str)
df2.ID = df2.ID.astype(str)

In [64]:
mdf, conflict_df = data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [65]:
no_pos_dataset, conflict_df=data_merger(no_pos_dataset, mdf, how=how[1], on='ID', dist_max=1)

#### $\color{green}{\textbf{Read and merge}}$

In [66]:
file1= work_dir + 'database_Memoris3/Profils_sol_Piezometers.csv' # 6

df1 = pd.read_csv(file1)
dataframe_viewer(df1, rows=3, un_val='ID', view=t)

Rows : 111, columns : 6, Unique values on col 'ID': 111


interactive(children=(IntSlider(value=3, description='rows', max=111, min=3, readout=False), IntSlider(value=6…

#### Merge with object dataset

In [67]:
no_pos_dataset, conflict_df=data_merger(no_pos_dataset, df1, how=how[1], on='ID', dist_max=1)

## Final merging

In [68]:
# Rewrite a leading 'P' in every ID to 'F'. The result is assigned back to the column:
# an in-place replace on the `dataset.ID` accessor is a chained in-place operation, which
# pandas warns about and which does not update `dataset` under Copy-on-Write.
dataset['ID'] = dataset['ID'].replace('^P','F', regex=True)

In [69]:
# Remove the bookkeeping columns in a single call instead of dropping while iterating
# over the columns; errors='ignore' skips whichever of them is not present.
dataset.drop(columns=['index', 'Origin_ID'], errors='ignore', inplace=True)

In [70]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
same objects at indices:[1, 2, 0, 4, 3, 6, 5, 8, 9, 7, 11, 10, 12, 14, 13, 16, 17, 15, 18, 21, 20, 26, 28, 30, 32, 37, 38, 35, 135, 141, 134, 146, 355, 365, 379, 399, 402, 412, 410, 415, 413, 420, 429, 431, 425, 457], will be dropped if drop is set True!
Rows : 419 ; Columns : 32 ; Unique on 'ID' : 364 ; 


In [71]:
dataframe_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 419, columns : 32, Unique values on col 'ID': 364


interactive(children=(IntSlider(value=3, description='rows', max=419, min=3, readout=False), IntSlider(value=1…

In [72]:
dataframe_viewer(no_pos_dataset, rows=3, un_val='ID', view=t)

Rows : 142, columns : 19, Unique values on col 'ID': 135


interactive(children=(IntSlider(value=3, description='rows', max=142, min=3, readout=False), IntSlider(value=1…

In [73]:
# Rewrite a leading 'P' in every ID to 'F'. The result is assigned back to the column:
# an in-place replace on the `no_pos_dataset.ID` accessor is a chained in-place operation,
# which pandas warns about and which does not update the frame under Copy-on-Write.
no_pos_dataset['ID'] = no_pos_dataset['ID'].replace('^P','F', regex=True)

In [74]:
no_pos_dataset, check1 = data_filter(no_pos_dataset, position=False, id_col='ID', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
same objects at indices:[24, 25, 22, 28, 29, 30, 26], will be dropped if drop is set True!
Rows : 135 ; Columns : 20 ; Unique on 'ID' : 135 ; 


In [75]:
no_pos_dataset.drop(columns='Origin_ID', inplace=True)

#### Merge with object dataset

In [76]:
dataset, conflict_df=data_merger(dataset, no_pos_dataset, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [77]:
conflict_df

Unnamed: 0,Check_col,ID,Diam_ext_pz_x,Diam_ext_pz_y,Long_pz_x,Long_pz_y,Diam_for_x,Diam_for_y
382,Diam_ext_pz,50,25.0,45.0,3.6,,75.0,75.0
383,Diam_ext_pz,51,25.0,45.0,3.6,,75.0,75.0
385,Diam_ext_pz,52,25.0,45.0,3.6,,75.0,75.0
397,Diam_ext_pz,F2M,53.0,45.0,6.0,,75.0,75.0
398,"Diam_ext_pz, Long_pz",F3M,53.0,45.0,3.3,2.98,75.0,75.0
399,Diam_ext_pz,F4M,53.0,45.0,4.0,,75.0,75.0
400,Diam_ext_pz,F5M,53.0,45.0,6.0,,75.0,75.0
401,Diam_ext_pz,F6M,53.0,45.0,3.6,,75.0,75.0
402,Diam_ext_pz,F11M,53.0,45.0,3.0,,75.0,75.0
403,Diam_ext_pz,F12M,53.0,45.0,3.5,,75.0,75.0


In [78]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
same objects at indices:[114, 116, 112, 393, 394, 390], will be dropped if drop is set True!
Rows : 477 ; Columns : 42 ; Unique on 'ID' : 428 ; 


In [79]:
check1

Unnamed: 0,Check_col,ID,Resp_chantier,Keyword,Type_refus,Type_equip,Refus,Sous_zone,Methode,ID_2,Type_ech,Description,Method,Zone,Nappe,Date_ouv,ID_date,Societe
82,"Resp_chantier, Keyword, Type_refus, Type_equip...",F7,,,,,,,,,Eau,,,,remblais,2010-03-15,2010-F7,SITEREM
84,"Resp_chantier, Keyword, Type_refus, Type_equip...",F8,,,,,,,,,Eau,,,,remblais,2010-03-15,2010-F8,SITEREM
113,"Resp_chantier, Keyword, Type_refus, Type_equip...",541,,,,,,four à coke,,,Eau,,,Cokerie de Marchienne,All_limoneuse,,,SITEREM
114,"Resp_chantier, Keyword, Type_refus, Type_equip...",541,,,,,,four à coke,,,Eau,,,Cokerie de Marchienne,All_limoneuse,,,SITEREM
116,"Resp_chantier, Keyword, Type_refus, Type_equip...",541,,,,,,four à coke,,,Eau,,,Cokerie de Marchienne,All_limoneuse,,,SITEREM
123,"Resp_chantier, Keyword, Type_refus, Type_equip...",512,,,,,,Extraction de charbon,,,Eau,,,Charbonnage,#conflict,,,SITEREM
392,"Resp_chantier, Keyword, Type_refus, Type_equip...",531,,,,,,Dépôt de stériles,,,Eau,,,Dépôts sidérurgiques et terrils,All_limoneuse,,,SITEREM
393,"Resp_chantier, Keyword, Type_refus, Type_equip...",531,,,,,,Dépôt de stériles,,,Eau,,,Dépôts sidérurgiques et terrils,All_limoneuse,,,SITEREM
394,"Resp_chantier, Keyword, Type_refus, Type_equip...",531,,,,,,Dépôt de stériles,,,Eau,,,Dépôts sidérurgiques et terrils,All_limoneuse,,,SITEREM
407,"Resp_chantier, Keyword, Type_refus, Type_equip...",F15bM,,,,,,,,P15bM,,,,,Remblais,,,


In [80]:
dataframe_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 477, columns : 42, Unique values on col 'ID': 428


interactive(children=(IntSlider(value=3, description='rows', max=477, min=3, readout=False), IntSlider(value=1…

####  $\color{red}{\textbf{Save final object dataset}}$

In [81]:
# Strip the bookkeeping columns before saving; absent ones are simply skipped.
dataset.drop(columns=['index', 'Origin_ID'], errors='ignore', inplace=True)

In [82]:
# Create the output folder if needed; exist_ok=True avoids the separate
# existence check (and the failure if the folder appears in between).
os.makedirs(save_dir, exist_ok=True)

# Write the merged dataset without the DataFrame index.
dataset.to_csv(save_dir + save_file, index=False)

## Querying