# Data Gathering

In [1]:
from utils.io import dataframe_viewer, files_search, data_merger, data_validation, data_overview, \
data_filter, fix_duplicates

import re, os
import numpy as np
import pandas as pd
import datetime as dtm
from definitions import ROOT_DIR

In [2]:
def create_df(files, verbose=True): # TODO: find another name for this function
    """
    Read each CSV file into a DataFrame and report whether it has coordinates.

    Parameters
    ----------
    files : list of str
        Paths of the comma-separated files to read, in order.
    verbose : bool, default True
        When True, print one line per file telling whether the frame has an
        'X' column (the marker used here for position information).

    Returns
    -------
    list of pandas.DataFrame
        One frame per input path, in the same order as `files`.
    """
    dfs = []
    # Number the frames from 1 so each report line matches the usual
    # `df1, df2 = create_df([...])` unpacking; the label used to be a
    # hard-coded "df1" for every file.
    for i, f in enumerate(files, start=1):
        df = pd.read_csv(f, delimiter=',')
        dfs.append(df)

        if verbose:
            if 'X' in df.columns: msg = ' --> Coordinates'
            else: msg = ' --> No coordinates'

            print(f"df{i} : {msg}")

    return dfs

## Reading files

In [3]:
# work_dir: folder the CSV files are read from; save_dir: folder the merged CSV is written to.
# Both keep their trailing '/' because later cells append file names by plain concatenation.
work_dir = f'{ROOT_DIR}/CF_data/Result_traitem/'
save_dir = f'{ROOT_DIR}/CF_data/Donnees_fusionnees/'

In [4]:
# Object categories to look for. The key names matter: the file search
# reports one entry per key and later cells index this dict by key.
# Every entry starts at the placeholder value 0.
files_dict = dict.fromkeys(
    ['Borehole', 'Piezometer', 'Piezair', 'Trench', 'Litho',
     'Equipm', 'Measure', 'Sample', 'Analysis', 'Facility'],
    0,
)

In [5]:
# Fill files_dict in place: after this call each key maps to the list of
# matching file paths found under work_dir (the per-key counts are printed).
# NOTE(review): the exact meaning of prefix='' and skip='source' lives in utils.io — confirm there.
files_search(work_dir, files_dict, prefix='', skip='source')

Borehole  	:  13
Piezometer  	:  18
Piezair  	:  3
Trench  	:  1
Litho  	:  12
Equipm  	:  7
Measure  	:  11
Sample  	:  35
Analysis  	:  44
Facility  	:  4


In [6]:
# Join types, selected by position in the data_merger calls (how[1] is 'outer').
how=['inner', 'outer', 'left', 'right']

In [7]:
# One-letter aliases for the booleans; the viewer calls pass `view=t`.
f = False
t = True

# PIEZOMETERS PROCESSING

In [8]:
# Settings for the piezometer pass.
key = 'Piezometer'                    # entry of files_dict handled in this section
save_file = 'Merged_Piezometers.csv'  # output file name, written under save_dir at the end
coi = ['ID', 'ID_date', 'X', 'Y', 'Z', 'Type', 'Long_for', 'Diam_for', 'Refus']  # columns of interest
dataset = pd.DataFrame()              # empty placeholder until the first merge result is stored
print(f"{len(files_dict[key])} files")

18 files


In [9]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Liste_XY/Sol_Eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Memoris_seafile/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Siterem_Ext_Pilote/Inorganic_major_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Donnees_piezos_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Drains_Pz_ENEL_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Profils_sol_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Result_eau_Piezometers.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Result_sol_P

In [10]:
# Summarise the piezometer files: the printed report lists identical files and
# the list positions of the files with / without coordinates.
data_overview(files_dict[key])

Same files:[(4, 10)]
Files with coordinates:[0, 1, 2, 3, 4, 5, 6, 8, 9, 10, 14, 16, 17]
Files without coordinates:[7, 11, 12, 13, 15]


#### $\color{green}{\textbf{Read and merge}}$

In [11]:
# First pair of files; the trailing number is the file's position in files_dict[key].
file1= work_dir + 'Phase_1_Memoris/Result_eau_Piezometers.csv' # 2
file2= work_dir + 'Memoris_seafile/Result_eau_Piezometers.csv' # 1

df1, df2 = create_df([file1, file2])
# Written as one tuple expression, so the cell also echoes (None, None) after the two viewers.
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 14, columns : 10, Unique values on col 'ID': 14


interactive(children=(IntSlider(value=3, description='rows', max=14, min=3, readout=False), IntSlider(value=10…

Rows : 30, columns : 9, Unique values on col 'ID': 30


interactive(children=(IntSlider(value=3, description='rows', max=30, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [12]:
# Prints how many duplicate objects were fixed between the two frames.
# NOTE(review): the return value is not captured, so any change presumably
# happens in place on df1/df2 — confirm in utils.io.
fix_duplicates(df1, df2, drop_old_id=True)

0 duplicate objects fixed!


In [13]:
# Merge the two frames on 'ID' with an outer join (how[1] == 'outer').
# Returns the merged frame and a table of conflicting values.
# NOTE(review): dist_max=1 is presumably a distance tolerance on positions — confirm in utils.io.
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### First object dataset save

In [14]:
dataset = mdf.copy() #saving

In [15]:
dataframe_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 42, columns : 11, Unique values on col 'ID': 42


interactive(children=(IntSlider(value=3, description='rows', max=42, min=3, readout=False), IntSlider(value=11…

#### $\color{green}{\textbf{Read and merge}}$

In [16]:
# Trailing numbers are positions in files_dict[key]. The first one was stale:
# in the listing, Donnees_piezos is at position 5, not 4.
file1= work_dir + 'database_Memoris3/Donnees_piezos_Piezometers.csv' # 5
file2= work_dir + 'Liste_XY/Sol_Eau_Piezometers.csv' # 0


df1, df2 = create_df([file1, file2])
# Written as one tuple expression, so the cell also echoes (None, None) after the two viewers.
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 117, columns : 13, Unique values on col 'ID': 117


interactive(children=(IntSlider(value=3, description='rows', max=117, min=3, readout=False), IntSlider(value=1…

Rows : 257, columns : 6, Unique values on col 'ID': 254


interactive(children=(IntSlider(value=3, description='rows', max=257, min=3, readout=False), IntSlider(value=6…

(None, None)

In [17]:
# Filter df1; returns the filtered frame and a table of rows to review
# ("some data must be checked !" is printed when that table is not empty — TODO confirm).
# NOTE(review): 'sup|prof' looks like a regex applied to the IDs and dist_max a
# position tolerance; both are defined in utils.io — confirm there.
df1, check1 = data_filter(df1, position=True, id_col='ID', expression='sup|prof', dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
Rows : 117 ; Columns : 13 ; Unique on 'ID' : 104 ; 


In [18]:
df2, check2 = data_filter(df2, position=True, id_col='ID', expression='sup|prof', dist_max=1, drop=True)

some data must be checked !
Rows : 257 ; Columns : 6 ; Unique on 'ID' : 245 ; 


In [19]:
df2

Unnamed: 0,ID,Nappe,X,Y,Type_ech,Type
0,160,Socle,152395.000,122839.000,Eau,Piezo
1,160,Socle,152395.000,122839.000,Eau,Piezo
2,502,Socle,152365.000,122855.000,Eau,Piezo
3,502,Alluvions,152366.396,122857.132,Eau,Piezo
4,508,Socle,152467.000,122850.000,Eau,Piezo
...,...,...,...,...,...,...
252,520,,152644.000,122791.000,Sol,Piezo
253,524,,152570.000,122789.000,Sol,Piezo
254,525,,152548.000,122783.000,Sol,Piezo
255,526,,152553.000,122757.000,Sol,Piezo


In [20]:
fix_duplicates(df1, df2)

0 duplicate objects fixed!


In [21]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [22]:
conflict_df

Unnamed: 0,Check_col,ID,Nappe_x,Nappe_y
64,Nappe,FP49,Remblai_All,Socle
65,Nappe,FP49,Remblai_All,Socle
72,Nappe,FP14,All_limoneuses_graveleuses,Alluvions
73,Nappe,FP14,All_limoneuses_graveleuses,Alluvions
74,Nappe,FP14,Remblais,Alluvions
75,Nappe,FP14,Remblais,Alluvions
81,Nappe,FP63,Remblais,Socle
82,Nappe,FP63,Remblais,Socle
85,Nappe,FP76,Remblais,Socle
86,Nappe,FP76,Remblais,Socle


In [23]:
# Resolve part of the conflicts found while building `mdf`: valid_dict names the
# column to trust ('Nappe_y') and the conflict_df row labels it applies to.
# Only the first 18 labels are passed, so the remaining rows stay flagged
# (the output reports that conflicts remain).
# NOTE(review): 18 is a hand-picked cut-off with no recorded rationale — confirm.
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Nappe_y':list(conflict_df.index)[:18]})

Validation done, but conflicts remain!


In [24]:
conflict_df

Unnamed: 0,Check_col,ID,Nappe_x,Nappe_y
110,Nappe,537,All_limoneuse,Alluvions
111,Nappe,536,All_limoneuse,Alluvions
112,Nappe,543,All_limoneuse,Alluvions
122,Nappe,539,All_limoneuse,Alluvions
124,Nappe,533,All_limoneuses_graveleuses,Alluvions
127,Nappe,502,All_limoneuses_graveleuses,Socle
129,Nappe,512,Remblai_All,Socle
132,Nappe,595,Socle,Alluvions
133,Nappe,595,All_limoneuses_graveleuses,Alluvions
134,Nappe,595,Remblais,Alluvions


#### Merge with object dataset

In [25]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [26]:
# When a 'level_0' column is present, it replaces any existing 'index' column:
# drop the old 'index' (if there is one), then rename 'level_0' to 'index'.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [27]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 409, columns : 20, Unique values on col 'ID': 360


interactive(children=(IntSlider(value=10, description='rows', max=409, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [28]:
# Trailing numbers are positions in files_dict[key]. The second one was stale:
# in the listing, Drains_Pz_ENEL is at position 6, not 5.
file1= work_dir + 'Phase_2_Memoris/Result_eau_Piezometers.csv' # 3
file2= work_dir + 'database_Memoris3/Drains_Pz_ENEL_Piezometers.csv' # 6


df1, df2 = create_df([file1, file2])
# Written as one tuple expression, so the cell also echoes (None, None) after the two viewers.
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 10, columns : 9, Unique values on col 'ID': 10


interactive(children=(IntSlider(value=3, description='rows', max=10, min=3, readout=False), IntSlider(value=9,…

Rows : 6, columns : 6, Unique values on col 'ID': 6


interactive(children=(IntSlider(value=3, description='rows', max=6, min=3, readout=False), IntSlider(value=6, …

(None, None)

In [29]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [30]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [31]:
# When a 'level_0' column is present, it replaces any existing 'index' column:
# drop the old 'index' (if there is one), then rename 'level_0' to 'index'.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [32]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 420, columns : 21, Unique values on col 'ID': 369


interactive(children=(IntSlider(value=10, description='rows', max=420, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [33]:
# Trailing numbers are positions in files_dict[key]; both were stale.
# Result_eau (database_Memoris3) is at position 8 in the listing. Position 9 there
# is another database_Memoris3 file, so Donnees_forage is presumably 10 — TODO confirm.
file1= work_dir + 'donnees_terrain_2019/Donnees_forage_Piezometers.csv' # 10 (presumably)
file2= work_dir + 'database_Memoris3/Result_eau_Piezometers.csv' # 8

df1, df2 = create_df([file1, file2])
# Written as one tuple expression, so the cell also echoes (None, None) after the two viewers.
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 3, columns : 18, Unique values on col 'ID': 3


interactive(children=(IntSlider(value=3, description='rows', max=3, min=3, readout=False), IntSlider(value=12,…

Rows : 117, columns : 13, Unique values on col 'ID': 117


interactive(children=(IntSlider(value=3, description='rows', max=117, min=3, readout=False), IntSlider(value=1…

(None, None)

In [34]:
# Cast the 'ID' column of both frames to str before they are merged on 'ID'.
df1['ID'] = df1['ID'].astype(str)
df2['ID'] = df2['ID'].astype(str)

In [35]:
df2, check = data_filter(df2, position=True, id_col='ID', expression='sup|prof', dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
Rows : 117 ; Columns : 13 ; Unique on 'ID' : 104 ; 


In [36]:
dataframe_viewer(df2, rows=10, un_val='ID', view=t)

Rows : 117, columns : 13, Unique values on col 'ID': 104


interactive(children=(IntSlider(value=10, description='rows', max=117, min=10, readout=False), IntSlider(value…

In [37]:
fix_duplicates(df1, df2)

0 duplicate objects fixed!


In [38]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [39]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [40]:
conflict_df

Unnamed: 0,Check_col,ID,Nappe_x,Nappe_y,Long_crep_x,Long_crep_y,Long_pz_x,Long_pz_y,Diam_int_pz_x,Diam_int_pz_y,Zsol_x,Zsol_y
101,"Nappe, Long_crep",FP49,Socle,Remblai_All,8.0,5.0,,,41.0,41.0,,
103,"Nappe, Long_crep",FP49,Socle,Remblai_All,8.0,5.0,,,41.0,41.0,,
104,Long_crep,FP49,,Socle,5.0,8.0,,,41.0,41.0,,
106,Long_crep,FP49,Socle,Socle,5.0,8.0,,,41.0,41.0,,
107,Nappe,FP49,Socle,Remblai_All,5.0,5.0,,,41.0,41.0,,
114,"Long_pz, Long_crep, Diam_int_pz",FP14,,Remblais,2.0,4.0,7.225,4.301,41.0,25.0,102.32,102.32
116,"Long_pz, Long_crep, Diam_int_pz",FP14,,Remblais,2.0,4.0,7.225,4.301,41.0,25.0,102.32,102.32
117,"Long_pz, Long_crep, Diam_int_pz",FP14,,All_limoneuses_graveleuses,4.0,2.0,4.301,7.225,25.0,41.0,102.32,102.32
119,"Long_pz, Long_crep, Diam_int_pz",FP14,,All_limoneuses_graveleuses,4.0,2.0,4.301,7.225,25.0,41.0,102.32,102.32
122,"Nappe, Long_pz, Long_crep, Diam_int_pz",FP160,Socle,Remblai_All,5.0,4.0,14.222,4.793,41.0,25.0,103.81,103.81


In [41]:
print(list(conflict_df.index)[:18])

[101, 103, 104, 106, 107, 114, 116, 117, 119, 122, 123, 126, 128, 129, 130, 131, 132, 134]


In [42]:
# The conflicts in conflict_df were produced by merging `mdf` INTO `dataset`, so
# the resolved values must be applied to `dataset`, the result of that merge.
# Passing the intermediate `mdf` (as before) leaves the accumulated table unresolved.
# valid_dict names the column to trust ('Nappe_y') and the conflict rows it applies to;
# only the first 18 row labels are validated here, the others stay flagged.
# NOTE(review): assumes data_validation writes the chosen values into overall_data — confirm in utils.io.
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',
                valid_dict={'Nappe_y':list(conflict_df.index)[:18]})

Validation done, but conflicts remain!


In [43]:
conflict_df

Unnamed: 0,Check_col,ID,Nappe_x,Nappe_y,Long_crep_x,Long_crep_y,Long_pz_x,Long_pz_y,Diam_int_pz_x,Diam_int_pz_y,Zsol_x,Zsol_y
101,Long_crep,FP49,Done,Done,8.0,5.0,,,41.0,41.0,,
103,Long_crep,FP49,Done,Done,8.0,5.0,,,41.0,41.0,,
104,Long_crep,FP49,Done,Done,5.0,8.0,,,41.0,41.0,,
106,Long_crep,FP49,Done,Done,5.0,8.0,,,41.0,41.0,,
114,"Long_pz, Long_crep, Diam_int_pz",FP14,Done,Done,2.0,4.0,7.225,4.301,41.0,25.0,102.32,102.32
116,"Long_pz, Long_crep, Diam_int_pz",FP14,Done,Done,2.0,4.0,7.225,4.301,41.0,25.0,102.32,102.32
117,"Long_pz, Long_crep, Diam_int_pz",FP14,Done,Done,4.0,2.0,4.301,7.225,25.0,41.0,102.32,102.32
119,"Long_pz, Long_crep, Diam_int_pz",FP14,Done,Done,4.0,2.0,4.301,7.225,25.0,41.0,102.32,102.32
122,"Long_pz, Long_crep, Diam_int_pz",FP160,Done,Done,5.0,4.0,14.222,4.793,41.0,25.0,103.81,103.81
123,"Long_pz, Long_crep, Diam_int_pz",FP160,Done,Done,4.0,5.0,4.793,14.222,25.0,41.0,103.81,103.81


In [44]:
# When a 'level_0' column is present, it replaces any existing 'index' column:
# drop the old 'index' (if there is one), then rename 'level_0' to 'index'.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [45]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 510, columns : 30, Unique values on col 'ID': 372


interactive(children=(IntSlider(value=10, description='rows', max=510, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [46]:
# Trailing numbers are positions in files_dict[key]; both were stale.
# donnees_forage (profils_sols_donnees_forages) is printed as position 14 further down;
# Analyse_eau is presumably 16 — TODO confirm.
file1= work_dir + 'profils_sols_donnees_forages/donnees_forage_Piezometers.csv' # 14
file2= work_dir + 'vUmons_logsFor/Analyse_eau_Phases1&2_Piezometers.csv' # 16 (presumably)

df1, df2 = create_df([file1, file2])
# Written as one tuple expression, so the cell also echoes (None, None) after the two viewers.
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 12, columns : 18, Unique values on col 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=12, min=3, readout=False), IntSlider(value=12…

Rows : 29, columns : 7, Unique values on col 'ID': 29


interactive(children=(IntSlider(value=3, description='rows', max=29, min=3, readout=False), IntSlider(value=7,…

(None, None)

In [47]:
# Cast the 'ID' column of both frames to str before they are merged on 'ID'.
df1['ID'] = df1['ID'].astype(str)
df2['ID'] = df2['ID'].astype(str)

In [48]:
fix_duplicates(df1, df2, drop_old_id=True)

0 duplicate objects fixed!


In [49]:
# Outer merge on 'ID' (how[1] == 'outer'), this time with explicit tolerances.
# NOTE(review): error_tol_dict presumably maps a tolerance to the columns it applies to
# (1 for X/Y, 0.01 for Z) when deciding whether two values conflict — confirm in utils.io.
mdf, conflict_df = data_merger(df1, df2, how=how[1], on='ID', dist_max=1, error_tol_dict={1:['X', 'Y'], 0.01:['Z']})

#### Merge with object dataset

In [50]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [51]:
# When a 'level_0' column is present, it replaces any existing 'index' column:
# drop the old 'index' (if there is one), then rename 'level_0' to 'index'.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [52]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 530, columns : 32, Unique values on col 'ID': 392


interactive(children=(IntSlider(value=10, description='rows', max=530, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [53]:
# Single file, read directly with pandas (no create_df report for this one).
# The trailing number is the position in files_dict[key]; the original 16 was
# presumably stale and should be 17 — TODO confirm.
file1= work_dir + 'vUmons_logsFor/Analyse_sol_Phases1&2_Piezometers.csv' # 17 (presumably)

df1 = pd.read_csv(file1)
dataframe_viewer(df1, rows=3, un_val='ID', view=t)

Rows : 59, columns : 7, Unique values on col 'ID': 32


interactive(children=(IntSlider(value=3, description='rows', max=59, min=3, readout=False), IntSlider(value=7,…

#### Merge with object dataset

In [54]:
dataset, conflict_df=data_merger(dataset, df1, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [55]:
conflict_df

Unnamed: 0,Check_col,ID,Long_for_x,Long_for_y
18,Long_for,P14M,3.2,2.8


In [56]:
# The conflict in conflict_df comes from merging `df1` INTO `dataset`, so the
# resolved value must be applied to `dataset`. `mdf` still holds an unrelated,
# earlier intermediate merge; validating it would leave `dataset` unresolved.
# valid_dict keeps the 'Long_for_x' value for every conflicting row.
# NOTE(review): assumes data_validation writes the chosen values into overall_data — confirm in utils.io.
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',
                valid_dict={'Long_for_x':list(conflict_df.index)})

all conflicts have been fixed!


In [57]:
# When a 'level_0' column is present, it replaces any existing 'index' column:
# drop the old 'index' (if there is one), then rename 'level_0' to 'index'.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

## No position dataset

#### $\color{green}{\textbf{Read and merge}}$

In [58]:
# Look up two entries of the piezometer file list by position and print their
# paths relative to work_dir. The next cell assigns file1/file2 again.
a, b = 14, 12
file1, file2 = files_dict[key][a], files_dict[key][b]
print(file1.replace(work_dir, ""), '||', file2.replace(work_dir, ""))

profils_sols_donnees_forages/donnees_forage_Piezometers.csv || donnees_terrain_2019/Log_Piezometers.csv


In [59]:
# First pair of files without coordinates. Trailing numbers are positions in
# files_dict[key]; the original 14 and 12 were stale (the printout just above shows
# other files at those positions). Presumably 15 and 13 — TODO confirm.
file1= work_dir + 'profils_sols_donnees_forages/piezometrie_Piezometers.csv' # 15 (presumably)
file2= work_dir + 'profils_sols_donnees_forages/Equipement_Piezometers.csv' # 13 (presumably)

df1, df2 = create_df([file1, file2])
# Written as one tuple expression, so the cell also echoes (None, None) after the two viewers.
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> No coordinates
Rows : 11, columns : 8, Unique values on col 'ID': 11


interactive(children=(IntSlider(value=3, description='rows', max=11, min=3, readout=False), IntSlider(value=8,…

Rows : 12, columns : 5, Unique values on col 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=12, min=3, readout=False), IntSlider(value=5,…

(None, None)

In [60]:
# Cast the 'ID' column of both frames to str before they are merged on 'ID'.
df1['ID'] = df1['ID'].astype(str)
df2['ID'] = df2['ID'].astype(str)

In [61]:
mdf, conflict_df = data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

In [62]:
no_pos_dataset = mdf.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [63]:
# Trailing numbers are positions in files_dict[key]; both were stale.
# Log_Piezometers is printed as position 12 further up; Equipement is presumably 11 — TODO confirm.
file1= work_dir + 'donnees_terrain_2019/Equipement_Piezometers.csv' # 11 (presumably)
file2= work_dir + 'donnees_terrain_2019/Log_Piezometers.csv' # 12

df1, df2 = create_df([file1, file2])
# Written as one tuple expression, so the cell also echoes (None, None) after the two viewers.
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> No coordinates
Rows : 9, columns : 5, Unique values on col 'ID': 9


interactive(children=(IntSlider(value=3, description='rows', max=9, min=3, readout=False), IntSlider(value=5, …

Rows : 10, columns : 9, Unique values on col 'ID': 3


interactive(children=(IntSlider(value=3, description='rows', max=10, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [64]:
# Cast the 'ID' column of both frames to str before they are merged on 'ID'.
df1['ID'] = df1['ID'].astype(str)
df2['ID'] = df2['ID'].astype(str)

In [65]:
mdf, conflict_df = data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [66]:
no_pos_dataset, conflict_df=data_merger(no_pos_dataset, mdf, how=how[1], on='ID', dist_max=1)

#### $\color{green}{\textbf{Read and merge}}$

In [67]:
# Single file, read directly with pandas (no create_df report for this one).
# The trailing number is the position in files_dict[key]; it was stale:
# in the listing, Profils_sol is at position 7, not 6.
file1= work_dir + 'database_Memoris3/Profils_sol_Piezometers.csv' # 7

df1 = pd.read_csv(file1)
dataframe_viewer(df1, rows=3, un_val='ID', view=t)

Rows : 111, columns : 6, Unique values on col 'ID': 111


interactive(children=(IntSlider(value=3, description='rows', max=111, min=3, readout=False), IntSlider(value=6…

#### Merge with object dataset

In [68]:
no_pos_dataset, conflict_df=data_merger(no_pos_dataset, df1, how=how[1], on='ID', dist_max=1)

## final merging

In [69]:
# Normalise the IDs: a leading 'P' becomes 'F' (regex '^P').
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `dataset.ID`. The in-place form mutates an
# intermediate Series (chained assignment): pandas warns about it and, with
# Copy-on-Write enabled, it no longer updates `dataset` at all.
dataset['ID'] = dataset['ID'].replace('^P', 'F', regex=True)

In [70]:
# Remove the bookkeeping columns 'index' and 'Origin_ID' when present.
# One drop with errors='ignore' replaces the loop that dropped columns while
# iterating over them. That loop had no presence check (unlike the same
# clean-up before saving) and raised KeyError if a label appeared twice.
dataset.drop(columns=['index', 'Origin_ID'], errors='ignore', inplace=True)

In [71]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
Rows : 564 ; Columns : 31 ; Unique on 'ID' : 378 ; 


In [72]:
dataframe_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 564, columns : 31, Unique values on col 'ID': 378


interactive(children=(IntSlider(value=3, description='rows', max=564, min=3, readout=False), IntSlider(value=1…

In [73]:
dataframe_viewer(no_pos_dataset, rows=3, un_val='ID', view=t)

Rows : 142, columns : 20, Unique values on col 'ID': 135


interactive(children=(IntSlider(value=3, description='rows', max=142, min=3, readout=False), IntSlider(value=1…

In [74]:
# Normalise the IDs: a leading 'P' becomes 'F' (regex '^P').
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on `no_pos_dataset.ID`. The in-place form mutates an
# intermediate Series (chained assignment): pandas warns about it and, with
# Copy-on-Write enabled, it no longer updates `no_pos_dataset` at all.
no_pos_dataset['ID'] = no_pos_dataset['ID'].replace('^P', 'F', regex=True)

In [75]:
no_pos_dataset, check1 = data_filter(no_pos_dataset, position=False, id_col='ID', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
Rows : 142 ; Columns : 20 ; Unique on 'ID' : 135 ; 


#### Merge with object dataset

In [76]:
dataset, conflict_df=data_merger(dataset, no_pos_dataset, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [77]:
conflict_df

Unnamed: 0,Check_col,ID,Diam_ext_pz_x,Diam_ext_pz_y,Long_pz_x,Long_pz_y,Diam_for_x,Diam_for_y
3,Diam_ext_pz,F2M,53.0,45.0,6.0,,75.0,75.0
4,Long_pz,F3M,,45.0,2.76,2.98,,75.0
5,Long_pz,F3M,,45.0,2.76,2.98,,75.0
6,"Diam_ext_pz, Long_pz",F3M,53.0,45.0,3.3,2.98,75.0,75.0
9,Diam_ext_pz,F4M,53.0,45.0,4.0,,75.0,75.0
13,Diam_ext_pz,F5M,53.0,45.0,6.0,,75.0,75.0
16,Diam_ext_pz,F6M,53.0,45.0,3.6,,75.0,75.0
18,Diam_ext_pz,F11M,53.0,45.0,3.0,,75.0,75.0
21,Diam_ext_pz,F12M,53.0,45.0,3.5,,75.0,75.0
22,Long_pz,F13M,,45.0,3.36,4.04,,75.0


In [78]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
Rows : 633 ; Columns : 41 ; Unique on 'ID' : 440 ; 


In [79]:
check1

Unnamed: 0,Check_col,ID,Type_ech,Methode,Resp_chantier,Refus,Method,Keyword,Sous_zone,Description,Type_refus,Societe,Zone,Date_ouv,ID_date,Type_equip,ID_2,Nappe,Long_for,Type
1,"Type_ech, Methode, Resp_chantier, Refus, Metho...",F2M,,,,,,,,,,,,,,Crepine,P2M,Remblais + limons,6.0,Piezo
2,"Type_ech, Methode, Resp_chantier, Refus, Metho...",F2M,,,,,,,,,,,,,,Crepine,P2M,Remblais + limons,6.0,Piezo
3,"Type_ech, Methode, Resp_chantier, Refus, Metho...",F2M,,,,,Dual tube,,,,,ECOPLANNING sprl,,2017-02-22,2017-F2M,,P2M,Remblais + limons,6.0,Piezo
5,"Type_ech, Methode, Resp_chantier, Method, Keyw...",F3M,,,,x,,,,,,,,,,Crepine,P3M,Remblais,3.3,Piezo
6,"Type_ech, Methode, Resp_chantier, Method, Keyw...",F3M,,,,x,Dual tube,,,,Laitier,ECOPLANNING sprl,,2017-02-22,2017-F3M,,P3M,Remblais,3.3,Piezo
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
561,"Type_ech, Methode, Resp_chantier, Method, Keyw...",F1aM,,,,x,,,,,,,,,,,,,2.4,Piezo
575,"Type_ech, Methode, Resp_chantier, Refus, Metho...",F52,,,,,,Remblais déchets de construction,,Remblais de déchets de construction - présence...,,,Mini-Pilote,2019-12-18,,,,,,Piezo
576,"Type_ech, Methode, Resp_chantier, Refus, Metho...",F52,,,,,,Remblais déchets sidérurgiques,,Remblais de couleur noir - Présence d'éclat ch...,,,Mini-Pilote,2019-12-18,,,,,,Piezo
577,"Type_ech, Methode, Resp_chantier, Refus, Metho...",F52,,,,,,Remblais déchets sidérurgiques,,Remblais grossiers saturés avec scories - char...,,,Mini-Pilote,2019-12-18,,,,,,Piezo


In [80]:
dataframe_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 633, columns : 41, Unique values on col 'ID': 440


interactive(children=(IntSlider(value=3, description='rows', max=633, min=3, readout=False), IntSlider(value=1…

####  $\color{red}{\textbf{Save final object dataset}}$

In [81]:
# Last clean-up before saving: remove 'index' and 'Origin_ID' if they are
# still present; absent labels are skipped.
dataset.drop(columns=['index', 'Origin_ID'], errors='ignore', inplace=True)

In [82]:
# Create the output folder when it is missing. exist_ok=True replaces the
# os.path.exists() check followed by os.makedirs(), which could fail if the
# folder appeared between the check and the creation.
os.makedirs(save_dir, exist_ok=True)

# Write the merged piezometer table, without the DataFrame index, to save_dir + save_file.
dataset.to_csv(save_dir + save_file, index=False)

## Querying