# Data Gathering

In [1]:
from utils.io import dataframe_viewer, files_search, data_merger, data_validation, data_overview, \
data_filter, fix_duplicates

import re, os
import numpy as np
import pandas as pd
import datetime as dtm
from definitions import ROOT_DIR

In [2]:
def create_df(files, verbose=True): # find another name for this function
    """
    Load every CSV file of `files` into its own DataFrame.

    When `verbose` is true, one line is printed per file telling whether the
    frame has an 'X' column (reported as "Coordinates") or not.

    files: iterable of CSV file paths (comma-delimited)
    verbose: print the per-file coordinate report
    returns: list of DataFrames, in the same order as `files`
    """
    frames = []
    for rank, path in enumerate(files, start=1):
        frame = pd.read_csv(path, delimiter=',')
        frames.append(frame)

        if not verbose:
            continue

        # Only the presence of an 'X' column is tested; 'Y'/'Z' are not checked.
        label = ' --> Coordinates' if 'X' in frame.columns else ' --> No coordinates'
        print(f"df{rank} : {label}")

    return frames

## Reading files

In [3]:
# work_dir: folder the source CSV files are read from; save_dir: folder the merged CSV is written to.
# Both keep their trailing '/' because later cells append file names by plain concatenation.
work_dir = f"{ROOT_DIR}/CF_data/Result_traitem/"
save_dir = f"{ROOT_DIR}/CF_data/Donnees_fusionnees/"

In [4]:
# Dictionary skeleton handed to files_search; the key names select the file categories (keep them exact).
files_dict = dict.fromkeys(
    ['Borehole', 'Piezometer', 'Piezair', 'Trench', 'Litho', 'Equipm',
     'Measure', 'Sample', 'Analysis', 'Facility'],
    0,
)

In [5]:
# The return value is not captured: later cells read lists of paths from files_dict[key],
# so files_search evidently fills files_dict in place. NOTE(review): exact meaning of
# `prefix` and `skip` is defined in utils.io — confirm there.
files_search(work_dir, files_dict, prefix='', skip='source')

Borehole  	:  7
Piezometer  	:  17
Piezair  	:  2
Trench  	:  1
Litho  	:  7
Equipm  	:  3
Measure  	:  6
Sample  	:  27
Analysis  	:  21
Facility  	:  4


In [6]:
# Join strategies, picked by position in the data_merger calls below (how[1] is 'outer').
how=['inner', 'outer', 'left', 'right']

In [7]:
# One-letter aliases for the booleans; the viewer calls below pass `view=t`.
f = False
t = True

# ================== PROCESSING ===================== 

# Samples

In [8]:
# Settings for the 'Sample' category processed in this notebook.
key = 'Sample'
save_file = 'Merged_Samples.csv'
coi = ['ID', 'ID_date', 'X', 'Y', 'Z', 'Type', 'Long_for', 'Diam_for', 'Refus']  # columns of interest
dataset = pd.DataFrame()  # accumulator that receives the merged object data
print(f"{len(files_dict[key])} files")

27 files


In [9]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Container_phyto/Param_agro_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Container_phyto/Param_agro_Samples-water.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Container_phyto/Result_SOL_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Memoris_seafile/Result_SOL_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Memoris_seafile/Result_eau_Samples-water.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Result_eau_Samples-water.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Result_sol_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Result_SOL_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Result_eau_Samples-water.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Siterem_Ext_Pilote/Inorganic_maj

In [10]:
data_overview(files_dict[key])

Same files:[(1, 9), (18, 20)]
Files with coordinates:[4, 5, 8, 25, 26]
Files without coordinates:[0, 1, 2, 3, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]


## With coordinates

#### $\color{green}{\textbf{Read and merge}}$

In [11]:
# The trailing numbers are the positions of the files in files_dict[key].
file1 = work_dir + 'Memoris_seafile/Result_eau_Samples-water.csv'  # 4
file2 = work_dir + 'Phase_1_Memoris/Result_eau_Samples-water.csv'  # 5

df1, df2 = create_df([file1, file2])
# One viewer call per statement: joining them with a comma builds a tuple
# that the cell then echoes as a meaningless `(None, None)`.
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)
dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> Coordinates
df2 :  --> Coordinates
Rows : 45, columns : 10, Unique values on col 'ID_ech': 45


interactive(children=(IntSlider(value=3, description='rows', max=45, min=3, readout=False), IntSlider(value=10…

Rows : 17, columns : 17, Unique values on col 'ID_ech': 17


interactive(children=(IntSlider(value=3, description='rows', max=17, min=3, readout=False), IntSlider(value=12…

(None, None)

In [12]:
fix_duplicates(df1, df2, id_col='ID_ech', drop_old_id=True)

31 duplicate objects fixed!


In [13]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### First object dataset save

In [14]:
dataset = mdf.copy() #saving

In [15]:
dataframe_viewer(dataset, rows=3, un_val='ID_ech', view=t)

Rows : 55, columns : 17, Unique values on col 'ID_ech': 44


interactive(children=(IntSlider(value=3, description='rows', max=55, min=3, readout=False), IntSlider(value=12…

#### $\color{green}{\textbf{Read and merge}}$

In [16]:
file1 = work_dir + 'vUmons_logsFor/Analyse_eau_Phases1&2_Samples-water.csv'  # 25
file2 = work_dir + 'vUmons_logsFor/Analyse_sol_Phases1&2_Samples-soil.csv'  # 26

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo `(None, None)`.
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)
dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> Coordinates
df2 :  --> Coordinates
Rows : 45, columns : 10, Unique values on col 'ID_ech': 41


interactive(children=(IntSlider(value=3, description='rows', max=45, min=3, readout=False), IntSlider(value=10…

Rows : 59, columns : 13, Unique values on col 'ID_ech': 59


interactive(children=(IntSlider(value=3, description='rows', max=59, min=3, readout=False), IntSlider(value=12…

(None, None)

In [17]:
fix_duplicates(df1, df2, id_col='ID_ech', crit_2nd_col='Type_ech', drop_old_id=True)

0 duplicate objects fixed!


In [18]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [19]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [20]:
# If the dataset carries a 'level_0' column, make it the 'index' column,
# removing any existing 'index' column first.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [21]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 146, columns : 25, Unique values on col 'ID_ech': 100


interactive(children=(IntSlider(value=10, description='rows', max=146, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [22]:
file1= work_dir + 'Phase_2_Memoris/Result_eau_Samples-water.csv' # 8

# Single file: read directly with pandas instead of going through create_df,
# so no coordinate report line is printed for it.
df1 = pd.read_csv(file1)
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)

Rows : 11, columns : 10, Unique values on col 'ID_ech': 11


interactive(children=(IntSlider(value=3, description='rows', max=11, min=3, readout=False), IntSlider(value=10…

#### Merge with object dataset

In [23]:
dataset, conflict_df=data_merger(dataset, df1, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [24]:
# If the dataset carries a 'level_0' column, make it the 'index' column,
# removing any existing 'index' column first.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [25]:
# Remove the bookkeeping columns in one call. The previous loop dropped columns
# in place while iterating over the column index, so a duplicated label was
# visited twice and the second drop raised KeyError. errors='ignore' keeps the
# "drop only if present" behaviour.
dataset.drop(columns=['index', 'Origin_ID'], inplace=True, errors='ignore')

In [26]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
same objects at indices:[57, 0, 5, 6, 60, 3, 12, 13, 67, 10, 146, 148, 22, 38, 52, 50, 141, 142, 145, 147, 140], will be dropped if drop is set True!
Rows : 128 ; Columns : 25 ; Unique on 'ID_ech' : 94 ; 


In [27]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 128, columns : 25, Unique values on col 'ID_ech': 94


interactive(children=(IntSlider(value=10, description='rows', max=128, min=10, readout=False), IntSlider(value…

In [28]:
# Snapshot of the merged samples that have coordinates.
# NOTE(review): dataset_1 is not read again anywhere in the visible notebook — confirm it is
# meant to be combined with the later results before the final save.
dataset_1 = dataset.copy()

## Without coordinates

#### $\color{green}{\textbf{Read and merge}}$

In [29]:
file1 = work_dir + 'Container_phyto/Param_agro_Samples-soil.csv'  # 0
file2 = work_dir + 'Container_phyto/Result_SOL_Samples-soil.csv'  # 2

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo `(None, None)`.
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)
dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 5, columns : 21, Unique values on col 'ID_ech': 5


interactive(children=(IntSlider(value=3, description='rows', max=5, min=3, readout=False), IntSlider(value=12,…

Rows : 5, columns : 8, Unique values on col 'ID_ech': 5


interactive(children=(IntSlider(value=3, description='rows', max=5, min=3, readout=False), IntSlider(value=8, …

(None, None)

In [30]:
# Cast the sample identifiers of both frames to strings before merging on 'ID_ech'.
df1['ID_ech'] = df1['ID_ech'].astype(str)
df2['ID_ech'] = df2['ID_ech'].astype(str)

In [31]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1, 
                             error_tol_dict={1:['MS', 'Fract_2', 'Fract_2+']})

Conflict values present. Please resolve this manually !


In [32]:
mdf

Unnamed: 0,ID_ech,Date_prv,index,MO,pH_KCl,chlorures,Fract_arg,Fract_min_2µ,Fract_min_2,COT,...,sulfures_tot,Residu_perte_feu,Tem_pH_mes,Fract_min_50µ,MS,Fract_2,Fract_2+,Ech_base,Ech_top,Type_ech
0,Ech. 1,2017-12-14 00:00:00,0.0,6.3,11.8,34.0,1.0,1.0,57.0,160000.0,...,130.0,93.3,20.7,4.8,89.8,#conflict,#conflict,15,12,Sol
1,Ech. 2,2017-12-14 00:00:00,1.0,11.5,8.2,18.0,1.9,1.0,56.0,400000.0,...,78.0,88.2,20.6,13.0,80.0,47,53,12,9,Sol
2,Ech. 3,2017-12-14 00:00:00,2.0,10.2,11.0,36.0,1.0,1.0,72.0,200000.0,...,86.0,89.4,20.6,10.0,87.9,#conflict,#conflict,9,6,Sol
3,Ech. 4,2017-12-14 00:00:00,3.0,9.0,11.1,36.0,1.0,1.0,65.0,300000.0,...,60.0,90.6,20.4,9.0,90.5,#conflict,#conflict,6,3,Sol
4,Ech. 5,2017-12-14 00:00:00,4.0,8.4,11.4,43.0,1.0,1.0,65.0,210000.0,...,80.0,91.2,20.9,8.1,84.9,#conflict,#conflict,3,0,Sol


#### $\color{blue}{\textbf{Manage conflicts}}$

In [33]:
conflict_df # values are seemingly inverted (Fract)

Unnamed: 0,Check_col,ID_ech,Fract_2_x,Fract_2_y,Fract_2+_x,Fract_2+_y
0,"Fract_2, Fract_2+",Ech. 1,79,33,21,67
2,"Fract_2, Fract_2+",Ech. 3,69,40,31,60
3,"Fract_2, Fract_2+",Ech. 4,70,45,30,55
4,"Fract_2, Fract_2+",Ech. 5,71,29,29,71


In [34]:
mdf = data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', valid_all=True)
                #valid_dict={'Long_for_x':[136,142],},)

all conflicts have been fixed!


  indexer = self._get_setitem_indexer(key)


In [35]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 9, columns : 21, Unique values on col 'ID_ech': 9


interactive(children=(IntSlider(value=9, description='rows', max=9, min=9, readout=False), IntSlider(value=12,…

#### Start a new object dataset (samples without coordinates)

In [36]:
# Replaces (does not merge into) the previous `dataset`; the earlier content only
# survives in the dataset_1 snapshot taken above.
dataset = mdf.copy()

In [37]:
# If the dataset carries a 'level_0' column, make it the 'index' column,
# removing any existing 'index' column first.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [38]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 9, columns : 21, Unique values on col 'ID_ech': 9


interactive(children=(IntSlider(value=9, description='rows', max=9, min=9, readout=False), IntSlider(value=12,…

#### $\color{green}{\textbf{Read and merge}}$

In [39]:
file1 = work_dir + 'Memoris_seafile/Result_SOL_Samples-soil.csv'  # 3
file2 = work_dir + 'Phase_1_Memoris/Result_sol_Samples-soil.csv'  # 6

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo `(None, None)`.
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)
dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 60, columns : 12, Unique values on col 'ID_ech': 60


interactive(children=(IntSlider(value=3, description='rows', max=60, min=3, readout=False), IntSlider(value=12…

Rows : 29, columns : 11, Unique values on col 'ID_ech': 29


interactive(children=(IntSlider(value=3, description='rows', max=29, min=3, readout=False), IntSlider(value=11…

(None, None)

In [40]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


#### Merge with object dataset

In [41]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [42]:
# If the dataset carries a 'level_0' column, make it the 'index' column,
# removing any existing 'index' column first.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [43]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

Rows : 69 ; Columns : 26 ; Unique on 'ID_ech' : 69 ; 


In [44]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 69, columns : 26, Unique values on col 'ID_ech': 69


interactive(children=(IntSlider(value=10, description='rows', max=69, min=10, readout=False), IntSlider(value=…

#### $\color{green}{\textbf{Read and merge}}$

In [45]:
file1 = work_dir + 'Phase_2_Memoris/Result_SOL_Samples-soil.csv'  # 7
file2 = work_dir + 'Siterem_Ext_Pilote/Param_physico_Samples-water.csv'  # 10

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo `(None, None)`.
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)
dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 25, columns : 12, Unique values on col 'ID_ech': 25


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=12…

Rows : 33, columns : 13, Unique values on col 'ID_ech': 6


interactive(children=(IntSlider(value=3, description='rows', max=33, min=3, readout=False), IntSlider(value=12…

(None, None)

In [46]:
# Cast the sample identifiers of both frames to strings before merging on 'ID_ech'.
df1['ID_ech'] = df1['ID_ech'].astype(str)
df2['ID_ech'] = df2['ID_ech'].astype(str)

In [47]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [48]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


In [49]:
# If the dataset carries a 'level_0' column, make it the 'index' column,
# removing any existing 'index' column first.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [50]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 102, columns : 36, Unique values on col 'ID_ech': 75


interactive(children=(IntSlider(value=10, description='rows', max=102, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [51]:
file1 = work_dir + 'Siterem_Ext_Pilote/Result_eau_Samples-water.csv'  # 11
file2 = work_dir + 'Siterem_Pilote/Inorganic_major_Samples-water.csv'  # 12

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo `(None, None)`.
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)
dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 31, columns : 8, Unique values on col 'ID_ech': 6


interactive(children=(IntSlider(value=3, description='rows', max=31, min=3, readout=False), IntSlider(value=8,…

Rows : 51, columns : 6, Unique values on col 'ID_ech': 7


interactive(children=(IntSlider(value=3, description='rows', max=51, min=3, readout=False), IntSlider(value=6,…

(None, None)

In [52]:
# Cast the sample identifiers of both frames to strings before merging on 'ID_ech'.
df1['ID_ech'] = df1['ID_ech'].astype(str)
df2['ID_ech'] = df2['ID_ech'].astype(str)

In [53]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [54]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


In [55]:
# If the dataset carries a 'level_0' column, make it the 'index' column,
# removing any existing 'index' column first.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [56]:
dataframe_viewer(conflict_df, rows=5,)

Rows : 5, columns : 8


interactive(children=(IntSlider(value=5, description='rows', max=5, min=5, readout=False), IntSlider(value=8, …

In [57]:
dataset = data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', valid_all=True)

all conflicts have been fixed!


  indexer = self._get_setitem_indexer(key)


In [58]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 552, columns : 37, Unique values on col 'ID_ech': 88


interactive(children=(IntSlider(value=10, description='rows', max=552, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [59]:
file1 = work_dir + 'Siterem_Result_Sol/Result_sol_ExtP_Samples-soil.csv'  # 15
file2 = work_dir + 'Siterem_Result_Sol/SOL_T1_Pilote_Samples-soil.csv'  # 16

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo `(None, None)`.
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)
dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 43, columns : 19, Unique values on col 'ID_ech': 43


interactive(children=(IntSlider(value=3, description='rows', max=43, min=3, readout=False), IntSlider(value=12…

Rows : 15, columns : 11, Unique values on col 'ID_ech': 15


interactive(children=(IntSlider(value=3, description='rows', max=15, min=3, readout=False), IntSlider(value=11…

(None, None)

In [60]:
# Cast the sample identifiers of both frames to strings before merging on 'ID_ech'.
# (Both files already expose an 'ID_ech' column, so no 'ID' -> 'ID_ech' rename is applied here.)
df1['ID_ech'] = df1['ID_ech'].astype(str)
df2['ID_ech'] = df2['ID_ech'].astype(str)

In [61]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [62]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 58, columns : 20, Unique values on col 'ID_ech': 58


interactive(children=(IntSlider(value=10, description='rows', max=58, min=10, readout=False), IntSlider(value=…

#### Merge with object dataset

In [63]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


In [64]:
# If the dataset carries a 'level_0' column, make it the 'index' column,
# removing any existing 'index' column first.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [65]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression=None,
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
same objects at indices:[71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 69, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 104, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 168, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 27

#### $\color{green}{\textbf{Read and merge}}$

In [66]:
file1 = work_dir + 'database_Memoris3/Donnees_piezos_Samples-water.csv'  # 17
file2 = work_dir + 'database_Memoris3/Result_eau_Samples-water.csv'  # 18

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo `(None, None)`.
# These two files are inspected on their 'ID' column.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 130, columns : 7, Unique values on col 'ID': 130


interactive(children=(IntSlider(value=3, description='rows', max=130, min=3, readout=False), IntSlider(value=7…

Rows : 130, columns : 9, Unique values on col 'ID': 130


interactive(children=(IntSlider(value=3, description='rows', max=130, min=3, readout=False), IntSlider(value=9…

(None, None)

In [67]:
# Rename the 'ID' column of both frames to 'ID_ech' (the merge key used below),
# then cast the identifiers to strings.
df1.rename(columns={'ID': 'ID_ech'}, inplace=True)
df2.rename(columns={'ID': 'ID_ech'}, inplace=True)
df1['ID_ech'] = df1['ID_ech'].astype(str)
df2['ID_ech'] = df2['ID_ech'].astype(str)

In [68]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [69]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 130, columns : 10, Unique values on col 'ID_ech': 130


interactive(children=(IntSlider(value=10, description='rows', max=130, min=10, readout=False), IntSlider(value…

#### Merge with object dataset

In [70]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [71]:
# If the dataset carries a 'level_0' column, make it the 'index' column,
# removing any existing 'index' column first.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [72]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression=None,
                              dist_max=1, drop=True, drop_old_id=True)

Rows : 268 ; Columns : 46 ; Unique on 'ID_ech' : 268 ; 


In [73]:
# Second snapshot of the accumulated dataset.
# NOTE(review): dataset_2 is not read again anywhere in the visible notebook — confirm it is
# meant to be combined with the other results before the final save.
dataset_2 = dataset.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [74]:
file1 = work_dir + 'database_Memoris3/Result_sol_Samples-soil.csv'  # 19
file2 = work_dir + 'donnees_terrain_2019/Echantillon_Samples-soil.csv'  # 21

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo `(None, None)`.
# These two files are inspected on their 'ID' column.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 1351, columns : 25, Unique values on col 'ID': 259


interactive(children=(IntSlider(value=3, description='rows', max=1351, min=3, readout=False), IntSlider(value=…

Rows : 70, columns : 9, Unique values on col 'ID': 22


interactive(children=(IntSlider(value=3, description='rows', max=70, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [75]:
# Cast the sample identifiers of both frames to strings before merging on 'ID_ech'.
df1['ID_ech'] = df1['ID_ech'].astype(str)
df2['ID_ech'] = df2['ID_ech'].astype(str)

In [76]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [77]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 1421, columns : 27, Unique values on col 'ID_ech': 1373


interactive(children=(IntSlider(value=10, description='rows', max=1421, min=10, readout=False), IntSlider(valu…

#### Start a new object dataset

In [78]:
# Replaces (does not merge into) the previous `dataset`; the earlier content only
# survives in the dataset_2 snapshot taken above.
dataset = mdf.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [79]:
file1 = work_dir + 'donnees_terrain_2019/Result_Sol_Samples-soil.csv'  # 22
file2 = work_dir + 'profils_sols_donnees_forages/Echant-organo_Samples-soil.csv'  # 24

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo `(None, None)`.
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)
dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 55, columns : 19, Unique values on col 'ID_ech': 55


interactive(children=(IntSlider(value=3, description='rows', max=55, min=3, readout=False), IntSlider(value=12…

Rows : 32, columns : 9, Unique values on col 'ID_ech': 29


interactive(children=(IntSlider(value=3, description='rows', max=32, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [80]:
# Cast the sample identifiers of both frames to strings before merging on 'ID_ech'.
df1['ID_ech'] = df1['ID_ech'].astype(str)
df2['ID_ech'] = df2['ID_ech'].astype(str)
# Strip the accent from df2's lowercase 'intensité' column name.
# NOTE(review): the accumulated dataset lists capitalised 'Intensité' and 'Intensite'
# side by side; this rename only matches the lowercase spelling and is silently a
# no-op otherwise — confirm the column name in this file.
df2.rename(columns={'intensité': 'intensite'}, inplace=True)

In [81]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [82]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 87, columns : 24, Unique values on col 'ID_ech': 84


interactive(children=(IntSlider(value=10, description='rows', max=87, min=10, readout=False), IntSlider(value=…

In [83]:
dataset.columns

Index(['ID_ech', 'T° pH H2O', 'Fract_2µ', 'CE', 'pH CaCl2', 'Fract_63µ',
       'Intensité', 'Fract_2000µ', 'MS', 'pH H2O', 'ID', 'Argile ',
       'T° pH CaCl2', 'Intensite', 'Fract_16µ', 'T° CE', 'T° pH KCl', 'MO',
       'Fract_45µ', 'pH KCl', 'Pol_base', 'Polluant', 'Ech_base', 'Ech_top',
       'ID', 'Type_ech', 'Pol_top'],
      dtype='object')

In [84]:
# NOTE(review): `pause` is not defined anywhere in the visible notebook, so this cell
# raises NameError (see the recorded output) — presumably a deliberate way to halt
# "Run All" here; confirm and replace with an explicit, documented stop.
pause

NameError: name 'pause' is not defined

#### Merge with object dataset

In [None]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [None]:
# If the dataset carries a 'level_0' column, make it the 'index' column,
# removing any existing 'index' column first.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [None]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression=None,
                              dist_max=1, drop=True, drop_old_id=True)

In [None]:
# NOTE(review): `pause` is not defined anywhere in the visible notebook, so running this
# cell raises NameError — presumably a deliberate stop marker; confirm.
pause

In [None]:
# If the dataset carries a 'level_0' column, make it the 'index' column,
# removing any existing 'index' column first.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [None]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression=None,
                              dist_max=1, drop=True, drop_old_id=True)

In [None]:
# Pick the two files by their position in files_dict[key] and show their paths
# relative to work_dir.
a, b = 22, 24
file1 = files_dict[key][a]
file2 = files_dict[key][b]
print(file1.replace(work_dir, ""), '||', file2.replace(work_dir, ""), '\n')

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo a tuple of return values.
dataframe_viewer(df1, rows=3)
dataframe_viewer(df2, rows=3)

In [None]:
# NOTE(review): `pause` is not defined anywhere in the visible notebook, so running this
# cell raises NameError — presumably a deliberate stop marker; confirm.
pause

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
dataframe_viewer(conflict_df, rows=5,)

In [None]:
dataframe_viewer(dataset, rows=10, un_val=['ID_ech', 'Date_prv'], view=t)

### TEST ----------------------------------------------------------------

In [None]:
# Pick the two files by their position in files_dict[key] and show their paths
# relative to work_dir.
a, b = 17, 18
file1 = files_dict[key][a]
file2 = files_dict[key][b]
print(file1.replace(work_dir, ""), '||', file2.replace(work_dir, ""), '\n')

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo a tuple of return values.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3)

In [None]:
# NOTE(review): `pause` is not defined anywhere in the visible notebook, so running this
# cell raises NameError — presumably a deliberate stop marker; confirm.
pause

In [None]:
# Pick the two files by their position in files_dict[key] and show their paths
# relative to work_dir.
a, b = 1, 13
file1 = files_dict[key][a]
file2 = files_dict[key][b]
print(file1.replace(work_dir, ""), '||', file2.replace(work_dir, ""), '\n')

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo a tuple of return values.
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)
dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

In [None]:
dataframe_viewer(dataset, rows=10, un_val=['ID_ech', 'Date_prv'], view=t)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
conflict_df

In [None]:
# Inspect a subset of the dataset, selected either by identifier pattern or by an X/Y window.
x_min, x_max = 152883, 152885
y_min, y_max = 122605, 122608
id_col = 'ID_ech'
bh_name = 'F1.'  # regex matched against id_col; set to None to filter on the X/Y window instead

# The two filters are alternatives. Previously the X/Y query always ran and its
# result was then overwritten whenever bh_name was set — wasted work, and an error
# on datasets that have no X/Y columns. The displayed result is unchanged.
if bh_name is None:
    q = dataset.query(f'X >= {x_min} and X <= {x_max} and Y >= {y_min} and Y <= {y_max}')
else:
    q = dataset.query(f'{id_col}.str.contains("{bh_name}")', engine='python')
dataframe_viewer(q, rows=10, cols=15, un_val=id_col, view=t)

###  -----------------------------------------------------------------

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1 = work_dir + 'Container_phyto/Param_agro_Samples-water.csv'  # 1
file2 = work_dir + 'Siterem_Pilote/Param_physico_Samples-water.csv'  # 13

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo a tuple of return values.
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)
dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

In [None]:
# Cast the sample identifiers of both frames to strings before merging on 'ID_ech'.
df1['ID_ech'] = df1['ID_ech'].astype(str)
df2['ID_ech'] = df2['ID_ech'].astype(str)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [None]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

#### Merge with object dataset

In [None]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [None]:
# If the dataset carries a 'level_0' column, make it the 'index' column,
# removing any existing 'index' column first.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [None]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression=None,
                              dist_max=1, drop=True, drop_old_id=True)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
dataframe_viewer(conflict_df, rows=5,)

In [None]:
dataframe_viewer(dataset, rows=10, un_val=['ID_ech','Date_prv'], view=t)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
# NOTE(review): file1/file2 are NOT assigned in this cell (the assignments below are
# commented out), so it re-reads whatever an earlier cell left in those variables.
# Files 3 and 6 are also read by an earlier "Read and merge" cell — confirm whether
# this section is a leftover duplicate.
#file1= work_dir + 'Memoris_seafile/Result_SOL_Samples-soil.csv' # 3
#file2= work_dir + 'Phase_1_Memoris/Result_sol_Samples-soil.csv' # 6

df1, df2 = create_df([file1, file2])
# One viewer call per statement so the cell does not echo a tuple of return values.
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)
dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

In [None]:
# Cast the sample identifiers of both frames to strings before merging on 'ID_ech'.
df1['ID_ech'] = df1['ID_ech'].astype(str)
df2['ID_ech'] = df2['ID_ech'].astype(str)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [None]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [None]:
# If the dataset carries a 'level_0' column, make it the 'index' column,
# removing any existing 'index' column first.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [None]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
conflict_df

####  $\color{red}{\textbf{Save final object dataset}}$

In [None]:
# Remove the helper 'index' column before saving; nothing happens when it is absent.
dataset.drop(columns='index', inplace=True, errors='ignore')

In [None]:
# Create the output folder if needed. exist_ok=True replaces the separate
# exists() check, which could fail if the folder appeared between check and creation.
os.makedirs(save_dir, exist_ok=True)

# NOTE(review): only the current `dataset` is written. The dataset_1 and dataset_2
# snapshots taken earlier are never read again in the visible notebook — confirm
# whether they should be combined with `dataset` before this save.
dataset.to_csv(os.path.join(save_dir, save_file), index=False)