# Data Gathering

In [1]:
from utils.io import dataframe_viewer, files_search, data_merger, data_validation, data_overview, \
data_filter, fix_duplicates

import re, os
import numpy as np
import pandas as pd
import datetime as dtm
from definitions import ROOT_DIR

In [2]:
def create_df(files, verbose=True):  # TODO: find a more descriptive name for this function
    """
    Load CSV files into DataFrames and optionally report which ones hold coordinates.

    Parameters
    ----------
    files : iterable of str
        Paths of the comma-separated files to read, in order.
    verbose : bool, default True
        When true, print one line per file (``df1``, ``df2``, ...) stating
        whether the loaded frame has an ``'X'`` column.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per input file, in the same order as ``files``.
    """
    frames = []
    for rank, path in enumerate(files, start=1):
        frame = pd.read_csv(path, delimiter=',')
        frames.append(frame)

        if not verbose:
            continue

        # The presence of an 'X' column is the only coordinate check performed.
        status = ' --> Coordinates' if 'X' in frame.columns else ' --> No coordinates'
        print(f"df{rank} : {status}")

    return frames

## Reading files

In [3]:
# Input directory (searched by files_search) and output directory for the merged CSV.
# The trailing slashes are required: the cells below build file paths by plain
# string concatenation (e.g. work_dir + 'Memoris_seafile/...', save_dir + save_file).
work_dir = ROOT_DIR+'/CF_data/Result_traitem/'
save_dir = ROOT_DIR+'/CF_data/Donnees_fusionnees/'

In [4]:
# Dictionary of object categories handed to files_search in the next cell.
# The 0 values are only placeholders: after files_search, files_dict[key] is used
# as a list of file paths.
# NOTE(review): the key names presumably have to match a pattern in the file names
# (original remark: "Keynames !!!") — confirm against utils.io.files_search.
files_dict={'Borehole':0,'Piezometer':0,'Piezair':0,'Trench':0,'Litho':0,'Equipm':0,
        'Measure':0,'Sample':0,'Analysis':0,'Facility':0}

In [5]:
files_search(work_dir, files_dict, prefix='', skip='source')

Borehole  	:  7
Piezometer  	:  17
Piezair  	:  2
Trench  	:  1
Litho  	:  7
Equipm  	:  3
Measure  	:  6
Sample  	:  27
Analysis  	:  21
Facility  	:  4


In [6]:
# Join strategies selectable by position and passed as data_merger(..., how=how[i]);
# the merges of the Samples section below all use how[1], i.e. 'outer'.
how=['inner', 'outer', 'left', 'right']

In [7]:
# Short boolean aliases; `t` is passed as the `view=` argument of dataframe_viewer
# in the cells below.
f = False
t = True

# ================== PROCESSING ===================== 

# Samples

In [8]:
# Settings for the 'Sample' object category processed in this section.
key = 'Sample'

# Plain string literal: the former f-prefix had no placeholder to interpolate.
save_file = 'Merged_Samples.csv'

coi = ['ID', 'ID_date', 'X', 'Y', 'Z', 'Type', 'Long_for', 'Diam_for', 'Refus']  # columns of interest

# Accumulator for the merged object data; it is overwritten after the first merge below.
dataset = pd.DataFrame()

# files_dict[key] holds the list of files found for this category by files_search.
print(len(files_dict[key]), 'files')

27 files


In [9]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Container_phyto/Param_agro_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Container_phyto/Param_agro_Samples-water.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Container_phyto/Result_SOL_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Memoris_seafile/Result_SOL_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Memoris_seafile/Result_eau_Samples-water.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Result_eau_Samples-water.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Result_sol_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Result_SOL_Samples-soil.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Result_eau_Samples-water.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Siterem_Ext_Pilote/Inorganic_maj

In [10]:
data_overview(files_dict[key])

Same files:[(1, 9), (18, 20)]
Files with coordinates:[4, 5, 8, 25, 26]
Files without coordinates:[0, 1, 2, 3, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24]


## with coordinates

#### $\color{green}{\textbf{Read and merge}}$

In [11]:
file1= work_dir + 'Memoris_seafile/Result_eau_Samples-water.csv' # 4
file2= work_dir + 'Phase_1_Memoris/Result_eau_Samples-water.csv' # 5

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> Coordinates
df2 :  --> Coordinates
Rows : 45, columns : 10, Unique values on col 'ID_ech': 45


interactive(children=(IntSlider(value=3, description='rows', max=45, min=3, readout=False), IntSlider(value=10…

Rows : 17, columns : 17, Unique values on col 'ID_ech': 17


interactive(children=(IntSlider(value=3, description='rows', max=17, min=3, readout=False), IntSlider(value=12…

(None, None)

In [12]:
fix_duplicates(df1, df2, id_col='ID_ech', drop_old_id=True)

31 duplicate objects fixed!


In [13]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### First object dataset save

In [14]:
dataset = mdf.copy() #saving

In [15]:
dataframe_viewer(dataset, rows=3, un_val='ID_ech', view=t)

Rows : 55, columns : 17, Unique values on col 'ID_ech': 44


interactive(children=(IntSlider(value=3, description='rows', max=55, min=3, readout=False), IntSlider(value=12…

#### $\color{green}{\textbf{Read and merge}}$

In [16]:
file1= work_dir + 'vUmons_logsFor/Analyse_eau_Phases1&2_Samples-water.csv' # 25
file2= work_dir + 'vUmons_logsFor/Analyse_sol_Phases1&2_Samples-soil.csv' # 26


df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> Coordinates
df2 :  --> Coordinates
Rows : 45, columns : 10, Unique values on col 'ID_ech': 41


interactive(children=(IntSlider(value=3, description='rows', max=45, min=3, readout=False), IntSlider(value=10…

Rows : 59, columns : 13, Unique values on col 'ID_ech': 59


interactive(children=(IntSlider(value=3, description='rows', max=59, min=3, readout=False), IntSlider(value=12…

(None, None)

In [17]:
fix_duplicates(df1, df2, id_col='ID_ech', crit_2nd_col='Type_ech', drop_old_id=True)

0 duplicate objects fixed!


In [18]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [19]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [20]:
# Tidy the bookkeeping columns after the merge: when a 'level_0' column is present,
# discard any existing 'index' column and let 'level_0' take over the name 'index'.
# NOTE(review): 'level_0' presumably comes from a reset_index() performed inside
# data_merger — confirm in utils.io.
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [21]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 146, columns : 25, Unique values on col 'ID_ech': 100


interactive(children=(IntSlider(value=10, description='rows', max=146, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [22]:
file1= work_dir + 'Phase_2_Memoris/Result_eau_Samples-water.csv' # 8

df1 = pd.read_csv(file1)
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)

Rows : 11, columns : 10, Unique values on col 'ID_ech': 11


interactive(children=(IntSlider(value=3, description='rows', max=11, min=3, readout=False), IntSlider(value=10…

#### Merge with object dataset

In [23]:
dataset, conflict_df=data_merger(dataset, df1, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [24]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [25]:
# Remove the bookkeeping columns 'index' and 'Origin_ID' when they are present.
# A single drop with errors='ignore' replaces the former loop, which dropped
# columns in place while iterating over dataset.columns.
dataset.drop(columns=['index', 'Origin_ID'], errors='ignore', inplace=True)

In [26]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
Rows : 149 ; Columns : 25 ; Unique on 'ID_ech' : 94 ; 


In [27]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 149, columns : 25, Unique values on col 'ID_ech': 94


interactive(children=(IntSlider(value=10, description='rows', max=149, min=10, readout=False), IntSlider(value…

In [28]:
dataset_1 = dataset.copy()

## no coordinates

#### $\color{green}{\textbf{Read and merge}}$

In [29]:
file1= work_dir + 'Container_phyto/Param_agro_Samples-soil.csv' # 0
file2= work_dir + 'Container_phyto/Result_SOL_Samples-soil.csv' # 2

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 5, columns : 21, Unique values on col 'ID_ech': 5


interactive(children=(IntSlider(value=3, description='rows', max=5, min=3, readout=False), IntSlider(value=12,…

Rows : 5, columns : 8, Unique values on col 'ID_ech': 5


interactive(children=(IntSlider(value=3, description='rows', max=5, min=3, readout=False), IntSlider(value=8, …

(None, None)

In [30]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [31]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1, 
                             error_tol_dict={1:['MS', 'Fract_2', 'Fract_2+']})

Conflict values present. Please resolve this manually !


In [32]:
mdf

Unnamed: 0,ID_ech,Date_prv,index,sulfures_tot,pH_KCl,Fract_min_2,Fract_min_2µ,Fract_min_50µ,Tem_pH_mes,Residu_perte_feu,...,chlorures,Fract_arg,azote_Kjeldahl,MO,Type_ech,Fract_2+,MS,Fract_2,Ech_base,Ech_top
0,Ech. 1,2017-12-14 00:00:00,0.0,130.0,11.8,57.0,1.0,4.8,20.7,93.3,...,34.0,1.0,1320.0,6.3,Sol,#conflict,89.8,#conflict,15,12
1,Ech. 2,2017-12-14 00:00:00,1.0,78.0,8.2,56.0,1.0,13.0,20.6,88.2,...,18.0,1.9,3810.0,11.5,Sol,53,80.0,47,12,9
2,Ech. 3,2017-12-14 00:00:00,2.0,86.0,11.0,72.0,1.0,10.0,20.6,89.4,...,36.0,1.0,3040.0,10.2,Sol,#conflict,87.9,#conflict,9,6
3,Ech. 4,2017-12-14 00:00:00,3.0,60.0,11.1,65.0,1.0,9.0,20.4,90.6,...,36.0,1.0,2550.0,9.0,Sol,#conflict,90.5,#conflict,6,3
4,Ech. 5,2017-12-14 00:00:00,4.0,80.0,11.4,65.0,1.0,8.1,20.9,91.2,...,43.0,1.0,2300.0,8.4,Sol,#conflict,84.9,#conflict,3,0


#### $\color{blue}{\textbf{Manage conflicts}}$

In [33]:
conflict_df # values are seemingly inverted (Fract)

Unnamed: 0,Check_col,ID_ech,Fract_2+_x,Fract_2+_y,Fract_2_x,Fract_2_y
0,"Fract_2+, Fract_2",Ech. 1,21,67,79,33
2,"Fract_2+, Fract_2",Ech. 3,31,60,69,40
3,"Fract_2+, Fract_2",Ech. 4,30,55,70,45
4,"Fract_2+, Fract_2",Ech. 5,29,71,71,29


In [34]:
mdf = data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', valid_all=True)
                #valid_dict={'Long_for_x':[136,142],},)

all conflicts have been fixed!


  indexer = self._get_setitem_indexer(key)


In [35]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 9, columns : 21, Unique values on col 'ID_ech': 9


interactive(children=(IntSlider(value=9, description='rows', max=9, min=9, readout=False), IntSlider(value=12,…

#### Merge with object dataset

In [36]:
dataset = mdf.copy()

In [37]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [38]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 9, columns : 21, Unique values on col 'ID_ech': 9


interactive(children=(IntSlider(value=9, description='rows', max=9, min=9, readout=False), IntSlider(value=12,…

#### $\color{green}{\textbf{Read and merge}}$

In [39]:
file1= work_dir + 'Memoris_seafile/Result_SOL_Samples-soil.csv' # 3
file2= work_dir + 'Phase_1_Memoris/Result_sol_Samples-soil.csv' # 6

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 60, columns : 12, Unique values on col 'ID_ech': 60


interactive(children=(IntSlider(value=3, description='rows', max=60, min=3, readout=False), IntSlider(value=12…

Rows : 29, columns : 11, Unique values on col 'ID_ech': 29


interactive(children=(IntSlider(value=3, description='rows', max=29, min=3, readout=False), IntSlider(value=11…

(None, None)

In [40]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


#### Merge with object dataset

In [41]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [42]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [43]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

Rows : 69 ; Columns : 26 ; Unique on 'ID_ech' : 69 ; 


In [44]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 69, columns : 26, Unique values on col 'ID_ech': 69


interactive(children=(IntSlider(value=10, description='rows', max=69, min=10, readout=False), IntSlider(value=…

#### $\color{green}{\textbf{Read and merge}}$

In [45]:
file1= work_dir + 'Phase_2_Memoris/Result_SOL_Samples-soil.csv' # 7
file2= work_dir + 'Siterem_Ext_Pilote/Param_physico_Samples-water.csv' # 10

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 25, columns : 12, Unique values on col 'ID_ech': 25


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=12…

Rows : 33, columns : 13, Unique values on col 'ID_ech': 6


interactive(children=(IntSlider(value=3, description='rows', max=33, min=3, readout=False), IntSlider(value=12…

(None, None)

In [46]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [47]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [48]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


In [49]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [50]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 102, columns : 36, Unique values on col 'ID_ech': 75


interactive(children=(IntSlider(value=10, description='rows', max=102, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [51]:
file1= work_dir + 'Siterem_Ext_Pilote/Result_eau_Samples-water.csv' # 11
file2= work_dir + 'Siterem_Pilote/Inorganic_major_Samples-water.csv' # 12

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 31, columns : 8, Unique values on col 'ID_ech': 6


interactive(children=(IntSlider(value=3, description='rows', max=31, min=3, readout=False), IntSlider(value=8,…

Rows : 51, columns : 6, Unique values on col 'ID_ech': 7


interactive(children=(IntSlider(value=3, description='rows', max=51, min=3, readout=False), IntSlider(value=6,…

(None, None)

In [52]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [53]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [54]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


In [55]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [56]:
dataframe_viewer(conflict_df, rows=5,)

Rows : 5, columns : 8


interactive(children=(IntSlider(value=5, description='rows', max=5, min=5, readout=False), IntSlider(value=8, …

In [57]:
dataset = data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', valid_all=True)

all conflicts have been fixed!


  indexer = self._get_setitem_indexer(key)


In [58]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 552, columns : 37, Unique values on col 'ID_ech': 88


interactive(children=(IntSlider(value=10, description='rows', max=552, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [59]:
file1= work_dir + 'Siterem_Result_Sol/Result_sol_ExtP_Samples-soil.csv' # 15
file2= work_dir + 'Siterem_Result_Sol/SOL_T1_Pilote_Samples-soil.csv' # 16

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 43, columns : 19, Unique values on col 'ID_ech': 43


interactive(children=(IntSlider(value=3, description='rows', max=43, min=3, readout=False), IntSlider(value=12…

Rows : 15, columns : 11, Unique values on col 'ID_ech': 15


interactive(children=(IntSlider(value=3, description='rows', max=15, min=3, readout=False), IntSlider(value=11…

(None, None)

In [60]:
#df2.rename(columns={'ID':'ID_ech'}, inplace=True)
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [61]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [62]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 58, columns : 20, Unique values on col 'ID_ech': 58


interactive(children=(IntSlider(value=10, description='rows', max=58, min=10, readout=False), IntSlider(value=…

#### Merge with object dataset

In [63]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


In [64]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [65]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression=None,
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
Rows : 604 ; Columns : 39 ; Unique on 'ID_ech' : 140 ; 


#### $\color{green}{\textbf{Read and merge}}$

In [66]:
file1= work_dir + 'database_Memoris3/Donnees_piezos_Samples-water.csv' # 17
file2= work_dir + 'database_Memoris3/Result_eau_Samples-water.csv' # 18

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 130, columns : 7, Unique values on col 'ID': 130


interactive(children=(IntSlider(value=3, description='rows', max=130, min=3, readout=False), IntSlider(value=7…

Rows : 130, columns : 9, Unique values on col 'ID': 130


interactive(children=(IntSlider(value=3, description='rows', max=130, min=3, readout=False), IntSlider(value=9…

(None, None)

In [67]:
df1.rename(columns={'ID':'ID_ech'}, inplace=True)
df2.rename(columns={'ID':'ID_ech'}, inplace=True)
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [68]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [69]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 130, columns : 10, Unique values on col 'ID_ech': 130


interactive(children=(IntSlider(value=10, description='rows', max=130, min=10, readout=False), IntSlider(value…

#### Merge with object dataset

In [70]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [71]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [72]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression=None,
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
Rows : 732 ; Columns : 46 ; Unique on 'ID_ech' : 268 ; 


In [73]:
dataset_2 = dataset.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [74]:
file1= work_dir + 'database_Memoris3/Result_sol_Samples-soil.csv' # 19
file2= work_dir + 'donnees_terrain_2019/Echantillon_Samples-soil.csv' # 21

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 1351, columns : 25, Unique values on col 'ID': 259


interactive(children=(IntSlider(value=3, description='rows', max=1351, min=3, readout=False), IntSlider(value=…

Rows : 70, columns : 9, Unique values on col 'ID': 22


interactive(children=(IntSlider(value=3, description='rows', max=70, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [75]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)
df1.rename(columns={'Intensité':'Intensite'}, inplace=True)

In [76]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [77]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 1421, columns : 26, Unique values on col 'ID_ech': 1373


interactive(children=(IntSlider(value=10, description='rows', max=1421, min=10, readout=False), IntSlider(valu…

#### Merge with object dataset

In [78]:
dataset = mdf.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [79]:
file1= work_dir + 'donnees_terrain_2019/Result_Sol_Samples-soil.csv' # 22
file2= work_dir + 'profils_sols_donnees_forages/Echant-organo_Samples-soil.csv' # 24

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 55, columns : 19, Unique values on col 'ID_ech': 55


interactive(children=(IntSlider(value=3, description='rows', max=55, min=3, readout=False), IntSlider(value=12…

Rows : 32, columns : 9, Unique values on col 'ID_ech': 29


interactive(children=(IntSlider(value=3, description='rows', max=32, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [80]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)
df2.rename(columns={'Intensité':'Intensite'}, inplace=True)

In [81]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [82]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 87, columns : 24, Unique values on col 'ID_ech': 84


interactive(children=(IntSlider(value=10, description='rows', max=87, min=10, readout=False), IntSlider(value=…

In [83]:
dataset.drop(columns='ID', inplace=True)
mdf.drop(columns='ID', inplace=True)

#### Merge with object dataset

In [84]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [85]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [86]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression=None,
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
Rows : 1455 ; Columns : 38 ; Unique on 'ID_ech' : 1404 ; 


#### $\color{green}{\textbf{Read and merge}}$

In [87]:
file1= work_dir + 'Siterem_Pilote/Result_eau_Samples-water.csv' # 14
file2= work_dir + 'observ_terrain/Piezometrie_Samples-water.csv' # 23

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 87, columns : 13, Unique values on col 'ID_ech': 10


interactive(children=(IntSlider(value=3, description='rows', max=87, min=3, readout=False), IntSlider(value=12…

Rows : 27, columns : 14, Unique values on col 'ID': 27


interactive(children=(IntSlider(value=3, description='rows', max=27, min=3, readout=False), IntSlider(value=12…

(None, None)

In [88]:
df2.rename(columns={'ID':'ID_ech'}, inplace=True)
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [89]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [90]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 114, columns : 20, Unique values on col 'ID_ech': 37


interactive(children=(IntSlider(value=10, description='rows', max=114, min=10, readout=False), IntSlider(value…

#### Merge with object dataset

In [91]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [92]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [93]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression=None,
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
Rows : 1569 ; Columns : 53 ; Unique on 'ID_ech' : 1441 ; 


In [94]:
dataset_3 = dataset.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [95]:
file1= work_dir + 'Container_phyto/Param_agro_Samples-water.csv' # 1
file2= work_dir + 'Siterem_Pilote/Param_physico_Samples-water.csv' # 13

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 24, columns : 6, Unique values on col 'ID_ech': 6


interactive(children=(IntSlider(value=3, description='rows', max=24, min=3, readout=False), IntSlider(value=6,…

Rows : 95, columns : 13, Unique values on col 'ID_ech': 15


interactive(children=(IntSlider(value=3, description='rows', max=95, min=3, readout=False), IntSlider(value=12…

(None, None)

In [96]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [97]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [98]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 119, columns : 15, Unique values on col 'ID_ech': 21


interactive(children=(IntSlider(value=10, description='rows', max=119, min=10, readout=False), IntSlider(value…

#### Merge with object dataset

In [99]:
dataset = mdf.copy()

In [100]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [101]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression=None,
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
Rows : 119 ; Columns : 15 ; Unique on 'ID_ech' : 21 ; 


## Merging dataset copies

In [102]:
dataframe_viewer(dataset_1, rows=10, un_val=['ID_ech', 'Date_prv'], view=t)

Rows : 149, columns : 25, Unique values on cols: {'ID_ech': 94, 'Date_prv': 11}


interactive(children=(IntSlider(value=10, description='rows', max=149, min=10, readout=False), IntSlider(value…

In [103]:
dataframe_viewer(dataset_2, rows=10, un_val=['ID_ech', 'Date_prv'], view=t)

Rows : 732, columns : 46, Unique values on cols: {'ID_ech': 268, 'Date_prv': 30}


interactive(children=(IntSlider(value=10, description='rows', max=732, min=10, readout=False), IntSlider(value…

In [104]:
mdf_1, conflict_df=data_merger(dataset_1, dataset_2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [105]:
dataset = mdf_1
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [106]:
mdf_2, conflict_df=data_merger(dataset, dataset_3, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [107]:
dataframe_viewer(conflict_df, rows=5,)

Rows : 48, columns : 16


interactive(children=(IntSlider(value=5, description='rows', max=48, min=5, readout=False), IntSlider(value=12…

In [108]:
mdf_2.query(f'index=={list(conflict_df.index)}')

Unnamed: 0,ID_ech,Date_prv,X,Y,Z,level_0,Zone,T,Fract_45µ,Polluant,...,Long_for,Nappe,Niv_eau_pz,Periode,Temp_pH_mes,Fract_min_50µ,pH_H20,Long_pz,Ech_top,Description
70,P12 essai désorption,2017-09-21 00:00:00,152877.82,122573.9,102.23,70,,,,,...,,,,,,,,,,
71,P13,2017-03-13 00:00:00,152824.64,122646.92,102.93,71,,20.3,,,...,,,,,,,,,,
79,P19/1,2017-05-18,152876.876,122569.597,102.129,84,,,,,...,,,,,,,,,0.3,
306,221,2020-06-02 00:00:00,,,,311,,,,,...,,,1.22,T4,,,,4.0,,
323,221,2020-12-01 00:00:00,,,,328,,,,,...,,,1.09,T7,,,,3.34,,
340,221,2020-03-09 00:00:00,,,,345,,,,,...,,,1.22,T2,,,,4.0,,
357,221,2020-08-04 00:00:00,,,,362,,,,,...,,,1.02,T5,,,,3.87,,
374,25,2019-09-03,,,,492,Cokerie de Marchienne,,,,...,,Remblai_All,1.53,#conflict,,,,,,
392,23,2018-01-25,,,,772,,,,,...,,,,Avant chauffe,19.4,,,,,
409,_201_,2019-12-09 00:00:00,,,,949,,,,,...,,,1.38,Avant chauffe,,,,3.86,,


In [109]:
mdf_2.drop(index=conflict_df.index, inplace=True)
mdf_2.reset_index(drop=True, inplace=True)

In [110]:
dataset = mdf_2
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

####  $\color{red}{\textbf{Save final object dataset}}$

In [111]:
dataset = mdf_1
if 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)

In [112]:
dataset = mdf_2
if 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)

In [113]:
# Create the output directory when missing; exist_ok avoids a separate existence check.
os.makedirs(save_dir, exist_ok=True)

# Split the extension properly. str.rstrip('.csv') strips any trailing '.', 'c', 's'
# or 'v' characters rather than the '.csv' suffix, so 'Merged_Samples.csv' used to
# become 'Merged_Sample' and the second file was written as 'Merged_Sample_plus.csv'.
# NOTE(review): update any consumer that still reads the old, misspelled file name.
base_name, extension = os.path.splitext(save_file)

mdf_1.to_csv(save_dir + save_file, index=False)
mdf_2.to_csv(save_dir + base_name + '_plus' + extension, index=False)

## Querying