# Data Gathering

In [1]:
from utils.io import dataframe_viewer, files_search, data_merger, data_validation, data_overview, \
data_filter, fix_duplicates

import re, os
import numpy as np
import pandas as pd
import datetime as dtm
from definitions import ROOT_DIR

In [2]:
def create_df(files, verbose=True):  # TODO: find another name for this function
    """
    Read each CSV file into a DataFrame and optionally report whether it
    contains position information.

    A file is reported as having coordinates when one of its columns is
    named 'X'.

    Parameters
    ----------
    files : iterable of str
        Paths of the comma-delimited CSV files to read.
    verbose : bool, default True
        If True, print one line per file ("df<n> : ..."), numbered from 1 in
        input order, stating whether an 'X' column was found.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per input file, in the same order as ``files``.
    """
    dfs = []
    for i, f in enumerate(files, start=1):
        df = pd.read_csv(f, delimiter=',')
        dfs.append(df)

        if verbose:
            # The leading space in both messages is intentional: it keeps the
            # printed lines identical to the previous output.
            msg = ' --> Coordinates' if 'X' in df.columns else ' --> No coordinates'
            print(f"df{i} : {msg}")

    return dfs

## Reading files

In [3]:
work_dir = ROOT_DIR+'/CF_data/Result_traitem/'
save_dir = ROOT_DIR+'/CF_data/Donnees_fusionnees/'

In [4]:
# create my dictionary structure to retrieve good files (Keynames !!!)
files_dict={'Borehole':0,'Piezometer':0,'Piezair':0,'Trench':0,'Litho':0,'Equipm':0, 
            'Measure':0,'Sample':0,'Analysis':0,'Facility':0, 'Final':0}

In [5]:
files_search(work_dir, files_dict, prefix='', skip='source')

Borehole  	:  9
Piezometer  	:  17
Piezair  	:  2
Trench  	:  1
Litho  	:  9
Equipm  	:  4
Measure  	:  8
Sample  	:  31
Analysis  	:  29
Facility  	:  4
Final  	:  18


In [6]:
how=['inner', 'outer', 'left', 'right']

In [7]:
f = False
t = True

# ================== PROCESSING ===================== 

# Analysis

In [15]:
# Select the 'Analysis' group of files found by files_search and set up the
# names used by the following cells.
key='Analysis'
save_file = 'Merged_Analysis.csv'  # plain literal: the former f-string had no placeholders
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus'] #columns of interest
dataset = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

29 files


In [16]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Container_phyto/Param_agro_Analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Container_phyto/Result_SOL_Analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Memoris_seafile/Result_SOL_Analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Memoris_seafile/Result_eau_Analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Final_merge/Soil_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Final_merge/Water_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Result_eau_Analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Result_sol_Analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Final_merge/Soil_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Result_SOL_Analysis.csv',
 '/home/yan

In [8]:
# Select the 'Final' group of files found by files_search and set up the
# names used by the following cells.
key='Final'
# NOTE(review): the cells below merge '*_Samples-*.csv' files keyed on ID_ech,
# yet the result is saved under 'Merged_Analysis.csv' — confirm the file name.
save_file = 'Merged_Analysis.csv'  # plain literal: the former f-string had no placeholders
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus'] #columns of interest
dataset = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

18 files


In [9]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Final_merge/Soil_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Final_merge/Water_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Final_merge/Soil_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Final_merge/Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Final_merge/Lithologies.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Final_merge/Measures.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Final_merge/Soil_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Final_merge/Soil_samples.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Final_merge/Unknow_facilities.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memo

In [10]:
data_overview(files_dict[key])

Same files:[]
Files with coordinates:[1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17]
Files without coordinates:[0, 2]


## No coordinates

#### $\color{green}{\textbf{Read and merge}}$

In [11]:
file1= work_dir + 'Phase_1_Memoris/Final_merge/Soil_analysis.csv' # 0
file2= work_dir + 'Phase_2_Memoris/Final_merge/Soil_analysis.csv' # 2

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 29, columns : 74, Unique values on col 'ID_ech': 29


interactive(children=(IntSlider(value=3, description='rows', max=29, min=3, readout=False), IntSlider(value=12…

Rows : 25, columns : 64, Unique values on col 'ID_ech': 25


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=12…

(None, None)

In [12]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### First object dataset save

In [13]:
dataset_1 = mdf.copy() #saving

In [14]:
dataframe_viewer(dataset_1, rows=3, un_val='ID_ech', view=t)

Rows : 54, columns : 81, Unique values on col 'ID_ech': 54


interactive(children=(IntSlider(value=3, description='rows', max=54, min=3, readout=False), IntSlider(value=12…

## With coordinates

#### $\color{green}{\textbf{Read and merge}}$

In [40]:
a, b = 1, 5
file1= files_dict[key][a]
file2= files_dict[key][b]
print(files_dict[key][a].replace(work_dir,""),'||', files_dict[key][b].replace(work_dir,""))

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=5, un_val='ID', view=t), dataframe_viewer(df2, rows=5, un_val='ID', view=t)

Phase_1_Memoris/Final_merge/Water_analysis.csv || database_Memoris3/Final_merge/Measures.csv
df1 :  --> Coordinates
df2 :  --> Coordinates
Rows : 17, columns : 86, Unique values on col 'ID': 14


interactive(children=(IntSlider(value=5, description='rows', max=17, min=5, readout=False), IntSlider(value=12…

Rows : 534, columns : 28, Unique values on col 'ID': 366


interactive(children=(IntSlider(value=5, description='rows', max=534, min=5, readout=False), IntSlider(value=1…

(None, None)

In [31]:
file1= work_dir + 'Phase_1_Memoris/Final_merge/Soil_analysis.csv' # 0
file2= work_dir + 'Phase_2_Memoris/Final_merge/Soil_analysis.csv' # 2

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 29, columns : 74, Unique values on col 'ID_ech': 29


interactive(children=(IntSlider(value=3, description='rows', max=29, min=3, readout=False), IntSlider(value=12…

Rows : 25, columns : 64, Unique values on col 'ID_ech': 25


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=12…

(None, None)

In [12]:
fix_duplicates(df1, df2, id_col='ID_ech', drop_old_id=True)

31 duplicate objects fixed!


In [13]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### First object dataset save

In [14]:
dataset = mdf.copy() #saving

In [15]:
dataframe_viewer(dataset, rows=3, un_val='ID_ech', view=t)

Rows : 55, columns : 17, Unique values on col 'ID_ech': 44


interactive(children=(IntSlider(value=3, description='rows', max=55, min=3, readout=False), IntSlider(value=12…

#### $\color{green}{\textbf{Read and merge}}$

In [16]:
file1= work_dir + 'vUmons_logsFor/Analyse_eau_Phases1&2_Samples-water.csv' # 25
file2= work_dir + 'vUmons_logsFor/Analyse_sol_Phases1&2_Samples-soil.csv' # 26


df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> Coordinates
df2 :  --> Coordinates
Rows : 45, columns : 11, Unique values on col 'ID_ech': 41


interactive(children=(IntSlider(value=3, description='rows', max=45, min=3, readout=False), IntSlider(value=11…

Rows : 59, columns : 13, Unique values on col 'ID_ech': 59


interactive(children=(IntSlider(value=3, description='rows', max=59, min=3, readout=False), IntSlider(value=12…

(None, None)

In [17]:
fix_duplicates(df1, df2, id_col='ID_ech', crit_2nd_col='Type_ech', drop_old_id=True)

0 duplicate objects fixed!


In [18]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [19]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [20]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [21]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 146, columns : 26, Unique values on col 'ID_ech': 100


interactive(children=(IntSlider(value=10, description='rows', max=146, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [22]:
file1= work_dir + 'Phase_2_Memoris/Result_eau_Samples-water.csv' # 8

df1 = pd.read_csv(file1)
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t)

Rows : 11, columns : 10, Unique values on col 'ID_ech': 11


interactive(children=(IntSlider(value=3, description='rows', max=11, min=3, readout=False), IntSlider(value=10…

#### Merge with object dataset

In [23]:
dataset, conflict_df=data_merger(dataset, df1, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [24]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [25]:
# Remove the 'index' and 'Origin_ID' columns when they exist.
# errors='ignore' skips labels that are absent, so the cell can be re-run
# safely and no column is dropped while the column index is being iterated.
dataset.drop(columns=['index', 'Origin_ID'], errors='ignore', inplace=True)

In [26]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

some data must be checked !
Rows : 149 ; Columns : 26 ; Unique on 'ID_ech' : 94 ; 


In [27]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 149, columns : 26, Unique values on col 'ID_ech': 94


interactive(children=(IntSlider(value=10, description='rows', max=149, min=10, readout=False), IntSlider(value…

In [28]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [29]:
dataset_1 = dataset.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [30]:
file1= work_dir + 'Container_phyto/Param_agro_Samples-soil.csv' # 0
file2= work_dir + 'Container_phyto/Result_SOL_Samples-soil.csv' # 2

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 5, columns : 21, Unique values on col 'ID_ech': 5


interactive(children=(IntSlider(value=3, description='rows', max=5, min=3, readout=False), IntSlider(value=12,…

Rows : 5, columns : 8, Unique values on col 'ID_ech': 5


interactive(children=(IntSlider(value=3, description='rows', max=5, min=3, readout=False), IntSlider(value=8, …

(None, None)

In [31]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [32]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID_ech', 'Date_prv'], crit_2nd_col='Date_prv', dist_max=1, 
                             error_tol_dict={1:['MS', 'Fract_2', 'Fract_2+']})

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [33]:
conflict_df

Unnamed: 0,Check_col,ID_ech,Date_prv,Fract_2_x,Fract_2_y,Fract_2+_x,Fract_2+_y
0,"Fract_2, Fract_2+",Ech. 1,2017-12-14 00:00:00,79,33,21,67
2,"Fract_2, Fract_2+",Ech. 3,2017-12-14 00:00:00,69,40,31,60
3,"Fract_2, Fract_2+",Ech. 4,2017-12-14 00:00:00,70,45,30,55
4,"Fract_2, Fract_2+",Ech. 5,2017-12-14 00:00:00,71,29,29,71


In [34]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                      valid_dict={'Fract_2+_y':list(conflict_df.index), 'Fract_2_x':list(conflict_df.index)})

all conflicts have been fixed!


In [35]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 5, columns : 21, Unique values on col 'ID_ech': 5


interactive(children=(IntSlider(value=5, description='rows', max=5, min=5, readout=False), IntSlider(value=12,…

#### Merge with object dataset

In [36]:
dataset = mdf.copy()

In [37]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [38]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 5, columns : 21, Unique values on col 'ID_ech': 5


interactive(children=(IntSlider(value=5, description='rows', max=5, min=5, readout=False), IntSlider(value=12,…

#### $\color{green}{\textbf{Read and merge}}$

In [39]:
file1= work_dir + 'Memoris_seafile/Result_SOL_Samples-soil.csv' # 3
file2= work_dir + 'Phase_1_Memoris/Result_sol_Samples-soil.csv' # 6

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 60, columns : 12, Unique values on col 'ID_ech': 60


interactive(children=(IntSlider(value=3, description='rows', max=60, min=3, readout=False), IntSlider(value=12…

Rows : 29, columns : 11, Unique values on col 'ID_ech': 29


interactive(children=(IntSlider(value=3, description='rows', max=29, min=3, readout=False), IntSlider(value=11…

(None, None)

In [40]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID_ech'], crit_2nd_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


In [41]:
dataframe_viewer(conflict_df, rows=3, un_val='ID_ech', view=t)

Rows : 28, columns : 6, Unique values on col 'ID_ech': 28


interactive(children=(IntSlider(value=3, description='rows', max=28, min=3, readout=False), IntSlider(value=6,…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [42]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Fract_2_y':list(conflict_df.index), 'Fract_2+_y':list(conflict_df.index)})

all conflicts have been fixed!


#### Merge with object dataset

In [43]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [44]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [45]:
dataset, check1 = data_filter(dataset, position=True, id_col='ID_ech', expression='sup|prof', 
                              dist_max=1, drop=True, drop_old_id=True)

Rows : 65 ; Columns : 25 ; Unique on 'ID_ech' : 65 ; 


In [46]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 65, columns : 25, Unique values on col 'ID_ech': 65


interactive(children=(IntSlider(value=10, description='rows', max=65, min=10, readout=False), IntSlider(value=…

#### $\color{green}{\textbf{Read and merge}}$

In [47]:
file1= work_dir + 'Phase_2_Memoris/Result_SOL_Samples-soil.csv' # 7
file2= work_dir + 'Siterem_Ext_Pilote/Param_physico_Samples-water.csv' # 10

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 25, columns : 12, Unique values on col 'ID_ech': 25


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=12…

Rows : 33, columns : 13, Unique values on col 'ID_ech': 6


interactive(children=(IntSlider(value=3, description='rows', max=33, min=3, readout=False), IntSlider(value=12…

(None, None)

In [48]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [49]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [50]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


In [51]:
dataframe_viewer(conflict_df, rows=3, un_val='ID_ech', view=t)

Rows : 25, columns : 6, Unique values on col 'ID_ech': 25


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=6,…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [52]:
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Fract_2_y':list(conflict_df.index), 'Fract_2+_y':list(conflict_df.index)})

all conflicts have been fixed!


In [53]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [54]:
dataframe_viewer(dataset, rows=10, un_val='ID_ech', view=t)

Rows : 98, columns : 35, Unique values on col 'ID_ech': 71


interactive(children=(IntSlider(value=10, description='rows', max=98, min=10, readout=False), IntSlider(value=…

#### $\color{green}{\textbf{Read and merge}}$

In [55]:
file1= work_dir + 'Siterem_Ext_Pilote/Result_eau_Samples-water.csv' # 11
file2= work_dir + 'Siterem_Pilote/Inorganic_major_Samples-water.csv' # 12

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 31, columns : 8, Unique values on col 'ID_ech': 6


interactive(children=(IntSlider(value=3, description='rows', max=31, min=3, readout=False), IntSlider(value=8,…

Rows : 51, columns : 6, Unique values on col 'ID_ech': 7


interactive(children=(IntSlider(value=3, description='rows', max=51, min=3, readout=False), IntSlider(value=6,…

(None, None)

In [56]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [57]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

#### Merge with object dataset

In [58]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on=['ID_ech', 'Periode'], 
                                 crit_2nd_col='Date_prv', dist_max=1)

dataframe_viewer(t1.query("ID_ech=='207'"), rows=5, cols=13, un_val='ID_ech', view=t)

len(mdf), len(dataset), len(t2)

In [59]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

#### $\color{green}{\textbf{Read and merge}}$

In [60]:
file1= work_dir + 'Siterem_Result_Sol/Result_sol_ExtP_Samples-soil.csv' # 15
file2= work_dir + 'Siterem_Result_Sol/SOL_T1_Pilote_Samples-soil.csv' # 16

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 43, columns : 19, Unique values on col 'ID_ech': 43


interactive(children=(IntSlider(value=3, description='rows', max=43, min=3, readout=False), IntSlider(value=12…

Rows : 15, columns : 11, Unique values on col 'ID_ech': 15


interactive(children=(IntSlider(value=3, description='rows', max=15, min=3, readout=False), IntSlider(value=11…

(None, None)

In [61]:
#df2.rename(columns={'ID':'ID_ech'}, inplace=True)
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [62]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [63]:
dataframe_viewer(mdf, rows=5, un_val='ID_ech', view=t)

Rows : 58, columns : 20, Unique values on col 'ID_ech': 58


interactive(children=(IntSlider(value=5, description='rows', max=58, min=5, readout=False), IntSlider(value=12…

#### Merge with object dataset

In [64]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [65]:
dataframe_viewer(conflict_df, rows=3, un_val='ID_ech', view=t)

Rows : 6, columns : 6, Unique values on col 'ID_ech': 6


interactive(children=(IntSlider(value=3, description='rows', max=6, min=3, readout=False), IntSlider(value=6, …

In [66]:
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Fract_2_y':list(conflict_df.index), 'Fract_2+_y':list(conflict_df.index)})

all conflicts have been fixed!


In [67]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [68]:
v = dataset.query("ID_ech=='201'")#[['ID_ech', 'Date_prv', 'Periode', 'pH', 'Emplacement']]
dataframe_viewer(v, rows=3, un_val='ID_ech', view=t)

Rows : 7, columns : 38, Unique values on col 'ID_ech': 1


interactive(children=(IntSlider(value=3, description='rows', max=7, min=3, readout=False), IntSlider(value=12,…

#### $\color{green}{\textbf{Read and merge}}$

In [69]:
file1= work_dir + 'database_Memoris3/Donnees_piezos_Samples-water.csv' # 17
file2= work_dir + 'database_Memoris3/Result_eau_Samples-water.csv' # 18

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df2 :  --> Coordinates
Rows : 130, columns : 12, Unique values on col 'ID': 130


interactive(children=(IntSlider(value=3, description='rows', max=130, min=3, readout=False), IntSlider(value=1…

Rows : 130, columns : 13, Unique values on col 'ID': 130


interactive(children=(IntSlider(value=3, description='rows', max=130, min=3, readout=False), IntSlider(value=1…

(None, None)

In [70]:
df1.rename(columns={'ID':'ID_ech'}, inplace=True)
df2.rename(columns={'ID':'ID_ech'}, inplace=True)
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [71]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [72]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 130, columns : 15, Unique values on col 'ID_ech': 130


interactive(children=(IntSlider(value=10, description='rows', max=130, min=10, readout=False), IntSlider(value…

#### Merge with object dataset

In [73]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [74]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [75]:
dataset_2 = dataset.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [76]:
file1= work_dir + 'database_Memoris3/Result_sol_Samples-soil.csv' # 19
file2= work_dir + 'donnees_terrain_2019/Echantillon_Samples-soil.csv' # 21

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df2 :  --> No coordinates
Rows : 1414, columns : 28, Unique values on col 'ID': 321


interactive(children=(IntSlider(value=3, description='rows', max=1414, min=3, readout=False), IntSlider(value=…

Rows : 70, columns : 9, Unique values on col 'ID': 22


interactive(children=(IntSlider(value=3, description='rows', max=70, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [77]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)
df1.rename(columns={'Intensité':'Intensite'}, inplace=True)

In [78]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [79]:
mdf = mdf.query("ID_ech==ID_ech")

In [80]:
dataframe_viewer(mdf, rows=5, un_val='ID_ech', view=t)

Rows : 1419, columns : 29, Unique values on col 'ID_ech': 1372


interactive(children=(IntSlider(value=5, description='rows', max=1419, min=5, readout=False), IntSlider(value=…

#### Merge with object dataset

In [81]:
dataset = mdf.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [82]:
file1= work_dir + 'donnees_terrain_2019/Result_Sol_Samples-soil.csv' # 22
file2= work_dir + 'profils_sols_donnees_forages/Echant-organo_Samples-soil.csv' # 24

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 55, columns : 19, Unique values on col 'ID_ech': 55


interactive(children=(IntSlider(value=3, description='rows', max=55, min=3, readout=False), IntSlider(value=12…

Rows : 32, columns : 9, Unique values on col 'ID_ech': 29


interactive(children=(IntSlider(value=3, description='rows', max=32, min=3, readout=False), IntSlider(value=9,…

(None, None)

In [83]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)
df2.rename(columns={'Intensité':'Intensite'}, inplace=True)

In [84]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [85]:
dataframe_viewer(mdf, rows=10, un_val='ID_ech', view=t)

Rows : 87, columns : 24, Unique values on col 'ID_ech': 84


interactive(children=(IntSlider(value=10, description='rows', max=87, min=10, readout=False), IntSlider(value=…

In [86]:
dataset.drop(columns='ID', inplace=True)
mdf.drop(columns='ID', inplace=True)

#### Merge with object dataset

In [87]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [88]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

#### $\color{green}{\textbf{Read and merge}}$

In [89]:
file1= work_dir + 'Siterem_Pilote/Result_eau_Samples-water.csv' # 14
file2= work_dir + 'observ_terrain/Piezometrie_Samples-water.csv' # 23

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 87, columns : 13, Unique values on col 'ID_ech': 10


interactive(children=(IntSlider(value=3, description='rows', max=87, min=3, readout=False), IntSlider(value=12…

Rows : 27, columns : 14, Unique values on col 'ID': 27


interactive(children=(IntSlider(value=3, description='rows', max=27, min=3, readout=False), IntSlider(value=12…

(None, None)

In [90]:
df2.rename(columns={'ID':'ID_ech'}, inplace=True)
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)

In [91]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID_ech'], crit_2nd_col='Date_prv', dist_max=1)

In [92]:
dataframe_viewer(mdf, rows=5, un_val='ID_ech', view=t)

Rows : 114, columns : 20, Unique values on col 'ID_ech': 37


interactive(children=(IntSlider(value=5, description='rows', max=114, min=5, readout=False), IntSlider(value=1…

#### Merge with object dataset

In [93]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [94]:
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

#### $\color{green}{\textbf{Read and merge}}$

In [95]:
file1= work_dir + 'Container_phyto/Param_agro_Samples-water.csv' # 1
file2= work_dir + 'Siterem_Pilote/Param_physico_Samples-water.csv' # 13

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID_ech', view=t), dataframe_viewer(df2, rows=3, un_val='ID_ech', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 24, columns : 6, Unique values on col 'ID_ech': 6


interactive(children=(IntSlider(value=3, description='rows', max=24, min=3, readout=False), IntSlider(value=6,…

Rows : 95, columns : 13, Unique values on col 'ID_ech': 15


interactive(children=(IntSlider(value=3, description='rows', max=95, min=3, readout=False), IntSlider(value=12…

(None, None)

In [96]:
df1.ID_ech = df1.ID_ech.astype(str)
df2.ID_ech = df2.ID_ech.astype(str)
df2.rename(columns={'Temp_prv ':'Temp_prv'}, inplace=True)

In [97]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID_ech', crit_2nd_col='Date_prv', dist_max=1)

In [98]:
dataframe_viewer(mdf, rows=5, un_val='ID_ech', view=t)

Rows : 119, columns : 14, Unique values on col 'ID_ech': 21


interactive(children=(IntSlider(value=5, description='rows', max=119, min=5, readout=False), IntSlider(value=1…

In [99]:
mdf = mdf.query('ID_ech==ID_ech')

#### Merge with object dataset

In [100]:
t1, conflict_df=data_merger(dataset, mdf, how=how[1], on=['ID_ech', 'Date_prv'], crit_2nd_col='Date_prv', dist_max=1)

In [101]:
dataframe_viewer(t1, rows=3, un_val='ID_ech', view=t)

Rows : 1685, columns : 56, Unique values on col 'ID_ech': 1451


interactive(children=(IntSlider(value=3, description='rows', max=1685, min=3, readout=False), IntSlider(value=…

In [102]:
# NOTE(review): the merge two cells above stored its result in `t1`, not in
# `dataset`, so this snapshot does not contain that last merge (t1 is shown
# with 1685 rows, dataset_3 later with 1567) — confirm this is intended.
dataset_3 = dataset.copy()

===============================================================================================
## Merging dataset versions

In [103]:
# 1st merge
# Preview both inputs of the merge. The two viewers are issued as separate
# statements; the former trailing comma only built a throw-away 1-tuple.
dataframe_viewer(dataset_1, rows=3, un_val=['ID_ech', 'Date_prv'], view=t)
dataframe_viewer(dataset_2, rows=3, un_val=['ID_ech', 'Date_prv'], view=t)

Rows : 149, columns : 26, Unique values on cols: {'ID_ech': 94, 'Date_prv': 11}


interactive(children=(IntSlider(value=3, description='rows', max=149, min=3, readout=False), IntSlider(value=1…

Rows : 336, columns : 50, Unique values on cols: {'ID_ech': 260, 'Date_prv': 30}


interactive(children=(IntSlider(value=3, description='rows', max=336, min=3, readout=False), IntSlider(value=1…

In [104]:
# Append ' 00:00:00' to every string Date_prv that lacks it, so date-only
# strings take the same form as the timestamp-style strings before Date_prv
# is used as a merge key. Non-string values (e.g. NaN) are left untouched.
# Series.map works row by row on position, so — unlike writing through
# .loc[label] inside iterrows — rows sharing an index label cannot overwrite
# each other.
dataset_1['Date_prv'] = dataset_1['Date_prv'].map(
    lambda d: d + ' 00:00:00' if isinstance(d, str) and '00:00:00' not in d else d
)

In [105]:
# Append ' 00:00:00' to every string Date_prv that lacks it, so date-only
# strings take the same form as the timestamp-style strings before Date_prv
# is used as a merge key. Non-string values (e.g. NaN) are left untouched.
# Series.map works row by row on position, so — unlike writing through
# .loc[label] inside iterrows — rows sharing an index label cannot overwrite
# each other.
dataset_2['Date_prv'] = dataset_2['Date_prv'].map(
    lambda d: d + ' 00:00:00' if isinstance(d, str) and '00:00:00' not in d else d
)

In [106]:
dataset_2.Ech_top = dataset_2.Ech_top.replace(',','.', regex=True)
dataset_2.Ech_base = dataset_2.Ech_base.replace(',','.', regex=True)

In [107]:
dataset_2.Ech_top = dataset_2.Ech_top.astype(float)
dataset_2.Ech_base = dataset_2.Ech_base.astype(float)

In [108]:
mdf, conflict_df=data_merger(dataset_1, dataset_2, how=how[1], on=['ID_ech', 'Date_prv','Ech_top', 'Ech_base'], 
                               crit_2nd_col='Date_prv', dist_max=1)

Conflict values present. Please resolve this manually !


#### $\color{blue}{\textbf{Manage conflicts}}$

In [109]:
dataframe_viewer(conflict_df, rows=5)

Rows : 3, columns : 13


interactive(children=(IntSlider(value=3, description='rows', max=3, min=3, readout=False), IntSlider(value=12,…

In [110]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Fract_2_y':list(conflict_df.index), 'Fract_2+_y':list(conflict_df.index),
                           'MS_y':list(conflict_df.index), 'Organo_y':list(conflict_df.index)})

all conflicts have been fixed!


In [111]:
dataset = mdf.copy()
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [112]:
# 2nd merge
# Preview both inputs of the merge. The two viewers are issued as separate
# statements; the former trailing comma only built a throw-away 1-tuple.
dataframe_viewer(dataset, rows=3, un_val=['ID_ech', 'Date_prv'], view=t)
dataframe_viewer(dataset_3, rows=3, un_val=['ID_ech', 'Date_prv'], view=t)

Rows : 477, columns : 58, Unique values on cols: {'ID_ech': 325, 'Date_prv': 30}


interactive(children=(IntSlider(value=3, description='rows', max=477, min=3, readout=False), IntSlider(value=1…

Rows : 1567, columns : 55, Unique values on cols: {'ID_ech': 1440, 'Date_prv': 1374}


interactive(children=(IntSlider(value=3, description='rows', max=1567, min=3, readout=False), IntSlider(value=…

In [113]:
mdf, conflict_df=data_merger(dataset, dataset_3, how=how[1], on=['ID_ech', 'Date_prv'], dist_max=1)

In [114]:
dataset = mdf.copy()
if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

####  $\color{red}{\textbf{Save final object dataset}}$

In [115]:
# Create the output folder when it is missing. exist_ok=True replaces the
# separate existence check, so the cell is safe to re-run.
os.makedirs(save_dir, exist_ok=True)

# Write the merged dataset without the DataFrame index. os.path.join gives
# the same path as save_dir + save_file here (save_dir ends with '/') and
# keeps working if that trailing separator is ever removed.
dataset.to_csv(os.path.join(save_dir, save_file), index=False)

## Querying