# Final merge of memoris data

In [1]:
from utils.io import dataframe_viewer, files_search, data_merger, data_validation, data_overview, \
data_filter, fix_duplicates

import re, os
import numpy as np
import pandas as pd
import datetime as dtm
from definitions import ROOT_DIR

In [2]:
def create_df(files, verbose=True):  # TODO: find a more descriptive name for this function
    """
    Load each CSV file into a DataFrame and optionally report whether it
    contains position information (i.e. a column named 'X').

    Parameters
    ----------
    files : list of str
        Paths of the comma-separated files to read.
    verbose : bool, default True
        When True, print one line per file stating whether an 'X' column is present.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per input file, in the same order as `files`.
    """
    frames = []
    for rank, path in enumerate(files, start=1):
        frame = pd.read_csv(path, delimiter=',')
        frames.append(frame)

        if not verbose:
            continue

        # The message keeps its leading space, hence the double space after the colon in the output
        status = ' --> Coordinates' if 'X' in frame.columns else ' --> No coordinates'
        print(f"df{rank} : {status}")

    return frames

## Reading files

In [3]:
# Input folder with the per-source processed files, and output folder for the merged CSVs.
# Both keep their trailing '/' because later cells build file paths by plain string concatenation.
work_dir = f'{ROOT_DIR}/CF_data/Result_traitem/'
save_dir = f'{ROOT_DIR}/CF_data/Donnees_fusionnees/final_data/'

In [4]:
# create my dictionary structure to retrieve good files (Keynames !!!)
# The 0 values are placeholders: once files_search() has run, each key holds a list of file paths
# (see the files_dict[key] outputs in the sections below).
files_dict={'Borehole':0,'Litho':0,'Equipm':0,'Measure':0,'Sample':0,'Analysis':0}

In [5]:
# Fills files_dict in place (the return value is not used) and prints the number of files per key.
# Presumably keeps files whose name starts with 'final' and skips paths containing 'source' — confirm in utils.io.
files_search(work_dir, files_dict, prefix='final', skip='source')

Borehole  	:  2
Litho  	:  2
Equipm  	:  2
Measure  	:  2
Sample  	:  2
Analysis  	:  10


In [6]:
# Join types passed to data_merger's `how` argument; the cells below select one by position (how[1] == 'outer').
how=['inner', 'outer', 'left', 'right']

In [7]:
# Short aliases for the booleans; `t` is passed as view=t in the dataframe_viewer calls below.
f = False
t = True

# ================== PROCESSING  ===================== 

As a first step, only boreholes, lithologies, samples and soil analyses are processed here.

# Boreholes

In [8]:
key='Borehole'  # files_dict key selecting this section's input files
save_file = f'Merged_{key}.csv'  # output file name, written under save_dir
# NOTE(review): `coi` is not referenced by any later cell visible in this notebook — confirm it is still needed
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus'] #columns of interest
# Placeholder only: `dataset` is reassigned from `mdf` after the merge below
dataset = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

2 files


In [9]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Final_merge/final_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Final_merge/final_Boreholes.csv']

#### $\color{green}{\textbf{Read and merge}}$

In [10]:
# The two borehole files to merge; the trailing numbers are their positions in files_dict['Borehole']
file1= work_dir + 'donnees_terrain_2019/Final_merge/final_Boreholes.csv' # 0
file2= work_dir + 'profils_sols_donnees_forages/Final_merge/final_Boreholes.csv' # 1

df1, df2 = create_df([file1, file2])

# Called as separate statements so the cell does not echo a `(None, None)` tuple of return values
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df2 :  --> Coordinates
Rows : 25, columns : 18, Unique values on col 'ID': 25


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=12…

Rows : 25, columns : 23, Unique values on col 'ID': 25


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=12…

(None, None)

In [11]:
# Outer merge (how[1]) of the two borehole tables on 'ID'. data_merger returns two frames: the merged
# table and, presumably, the rows whose values disagree between sources (confirm in utils.io).
# The meaning of crit_2nd_col / dist_max is defined by data_merger and is not visible here.
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', crit_2nd_col='Date_ouv', dist_max=1)

In [12]:
boreholes = mdf.copy() #saving

In [13]:
dataframe_viewer(boreholes, rows=3, un_val='ID', view=t)

Rows : 50, columns : 26, Unique values on col 'ID': 50


interactive(children=(IntSlider(value=3, description='rows', max=50, min=3, readout=False), IntSlider(value=12…

In [14]:
# Work on a copy so the merged frame `mdf` stays untouched.
dataset = mdf.copy()
if 'level_0' in dataset.columns:
    # 'level_0' becomes the 'index' column, replacing any existing 'index' column
    # (errors='ignore' makes the drop a no-op when there is none).
    dataset = dataset.drop(columns='index', errors='ignore').rename(columns={'level_0': 'index'})

####  $\color{red}{\textbf{Save object dataset}}$

In [15]:
# exist_ok=True creates the output folder only when it is missing, replacing the separate
# exists()/makedirs() pair (which can fail if the folder appears between the two calls).
os.makedirs(save_dir, exist_ok=True)

# Write the merged borehole table without the DataFrame index
dataset.to_csv(os.path.join(save_dir, save_file), index=False)

# Lithologies

In [16]:
key='Litho'  # files_dict key selecting this section's input files
save_file = f'Merged_{key}.csv'  # output file name, written under save_dir
# NOTE(review): same list as in the Boreholes section and not referenced by any later visible cell —
# looks like a copy-paste leftover; confirm before relying on it
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus'] #columns of interest
# Placeholder only: `dataset` is reassigned from `mdf` after the merge below
dataset = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

2 files


In [17]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Final_merge/final_Lithologies.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Final_merge/final_Lithologies.csv']

#### $\color{green}{\textbf{Read and merge}}$

In [18]:
# The two lithology files to merge; the trailing numbers are their positions in files_dict['Litho']
file1= work_dir + 'donnees_terrain_2019/Final_merge/final_Lithologies.csv' # 0
file2= work_dir + 'profils_sols_donnees_forages/Final_merge/final_Lithologies.csv' # 1

df1, df2 = create_df([file1, file2])

# Called as separate statements so the cell does not echo a `(None, None)` tuple of return values
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df2 :  --> Coordinates
Rows : 88, columns : 23, Unique values on col 'ID': 26


interactive(children=(IntSlider(value=3, description='rows', max=88, min=3, readout=False), IntSlider(value=12…

Rows : 55, columns : 18, Unique values on col 'ID': 25


interactive(children=(IntSlider(value=3, description='rows', max=55, min=3, readout=False), IntSlider(value=12…

(None, None)

In [19]:
# Outer merge (how[1]) of the two lithology tables; a row is keyed by borehole 'ID' plus the
# 'Litho_top' / 'Litho_base' columns.
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID', 'Litho_top', 'Litho_base'], dist_max=1)

In [20]:
lithologies = mdf.copy() #saving

In [21]:
dataframe_viewer(lithologies, rows=3, un_val='ID', view=t)

Rows : 143, columns : 25, Unique values on col 'ID': 51


interactive(children=(IntSlider(value=3, description='rows', max=143, min=3, readout=False), IntSlider(value=1…

In [22]:
# Work on a copy so the merged frame `mdf` stays untouched.
dataset = mdf.copy()
if 'level_0' in dataset.columns:
    # 'level_0' becomes the 'index' column, replacing any existing 'index' column
    # (errors='ignore' makes the drop a no-op when there is none).
    dataset = dataset.drop(columns='index', errors='ignore').rename(columns={'level_0': 'index'})

####  $\color{red}{\textbf{Save object dataset}}$

In [23]:
# exist_ok=True creates the output folder only when it is missing, replacing the separate
# exists()/makedirs() pair (which can fail if the folder appears between the two calls).
os.makedirs(save_dir, exist_ok=True)

# Write the merged lithology table without the DataFrame index
dataset.to_csv(os.path.join(save_dir, save_file), index=False)

# Samples

In [24]:
key='Sample'  # files_dict key selecting this section's input files
save_file = f'Merged_{key}.csv'  # output file name, written under save_dir
# NOTE(review): same list as in the Boreholes section and not referenced by any later visible cell —
# looks like a copy-paste leftover; confirm before relying on it
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus'] #columns of interest
# Placeholder only: `dataset` is reassigned from `mdf` after the merge below
dataset = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

2 files


In [25]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Final_merge/final_Soil_samples.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Final_merge/final_Soil_samples.csv']

#### $\color{green}{\textbf{Read and merge}}$

In [26]:
# The two soil-sample files to merge; the trailing numbers are their positions in files_dict['Sample']
file1= work_dir + 'donnees_terrain_2019/Final_merge/final_Soil_samples.csv' # 0
file2= work_dir + 'profils_sols_donnees_forages/Final_merge/final_Soil_samples.csv' # 1

df1, df2 = create_df([file1, file2])

# Called as separate statements so the cell does not echo a `(None, None)` tuple of return values
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df2 :  --> Coordinates
Rows : 73, columns : 26, Unique values on col 'ID': 25


interactive(children=(IntSlider(value=3, description='rows', max=73, min=3, readout=False), IntSlider(value=12…

Rows : 42, columns : 22, Unique values on col 'ID': 25


interactive(children=(IntSlider(value=3, description='rows', max=42, min=3, readout=False), IntSlider(value=12…

(None, None)

In [27]:
# Outer merge (how[1]) of the two sample tables, keyed by borehole 'ID' plus sample 'ID_ech'.
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID', 'ID_ech'], crit_2nd_col='Date_ouv', dist_max=1)

In [28]:
samples = mdf.copy() #saving

In [29]:
dataframe_viewer(samples, rows=3, un_val='ID', view=t)

Rows : 115, columns : 29, Unique values on col 'ID': 50


interactive(children=(IntSlider(value=3, description='rows', max=115, min=3, readout=False), IntSlider(value=1…

In [30]:
# Work on a copy so the merged frame `mdf` stays untouched.
dataset = mdf.copy()
if 'level_0' in dataset.columns:
    # 'level_0' becomes the 'index' column, replacing any existing 'index' column
    # (errors='ignore' makes the drop a no-op when there is none).
    dataset = dataset.drop(columns='index', errors='ignore').rename(columns={'level_0': 'index'})

####  $\color{red}{\textbf{Save object dataset}}$

In [31]:
# exist_ok=True creates the output folder only when it is missing, replacing the separate
# exists()/makedirs() pair (which can fail if the folder appears between the two calls).
os.makedirs(save_dir, exist_ok=True)

# Write the merged sample table without the DataFrame index
dataset.to_csv(os.path.join(save_dir, save_file), index=False)

# Analysis (soil only)

In [32]:
key='Analysis'  # files_dict key selecting this section's input files
save_file = f'Merged_{key}.csv'  # output file name, written under save_dir
# NOTE(review): same list as in the Boreholes section and not referenced by any later visible cell —
# looks like a copy-paste leftover; confirm before relying on it
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus'] #columns of interest
# Placeholder only: `dataset` is reassigned from `mdf` after the merges below
dataset = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

10 files


In [33]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Final_merge/final_Soil_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_1_Memoris/Final_merge/final_Water_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Final_merge/final_Soil_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Phase_2_Memoris/Final_merge/final_Water_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Siterem_Ext_Pilote/Final_merge/final_water_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Siterem_Pilote/Final_merge/final_water_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Siterem_Result_Sol/Final_merge/final_Soil_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Final_merge/final_Soil_analysis.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/vUmons_logsFor/Final_merge/final_Soil_analysis.csv',
 '/hom

#### $\color{green}{\textbf{Read and merge}}$

In [34]:
# Soil-analysis files of the two Memoris phases; the trailing numbers are their positions in
# files_dict['Analysis'] (the water-analysis files in that listing are not loaded here)
file1= work_dir + 'Phase_1_Memoris/Final_merge/final_Soil_analysis.csv' # 0
file2= work_dir + 'Phase_2_Memoris/Final_merge/final_Soil_analysis.csv' # 2

df1, df2 = create_df([file1, file2])

# Called as separate statements so the cell does not echo a `(None, None)` tuple of return values
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df2 :  --> No coordinates
Rows : 29, columns : 74, Unique values on col 'ID': 15


interactive(children=(IntSlider(value=3, description='rows', max=29, min=3, readout=False), IntSlider(value=12…

Rows : 25, columns : 64, Unique values on col 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=25, min=3, readout=False), IntSlider(value=12…

(None, None)

In [35]:
# Outer merge (how[1]) of the Phase 1 and Phase 2 soil analyses, keyed by sample 'ID_ech'.
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID_ech'], dist_max=1)

#### First object dataset save

In [36]:
analysis = mdf.copy() #saving

In [37]:
dataframe_viewer(analysis, rows=3, un_val='ID', view=t)

Rows : 54, columns : 81, Unique values on col 'ID': 27


interactive(children=(IntSlider(value=3, description='rows', max=54, min=3, readout=False), IntSlider(value=12…

#### $\color{green}{\textbf{Read and merge}}$

In [38]:
# Siterem soil results and 2019 field-data soil analyses; the trailing numbers are their positions
# in files_dict['Analysis']
file1= work_dir + 'Siterem_Result_Sol/Final_merge/final_Soil_analysis.csv' # 6
file2= work_dir + 'donnees_terrain_2019/Final_merge/final_Soil_analysis.csv' # 7

df1, df2 = create_df([file1, file2])

# Called as separate statements so the cell does not echo a `(None, None)` tuple of return values
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df2 :  --> Coordinates
Rows : 58, columns : 83, Unique values on col 'ID': 58


interactive(children=(IntSlider(value=3, description='rows', max=58, min=3, readout=False), IntSlider(value=12…

Rows : 75, columns : 112, Unique values on col 'ID': 26


interactive(children=(IntSlider(value=3, description='rows', max=75, min=3, readout=False), IntSlider(value=12…

(None, None)

In [39]:
# Outer merge (how[1]) of the Siterem and 2019 field-data soil analyses on 'ID_ech'.
# This call reports conflicting values (see the message below); they are resolved in the next cell.
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID_ech'], dist_max=1)

Conflict values present. Please resolve this manually !


In [40]:
# Resolve the conflicts by selecting, for every conflict row, the '<column>_y' variant of each column
# listed in 'Check_col' ('_y' is presumably the right-hand frame of the merge — confirm in utils.io).
# NOTE(review): the column list is read from the hard-coded row label 44 only, whereas the later conflict
# cells collect 'Check_col' over all rows; this breaks if the conflict index changes — confirm that every
# conflict row shares the same column list.
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={i+'_y':list(conflict_df.index) for i in conflict_df.loc[44,'Check_col'].split(", ")})

all conflicts have been fixed!


In [41]:
# Fold the Siterem / 2019 field-data merge (mdf) into the running `analysis` table.
# NOTE(review): `analysis` is both input and output, so re-running this cell on its own merges mdf a
# second time; the conflict_df returned here is overwritten later without being inspected.
analysis, conflict_df=data_merger(analysis, mdf, how=how[1], on=['ID_ech'], dist_max=1)

#### $\color{green}{\textbf{Read and merge}}$

In [42]:
# vUmons soil analyses (position 8 in files_dict['Analysis']).
# NOTE(review): file2 is the 2019 field-data file already loaded and merged earlier in this section, and
# the resulting df2 is not used by any later visible cell (the next merge only takes df1) — confirm
# whether another file was intended here.
file1= work_dir + 'vUmons_logsFor/Final_merge/final_Soil_analysis.csv' # 8
file2= work_dir + 'donnees_terrain_2019/Final_merge/final_Soil_analysis.csv' # 7

df1, df2 = create_df([file1, file2])

# Called as separate statements so the cell does not echo a `(None, None)` tuple of return values
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> Coordinates
df2 :  --> Coordinates
Rows : 59, columns : 74, Unique values on col 'ID': 30


interactive(children=(IntSlider(value=3, description='rows', max=59, min=3, readout=False), IntSlider(value=12…

Rows : 75, columns : 112, Unique values on col 'ID': 26


interactive(children=(IntSlider(value=3, description='rows', max=75, min=3, readout=False), IntSlider(value=12…

(None, None)

In [43]:
# Outer merge (how[1]) of the running `analysis` table with the vUmons frame (df1) on 'ID_ech'.
# This call reports conflicting values (see the message below); they are resolved in the next cells.
mdf, conflict_df=data_merger(analysis, df1, how=how[1], on=['ID_ech'], dist_max=1)

Conflict values present. Please resolve this manually !


In [44]:
# Each 'Check_col' entry is a ", "-separated list of the column names in conflict for that row.
flagged = [col for entry in conflict_df.Check_col for col in entry.split(", ")]

# De-duplicate while keeping first-seen order. dict.fromkeys replaces pd.unique(), whose use on a
# plain Python list is deprecated in recent pandas versions.
vlist = list(dict.fromkeys(flagged))
print(vlist)

['Fract_C21C35', 'ID', 'Fract_C16C21', 'HC_tot_C10C35', 'Fract_C10C12', 'Fract_C12C16', 'Date_prv']


In [45]:
# Resolve the conflicts by selecting the '<column>_y' variant of every flagged column (vlist) for all
# conflict rows ('_y' is presumably the right-hand frame of the merge — confirm in utils.io).
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={i+'_y':list(conflict_df.index) for i in vlist})

all conflicts have been fixed!


In [46]:
analysis = mdf.copy() #saving

In [47]:
# Work on a copy so the merged frame `mdf` stays untouched.
dataset = mdf.copy()
if 'level_0' in dataset.columns:
    # 'level_0' becomes the 'index' column, replacing any existing 'index' column
    # (errors='ignore' makes the drop a no-op when there is none).
    dataset = dataset.drop(columns='index', errors='ignore').rename(columns={'level_0': 'index'})

####  $\color{red}{\textbf{Save object dataset}}$

In [48]:
# exist_ok=True creates the output folder only when it is missing, replacing the separate
# exists()/makedirs() pair (which can fail if the folder appears between the two calls).
os.makedirs(save_dir, exist_ok=True)

# Write the merged soil-analysis table without the DataFrame index
dataset.to_csv(os.path.join(save_dir, save_file), index=False)

### ================= DATASETS MERGING ===========================

In [49]:
# Step 1: outer merge (how[1]) of samples with boreholes on ('ID', 'Date_ouv').
# NOTE(review): the conflict_df returned here is overwritten by the next merge without being inspected.
mdf, conflict_df = data_merger(samples, boreholes, how=how[1], on=['ID', 'Date_ouv'], 
                             crit_2nd_col='Date_ouv', dist_max=1)

In [50]:
# Step 2: outer merge (how[1]) of the samples+boreholes table with lithologies on ('ID', 'Date_ouv').
# NOTE(review): the conflict_df returned here is overwritten by the next merge without being inspected.
dataset, conflict_df = data_merger(mdf, lithologies, how=how[1], on=['ID', 'Date_ouv'], 
                              crit_2nd_col='Date_ouv', dist_max=1)

In [51]:
# Step 3: outer merge (how[1]) of the combined table with the soil analyses on ('ID_ech', 'Date_ouv').
# This call reports conflicting values (see the message below); they are resolved in the next cells.
mdf, conflict_df = data_merger(dataset, analysis, how=how[1], on=['ID_ech', 'Date_ouv'], 
                              crit_2nd_col='Date_ouv', dist_max=1)

Conflict values present. Please resolve this manually !


In [52]:
# Each 'Check_col' entry is a ", "-separated list of the column names in conflict for that row.
flagged = [col for entry in conflict_df.Check_col for col in entry.split(", ")]

# De-duplicate while keeping first-seen order. dict.fromkeys replaces pd.unique(), whose use on a
# plain Python list is deprecated in recent pandas versions.
vlist = list(dict.fromkeys(flagged))
print(vlist)

['Description', 'Pol_top', 'Pol_base', 'Intensite', 'ID', 'ID_date', 'Long_for']


In [53]:
# Resolve the conflicts for all conflict rows: every flagged column takes its '_y' variant, except
# 'Description', which takes the '_x' variant ('_x'/'_y' are presumably the left/right frames of the
# merge — confirm in utils.io).
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={**{i+'_y':list(conflict_df.index) for i in vlist if i != 'Description'},
                           **{'Description_x':list(conflict_df.index)}})

all conflicts have been fixed!


In [61]:
# NOTE(review): this cell carries execution count 61 although it sits between cells 53 and 54, so the
# output shown below was produced out of order — re-run the notebook top to bottom before trusting it.
dataframe_viewer(mdf, rows=3, un_val='ID', view=t)

Rows : 533, columns : 131, Unique values on col 'ID': 47


interactive(children=(IntSlider(value=3, description='rows', max=533, min=3, readout=False), IntSlider(value=1…

## Filtering

In [54]:
# Keep only rows that have a sample ID, an X coordinate and a description:
# `col == col` is False for NaN, so each self-comparison drops the missing values.
described = mdf.query('ID_ech==ID_ech and X==X and Description==Description')

# Keep sample IDs containing at least one digit. The pattern is written as a raw string because
# "\d" in a normal string literal is an invalid escape sequence that newer Python versions warn about.
has_digit = described['ID_ech'].str.contains(r'\d')
mdf = described[has_digit].reset_index(drop=True)

In [55]:
# Coalesce the three cyanide column variants into 'CN_libre'. Priority for each row:
# 'cyanure (libre)', then 'Cyanure (libre)', then the existing 'CN_libre' value.
# pop() removes the two variant columns from mdf, so this cell cannot be re-run on its own.
cn_lower = mdf.pop('cyanure (libre)')
cn_upper = mdf.pop('Cyanure (libre)')
mdf['CN_libre'] = cn_lower.fillna(cn_upper).fillna(mdf['CN_libre'])

In [56]:
# Select and order the columns kept in the final table; any column not listed is dropped, and a
# listed name missing from mdf raises a KeyError.
# NOTE(review): near-duplicate names are kept side by side ('Methode'/'Method', 'Intensite'/'Intensité',
# 'Crep_long'/'Long_crep') — presumably the same quantity under different source spellings; confirm and
# coalesce them if so. 'col_34' looks like an unnamed source column — confirm its content.
mdf = mdf[['ID_ech', 'X', 'Y', 'Z', 'Date_ouv', 'Date_prv', 'Ech_top', 'Ech_base', 'Type_ech', 'Nature_ech', 
           'Description', 'Litho_top', 'Litho_base', 'Long_pz', 'Long_for', 'Profondeur', 'Long_pz-sol',
           'Diam_for', 'Diam_int_pz', 'Diam_ext_pz', 'Type', 'Refus','Type_refus', 'Resp_chantier', 'Societe', 
           'Methode', 'Method', 'ID', 'haut_pz-sol',  'Niv_eau_sol', 'Nappe', 'Crep_long', 'Long_crep', 
           'Polluant', 'Pol_top', 'Pol_base', 'Intensite', 'Intensité', 
           '1,2-DCE', 'Acenaphtyl', 'P-M-Xyl', 'Chloroforme', 'Fract_C21C35', 'Bnz(a)pyr', 'Arom_C7C8', 'EOX', 
           '1,1-DCE', '1,1-DCEn','Bnz(b)flranth', 'Cr_VI', 'CN_libre', 'Bnz(ghi)peryl', 'Ca', 'Bnz(k)flranth', 
           'TCEyn', 'NH4', 'Cis-1,2-DCEn', 'Flranth', 'Na',  '1,1,2-TCE', 'CN_comp', 'HAP_tot_EPA', 'CVinyl', 
            'Mg',  'Cobalt', 'N_Kjdl', 'HC_tot_C10C35', 'Phenol', 'Arom_C8C10', 'Fe', 
           'chlorures', 'K', 'Arom_C6C7', 'aluminium', 'P_tot', 'Cr', 'TetraCEyn', 'thioCN', 'Fract_2',
           'Trans 1,2-DCEyl', 'Pyr', 'Organo', 'Anthrc', 'Fract_C16C21', 'DCM', 'Ni', 
           '(cis,trans) 1,2-DCE_tot', 'Naphta', 'EthylBnz', 'MS', 'Aliphat_C8C10', 'Chrys', 
           'Indeno(1,2,3-cd)pyr', 'Hg', 'Bnz', 'Toln', 'Zone', 'sulfate',  'CN_APE', 'HC_tot_C10C40', 
           'Fract_2+', 'Aliphat_C6C8', 'Zn', 'Fract_C5C8', 'Aliphat_C5C6', 
           'Bnz(a)anthrc', 'O-Xyl', 'Fluorene', 'Fract_C10C12', 'Xyl', '1,1,1-TCE', 'Cd', 'Idc_Phenol', 'As', 
           'Fract_C35C40', 'Fract_C12C16', 'Dibnz(ah)anthrc', 'MTBE', 'Phenanthr', 'TCM', 'ID_2', 'Pb', 
           'Acenaphtn', 'col_34', 'phosphore', '1,2-DCP', 'Styr', 'CN_tot', 'Cu', 'Fract_C8C10', 'Mn', ]]

In [57]:
# One row per sample ID: among rows sharing the same 'ID_ech', drop_duplicates keeps the first one
# (its default) and discards the rest.
uniq_mdf = mdf.drop_duplicates(['ID_ech'])

In [58]:
dataframe_viewer(uniq_mdf, rows=5, un_val='ID_ech', view=t)

Rows : 112, columns : 131, Unique values on col 'ID_ech': 112


interactive(children=(IntSlider(value=5, description='rows', max=112, min=5, readout=False), IntSlider(value=1…

In [59]:
# Work on a copy so the de-duplicated frame `uniq_mdf` stays untouched.
dataset = uniq_mdf.copy()
if 'level_0' in dataset.columns:
    # 'level_0' becomes the 'index' column, replacing any existing 'index' column
    # (errors='ignore' makes the drop a no-op when there is none).
    dataset = dataset.drop(columns='index', errors='ignore').rename(columns={'level_0': 'index'})

####  $\color{red}{\textbf{Save final object dataset}}$

In [60]:
save_file = 'Final_dataset.csv'

# exist_ok=True creates the output folder only when it is missing, replacing the separate
# exists()/makedirs() pair (which can fail if the folder appears between the two calls).
os.makedirs(save_dir, exist_ok=True)

# Write the final one-row-per-sample table without the DataFrame index
dataset.to_csv(os.path.join(save_dir, save_file), index=False)

## Querying