# Final merge of Memoris data

In [None]:
from utils.io import dataframe_viewer, files_search, data_merger, data_validation, data_overview, \
data_filter, fix_duplicates, gen_id_from_ech

import re, os
import numpy as np
import pandas as pd
import datetime as dtm
from definitions import ROOT_DIR

In [None]:
# Suffixes / prefixes handed to gen_id_from_ech() further down (suffixes=, prefixes=).
# The last suffix is a regular-expression fragment, so it is written as a raw string:
# in a normal string '\d', '\(' and '\*' are invalid escape sequences and raise a
# DeprecationWarning / SyntaxWarning on recent Python versions.
sufx = ['sup', 'prof', 'inf', r'/\dM(\*)?']
prefx = ['eau forage ']
# Capture pattern (named group 'id') handed to gen_id_from_ech() as capture_regex=.
# Raw string for the same reason as above; the pattern text itself is unchanged.
id_reg = r'\s*(?P<id>(?:^canne |Piezair )*\w*\d+\w*)\s*'

In [None]:
def create_df(files, verbose=True): # find another name for this function
    """
    Load each CSV file of ``files`` into a pandas DataFrame.

    files: iterable of paths to comma-delimited files
    verbose: when True, print one line per file telling whether the loaded
             frame has an 'X' column (reported as "Coordinates") or not

    Returns the list of DataFrames, in the same order as ``files``.
    """
    frames = []
    for rank, path in enumerate(files, start=1):
        frame = pd.read_csv(path, delimiter=',')
        frames.append(frame)

        if not verbose:
            continue

        # An 'X' column is taken as the sign that position data is present
        has_x = 'X' in list(frame.columns)
        msg = ' --> Coordinates' if has_x else ' --> No coordinates'
        print(f"df{rank} : {msg}")

    return frames

## Collecting files

In [None]:
# work_dir: folder the per-source 'source_merge' CSV files are read from.
# save_dir: folder the merged tables are written to.
# Both end with '/' because later cells build file paths by plain string concatenation.
work_dir = ROOT_DIR+'/CF_data/Result_traitem/'
save_dir = ROOT_DIR+'/CF_data/Fusion_finale/'

In [None]:
# Dictionary handed to files_search(): one entry per data category.
# The key names matter: later cells look entries up with exactly these keys
# (e.g. key='Borehole', then files_dict[key]).
# NOTE(review): the initial 0 values are presumably replaced by files_search() with
# collections of file paths (len() is called on the entries later) — confirm.
files_dict={'Borehole':0,'Litho':0,'Equipm':0,'Measure':0,'Sample':0,'Unknow':0}

In [None]:
files_search(work_dir, files_dict, prefix='source', details=False)

In [None]:
# Join types passed to data_merger(how=...); every merge below uses how[1] ('outer').
how=['inner', 'outer', 'left', 'right']
# Forwarded as view= to dataframe_viewer() in the read cells.
view = False

# ================== PROCESSING  ===================== 

# Boreholes

In [None]:
key='Borehole'
dataset = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

In [None]:
data_overview(files_dict[key])

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'Memoris_seafile/source_merge/source_Boreholes.csv' # 1
file2= work_dir + 'Phase_1_Memoris/source_merge/source_Boreholes.csv' # 2

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID'], dist_max=1)

In [None]:
dataset = mdf.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'Phase_2_Memoris/source_merge/source_Boreholes.csv' # 3
file2= work_dir + 'Prof_contact_sol_forage/source_merge/source_Boreholes.csv' # 4

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how=how[1], on=['ID'], dist_max=1)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'database_Memoris3/source_merge/source_Boreholes.csv' # 8
file2= work_dir + 'donnees_terrain_2019/source_merge/source_Boreholes.csv' # 9

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how=how[1], on=['ID'], dist_max=1)

In [None]:
# Settle the conflicts reported by the last merge: each listed column is associated
# with every row of conflict_df.
# NOTE(review): presumably the '_x' / '_y' suffix selects which side's value is kept —
# confirm against data_validation().
data_validation(
    overall_data=dataset,
    conflict_data=conflict_df,
    index_col='index',
    pass_col='ID',
    valid_dict={name: list(conflict_df.index) for name in ('Type_y', 'Long_for_y')},
)

# When a 'level_0' column is present, it becomes the new 'index' column
# (any existing 'index' column is discarded first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'profils_sols_donnees_forages/source_merge/source_Boreholes.csv' # 11
file2= work_dir + 'vUmons_logsFor/source_merge/source_Boreholes.csv' # 13

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how=how[1], on=['ID'], dist_max=1)

In [None]:
# Settle the conflicts reported by the last merge: each listed column is associated
# with every row of conflict_df.
# NOTE(review): presumably the '_x' / '_y' suffix selects which side's value is kept —
# confirm against data_validation().
data_validation(
    overall_data=dataset,
    conflict_data=conflict_df,
    index_col='index',
    pass_col='ID',
    valid_dict={
        name: list(conflict_df.index)
        for name in ('Date_for_y', 'Type_y', 'Long_for_y')
    },
)

# When a 'level_0' column is present, it becomes the new 'index' column
# (any existing 'index' column is discarded first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'Forage_Pilote/source_merge/source_Boreholes.csv' # 0
file2= work_dir + 'Siterem_Ext_Pilote/source_merge/source_Boreholes.csv' # 5

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df2['ID'] = df2['ID'].astype('object')

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how=how[1], on=['ID'], dist_max=1)

In [None]:
# Settle the conflicts reported by the last merge: each listed column is associated
# with every row of conflict_df.
# NOTE(review): presumably the '_x' / '_y' suffix selects which side's value is kept —
# confirm against data_validation().
data_validation(
    overall_data=dataset,
    conflict_data=conflict_df,
    index_col='index',
    pass_col='ID',
    valid_dict={
        name: list(conflict_df.index)
        for name in ('Z_x', 'Type_x', 'Long_for_x')
    },
)

# When a 'level_0' column is present, it becomes the new 'index' column
# (any existing 'index' column is discarded first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'Siterem_Pilote/source_merge/source_Boreholes.csv' # 6
file2= work_dir + 'Siterem_Result_Sol/source_merge/source_Boreholes.csv' # 7

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how=how[1], on=['ID'], dist_max=1)

In [None]:
# Settle the conflicts reported by the last merge: each listed column is associated
# with every row of conflict_df.
# NOTE(review): presumably the '_x' / '_y' suffix selects which side's value is kept —
# confirm against data_validation().
data_validation(
    overall_data=dataset,
    conflict_data=conflict_df,
    index_col='index',
    pass_col='ID',
    valid_dict={
        name: list(conflict_df.index)
        for name in ('Long_pz_x', 'Type_x', 'Long_for_x')
    },
)

# When a 'level_0' column is present, it becomes the new 'index' column
# (any existing 'index' column is discarded first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'observ_terrain/source_merge/source_Boreholes.csv' # 10
file2= work_dir + 'result_sol_ext_pilote/source_merge/source_Boreholes.csv' # 12

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how=how[1], on=['ID'], dist_max=1)

In [None]:
# Settle the conflicts reported by the last merge. Conflict row 18 is associated with
# 'Long_pz_sol_x', every other conflict row with 'Long_pz_sol_y'.
# NOTE(review): the literal 18 is tied to the conflict_df produced by this particular
# run of the merge — re-check it whenever the input files change.
data_validation(
    overall_data=dataset,
    conflict_data=conflict_df,
    index_col='index',
    pass_col='ID',
    valid_dict={
        'Long_pz_sol_x': [18],
        'Long_pz_sol_y': [row for row in conflict_df.index if row != 18],
        'Rmq_y': list(conflict_df.index),
        'Type_x': list(conflict_df.index),
    },
)

# When a 'level_0' column is present, it becomes the new 'index' column
# (any existing 'index' column is discarded first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

####  $\color{red}{\textbf{Save dataset}}$

In [None]:
all_bh = dataset.copy()
dataframe_viewer(dataset, rows=3, un_val=['ID','ID_ech'])

In [None]:
# Create the output folder on demand. exist_ok=True replaces the separate
# os.path.exists() check, which left a gap between the check and the creation,
# and keeps the cell safe to re-run.
os.makedirs(save_dir, exist_ok=True)

# Write the merged borehole table; the pandas row index is not exported.
dataset.to_csv(save_dir + 'Boreholes.csv', index=False)

###  ====================================================

# Lithologies

In [None]:
key='Litho'
dataset = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

In [None]:
data_overview(files_dict[key])

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'database_Memoris3/source_merge/source_Lithologies.csv' # 0
file2= work_dir + 'profils_sols_donnees_forages/source_merge/source_Lithologies.csv' # 2

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID', 'Litho_top', 'Litho_base'], dist_max=1)

In [None]:
dataset = mdf.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'donnees_terrain_2019/source_merge/source_Lithologies.csv' # 1
file2= work_dir + 'vUmons_logsFor/source_merge/source_Lithologies.csv' # 3

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID', 'Litho_top', 'Litho_base'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how=how[1], on=['ID', 'Litho_top', 'Litho_base'], dist_max=1)

####  $\color{red}{\textbf{Save dataset}}$

In [None]:
all_litho = dataset.copy()
dataframe_viewer(dataset, rows=3, un_val=['ID','ID_ech'])

In [None]:
# Create the output folder on demand. exist_ok=True replaces the separate
# os.path.exists() check, which left a gap between the check and the creation,
# and keeps the cell safe to re-run.
os.makedirs(save_dir, exist_ok=True)

# Write the merged lithology table; the pandas row index is not exported.
dataset.to_csv(save_dir + 'Lithologies.csv', index=False)

###  ====================================================

# Samples

In [None]:
key='Sample'
dataset = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

In [None]:
data_overview(files_dict[key])

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'Memoris_seafile/source_merge/source_Samples.csv' # 2
file2= work_dir + 'Liste_XY/source_merge/source_Samples.csv' # 1

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID', 'ID_ech'], dist_max=1)

In [None]:
dataset = mdf.copy()

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir +  'Phase_1_Memoris/source_merge/source_Samples.csv' # 3
file2= work_dir + 'Phase_2_Memoris/source_merge/source_Samples.csv' # 4

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
# Diagnostic: for each frame (0 = df1, 1 = df2) print the columns whose name
# contains 'Date' or 'ID', ignoring case.
df_list = [df1, df2]
look_for = ['Date', 'ID']
for pos, frame in enumerate(df_list):
    for pattern in look_for:
        hits = [name for name in frame.columns if re.search(pattern, name, re.I)]
        for name in hits:
            print(pos, name)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID', 'ID_ech', 'Date_ech'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how=how[1], on=['ID', 'ID_ech', 'Date_ech'], dist_max=1)

In [None]:
# Settle the conflicts reported by the last merge: each listed column is associated
# with every row of conflict_df.
# NOTE(review): presumably the '_x' / '_y' suffix selects which side's value is kept —
# confirm against data_validation().
data_validation(
    overall_data=dataset,
    conflict_data=conflict_df,
    index_col='index',
    pass_col='ID',
    valid_dict={
        name: list(conflict_df.index)
        for name in (
            'C16-C21_y',
            'C21-C35_y',
            'C12-C16_y',
            'Fract_2+_y',
            'C10-C12_y',
            'HC_tot_C10-C35_y',
            'Fract_2_y',
        )
    },
)

# When a 'level_0' column is present, it becomes the new 'index' column
# (any existing 'index' column is discarded first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir +  'database_Memoris3/source_merge/source_Samples.csv' # 8
file2= work_dir +  'profils_sols_donnees_forages/source_merge/source_Samples.csv' # 10

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
# Diagnostic: for each frame (0 = df1, 1 = df2) print the columns whose name
# contains 'Date' or 'ID', ignoring case.
df_list = [df1, df2]
look_for = ['Date', 'ID']
for pos, frame in enumerate(df_list):
    for pattern in look_for:
        hits = [name for name in frame.columns if re.search(pattern, name, re.I)]
        for name in hits:
            print(pos, name)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID', 'ID_ech'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how=how[1], on=['ID', 'ID_ech', 'Date_ech'], dist_max=1)

In [None]:
# Settle the conflicts reported by the last merge: 'Nappe_x' is associated with
# every row of conflict_df.
# NOTE(review): presumably the '_x' suffix means the left-hand value is kept —
# confirm against data_validation().
conflict_rows = {'Nappe_x': list(conflict_df.index)}
data_validation(
    overall_data=dataset,
    conflict_data=conflict_df,
    index_col='index',
    pass_col='ID',
    valid_dict=conflict_rows,
)

# When a 'level_0' column is present, it becomes the new 'index' column
# (any existing 'index' column is discarded first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir +  'Container_phyto/source_merge/source_Samples.csv' # 0
file2= work_dir +  'vUmons_logsFor/source_merge/source_Samples.csv' # 12

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
# Diagnostic: for each frame (0 = df1, 1 = df2) print the columns whose name
# contains 'Date' or 'ID', ignoring case.
df_list = [df1, df2]
look_for = ['Date', 'ID']
for pos, frame in enumerate(df_list):
    for pattern in look_for:
        hits = [name for name in frame.columns if re.search(pattern, name, re.I)]
        for name in hits:
            print(pos, name)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID_ech', 'Date_ech'], dist_max=1)

In [None]:
mdf = gen_id_from_ech(mdf, id_ech_col='ID_ech', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
# Rows whose sample identifier contains 'ech' (any case) get an ID rebuilt from it:
# 'F_' followed by the identifier with all spaces and dots removed.
data = mdf
col = 'ID_ech'
for i in data.index:
    v = data.loc[i, col]
    # isinstance() skips missing values as well as non-text values that
    # re.search() cannot handle
    if isinstance(v, str) and re.search('ech', v, re.I):
        # The dot has to be matched literally, and re.sub()'s 4th positional
        # argument is `count`, not `flags`: the previous call
        # re.sub(' |.', '', v, re.I) deleted only the first two characters of v.
        data.loc[i, 'ID'] = 'F_' + re.sub(r'[ .]', '', v)
mdf = data.copy()

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how=how[1], on=['ID', 'ID_ech', 'Date_ech'], dist_max=1)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir +  'Siterem_Ext_Pilote/source_merge/source_Samples.csv' # 5
file2= work_dir +  'Siterem_Pilote/source_merge/source_Samples.csv' # 6

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
# Diagnostic: for each frame (0 = df1, 1 = df2) print the columns whose name
# contains 'Date' or 'ID', ignoring case.
df_list = [df1, df2]
look_for = ['Date', 'ID']
for pos, frame in enumerate(df_list):
    for pattern in look_for:
        hits = [name for name in frame.columns if re.search(pattern, name, re.I)]
        for name in hits:
            print(pos, name)

In [None]:
df1['ID_ech'] = df1['ID_ech'].astype('object')

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID_ech', 'Date_ech'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how=how[1], on=['ID', 'ID_ech', 'Date_ech'], dist_max=1)

In [None]:
# Settle the conflicts reported by the last merge: each listed column is associated
# with every row of conflict_df.
# NOTE(review): presumably the '_x' / '_y' suffix selects which side's value is kept —
# confirm against data_validation().
data_validation(
    overall_data=dataset,
    conflict_data=conflict_df,
    index_col='index',
    pass_col='ID',
    valid_dict={
        name: list(conflict_df.index)
        for name in (
            'C16-C21_x',
            'Type_ech_x',
            'C12-C16_x',
            'C10-C12_x',
            'HC_tot_C10-C35_x',
            'C21-C35_x',
        )
    },
)

# When a 'level_0' column is present, it becomes the new 'index' column
# (any existing 'index' column is discarded first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir +  'Siterem_Result_Sol/source_merge/source_Samples.csv' # 7
file2= work_dir +  'donnees_terrain_2019/source_merge/source_Samples.csv' # 9

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df2['ID'] = df2['ID'].astype('object')

In [None]:
# Diagnostic: for each frame (0 = df1, 1 = df2) print the columns whose name
# contains 'Date' or 'ID', ignoring case.
df_list = [df1, df2]
look_for = ['Date', 'ID']
for pos, frame in enumerate(df_list):
    for pattern in look_for:
        hits = [name for name in frame.columns if re.search(pattern, name, re.I)]
        for name in hits:
            print(pos, name)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on=['ID', 'ID_ech'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how=how[1], on=['ID', 'ID_ech'], dist_max=1)

In [None]:
# Settle the conflicts reported by the last merge: 'Nappe_x' is associated with
# every row of conflict_df.
# NOTE(review): presumably the '_x' suffix means the left-hand value is kept —
# confirm against data_validation().
conflict_rows = {'Nappe_x': list(conflict_df.index)}
data_validation(
    overall_data=dataset,
    conflict_data=conflict_df,
    index_col='index',
    pass_col='ID',
    valid_dict=conflict_rows,
)

# When a 'level_0' column is present, it becomes the new 'index' column
# (any existing 'index' column is discarded first).
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

####  $\color{red}{\textbf{Save dataset}}$

In [None]:
all_samp = dataset.copy()
dataframe_viewer(dataset, rows=3, un_val=['ID','ID_ech'])

In [None]:
# Create the output folder on demand. exist_ok=True replaces the separate
# os.path.exists() check, which left a gap between the check and the creation,
# and keeps the cell safe to re-run.
os.makedirs(save_dir, exist_ok=True)

# Write the merged sample table; the pandas row index is not exported.
dataset.to_csv(save_dir + 'Samples.csv', index=False)

###  ====================================================

###  ====================================================

# Measures

In [None]:
key='Measure'
dataset = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

###  ====================================================

# Equipments

In [None]:
# files_dict is declared with the key 'Equipm'; 'Equip' is not one of its declared
# keys, so the lookup below has to use the exact declared name.
key='Equipm'
dataset = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

###  ====================================================

# Unknowns

In [None]:
key='Unknow'
dataset = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

### ================= DATASETS MERGING WITH BOREHOLES ===========================

## Querying