# FUSION FINALE DES JEUX DE DONNÉES

In [None]:
from utils.io import dataframe_viewer, files_search, data_merger, data_validation, data_overview, \
gen_id_from_ech, na_line_drop

import re, os
import pandas as pd
from definitions import ROOT_DIR

In [None]:
def create_df(files, check_position=True, verbose=True): # find another name for this function
    """
    Read each CSV file into a DataFrame and normalise its ID and coordinate columns.

    files: list of CSV file names (comma-delimited), read in the given order
    check_position: when True, the 'X', 'Y' and 'Z' columns that exist are cast to
        float64 after replacing decimal commas by dots in string values
    verbose: when True, print for each frame whether it has an 'X' column

    Returns the list of DataFrames, one per file, in input order. Each frame is
    passed through na_line_drop(..., line_non_na=1) before being returned.
    """
    frames = []
    for rank, path in enumerate(files, start=1):
        frame = pd.read_csv(path, delimiter=',')

        # identifiers are stored as text; missing values are left untouched
        for name in ('ID', 'ID_ech'):
            if name in frame.columns:
                frame[name] = frame[name].apply(lambda v: v if pd.isnull(v) else str(v))

        if check_position:
            for axis in ('X', 'Y', 'Z'):
                if axis not in frame.columns:
                    continue
                # decimal commas -> dots so that the float conversion succeeds
                frame[axis] = frame[axis].apply(
                    lambda v: v.replace(',', '.') if isinstance(v, str) else v)
                frame[axis] = frame[axis].astype('float64')

        frame = na_line_drop(frame, line_non_na=1)
        frames.append(frame)

        if verbose:
            msg = ' --> Coordinates' if 'X' in list(frame.columns) else ' --> No coordinates'
            print(f"df{rank} : {msg}")

    return frames

In [None]:
csv_data_dir = ROOT_DIR + '/CF_data/Result_traitem/organisation/'
save_dir = csv_data_dir + '../fusion_finale/'

In [None]:
# Start from an empty output directory: remove any previous content, then recreate it.
# Done through the standard library rather than shell strings so that paths containing
# spaces or shell metacharacters are handled correctly and the cell is portable.
import shutil

shutil.rmtree(save_dir, ignore_errors=True)  # like `rm -rf`: no error if it does not exist
os.makedirs(save_dir, exist_ok=True)

In [None]:
# Values passed further down to gen_id_from_ech as `suffixes`, `prefixes` and
# `capture_regex`. Raw strings keep the regex backslashes literal and avoid
# invalid-escape-sequence warnings; the resulting values are unchanged.
sufx = ['sup', 'prof', 'inf', r'/\dM(\*)?']
prefx = ['eau forage ']
id_reg = r'\s*(?P<id>(?:^canne |Piezair )*\w*\d+\w*)\s*'

# Collecte des fichiers

In [None]:
# Two independent dictionaries sharing the same category keys (key names matter:
# they are used to index both dictionaries later on). Every entry starts at 0.
files_dict = {category: 0 for category in
              ('Borehole', 'Litho', 'Equipm', 'Measure', 'Sample', 'Unknow')}
data_dict = dict(files_dict)

In [None]:
files_search(csv_data_dir, files_dict, prefix='source', details=False)

In [None]:
how=['inner', 'outer', 'left', 'right']
view = False

## Forages

In [None]:
key='Borehole'
dataset = pd.DataFrame()
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

In [None]:
data_overview(files_dict[key])

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'Memoris_seafile/source_merge/source_Boreholes.csv' # 1
file2= csv_data_dir + 'Phase_1_Memoris/source_merge/source_Boreholes.csv' # 2

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID'], dist_max=1)

In [None]:
dataset = mdf.copy()

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'Phase_2_Memoris/source_merge/source_Boreholes.csv' # 3
file2= csv_data_dir + 'Prof_contact_sol_forage/source_merge/source_Boreholes.csv' # 4

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID'], dist_max=1)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'database_Memoris3/source_merge/source_Boreholes.csv' # 8
file2= csv_data_dir + 'donnees_terrain_2019/source_merge/source_Boreholes.csv' # 9

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID'], dist_max=1)

In [None]:
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',  
                valid_dict={'Type_y':list(conflict_df.index), 'Long_for_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'profils_sols_donnees_forages/source_merge/source_Boreholes.csv' # 11
file2= csv_data_dir + 'vUmons_logsFor/source_merge/source_Boreholes.csv' # 13

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df1['Type_refus'] = df1['Refus'].apply(lambda x: x if not pd.isnull(x) else x)
df1['Refus'] = df1['Refus'].apply(lambda x: 'x' if not pd.isnull(x) else x)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID'], dist_max=1)

In [None]:
data = mdf
data_validation(overall_data=data, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Type_y':list(conflict_df.index), 'Long_for_y':list(conflict_df.index),
                           'Long_pz_sol_y':list(conflict_df.index)})

if 'level_0' in data.columns:
    if 'index' in data.columns:
        data.drop(columns='index', inplace=True)
    data.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID'], dist_max=1)

In [None]:
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',  
                valid_dict={'Date_for_y':list(conflict_df.index), 'Type_y':list(conflict_df.index), 
                            'Long_pz_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'Forage_Pilote/source_merge/source_Boreholes.csv' # 0
file2= csv_data_dir + 'Siterem_Ext_Pilote/source_merge/source_Boreholes.csv' # 5

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df2['ID'] = df2['ID'].astype('object')

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID'], dist_max=1)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'Siterem_Pilote/source_merge/source_Boreholes.csv' # 6
file2= csv_data_dir + 'Siterem_Result_Sol/source_merge/source_Boreholes.csv' # 7

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID'], dist_max=1)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'observ_terrain/source_merge/source_Boreholes.csv' # 10
file2= csv_data_dir + 'result_sol_ext_pilote/source_merge/source_Boreholes.csv' # 12

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID'], dist_max=1)

In [None]:
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',  
                valid_dict={'Long_pz_sol_y':list(conflict_df.index), 'Rmq_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'coord_ext_pilote/source_merge/source_Boreholes.csv' # 

df1 = create_df([file1])[0]
dataframe_viewer(df1, rows=3, un_val='ID', view=view)

In [None]:
data = df1
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['X','Y'], dist_max=1)

In [None]:
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',  
                valid_dict={'ID_x':[569, 570], 'ID_y':[565], 'Date_for_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
print(f'Dataset rows: {len(dataset)}')

####  $\color{red}{\textbf{Sauvegarde du jeu de données}}$

In [None]:
data_dict['Borehole'] = dataset.copy()
dataframe_viewer(dataset, rows=3, un_val=['ID','ID_ech'])

In [None]:
dataset.to_csv(save_dir + 'Boreholes.csv', index=False)

###  ====================================================

## Lithologies

In [None]:
key='Litho'
dataset = pd.DataFrame()
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

In [None]:
data_overview(files_dict[key])

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'database_Memoris3/source_merge/source_Lithologies.csv' # 0
file2= csv_data_dir + 'profils_sols_donnees_forages/source_merge/source_Lithologies.csv' # 2

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID', 'Litho_top', 'Litho_base'], dist_max=1)

In [None]:
dataset = mdf.copy()

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'donnees_terrain_2019/source_merge/source_Lithologies.csv' # 1
file2= csv_data_dir + 'vUmons_logsFor/source_merge/source_Lithologies.csv' # 3

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df1.drop(index = df1.query('Litho_top.str.contains("De")', engine='python').index, inplace=True)

In [None]:
dt_list = [df1, df2]
for n, dt in enumerate(dt_list):
    for w in ['_top', '_base']:
        for c in dt.columns:
            if re.search(w, c, flags=re.I):
                print(n, c)
                dt[c] = dt[c].astype('float') 

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID', 'Litho_top', 'Litho_base'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID', 'Litho_top', 'Litho_base'], dist_max=1)

In [None]:
print(f'Dataset rows: {len(dataset)}')

####  $\color{red}{\textbf{Sauvegarde du jeu de données}}$

In [None]:
data_dict['Litho'] = dataset.copy()
dataframe_viewer(dataset, rows=3, un_val=['ID','ID_ech'])

In [None]:
dataset.to_csv(save_dir + 'Lithologies.csv', index=False)

###  ====================================================

## Échantillons

In [None]:
key='Sample'
dataset = pd.DataFrame()
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

In [None]:
data_overview(files_dict[key])

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'Memoris_seafile/source_merge/source_Samples.csv' # 2
file2= csv_data_dir + 'Liste_XY/source_merge/source_Samples.csv' # 1

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID', 'ID_ech'], dist_max=1)

In [None]:
dataset = mdf.copy()

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir +  'Phase_1_Memoris/source_merge/source_Samples.csv' # 3
file2= csv_data_dir + 'Phase_2_Memoris/source_merge/source_Samples.csv' # 4

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df_list = [df1, df2]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID', 'ID_ech', 'Date_ech'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID', 'ID_ech', 'Date_ech'], dist_max=1)

In [None]:
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',  
                valid_dict={'C16-C21_y':list(conflict_df.index), 'C21-C35_y':list(conflict_df.index), 
                            'C12-C16_y':list(conflict_df.index), 'Fract_2+_y':list(conflict_df.index), 
                            'C10-C12_y':list(conflict_df.index), 'HC_tot_C10-C35_y':list(conflict_df.index), 
                            'Fract_2_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir +  'database_Memoris3/source_merge/source_Samples.csv' # 8
file2= csv_data_dir +  'profils_sols_donnees_forages/source_merge/source_Samples.csv' # 10

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df_list = [df1, df2]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID', 'ID_ech'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID', 'ID_ech', 'Date_ech'], dist_max=1)

In [None]:
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',  
                valid_dict={'Nappe_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir +  'Container_phyto/source_merge/source_Samples.csv' # 0
file2= csv_data_dir +  'vUmons_logsFor/source_merge/source_Samples.csv' # 12

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df_list = [df1, df2]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1)

In [None]:
mdf = gen_id_from_ech(mdf, id_ech_col='ID_ech', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)

In [None]:
# Rows whose sample label contains 'ech' (case-insensitive) get an ID built from
# that label: 'F_' followed by the label with spaces and dots removed.
data = mdf
col = 'ID_ech'
for i in data.index:
    v = data.loc[i, col]
    if not pd.isnull(v) and re.search('ech', v, re.I):
        # The 4th positional argument of re.sub is `count`, not `flags`, and an
        # unescaped '.' matches any character: the previous call removed the first
        # two characters of the label. A character class strips exactly the spaces
        # and literal dots.
        data.loc[i, 'ID'] = 'F_' + re.sub(r'[ .]', '', v)
mdf = data.copy()

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID', 'ID_ech', 'Date_ech'], dist_max=1)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir +  'Siterem_Ext_Pilote/source_merge/source_Samples.csv' # 5
file2= csv_data_dir +  'Siterem_Pilote/source_merge/source_Samples.csv' # 6

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df_list = [df1, df2]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
df1['ID_ech'] = df1['ID_ech'].astype('object')

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID_ech', 'Date_ech'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID', 'ID_ech', 'Date_ech'], dist_max=1)

In [None]:
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',  
                valid_dict={'C16-C21_x':list(conflict_df.index), 'C12-C16_x':list(conflict_df.index), 
                            'C10-C12_x':list(conflict_df.index), 'HC_tot_C10-C35_x':list(conflict_df.index), 
                            'C21-C35_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir +  'Siterem_Result_Sol/source_merge/source_Samples.csv' # 7
file2= csv_data_dir +  'donnees_terrain_2019/source_merge/source_Samples.csv' # 9

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df2['ID'] = df2['ID'].astype('object')

In [None]:
df_list = [df1, df2]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID', 'ID_ech'], dist_max=1)

In [None]:
# Resolve the conflicts of the df1/df2 merge on its own result (mdf).
# conflict_df was returned by that merge, so the conflicting '_y' columns and the
# conflict index belong to mdf, not to the accumulated dataset (which is only merged
# with mdf in the next step).
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index',
                valid_dict={'Ech_top_y':list(conflict_df.index), 'Ech_base_y':list(conflict_df.index)})

# keep a single 'index' column: 'level_0', when present, replaces the old 'index'
if 'level_0' in mdf.columns:
    if 'index' in mdf.columns:
        mdf.drop(columns='index', inplace=True)
    mdf.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID', 'ID_ech', 'Date_ech'], dist_max=1)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir +  'result_sol_ext_pilote/source_merge/source_Samples.csv' # 11

df1 = create_df([file1])[0]
dataframe_viewer(df1, rows=3, un_val='ID', view=view)

In [None]:
df_list = [df1]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
data = df1
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID', 'ID_ech', 'Date_ech'], dist_max=1)

In [None]:
print(f'Dataset rows: {len(dataset)}')

####  $\color{red}{\textbf{Sauvegarde du jeu de données}}$

In [None]:
data_dict['Sample'] = dataset.copy()
dataframe_viewer(dataset, rows=3, un_val=['ID','ID_ech'])

In [None]:
dataset.to_csv(save_dir + 'Samples.csv', index=False)

###  ====================================================

## Objets inconnus

In [None]:
key='Unknow'
dataset = pd.DataFrame()
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir +  'database_Memoris3/source_merge/source_Unknown.csv' # 11

df1 = create_df([file1])[0]
dataframe_viewer(df1, rows=3, un_val='ID', view=view)

In [None]:
dataset= df1

In [None]:
print(f'Dataset rows: {len(dataset)}')

####  $\color{red}{\textbf{Sauvegarde du jeu de données}}$

In [None]:
data_dict['Unknow'] = dataset.copy()
dataframe_viewer(dataset, rows=3, un_val=['ID','ID_ech'])

In [None]:
dataset.to_csv(save_dir + 'Unknown.csv', index=False)

###  ====================================================

## Equipements

In [None]:
key='Equipm'
dataset = pd.DataFrame()
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

In [None]:
data_overview(files_dict[key])

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'Memoris_seafile/source_merge/source_Equipments.csv' # 0
file2= csv_data_dir + 'Phase_1_Memoris/source_merge/source_Equipments.csv' # 1

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID'], dist_max=1)

In [None]:
dataset = mdf.copy()

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
# Second pair of equipment files. The second path must be bound to file2: assigning
# file1 twice dropped the Phase_2 file and silently reused file2 from the previous cell.
file1= csv_data_dir + 'Phase_2_Memoris/source_merge/source_Equipments.csv' # 2
file2= csv_data_dir + 'profils_sols_donnees_forages/source_merge/source_Equipments.csv' # 4

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df_list = [df1, df2]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID','Type_equip'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID', 'Type_equip'], dist_max=1)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'donnees_terrain_2019/source_merge/source_Equipments.csv' # 3

df1 = create_df([file1])[0]
dataframe_viewer(df1, rows=3, un_val='ID', view=view)

In [None]:
df1['ID'] = df1['ID'].astype('object')
df1.rename(columns={'Legende':'Type_equip'}, inplace=True)

In [None]:
data = df1
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID', 'Type_equip'], dist_max=1)

In [None]:
print(f'Dataset rows: {len(dataset)}')

####  $\color{red}{\textbf{Sauvegarde du jeu de données}}$

In [None]:
data_dict['Equipm'] = dataset.copy()
dataframe_viewer(dataset, rows=3, un_val=['ID'])

In [None]:
dataset.to_csv(save_dir + 'Equipments.csv', index=False)

###  ====================================================

## Mesures

In [None]:
key='Measure'
dataset = pd.DataFrame()
print(len(files_dict[key]), 'files')

In [None]:
files_dict[key]

In [None]:
data_overview(files_dict[key])

In [None]:
# Capture pattern for the measures section: the default pattern
# r'\s*(?P<id>(?:^canne |Piezair )*\w*\d+\w*)\s*' extended with the 'Drain ' and
# 'Moni ' prefixes. Written as a raw string so the regex backslashes stay literal.
id_reg = r'\s*(?P<id>(?:^canne |Piezair |Drain |Moni )*\w*\d+\w*)\s*'

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'Memoris_seafile/source_merge/source_Measures.csv' #1
file2= csv_data_dir + 'Phase_1_Memoris/source_merge/source_Measures.csv' # 2

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df_list = [df1, df2]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID','ID_ech','Date_mes'], dist_max=1)

In [None]:
dataset = mdf.copy()

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'Phase_2_Memoris/source_merge/source_Measures.csv' # 3
file2= csv_data_dir + 'database_Memoris3/source_merge/source_Measures.csv' # 7

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df_list = [df1, df2]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID','ID_ech','Date_mes'], dist_max=1)

In [None]:
datafr = mdf.copy() 
found = []
for i in datafr.index:
    if pd.isnull(datafr.loc[i, 'ID_ech']):
        found.append(i)
        datafr.loc[i, 'ID_ech'] = datafr.loc[i, 'ID']
if found: print(f"{len(found)} Nan found in 'ID_ech' and fixed")

In [None]:
for i in datafr.index:
    if re.search('FP 49 PROF', datafr.loc[i, 'ID_ech']):
        datafr.loc[i, 'ID_ech'] = 'FP49 PROF'
    elif re.search('FP 49 SUP', datafr.loc[i, 'ID_ech']):
        datafr.loc[i, 'ID_ech'] = 'FP49 SUP'

In [None]:
datafr = gen_id_from_ech(datafr, id_ech_col='ID_ech', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)#, verbose=True)
mdf = datafr.copy()

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID','ID_ech','Date_mes'], dist_max=1)

In [None]:
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',  
                valid_dict={'Fract_2_y':list(conflict_df.index), 'Fract_2+_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
# NOTE(review): file2 points to the same database_Memoris3 measures file that was
# already read and merged in an earlier step (tagged "# 7" there), while its tag here
# is "# 12". Possibly another source was intended (vUmons_logsFor carries the tag
# "# 12" in the samples section) -- confirm against files_dict['Measure'].
file1= csv_data_dir + 'profils_sols_donnees_forages/source_merge/source_Measures.csv' # 10
file2= csv_data_dir + 'database_Memoris3/source_merge/source_Measures.csv' # 12

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df_list = [df1, df2]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID','Date_mes'], dist_max=1)

In [None]:
datafr = mdf.copy() 
found = []
for i in datafr.index:
    if pd.isnull(datafr.loc[i, 'ID_ech']):
        found.append(i)
        datafr.loc[i, 'ID_ech'] = datafr.loc[i, 'ID']
if found: print(f"{len(found)} Nan found in 'ID_ech' and fixed")

In [None]:
datafr = gen_id_from_ech(datafr, id_ech_col='ID_ech', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)
mdf = datafr.copy()

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID','ID_ech','Date_mes'], dist_max=1)

In [None]:
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',  
                valid_dict={'MS_x':list(conflict_df.index), 'pH_CaCl2_x':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'Container_phyto/source_merge/source_Measures.csv' # 0
file2= csv_data_dir + 'Siterem_Ext_Pilote/source_merge/source_Measures.csv' # 4

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df_list = [df1, df2]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
df2['ID_ech'] = df2['ID_ech'].astype('object')

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID_ech','Date_mes'], dist_max=1)

In [None]:
datafr = mdf.copy() 
found = []
for i in datafr.index:
    if pd.isnull(datafr.loc[i, 'ID_ech']):
        found.append(i)
        datafr.loc[i, 'ID_ech'] = datafr.loc[i, 'ID']
if found: print(f"{len(found)} Nan found in 'ID_ech' and fixed")

In [None]:
#datafr = gen_id_from_ech(datafr, id_ech_col='ID_ech', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)
mdf = datafr.copy()

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID_ech','Date_mes'], dist_max=1)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'Siterem_Pilote/source_merge/source_Measures.csv' #5
file2= csv_data_dir + 'Siterem_Result_Sol/source_merge/source_Measures.csv' #6

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df_list = [df1, df2]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID','ID_ech','Date_mes'], dist_max=1)

In [None]:
datafr = mdf.copy() 
found = []
for i in datafr.index:
    if pd.isnull(datafr.loc[i, 'ID_ech']):
        found.append(i)
        datafr.loc[i, 'ID_ech'] = datafr.loc[i, 'ID']
if found: print(f"{len(found)} Nan found in 'ID_ech' and fixed")

In [None]:
datafr = gen_id_from_ech(datafr, id_ech_col='ID_ech', suffixes=sufx, prefixes=prefx, capture_regex=id_reg)
mdf = datafr.copy()

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID','ID_ech','Date_mes'], dist_max=1)

In [None]:
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',  
                valid_dict={'Fract_2_y':list(conflict_df.index), 'Fract_2+_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'donnees_terrain_2019/source_merge/source_Measures.csv' # 8
file2= csv_data_dir + 'observ_terrain/source_merge/source_Measures.csv' # 9

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=view), dataframe_viewer(df2, rows=3, un_val='ID', view=view)

In [None]:
df_list = [df1, df2]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how='outer', on=['ID','Date_mes'], dist_max=1)

In [None]:
data = mdf
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID','Date_mes'], dist_max=1)

In [None]:
data_validation(overall_data=dataset, conflict_data=conflict_df, index_col='index',  
                valid_dict={'CE_y':list(conflict_df.index), 'Temp_y':list(conflict_df.index),
                           'pH_y':list(conflict_df.index)})

if 'level_0' in dataset.columns:
    if 'index' in dataset.columns:
        dataset.drop(columns='index', inplace=True)
    dataset.rename(columns={'level_0':'index'}, inplace=True)

In [None]:
print(f'Dataset rows: {len(dataset)}')

#### $\color{green}{\textbf{Lecture et fusion}}$

In [None]:
file1= csv_data_dir + 'result_sol_ext_pilote/source_merge/source_Measures.csv' # 11

df1 = create_df([file1])[0]
dataframe_viewer(df1, rows=3, un_val='ID', view=view)

In [None]:
df_list = [df1]
look_for = ['Date', 'ID']
for i, df in enumerate(df_list):
    for l in look_for:
        for c in df.columns:
            if re.search(l, c, re.I): print(i, c)

In [None]:
data = df1
dataset, conflict_df=data_merger(dataset, data, how='outer', on=['ID','ID_ech','Date_mes'], dist_max=1)

In [None]:
print(f'Dataset rows: {len(dataset)}')

####  $\color{red}{\textbf{Sauvegarde du jeu de données}}$

In [None]:
data_dict['Measure'] = dataset.copy()
dataframe_viewer(dataset, rows=3, un_val=['ID','ID_ech'])

In [None]:
dataset.to_csv(save_dir + 'Measures.csv', index=False)