# Data Gathering

In [30]:
from utils.io import dataframe_viewer, files_search, data_merger, data_validation, data_overview, \
data_filter, fix_duplicates

import re, os
import numpy as np
import pandas as pd
import datetime as dtm
from definitions import ROOT_DIR

In [31]:
def create_df(files, verbose=True):  # TODO: find a more descriptive name for this function
    """
    Read each CSV file into a DataFrame and report whether it holds coordinates.

    Parameters
    ----------
    files : list
        Comma-delimited files to load; each entry is passed as-is to ``pd.read_csv``.
    verbose : bool, default True
        If True, print one line per file stating whether an 'X' column is present.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per entry of ``files``, in the same order.
    """
    dfs = []
    for pos, path in enumerate(files, start=1):
        frame = pd.read_csv(path, delimiter=',')
        dfs.append(frame)

        if verbose:
            # the presence of an 'X' column is used as the marker for position data
            msg = ' --> Coordinates' if 'X' in frame.columns else ' --> No coordinates'
            # the label follows the position in `files` (df1, df2, ...) instead of always "df1"
            print(f"df{pos} : {msg}")

    return dfs

## Reading files

In [32]:
work_dir = ROOT_DIR + '/CF_data/Result_traitem/'
save_dir = ROOT_DIR + '/CF_data/Donnees_fusionnees/'

In [33]:
# create my dictionary structure to retrieve good files (Keynames !!!)
files_dict={'Borehole':0,'Piezometer':0,'Piezair':0,'Trench':0,'Litho':0,'Equipm':0,
        'Measure':0,'Sample':0,'Analysis':0,'Facility':0}

In [34]:
files_search(work_dir, files_dict, prefix='', skip='source')

Borehole  	:  7
Piezometer  	:  17
Piezair  	:  2
Trench  	:  1
Litho  	:  7
Equipm  	:  3
Measure  	:  6
Sample  	:  27
Analysis  	:  21
Facility  	:  4


In [35]:
how=['inner', 'outer', 'left', 'right']

In [36]:
f = False
t = True

# ================== PROCESSING ===================== 

# Lithologies

In [37]:
key = 'Litho'  # entry of files_dict processed in this section
save_file = 'Merged_Lithologies.csv'  # plain literal: no placeholders, so no f-prefix needed
coi = ['ID', 'ID_date', 'X', 'Y', 'Z', 'Type', 'Long_for', 'Diam_for', 'Refus']  # columns of interest
dataset = pd.DataFrame()  # empty placeholder, overwritten when the first merge is saved
print(len(files_dict[key]), 'files')

7 files


In [38]:
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Drains_Pz_ENEL_Lithology.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Profils_sol_Lithology.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Result_sol_Lithology.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Log_Lithology.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Log_Lithology.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/vUmons_logsFor/Sond2017v2_Lithology.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/vUmons_logsFor/Synthese_Lithology.csv']

In [39]:
data_overview(files_dict[key])

Same files:[]
Files with coordinates:[2, 5, 6]
Files without coordinates:[0, 1, 3, 4]


In [40]:
# indices into files_dict[key] of the two files to compare
a, b = 2, 5
file1 = files_dict[key][a]
file2 = files_dict[key][b]
# show the two paths relative to work_dir
print(file1.replace(work_dir, ""), '||', file2.replace(work_dir, ""))

df1, df2 = create_df([file1, file2])
# separate statements: a "viewer, viewer" tuple would echo a useless `(None, None)` as cell output
dataframe_viewer(df1, rows=10, un_val='ID', view=t)
dataframe_viewer(df2, rows=10, un_val='ID', view=t)

database_Memoris3/Result_sol_Lithology.csv || vUmons_logsFor/Sond2017v2_Lithology.csv
df1 :  --> Coordinates
df1 :  --> Coordinates
Rows : 1423, columns : 14, Unique values on col 'ID': 330


interactive(children=(IntSlider(value=10, description='rows', max=1423, min=10, readout=False), IntSlider(valu…

Rows : 109, columns : 7, Unique values on col 'ID': 71


interactive(children=(IntSlider(value=10, description='rows', max=109, min=10, readout=False), IntSlider(value…

(None, None)

In [41]:
# Decimal commas -> decimal points. The result is assigned back to the frame: calling
# replace(..., inplace=True) on the Series returned by `df1.X` is chained assignment and is
# not guaranteed to modify df1 (it never does under pandas Copy-on-Write).
df1['X'] = df1['X'].replace(',', '.', regex=True)
df1['Y'] = df1['Y'].replace(',', '.', regex=True)

In [42]:
# cast both planar coordinate columns of df1 to float
for coord in ('X', 'Y'):
    df1[coord] = df1[coord].astype(float)

In [43]:
# remove fully identical rows (the first occurrence is kept) and renumber the index from 0
df1 = df1.drop_duplicates(keep='first').reset_index(drop=True)
df2 = df2.drop_duplicates(keep='first').reset_index(drop=True)

In [47]:
id_ = 'FP76'
id_col = 'ID'
# compare the rows recorded for this one object in both frames
query = f'{id_col}=="{id_}"'
# two plain statements: the former trailing comma only built a throwaway 1-tuple
dataframe_viewer(df1.query(query), rows=10, cols=15, un_val=id_col, view=t)
dataframe_viewer(df2.query(query), rows=10, cols=15, un_val=id_col, view=t)

Rows : 5, columns : 14, Unique values on col 'ID': 1


interactive(children=(IntSlider(value=5, description='rows', max=5, min=5, readout=False), IntSlider(value=14,…

Rows : 3, columns : 7, Unique values on col 'ID': 1


interactive(children=(IntSlider(value=3, description='rows', max=3, min=3, readout=False), IntSlider(value=7, …

Rows : 3, columns : 7, Unique values on col 'ID': 1


interactive(children=(IntSlider(value=3, description='rows', max=3, min=3, readout=False), IntSlider(value=7, …

#### $\color{green}{\textbf{Read and merge}}$

In [17]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


In [20]:
# column order for the merged lithology frame (the doubled comma after 'Refus' was a SyntaxError)
cols = ['ID', 'X', 'Y', 'Z', 'index', 'Long', 'Litho_top', 'Litho_base', 'Description', 'Ep_alluv', 'Sous_zone',
        'Zone', 'Societe', 'Ep_remb', 'Type', 'Refus', 'Nappe']

Index(['ID', 'X', 'Y', 'Z', 'index', 'Long', 'Ep_alluv', 'Sous_zone',
       'Litho_base', 'Zone', 'Description', 'Societe', 'Ep_remb', 'Type',
       'Refus', 'Litho_top', 'Nappe'],
      dtype='object')

In [18]:
dataframe_viewer(mdf, rows=10, cols=15, un_val='ID', view=t)

Rows : 882, columns : 17, Unique values on col 'ID': 400


interactive(children=(IntSlider(value=10, description='rows', max=882, min=10, readout=False), IntSlider(value…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
conflict_df

In [None]:
pause

In [None]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Nappe_y':list(conflict_df.index)[:18]})

In [None]:
conflict_df

#### First object dataset save

In [None]:
dataset = mdf.copy() #saving

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'database_Memoris3/Donnees_piezos_Piezometers.csv' # 4
file2= work_dir + 'Liste_XY/Sol_Eau_Piezometers.csv' # 0  


df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df1, check = data_filter(df1, position=True, id_col='ID', expression='sup|prof', dist_max=1, drop=True, drop_old_id=True)

In [None]:
df2, check = data_filter(df2, position=True, id_col='ID', expression='sup|prof', dist_max=1, drop=True)

In [None]:
df2

In [None]:
fix_duplicates(df1, df2)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
conflict_df

In [None]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Nappe_y':list(conflict_df.index)[:18]})

In [None]:
conflict_df

#### Merge with object dataset

In [None]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [None]:
# If a 'level_0' column is present, discard any existing 'index' column and
# rename 'level_0' to 'index'.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [None]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
a, b = 3, 5
file1= files_dict[key][a]
file2= files_dict[key][b]
print(files_dict[key][a].replace(work_dir,""),'||', files_dict[key][b].replace(work_dir,""))

In [None]:
file1= work_dir + 'Phase_2_Memoris/Result_eau_Piezometers.csv' # 3
file2= work_dir + 'database_Memoris3/Drains_Pz_ENEL_Piezometers.csv' # 5  


df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [None]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [None]:
# If a 'level_0' column is present, discard any existing 'index' column and
# rename 'level_0' to 'index'.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [None]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
file1= work_dir + 'donnees_terrain_2019/Donnees_forage_Piezometers.csv' # 9
file2= work_dir + 'database_Memoris3/Result_eau_Piezometers.csv' # 7  

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df1.ID = df1.ID.astype(str)
df2.ID = df2.ID.astype(str)

In [None]:
df2, check = data_filter(df2, position=True, id_col='ID', expression='sup|prof', dist_max=1, val_max=1.1, drop=True, drop_old_id=True)

In [None]:
dataframe_viewer(df2, rows=10, un_val='ID', view=t)

In [None]:
fix_duplicates(df1, df2)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [None]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
conflict_df

In [None]:
data_validation(overall_data=mdf, conflict_data=conflict_df, index_col='index', 
                valid_dict={'Nappe_y':list(conflict_df.index)[:18]})

In [None]:
conflict_df

In [None]:
# If a 'level_0' column is present, discard any existing 'index' column and
# rename 'level_0' to 'index'.
has_level_0 = 'level_0' in dataset.columns
if has_level_0 and 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)
if has_level_0:
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [None]:
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
# Pick two files of files_dict[key] by position and print their paths relative to work_dir.
# NOTE(review): no cell visible here reassigns `key` after it is set to 'Litho', and only
# 7 files were listed for that key, so indices 13 and 15 look out of range (IndexError) —
# confirm the intended key / indices.
a, b = 13, 15
file1= files_dict[key][a]
file2= files_dict[key][b]
print(files_dict[key][a].replace(work_dir,""),'||', files_dict[key][b].replace(work_dir,""))

In [None]:
#file1= work_dir + 'donnees_terrain_2019/Donnees_forage_Piezometers.csv' # 9
#file2= work_dir + 'database_Memoris3/Result_eau_Piezometers.csv' # 7  

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
fix_duplicates(df1, df2)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

#### Merge with object dataset

In [None]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
conflict_df # I think these are not the same objects, but there is no date or position to distinguish them!
# --> check the borehole sheets (pdf)

In [None]:
dataframe_viewer(dataset, rows=3, un_val='ID', view=t)

####  $\color{red}{\textbf{Save final object dataset}}$

In [None]:
# `dataset` is the frame accumulated by the merges above; `piezometers` is not defined yet at
# this point of the notebook, so referencing it here fails on a fresh kernel
if 'index' in dataset.columns:
    dataset.drop(columns='index', inplace=True)

In [None]:
# create the output folder if needed (no error when it already exists)
os.makedirs(save_dir, exist_ok=True)

# save the merged `dataset`; `piezometers` is only assigned further down, in the testing area
dataset.to_csv(save_dir + save_file, index=False)

###  ------------------------------------- Testing area ------------------------------

#### $\color{green}{\textbf{Read and merge}}$

In [None]:
a, b = 4, 0
file1= files_dict[key][a]
file2= files_dict[key][b]
print(files_dict[key][a].replace(work_dir,""),'||', files_dict[key][b].replace(work_dir,""))
#file1= work_dir + 'database_Memoris3/Donnees_piezos_Piezometers.csv' # 4
#file2= work_dir + 'Liste_XY/Sol_Eau_Piezometers.csv' # 0  


df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

#### $\color{blue}{\textbf{Manage conflicts}}$

In [None]:
file1 = files_dict[key][0]
file2 = files_dict[key][3]

df1, df2 = create_df([file1, file2])
# print the two files that were actually loaded (the label used to show entries 1 and 2)
print(file1.replace(work_dir, ""), '||', file2.replace(work_dir, ""))
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
fix_duplicates(df1, df2)

In [None]:
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df1.rename(columns={'Profondeur':'Long_for'}, inplace=True)
df2.rename(columns={'Profondeur':'Long_for'}, inplace=True)

In [None]:
test, conf_test=data_merger(df1, df2, how=how[1], on='new_ID', dist_max=1)

In [None]:
if len(conf_test) > 0:
    dataframe_viewer(conf_test, rows=10, un_val='new_ID', view=t)

In [None]:
dataframe_viewer(test, rows=10, un_val='ID', view=t)

In [None]:
pause

###  °°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°°

In [None]:
file1= files_dict[key][6]
file2= files_dict[key][4]

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][1]

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)
check_col(mdf)

##### check and validate duplicate objects
- The function `data_filter()` doesn't work in some cases, so we use the function `double_objects_check()` instead
- Here, some objects share the same name but have different positions

In [None]:
# NOTE(review): the other data_filter calls in this notebook pass `id_col=` and `val_max=`;
# this one passes `id_on=` and `rapp_val=` — confirm these keywords match the current
# utils.io.data_filter signature.
mdf, check = data_filter(mdf, position=True, id_on='ID', expression='sup|prof', dist_max=1, drop=True, rapp_val=1)

In [None]:
double_objects_check(mdf)

In [None]:
drop_id = [2,25,30] # objects are seemingly the same, but is it possible to get 2 objects so close (~ 1m)?
mdf.drop(index=drop_id, inplace=True)
mdf.reset_index(drop=True, inplace=True)

In [None]:
dataframe_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

#### Merge with object type dataset

In [None]:
piezometers = mdf.copy() #saving

In [None]:
file1= files_dict[key][2]
file2= files_dict[key][3]

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
dataframe_viewer(mdf, rows=3, cols=13, un_val='ID', view=f)

In [None]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
dataframe_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
double_objects_check(piezometers)

In [None]:
drop_id = [292, 293]
piezometers.drop(index=drop_id, inplace=True)
dataframe_viewer(dataset, rows=5, un_val='ID', view=f)

In [None]:
file1= files_dict[key][4]
file2= files_dict[key][5]

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
dataframe_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
# NOTE(review): the data_filter calls in the main part of this notebook pass `id_col=`;
# this one passes `id_on=` — confirm the keyword matches the current utils.io.data_filter
# signature.
dataset, check = data_filter(dataset, position=True, id_on='ID', expression='sup|prof', dist_max=1, drop=True)
#dataframe_viewer(dataset, rows=5, un_val='ID', view=t)

In [None]:
double_objects_check(piezometers)

In [None]:
drop_id = [2,4,30,94,106]
piezometers.drop(index=drop_id, inplace=True)
dataframe_viewer(dataset, rows=5, un_val='ID', view=f)

In [None]:
file1= files_dict[key][6]
file2= files_dict[key][9]

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df2['ID'] = df2.ID.astype('object')

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
dataframe_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][10]
file2= files_dict[key][11]

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df1['ID'] = df1.ID.astype('object')

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
conflict_df

In [None]:
dataframe_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][12]
file2= files_dict[key][13]

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
conflict_df

In [None]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
dataframe_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][14]
file2= files_dict[key][15]

df1, df2 = create_df([file1, file2])
dataframe_viewer(df1, rows=3, un_val='ID', view=t), dataframe_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
conflict_df

In [None]:
dataframe_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][16]
df1 = pd.read_csv(file1, delimiter=',')

print(f"df1 : {file1.replace(work_dir,'')}")
dataframe_viewer(df1, rows=3, un_val='ID', view=t)

#### Last merging

In [None]:
dataset, conflict_df=data_merger(dataset, df1, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
conflict_df

In [None]:
dataframe_viewer(dataset, rows=3, cols=13, un_val='ID', view=t)

####  $\color{red}{\textbf{Save final Piezometers data}}$

In [None]:
# create the output folder if needed (no error when it already exists)
os.makedirs(save_dir, exist_ok=True)

# NOTE(review): the merges in this part of the notebook are accumulated in `dataset`, while
# `piezometers` is only a copy of an earlier `mdf` with some rows dropped — confirm that
# `piezometers` is really the frame to save here.
piezometers.to_csv(save_dir+save_file, index=False)