# Data Gathering

In [1]:
%matplotlib widget

In [2]:
from utils.io import gen_id_dated, gdf_viewer, gdf_geom, gdf_merger, gdf_filter, na_col_drop, na_line_drop
import re, os
import numpy as np
import geopandas as gpd
import pandas as pd
#from shapely.geometry import Point
import datetime as dtm
import matplotlib.pyplot as plt
from definitions import ROOT_DIR

In [3]:
def files_search(work_dir, files_dict, prefix='', skip=None, details=False):   
    
    if skip is None:
        skip = "we don't want to skip a word"
        
    for k in files_dict.keys():
        tmp_list = []
        for p,d,f in os.walk(work_dir):
            for x in f:
                add = False
                if re.search(prefix,x,re.I) and not re.search(skip,x,re.I):
                    add = True
                    i = str(f'{p}/{x}')
                else:
                    add = False
                    i=''
                    
                if re.search(k,i,re.I) and add:
                    tmp_list.append(i)
        tmp_list.sort()
        files_dict.update({k:tmp_list})

    for k,v in files_dict.items():
        print(k,' \t: ',len(v))
    
    if details: # Look filenames
        which = files_dict.keys()

        for w in which:
            print('\n+++++++++++++++++')
            print(f'+  {w.upper()}\t+ ')
            print('+++++++++++++++++')
            [print(i, '-', x) for i, x in enumerate(files_dict[w], 0)]    


In [4]:
def check_col(data):
    cols_idx = []
    
    class DoubledColumns(Exception):
        """Merging process doubled column(s) still remain. Check and drop them before continue"""
        pass
    
    for i in range(len(data.columns)):
        if re.search('_x|_y', list(data.columns)[i]):
            cols_idx.append(i)
    
    if len(cols_idx) != 0 :
        raise DoubledColumns(f'Merging process doubled column(s) still remain.'
                             f'\nCheck and drop them before continue ! Doubled columns position {cols_idx}')

In [5]:
def distinct_obj_test(df1, df2, on='ID', how='outer', dist_max=1):
    test_distinct = df1.merge(df2, on=on, how=how)

    dist_max = dist_max
    
    if 'X' in list(df1.columns) and 'X' in list(df2.columns):
        for idx in test_distinct.index:
            distinct_objects = True
            if not pd.isnull(test_distinct.loc[idx,'X_x']) and not pd.isnull(test_distinct.loc[idx,'X_y']):
                dist = (test_distinct.loc[idx,'X_x'] - test_distinct.loc[idx,'X_y']) ** 2 + (test_distinct.loc[idx,'Y_x'] - test_distinct.loc[idx,'Y_y']) ** 2
                if dist <= (dist_max) ** 2:  # consider as same object
                    distinct_objects = False
            else:
                distinct_objects = False
            test_distinct.loc[idx, 'Distinct_obj'] = distinct_objects

        test_distinct.insert(1,'Distinct_obj', test_distinct.pop('Distinct_obj') )

        gdf_viewer(test_distinct)
    else:
        print('Cannot proceed ! No position data in one of the dataframe')

In [6]:
def create_df(file1, file2): # find another name for this function
    """
    create dataframes from files and test if they contain position informations
    """
    
    df1 = pd.read_csv(file1, delimiter=',')
    df2 = pd.read_csv(file2, delimiter=',')
    
    print(f"df1 : {file1.replace(work_dir,'')} \ndf2 : {file2.replace(work_dir,'')}\n")

    if 'X' in list(df1.columns): print('df1 - Position data')
    else: print('df1 - No position data')
    if 'X' in list(df2.columns): print('df2 - Position data')
    else: print('df2 - No position data')
    
    return df1, df2

In [7]:
def validate_data(data, data_to_check, valid_data, col, idx_list, valid_col):
    
    old_idx_col = 'Source_index'
    
    for col, idx_list in valid_data.items():
        if old_idx_col in data_to_check.columns:
            idx = data_to_check.loc[i, old_idx_col]
            data.loc[idx, col] = data_to_check.loc[i, valid_col]
        else:
            raise NameError(f"Dataframe to check must contain a column named : '{old_idx_col}'!")

    data_to_check.drop(index=idx_list, inplace=True)
    data_to_check.reset_index(drop=True, inplace=True)
    print(f"Operation done ")
    
    #return data

In [8]:
def double_objects_check(data):
    uniq_ID = []
    dbl_ID = []
    idx_ = []
    qdf = pd.DataFrame()

    for i in data.index:
        id_ = data.loc[i, 'ID']

        if id_ not in uniq_ID:
            uniq_ID.append(id_)
        elif id_ not in dbl_ID:
            dbl_ID.append(id_)
        else:
            idx_.append(i)

    for i in dbl_ID:
        qdf = qdf.append(data.query(f"ID=='{i}'"))

    return qdf

## Files reading

In [9]:
work_dir = ROOT_DIR+'/CF_data/Result_traitem/'
save_dir = ROOT_DIR+'/CF_data/Donnees_fusionnees/'

In [10]:
# create my dictionary structure to retrieve good files (Keynames !!!)
files_dict={'Borehole':0,'Piezometer':0,'Piezair':0,'Trench':0,'Litho':0,'Equipm':0,
        'Measure':0,'Sample':0,'Analysis':0,'facility':0}

In [11]:
files_search(work_dir, files_dict, prefix='', skip='source')

Borehole  	:  5
Piezometer  	:  14
Piezair  	:  1
Trench  	:  1
Litho  	:  6
Equipm  	:  2
Measure  	:  6
Sample  	:  26
Analysis  	:  20
facility  	:  4


In [12]:
how=['inner', 'outer', 'left', 'right']

In [13]:
f = False
t = True

# Boreholes

Some corrections todo in 'data organization':
- correct extraction in the file 2 -> Samples
- file 4 and file 5 are the same in result (check it)
- try to concatenate file 1 with piezo (if possible because no position)
- check processing for 'refus and 'type_refus' (every object)

In [14]:
# keys: Borehole','Piezometer','Litho', 'Trench','Equipm','Measure','Sample','Analysis','facility'
files_dict['Borehole']

['/home/su530201/PycharmProjects/GSDMA/CF_data/Result_traitem/Forage_Pilote/leve_Z_elect_pos_Boreholes.csv',
 '/home/su530201/PycharmProjects/GSDMA/CF_data/Result_traitem/Prof_contact_sol_forage/Feuil1_Boreholes.csv',
 '/home/su530201/PycharmProjects/GSDMA/CF_data/Result_traitem/database_Memoris3/Profils_sol_Boreholes.csv',
 '/home/su530201/PycharmProjects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Donnees_forage_Boreholes.csv',
 '/home/su530201/PycharmProjects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/donnees_forage_Boreholes.csv']

In [15]:
key='Borehole'
save_file = f'Merged_Boreholes.csv'
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus', 'Societe'] #columns of interest
boreholes = pd.DataFrame() # for saving object info after last merging
print(len(files_dict[key]), 'files')

5 files


In [16]:
file1= files_dict[key][1]
file2= files_dict[key][3]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : Prof_contact_sol_forage/Feuil1_Boreholes.csv 
df2 : donnees_terrain_2019/Donnees_forage_Boreholes.csv

df1 - Position data
df2 - Position data
Rows : 8, columns : 6, Unique col 'ID': 8


interactive(children=(IntSlider(value=3, description='rows', max=8, min=3, readout=False), IntSlider(value=6, …

Rows : 16, columns : 18, Unique col 'ID': 16


interactive(children=(IntSlider(value=3, description='rows', max=16, min=3, readout=False), IntSlider(value=12…

(None, None)

In [17]:
df2.ID=df2.ID.apply(lambda x: 'F'+x) # name recent (2019) boreholes

In [18]:
distinct_obj_test(df1, df2, dist_max=1)

Rows : 17, columns : 24


interactive(children=(IntSlider(value=10, description='rows', max=17, min=10, readout=False), IntSlider(value=…

In [19]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)
check_col(mdf) # check if columns with '..._x' or '..._y' are still present and raise an error

    index Type_refus    Methode  Crep_long     Resp_chantier  Diam_ext_pz  \
0       0        NaN  Dual tube        NaN  Liza Niemirowski          NaN   
1       1        NaN  Dual tube        NaN  Liza Niemirowski          NaN   
2       2        NaN  Dual tube        NaN  Liza Niemirowski          NaN   
3       3        NaN  Dual tube        NaN  Liza Niemirowski          NaN   
4       4        NaN  Dual tube        NaN  Liza Niemirowski          NaN   
5       5        NaN  Dual tube        NaN  Liza Niemirowski          NaN   
6       6        NaN  Dual tube        NaN  Liza Niemirowski          NaN   
7       7        NaN        NaN        NaN               NaN          NaN   
8       8      Béton  Dual tube        NaN  Liza Niemirowski          NaN   
9       9        NaN  Dual tube        NaN  Liza Niemirowski          NaN   
10     10      Béton  Dual tube        NaN  Liza Niemirowski          NaN   
11     11        NaN  Dual tube        NaN  Liza Niemirowski          NaN   

In [20]:
err_df

Unnamed: 0,ID,Long_for_x,Long_for_y,Source_index
0,F205,3.2,4.8,0
1,F208,3.4,4.8,1
2,F212,3.4,4.8,2
3,F207,3.4,4.8,3
4,F214,3.6,4.8,4
5,F217,4.2,4.8,5
6,F225,4.0,4.8,6


In [21]:
gdf_viewer(mdf, rows=10, un_val='ID', view=t)

Rows : 17, columns : 20, Unique col 'ID': 17


interactive(children=(IntSlider(value=10, description='rows', max=17, min=10, readout=False), IntSlider(value=…

#### boreholes merging

In [22]:
boreholes = mdf.copy() #saving

In [23]:
file1= files_dict[key][2]
file2= files_dict[key][4]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

df1 : database_Memoris3/Profils_sol_Boreholes.csv 
df2 : profils_sols_donnees_forages/donnees_forage_Boreholes.csv

df1 - No position data
df2 - Position data
Rows : 826, columns : 6, Unique col 'ID': 172


interactive(children=(IntSlider(value=3, description='rows', max=826, min=3, readout=False), IntSlider(value=6…

Rows : 13, columns : 13, Unique col 'ID': 13


interactive(children=(IntSlider(value=3, description='rows', max=13, min=3, readout=False), IntSlider(value=12…

(None, None)

In [24]:
distinct_obj_test(df1, df2, dist_max=1)

Cannot proceed ! No position data in one of the dataframe


In [25]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID')
check_col(mdf)

     index         Type_refus  Long_for           Z              X Refus  \
0        0                NaN       NaN         NaN            NaN   NaN   
1        1                NaN       NaN         NaN            NaN   NaN   
2        2                NaN       NaN         NaN            NaN   NaN   
3        3                NaN       NaN         NaN            NaN   NaN   
4        4                NaN       NaN         NaN            NaN   NaN   
..     ...                ...       ...         ...            ...   ...   
834    834              Béton       0.8  103.207928  152838.481267     x   
835    835              Béton       0.5  103.182578  152840.031071     x   
836    836              Béton       0.8  103.225362  152871.977024     x   
837    837  Matériaux indurés       1.4  103.253143  152874.285127     x   
838    838  Matériaux indurés       1.4  103.246920  152874.975616     x   

         ID  Diam_for              Y     Method    Type    Date_ouv  \
0        F2     

In [26]:
mdf['ID_date'] = mdf['ID_date'].apply(lambda x: str(x).upper())

In [27]:
mdf['Long_for'] = mdf[['Profondeur', 'Long_for']].apply(lambda x: x[0] if pd.isnull(x[1]) else x[1], axis=1)
mdf.drop(columns=['Profondeur'], inplace=True)

KeyError: "['Profondeur'] not in index"

In [None]:
gdf_viewer(mdf, rows=3, cols=15, un_val='ID', view=t)

#### boreholes merging

In [30]:
boreholes.drop('index', axis='columns', inplace=True)

In [36]:
boreholes.drop('split_distinct', axis='columns', inplace=True)

In [38]:
boreholes, err_df=gdf_merger(boreholes, mdf, how=how[1], on='ID', dist_max=2)
check_col(boreholes)

     index  Crep_long    Methode     Method     Resp_chantier  Diam_ext_pz  \
0        0        NaN  Dual tube   Liner_60  Liza Niemirowski          NaN   
1        1        NaN  Dual tube   Liner_60  Liza Niemirowski          NaN   
2        2        NaN  Dual tube        NaN  Liza Niemirowski          NaN   
3        3        NaN  Dual tube   Gouge_75  Liza Niemirowski          NaN   
4        4        NaN  Dual tube   Gouge_75  Liza Niemirowski          NaN   
..     ...        ...        ...        ...               ...          ...   
847    847        NaN        NaN  Dual tube               NaN          NaN   
848    848        NaN        NaN  Dual tube               NaN          NaN   
849    849        NaN        NaN  Dual tube               NaN          NaN   
850    850        NaN        NaN  Dual tube               NaN          NaN   
851    851        NaN        NaN  Dual tube               NaN          NaN   

         ID  Long_pz  Diam_int_pz         Type_refus  ...      

In [42]:
boreholes.loc[[12,13,14,15,16],:]

Unnamed: 0,index,Crep_long,Methode,Method,Resp_chantier,Diam_ext_pz,ID,Long_pz,Diam_int_pz,Type_refus,...,Z,Refus,X,Diam_for,Type,Date_ouv,Y,Societe,ID_date,split_distinct
12,12.0,,Dual tube,tarrière,Liza Niemirowski,,F217,,,,...,101.815,,152886.185,75.0,Forage,#conflict,122587.152,#conflict,#conflict,False
13,13.0,,Dual tube,tarrière,Liza Niemirowski,,F217,,,,...,101.815,,152886.185,75.0,Forage,#conflict,122587.152,#conflict,#conflict,False
14,14.0,,Dual tube,tarrière,Liza Niemirowski,,F217,,,,...,101.815,,152886.185,75.0,Forage,#conflict,122587.152,#conflict,#conflict,False
15,15.0,,Dual tube,tarrière,Liza Niemirowski,,F217,,,,...,101.815,,152886.185,75.0,Forage,#conflict,122587.152,#conflict,#conflict,False
16,16.0,,Dual tube,tarrière,Liza Niemirowski,,F217,,,,...,101.815,,152886.185,75.0,Forage,#conflict,122587.152,#conflict,#conflict,False


In [39]:
err_df

Unnamed: 0,ID,Date_ouv_x,Date_ouv_y,Societe_x,Societe_y,ID_date_x,ID_date_y,Source_index
0,F205,2019-10-07,2010-03-26,ECOPLANNING sprl,SITEREM,2019-205,2010-F205,0
1,F205,2019-10-07,2010-03-26,ECOPLANNING sprl,SITEREM,2019-205,2010-F205,1
3,F212,2019-10-07,2010-05-10,ECOPLANNING sprl,SITEREM,2019-212,2010-F212,3
4,F212,2019-10-07,2010-05-10,ECOPLANNING sprl,SITEREM,2019-212,2010-F212,4
5,F212,2019-10-07,2010-05-10,ECOPLANNING sprl,SITEREM,2019-212,2010-F212,5
6,F212,2019-10-07,2010-05-10,ECOPLANNING sprl,SITEREM,2019-212,2010-F212,6
7,F212,2019-10-07,2010-05-10,ECOPLANNING sprl,SITEREM,2019-212,2010-F212,7
8,F212,2019-10-07,2010-05-10,ECOPLANNING sprl,SITEREM,2019-212,2010-F212,8
9,F212,2019-10-07,2010-05-10,ECOPLANNING sprl,SITEREM,2019-212,2010-F212,9
12,F217,2019-10-07,2010-03-02,ECOPLANNING sprl,SBS Environnement,2019-217,2010-F217,12


In [None]:
validate_data(boreholes, err_df, on='ID', col='Long_for', idx_list=[0,1,2], valid_col='Long_for_y')

In [None]:
err_df

In [None]:
gdf_viewer(boreholes, rows=3, cols=15, un_val='ID', view=t)

In [None]:
file1= files_dict[key][5]
file2= files_dict[key][0]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
distinct_obj_test(df1, df2, dist_max=1)

In [None]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID', )
check_col(mdf) # check if columns with '..._x' or '..._y' are still present and raise an error

In [None]:
mdf['Long_for'] = mdf[['Profondeur', 'Long_for']].apply(lambda x: x[0] if pd.isnull(x[1]) else x[1], axis=1)
mdf.drop(columns=['Profondeur'], inplace=True)

In [None]:
gdf_viewer(mdf, rows=10, un_val='ID', view=t)

#### Last boreholes merging

In [None]:
boreholes, err_df=gdf_merger(boreholes, mdf, how=how[1], on='ID')
check_col(boreholes)

In [None]:
err_df # i think there are not the same, but no date or postition to distinguish them !
# --> check boreholes sheets (pdf)

In [None]:
gdf_viewer(boreholes, rows=3, un_val='ID', view=t)

####  $\color{red}{\textbf{Save final Boreholes data}}$

In [None]:
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
boreholes.to_csv(save_dir+save_file, index=False)

# Piezometers

Some corrections todo in 'data organization':

In [None]:
key='Piezometer'
save_file = f'Merged_Piezometers.csv'
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus'] #columns of interest
piezometers = pd.DataFrame()
print(len(files_dict[key]), 'files')

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][1]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)
check_col(mdf)

##### check and validate duplicate objects
- The function "gdf_filter()" doesn't work in some cases, so we use function "doubled_objects_check()"
- we have same objects Names but differents by positions here

In [None]:
mdf, check = gdf_filter(mdf, position=True, id_on='ID', expression='sup|prof', dist_crit=1, drop=True, rapp_val=1)

In [None]:
double_objects_check(mdf)

In [None]:
drop_id = [2,25,30] # objects are seemingly the same, but is it possible to get 2 objects so close (~ 1m)?
mdf.drop(index=drop_id, inplace=True)
mdf.reset_index(drop=True, inplace=True)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

##### Piezometers merging 

In [None]:
piezometers = mdf.copy() #saving

In [None]:
file1= files_dict[key][2]
file2= files_dict[key][3]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=f)

In [None]:
piezometers, err_df=gdf_merger(piezometers, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

In [None]:
double_objects_check(piezometers)

In [None]:
drop_id = [292, 293]
piezometers.drop(index=drop_id, inplace=True)
gdf_viewer(piezometers, rows=5, un_val='ID', view=f)

In [None]:
file1= files_dict[key][4]
file2= files_dict[key][5]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
piezometers, err_df=gdf_merger(piezometers, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

In [None]:
piezometers, check = gdf_filter(piezometers, position=True, id_on='ID', expression='sup|prof', dist_crit=1, drop=True)
#gdf_viewer(piezometers, rows=5, un_val='ID', view=t)

In [None]:
double_objects_check(piezometers)

In [None]:
drop_id = [2,4,30,94,106]
piezometers.drop(index=drop_id, inplace=True)
gdf_viewer(piezometers, rows=5, un_val='ID', view=f)

In [None]:
file1= files_dict[key][6]
file2= files_dict[key][9]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df2['ID'] = df2.ID.astype('object')

In [None]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
piezometers, err_df=gdf_merger(piezometers, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][10]
file2= files_dict[key][11]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
df1['ID'] = df1.ID.astype('object')

In [None]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
piezometers, err_df=gdf_merger(piezometers, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
err_df

In [None]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][12]
file2= files_dict[key][13]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
err_df

In [None]:
piezometers, err_df=gdf_merger(piezometers, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][14]
file2= files_dict[key][15]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=2)
check_col(mdf)

In [None]:
piezometers, err_df=gdf_merger(piezometers, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
err_df

In [None]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

In [None]:
file1= files_dict[key][16]
df1 = pd.read_csv(file1, delimiter=',')

print(f"df1 : {file1.replace(work_dir,'')}")
gdf_viewer(df1, rows=3, un_val='ID', view=t)

#### Last merging

In [None]:
piezometers, err_df=gdf_merger(piezometers, df1, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
err_df

In [None]:
gdf_viewer(piezometers, rows=3, cols=13, un_val='ID', view=t)

####  $\color{red}{\textbf{Save final Piezometers data}}$

In [None]:
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
piezometers.to_csv(save_dir+save_file, index=False)

==========================================================================================================

# Unknown facilities

In [None]:
key='facility'
save_file = f'Merged_Facilites_unknw.csv'
#coi=['ID','X','Y','Z','Litho_top','Litho_base','Description']  #columns of interest
facilities = pd.DataFrame()
print(len(files_dict[key]), 'files')

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][3]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID')#, step_merge
check_col(mdf)

In [None]:
facilities = mdf.copy() #saving

In [None]:
file1= files_dict[key][1]
df1 = pd.read_csv(file1, delimiter=',')

print(f"df1 : {file1.replace(work_dir,'')}")
gdf_viewer(df1, rows=3, un_val='ID', view=t)

In [None]:
facilities, err_df=gdf_merger(facilities, df1, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(facilities, rows=3, un_val='ID', view=t)

####  $\color{red}{\textbf{Save final Unknown Facilities data}}$

In [None]:
if not os.path.exists(save_dir):
    os.makedirs(save_dir)
    
facilities.to_csv(save_dir+save_file, index=False)

# Lithologies

Do not add parameter 'dist_max' when merging without considering position !!! otherwise, unuseless rows added

In [None]:
key='Litho'
save_file = f'Merged_Lithologies.csv'
coi=['ID','X','Y','Z','Litho_top','Litho_base','Description']  #columns of interest
lithologies = pd.DataFrame()
print(len(files_dict[key]), 'files')

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][3]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID')#, step_merge
check_col(mdf)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

In [None]:
gdf_viewer(err_df, rows=5, un_val='ID', view=t) #err_df.ID.unique()

In [None]:
common_cols = list(set(df1.columns) & set(df2.columns))
test1 = df1.merge(df2, how = 'inner', on='ID')
test2 = df1.merge(df2, how = 'outer', on='ID', indicator=True).loc[lambda x : x.query('_merge =="right_only" or _merge=="left_only"').index]
test3 = test1.merge(test2, how = 'outer', on='ID')
test4 = df1.merge(df2, how = 'outer', on=list(common_cols))
print((len(test1), len(test2), len(test3)))
gdf_viewer(test4)

In [None]:
lithologies = mdf.copy() #saving

In [None]:
file1= files_dict[key][2]
file2= files_dict[key][4]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID')
check_col(mdf)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

##### Lithologies merging 

In [None]:
lithologies, err_df=gdf_merger(lithologies, mdf, how=how[1], on='ID')
check_col(mdf)

In [None]:
gdf_viewer(lithologies, rows=10, cols=15, un_val='ID', view=t)

In [None]:
file1= files_dict[key][5]
file2= files_dict[key][6]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
gdf_viewer(df1.merge(df2, how='inner', on='ID'), rows=5, cols=15, un_val='ID', view=t)

In [None]:
gdf_viewer(df1.merge(df2, how = 'outer', on='ID',indicator=True), rows=5, cols=15, un_val='ID', view=t)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

In [None]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(err_df, rows=5, un_val='ID', view=t) #err_df.ID.unique()

##### Lithologies merging 

In [None]:
lithologies, err_df=gdf_merger(lithologies, mdf, how=how[1], on='ID', dist_max=1)
check_col(mdf)

In [None]:
gdf_viewer(lithologies, rows=3, un_val='ID', view=t)

In [None]:
stop

# Equipments

We must also retrieve equipments information from boreholes and piezometers

In [None]:
key='Equipm'
save_file = f'Merged_Equipments.csv'
coi=['ID','ID_date','X','Y','Z','Type','Long_for','Diam_for','Refus'] #columns of interest
equipments = pd.DataFrame()
print(len(files_dict[key]), 'files')

In [None]:
file1= files_dict[key][0]
file2= files_dict[key][1]

df1, df2 = create_df(file1, file2)
gdf_viewer(df1, rows=3, un_val='ID', view=t), gdf_viewer(df2, rows=3, un_val='ID', view=t)

In [None]:
mdf, err_df=gdf_merger(df1, df2, how=how[1], on='ID', dist_max=1)
check_col(mdf)

##### check and validate duplicate objects
- The function "gdf_filter()" doesn't work in some cases, so we use function "doubled_objects_check()"
- we have same objects Names but differents by positions here

In [None]:
mdf, check = gdf_filter(mdf, position=True, id_on='ID', expression='sup|prof', dist_crit=1, drop=True, rapp_val=1)

In [None]:
double_objects_check(mdf)

In [None]:
drop_id = [2,25,30] # objects are seemingly the same, but is it possible to get 2 objects so close (~ 1m)?
mdf.drop(index=drop_id, inplace=True)
mdf.reset_index(drop=True, inplace=True)

In [None]:
gdf_viewer(mdf, rows=3, cols=13, un_val='ID', view=t)

##### Piezometers merging 

In [None]:
piezometers = mdf.copy() #saving

# Samples

Some corrections todo in 'data organization':
- file 0 and file 1 are the same in result (check it)

# Measures

Some corrections todo in 'data organization':
- file 0 and file 1 are the same in result (check it)

# Analysis

Some corrections todo in 'data organization':
- file 0 and file 1 are the same in result (check it)