# Data Gathering

In [1]:
from utils.io import dataframe_viewer, files_search, data_merger, data_validation, data_overview, \
data_filter, fix_duplicates

import re, os
import numpy as np
import geopandas as gpd
import pandas as pd
import datetime as dtm
import matplotlib.pyplot as plt
from definitions import ROOT_DIR

In [2]:
def create_df(files, verbose=True): # find another name for this function
    """
    Read each CSV file into a DataFrame and optionally report whether it
    contains position information.

    Parameters
    ----------
    files : list of str
        Paths of the comma-delimited files to read.
    verbose : bool, default True
        If True, print one line per file stating whether the file has an
        'X' column (used here as the marker for coordinates).

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per input path, in the same order as `files`.
    """
    dfs = []
    # 1-based position so that each printed line identifies its own file
    # (df1, df2, ...) and matches the names the callers unpack into.
    for position, path in enumerate(files, start=1):
        df = pd.read_csv(path, delimiter=',')
        dfs.append(df)

        if verbose:
            # Only the presence of an 'X' column is tested; Y/Z are not checked.
            msg = ' --> Coordinates' if 'X' in df.columns else ' --> No coordinates'
            print(f"df{position} : {msg}")

    return dfs

## Reading files

In [3]:
# Input and output folders, both located under the project root.
# The trailing '/' matters: later cells build file paths by plain string
# concatenation (work_dir + '...', save_dir + save_file).
work_dir = f'{ROOT_DIR}/CF_data/Result_traitem/'
save_dir = f'{ROOT_DIR}/CF_data/Donnees_fusionnees/'

In [4]:
# create a dictionary structure to retrieve good files (Keynames !!!)
# Every key starts with the placeholder value 0; the key spelling is what the
# file search relies on, so the names must not be altered.
object_keys = ['Borehole', 'Piezometer', 'Piezair', 'Trench', 'Litho', 'Equipm',
               'Measure', 'Sample', 'Analysis', 'Facility']
files_dict = dict.fromkeys(object_keys, 0)

In [5]:
# Scan work_dir and fill files_dict in place: after this call each key maps to
# the list of matching file paths (later cells use len(files_dict[key]) and
# display files_dict[key] as a list). The call also prints a per-key count.
# NOTE(review): skip='source' presumably excludes paths containing 'source',
# and prefix='' presumably disables prefix filtering — confirm in utils.io.
files_search(work_dir, files_dict, prefix='', skip='source')

Borehole  	:  7
Piezometer  	:  17
Piezair  	:  2
Trench  	:  1
Litho  	:  7
Equipm  	:  3
Measure  	:  6
Sample  	:  27
Analysis  	:  21
Facility  	:  4


In [6]:
# Join types handed to data_merger through its `how` argument; the merge cells
# pick one by position (how[1] is 'outer').
how = [
    'inner',
    'outer',
    'left',
    'right',
]

In [7]:
# Short boolean aliases used as flag values in later calls (e.g. view=t).
f, t = False, True

# BOREHOLES PROCESSING

Some corrections still to do in the 'data organization' step:
- correct the extraction in file 2 -> Samples
- files 4 and 5 appear to give the same result (to be checked)
- try to concatenate file 1 with the piezometers (if possible, since it has no position)
- check the processing of 'refus' and 'type_refus' (for every object)

In [8]:
# Settings for the Borehole section.
key = 'Borehole'                      # entry of files_dict processed below
save_file = 'Merged_Boreholes.csv'    # output file name, written under save_dir

# columns of interest
coi = [
    'ID', 'ID_date', 'X', 'Y', 'Z',
    'Type', 'Long_for', 'Diam_for', 'Refus', 'Societe',
]

# for saving object info after last merging
dataset = pd.DataFrame()

print(len(files_dict[key]), 'files')

7 files


In [9]:
# Show every path collected for the current key; list positions (0, 1, ...)
# are the file numbers referred to in the cells below.
files_dict[key] # use files_dict[key][0] to look at a single entry

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Forage_Pilote/leve_Z_elect_pos_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/Prof_contact_sol_forage/Feuil1_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Profils_sol_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Donnees_forage_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Equipement_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Log_Boreholes.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/donnees_forage_Boreholes.csv']

In [10]:
# Summarise the collected files by list position: which ones are identical and
# which ones do / do not carry coordinates (see the printed report).
data_overview(files_dict[key])

Same files:[(4, 6)]
Files with coordinates:[1, 3, 4, 6]
Files without coordinates:[0, 2, 5]


#### $\color{green}{\textbf{Read and merge}}$

In [11]:
# Trailing numbers are the positions of these files in files_dict['Borehole'].
file1 = work_dir + 'profils_sols_donnees_forages/Log_Boreholes.csv' # 5
file2 = work_dir + 'profils_sols_donnees_forages/Equipement_Boreholes.csv' # 4

df1, df2 = create_df([file1, file2])

# One viewer call per statement: writing them as a tuple expression made the
# cell echo a meaningless "(None, None)" as its result.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> Coordinates
Rows : 24, columns : 3, Unique col 'ID': 24


interactive(children=(IntSlider(value=3, description='rows', max=24, min=3, readout=False), IntSlider(value=3,…

Rows : 13, columns : 13, Unique col 'ID': 13


interactive(children=(IntSlider(value=3, description='rows', max=13, min=3, readout=False), IntSlider(value=12…

(None, None)

In [12]:
# Align the depth column with the 'Long_for' name listed in the columns of
# interest, on both frames, so the merge compares the same column.
for frame in (df1, df2):
    frame.rename(columns={'Profondeur': 'Long_for'}, inplace=True)

In [13]:
# Merge the two frames on 'ID' with how[1] (i.e. 'outer'). The helper returns
# the merged frame and a frame describing conflicting values.
# NOTE(review): dist_max=1 is presumably a distance tolerance used when
# comparing positions — confirm its meaning and unit in utils.io.
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

In [14]:
# Inspect the merged frame; the printed summary gives the row/column counts
# and the number of unique 'ID' values.
dataframe_viewer(mdf, rows=10, un_val='ID', view=t)

Rows : 25, columns : 13, Unique col 'ID': 25


interactive(children=(IntSlider(value=10, description='rows', max=25, min=10, readout=False), IntSlider(value=…

#### First object dataset save

In [15]:
# Start the object dataset from the first merged pair. A copy is taken because
# mdf is reassigned by the following merges.
dataset = mdf.copy() #saving

#### $\color{green}{\textbf{Read and merge}}$

In [16]:
# Trailing numbers are the positions of these files in files_dict['Borehole'].
file1 = work_dir + 'database_Memoris3/Profils_sol_Boreholes.csv' # 2
file2 = work_dir + 'Prof_contact_sol_forage/Feuil1_Boreholes.csv' # 1

df1, df2 = create_df([file1, file2])

# One viewer call per statement: writing them as a tuple expression made the
# cell echo a meaningless "(None, None)" as its result.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> Coordinates
Rows : 172, columns : 6, Unique col 'ID': 172


interactive(children=(IntSlider(value=3, description='rows', max=172, min=3, readout=False), IntSlider(value=6…

Rows : 8, columns : 6, Unique col 'ID': 8


interactive(children=(IntSlider(value=3, description='rows', max=8, min=3, readout=False), IntSlider(value=6, …

(None, None)

In [17]:
# Align the depth column with the 'Long_for' name listed in the columns of
# interest, on both frames, so the merge compares the same column.
for frame in (df1, df2):
    frame.rename(columns={'Profondeur': 'Long_for'}, inplace=True)

In [18]:
# Merge the two frames on 'ID' with how[1] (i.e. 'outer'). The helper returns
# the merged frame and a frame describing conflicting values; here it warns
# that conflicts exist, so conflict_df must be reviewed.
# NOTE(review): dist_max=1 is presumably a distance tolerance used when
# comparing positions — confirm its meaning and unit in utils.io.
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


In [19]:
# Inspect the merged frame; the printed summary gives the row/column counts
# and the number of unique 'ID' values.
dataframe_viewer(mdf, rows=10, un_val='ID', view=t)

Rows : 177, columns : 10, Unique col 'ID': 177


interactive(children=(IntSlider(value=10, description='rows', max=177, min=10, readout=False), IntSlider(value…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [20]:
# Display the conflicts reported by the previous merge: for each listed ID the
# _x and _y columns hold the two disagreeing values of the checked column.
# TODO: these conflicts are only displayed here, not resolved.
conflict_df

Unnamed: 0,Check_col,ID,Long_for_x,Long_for_y
136,Long_for,F205,1.4,3.2
138,Long_for,F212,5.8,3.4
142,Long_for,F217,5.7,4.2


#### Merge with object dataset

In [21]:
# Fold this pair's merged frame into the running object dataset (merge on 'ID'
# with how[1], i.e. 'outer'). conflict_df is overwritten with the conflicts of
# this merge, replacing those of the pair merge above.
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

In [22]:
# When the merge leaves a 'level_0' column, make it the single 'index' column:
# discard any existing 'index' column first, then rename 'level_0'.
# NOTE(review): 'level_0' presumably comes from an index reset inside
# data_merger — confirm in utils.io.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [23]:
# Inspect the running object dataset after this merge (row/column counts and
# number of unique 'ID' values are printed).
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 202, columns : 14, Unique col 'ID': 202


interactive(children=(IntSlider(value=10, description='rows', max=202, min=10, readout=False), IntSlider(value…

#### $\color{green}{\textbf{Read and merge}}$

In [24]:
# Trailing numbers are the positions of these files in files_dict['Borehole'].
file1 = work_dir + 'Forage_Pilote/leve_Z_elect_pos_Boreholes.csv' # 0
file2 = work_dir + 'donnees_terrain_2019/Donnees_forage_Boreholes.csv' # 3

df1, df2 = create_df([file1, file2])

# One viewer call per statement: writing them as a tuple expression made the
# cell echo a meaningless "(None, None)" as its result.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> Coordinates
Rows : 72, columns : 5, Unique col 'ID': 72


interactive(children=(IntSlider(value=3, description='rows', max=72, min=3, readout=False), IntSlider(value=5,…

Rows : 16, columns : 18, Unique col 'ID': 16


interactive(children=(IntSlider(value=3, description='rows', max=16, min=3, readout=False), IntSlider(value=12…

(None, None)

In [25]:
# name recent (2019) boreholes: prepend 'F' to every ID of the 2019 file.
# NOTE: not idempotent — re-running this cell without re-reading df2 (previous
# cell) prefixes the IDs a second time.
df2['ID'] = df2['ID'].map(lambda raw_id: 'F' + raw_id)

In [26]:
# Merge the two frames on 'ID' with how[1] (i.e. 'outer'). The helper returns
# the merged frame and a frame describing conflicting values; here it warns
# that conflicts exist, so conflict_df must be reviewed.
# NOTE(review): dist_max=1 is presumably a distance tolerance used when
# comparing positions — confirm its meaning and unit in utils.io.
mdf, conflict_df=data_merger(df1, df2, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


In [27]:
# Inspect the merged frame; the printed summary gives the row/column counts
# and the number of unique 'ID' values.
dataframe_viewer(mdf, rows=10, un_val='ID', view=t)

Rows : 74, columns : 20, Unique col 'ID': 74


interactive(children=(IntSlider(value=10, description='rows', max=74, min=10, readout=False), IntSlider(value=…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [28]:
# Display the conflicts reported by the previous merge: for each listed ID the
# _x and _y columns hold the two disagreeing values of the checked column.
# TODO: these conflicts are only displayed here, not resolved.
conflict_df

Unnamed: 0,Check_col,ID,Long_for_x,Long_for_y
45,Long_for,F201,5.835,2.4
49,Long_for,F205,5.84,4.8
51,Long_for,F207,5.79,4.8
52,Long_for,F208,5.77,4.8
56,Long_for,F212,5.675,4.8
58,Long_for,F214,5.685,4.8
61,Long_for,F217,5.73,4.8
63,Long_for,F219,5.63,1.5
64,Long_for,F220,5.655,0.5
65,Long_for,F221,5.72,1.4


#### Merge with object dataset

In [29]:
# Fold this pair's merged frame into the running object dataset (merge on 'ID'
# with how[1], i.e. 'outer'). conflict_df is overwritten with the conflicts of
# this merge, replacing those of the pair merge above.
dataset, conflict_df=data_merger(dataset, mdf, how=how[1], on='ID', dist_max=1)

Conflict values present. Please resolve this manually !


In [30]:
# When the merge leaves a 'level_0' column, make it the single 'index' column:
# discard any existing 'index' column first, then rename 'level_0'.
# NOTE(review): 'level_0' presumably comes from an index reset inside
# data_merger — confirm in utils.io.
if 'level_0' in dataset.columns:
    dataset.drop(columns='index', errors='ignore', inplace=True)
    dataset.rename(columns={'level_0': 'index'}, inplace=True)

In [31]:
# Inspect the running object dataset after this merge (row/column counts and
# number of unique 'ID' values are printed).
dataframe_viewer(dataset, rows=10, un_val='ID', view=t)

Rows : 235, columns : 21, Unique col 'ID': 235


interactive(children=(IntSlider(value=10, description='rows', max=235, min=10, readout=False), IntSlider(value…

#### $\color{blue}{\textbf{Manage conflicts}}$

In [32]:
# Display the conflicts from merging into the object dataset. Check_col names
# the conflicting column(s) per ID.
# NOTE(review): the helper 'index' column is itself reported as a conflict
# here, which looks like bookkeeping rather than a data disagreement — confirm.
# TODO: these conflicts are only displayed here, not resolved.
conflict_df

Unnamed: 0,Check_col,ID,index_x,index_y,Long_for_x,Long_for_y,Date_ouv_x,Date_ouv_y,ID_date_x,ID_date_y,Societe_x,Societe_y
33,index,F10,33.0,0.0,7.0,,2010-03-15,,2010-F10,,SITEREM,
34,index,F11,34.0,1.0,6.0,,2010-03-16,,2010-F11,,SITEREM,
36,index,F13,36.0,3.0,8.4,,2010-03-16,,2010-F13,,SITEREM,
38,index,F23,38.0,10.0,6.8,,2010-03-18,,2010-F23,,SBS Environnement,
39,index,F24,39.0,11.0,6.8,,2010-03-18,,2010-F24,,SBS Environnement,
54,index,F41,54.0,13.0,6.8,,2010-03-12,,2010-F41,,SBS Environnement,
97,"Long_for, index",F100,97.0,14.0,5.5,5.905,2010-02-11,,2010-F100,,SBS Environnement,
98,"Long_for, index",F101,98.0,15.0,2.0,5.775,2010-02-11,,2010-F101,,SBS Environnement,
99,"Long_for, index",F102,99.0,16.0,6.9,5.945,2010-02-23,,2010-F102,,SBS Environnement,
100,"Long_for, index",F103,100.0,17.0,8.1,5.725,2010-02-23,,2010-F103,,SBS Environnement,


In [33]:
# Final look at the object dataset before saving (first rows only).
dataframe_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 235, columns : 21, Unique col 'ID': 235


interactive(children=(IntSlider(value=3, description='rows', max=235, min=3, readout=False), IntSlider(value=1…

####  $\color{red}{\textbf{Save final object dataset}}$

In [34]:
# Remove the helper 'index' column, if there is one, so that it is not written
# to the output file.
dataset.drop(columns='index', errors='ignore', inplace=True)

In [35]:
# Create the output folder when missing; exist_ok=True replaces the separate
# existence check and cannot fail if the folder appears in between.
os.makedirs(save_dir, exist_ok=True)

# index=False keeps the DataFrame's row index out of the CSV file.
dataset.to_csv(os.path.join(save_dir, save_file), index=False)