# Data Gathering

In [1]:
from utils.io import dataframe_viewer, files_search, data_merger, data_validation, data_overview, \
data_filter, fix_duplicates

import re, os
import numpy as np
import pandas as pd
import datetime as dtm
from definitions import ROOT_DIR

In [2]:
def create_df(files, verbose=True): # TODO: find another name for this function
    """
    Read each CSV file into a DataFrame and optionally report whether it
    carries position information.

    Parameters
    ----------
    files : list of str
        Paths of the comma-delimited files to read.
    verbose : bool, default True
        If True, print one line per file, labelled ``df1``, ``df2``, ... in
        input order, stating whether the frame has an 'X' column.

    Returns
    -------
    list of pandas.DataFrame
        One DataFrame per input file, in the same order as ``files``.
    """
    dfs = []
    # Number the frames from 1 so each report line identifies its own file
    # (the label used to be hard-coded to "df1" for every file).
    for i, f in enumerate(files, start=1):
        df = pd.read_csv(f, delimiter=',')
        dfs.append(df)

        if verbose:
            # Presence of an 'X' column is the test for coordinates.
            msg = ' --> Coordinates' if 'X' in df.columns else ' --> No coordinates'
            print(f"df{i} : {msg}")

    return dfs

## Reading files

In [3]:
# work_dir: folder that is searched for input files.
# save_dir: folder where the merged CSV is written.
work_dir = f"{ROOT_DIR}/CF_data/Result_traitem/"
save_dir = f"{ROOT_DIR}/CF_data/Donnees_fusionnees/"

In [4]:
# Lookup table of object categories, every entry initialised to 0.
# The key names matter (original note: "Keynames !!!") -- presumably
# files_search matches them against file names; verify in utils.io.
files_dict = dict.fromkeys(
    [
        'Borehole', 'Piezometer', 'Piezair', 'Trench', 'Litho',
        'Equipm', 'Measure', 'Sample', 'Analysis', 'Facility',
    ],
    0,
)

In [5]:
# Search work_dir and report the number of files per category.
# NOTE(review): files_dict appears to be filled in place (later cells read
# files_dict[key] as a list of paths) -- confirm in utils.io.
files_search(
    work_dir,
    files_dict,
    prefix='',
    skip='source',
)

Borehole  	:  7
Piezometer  	:  17
Piezair  	:  2
Trench  	:  1
Litho  	:  7
Equipm  	:  3
Measure  	:  6
Sample  	:  27
Analysis  	:  21
Facility  	:  4


In [6]:
# Merge strategies, picked by position when calling data_merger
# (e.g. how[1] is 'outer').
how = [
    'inner',
    'outer',
    'left',
    'right',
]

In [7]:
# One-letter aliases for the booleans, used as flag values (e.g. view=t).
f, t = False, True

# ================== PROCESSING ===================== 

# Equipments

In [8]:
# Settings for the 'Equipm' category.
key = 'Equipm'
save_file = 'Merged_Equipments.csv'  # name of the merged output file

# Columns of interest.
coi = [
    'ID', 'ID_date', 'X', 'Y', 'Z',
    'Type', 'Long_for', 'Diam_for', 'Refus',
]

# Start from an empty frame; it is replaced once the first merge is done.
dataset = pd.DataFrame()
print(f"{len(files_dict[key])} files")

3 files


In [9]:
# Display the file paths stored under the current key.
files_dict[key]

['/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/database_Memoris3/Drains_Pz_ENEL_Equipment.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/donnees_terrain_2019/Equipement_Equipment.csv',
 '/home/yanathan/Projects/GSDMA/CF_data/Result_traitem/profils_sols_donnees_forages/Equipement_Equipments.csv']

In [10]:
# Summarise the files of this key: the printed report lists identical files
# and which files (by position in the list) have or lack coordinates.
data_overview(files_dict[key])

Same files:[]
Files with coordinates:[]
Files without coordinates:[0, 1, 2]


#### $\color{green}{\textbf{Read and merge}}$

In [11]:
# Only two of the files listed for this key are read here; the trailing
# numbers are their positions in files_dict[key].
file1 = work_dir + 'donnees_terrain_2019/Equipement_Equipment.csv'  # 1
file2 = work_dir + 'profils_sols_donnees_forages/Equipement_Equipments.csv'  # 2

df1, df2 = create_df([file1, file2])

# Call the viewer once per frame as separate statements. Writing both calls
# as a single tuple expression made the cell display a spurious
# "(None, None)" as its result.
dataframe_viewer(df1, rows=3, un_val='ID', view=t)
dataframe_viewer(df2, rows=3, un_val='ID', view=t)

df1 :  --> No coordinates
df1 :  --> No coordinates
Rows : 33, columns : 7, Unique values on col 'ID': 9


interactive(children=(IntSlider(value=3, description='rows', max=33, min=3, readout=False), IntSlider(value=7,…

Rows : 36, columns : 7, Unique values on col 'ID': 12


interactive(children=(IntSlider(value=3, description='rows', max=36, min=3, readout=False), IntSlider(value=7,…

(None, None)

In [12]:
# Cast the 'ID' column to str in both frames so the merge key has the same
# type on each side.
df1['ID'] = df1['ID'].astype(str)
df2['ID'] = df2['ID'].astype(str)

In [13]:
# Merge the two frames on 'ID' using the 'outer' strategy (how[1]).
# NOTE(review): the meaning of dist_max=1 is defined by data_merger -- confirm.
mdf, conflict_df = data_merger(
    df1,
    df2,
    how=how[1],
    on='ID',
    dist_max=1,
)

#### Save the first object dataset

In [14]:
dataset = mdf.copy() # keep an independent copy of the merged frame as the working dataset

In [15]:
# Inspect the merged dataset; the report counts rows, columns and unique 'ID' values.
dataframe_viewer(dataset, rows=3, un_val='ID', view=t)

Rows : 69, columns : 7, Unique values on col 'ID': 21


interactive(children=(IntSlider(value=3, description='rows', max=69, min=3, readout=False), IntSlider(value=7,…

####  $\color{red}{\textbf{Save final object dataset}}$

In [16]:
# Remove a leftover 'index' column if there is one. errors='ignore' makes the
# drop a no-op when the column is absent, and rebinding (instead of
# inplace=True) keeps the cell idempotent on re-run.
dataset = dataset.drop(columns='index', errors='ignore')

In [17]:
# Create the output folder if needed. exist_ok=True replaces the separate
# existence check and avoids failing if the folder appears in between.
os.makedirs(save_dir, exist_ok=True)

# Write the merged dataset as CSV, without the DataFrame index.
dataset.to_csv(os.path.join(save_dir, save_file), index=False)