# DATA ORGANIZATION

In [1]:
%matplotlib widget

In [2]:
from utils.io import gen_id_dated, gdf_viewer, gdf_geom, gdf_merger
import re, os
import numpy as np
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point
import datetime
import matplotlib.pyplot as plt

Data format (excel files)

In [3]:
def na_line_drop(data, col_n=3):
    data['line_na']=False
    for i in range(len(data)):
        verif=True
        for j in data.columns.to_list()[col_n:-1]:
            if not pd.isnull(data.loc[i,j]): verif=False

        data.loc[i,'line_na']=verif

    print(f"lines dropped: {list(data.query('line_na==True').index)}")
    data=data.query('line_na==False')
    data.reset_index(drop=True, inplace=True)
    data.drop('line_na', axis=1, inplace=True)
    
    return data

In [4]:
# drop columns if not enough data
def na_col_drop(data, crit=10, drop=True, verbose=False):
    """
    delete NaN columns in the dataframe based on a minimum number of non-NaN values 
    """

    drop_cols=[]
    if verbose: print('Non-NaN values\n----------------')
    for c in data.columns:
        v=len(data.iloc[:,0])-data[c].isnull().sum()
        if verbose: print(f'{c}: {v} ; {data[c].isnull().sum()}')
        if v<crit:
            drop_cols.append(c)

    if drop:
        print(f'\nwill be dropped :{drop_cols}')
        data.drop(drop_cols, axis=1, inplace=True)

    return data

In [5]:
def col_ren(data, line_to_col=1, mode=0, name=[]):
    """
    mode: int
        set 0 to rename columns with a line, set 1 if provide name list or dict
    """
    new_name={}
    
    if mode!=0 and mode!=1:
        print("Error! Parameter \'Mode\' must be 0 or 1 (if 1, colums length must be equal to name length)")
        
    elif mode==0:
        for i in data.columns:
            col = str(data.iloc[line_to_col, i])
            if re.search('nan',col, flags=re.IGNORECASE):
                new_name.update({i:f'col_{i}'})
            else:
                new_name.update({i:col})
        
        data.drop([line_to_col], axis=0, inplace=True)
        data.reset_index(drop=True, inplace=True)

    elif mode==1 and len(name)==len(data.columns):
        for i in range(len(data.columns)):
            new_name.update({data.columns[i]:name[i]})
    
    elif mode==1 and len(name)!=len(data.columns):
        print('Error! names list length and columns length are not the same')
    
    data.rename(columns=new_name, inplace=True)
    
    return data 

In [6]:
def dble_col_drop(data):
    twins=[]
    idx_drop=[]
    for c in range(len(data.columns)):
        if data.columns[c] not in twins: twins.append(data.columns[c])
        else: idx_drop.append(c)

    data.drop(columns=data.columns[idx_drop], axis=1, inplace=True)
    print('column(s) dropped: ', idx_drop)
    
    return data

In [7]:
pol_fields=[]

## Retrieve common fields names for pollutant analysis

In [9]:
tmp_dir='../../CF_data/synthese/Result_traitem/Phase_1_Memoris/'
sheet='Result_sol'

In [10]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 1/'
                   'Résultats phase 1_MEMORIS.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
gdf_viewer(df, rows=5)

lines dropped: [30]

will be dropped :['Unnamed: 0', 'Unnamed: 2']
Rows : 135, columns : 35


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=135, min=5, readout=False), IntSlider(value=1…

In [11]:
prv_sol=df.loc[:35]
an=df.loc[36:]

In [12]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [13]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [14]:
an=col_ren(an, 1)

In [15]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [16]:
for c in an.columns:
    if c not in pol_fields:
        pol_fields.append(c)

In [18]:
tmp_dir='../../CF_data/synthese/Result_traitem/Phase_1_Memoris/'
sheet='Result_eau'

In [19]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 1/'
                   'Résultats phase 1_MEMORIS.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
gdf_viewer(df, rows=5)

lines dropped: [130]

will be dropped :['Unnamed: 0', 'Unnamed: 2']
Rows : 136, columns : 23


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=136, min=5, readout=False), IntSlider(value=1…

In [20]:
prv_eau=df.loc[:32]
an=df.loc[33:]

In [21]:
an.loc[0.5] = df.loc[0] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [22]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [23]:
an=col_ren(an, 1)

In [24]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [25]:
for c in an.columns:
    if c not in pol_fields:
        pol_fields.append(c)

In [29]:
tmp_dir='../../CF_data/synthese/Result_traitem/Phase_2_Memoris/'
sheet='Result_SOL'

In [30]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'Résultats phase 2_MEMORIS.xls', sheet_name='Résult SOL', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
gdf_viewer(df, rows=5)

lines dropped: [30]

will be dropped :['Unnamed: 0', 'Unnamed: 2']
Rows : 135, columns : 31


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


interactive(children=(IntSlider(value=5, description='rows', max=135, min=5, readout=False), IntSlider(value=1…

In [31]:
prv_sol=df.loc[:35]
an=df.loc[36:]

In [32]:
an.loc[0.5] = df.loc[1] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [33]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [34]:
an=col_ren(an, 1)

In [35]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [36]:
for c in an.columns:
    if c not in pol_fields:
        pol_fields.append(c)

In [38]:
tmp_dir='../../CF_data/synthese/Result_traitem/Phase_2_Memoris/'
sheet='Result_eau'

In [39]:
df = pd.read_excel('../../CF_data/Data_UMONS/Contamination/Résultats et mesures Siterem/résultats phase 2/'
                   'Résultats phase 2_MEMORIS.xls', sheet_name='Résult EAU', skiprows=4)
df=na_line_drop(df,0)
df=na_col_drop(df,1)
gdf_viewer(df, rows=5)

lines dropped: []

will be dropped :['Unnamed: 0', 'Unnamed: 2']
Rows : 138, columns : 17


interactive(children=(IntSlider(value=5, description='rows', max=138, min=5, readout=False), IntSlider(value=1…

In [40]:
prv_eau=df.loc[:32]
an=df.loc[33:]

In [41]:
an.loc[0.5] = df.loc[1] # put data on first line
an = an.sort_index().reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value, self.name)


In [42]:
an=an.transpose()
an.reset_index(drop=True, inplace=True)

In [43]:
an=col_ren(an, 1)

In [44]:
an.replace(r'<|>','', inplace=True, regex=True)
an.replace(r'-',np.nan, inplace=True, regex=True)
an.rename(columns={an.columns[0]:'ID_ech'}, inplace=True)

In [45]:
an.drop(list(range(5)), axis=0, inplace=True)
an.reset_index(drop=True, inplace=True)

In [46]:
for c in an.columns:
    if c not in pol_fields:
        pol_fields.append(c)

--------------------------------------------------------------------------------------------------------

In [54]:
print(len(pol_fields),"\n", pol_fields)

128 
 ['ID_ech', 'METAUX LOURDS', 'Arsenic', 'Cadmium', 'Chrome', 'Chrome VI', 'Cuivre', 'Mercure', 'Plomb', 'Nickel', 'Zinc', 'CYANURES', 'Cyanure (libre)', 'Cyanure (totaux)', 'cyanure (APE)', 'cyanure complex', 'thiocyanate', 'COMPOSES AROMATIQUES VOLATILS', 'Benzène', 'Toluène', 'Éthylbenzène', 'Orthoxylène', 'Para- et métaxylène', 'Xylènes', 'Styrène', 'BTEX totaux', 'PHENOLS', 'Phénol', 'Indice phénol', 'HYDROCARBURES AROMATIQUES POLYCYCLIQUES', 'Naphtalène', 'Acénaphtylène', 'Acénaphtène', 'Fluorène', 'col_34', 'Anthracène', 'Fluoranthène', 'Pyrène', 'Benzo(a)anthracène', 'Chrysène', 'Benzo(b)fluoranthène', 'Benzo(k)fluoranthène', 'Benzo(a)pyrène', 'Dibenzo(ah)anthracène', 'Benzo(ghi)pérylène', 'Indéno(1,2,3-cd)pyrène', 'HAP Totaux (16) - EPA', 'COMPOSES ORGANOHALOGENES VOLATILS', '1,1-Dichloroéthane', '1,2-Dichloroéthane', '1,1-dichloroéthène', 'Cis-1,2-dichloroéthène', 'Trans 1,2-dichloroéthylène', 'Dichlorométhane', 'Totaux (cis,trans) 1,2-dichloroéthènes', '1,2-dichloropropa

In [77]:
pol_old_name=['Arsenic','Cadmium','Chrome','Chrome VI','Cuivre','Mercure','Plomb','Nickel','Zinc','Cyanure (libre)',
'Cyanure (totaux)','cyanure (APE)','cyanure complex','thiocyanate','Benzène','Toluène','Éthylbenzène',
'Orthoxylène','Para- et métaxylène','Xylènes','Styrène','BTEX totaux','Phénol',
'Indice phénol','Naphtalène','Acénaphtylène','Acénaphtène','Fluorène','Phénanthrène','Anthracène',
'Fluoranthène','Pyrène','Benzo(a)anthracène','Chrysène','Benzo(b)fluoranthène','Benzo(k)fluoranthène',
'Benzo(a)pyrène','Dibenzo(ah)anthracène','Benzo(ghi)pérylène','Indéno(1,2,3-cd)pyrène',
'HAP Totaux (16) - EPA','1,1-Dichloroéthane','1,2-Dichloroéthane','1,1-dichloroéthène',
'Cis-1,2-dichloroéthène','Trans 1,2-dichloroéthylène','Dichlorométhane',
'Totaux (cis,trans) 1,2-dichloroéthènes','1,2-dichloropropane','Tétrachloroéthylène',
'Tétrachlorométhane','1,1,1-Trichloroéthane','1,1,2-Trichloroéthane','Trichloroéthylène','Chloroforme',
'Chlorure de vinyle','EOX (****)','fraction aromat. >C6-C7','fraction aromat. >C7-C8',
'fraction aromat. >C8-C10','fraction aliphat. C5-C6','fraction aliphat. >C6-C8',
'fraction aliphat. >C8-C10','Fraction C5 - C8','Fraction C8 - C10','Fraction C10-C12',
'Fraction C12-C16','Fraction C16 - C21','Fraction C21 - C35','Fraction C35 - C40',
'Hydrocarbures totaux C10-C35','Hydrocarbures totaux C10-C40','MTBE','PCB 28','PCB 52','PCB 101',
'PCB 118','PCB 138','PCB 153','PCB 180','PCB totaux (7)','Chlorures','Soufre Total', 'sulfites','sulfate']

In [78]:
pol_new_name=['As','Cd','Cr','Cr_VI','Cu','Hg','Pb','Ni','Zn','CN_libre','CN_tot','CN_APE','CN_comp',
'thioCN','Bnz','Toln','EthylBnz','O-Xyl','P-M-Xyl','Xyl','Styr','BTEX_tot','Phenol','Idc_Phenol','Naphta',
'Acenaphtyl','Acenaphtn','Fluorene','Phenanthr','Anthrc','Flranth','Pyr','Bnz(a)anthrc','Chrys','Bnz(b)flranth',
'Bnz(k)flranth','Bnz(a)pyr','Dibnz(ah)anthrc','Bnz(ghi)peryl','Indeno(1,2,3-cd)pyr','HAP_tot_EPA',
'1,1-DCE','1,2-DCE','1,1-DCEn','Cis-1,2-DCEn','Trans 1,2-DCEyl','DCM','(cis,trans) 1,2-DCE_tot',
'1,2-DCP','TetraCEyn','TCM','1,1,1-TCE','1,1,2-TCE','TCEyn','Chloroforme','CVinyl','EOX',
'Arom_C6C7','Arom_C7C8','Arom_C8C10','Aliphat_C5C6','Aliphat_C6C8','Aliphat_C8C10','Fract_C5C8','Fract_C8C10',
'Fract_C10C12','Fract_C12C16','Fract_C16C21','Fract_C21C35','Fract_C35C40','HC_tot_C10C35','HC_tot_C10C40','MTBE','PCB_28',
'PCB_52','PCB_101','PCB_118','PCB_138','PCB_153','PCB_180','PCB_tot','Cl','S_tot', 'sulfites','sulfate']

In [79]:
pol_field_model={}
if len(pol_old_name)==len(pol_new_name):
    for i,j in zip(pol_old_name, pol_new_name):
        pol_field_model.update({i:j})
    
else:
    print('Error! length of lists provided are not the same')

In [80]:
print(pol_field_model)

{'Arsenic': 'As', 'Cadmium': 'Cd', 'Chrome': 'Cr', 'Chrome VI': 'Cr_VI', 'Cuivre': 'Cu', 'Mercure': 'Hg', 'Plomb': 'Pb', 'Nickel': 'Ni', 'Zinc': 'Zn', 'Cyanure (libre)': 'CN_libre', 'Cyanure (totaux)': 'CN_tot', 'cyanure (APE)': 'CN_APE', 'cyanure complex': 'CN_comp', 'thiocyanate': 'thioCN', 'Benzène': 'Bnz', 'Toluène': 'Toln', 'Éthylbenzène': 'EthylBnz', 'Orthoxylène': 'O-Xyl', 'Para- et métaxylène': 'P-M-Xyl', 'Xylènes': 'Xyl', 'Styrène': 'Styr', 'BTEX totaux': 'BTEX_tot', 'Phénol': 'Phenol', 'Indice phénol': 'Idc_Phenol', 'Naphtalène': 'Naphta', 'Acénaphtylène': 'Acenaphtyl', 'Acénaphtène': 'Acenaphtn', 'Fluorène': 'Fluorene', 'Phénanthrène': 'Phenanthr', 'Anthracène': 'Anthrc', 'Fluoranthène': 'Flranth', 'Pyrène': 'Pyr', 'Benzo(a)anthracène': 'Bnz(a)anthrc', 'Chrysène': 'Chrys', 'Benzo(b)fluoranthène': 'Bnz(b)flranth', 'Benzo(k)fluoranthène': 'Bnz(k)flranth', 'Benzo(a)pyrène': 'Bnz(a)pyr', 'Dibenzo(ah)anthracène': 'Dibnz(ah)anthrc', 'Benzo(ghi)pérylène': 'Bnz(ghi)peryl', 'Indéno(1

In [81]:
an

Unnamed: 0,ID_ech,Type_ech,Arsenic,Cadmium,Chrome,Cuivre,Mercure,Plomb,Nickel,Zinc,...,fraction aliphat. >C8-C10,Fraction C5 - C8,Fraction C8 - C10,Fraction C10-C12,Fraction C12-C16,Fraction C16 - C21,Fraction C21 - C35,Hydrocarbures totaux C10-C35,MTBE,Chlorures
0,P18c,Eau,2.8,0.25,2.5,5.0,0.05,2.5,5.0,20.0,...,75,160,440,2300,4600,180,10,7100,20.0,72.0
1,P19,Eau,4.1,0.25,2.5,5.0,0.05,2.5,5.0,20.0,...,3,36,60,200,1100,55,10,1400,2.0,98.0
2,P20,Eau,1.9,0.25,2.5,5.0,0.05,2.5,5.0,20.0,...,3,3,10,10,5,10,10,50,20.0,110.0
3,P21,Eau,1.8,0.25,2.5,5.0,0.05,2.5,5.0,20.0,...,3,3,10,20,55,20,10,95,2.0,110.0
4,P22,Eau,2.9,0.25,2.5,5.0,0.05,2.5,5.0,20.0,...,3,3,10,10,5,10,10,50,2.0,110.0
5,P23,Eau,1.0,0.25,2.5,5.0,0.05,2.5,5.0,20.0,...,3,26,96,110,770,160,70,1100,2.0,100.0
6,eau forage P23,Eau,,,,,,,,,...,75,110,680,5800,4100,210,25,10000,,
7,P24b,Eau,3.2,0.25,2.5,5.0,0.05,2.5,5.0,20.0,...,3,3,10,10,5,10,10,50,2.0,98.0
8,P25,Eau,13.0,0.25,2.5,5.0,0.05,2.5,5.0,20.0,...,3,3,10,10,5,10,10,50,2.0,100.0
9,P26,Eau,3.5,0.25,2.5,5.0,0.05,2.5,5.0,20.0,...,3,3,10,10,5,10,10,50,2.0,110.0
