# Preparation of the drinking water quality data

In [2]:
# Imports
from PIL import Image
import pandas as pd
import numpy as np
import os,sys

In [117]:
# Read raw files
raw_path = "./raw_data/terviseamet"
raw_files = os.listdir(raw_path)

original_columns = pd.read_csv(os.path.join(raw_path, raw_files[0]))

raw_filelist = []
for f in raw_files:
    df = pd.read_csv(os.path.join(raw_path, f))

    # select columns and translate into english
    col_map = {
        'id': 'test_id',
        'veevark_id': 'station_id', 
        'veeliik': 'water_type', 
        'proovivotu_aeg': 'date',
        'proovivotu_metoodika': 'test_method', 
        'id3': 'indicator_id',
        'nimetus4': 'indicator_name',
        'sisaldus': 'value',
        'yhik': 'unit',
        'hinnang5': 'assessment'
        }
    df = df[col_map.keys()]
    df.rename(columns=col_map, inplace=True)
    
    # Add year variable
    df['date'] = pd.to_datetime(df['date'])
    df['year'] = df.date.dt.year

    # Add df to list for concatenation later
    raw_filelist.append(df)

long_df = pd.concat(raw_filelist, ignore_index=True)
long_df

  df = pd.read_csv(os.path.join(raw_path, f))
  df = pd.read_csv(os.path.join(raw_path, f))
  df = pd.read_csv(os.path.join(raw_path, f))
  df = pd.read_csv(os.path.join(raw_path, f))


Unnamed: 0,test_id,station_id,water_type,date,test_method,indicator_id,indicator_name,value,unit,assessment,year
0,276444,302,Joogivesi,2022-05-07,,241,Maitse (lahjendusaste),1,lahjendusaste,vastab nõuetele,2022
1,276444,302,Joogivesi,2022-05-07,,112,Lõhn (lahjendusaste),1,lahjendusaste,vastab nõuetele,2022
2,276444,302,Joogivesi,2022-05-07,,131,Värvus (Pt/Co skaala),3,mg/l Pt,vastab nõuetele,2022
3,276444,302,Joogivesi,2022-05-07,,16,pH,7.7,pH ühik,vastab nõuetele,2022
4,276444,302,Joogivesi,2022-05-07,,51,Elektrijuhtivus,630,μS/cm,vastab nõuetele,2022
...,...,...,...,...,...,...,...,...,...,...,...
504185,173704,306,Joogivesi,2018-01-01,,131,Värvus (Pt/Co skaala),3,mg/l Pt,vastab nõuetele,2018
504186,173704,306,Joogivesi,2018-01-01,,16,pH,7.22,pH ühik,vastab nõuetele,2018
504187,173878,306,Joogivesi,2018-01-01,,7,Clostridium perfringens (koos eostega),0,PMÜ/100ml,vastab nõuetele,2018
504188,173878,306,Joogivesi,2018-01-01,,4,Escherichia coli,0,PMÜ/100 ml,vastab nõuetele,2018


In [118]:
# How many tests per year per station_id
long_df.groupby(by=['station_id', 'year']).test_id.unique()

station_id  year
162         2014                                [80238, 80290, 61616]
            2015                              [106003, 106041, 98823]
            2016                             [133003, 131795, 122653]
            2017                                     [137400, 137538]
            2018    [182057, 182240, 177174, 177336, 168987, 16925...
                                          ...                        
2604        2022                                     [268222, 268532]
2643        2022                                             [274479]
2676        2022                                     [274169, 274168]
2678        2021                             [256134, 256136, 256138]
2679        2022                                             [275717]
Name: test_id, Length: 9237, dtype: object

In [119]:
long_df.value.dtype

dtype('O')

# What indicators to keep

In [120]:
# Keep only 2017-2021
period_df = long_df[(long_df.year>=2018) & (long_df.year < 2022)]

In [138]:
# Which indicators exist for all years
inds_per_year_df = period_df.groupby(by=['indicator_name', 'year']).size().sort_values(ascending=False).to_frame(name='size')
inds_per_year_wide = inds_per_year_df.pivot_table(
    index = ['indicator_name'],
    columns = 'year',
    values = 'size'
)

# Indicators that have more than n instances for all years
inds_that_exist_all_years = inds_per_year_wide.dropna()
for y in period_df.year.unique():
    inds_that_exist_all_years = inds_that_exist_all_years[inds_that_exist_all_years[y]>=1000]
indicators_to_keep = inds_that_exist_all_years.index
indicators_to_keep

Index(['Ammoonium', 'Coli-laadsed bakterid', 'Elektrijuhtivus',
       'Escherichia coli', 'Hägusus (NTU)', 'Kolooniate arv 22 °C',
       'Lõhn (lahjendusaste)', 'Maitse (lahjendusaste)', 'Raud',
       'Värvus (Pt/Co skaala)', 'pH'],
      dtype='object', name='indicator_name')

In [139]:
# Dataframe with only top indicators
rows_to_keep = [iname in indicators_to_keep for iname in period_df.indicator_name]
top_ind_df = period_df[rows_to_keep] 
top_ind_df

Unnamed: 0,test_id,station_id,water_type,date,test_method,indicator_id,indicator_name,value,unit,assessment,year
13627,212401,1724,Joogivesi,2019-12-31 00:00:00,,40,Kolooniate arv 22 °C,87,PMÜ/1 ml,vastab nõuetele,2019
13628,208314,306,Joogivesi,2019-12-30 08:30:00,"EVS-ISO 5667-5, EVS-EN ISO 19458",112,Lõhn (lahjendusaste),1,lahjendusaste,vastab nõuetele,2019
13629,208314,306,Joogivesi,2019-12-30 08:30:00,"EVS-ISO 5667-5, EVS-EN ISO 19458",56,Hägusus (NTU),1.0,NTU,vastab nõuetele,2019
13630,208314,306,Joogivesi,2019-12-30 08:30:00,"EVS-ISO 5667-5, EVS-EN ISO 19458",1,Raud,18,µg/l,vastab nõuetele,2019
13632,208314,306,Joogivesi,2019-12-30 08:30:00,"EVS-ISO 5667-5, EVS-EN ISO 19458",131,Värvus (Pt/Co skaala),2,mg/l Pt,vastab nõuetele,2019
...,...,...,...,...,...,...,...,...,...,...,...
504184,173704,306,Joogivesi,2018-01-01 00:00:00,,57,Coli-laadsed bakterid,0,PMÜ/100 ml,vastab nõuetele,2018
504185,173704,306,Joogivesi,2018-01-01 00:00:00,,131,Värvus (Pt/Co skaala),3,mg/l Pt,vastab nõuetele,2018
504186,173704,306,Joogivesi,2018-01-01 00:00:00,,16,pH,7.22,pH ühik,vastab nõuetele,2018
504188,173878,306,Joogivesi,2018-01-01 00:00:00,,4,Escherichia coli,0,PMÜ/100 ml,vastab nõuetele,2018


In [140]:
# Pivot table for test & indicator
single_test_wide = top_ind_df[['test_id', 'indicator_name', 'value']].pivot_table(
    index = 'test_id',
    columns = 'indicator_name',
    values = 'value'
)

  single_test_wide = top_ind_df[['test_id', 'indicator_name', 'value']].pivot_table(


In [141]:
# How many measurements per station per year
ind_by_year = top_ind_df.groupby(by=['indicator_name', 'year']).size()

In [142]:
# Value column to numeric and errors to NaN
top_ind_df['value'] = pd.to_numeric(top_ind_df['value'], errors='coerce')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_ind_df['value'] = pd.to_numeric(top_ind_df['value'], errors='coerce')


# Wide data

In [146]:
# indicator_name + year variable

In [147]:
# Keep only relevant variables
vars_for_wide = ['station_id', 'indicator_name', 'value', 'year']
wide_df = top_ind_df[vars_for_wide].pivot_table(
    index = ['station_id', 'year'],
    columns = 'indicator_name',
    values = 'value',
    aggfunc = np.nanmax
)

In [148]:
# Keep only rows that have all the measurement values
cleaned_df = wide_df.dropna()
cleaned_df

Unnamed: 0_level_0,indicator_name,Ammoonium,Coli-laadsed bakterid,Elektrijuhtivus,Escherichia coli,Hägusus (NTU),Kolooniate arv 22 °C,Lõhn (lahjendusaste),Maitse (lahjendusaste),Raud,Värvus (Pt/Co skaala),pH
station_id,year,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
162,2018,0.12,0.0,301.0,0.0,1.00,9.0,1.0,1.0,56.0,6.0,7.50
163,2018,0.08,17.0,821.0,0.0,1.00,13.0,3.0,3.0,131.3,14.9,8.07
163,2019,0.08,0.0,716.0,0.0,1.18,12.0,1.0,1.0,109.4,11.7,8.17
163,2020,0.08,0.0,671.0,0.0,1.90,12.0,1.0,1.0,98.0,12.1,8.12
163,2021,0.08,0.0,626.0,0.0,1.24,136.0,1.0,1.0,130.0,10.4,8.15
...,...,...,...,...,...,...,...,...,...,...,...,...
2314,2021,0.07,0.0,300.0,0.0,1.00,0.0,1.0,1.0,27.0,0.0,7.40
2398,2021,0.08,0.0,273.0,0.0,0.50,4.0,1.0,1.0,94.1,8.4,7.59
2415,2021,0.05,0.0,447.0,0.0,1.00,12.0,1.0,1.0,20.0,0.0,7.60
2426,2021,0.05,4.0,503.0,0.0,1.00,31.0,1.0,1.0,20.0,0.0,7.30


In [149]:
# How many stations have measurements for all years
clean_obs_per_station = cleaned_df.groupby(by='station_id').size().sort_values(ascending=False)
clean_obs_per_station

station_id
291     4
320     4
442     4
345     4
342     4
       ..
653     1
656     1
663     1
676     1
2459    1
Length: 409, dtype: int64

In [150]:
indicators_to_keep.keys()

indicator_name_map = {
'Hägusus (NTU)' : 'turbidity', 
'Escherichia coli': 'escherichia_coli', 
'Coli-laadsed bakterid': 'coli-type_bacteria',
'Värvus (Pt/Co skaala)': 'color',
'Raud': 'iron',
'Maitse (lahjendusaste)': 'taste',
'Lõhn (lahjendusaste)': 'smell', 
'pH': 'ph', 
'Elektrijuhtivus': 'conductivity', 
'Ammoonium': 'ammonia', 
'Lõhn (pallides)': 'smell2', 
'Kolooniate arv 22 °C': 'number_of_colonies', 
'Enterokokid': 'enterococci', 
'Coli-laadsed bakterid (Colilert)': 'coli-type_bacteria_colilert', 
'Escherichia coli (Colilert)': 'escherichia-coli_colilert',
'Mangaan': 'manganese', 
'Clostridium perfringens (koos eostega)': 'clostridium_perfringens', 
'Oksüdeeritavus': 'oxidability'
}


AttributeError: 'Index' object has no attribute 'keys'