# Information regarding good/bad water quality

In [1]:
import pandas as pd
import numpy as np
# import warnings
# warnings.filterwarnings('ignore')

Alkalinity (total alkalinity) - Ideal Range: 20 to 200 mg/L -> milligrams per liter (ppm)  
EC (electrical conductance) - Ideal Range: < 800 µS/cm -> microsiemens per centimeter  
DRP (dissolved reactive phosphorus) - Ideal Range: < 100 µg/L -> micrograms per liter

In [2]:
# Load the water-quality samples from the CSV file, then display the summary
# statistics of the numeric columns rounded to whole numbers.
# NOTE(review): the summary shows a maximum of 65535 for nir, green and swir16,
# which looks like a saturated 16-bit value rather than a real reading — confirm.
df = pd.read_csv('data/wq.csv')
df.describe().round(decimals=0).astype(int)

Unnamed: 0,latitude,longitude,nir,green,swir16,swir22,ndmi,mndwi,pet,total alkalinity,electrical conductance,dissolved reactive phosphorus,sample_year,pop_density_nn,distance_km_to_pd_cell,river_mouthORjunction,river_mouth,river_junction
count,9093,9093,9093,9093,9093,9093,9093,9093,9093,9093,9093,9093,9093,9093,9093,9093,9093,9093
mean,-28,27,14076,9954,13603,11415,0,0,175,119,488,44,2013,102,0,0,0,0
std,3,3,2781,2636,3155,2403,0,0,29,75,344,51,1,455,0,0,0,0
min,-34,18,3992,4045,3672,3634,0,0,53,5,15,5,2011,0,0,0,0,0
25%,-30,26,12989,9432,12100,10050,0,0,156,55,207,10,2012,2,0,0,0,0
50%,-28,27,14183,9801,13704,11265,0,0,172,113,406,20,2013,10,0,0,0,0
75%,-27,29,15306,10202,15196,12650,0,0,193,171,696,50,2014,54,0,0,0,0
max,-22,32,65535,65535,65535,31202,1,1,271,362,1506,195,2015,6180,1,1,1,1


In [3]:
# Ideal-range thresholds used to label each measurement.
DRP_MAX = 100                  # dissolved reactive phosphorus, ug/L
EC_MAX = 800                   # electrical conductance, uS/cm
ALKALINITY_RANGE = (20, 200)   # total alkalinity, mg/L

# Flag every sample per parameter: 1 = inside the ideal range, 0 = outside.
# The comparisons are vectorised over the whole column instead of looping in
# Python. A missing (NaN) measurement compares False and is therefore flagged
# 0, so the null check at the end of this cell matters for the labels.
df['DRP'] = (df['dissolved reactive phosphorus'] < DRP_MAX).astype('int64')
df['EC'] = (df['electrical conductance'] < EC_MAX).astype('int64')

# NOTE(review): both alkalinity bounds are exclusive, so readings of exactly
# 20 or 200 mg/L are flagged 0 — confirm that this is the intended reading of
# "20 to 200".
alk_low, alk_high = ALKALINITY_RANGE
alkalinity = df['total alkalinity']
df['Alkalinity'] = ((alkalinity > alk_low) & (alkalinity < alk_high)).astype('int64')

# check for nulls
df.isnull().sum()

province                         0
country                          0
latitude                         0
longitude                        0
sample date                      0
nir                              0
green                            0
swir16                           0
swir22                           0
ndmi                             0
mndwi                            0
pet                              0
total alkalinity                 0
electrical conductance           0
dissolved reactive phosphorus    0
month                            0
sample_year                      0
pop_density_nn                   0
distance_km_to_pd_cell           0
river_mouthORjunction            0
river_mouth                      0
river_junction                   0
DRP                              0
EC                               0
Alkalinity                       0
dtype: int64

In [4]:
# Split out the two extremes for separate exploration:
#   bad  -> samples where all three flags (DRP, EC, Alkalinity) are 0
#   good -> samples where all three flags are 1
# Samples with a mix of 0 and 1 flags fall into neither subset.
flags = df[['DRP', 'EC', 'Alkalinity']]
bad = df[flags.eq(0).all(axis=1)]
good = df[flags.eq(1).all(axis=1)]

In [5]:
# Overall quality per sample:
#   how_good -> number of parameters inside their ideal range (0 to 3)
#   good     -> 1 only when how_good exceeds 2, i.e. all three flags are set
df = df.assign(
    how_good=lambda frame: frame['Alkalinity'] + frame['DRP'] + frame['EC'],
    good=lambda frame: frame['how_good'].gt(2).astype(int),
)
df.head()

Unnamed: 0,province,country,latitude,longitude,sample date,nir,green,swir16,swir22,ndmi,...,pop_density_nn,distance_km_to_pd_cell,river_mouthORjunction,river_mouth,river_junction,DRP,EC,Alkalinity,how_good,good
0,Mpumalanga,South Africa,-26.861111,28.884722,2011-01-03,17658.5,9550.0,13746.5,10574.0,0.124566,...,5.049022,0.251555,0,0,0,0,1,1,2,0
1,Gauteng,South Africa,-26.45,28.085833,2011-01-03,15210.0,10720.0,17974.0,14201.0,-0.083293,...,23.239988,0.419537,0,0,0,1,1,1,3,1
2,Free State,South Africa,-27.671111,27.236944,2011-01-03,14887.0,10943.0,13522.0,11403.0,0.048048,...,687.465759,0.069958,0,0,0,0,1,1,2,0
3,Free State,South Africa,-27.356667,27.286389,2011-01-03,16828.5,9502.5,12665.5,9643.0,0.141147,...,6.092811,0.232396,0,0,0,0,1,1,2,0
4,Free State,South Africa,-27.010111,26.698083,2011-01-04,12433.5,10433.5,9579.5,8531.5,0.129651,...,77.849716,0.466183,0,0,0,0,1,1,2,0


In [6]:
# Write the labelled DataFrame (measurements plus the DRP, EC, Alkalinity,
# how_good and good columns) to a pickle file.
# Pickle files should only ever be loaded back from a trusted source.
df.to_pickle('data/good_vs_bad.pkl')