In [1]:
import pandas as pd
import numpy as np
import os
import glob
pd.set_option('display.max_columns', None)
import matplotlib.pyplot as plt
import gc
import math
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Capacity Grouping

In [2]:
## Read the plant design-capacity / water-source sheet and bucket every plant by size.
## Bin edges are in MLD; pd.cut builds right-closed intervals: (0, 20], (20, 200], (200, inf).
breaks = [0, 20, 200, np.inf]
wlb = ['small', 'medium', 'large']
wtpc = pd.read_csv('Design Capacity WTP.csv').assign(
    ## 'ng' holds the named size class, 'nng' the raw interval the capacity falls into
    ng=lambda frame: pd.cut(x=frame['Design Capacity (MLD)'], bins=breaks, labels=wlb),
    nng=lambda frame: pd.cut(x=frame['Design Capacity (MLD)'], bins=breaks),
)

## General Data Loading & Cleaning

### Water Quality (WQ) Related Data 

In [5]:
## Load every refined water-quality CSV found in the WQ/Refined folder into one frame.
## The folder is assembled with os.path.join so the notebook also runs outside Windows
## (a backslash-separated literal only resolves as a directory on Windows).
path = os.path.join('WQ', 'Refined')

## glob returns matches in arbitrary order; sorting keeps the row order of `fdb`
## identical between machines and runs.
db_dir = sorted(glob.glob(os.path.join(path, '*.csv')))
if not db_dir:
    ## Without this guard pd.concat fails with an opaque "No objects to concatenate"
    raise FileNotFoundError(f'No CSV files found in {path!r}')

## Joining all csv file within the folder
li = [pd.read_csv(i, index_col=None, header=0) for i in db_dir]
fdb = pd.concat(li, ignore_index=True)

In [6]:
## Tidy the combined WQ frame:
##   * drop the 'Unnamed: 0' column (presumably an index saved by an earlier export - confirm)
##   * parse 'Date' into datetimes
##   * convert every text-typed column except the plant name 'wtp' to numbers
fdb = fdb.drop(columns=['Unnamed: 0'])
fdb['Date'] = pd.to_datetime(fdb['Date'])

is_text = fdb.dtypes == object
cl = fdb.dtypes[is_text].index.to_list()
cl.remove('wtp')
for col in cl:
    ## Missing values become the 9999 sentinel (meant to be filtered out later);
    ## commas are stripped so values such as "1,234" can be parsed as numbers.
    fdb[col] = fdb[col].fillna(9999).map(lambda raw: str(raw).replace(',', ''))
fdb[cl] = fdb[cl].apply(pd.to_numeric)

## `db` is an independent working copy of the cleaned frame
db = fdb.copy()

In [7]:
## Calendar helpers derived from the sample date
db['Month'] = db['Date'].dt.month
db['Quarter'] = db['Date'].dt.quarter

## Get all the relevant columns (Daily average for further processing):
## every column whose name contains "avg", followed by the identifying columns in `l`
l = ['Quarter','wtp','Month','Date']
avg = [name for name in fdb.columns if "avg" in name] + l

## Drop due to data limitation (Only limited number of WTP do it on daily basis)
dc = db[avg].drop(columns=['Aluminium_avg', 'Fluoride_avg'])

## Names of the remaining daily-average measurement columns
avc = [name for name in dc.columns if "avg" in name]

In [15]:
## Final Clean data for further processing
## 1. attach each plant's 'Water Source' and 'Lembangan' from the capacity sheet
## 2. keep only river-fed plants whose 'Lembangan' is 'Sg Tengi'
## 3. keep only rows with pH below 14 - this removes out-of-range readings such as the
##    9999 placeholder, and also rows with a missing pH (NaN < 14 evaluates to False)
dc = (
    dc.merge(wtpc[['wtp','Water Source','Lembangan']], on='wtp', how='left')
      .loc[lambda f: f['Lembangan'].isin(['Sg Tengi']) & (f['Water Source'] == 'River')]
      .loc[lambda f: f['pH_avg'] < 14]
      .copy()
)

## Chemical Dosage

#### Overall DB

In [16]:
## Read every chemical-dosage CSV found in the CC/Chm_DB folder; the frames are
## collected in `mm` and concatenated in the next cell.
## The folder is assembled with os.path.join so the notebook also runs outside Windows
## (a backslash-separated literal only resolves as a directory on Windows).
path = os.path.join('CC', 'Chm_DB')

## glob returns matches in arbitrary order; sorting keeps the row order reproducible.
dcb_dir = sorted(glob.glob(os.path.join(path, '*.csv')))
if not dcb_dir:
    ## Fail here with a clear message instead of an opaque concat error later on
    raise FileNotFoundError(f'No CSV files found in {path!r}')

mm = [pd.read_csv(i, index_col=None, header=0) for i in dcb_dir]

In [17]:
## Stack the chemical-dosage sheets and drop the 'Unnamed: 0' column
chdb = pd.concat(mm, ignore_index=True).drop(columns=['Unnamed: 0'])

## Rows whose 'Date' is the literal text 'MONTHLY' are not dated records
## (presumably monthly summary lines - confirm against the source sheets)
chdb = chdb.loc[chdb['Date'] != 'MONTHLY'].reset_index(drop=True)

## '-2021' is appended to the date text before parsing
## NOTE(review): the year is hard-coded; presumably the sheets carry no year - confirm
chdb['Date'] = pd.to_datetime(chdb['Date'] + '-2021')

## Calendar helpers derived from the parsed date
chdb['Month'] = chdb['Date'].dt.month
chdb['Quarter'] = chdb['Date'].dt.quarter

In [18]:
## Keep the dosage columns whose name contains "avg" plus the identifying columns
FC = chdb.columns
rcol = ['wtp','Date','Month','Quarter']
fFC = [name for name in FC if "avg" in name]
xxdb = chdb[[*fFC, *rcol]].copy()

In [19]:
## Chemical names with the " _avg" / "_avg" suffix stripped, in the same order as the
## "avg" columns of FC (the variant with the leading space is removed first)
new = [name.replace(" _avg", "").replace("_avg", "") for name in FC if "avg" in name]

### Regrouping Each Chemical Product by Its Purpose

In [20]:
## Map every "<chemical>_avg" column to its bare chemical name
xdic = {old: clean for old, clean in zip(fFC, new)}

## Rename the dosage columns, then attach each plant's 'Water Source' and 'Lembangan'
xxdb = (
    xxdb.rename(columns=xdic)
        .merge(wtpc[['wtp','Water Source','Lembangan']], on='wtp', how='left')
)

## Restrict to river-fed plants whose 'Lembangan' is 'Sg Tengi'
x1db = xxdb.loc[(xxdb['Lembangan'] == 'Sg Tengi') & (xxdb['Water Source'] == 'River')].copy()

## Whatever remains after removing the identifying columns are the chemical columns
avc2 = x1db.drop(columns=['wtp', 'Date', 'Month','Quarter', 'Water Source', 'Lembangan']).columns

In [26]:
## Build `df22`: one row per (plant, column) pair for every column that has data at
## that plant, tagged with the purpose looked up in C.Grouping.csv.

## Per-plant mean of the numeric columns. numeric_only=True is passed explicitly:
## from pandas 2.0 the default is False and the string columns ('Water Source',
## 'Lembangan') raise a TypeError, whereas older versions dropped them silently.
## NOTE: the numeric helper columns 'Month' and 'Quarter' are averaged too and
## therefore appear in the 'Chem' list (with Purpose None). This is kept as-is
## because the final merge cell expects a 'Month' column to come through.
appl = x1db.groupby(['wtp']).mean(numeric_only=True).reset_index()
wtpn = appl.wtp.unique()

## For each plant, record the columns whose mean is not NaN (i.e. that have any data)
number = {'wtp': [], 'Chem': []}
for i in wtpn:
    number['wtp'].append(i)
    jde = appl[appl['wtp'] == i].drop(['wtp'], axis=1)
    jde = jde.dropna(axis=1).columns
    number['Chem'].append(jde)

## Lookup table: chemical name -> purpose group
cgr = pd.read_csv("C.Grouping.csv")
Chdict = dict(zip(cgr['Chemical Name'].values, cgr['Correct Grouping']))

## One row per (plant, column). The pairs are built directly instead of joining the
## names with ',' and splitting again, so a name that itself contains a comma
## is not broken into pieces.
df22 = pd.DataFrame(
    [(plant, chem) for plant, chems in zip(number['wtp'], number['Chem']) for chem in chems],
    columns=['wtp', 'Chem'],
)
## Purpose is None for names that are absent from the lookup table
df22['Purpose'] = df22['Chem'].apply(lambda x: Chdict.get(x))

In [28]:
## Split the known names by dosing stage, based on the stage word contained in the name
lis = df22['Chem'].unique()
pre_lis, pos_lis, int_lis = (
    [name for name in lis if tag in name]
    for tag in ("Pre", "Post", "Intermediate")
)
## All stage-specific names: pre first, then post, then intermediate
total_lis = [*pre_lis, *pos_lis, *int_lis]

In [30]:
## The two stage-specific soda ash columns that are added together into one total
sd = [
    'Soda Ash (Pre)',
    'Soda Ash (Post)',
]

In [31]:
## Missing stage-specific doses are replaced by 0
## (presumably "nothing dosed that day" - confirm) so the sum below is never NaN
x1db[total_lis] = x1db[total_lis].fillna(0)

## Total soda ash = pre dose + post dose
x1db['Soda_Ash_T'] = x1db[sd[0]].add(x1db[sd[1]])

## One-element list holding the mapping {stage column -> 'Soda_Ash_T'}
li = [dict.fromkeys(sd, 'Soda_Ash_T')]

In [32]:
## Flatten the list of mappings in `li` into one dict; on a repeated key the later
## mapping wins
d = {old: merged for mapping in li for old, merged in mapping.items()}

In [33]:
## Replace the stage-specific names by their combined column name (others unchanged)
df22['Chem'] = df22['Chem'].map(lambda chem: d.get(chem, chem))

## First remove repeated (plant, column) pairs, then keep only the first column of
## each purpose per plant
df22 = (
    df22.drop_duplicates(subset=['wtp','Chem'])
        .drop_duplicates(subset=['wtp','Purpose'], keep='first')
)
trdf = df22.copy()

## Plants present in the filtered dosage data
tt = x1db['wtp'].unique()

## The combined columns are not in the lookup table, so register their purpose here
Chdict.update({'Hydrated_Lime_T': 'pH Adjuster', 'Soda_Ash_T': 'pH Adjuster'})

## Empty container for the per-plant frames
jam = pd.DataFrame()

In [34]:
## Independent deep copy of the filtered dosage frame
x10db = x1db.copy(deep=True)

In [35]:
## Build `jam`: for every plant take the columns selected for it in `trdf` (plus
## 'Date' and 'wtp'), rename the chemical columns to their purpose via `Chdict`,
## and stack the per-plant frames on top of each other.
##
## The frames are collected in a list and concatenated once: DataFrame.append was
## removed in pandas 2.0, and concatenating inside the loop copies the accumulated
## frame on every iteration. The per-iteration gc.collect() is dropped for the same
## reason - it only slowed the loop down.
frames = []
for i in tt:
    ## Columns chosen for this plant, plus the two merge keys
    c2t = list(trdf.loc[trdf['wtp'] == i, 'Chem'].unique()) + ['Date', 'wtp']
    op = x10db.loc[x10db['wtp'] == i, c2t].rename(columns=Chdict)
    frames.append(op)

## pd.concat raises on an empty list, so fall back to an empty frame when there are no plants
jam = pd.concat(frames, ignore_index=False) if frames else pd.DataFrame()

## Number of plants processed (kept because the original loop left this counter behind)
onew = len(tt)

512

### Final Merge Data

In [41]:
## Water-quality measurements used in the final data set
wqdb1 = dc[['wtp', 'pH_avg', 'Date', 'Turbidity_avg', 'Color_avg',
       'Iron_avg', 'Ammonia_avg', 'Manganese_avg']].copy()

## Left-join the purpose-level dosage columns onto the water-quality rows, matching
## on plant and date. pd.merge does not modify its inputs, so no defensive copies
## are needed.
FFDB = pd.merge(wqdb1, jam, on=['wtp','Date'], how='left')

## Every column except the identifiers is a feature. errors='ignore' keeps this
## working when `jam` carries no 'Month' column; a plain drop raises KeyError then.
the_name = FFDB.drop(columns=['wtp', 'Date', 'Month'], errors='ignore').columns

In [45]:
## Feature Engineering (Daily % Changes compared to median value)
## For each feature, add '<feature>_chpctm' = (value - plant median) / plant median * 100.
## NOTE(review): a plant median of 0 makes the result inf or NaN - confirm this is acceptable.
for col in the_name:
    baseline = FFDB.groupby('wtp')[col].transform('median')
    FFDB[f'{col}_chpctm'] = FFDB[col].sub(baseline).div(baseline).mul(100)

In [46]:
## Collect the percent-change feature columns into their own frame
tfi = [name for name in FFDB.columns if "_chpctm" in name]
to_c4 = FFDB.loc[:, tfi].copy()

In [49]:
## General Columns for EDA: the raw water-quality averages and the purpose-level
## dosage columns
ge = [
    'pH_avg',
    'Turbidity_avg',
    'Color_avg',
    'Iron_avg',
    'Ammonia_avg',
    'Manganese_avg',
    'Disinfectant',
    'Coagulant',
    'Flouridation',
    'pH Adjuster',
]

## Export layout: percent-change features, then the keys, then the raw columns
lib = [*to_c4.columns, 'Date', 'wtp', *ge]
to_c6 = FFDB.loc[:, lib].copy()

In [52]:
## Write the final data set. The output folder is created first so the export does
## not fail on a fresh checkout where 'C_data' does not exist yet.
## The frame's index is written as the first column, as before.
os.makedirs('C_data', exist_ok=True)
to_c6.to_csv('C_data/WGIDB_SgTEGN.csv')