## Importing the dependencies

In [1]:
import pandas as pd
import numpy as np
import plotly.plotly as py
import cufflinks as cf

## Importing the data

In [3]:
raw_data = pd.read_csv('influentdata.csv',sep=';')
raw_data.datetime = pd.to_datetime(raw_data.datetime)

start_idx = raw_data.first_valid_index()
end_idx = raw_data.last_valid_index()

In [4]:
raw_data.set_index('datetime', inplace=True, drop=True)
start_idx = raw_data.first_valid_index()
end_idx = raw_data.last_valid_index()

In [5]:
parameters_list = []
for column in raw_data.columns:
    if (('datetime' not in column)&('Unit' not in column) & ('equipment' not in column)):
        parameters_list.append(column)
print(parameters_list)

['CODf', 'COD', 'NH4_N', 'K']


In [6]:
py.iplot([{
    'x':raw_data.index,
    'y':raw_data[col],
    'name':col
} for col in parameters_list], filename='raw_data')


Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:
(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.
(2) Trying using the image API to return an image instead of a graph URL
(3) Use matplotlib
(4) See if you can create your visualization with fewer data points




The draw time for this plot will be slow for all clients.



Estimated Draw Time Too Long



In [7]:
calibration_period =pd.to_datetime(['January 15 2018','January 22 2018'])

## Transforming the data into standardized form

In [8]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()
norm_data = scaler.fit_transform(raw_data[parameters_list])
norm_df = pd.DataFrame(data=norm_data,
                       index=raw_data.index,
                       columns=parameters_list)

In [9]:
norm_df.cov()

Unnamed: 0,CODf,COD,NH4_N,K
CODf,1.000006,0.634922,-0.102157,-0.022672
COD,0.634922,1.000006,0.158317,0.068391
NH4_N,-0.102157,0.158317,1.000006,0.307326
K,-0.022672,0.068391,0.307326,1.000006


In [10]:
cal_data = raw_data.loc[calibration_period[0]:calibration_period[1]]

In [11]:
cal_data[parameters_list].head()

Unnamed: 0_level_0,CODf,COD,NH4_N,K
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-15 00:00:00,240.391,429.827,42.14,15.5
2018-01-15 00:02:00,236.9,425.971,42.5,15.6
2018-01-15 00:04:00,233.898,429.146,42.04,15.5
2018-01-15 00:06:00,236.87,427.17,41.77,15.3
2018-01-15 00:08:00,242.118,430.361,41.56,15.4


In [16]:
iter_alpha = 0.8

In [17]:
raw_data['CODf_mean'] = raw_data.CODf.ewm(alpha=iter_alpha).mean()
raw_data['CODf_std'] = raw_data.CODf.ewm(alpha=iter_alpha).std()
cal_std = cal_data.CODf.std()
print(cal_std)
raw_data['CODf_outliers']=raw_data.CODf.loc[(raw_data.CODf > raw_data.CODf_mean+cal_std)\
                          & (raw_data.CODf < raw_data.CODf_mean-cal_std)]

24.961962229202836


In [None]:
def  

In [105]:
'''Trace0 = {
    'x':cal_data.index,
    'y':cal_data['CODf'],
    'name':'CODf'
}
'''
Trace1 = {
    'x':raw_data.index,
    'y':raw_data['CODf_mean'],
    'name':'CODf_ewm',
    'stackgroup':'one',
    'fill':None,
    'line':dict(
        color='rgb(101, 240, 101)',
    )
}
Trace2 = {
    'x':raw_data.index,
    'y':raw_data['CODf_mean']+cal_std,
    'name':'CODf_ewm+std',
    'fill':None,
    'line':dict(
        color='rgb(143, 19, 131)',
    )
}
Trace3 = {
    'x':raw_data.index,
    'y':raw_data['CODf_mean']-cal_std,
    'name':'CODf_ewm-std',
    'fill':None,
    'line':dict(
        color='rgb(143, 19, 131)',
    )
}
Trace4 = {
    'x':raw_data.index,
    'y':raw_data.CODf_outliers,
    
    'name':'outliers',
    'fill':None,
    'mode':'markers'
}
data=[Trace1,Trace2,Trace3, Trace4]
py.iplot(data,filename='raw_data')


Woah there! Look at all those points! Due to browser limitations, the Plotly SVG drawing functions have a hard time graphing more than 500k data points for line charts, or 40k points for other types of charts. Here are some suggestions:
(1) Use the `plotly.graph_objs.Scattergl` trace object to generate a WebGl graph.
(2) Trying using the image API to return an image instead of a graph URL
(3) Use matplotlib
(4) See if you can create your visualization with fewer data points




The draw time for this plot will be slow for all clients.



Estimated Draw Time Too Long



### Exponential smothing functions

In [11]:
def s_1(data_array, alpha):
    results=[]
    for i in range(len(data_array)):
        if i < 1:    
            results.append(data_array[i])
        else:
            s = alpha*(data_array[i]) +\
                (1-alpha)*results[i-1]
            results.append(s)
    #print('Length of results is {}'.format(len(results)))
    return results



def s_2(data_array, alpha):
    results=[]
    for i in range(len(data_array)):
        if i < 2:    
            results.append(data_array[i])
        else:
            s = alpha*(data_array[i]) +\
                (1-alpha)*data_array[i-1] +\
                ((1-alpha)**2)*results[i-2]
            results.append(s)
    #print('Length of results is {}'.format(len(results)))
    return results
            
def s_3(data_array, alpha):
    results=[]
    for i in range(len(data_array)):
        if i < 3:    
            results.append(data_array[i])
        else:
            s = alpha*(data_array[i]) +\
                (1-alpha)*data_array[i-1] +\
                ((1-alpha)**2)*data_array[i-2] +\
                ((1-alpha)**3)*results[i-3]
            results.append(s)
    #print('Length of results is {}'.format(len(results)))
    return results          
                   
               

In [None]:
def shift(xs, n):
    e = np.zeros(xs.shape)
    if n >= 0:
        e[:n] = np.nan
        e[n:] = xs[:-n]
    else:
        e[n:] = np.nan
        e[:n] = xs[-n:]
    return e

In [35]:
def triple_smoothing(data_array):
    import scipy.optimize
    param,cov =scipy.optimize.curve_fit(s_1, data_array[1:], shift(data_array,1)[1:])
    alpha_1 = param[0]
    first_order = s_1(data_array, alpha_1)
    
    param,cov =scipy.optimize.curve_fit(s_2, data_array[2:], shift(data_array,2)[2:])
    alpha_2 = param[0]
    second_order = s_2(data_array, alpha_2)
    
    param,cov =scipy.optimize.curve_fit(s_3, data_array[3:], shift(data_array,3)[3:])
    alpha_3 = param[0]
    third_order = s_3(data_array, alpha_3)
    print(alpha_1, alpha_2, alpha_3)
    return np.add(np.add(first_order, second_order), np.dot(third_order,0.5))

In [36]:
for column_name in parameters_list:
    cal_data[column_name+'_triplesmooth'] = triple_smoothing(cal_data[column_name].values)
cal_data[[column_name+'_triplesmooth' for column_name in parameters_list]]

0.40957588507599746 0.9464306737158069 0.9493050372071887
0.4097307482285668 0.9554977025050228 0.9567458074906949
0.44444880179424207 0.9524045717459447 0.9525085208390907
0.4043254344222572 0.9503441704914668 0.9456490703465767


Unnamed: 0_level_0,CODf_triplesmooth,COD_triplesmooth,NH4_N_triplesmooth,K_triplesmooth
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-15 00:00:00,600.977500,1074.567500,105.350000,38.750000
2018-01-15 00:02:00,594.311171,1067.203578,106.050002,38.940433
2018-01-15 00:04:00,588.585077,1073.044352,105.361799,38.817269
2018-01-15 00:06:00,592.950642,1070.168331,104.820968,38.461598
2018-01-15 00:08:00,602.778011,1075.581529,104.304746,38.574732
2018-01-15 00:10:00,594.670456,1074.013231,105.060463,38.573567
2018-01-15 00:12:00,600.500128,1080.135501,104.212370,38.569110
2018-01-15 00:14:00,597.838787,1077.314769,105.471772,38.749016
2018-01-15 00:16:00,601.131431,1078.506018,105.822883,38.961829
2018-01-15 00:18:00,598.685898,1077.601055,106.882115,39.372809


In [None]:
print(alpha_1, alpha_2, alpha_3)

In [None]:
def backtodateaubase(input_df, param,sampling_point):
    df = input_df.copy(deep=True)
    df = df[['datetime',param,param+' Unit', param+' equipment']]
    df['Sampling Point'] = sampling_point
    df[param+' equipment'] = param +' from '+ df[param+' equipment']
    df.columns = ['Date and Time', 'Value', 'Unit', 'Parameter / from','Sampling point']
    df=df[['Date and Time','Sampling point','Parameter / from','Value','Unit']]
    return df

In [None]:
params_list = ['CODf','COD','NH4_N','K']
sampling_point = 'Primary settling tank effluent'
def stackparams(df_input, params_list, sampling_point):
    df_list = []
    for param in params_list:
        df_list.append(backtodateaubase(df_input,param,sampling_point))
    df = pd.concat(df_list, ignore_index=True)
    return df


In [None]:
test2 = stackparams(raw_data,params_list,sampling_point)
print(len(test2))

In [None]:
test2.set_index('Date and Time', drop=True, inplace=True)

In [None]:
test2.to_csv('unfiltered_data.csv',sep=';')

In [None]:
test2.head()

In [None]:
print(test2[test2['Parameter / from']=='COD from Spectro_010'].first_valid_index())
print(test2[test2['Parameter / from']=='COD from Spectro_010'].last_valid_index())

In [None]:
tr_data = raw_data.copy(deep=True) 
tr_data['parameter']

In [None]:
parameters_list = []
for column in raw_data.columns:
    if (('Unit' not in column) & ('equipment' not in column)):
        parameters_list.append(column)
print(parameters_list)
for parameter in parameters_list:
    plt.plot(raw_data.index, raw_data[[parameter]])     
plt.legend(parameters_list)
plt.show()

## Outlier detection