In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from datetime import datetime
import plotly.express as px
import plotly.graph_objects as go
from plotly.offline import iplot

### Helper Functions

In [None]:
def unix_to_datetime_local(df):
    '''Add a 'datetime' column built from the 'time_sampled' Unix timestamps.

    Every timestamp is cast to int and passed to datetime.fromtimestamp, so the
    resulting values are naive datetimes in the local timezone of the machine
    running the notebook (noted as KHI by the original author).

    The frame is modified in place and also returned.
    '''
    df['datetime'] = [datetime.fromtimestamp(int(ts)) for ts in df['time_sampled']]
    return df

def vol_in_interval(df):
    '''Add a 'volume' column holding the drop in 'total_flow' from each row to the next.

    Row k receives float(total_flow[k]) - float(total_flow[k+1]). The final row
    has no following row and is assigned 0.

    The frame is modified in place and also returned.
    '''
    readings = list(df['total_flow'])
    deltas = [float(cur) - float(nxt) for cur, nxt in zip(readings, readings[1:])]
    if readings:
        # The last row has nothing after it to subtract.
        deltas.append(0)
    df['volume'] = deltas
    return df

def rem_nonzero_start(df):
    '''Trim the rows that come after the last zero 'total_flow' reading.

    The 'total_flow' column is scanned from the final row back towards the
    first. At the first zero met in that direction, every row positioned after
    it is dropped and the rows up to and including the zero are returned.

    If the column holds no zero at all (or the frame is empty) the frame is
    returned unchanged. Previously a zero-free frame was silently cut down to
    its first row and an empty frame raised NameError.
    NOTE(review): keeping all rows when no zero exists is assumed to be the
    intended behaviour - confirm against how the sensors reset.
    '''
    flows = list(df['total_flow'])
    for offset, reading in enumerate(reversed(flows)):
        if reading == 0:
            # `offset` rows sit after this zero; keep everything before them.
            return df.iloc[:len(flows) - offset]
    return df

def positive_vol(df):
    '''Return only the rows whose 'volume' is zero or greater.'''
    non_negative = df['volume'] >= 0
    return df.loc[non_negative]

def abs_vol(df):
    '''Replace every negative entry of 'volume' with its absolute value.

    Entries that are not below zero are kept exactly as they are.
    The frame is modified in place and also returned.
    '''
    df['volume'] = [abs(v) if v < 0 else v for v in df['volume']]
    return df

def clean_flowrate(df, maxflow):
    '''Keep only the rows whose 'flow_rate' does not exceed the sensor capacity.

    Rows with flow_rate greater than `maxflow` are discarded; zero readings
    are retained.
    '''
    within_capacity = df['flow_rate'] <= maxflow
    return df.loc[within_capacity]

def add_days(df):
    '''Add a 'day' column with the first two letters of each row's weekday name.

    The weekday is taken from 'time_sampled' (cast to int) through
    datetime.fromtimestamp, i.e. in the machine's local timezone, and formatted
    with strftime('%A').

    The frame is modified in place and also returned.
    '''
    df['day'] = [
        datetime.fromtimestamp(int(ts)).strftime('%A')[:2]
        for ts in df['time_sampled']
    ]
    return df

def get_cum_vol(df):
    '''Add a 'cumvol' column: the running total of 'volume' accumulated from
    the last row upwards, so the first row ends up holding the grand total.

    The frame is modified in place and also returned.
    '''
    bottom_up_total = df['volume'].iloc[::-1].cumsum()
    # Flip back so the totals line up with the frame's own row order.
    df['cumvol'] = bottom_up_total.iloc[::-1]
    return df

def add_date_time(df):
    '''Split the 'datetime' column into separate 'date' and 'time' columns.

    Each entry's .date() and .time() are stored in the new columns, added in
    that order. The frame is modified in place and also returned.
    '''
    stamps = df['datetime'].tolist()
    df['date'] = [stamp.date() for stamp in stamps]
    df['time'] = [stamp.time() for stamp in stamps]
    return df

def get_daily_cum_vol(df):
    '''Add a 'DVol' column: the running total of 'volume' within each 'date'.

    The running total restarts at every distinct value of 'date' and follows
    the frame's existing row order. The result is aligned to the rows by index,
    so each row gets the total for its own date even when the rows of one date
    are not stored next to each other (the earlier list-based version assigned
    values positionally and mislabelled rows in that case).

    The frame is modified in place and also returned.
    '''
    # sort=False: group order is irrelevant here; cumsum keeps the original rows.
    df['DVol'] = df.groupby('date', sort=False)['volume'].cumsum()
    return df
    


### Preprocessing

In [None]:
# Load the installed-node table; its 'MaxFlow' column is used by the cleaning
# loop as max_node_capacity[i-1] for node i.
# The path is a raw string so the Windows backslashes stay literal instead of
# being parsed as (invalid) escape sequences such as '\O' or '\H'.
df_max = pd.read_csv(r'D:\OneDrive - Habib University\HU\KWP\Data\InstalledNodes.csv')
max_node_capacity = list(df_max['MaxFlow'])

In [None]:
# Clean the raw export of nodes N1..N9 and write one CSV per node to 'Cleaned/'.
for i in range(1,10):
    # Raw string: keeps the Windows backslashes literal (no invalid '\O', '\H', ... escapes).
    df = pd.read_csv(r'D:\OneDrive - Habib University\HU\KWP\Data\ALLSENSORDATA\new\N'+str(i)+'.csv')
    # 'total_flow' must be kept here: rem_nonzero_start and vol_in_interval read
    # it, and it is only dropped once the volume column has been derived.
    df = df[['time_sampled','flow_rate','total_flow']]
    print('Node:',i)
    og_len = len(df)
    print('og:',og_len)
    df = unix_to_datetime_local(df)
    df = rem_nonzero_start(df)
    print('removed non-0 start:',len(df))
    df = vol_in_interval(df)
    df = abs_vol(df)
    # max_node_capacity is ordered by node, so node i uses entry i-1.
    df = clean_flowrate(df,max_node_capacity[i-1])
    print('removed flow values:',len(df))
    df = add_days(df)
    print(len(df))
    print('datapoints removed:',(og_len - len(df)))
    df = df.drop(columns=['total_flow'])
    # Reverse the row order before saving.
    df = df.iloc[::-1]
    df.insert(0,'NodeID','N'+str(i))
    df.to_csv('Cleaned/N'+str(i)+'_cleaned.csv')

In [None]:
# Plot, for one node, the per-day maximum of 'flow_rate' and 'volume'.
# NOTE(review): `data` is not defined in any cell visible here - presumably a
# list of per-node cleaned DataFrames built elsewhere; confirm before Run All.
df = data[0]
df = df.set_index('datetime')
# Group rows by the calendar date of the index and take the max of every column.
# Assumes the 'datetime' index holds real datetime values (needed for `.date`) - TODO confirm.
grouped_data = df.groupby(df.index.date).max()
# NOTE(review): the title says "N02" while the frame is data[0] - verify they match.
# NOTE(review): category_orders targets 'day', which is not mapped to any axis,
# colour or facet in this call, so it likely has no visible effect - confirm.
fig = px.line(grouped_data, x=grouped_data.index, y=['flow_rate','volume'],title="N02 - Processed",category_orders={'day':['Mo','Tu','We','Th','Fr','Sa','Su']})
# Keep only the text after the last "=" in each annotation label.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig.show()

In [None]:
# Adding DVol to preprocessed data: for nodes N10..N26, read the cleaned CSV,
# add the per-day running volume and save the result under 'with dvol'.
# Paths are raw strings so the Windows backslashes stay literal (the previous
# mix of '\' and '\\' relied on invalid escape sequences such as '\P' and '\w').
for i in range(10,27):
    df = pd.read_csv(r'D:\OneDrive - Habib University\HU\KWP\Data\Preprocessed\N'+str(i)+'_cleaned.csv')
    # The first 10 characters of the 'datetime' text are used as the date key.
    df['date'] = df.datetime.str[:10]
    df = get_daily_cum_vol(df)
    # 'Unnamed: 0' is the index column written by the earlier to_csv call;
    # 'date' was only needed as the grouping key.
    df = df.drop(columns=['Unnamed: 0','date'])
    df.to_csv(r'D:\OneDrive - Habib University\HU\KWP\Data\Preprocessed\with dvol\N'+str(i)+'_cleaned.csv')

In [None]:
# Generating TF Files: for nodes N10..N26, write one summary row per date
# (node id, date, largest DVol of the day, number of datapoints).
# Paths are raw strings so the Windows backslashes stay literal (the previous
# mix of '\' and '\\' relied on invalid escape sequences such as '\P' and '\w').
for i in range(10,27):
    df_tf = pd.DataFrame(columns=['NodeID','Date','TotalVol','No_dp'])
    df = pd.read_csv(r'D:\OneDrive - Habib University\HU\KWP\Data\Preprocessed\with dvol\N'+str(i)+'_cleaned.csv')
    # The first 10 characters of the 'datetime' text are used as the date key.
    df['Date'] = df.datetime.str[:10]
    # Group once and reuse the grouping for both the maxima and the row counts.
    by_date = df.groupby('Date')
    temp = by_date.max()
    df_tf['NodeID'] = temp['NodeID']
    # Per-date maximum of the running daily volume.
    df_tf['TotalVol'] = temp['DVol']
    df_tf['Date'] = temp.index
    df_tf['No_dp'] = by_date.size()
    df_tf.to_csv(r'D:\OneDrive - Habib University\HU\KWP\Data\Preprocessed\TF_N'+str(i)+'.csv')