In [None]:
import datetime
import time
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import random

# Create Data

To simulate a time series, let's take a mean value and add random noise that depends on the standard deviation.
For the purpose of this challenge, we have provided plausible values for each sensor.

In [None]:
# Per-sensor simulation parameters. The three lists are index-aligned:
# entry k of each list describes the same sensor.
sensorNames = ['AtmP', 'Temp', 'Airtight', 'H2OC']
sensorCenterLines = [989.21, 9.45, 1216.02, 9.64]  # mean value each sensor fluctuates around
standardDeviation = [8.35, 8.42, 39.98, 4.23]      # spread of the random noise per sensor



For our use case, we need data that contains the following information:
- sensor: sensor name
- reading: the value sent by the sensor
- timeStamp: the time at which the value was sent

To create a dataframe with the data, we can execute this code, which generates 100 points for each of the 4 sensors.

In [None]:
# Simulate 100 ticks spaced one second apart. At every tick each sensor emits
# one reading drawn from a normal distribution centred on its centre line.
time_stamp_start = datetime.datetime(2022,11,25,9,0,0)
time_step = datetime.timedelta(seconds=1)

# zip walks the three index-aligned sensor lists together, so the number of
# sensors is never hard-coded here.
l_sensors = [
    {
        'sensor': name,
        'reading': np.random.normal(loc=center, scale=spread),
        'timeStamp': time_stamp_start + i * time_step,
    }
    for i in range(100)
    for name, center, spread in zip(sensorNames, sensorCenterLines, standardDeviation)
]

# One row per (tick, sensor) pair: a "long" table with columns sensor / reading / timeStamp.
df_sensors = pd.DataFrame(l_sensors)
df_sensors.head(10)

Now that we have the data, let's aggregate the values by timestamp (index) and by sensor (columns).
For that, you can use `pandas.pivot_table`.

In [None]:
# Reshape the long table to wide form: one row per timestamp, one column per
# sensor. Readings that share a (timestamp, sensor) cell are averaged.
df_datas = df_sensors.pivot_table(
    values='reading',
    index='timeStamp',
    columns='sensor',
    aggfunc='mean',
)
df_datas.head(10)

Let's now plot the different sensor values by timeStamp.

In [None]:
# Draw each sensor column in its own subplot; the trailing ';' hides the text repr of the returned axes.
df_datas.plot(subplots = True);

# Introducing Latency

In real life, sensors are not synchronised with each other, and they sometimes experience latency when sending signals.


In [None]:
# Latency is modelled as the absolute value of a normal draw, so it is never negative.
mu, sigma = 0, 0.2 # mean and standard deviation
s = np.abs(np.random.normal(mu, sigma, 1000))

# Histogram of 1000 sampled latencies.
plt.hist(s)
plt.title('Possible latency distribution in sec')
plt.show()

In [None]:
# Same kind of simulation as the synchronised data set, but over 1500 ticks and
# with every reading delayed by its own random latency, so sensors no longer
# share identical timestamps.
l_sensors_latency = []
for i in range(1500):
    tick = time_stamp_start + i * time_step
    # zip walks the index-aligned sensor lists together instead of hard-coding the sensor count.
    for name, center, spread in zip(sensorNames, sensorCenterLines, standardDeviation):
        reading = np.random.normal(loc=center, scale=spread)
        # Non-negative delay in seconds: |N(0, 0.2)|, the distribution shown in the histogram cell.
        latency = datetime.timedelta(seconds=abs(random.gauss(0, 0.2)))
        l_sensors_latency.append({
            'sensor': name,
            'reading': reading,
            'timeStamp': tick + latency,
        })

df_sensors_latency = pd.DataFrame(l_sensors_latency)
df_sensors_latency.head(10)

In [None]:
# Pivot the delayed readings to wide form (timestamp rows, sensor columns).
# A sensor with no reading at a given timestamp gets no value in that row.
df_datas_latency = df_sensors_latency.pivot_table(
    values='reading',
    index='timeStamp',
    columns='sensor',
    aggfunc='mean',
)
df_datas_latency

In [None]:
# Plot the un-resampled, latency-affected readings, one subplot per sensor column.
df_datas_latency.plot(subplots = True);

💡 We cannot gather the data continuously anymore because readings are no longer available for every sensor at the same timestamp!

In that case, it is better to group the data into **fixed windows**, of 15 s for example.

In pandas, we can resample the timestamp index. In the next challenge, this operation will be done in Apache Beam.
We won't use pandas there.

In [None]:
# Group the timestamp index into 15-second windows and average each sensor column per window.
# NOTE: this rebinds df_datas_latency, so the un-resampled pivot is no longer
# reachable under this name once the cell has run.
df_datas_latency = df_datas_latency.resample('15s').mean()

In [None]:
# Plot the 15-second window means, one subplot per sensor column.
df_datas_latency.plot(subplots=True);