In [1]:
import pandas as pd
import numpy as np
import scipy as sp
import plotly.figure_factory as ff
import plotly.graph_objs as go
import plotly.plotly as py
import plotly.tools as pytools
import matplotlib.pyplot as plt
import datetime
from scipy.stats.stats import pearsonr
from tqdm import tqdm
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
pytools.set_credentials_file(username='hrishioa', api_key='xFpLRfOD7le9KEQU2DuE')

## Importing and Sanitizing Data
Import and clean data from sensor and reader.
For both, convert timestamps to date values for graphing.
For reader data, remove ones with error flags.

In [2]:
sensordf = pd.read_csv("sensor.csv")
sensordf['date'] = pd.to_datetime(sensordf.timestamp,unit="s")
sensordf.timestamp = [int(x) for x in sensordf.timestamp]
sensordf.frequency = [int(x) for x in sensordf.frequency]
# sensordf = sensordf.set_index('timestamp')
sensordf

Unnamed: 0,timestamp,value,sensor-id,frequency,date
0,1509690150,128.0,0M0001WVAT8,5,2017-11-03 06:22:30
1,1509690090,129.0,0M0001WVAT8,5,2017-11-03 06:21:30
2,1509690030,130.0,0M0001WVAT8,5,2017-11-03 06:20:30
3,1509689970,131.0,0M0001WVAT8,5,2017-11-03 06:19:30
4,1509689910,132.0,0M0001WVAT8,5,2017-11-03 06:18:30
5,1509689850,131.0,0M0001WVAT8,5,2017-11-03 06:17:30
6,1509689790,132.0,0M0001WVAT8,5,2017-11-03 06:16:30
7,1509689730,133.0,0M0001WVAT8,5,2017-11-03 06:15:30
8,1509689670,136.0,0M0001WVAT8,5,2017-11-03 06:14:30
9,1509689610,136.0,0M0001WVAT8,5,2017-11-03 06:13:30


In [3]:
readerdf = pd.read_csv("raw.csv")
readerdf['date'] = pd.to_datetime(readerdf.timestamp,unit="s")
readerdf.timestamp = [int(x) for x in readerdf.timestamp]
readerdf.value = [int(x) for x in readerdf.value]
readerdf = readerdf[readerdf['error-bitfield']==0]
readerdf

Unnamed: 0,record_id,month,day,year,hour,minute,second,timestamp,Fsensor-start,value,sensor-runtime,error-bitfield,record-type,reading-type,FerrorLO,arrow,date
0,105,10,22,17,20,14,43,1508674483,0,180,60,0,-1,-1,-1,-1,2017-10-22 12:14:43
1,106,10,22,17,20,29,43,1508675383,0,203,75,0,-1,-1,-1,-1,2017-10-22 12:29:43
2,107,10,22,17,20,44,43,1508676283,0,220,90,0,-1,-1,-1,-1,2017-10-22 12:44:43
3,108,10,22,17,20,59,43,1508677183,0,249,105,0,-1,-1,-1,-1,2017-10-22 12:59:43
4,109,10,22,17,21,14,43,1508678083,0,270,120,0,-1,-1,-1,-1,2017-10-22 13:14:43
5,114,10,22,17,21,30,26,1508679026,0,271,135,0,-1,-1,-1,-1,2017-10-22 13:30:26
6,115,10,22,17,21,45,26,1508679926,0,266,150,0,-1,-1,-1,-1,2017-10-22 13:45:26
7,116,10,22,17,22,0,26,1508680826,0,256,165,0,-1,-1,-1,-1,2017-10-22 14:00:26
8,117,10,22,17,22,15,26,1508681726,0,248,180,0,-1,-1,-1,-1,2017-10-22 14:15:26
9,118,10,22,17,22,30,26,1508682626,0,245,195,0,-1,-1,-1,-1,2017-10-22 14:30:26


## Plotting
Plot all data.

In [4]:
traces = []
traces.append(go.Scatter(x=sensordf['timestamp'],y=sensordf['value'],mode='markers',name="Sensor"))
traces.append(go.Scatter(x=readerdf['timestamp'],y=readerdf['value'],mode='markers',name="Reader"))
layout = go.Layout(title="Sensor vs Reader",xaxis=dict(
                            rangeslider=dict(),
                            type='DateTime'
                       ))
fig = go.Figure(data=traces,layout=layout)
py.iplot(fig)

## Test 1: Reader vs Sensor
Step 1: Filter for data that's present in both datasets. This requires some effort since the timestamps don't really line up (clock drift between sensor and reader, presumably)

In [5]:
rdata = []
sdata = []
timerange = (60*15)/2
for i in tqdm(range(len(sensordf))):
    subset = readerdf[(readerdf.timestamp <= sensordf.iloc[i].timestamp+timerange) & 
                     (readerdf.timestamp >= sensordf.iloc[i].timestamp-timerange)]
    if len(subset) > 0:
        sdata.append(sensordf.iloc[i])
    rdata.append(subset)
processedreader = pd.concat(rdata)
processedsensor = pd.DataFrame(sdata)

100%|██████████| 6539/6539 [00:09<00:00, 678.33it/s]


In [6]:
traces = []
traces.append(go.Scatter(x=processedsensor['date'],y=processedsensor['value'],mode='markers',name="Sensor"))
traces.append(go.Scatter(x=processedreader['date'],y=processedreader['value'],mode='markers',name="Reader"))
layout = go.Layout(title="Sensor vs Reader",xaxis=dict(
                            rangeslider=dict(),
                            type='DateTime'
                       ))
fig = go.Figure(data=traces,layout=layout)
iplot(fig)

Next, let's see how far the reader data tends to drift from the sensor data. We look for the closest point and plot the distance. For this, we remove the 1-minute readings from the sensor data, since those tend to throw distance measurements off.

In [7]:
pr = processedreader
ps = processedsensor[processedsensor.frequency != 5]
matches = []
distances = dict(distance=[],date=[])
pairs = dict(s=[],r=[])
for i1, srow in tqdm(ps.iterrows()):
    cmatch = None
    for i2, rrow in pr.iterrows():
        if cmatch is None or abs(cmatch.timestamp-srow.timestamp) > abs(rrow.timestamp-srow.timestamp):
            cmatch = rrow
    matches.append([srow,cmatch,abs(cmatch.timestamp-srow.timestamp)])
    distances['date'].append(srow.date)
    distances['distance'].append(cmatch.timestamp-srow.timestamp)
    pairs['s'].append(srow.value)
    pairs['r'].append(cmatch.value)
    
traces = [go.Scatter(x=distances['date'],y=distances['distance'],mode='markers',name="Distance-s")]
layout = go.Layout(title="Distance vs Time",xaxis=dict(
                            rangeslider=dict(),
                            type='DateTime'
                       ))
fig = go.Figure(data=traces,layout=layout)
iplot(fig)

62it [00:00, 82.42it/s]


#### Correlation Testing

In [8]:
print("Pearson Correlation: %f,%f" % (pearsonr(np.array(pairs['s']),np.array(pairs['r']))))
traces = [go.Scatter(x=pairs['s'],y=pairs['r'],mode='markers',name="Distance-s")]
layout = go.Layout(title="Sensor vs Reader Value",xaxis=dict(
                            rangeslider=dict(),
#                             type='DateTime'
                       ))
fig = go.Figure(data=traces,layout=layout)
py.iplot(fig,filename="Sensor-Reader Correlation")

Pearson Correlation: 0.976434,0.000000


In [9]:
np.array(pairs['r'])

array([136, 125, 116, 116, 121, 130, 139, 156, 167, 171, 184, 181, 190,
       216, 232, 234, 221, 202, 183, 163, 133,  89,  62,  65,  72,  85,
        97,  98, 112, 143, 183, 385, 379, 361, 335, 311, 288, 269, 245,
       216, 189, 168, 160, 157, 160, 186, 219, 247, 249, 232, 202, 151,
       116,  95,  99, 124, 150, 162, 173, 197, 217, 214])

In [10]:
rdelta = [pairs['r'][r]-pairs['r'][r-1] for r in range(1,len(pairs['r']))]
sdelta = [pairs['s'][s]-pairs['s'][s-1] for s in range(1,len(pairs['s']))]

In [11]:
traces = [go.Scatter(x=rdelta,y=sdelta,mode='markers',name="Distance-s")]
layout = go.Layout(title="Sensor vs Reader Value",xaxis=dict(
                            rangeslider=dict(),
#                             type='DateTime'
                       ))
fig = go.Figure(data=traces,layout=layout)
py.iplot(fig,filename="Sensor-Reader Correlation")