# TimeSeries DataAnalytics Tutorial



In [1]:
import visdom
import numpy as np
import chart_studio.plotly as py
import plotly.express as px
import plotly.tools as tls
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
from datetime import datetime
import requests
import json
import matplotlib.pyplot as plt

In [2]:
import sys
# Extend the import path so that mongodb_utils can be imported from the sibling tools directory.
sys.path.append("../TimeSeriesTools")
import mongodb_utils
# Connection settings reused by every query in this notebook.
db_host= 'localhost'
port = '27018'  # NOTE(review): passed as a string; confirm mongodb_connect expects str rather than int
db_name='TimeSeriesBench'
# Module-level client handle; the helper functions defined in the next cells read this name.
mongodb_client = mongodb_utils.mongodb_connect(db_host, port)

Trying to connect to MongoDB server: localhost on port: 27018


In [3]:
def get_collection_scheme(db_name,scheme_name):
    """Look up scheme documents by name in the database's 'schemes' collection.

    Args:
        db_name: Name of the database opened on the module-level ``mongodb_client``.
        scheme_name: Value matched against the ``name`` field of the documents.

    Returns:
        Whatever ``find`` returns for that filter (not a single document);
        callers in this notebook index it with ``[0]`` to reach the first match.
    """
    name_filter = {"name": scheme_name}
    return mongodb_client[db_name]['schemes'].find(name_filter)

In [4]:
def mongodb_find_all_data(db_name,coll_name,scheme):
    """Return all data of a collection by delegating to ``mongodb_utils.get_all_data``.

    Args:
        db_name: Database name.
        coll_name: Collection name inside that database.
        scheme: Scheme object forwarded unchanged to the helper.

    Returns:
        The helper's return value, unchanged.
    """
    return mongodb_utils.get_all_data(mongodb_client, db_name, coll_name, scheme)

In [5]:
def mongodb_find_data_select_by_tags(db_name,coll_name,tags,scheme):
    """Return the data selected by tags via ``mongodb_utils.get_data_select_by_tags``.

    Args:
        db_name: Database name.
        coll_name: Collection name inside that database.
        tags: Tag selection forwarded unchanged to the helper.
        scheme: Scheme object forwarded unchanged to the helper.

    Returns:
        The helper's return value, unchanged.
    """
    return mongodb_utils.get_data_select_by_tags(
        mongodb_client, db_name, coll_name, tags, scheme
    )

## I/ Requête des données

### A/ Données SmartGrid

In [6]:
# Look up the scheme entry named 'SmartGrid'; later cells index the result with [0].
scheme = get_collection_scheme(db_name,'SmartGrid')

In [7]:
# Show the 'value' field of the first matching scheme document.
scheme[0]['value']

{'_id': {'required': True},
 'timestamp': {'type': 'string', 'required': True, 'empty': False},
 'tagname': {'type': 'string', 'required': True, 'empty': False},
 'value': {'type': 'string', 'required': True, 'empty': False},
 'quality': {'type': 'string', 'required': True, 'empty': False}}

In [8]:
%%time
# Load every document of the collection, handing the scheme's 'value' field to the helper.
coll_name='SmartGridCryolite20190101OneMonthBS10000d'
data = mongodb_find_all_data(db_name,coll_name,scheme[0]['value'])
print("number of docs",len(data))

769598  documents found
number of docs 769598
CPU times: user 3.65 s, sys: 432 ms, total: 4.09 s
Wall time: 4.37 s


### B/ Standard method with Pandas

In [9]:
%%time
df = pd.DataFrame(data)
df[0:5]

CPU times: user 909 ms, sys: 41.1 ms, total: 950 ms
Wall time: 947 ms


Unnamed: 0,_id,timestamp,tagname,value,quality
0,5ee31951b4320c151b0560db,01/01/2019 09:15:12,CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow,1.0,100.0
1,5ee31951b4320c151b0560dc,01/01/2019 09:15:18,CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow,0.0,100.0
2,5ee31951b4320c151b0560dd,01/01/2019 09:15:37,CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow,1.0,100.0
3,5ee31951b4320c151b0560de,01/01/2019 09:15:43,CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow,0.0,100.0
4,5ee31951b4320c151b0560df,01/01/2019 09:15:53,CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow,1.0,100.0


In [10]:
# List the distinct tag names present in the data.
df.tagname.unique()

array(['CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow',
       'CRY.CENTRALE_SOLAIRE.CRY_app_prod_pow',
       'CRY.CENTRALE_SOLAIRE.CRY_rea_prod_pow',
       'CRY.TGBT_NORMAL.CRY_act_cons_pow',
       'CRY.TGBT_NORMAL.CRY_app_cons_pow',
       'CRY.TGBT_NORMAL.CRY_rapp_cons_ene',
       'CRY.TGBT_NORMAL.CRY_rea_cons_pow'], dtype=object)

In [11]:
def str_to_timestamp(date):
    """Convert a 'dd/mm/YYYY HH:MM:SS' string to seconds since 2019-01-01 00:00:00.

    Args:
        date: Date string in the format '%d/%m/%Y %H:%M:%S'.

    Returns:
        The elapsed time as an int (negative for dates before the reference).

    Raises:
        ValueError: If ``date`` does not match the expected format.
    """
    origin = datetime(2019, 1, 1)
    parsed = datetime.strptime(date, '%d/%m/%Y %H:%M:%S')
    elapsed = parsed - origin
    return int(elapsed.total_seconds())

In [12]:
# Replace the date strings with integer seconds since 2019-01-01 00:00:00.
# The column is overwritten in place, so only convert while it is still
# non-numeric: re-running this cell after the conversion would otherwise pass
# integers to strptime and raise a TypeError.
if not pd.api.types.is_numeric_dtype(df['timestamp']):
    df['timestamp'] = df['timestamp'].apply(str_to_timestamp)

In [13]:
# Whole days elapsed since the reference date: timestamp holds seconds here, and 3600*24 is seconds per day.
df['day-id'] = df['timestamp']//(3600*24)

In [14]:
# Hour of the day: seconds into the current day, floor-divided by the seconds in one hour.
df['hour-id'] = df['timestamp'] % (3600*24) // 3600

In [15]:
# Convert the 'value' column to a numeric dtype (the scheme shown above declares it as a string) so it can be averaged.
df['value'] = pd.to_numeric(df['value'])

In [16]:
# Distinct tag names, iterated over when the per-day curves are computed.
tagnames = df.tagname.unique()

In [17]:
def compute_curve(df,day_id,tagname):
    """Compute the hourly mean of 'value' for one tag on one day.

    Args:
        df: DataFrame with 'day-id', 'hour-id', 'tagname' and 'value' columns.
        day_id: Value of 'day-id' to keep.
        tagname: Value of 'tagname' to keep.

    Returns:
        A pair ``(x, y)`` of Series: the hour ids in ascending order and the
        mean of 'value' for each of those hours.
    """
    mask = (df['tagname'] == tagname) & (df['day-id'] == day_id)
    hourly = (
        df.loc[mask, ['hour-id', 'value']]
        .groupby('hour-id')
        .mean()
        .reset_index()
        .sort_values(by='hour-id')
    )
    return hourly['hour-id'], hourly['value']

In [18]:
%%time
results =[]
for tag_id,tagname in enumerate(tagnames):
    for day_id in range(30):
        results.append(compute_curve(df,day_id,tagname))

CPU times: user 10.3 s, sys: 24.2 ms, total: 10.3 s
Wall time: 10.5 s


### C/ Parallel computation using DASK

In [19]:
def str_to_timestamp(date):
    """Return the seconds elapsed between 2019-01-01 00:00:00 and ``date``.

    Args:
        date: Date string in the format '%d/%m/%Y %H:%M:%S'.

    Returns:
        The elapsed seconds as an int.

    Raises:
        ValueError: If ``date`` does not match the expected format.
    """
    # NOTE(review): same logic as the str_to_timestamp defined in cell In [11];
    # this redefinition silently shadows it.
    reference = datetime(year=2019, month=1, day=1)
    offset = datetime.strptime(date, '%d/%m/%Y %H:%M:%S') - reference
    return int(offset.total_seconds())

In [20]:
# NOTE(review): the `da` alias and the `delayed` / `compute` names are not used in the visible cells.
import dask as da
from dask.distributed import Client
from dask import delayed, compute
# Create a distributed client configured with 4 workers.
client = Client(n_workers=4)

In [21]:
# Fetch the same collection again (same call as in section A), rebinding `data` for the Dask section.
coll_name='SmartGridCryolite20190101OneMonthBS10000d'
data = mongodb_find_all_data(db_name,coll_name,scheme[0]['value'])
print("number of docs",len(data))

769598  documents found
number of docs 769598


In [22]:
# Overwrite each document's '_id' with its position in the list (mutates `data` in place).
# NOTE(review): presumably done so the database-generated ids need not be shipped to the Dask workers; confirm.
for i, d in enumerate(data):
    d['_id'] = i

In [23]:
%%time
df = pd.DataFrame(data)
df[0:5]

CPU times: user 986 ms, sys: 33 ms, total: 1.02 s
Wall time: 1.01 s


Unnamed: 0,_id,timestamp,tagname,value,quality
0,0,01/01/2019 09:15:12,CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow,1.0,100.0
1,1,01/01/2019 09:15:18,CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow,0.0,100.0
2,2,01/01/2019 09:15:37,CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow,1.0,100.0
3,3,01/01/2019 09:15:43,CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow,0.0,100.0
4,4,01/01/2019 09:15:53,CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow,1.0,100.0


In [24]:
# Give each distinct tag name an integer id equal to its position in `tagnames`.
tagnames = df.tagname.unique()
tag_ids = dict(zip(tagnames, range(len(tagnames))))
tag_ids

{'CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow': 0,
 'CRY.CENTRALE_SOLAIRE.CRY_app_prod_pow': 1,
 'CRY.CENTRALE_SOLAIRE.CRY_rea_prod_pow': 2,
 'CRY.TGBT_NORMAL.CRY_act_cons_pow': 3,
 'CRY.TGBT_NORMAL.CRY_app_cons_pow': 4,
 'CRY.TGBT_NORMAL.CRY_rapp_cons_ene': 5,
 'CRY.TGBT_NORMAL.CRY_rea_cons_pow': 6}

In [25]:
%%time
import dask.dataframe as dd
# Wrap the pandas frame as a Dask DataFrame split into 32 partitions.
dask_df = dd.from_pandas(df,npartitions=32)

CPU times: user 501 ms, sys: 50.2 ms, total: 551 ms
Wall time: 591 ms


In [26]:
%%time
timestamp_sec = dask_df['timestamp'].apply(lambda x : str_to_timestamp(x), meta=('x', str)).compute()
tag_id = dask_df['tagname'].apply(lambda x : tag_ids[x], meta=('x', str)).compute()
day_id = timestamp_sec // (3600 * 24)
hour_id = timestamp_sec % (3600 * 24) // 3600
dask_df['tag_hour_id'] = 24 * tag_id + hour_id
dask_df['value_d'] = dask_df.value.astype(np.float)
dask_mean = dask_df[['tag_hour_id', 'value_d']].groupby('tag_hour_id').mean()

CPU times: user 1.62 s, sys: 195 ms, total: 1.81 s
Wall time: 8.14 s


In [27]:
%%time
# Evaluate the group-by mean defined in the previous cell.
# NOTE(review): rebinds `results`, which held the list of curves built in section B.
results = dask_mean.compute()

CPU times: user 828 ms, sys: 81.5 ms, total: 910 ms
Wall time: 1.37 s


In [28]:
# Display the mean of value_d for each tag_hour_id group.
results

Unnamed: 0_level_0,value_d
tag_hour_id,Unnamed: 1_level_1
8,0.830727
9,3.514209
10,10.662128
11,15.661135
12,18.249163
...,...
163,0.049523
164,0.172178
165,-0.035105
166,0.012671


distributed.utils - ERROR - 'start'
Traceback (most recent call last):
  File "/work/irlin355_1/gratienj/local/anaconda3/envs/timeseries/lib/python3.7/site-packages/distributed/utils.py", line 656, in log_errors
    yield
  File "/work/irlin355_1/gratienj/local/anaconda3/envs/timeseries/lib/python3.7/site-packages/distributed/dashboard/components/shared.py", line 312, in update
    ts = metadata["keys"][self.key]
KeyError: 'start'
tornado.application - ERROR - Exception in callback functools.partial(<function wrap.<locals>.null_wrapper at 0x2b18f6a88dd0>, <Task finished coro=<_needs_document_lock.<locals>._needs_document_lock_wrapper() done, defined at /work/irlin355_1/gratienj/local/anaconda3/envs/timeseries/lib/python3.7/site-packages/bokeh/server/session.py:51> exception=KeyError('start')>)
Traceback (most recent call last):
  File "/work/irlin355_1/gratienj/local/anaconda3/envs/timeseries/lib/python3.7/site-packages/tornado/ioloop.py", line 758, in _run_callback
    ret = callback(

In [None]:
# distributed.Client exposes close(), not stop(); closing the client also shuts
# down the local cluster that Client(n_workers=4) started with it.
client.close()