# TimeSeries DataAnalytics Tutorial



In [None]:
%matplotlib inline

import visdom
import numpy as np
import chart_studio.plotly as py
import plotly.express as px
import plotly.tools as tls
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
from datetime import datetime
import requests
import json
import matplotlib.pyplot as plt

In [None]:
import sys
# Extend the import path first so the local helper modules can be imported
# by the statements below and by later cells.
sys.path.append("../TimeSeriesTools")
import mongodb_utils
# Connection settings for MongoDB; note that the port is passed as a string.
db_host= 'localhost'
port = '27018'
db_name='TimeSeriesBench'
# Module-level client reused by the mongodb_* helper functions defined below.
mongodb_client = mongodb_utils.mongodb_connect(db_host, port)

In [None]:
import kairosdb_utils

# Base URL of the KairosDB server, read by kairosdb_find_all_data.
# A plain module-level assignment already creates a global name, so no
# `global` declaration is needed here.
kairosdb_server = "http://localhost:9080"

In [None]:
import influxdb_utils
# NOTE: these assignments rebind the same module-level names (db_host, port,
# db_name) that the MongoDB cell set; after this cell `port` is '8086'.
db_host= 'localhost'
port = '8086'
db_name='TimeSeriesBench'
# Module-level client reused by influxdb_find_all_data.
influxdb_client = influxdb_utils.influxdb_connect(db_host, port)

In [None]:
def get_collection_scheme(db_name,scheme_name):
    """Query the 'schemes' collection of `db_name` for a named scheme.

    Uses the module-level `mongodb_client`.

    Args:
        db_name: name of the database to look in.
        scheme_name: value matched against the 'name' field.

    Returns:
        The result of `find({"name": scheme_name})` on the 'schemes'
        collection (callers index it, e.g. `scheme[0]['value']`).
    """
    query = {"name":scheme_name}
    return mongodb_client[db_name]['schemes'].find(query)

In [None]:
def mongodb_find_all_data(db_name,coll_name,scheme):
    """Return the result of mongodb_utils.get_all_data for one collection.

    Thin wrapper that supplies the module-level `mongodb_client` and
    forwards `db_name`, `coll_name` and `scheme` unchanged.
    """
    return mongodb_utils.get_all_data(mongodb_client, db_name, coll_name, scheme)

In [None]:
def mongodb_find_data_select_by_tags(db_name,coll_name,tags,scheme):
    """Return the result of mongodb_utils.get_data_select_by_tags.

    Thin wrapper that supplies the module-level `mongodb_client` and
    forwards `db_name`, `coll_name`, `tags` and `scheme` unchanged.
    """
    return mongodb_utils.get_data_select_by_tags(
        mongodb_client, db_name, coll_name, tags, scheme
    )

In [None]:
def kairosdb_find_all_data(db_name,coll_name,scheme):
    """Return the result of kairosdb_utils.get_all_data for one collection.

    Thin wrapper that supplies the module-level `kairosdb_server` URL and
    forwards `db_name`, `coll_name` and `scheme` unchanged.
    """
    return kairosdb_utils.get_all_data(kairosdb_server, db_name, coll_name, scheme)

In [None]:
def influxdb_find_all_data(db_name,coll_name,scheme):
    """Return the result of influxdb_utils.get_all_data for one collection.

    Thin wrapper that supplies the module-level `influxdb_client` and
    forwards `db_name`, `coll_name` and `scheme` unchanged.
    """
    return influxdb_utils.get_all_data(influxdb_client, db_name, coll_name, scheme)

In [None]:
def clean_data(scheme,data):
    """Remove, in place, every item of `data` that fails validation.

    Each item is validated with a cerberus `Validator` built from `scheme`.
    Invalid items are reported on stdout together with their position in
    the original list and the validator's error details.

    Args:
        scheme: validation schema handed to `cerberus.Validator`.
        data: list of items to validate; mutated in place.

    Returns:
        None. `data` keeps only the items that validated.
    """
    from cerberus import Validator
    v = Validator(scheme)
    valid_items = []
    for index, item in enumerate(data):
        if v.validate(item):
            valid_items.append(item)
        else:
            print("corrupt data in line :",index,", error : ",v.errors)
    # Replace the contents in one step. Deleting entries while enumerating
    # the same list would skip the element that follows each removal.
    data[:] = valid_items

In [None]:
def str_to_unix(date):
    """Convert a 'dd/mm/YYYY HH:MM:SS' string to integer milliseconds.

    The string is parsed as a naive datetime and measured against the
    naive 1970-01-01 00:00:00 origin; no timezone conversion is applied.

    Args:
        date: timestamp text in the format '%d/%m/%Y %H:%M:%S'.

    Returns:
        int: whole seconds since the origin, multiplied by 1000.
    """
    origin = datetime(1970, 1, 1)
    parsed = datetime.strptime(date, '%d/%m/%Y %H:%M:%S')
    elapsed_seconds = int((parsed - origin).total_seconds())
    return elapsed_seconds * 1000

In [None]:
def to_unix_time(dt):
    """Return milliseconds between `dt` and 1970-01-01 02:00:00 (naive).

    The reference instant is the Unix epoch shifted by 7200 seconds.
    NOTE(review): the two-hour shift looks like a hard-coded UTC+2 offset;
    confirm against the data source.

    Args:
        dt: naive datetime to convert.

    Returns:
        float: elapsed seconds since the shifted origin, multiplied by 1000.
    """
    shifted_origin = datetime(1970, 1, 1, 2, 0, 0)
    elapsed = dt - shifted_origin
    return elapsed.total_seconds() * 1000

In [None]:
def type_convert(df):
    """Convert every column except the first to a numeric dtype, in place.

    Args:
        df: pandas DataFrame; its first column is left untouched and every
            other column is replaced by `pd.to_numeric` of itself.

    Returns:
        None. `df` is modified in place.
    """
    for column in df.columns[1:]:
        df[column] = pd.to_numeric(df[column])

## I/ Querying the data

### A/ SmartGrid data

In [None]:
# Look up the scheme named 'SmartGrid' in the database named by db_name.
scheme = get_collection_scheme(db_name,'SmartGrid')

In [None]:
# Display the 'value' field of the first matching scheme entry.
scheme[0]['value']

In [None]:
%%time
# Collection queried by this cell.
coll_name='SmartGridCryolite20190101OneMonthBS10000d'
# Fetch the whole collection from MongoDB, passing the scheme's 'value' field.
data = mongodb_find_all_data(db_name,coll_name,scheme[0]['value'])
print("number of docs",len(data))

In [None]:
%%time
coll_name='SmartGridCryolite20190101OneMonthBS10000d'
# Tag filter forwarded to mongodb_utils.get_data_select_by_tags.
# NOTE(review): 'Buiding' looks like a typo for 'Building'; confirm against
# the tag keys actually stored in the collection before changing it.
tags = { 'Buiding' : 'CRY', 'Device' : 'CENTRALE_SOLAIRE', 'Measure' : 'CRY_act_prod_pow' }
# This rebinds `data`, replacing the result of the fetch-all cell.
data = mongodb_find_data_select_by_tags(db_name,coll_name,tags,scheme[0]['value'])
print("number of docs",len(data))

### A/ Standard method with Pandas

In [None]:
%%time
# Build a DataFrame from whatever `data` currently holds.
df = pd.DataFrame(data)
# Show the first five rows.
df[0:5]

In [None]:
# List the distinct values of the 'tagname' column.
df.tagname.unique()

In [None]:
def str_to_timestamp(date):
    """Convert a 'dd/mm/YYYY HH:MM:SS' string to seconds since 2019-01-01.

    The origin is the naive datetime 2019-01-01 00:00:00, so integer
    division of the result by 86400 gives a zero-based day index counted
    from that date.

    Args:
        date: timestamp text in the format '%d/%m/%Y %H:%M:%S'.

    Returns:
        int: whole seconds elapsed since the origin (negative before it).
    """
    dt = datetime.strptime(date, '%d/%m/%Y %H:%M:%S')
    d0 = datetime(2019, 1, 1, 0, 0, 0, 0)
    # Subtract the origin defined above; the previous code referenced an
    # undefined name here and raised NameError.
    return int((dt - d0).total_seconds())

In [None]:
# Replace the textual timestamps with the integers from str_to_timestamp.
df['timestamp'] = df['timestamp'].apply(str_to_timestamp)

In [None]:
# Day index: timestamp divided by the number of seconds in a day (3600*24).
df['day-id'] = df['timestamp']//(3600*24)

In [None]:
# Hour index within the day: seconds into the day divided by 3600.
df['hour-id'] = df['timestamp'] % (3600*24) // 3600

In [None]:
# Convert the 'value' column to a numeric dtype.
df['value'] = pd.to_numeric(df['value'])

In [None]:
import matplotlib.colors as mcolors

# Names of the CSS4 colors known to matplotlib, in the mapping's own order.
color_names = list(mcolors.CSS4_COLORS)

In [None]:
# List the distinct values of the 'tagname' column.
df.tagname.unique()

In [None]:
# Tags processed by the cells below.
tagnames = ['CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow','CRY.CENTRALE_SOLAIRE.CRY_app_prod_pow','CRY.CENTRALE_SOLAIRE.CRY_rea_prod_pow']
# Alternative: process every tag present in the data instead of the list above.
#tagnames = df.tagname.unique()

In [None]:
%%time
# Compute one hourly-mean curve per (tag, day) pair with plain pandas.
# compute_curve is called with a frame already restricted to a single tag
# plus a day index, so the rows are filtered by tag once per tag, outside
# the day loop.
# NOTE(review): compute_curve is defined in a later cell of this notebook;
# that cell must have been executed before this one.
results = []
for tagname in tagnames:
    tag_df = df.loc[df['tagname'] == tagname]
    for day_id in range(30):
        results.append(compute_curve(tag_df, day_id))

### B/ Using DASK

In [None]:
def compute_curve(tag_df,day_id,tagname=None):
    """Compute the mean 'value' per 'hour-id' for one day.

    Args:
        tag_df: pandas DataFrame with 'day-id', 'hour-id' and 'value'
            columns (and a 'tagname' column when `tagname` is given).
        day_id: value of 'day-id' selecting the rows to aggregate.
        tagname: optional tag; when given, `tag_df` is first restricted to
            rows whose 'tagname' equals it, so an unfiltered frame can be
            passed directly.

    Returns:
        tuple: `(x, y)` where `x` is the 'hour-id' Series sorted ascending
        and `y` is the matching Series of mean values. Both are empty when
        no row matches.
    """
    if tagname is not None:
        tag_df = tag_df.loc[tag_df['tagname'] == tagname]
    # Build the mask from the frame that was passed in rather than from a
    # module-level DataFrame, so the function depends only on its arguments.
    day_df = tag_df.loc[tag_df['day-id'] == day_id]
    vh_df = (
        day_df[['hour-id', 'value']]
        .groupby('hour-id')
        .mean()
        .reset_index()
        .sort_values(by='hour-id')
    )
    x = vh_df['hour-id']
    y = vh_df['value']
    return x,y

In [None]:
# NOTE: `da` is bound to the top-level dask package in this notebook.
import dask as da
from dask.distributed import Client
from dask import delayed, compute
# Create a dask.distributed client, requesting four workers.
client = Client(n_workers=4)

In [None]:
%%time
# Build one lazy task per (tag, day) pair. Nothing is computed here; the
# tasks are evaluated when the list is passed to da.compute(*arrays).
arrays = []
lazy_compute_curve = delayed(compute_curve)
for tagname in tagnames:
    tag_df = df.loc[df['tagname'] == tagname]
    # Wrap the per-tag frame once so the 30 day-tasks share a single graph
    # node instead of each embedding its own copy of the data.
    lazy_tag_df = delayed(tag_df)
    for day_id in range(30):
        arrays.append(lazy_compute_curve(lazy_tag_df, day_id))

In [None]:
%%time
import dask.dataframe as dd
# Build a dask DataFrame from the pandas frame, requesting 16 partitions.
dask_df = dd.from_pandas(df,npartitions=16)

In [None]:
# Select which tag to analyse with the dask DataFrame.
tag_id = 0
tag_name = tagnames[tag_id]
# Filter on the tag chosen above. The previous code compared against
# `tagname`, a variable left over from an earlier loop, so changing tag_id
# had no effect.
tag_df = dask_df[dask_df['tagname'] == tag_name]

In [None]:
# Keep the rows of one day.
# NOTE(review): day_id is not assigned in this cell; it appears to rely on
# the value left behind by an earlier `for day_id in range(30)` loop.
# Confirm which day is intended.
day_df = tag_df[(tag_df['day-id'] == day_id)]

In [None]:
# Mean 'value' per 'hour-id' for the selected rows.
vh_df = day_df[['hour-id','value']].groupby('hour-id').mean()

In [None]:
# Evaluate the expression built above and display the result.
vh_df.compute()

In [None]:
%%time
# Pass every element of `arrays` to dask's compute and keep what it returns;
# this rebinds `results` from the pandas section.
results = da.compute(*arrays)