# TimeSeries DataAnalytics Tutorial



In [None]:
import numpy as np
import chart_studio.plotly as py
import plotly.express as px
import plotly.tools as tls
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import pandas as pd
from datetime import datetime
import requests
import json
#import matplotlib.pyplot as plt

In [None]:
# Locate the local Spark installation and make pyspark importable from this kernel.
import findspark
findspark.init()

In [None]:
import pyspark
# getOrCreate() returns the already-running SparkContext when there is one, so
# re-executing this cell no longer fails with
# "Cannot run multiple SparkContexts at once".
sc = pyspark.SparkContext.getOrCreate(pyspark.SparkConf().setAppName("TimeSeries"))

## I/ Simple IRIS example to check PySpark

In [None]:
import os
import pandas as pd
# Directory holding the tutorial data (absolute path on the author's file system).
data_dir = '/work/irlin355_1/gratienj/BigData/DataLakeBenchProject/python/TimeSeries/DataAnalytics'
filename = os.path.join(data_dir,'iris.csv')
# Load the Iris CSV into a pandas DataFrame.
df = pd.read_csv(filename)

In [None]:
from pyspark import SparkContext, SQLContext
from pyspark.sql.types import StructType, StructField, DoubleType, StringType
# SQL entry point bound to the SparkContext `sc` created earlier.
sqlContext = SQLContext(sc)
# Explicit schema: four nullable double measurements and a nullable string label.
schema = StructType([StructField("sepal_length", DoubleType(), True),
                     StructField("sepal_width",  DoubleType(), True),
                     StructField("petal_length", DoubleType(), True),
                     StructField("petal_width",  DoubleType(), True),
                     StructField("variety",      StringType(), True),
                    ])
# Convert the pandas DataFrame into a Spark DataFrame using that schema.
spark_df = sqlContext.createDataFrame(df,schema=schema)

In [None]:
# Print the Spark DataFrame schema to check column names and types.
spark_df.printSchema()

In [None]:
# Display the first rows of the Spark DataFrame.
spark_df.show()

## II/ SmartGrid Example from File

### A/ Standard method with Pandas

In [None]:
def str_to_timestamp(date):
    """Convert a ``'DD/MM/YYYY HH:MM:SS'`` string to seconds since 2019-01-01.

    Parameters
    ----------
    date : str
        Timestamp text formatted as ``'%d/%m/%Y %H:%M:%S'``.

    Returns
    -------
    int
        Whole seconds elapsed between 2019-01-01 00:00:00 and ``date``
        (negative for earlier dates).

    Raises
    ------
    ValueError
        If ``date`` does not match the expected format.
    """
    origin = datetime(2019, 1, 1)
    elapsed = datetime.strptime(date, '%d/%m/%Y %H:%M:%S') - origin
    return int(elapsed.total_seconds())

In [None]:
def compute_curve(df,day_id,tagname):
    """Return the hourly mean curve of one tag for one day.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``day_id``, ``tagname``, ``hour_id`` and
        ``value``.
    day_id : int
        Day to keep (compared with the ``day_id`` column).
    tagname : str
        Tag to keep (compared with the ``tagname`` column).

    Returns
    -------
    tuple of pandas.Series
        ``(hours, means)``: the distinct ``hour_id`` values in ascending
        order and the mean of ``value`` for each of them.
    """
    # Rows belonging to the requested day and tag.
    selected = (df['day_id'] == day_id) & (df['tagname'] == tagname)
    hourly = (
        df.loc[selected][['hour_id', 'value']]
        .groupby('hour_id')
        .mean()
        .reset_index()
        .sort_values(by='hour_id')
    )
    return hourly['hour_id'], hourly['value']

In [None]:
# Directory of the SmartGrid data export (absolute path on the author's file system).
data_dir = '/work/irlin355_1/gratienj/BigData/DigitalSandBox/Data/TimeSeries/SmartGridData/Cryolite/20190101'
filename = os.path.join(data_dir,'OneMonth.csv')
# The file is semicolon-separated.
df = pd.read_csv(filename, sep=';')
# Alternative: restrict the analysis to a single tag.
#tagnames = ['CRY.CENTRALE_SOLAIRE.CRY_act_prod_pow']
# Every distinct tag name present in the file.
tagnames = df.tagname.unique()

In [None]:
%%time
# Coerce the measurement columns to numeric dtypes.
df['value'] = pd.to_numeric(df['value'])
# Fix: convert the quality column itself; it was previously overwritten with
# a copy of `value`.
df['quality'] = pd.to_numeric(df['quality'])
# Seconds elapsed since 2019-01-01 00:00:00 (see str_to_timestamp).
df['timestamp'] = df['timestamp'].apply(str_to_timestamp)
# Day index and hour-of-day derived from the elapsed seconds.
df['day_id'] = df['timestamp']//(3600*24)
df['hour_id'] = df['timestamp'] % (3600*24) // 3600
# One (hours, hourly means) curve per tag and per day, for day ids 0..29.
results =[]
for tag_id,tagname in enumerate(tagnames):
    for day_id in range(30):
        results.append(compute_curve(df,day_id,tagname))

### B/ PySpark method

In [None]:
# Reload the raw SmartGrid file so the PySpark section starts from the
# untouched string timestamps.
data_dir = '/work/irlin355_1/gratienj/BigData/DigitalSandBox/Data/TimeSeries/SmartGridData/Cryolite/20190101'
filename = os.path.join(data_dir,'OneMonth.csv')
df = pd.read_csv(filename, sep=';')
# Coerce the measurement columns to numeric dtypes.
df['value'] = pd.to_numeric(df['value'])
# Fix: convert the quality column itself; it was previously overwritten with
# a copy of `value`.
df['quality'] = pd.to_numeric(df['quality'])

In [None]:
from pyspark import SparkContext, SQLContext

# SQL entry point bound to the SparkContext `sc` created earlier.
sqlContext = SQLContext(sc)

In [None]:
# Spark schema applied to the pandas DataFrame: the timestamp is kept as a
# string, value and quality are doubles; every field is nullable.
Schema = StructType([ StructField("timestamp", StringType(),    True),
                      StructField("tagname",   StringType(),  True),
                      StructField("value",     DoubleType(),  True),
                      StructField("quality",   DoubleType(),  True)
                    ])

In [None]:
# Convert the pandas DataFrame into a Spark DataFrame using the explicit schema.
spark_df = sqlContext.createDataFrame(df,schema=Schema)

In [None]:
# Print the Spark DataFrame schema to check column names and types.
spark_df.printSchema()

In [None]:
# Display the first rows of the Spark DataFrame.
spark_df.show()

In [None]:
def str_to_timestamp(date):
    """Return the number of seconds between 2019-01-01 00:00:00 and ``date``.

    Parameters
    ----------
    date : str
        Timestamp text formatted as ``'%d/%m/%Y %H:%M:%S'``.

    Returns
    -------
    int
        Elapsed whole seconds; negative when ``date`` is before 2019-01-01.

    Raises
    ------
    ValueError
        If ``date`` does not match the expected format.
    """
    reference = datetime(2019, 1, 1, 0, 0, 0, 0)
    moment = datetime.strptime(date, '%d/%m/%Y %H:%M:%S')
    seconds = (moment - reference).total_seconds()
    return int(seconds)

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import StructType, StructField, DoubleType, StringType, LongType
# Spark UDFs deriving time keys from the raw timestamp string.
# udf() defaults to a StringType result when no return type is given, which
# turned day_id/hour_id into string columns; the integer results are now
# declared as LongType so these keys stay numeric.
# F1: timestamp string -> seconds since 2019-01-01 00:00:00.
F1 = udf(str_to_timestamp, LongType())
# dayId: elapsed seconds -> day index.
dayId = udf(lambda x: x // (3600 * 24), LongType())
# hourId: elapsed seconds -> hour of the day (0..23).
hourId = udf(lambda x: x % (3600 * 24) // 3600, LongType())

In [None]:
def compute_curve(tag_df,day_id):
    """Return the hourly mean values of one tag for one day, in hour order.

    Parameters
    ----------
    tag_df : pyspark.sql.DataFrame
        Rows of a single tag; must contain the columns ``day_id``,
        ``hour_id`` and ``value``.
    day_id : int
        Day to keep (compared with the ``day_id`` column).

    Returns
    -------
    list of float
        Mean of ``value`` for each ``hour_id`` present on that day, sorted by
        ascending hour.
    """
    day_df = tag_df.filter(tag_df.day_id == day_id)
    hourly_df = day_df.select('hour_id','value').groupby('hour_id').mean()
    # A grouped aggregation gives no ordering guarantee, so sort explicitly.
    # The cast keeps the order numeric even if hour_id is a string column.
    hourly_df = hourly_df.orderBy(hourly_df['hour_id'].cast('long'))
    return [row["avg(value)"] for row in hourly_df.select("avg(value)").collect()]

In [None]:
%%time
# Distinct tag names, taken from the pandas DataFrame.
tagnames = df.tagname.unique()
# Add the derived time columns step by step: elapsed seconds, then day index,
# then hour of the day.
spark_df2 = spark_df.withColumn("timestamp_sec",  F1(spark_df["timestamp"]))
spark_df3 = spark_df2.withColumn("day_id",dayId(spark_df2['timestamp_sec']))
spark_df4 = spark_df3.withColumn("hour_id",hourId(spark_df3['timestamp_sec']))
# One list of hourly means per tag and per day, for day ids 0..29.
results =[]
for tag_id,tagname in enumerate(tagnames):
    # Restrict to the current tag once, then reuse it for every day.
    tag_df = spark_df4.filter(spark_df4.tagname == tagname)
    for day_id in range(30):
        # Each call collects its result to the driver (see compute_curve).
        results.append(compute_curve(tag_df,day_id))