In [52]:
# Installations:
"""
pandas
openpyxl
pyarrow
"""

'\npandas\nopenpyxl\n'

In [1]:
import pandas as pd
import seaborn as sns


In [44]:
# Variables
filename = 'time_series_data.parquet'
datetime = 'DateTime'
tagname = 'TagName'
value = 'Value'
aggr_freq_l1 = '10s'

In [72]:
def reads_parquet(filename:str) -> pd.DataFrame:
    try:
        df = pd.read_parquet(filename)
        print("df.shape: ", df.shape)
        return df
    except Exception as e:
        raise e

In [73]:
def drops_null(df: pd.DataFrame) -> pd.DataFrame:   
    if df.empty:
        print("Dataframe is empty")
        return None
    elif df.isnull().sum().sum() > 0:
        try:
            print("Before null removal, df.shape: ", df.shape)
            df = df.dropna().reset_index(drop=True)
            print("After null removal, df.shape: ", df.shape)
            return df
        except Exception as e:
            raise e
    else:
        return df

def drops_duplicate(df: pd.DataFrame) -> pd.DataFrame:  
    if df.empty:
        print("Dataframe is empty")
        return None
    elif df.duplicated().sum() > 0:
        try:
            print("Before duplicate removal, df.shape: ", df.shape)
            df = df.drop_duplicates(subset=[datetime, tagname]).reset_index(drop=True)
            print("After duplicate removal, df.shape: ", df.shape)
            return df
        except Exception as e:
            raise e
    else:
        return df

In [92]:
def check_nulls_in_col(df:pd.DataFrame) -> list:
    if not df.empty:
        return [(x,y) for x, y in zip(df.isnull().sum().keys(), df.isnull().sum()) if y != 0]
    else:
        print("pass correct dataframe")
        pass

def check_nulls_plot(df:pd.DataFrame):
    sns.heatmap(df.isnull(), cbar=False, cmap='viridis')

In [80]:
def pivots_data(df:pd.DataFrame) -> pd.DataFrame:
    try:
        print("Before pivot, df.shape: ", df.shape)
        df_pivot = df.pivot(index=datetime, columns=tagname, values=value).reset_index().copy()
        print("After pivot, df.shape: ", df_pivot.shape)
        return df_pivot
    except Exception as e:
        raise e

In [88]:
def groups_data(df:pd.DataFrame, aggr_freq_l1:str) -> pd.DataFrame:
    try:
        print("Before grouping, df.shape: ", df.shape)
        df_grouped = df.groupby([pd.Grouper(key=datetime, freq=aggr_freq_l1)]).mean()
        print("After grouping, df.shape: ", df_grouped.shape)
        return df_grouped
    except Exception as e:
        raise e

In [91]:
df = reads_parquet(filename)
df = drops_null(df)
df = drops_duplicate(df)
display(df.head(2))

df_pivot = pivots_data(df)
print(check_nulls_in_col(df_pivot))
display(df_pivot.head(2))

df_grouped = groups_data(df_pivot, aggr_freq_l1)
print(check_nulls_in_col(df_grouped))
display(df_grouped.head(2))

df_grouped = drops_null(df_grouped)

df.shape:  (712070, 3)
Before null removal, df.shape:  (712070, 3)
After null removal, df.shape:  (712069, 3)


Unnamed: 0,DateTime,TagName,Value
0,2023-10-05,Pressure,60.862499
1,2023-10-05,Bar Angle,38.530838


Before pivot, df.shape:  (712069, 3)
After pivot, df.shape:  (671761, 4)
[('Bar Angle', 431443), ('Pressure', 440320), ('RS4R4 Height', 431451)]


TagName,DateTime,Bar Angle,Pressure,RS4R4 Height
0,2023-10-05 00:00:00.000,38.530838,60.862499,0.37324
1,2023-10-05 00:00:00.010,,60.9375,


Before grouping, df.shape:  (671761, 4)
After grouping, df.shape:  (25904, 3)
[('Pressure', 4)]


TagName,Bar Angle,Pressure,RS4R4 Height
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2023-10-05 00:00:00,38.479168,61.619999,0.369833
2023-10-05 00:00:10,38.549345,62.10375,0.369945


Before null removal, df.shape:  (25904, 3)
After null removal, df.shape:  (25900, 3)
