In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import os

# Read the deployment environment name from the process environment
# (None when the variable is not set) and echo it for the run log.
env = os.getenv('Environment')
print(f"Environment: {env!r}")

Environment: 'prod'


In [2]:
# (Re)load the jupyter_dmdg IPython extension. It presumably registers the
# %%athena_to_df cell magic used by the query cell below — TODO confirm.
%reload_ext jupyter_dmdg

In [3]:
# To query the process events in the Data Lake (DL)

In [4]:
%%athena_to_df --out df_DL
SELECT
    -- No WHERE clause: every row of the process_events table is requested;
    -- the site/building filter is applied afterwards on the df_DL DataFrame.
    -- The aliases below (recipe_full, parameter_name, parameter_value, start_date,
    -- end_date) are the lower-case column names later used as merge keys.
    -- NOTE(review): the database name hard-codes the prod_ prefix although the
    -- notebook reads the Environment variable in its first cell; confirm intended.
    site_id,
    building_id,
    event_name,
    event_name_full AS recipe_full,
    event_parameter_name AS parameter_name,  
    event_parameter_value AS parameter_value,
    event_start AS start_date,
    event_stop AS end_date
FROM "prod_plant_connectivity_prepared_data"."process_events"

In [5]:
# To select site & building

In [6]:
# Ask for the name of the site column and the site value to keep, then the same
# for the building, and narrow the Data Lake extract to rows matching both.
column_name1 = input("Enter site_id: ")
value1 = input("Enter your site_id: ")
column_name2 = input("Enter building_id: ")
value2 = input("Enter your building_id: ")
site_matches = df_DL[column_name1].eq(value1)
building_matches = df_DL[column_name2].eq(value2)
df_DL_site_building = df_DL.loc[site_matches & building_matches]

Enter site_id:  site_id
Enter your site_id:  MLE
Enter building_id:  building_id
Enter your building_id:  0V10


In [7]:
# To have the list extracted from the DL in the proper format to be compared with the list of process events extracted from Data Source (DS) 

In [8]:
# Work on an independent copy before editing columns.
df_DL_site_building = df_DL_site_building.copy()
# Whole number of hours to add to both Data Lake timestamps.
time_difference = int(input("Enter time difference: "))
hour_shift = pd.Timedelta(hours=time_difference)
for date_column in ('start_date', 'end_date'):
    df_DL_site_building.loc[:, date_column] += hour_shift
# Pass the parameter values through float before the string cast below.
df_DL_site_building.loc[:, 'parameter_value'] = df_DL_site_building['parameter_value'].astype(float)
# Every column becomes text so the later merge compares its keys as strings.
df_DL_site_building = df_DL_site_building.astype(str)
len(df_DL_site_building)

Enter time difference:  1


140512

In [9]:
# When the same process event appears several times, keep only the row with the most recent date parameter

In [10]:
# Load the Data Source export and give each row a string ID taken from its
# original position in the file.
source_file_to_read = input("Enter your source csv file: ")
df_all_process_events = (
    pd.read_csv(source_file_to_read)
    .reset_index()
    .rename(columns={'index': 'ID'})
)
df_all_process_events['ID'] = df_all_process_events['ID'].astype(str)

# Name of the date-parameter column, or "na" to skip the sort.
sort_column = input("Enter the parameter_ts as it appears in the source: ")
duplicate_keys = ['RECIPE_FULL', 'PARAMETER_NAME', 'PARAMETER_VALUE', 'START_DATE', 'END_DATE']
if sort_column != "na":
    # Descending order puts the largest sort_column value first, so keep='first'
    # below retains that row within each group of duplicates.
    df_all_process_events = df_all_process_events.sort_values(by=sort_column, ascending=False)
df_all_process_events = df_all_process_events.drop_duplicates(subset=duplicate_keys, keep='first')
len(df_all_process_events)

Enter your source csv file:  MLE/0V10/MLE_0V10.csv
Enter the parameter_ts as it appears in the source:  DATE_PARAMETRE


5182

In [11]:
# To select only the process events that come from mapped equipments

In [12]:
# Keep only the process events whose EQUIPMENT_NAME is listed in the
# mapped-equipment CSV supplied by the user.
mapped_equipment_file_to_read = input("Enter your mapped equipment file: ")
df_mapped_equipment = pd.read_csv(mapped_equipment_file_to_read)
is_mapped = df_all_process_events['EQUIPMENT_NAME'].isin(df_mapped_equipment['EQUIPMENT_NAME'])
df_DS = df_all_process_events.loc[is_mapped]
len(df_DS)

Enter your mapped equipment file:  MLE/0V10/MLE_0V10_Equipment.csv


5182

In [13]:
# To have the list of process events extracted from DS in the proper format to be compared with the list extracted from the DL

In [14]:
# Lower-case the column names and turn every value into text so the frame can be
# merged with the Data Lake extract on string keys.
df_DS = df_DS.rename(columns=str.lower).astype(str)
# Drop the last four characters of both date strings
# (presumably a fractional-seconds suffix such as ".000" — TODO confirm).
for date_column in ('start_date', 'end_date'):
    df_DS[date_column] = df_DS[date_column].str[:-4]
len(df_DS)

5182

In [15]:
# To select only process events extracted from DS that do not appear in DL (datagaps)

In [16]:
# Left-join the Data Source events onto the Data Lake events on the five shared
# keys; Data Source rows without a match get NaN in the columns that come only
# from the Data Lake.
merge_keys = ['recipe_full', 'parameter_name', 'parameter_value', 'start_date', 'end_date']
df_merge = df_DS.merge(df_DL_site_building, how='left', on=merge_keys)

In [17]:
# A missing event_name marks a row that found no partner in the left join
# (assumes event_name is contributed by the Data Lake side only — TODO confirm).
unmatched = df_merge['event_name'].isna()
datagaps = df_merge.loc[unmatched].drop(columns='event_name')
len(datagaps)

389

In [18]:
# To have the list of datagaps in the same format as the DS file 

In [19]:
# Go back to the original (upper-case, untransformed) rows of the datagaps by
# matching on the row ID, then drop the helper ID column.
is_datagap = df_all_process_events['ID'].isin(datagaps['id'])
result = df_all_process_events.loc[is_datagap].drop(columns='ID')
len(result)

389

In [20]:
# To keep only the datagaps whose date parameter (only if the site uses it) is not later than the end date

In [21]:
# "na" is the sentinel entered when no date-parameter column is used; in that
# case every datagap is kept. Otherwise keep the rows whose date parameter is
# less than or equal to END_DATE.
if sort_column == "na":
    filtered_result = result
else:
    not_after_end = result[sort_column].le(result['END_DATE'])
    filtered_result = result.loc[not_after_end]
len(filtered_result)

369

In [22]:
# Export the filtered datagaps to <site>/<building>/datagaps_<site>_<building>.csv,
# where site and building are the values entered earlier (value1, value2).
# The DataFrame index is written as the first CSV column (to_csv default).
file_path = f'{value1}/{value2}/datagaps_{value1}_{value2}.csv'
# to_csv does not create missing parent folders and would raise OSError, so make
# sure the output folder exists first (no-op when it is already there).
os.makedirs(os.path.dirname(file_path), exist_ok=True)
filtered_result.to_csv(file_path)

In [23]:
# To represent the number of datagaps per equipment

In [24]:
# Bar chart of the number of datagaps per equipment, written to a PNG file and
# closed afterwards so the figure is not kept open.
var1 = filtered_result['EQUIPMENT_NAME'].value_counts()
fig, ax = plt.subplots(figsize=(10, 5))
var1.plot(kind='bar', ax=ax)
ax.set_xlabel('Equipment name')
ax.set_ylabel('Number of datagaps')
ax.set_title('Number of datagaps per equipment')
fig_path = f'{value1}/{value2}/datagaps_{value1}_{value2}_per_equipment.png'
fig.savefig(fig_path)
plt.close(fig)

In [25]:
# To represent the number of datagaps per day

In [26]:
# Bar chart of the number of datagaps per calendar day of the date parameter.
#
# The date column is the one the user named in sort_column; when "na" was
# entered, the historical default 'DATE_PARAMETRE' is tried instead.
# The work is done on a separate copy so filtered_result keeps its columns and
# the cell can be re-run without a KeyError.
date_column = sort_column if sort_column != "na" else 'DATE_PARAMETRE'
if date_column in filtered_result.columns:
    datagaps_by_day = filtered_result.copy()
    datagaps_by_day[date_column] = pd.to_datetime(datagaps_by_day[date_column])
    # Despite its name, weekly_counts holds DAILY counts (freq='D').
    weekly_counts = (
        datagaps_by_day
        .set_index(date_column)
        .groupby(pd.Grouper(freq='D'))
        .count()
    )
    plt.figure(figsize=(20, 10))
    # count() tallies non-null cells, so the bar height is the number of rows
    # with a BATCH_ID on that day.
    plt.bar(weekly_counts.index, weekly_counts['BATCH_ID'])
    plt.xlabel('Date parameter day')
    plt.ylabel('Number of datagaps')
    plt.title('Number of datagaps per date parameter day')
    fig_path = f'{value1}/{value2}/datagaps_{value1}_{value2}_per_day.png'
    plt.savefig(fig_path)
    plt.close()
else:
    print(f"Per-day chart skipped: column {date_column!r} not found in the datagaps.")

In [27]:
# percentage of missing data 

In [28]:
# Share of process events that are datagaps, in percent, rounded to 2 decimals.
# NOTE: df_all_process_events had duplicates dropped when it was loaded, so the
# denominator is the de-duplicated row count.
datagap_count = len(filtered_result)
event_count = len(df_all_process_events)
missing_pct = round(datagap_count * 100 / event_count, 2)
print("Considering total number of rows of the initial csv as total number of process events and only datagaps when date parameter is lower than end date: " + str(missing_pct))

Considering total number of rows of the initial csv as total number of process events and only datagaps when date parameter is lower than end date: 7.12
