# Hidden Markov Modelling for Production Failures
- Data preprocessing
- Model building
- Exploration of results

In [16]:
# imports and data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shutil
import sys
import os

from pvops.hmm import *

example_prodpath = os.path.join('..', '..', 'examples', 'example_data', 'example_prod_cleaned.csv')
example_metapath = os.path.join('..', '..', 'examples', 'example_data', 'example_metadata2.csv')
example_OMpath = os.path.join('..', '..', 'examples', 'example_data', 'example_om_cleaned.csv')


In [17]:
prod_data = pd.read_csv(example_prodpath, on_bad_lines='skip', engine='python')
metadata = pd.read_csv(example_metapath, on_bad_lines='skip', engine='python')
om_data = pd.read_csv(example_OMpath, on_bad_lines='skip', engine='python')

In [18]:
prod_col_dict = {'siteid': 'randid', 
                 'timestamp': 'Date',
                 'energyprod': 'Energy',
                 'powerprod': 'energy_generated_kWh',
                 'energyexpected': 'energy_expected_kWh',
                 'irradiance':'Irradiance',
                 'baseline': 'IEC_pstep', #user's name choice for new column (baseline expected energy defined by user or calculated based on IEC)
                 'dcsize': 'dcsize', #user's name choice for new column (System DC-size, extracted from meta-data)
                 'compared': 'Compared',#user's name choice for new column
                 'energy_pstep': 'Energy_pstep', #user's name choice for new column
                 'ambient_temperature': 'temp_amb_C',
                 'module_temperature': 'temp_mod_C'
                }

om_col_dict = {'siteid': 'randid', 
               'datestart': 'date_start',
               'dateend': 'date_end',
               'text': 'GeneralDesc',
               'workID': 'WONumber',
               'worktype': 'WOType',
               'asset': 'Asset',
               'prod_impact': 'ProductionImpact_kWh',
               'eventdur': 'EventDur', #user's name choice for new column (Repair Duration)
               'modatestart': 'MonthStart', #user's name choice for new column (Month when an event begins)
               'agedatestart': 'AgeStart'} #user's name choice for new column (Age of system when event begins)

metad_col_dict = {'siteid': 'randid',
                  'dcsize': 'DC_Size_kW',
                  'COD': 'COD'}

In [19]:
prod_data.head()

Unnamed: 0,randid,Date,Energy,Irradiance
0,R23,,1000.0,
1,R23,7/19/2018 1:00,1000.0,
2,R23,7/19/2018 2:00,0.0,
3,R23,7/19/2018 3:00,0.0,
4,R23,7/19/2018 4:00,1000.0,


In [20]:
om_data.head()

Unnamed: 0,randid,Asset,date_start,date_end,WONumber,WOType,GeneralDesc
0,,Inverter,5/2/2018 12:00,5/17/2018 16:00,100,Corrective,"Inverter 1.1 Contactor 7, Inverter 1.2 Contact..."
1,R23,Facility,5/19/2018 15:44,5/19/2018 13:04,101,Preventive,Site offline due to grid disturbance
2,R23,Facility,6/15/2018 6:46,6/15/2018 10:30,102,Corrective,Plant trip due to grid disturbance
3,R23,Facility,6/18/2018 11:20,6/18/2018 14:03,103,Corrective,Site trip due to cause grid disturbance
4,R23,Facility,7/21/2018 4:45,7/21/2018 13:15,104,Vegetation,Site tripped due to grid disturbance


In [21]:
metadata.head()


Unnamed: 0,randid,DC_Size_kW,COD,latitude,longitude
0,R23,2500,10/20/2013,-80,-35
1,R27,475,10/21/2017,-81,-36


In [22]:
# Create performance index (PI)
# Since our data does not come with expected energies, we can use
# pvops functions to come up with our own estimates.
# for testing purposes set expected energy as energy produced
prod_data[prod_col_dict['energyexpected']] = prod_data[prod_col_dict['energyprod']]


In [23]:
prod_data.loc[:, 'performance_index'] = prod_data[prod_col_dict['energyprod']] / \
                                               prod_data[prod_col_dict['energyexpected']]

In [29]:
om_data.Asset

0     Inverter
1     Facility
2     Facility
3     Facility
4     Facility
5     Inverter
6     Inverter
7     Inverter
8     Facility
9     Facility
10    Facility
11       Other
12    Facility
13       other
Name: Asset, dtype: object

In [44]:
metadata

Unnamed: 0,randid,DC_Size_kW,COD,latitude,longitude
0,R23,2500,10/20/2013,-80,-35
1,R27,475,10/21/2017,-81,-36


In [45]:
## Add system capacity to production data
prod_data.loc[:, 'capacity'] = np.nan
sites = prod_data['randid'].unique()
for site in sites:
    site_mask = prod_data['randid'] == site
    prod_data.loc[site_mask, 'capacity'] = metadata.loc[metadata['randid'] == site, "DC_Size_kW"]

## Preprocessing
Load in data from the end of tutorial_text2time_module.ipynb which already covers the standard preprocessing activities for om and prod data.

In [37]:
# Filter out night-time values
bad_irr_values = prod_data[prod_data[prod_col_dict['irradiance']].isna()]
pd.Series(bad_irr_values.loc[:,'Date'].hour.tolist()).value_counts()

AttributeError: 'Series' object has no attribute 'hour'