### Adaptive Fault Detection with VAR Models

##### Import libraries needed

In [193]:
# importing packages and libraries
from pandas import read_csv
import pandas as pd
import numpy as np
import pickle
import os
import random
import string
from statsmodels.tsa.api import VAR
import matplotlib.pyplot as plt

# Ignoe harmless warnings
import warnings
warnings.filterwarnings("ignore")

# Define the plot size default
from pylab import rcParams
rcParams['figure.figsize'] = (12, 5)

### Defining methods or functions

In [194]:
# Plotting multiple series
def plot_multiple_series(actual, pred, attr):
    for i in range(len(attr)):
        title = "Prediction of {}".format(attr[i])
        plt.title(title)
        plt.xlabel("Timestep")
        plt.ylabel("Values")
        plt.plot(actual.iloc[:,i], label="actual")
        plt.plot(pred.iloc[:,i], label="forecast")
        plt.legend()
        plt.show()


# Root mean squared error
def root_mse(x, y):
    if len(x) != len(y):
        return "Error: The two arguments must have the same length"
    mse = np.square(np.subtract(x, y)).mean()
    return np.sqrt(mse)

# Plotting series
def plot_series(series, attr):
    for i in range(len(attr)):
        title = "Plot of "+str(attr[i])
        actual = series.iloc[:,i]
        plt.title(title)
        plt.xlabel("Timestep")
        plt.ylabel(attr[i])
        plt.plot(actual)
        plt.show()

# Normalisation of time series
def normalise_timeseries(data):
    # Calculate the mean and standard deviation for each feature
    means = np.mean(data, axis=0)
    stds = np.std(data, axis=0)
    
    # Normalise each feature using standard deviation
    normalised_data = (data - means) / stds
    return pd.DataFrame(normalised_data)


# Denomalisation of time series
def denormalise_timeseries(data, means, stds):
    denormalised_data = (data * stds) + means
    return pd.DataFrame(denormalised_data)


# Augmented Dickey-Fuller Test
def adf_test(series, title=''):
    '''
    Hypothesis Test for Stationarity
    Pass in a time series and an optional title, return an ADF report
    '''
    print(f'Augmented Dickey-Fuller Test: {title}')
    result = adfuller(series.dropna(),autolag='AIC')
    labels = ['ADF test statistics','p-value','#lags','#observations'] # use help(adfuller) to understand why these labels are chosen
    
    outcome = pd.Series(result[0:4],index=labels)
    
    for key,val in result[4].items():
        outcome[f'critical value ({key})'] = val
        
    print(outcome.to_string()) # this will not print the line 'dtype:float64'
    
    if result[1] <= 0.05:
        print('Strong evidence against the null hypothesis') # Ho is Data is not stationary, check help(adfuller)
        print('Reject the null hypothesis')
        print('Data is Stationary')
    else:
        print('Weak evidence against the Null hypothesis')
        print('Fail to reject the null hypothesis')
        print('Data has a unit root and is non stationary')


# Loading expert models in a dictionary and count the number of expert models
def load_expert_models(expert_path):
    files = os.listdir(expert_path)
    pickle_files = [file for file in files if file.endswith('.pkl')]
    models = {}

    for file in pickle_files:
        with open(file, 'rb') as f:
            loaded_expert = pickle.load(f)
            models[file.split('.')[0]] = loaded_expert
            # models[file.split('.')[0]] = pickle.load(f)

    # return models, len(models)
    return models


# Count the number of expert models in the repository
# def count_expert_models():

def file_names_string(file_path):
    files = os.listdir(file_path)
    for file_name in files:
        if not isinstance(file_name, str):
            return False
    return True



In [195]:
# Assigning variables
# file = 'test_series_reduced.csv'
# file = 'stuckat1_training_reduced_new.csv'
file = 'valueflip_training_reduced_new.csv'
# file = 'stuckat1_training_reduced.csv'
experts_path = 'expert_models'
df_raw = read_csv(file, header=0, index_col=0)
attr = list(pd.read_csv(file).columns.values)[1:]
series = df_raw.iloc[:40000,:]
# plot_series(series, attr)
nobs = 3000
# steps = 15
steps = 20
begin = 2000
finish = 2200
normalised_data = normalise_timeseries(df_raw)
testData = normalised_data.copy()
train = testData.iloc[:-nobs]
test = testData.iloc[-nobs:]
# len(train), len(test)
input1 = testData.iloc[begin:finish,:]
# plot_series(input1, attr)
prediction_error = {}

In [196]:
# Threshold prediction error value
threshold = 0.35

In [197]:
checker = file_names_string(expert_path)
print(checker)

True


In [198]:
# Load the expert models
my_experts = load_expert_models(experts_path)
# print(my_experts)
print(my_experts['var_ctrl_stuckat0_perm_reduced'])


FileNotFoundError: [Errno 2] No such file or directory: 'new_expert_model_001.pkl'

In [None]:
# myfile = "expert_models/new_expert_model_r8UakMv4.pkl"
myfile = "expert_models/var_golden_model_reduced.pkl"
with open(myfile, 'rb') as f:
    test_file = pickle.load(f)
    print(test_file)

In [None]:
pred = my_experts['var_ctrl_stuckat0_perm_reduced'].forecast(input1.values, steps=steps)
pred_df = pd.DataFrame(pred, columns=input1.columns)
print(pred_df)

In [None]:
for model_name, model in my_experts.items():
    predictions = my_experts[model_name].forecast(input1.values, steps=steps)
    predictions_df = pd.DataFrame(predictions, columns=input1.columns)
    expected = testData.iloc[finish:finish+steps,:].reset_index(drop=True)
    RMSE = []
    for feature in attr:
        RMSE.append(root_mse(predictions_df[feature], expected[feature]))

    prediction_error[model_name] = RMSE
    
print(prediction_error)

In [None]:
for key, value in prediction_error.items():
    print(prediction_error[key], np.mean(prediction_error[key]))

In [None]:
type(prediction_error)

In [None]:
def identify_fault(error_dict):
    best_val = 100
    best_expert = ""
    for key, value in error_dict.items():
        if np.mean(error_dict[key]) < best_val:
            best_val = np.mean(error_dict[key])
            best_expert = key

    
    print(f'The best expert is {best_expert}')
    if threshold < best_val:
        print(f'We need to train additional expert model')
    else:
        print(f'Congratulations!!! The best expert model is acceptable')

In [None]:
identify_fault(prediction_error)

In [None]:
print(stop)

In [None]:
def select_order(data):
    model = VAR(data)
    selected_order = model.select_order()
    order = selected_order.selected_orders['aic']
    return order

# results = model
max_lag = select_order(train)

def train_new_expert(data, max_lag):
    model = VAR(data)
    fitted_expert = model.fit(maxlags=max_lag)
    return fitted_expert


def generate_random_name(length):
    random_string = ''.join(random.choices(string.ascii_letters + string.digits, k=length))
    # return "expert_models/New_expert_model_" + random_string + ".pkl"
    return "new_expert_model_" + random_string + ".pkl"


# new_expert_model = train_new_expert(train, max_lag)
model = VAR(train)
new_expert_model = model.fit(maxlags=max_lag)

# new_expert_model.k_ar
new_expert_model_result = new_expert_model.forecast(y=train.values, steps=steps)
print(len(new_expert_model_result))

result_forecast = pd.DataFrame(new_expert_model_result, columns=attr)

# Save the model
expert_name = generate_random_name(8)

# with open('expert_models/new_expert_model_001.pkl', 'wb') as f:
with open('expert_models/new_expert_model_001.pkl', 'wb') as f:
    pickle.dump(new_expert_model, f)
print("Model saved!")

# with open('expert_models/' + expert_name, 'wb') as f:
#           pickle.dump(new_expert_model, f)
# print("Model saved!")

In [None]:
print("Temporarily stop execution")
print(stop)

In [None]:
# test_file = 'test_outputs.csv'
# # test_file = 'stuckat1_training_reduced.csv'
# # test_file = 'test_series_reduced.csv'
# # dff = pd.read_csv(test_file, index_col=0, header=0, parse_dates=True)
# dff = pd.read_csv(test_file, index_col=0, header=0)
# featt = list(pd.read_csv(test_file, index_col=0, header=0).columns.values)[1:]
# plot_series(dff, featt)

In [None]:
def plot_dataframe(df):
    num_series = len(df.columns)
    fig, axes = plt.subplots(num_series, 1, figsize=(10, 5*num_series), sharex=True)
    for i, col in enumerate(df.columns):
        ax = axes[i] if num_series > 1 else axes
        ax.plot(df.index, df[col])
        ax.set_title(col)
        ax.grid(True)

    plt.tight_layout()
    plt.show()