In [51]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LassoCV, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import cross_val_score, cross_validate, KFold, train_test_split
from sklearn.metrics import make_scorer, f1_score, r2_score, confusion_matrix, roc_curve, auc, recall_score, precision_score, accuracy_score, mean_squared_error, mean_absolute_error
from sklearn.feature_selection import RFE
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
import shap
import re

pd.options.display.max_rows = 20

In [36]:
train_data = pd.read_csv('anomaly-free.csv', sep=';')

test_files = ['0.csv', '1.csv', '2.csv', '3.csv']
test_data_list = [pd.read_csv(file, sep=';') for file in test_files]

test_data = pd.concat(test_data_list, ignore_index=True)

In [None]:
train_data

Unnamed: 0,datetime,Accelerometer1RMS,Accelerometer2RMS,Current,Pressure,Temperature,Thermocouple,Voltage,Volume Flow RateRMS
0,2020-02-08 13:30:47,0.202394,0.275154,2.16975,0.382638,90.6454,26.8508,238.852,122.664
1,2020-02-08 13:30:48,0.203153,0.277857,2.07999,-0.273216,90.7978,26.8639,227.943,122.338
2,2020-02-08 13:30:50,0.202054,0.275790,2.52577,0.382638,90.7730,26.8603,223.486,121.338
3,2020-02-08 13:30:51,0.203595,0.278101,2.49742,0.054711,90.8424,26.8616,244.904,121.664
4,2020-02-08 13:30:52,0.201889,0.276363,2.29194,0.710565,90.6664,26.8603,239.196,122.000
...,...,...,...,...,...,...,...,...,...
9400,2020-02-08 16:16:43,0.225744,0.267921,2.91598,0.054711,88.8593,29.3792,219.542,126.000
9401,2020-02-08 16:16:44,0.205870,0.258370,1.71505,0.710565,89.1754,29.3692,221.862,126.000
9402,2020-02-08 16:16:45,0.219222,0.267244,2.35834,-0.273216,89.1306,29.3674,226.050,126.679
9403,2020-02-08 16:16:46,0.219481,0.271278,2.49108,0.054711,88.5447,29.3757,226.343,127.000


In [None]:
test_data

Unnamed: 0,datetime,Accelerometer1RMS,Accelerometer2RMS,Current,Pressure,Temperature,Thermocouple,Voltage,Volume Flow RateRMS,anomaly,changepoint
0,2020-03-09 15:56:30,0.027608,0.039203,1.290480,0.054711,68.6194,24.3670,241.062,32.0362,0.0,0.0
1,2020-03-09 15:56:31,0.027166,0.039940,1.285650,0.382638,68.5923,24.3660,238.709,32.9649,0.0,0.0
2,2020-03-09 15:56:32,0.027718,0.040167,1.155880,0.054711,68.5207,24.3666,226.485,32.0362,0.0,0.0
3,2020-03-09 15:56:33,0.028045,0.038026,0.971268,0.382638,68.5425,24.3634,220.378,32.9649,0.0,0.0
4,2020-03-09 15:56:34,0.027644,0.038580,1.072460,-0.273216,68.6569,24.3639,233.922,32.0000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
4307,2020-03-09 17:14:04,0.026853,0.038926,0.740614,0.054711,69.6371,24.1045,237.276,32.0451,0.0,0.0
4308,2020-03-09 17:14:05,0.027067,0.038430,0.988875,0.054711,69.6731,24.1046,230.729,32.9562,0.0,0.0
4309,2020-03-09 17:14:07,0.027582,0.038836,0.588439,0.054711,69.6959,24.1020,233.443,32.0000,0.0,0.0
4310,2020-03-09 17:14:08,0.027406,0.038133,0.989732,-0.273216,69.6293,24.1020,238.930,32.0000,0.0,0.0


In [37]:
X_train = train_data.drop(columns=['datetime'])
X_test = test_data.drop(columns=['datetime', 'anomaly', 'changepoint'], errors='ignore')

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

k = 2
knn = NearestNeighbors(n_neighbors=k)
knn.fit(X_train_scaled)

distances, _ = knn.kneighbors(X_test_scaled)
distance_threshold = np.mean(distances[:, -1]) + 2 * np.std(distances[:, -1])

test_data['predictions'] = distances[:, -1] > distance_threshold
test_data['predictions'] = test_data['predictions'].apply(lambda x: 1 if x else 0)

if 'anomaly' in test_data.columns:
    from sklearn.metrics import precision_score, recall_score, f1_score

    y_true = test_data['anomaly']
    y_pred = test_data['predictions']

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"nPrecision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

print("nTest Data with Predictions:")
print(test_data[['datetime', 'predictions', 'anomaly']].head())

nPrecision: 0.90
Recall: 0.05
F1 Score: 0.09
nTest Data with Predictions:
              datetime  predictions  anomaly
0  2020-03-09 15:56:30            0      0.0
1  2020-03-09 15:56:31            0      0.0
2  2020-03-09 15:56:32            0      0.0
3  2020-03-09 15:56:33            0      0.0
4  2020-03-09 15:56:34            0      0.0


In [56]:
train_data = pd.read_csv('anomaly-free.csv', sep=';')
test_files = ['0.csv', '1.csv', '2.csv', '3.csv']
test_data_list = [pd.read_csv(file, sep=';') for file in test_files]
test_data = pd.concat(test_data_list, ignore_index=True)


q1 = train_data['Temperature'].quantile(0.25)
q3 = train_data['Temperature'].quantile(0.75)


X_train = train_data.drop(columns=['datetime', 'changepoint'], errors='ignore')
y_train = train_data['Temperature']

X_test = test_data.drop(columns=['datetime', 'changepoint', 'anomaly'], errors='ignore')
y_test = test_data['Temperature']


model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

y_val_pred = model.predict(X_test)

out_of_bounds = [(1 if value < q1 or value > q3 else 0) for value in y_val_pred]
test_data['predictions'] = out_of_bounds


if 'anomaly' in test_data.columns:
    from sklearn.metrics import precision_score, recall_score, f1_score

    y_true = test_data['anomaly']
    y_pred = test_data['predictions']

    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    print(f"nPrecision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")


nPrecision: 0.35
Recall: 1.00
F1 Score: 0.52
