In [1]:
import pandas as pd
import numpy as np
import mylibrary as lib
from sklearn.ensemble import IsolationForest
import plotly.express as px
import plotly.graph_objs as go
import plotly.io as pio
import matplotlib.pyplot as plt
from matplotlib.lines import Line2D
from sklearn.ensemble import IsolationForest
import random
import copy

In [2]:
# Load the raw data through the project's upload helper. The returned pair is
# unpacked into the data frame and (presumably) the source file name — confirm
# against lib.preprocess.upload.
df, filename = lib.preprocess.upload.upload_csv()

In [3]:
# Parse the timestamp column, then discard every row that has a missing value
# in any column.
df = df.assign(DateTime=pd.to_datetime(df['DateTime'])).dropna()

In [4]:
# Keep only the rows whose timestamp falls inside the analysed window
# (both bounds inclusive).
dt1 = pd.Timestamp('2023-07-14 00:00:04')
dt2 = pd.Timestamp('2023-07-14 23:59:04')
df['DateTime'] = pd.to_datetime(df['DateTime'])
df = df[df['DateTime'].between(dt1, dt2)]

In [6]:
# Hand the scaling helper a copy so the filtered frame `df` is not the object
# it receives.
df1 = df.copy() 
# Presumably rescales each signal to [0, 1] (the binning below assumes that
# range) — confirm against lib.preprocess.scale.
df1 = lib.preprocess.scale.apply_min_max_scale(df1)
# Average the readings into 2-second buckets keyed on DateTime; reset_index()
# turns DateTime back into an ordinary (first) column.
df_resampled = df1.resample('2s', on='DateTime').mean().reset_index()

In [7]:
def smoothing(x, k):
    """Snap a value in [0, 1] to the lower edge of one of ``k`` equal-width bins.

    The unit interval is split into ``k`` bins of width ``1 / k``. Bins are
    closed on both sides and searched from the lowest upwards, so a value that
    sits exactly on an interior edge is assigned to the lower of the two bins.

    Parameters
    ----------
    x : float
        Value to quantise; expected to lie in [0, 1].
    k : int
        Number of bins.

    Returns
    -------
    float or str
        ``i * (1 / k)`` for the first bin ``i`` that contains ``x``; NaN when
        ``x`` is NaN; the string ``"Invalid value"`` when ``x`` is a number
        outside [0, 1].
    """
    # Missing values stay missing. Returning the text sentinel for NaN would
    # turn a numeric column into an object column and hide the gap from a
    # later dropna().
    if x != x:
        return x

    width = 1 / k
    last = k - 1

    for i in range(k):
        min_val = i * width
        # k * (1 / k) can round to just below 1.0 (e.g. k = 49), which would
        # reject x == 1.0; pin the top edge of the last bin to exactly 1.
        max_val = 1.0 if i == last else (i + 1) * width

        if min_val <= x <= max_val:
            return min_val

    return "Invalid value"

def apply_smoothing_to_dataframe(df, k):
    """Return a copy of ``df`` whose value columns are replaced by binned ones.

    Every column other than ``'DateTime'`` is passed element-wise through
    ``smoothing(value, k)`` and stored as ``'<column>_binned'``; the source
    column is then removed. The input frame itself is not modified.
    """
    binned = df.copy()
    # Snapshot of the column labels taken before any column is added/removed.
    value_columns = [name for name in binned.columns if name != 'DateTime']

    for name in value_columns:
        quantised = binned[name].apply(lambda value: smoothing(value, k))
        binned[f'{name}_binned'] = quantised
        del binned[name]

    return binned

In [8]:
# Number of equal-width bins used to quantise each signal.
k = 200
# NOTE: this rebinds df_resampled, so the cell is not idempotent. Running it a
# second time bins the already-binned columns again (yielding
# '*_binned_binned' columns); re-run the resampling cell first.
df_resampled = apply_smoothing_to_dataframe(df_resampled, k)

Unnamed: 0,DateTime,Hid_Pomp_Balans,Hid_Pomp_Motor_Rulman,Hid_emn_bas_act,Hid_Pomp_Balans_binned,Hid_Pomp_Motor_Rulman_binned,Hid_emn_bas_act_binned
0,2023-07-14 00:00:04,0.407407,0.352941,0.533594,0.405,0.35,0.53
1,2023-07-14 00:00:06,0.407407,0.352941,0.533594,0.405,0.35,0.53
2,2023-07-14 00:00:08,0.444444,0.352941,0.533594,0.440,0.35,0.53
3,2023-07-14 00:00:10,0.444444,0.352941,0.533594,0.440,0.35,0.53
4,2023-07-14 00:00:12,0.444444,0.352941,0.533594,0.440,0.35,0.53
...,...,...,...,...,...,...,...
43166,2023-07-14 23:58:56,0.617284,0.294118,0.533594,0.615,0.29,0.53
43167,2023-07-14 23:58:58,0.617284,0.294118,0.533594,0.615,0.29,0.53
43168,2023-07-14 23:59:00,0.617284,0.294118,0.533594,0.615,0.29,0.53
43169,2023-07-14 23:59:02,0.617284,0.294118,0.533594,0.615,0.29,0.53


In [9]:
# Every column after the first one (DateTime, put first by reset_index) is
# treated as a sensor signal.
sensors = df_resampled.columns[1:]
sensors

Index(['Hid_Pomp_Balans', 'Hid_Pomp_Motor_Rulman', 'Hid_emn_bas_act',
       'Hid_Pomp_Balans_binned', 'Hid_Pomp_Motor_Rulman_binned',
       'Hid_emn_bas_act_binned'],
      dtype='object')

In [10]:
# Frame used by the experiments: the binned data without rows that contain a
# missing value. dropna() returns a new frame, so df_resampled is left intact.
df1 = df_resampled.dropna()

Unnamed: 0,DateTime,Hid_emn_bas_act_binned
0,2023-07-14 00:00:04,0.53
1,2023-07-14 00:00:06,0.53
2,2023-07-14 00:00:08,0.53
3,2023-07-14 00:00:10,0.53
4,2023-07-14 00:00:12,0.53
...,...,...
43166,2023-07-14 23:58:56,0.53
43167,2023-07-14 23:58:58,0.53
43168,2023-07-14 23:59:00,0.53
43169,2023-07-14 23:59:02,0.53


In [None]:
# Visual sanity check of the prepared frame using the project's line-chart helper.
lib.plot.linechart.plot_line_chart(df1)

### Functions

In [None]:
# anomaly_set: set of all ground-truth anomaly indices (flattened, 1-D)
# outliers_set: set of indices predicted as outliers (1-D)
# anomaly_points: list of waves, each wave being a sequence of anomaly indices
def intersection_multi(anomaly_set, outliers_set, anomaly_points, threshold):
    """Return the waves that were hit by at least one predicted outlier.

    A wave counts as hit when it contains an index from ``anomaly_set`` whose
    distance to some index in ``outliers_set`` is strictly less than
    ``threshold``.

    Parameters
    ----------
    anomaly_set : set
        Ground-truth anomaly indices.
    outliers_set : set
        Predicted outlier indices.
    anomaly_points : iterable of sequences
        The waves; each is converted to a tuple, so identical waves collapse
        into one.
    threshold : number
        Exclusive upper bound on ``abs(anomaly_index - outlier_index)``.

    Returns
    -------
    set of tuple
        The hit waves, each as a tuple of its indices.
    """
    waves = {tuple(wave) for wave in anomaly_points}

    # Whether an index is close to a prediction does not depend on the wave,
    # so decide it once per index instead of once per (index, wave) pair.
    matched = {
        point for point in anomaly_set
        if any(abs(point - outlier) < threshold for outlier in outliers_set)
    }

    # Every wave containing a matched index is reported. Stopping at the first
    # such wave would make the result depend on set iteration order whenever
    # two waves share an index.
    return {wave for wave in waves if not matched.isdisjoint(wave)}

In [None]:
# anomaly_points: list of waves ([[...], [...], ...]), each a list of anomaly indices
# outliers: 1-D collection of indices predicted as outliers
def get_metrics_multi(anomaly_points, outliers, threshold):
    """Compute wave-level precision and recall for a set of predicted outliers.

    A wave is treated as detected when ``intersection_multi`` reports it; every
    distinct index of a detected wave then counts as a true positive.

    Parameters
    ----------
    anomaly_points : list of lists
        Ground-truth waves of anomaly indices.
    outliers : 1-D iterable
        Predicted outlier indices (elements must be hashable).
    threshold : number
        Matching tolerance forwarded to ``intersection_multi``.

    Returns
    -------
    (float, float)
        ``(precision, recall)``; a ratio whose denominator is empty is
        reported as 0.0.
    """
    flat_anomaly_points = [point for arr in anomaly_points for point in arr]
    anomaly_set = set(flat_anomaly_points)
    outliers_set = set(outliers)

    intersection = intersection_multi(anomaly_set, outliers_set, anomaly_points, threshold)
    # Count distinct indices: summing wave lengths would count an index twice
    # when waves overlap or a wave repeats an index, pushing recall above 1.
    tp = len(set().union(*intersection))

    # Diagnostics: number of predictions, number of true anomaly indices, TP.
    print(len(outliers_set))
    print(len(anomaly_set))
    print(tp)

    # NOTE(review): tp counts ground-truth indices of detected waves while the
    # precision denominator counts predictions, so precision can exceed 1 when
    # few predictions hit long waves — confirm this is the intended metric.
    precision = tp / len(outliers_set) if outliers_set else 0.0
    recall = tp / len(anomaly_set) if anomaly_set else 0.0

    return precision, recall

In [None]:
def get_results(df, noise, anomaly_count, anomaly_normal_ratio, wave_length, df_results):
    """Inject synthetic anomalies, score an Isolation Forest on them, append the metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied before use. The first column is skipped when
        building the model input and all remaining columns are used as features.
    noise, anomaly_normal_ratio, wave_length
        Forwarded to ``lib.preprocess.anomaly.apply_anomaly``. ``wave_length``
        also sets the matching tolerance used for the metrics.
    anomaly_count
        Only recorded in the result row.
    df_results : pandas.DataFrame
        Table of earlier results.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``df_results`` with one extra row for this configuration.
    """
    df2 = df.copy()
    df2, anomaly_points2 = lib.preprocess.anomaly.apply_anomaly(
        df2,
        noise=noise,
        anomaly_normal_ratio=anomaly_normal_ratio,
        noise_direction='n',
        random_state=300,
        wave=True,
        wave_length=wave_length,
    )
    # lib.preprocess.anomaly.plot_anomaly(df2, anomaly_points2)

    # Hard-coded extra wave (indices between 42441 and 42549) added to the
    # ground truth of this column.
    # NOTE(review): presumably a known real anomaly in the source data — confirm.
    anomaly_points2['Hid_emn_bas_act_binned'].append([42549, 42548, 42441, 42457, 42547, 42442, 42456, 42527, 42443,
                                    42546, 42455, 42545, 42444, 42544, 42528, 42543, 42542, 42541,
                                    42540, 42539, 42538, 42529, 42537, 42536, 42454, 42535, 42530,
                                    42534, 42445, 42533, 42531, 42532, 42453, 42446, 42452, 42451,
                                    42447, 42450, 42449, 42448])

    # Only the waves stored under the first key are evaluated.
    # NOTE(review): the wave appended above is ignored unless
    # 'Hid_emn_bas_act_binned' is that first key — confirm.
    anomaly_points2 = anomaly_points2[next(iter(anomaly_points2))]

    # Expected share of anomalous rows, used as the forest's contamination.
    b = sum(len(arr) for arr in anomaly_points2)
    contamination = round(b/len(df2), 6)

    # Fit and predict on the same 2-D feature frame (every column except the
    # first). Selecting one column as df2[col] yields a 1-D Series, which
    # scikit-learn estimators reject, and fitting on a different column set
    # than the one predicted on fails with a feature mismatch.
    features = df2[df2.columns[1:]]

    model = IsolationForest(random_state=300, contamination=contamination)
    model.fit(features)

    anomaly_predictions = model.predict(features)

    # predict() labels outliers as -1; keep their positional indices.
    anomaly_indices = np.where(anomaly_predictions == -1)[0]

    # Tolerance is twice the wave length, with a floor of 20.
    threshold = max(wave_length * 2, 20)

    precision, recall = get_metrics_multi(anomaly_points2, anomaly_indices, threshold)

    row = {'data': 77, 'noise': noise, 'anomaly count': anomaly_count,
           'wave len': wave_length, 'precision': precision, 'recall': recall,
           'anomaly_normal_ratio': anomaly_normal_ratio, 'noise direction': 'n'}

    df_results = pd.concat([df_results, pd.DataFrame([row])], ignore_index=True)
    return df_results




# Isolation Forest

In [None]:
# Empty results table; get_results appends one row per experiment configuration.
result_columns = [
    'data',
    'noise',
    'anomaly count',
    'wave len',
    'precision',
    'recall',
    'anomaly_normal_ratio',
    'noise direction',
]
df_results = pd.DataFrame(columns=result_columns)

In [None]:
# Parameter grid for the Isolation Forest experiment.
wave_lengths = [1, 15, 60]
noises = [0.5, 0.25, 0.1]
anomaly_counts = [1, 5, 10]

# anomaly_normal_ratio passed to get_results for each anomaly count.
anomaly_normal_ratios = {
    1: 0.000025,
    5: 0.00013,
    10: 0.00024,
}

In [None]:
# Run every (wave length, noise, anomaly count) combination; wave length varies
# slowest and anomaly count fastest.
grid = [
    (w, n, c)
    for w in wave_lengths
    for n in noises
    for c in anomaly_counts
]
for wave_length, noise, anomaly_count in grid:
    anomaly_normal_ratio = anomaly_normal_ratios[anomaly_count]
    df_results = get_results(df1, noise, anomaly_count, anomaly_normal_ratio, wave_length, df_results)
            

In [None]:
# Persist the metrics table; index=False keeps the row index out of the file.
df_results.to_csv("if_results.csv", index = False)

In [None]:
# Display the collected metrics, one row per experiment configuration.
df_results