<a href="https://colab.research.google.com/github/jjuhyeok/Anomaly_Detection/blob/David/associationlearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [357]:
import os
import pandas as pd
import numpy as np

import plotly.express as px
import plotly.graph_objects as go

from google.colab import drive
from google.colab import files

#### **Mount Drive**

In [358]:
# Mount Google Drive into the Colab runtime at /content/drive.
drive.mount('/content/drive')
# Base folder for the project data; later cells join file names onto this path.
root = '/content/drive/My Drive/smartfactory'
# Left disabled: files are addressed via os.path.join(root, ...) instead of
# changing the working directory.
# os.chdir(root)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#### **Helper Functions**

In [359]:
def feature_engineering(df):
  """
      Append derived, physics-motivated features to the sensor frame.

      Twelve new columns are written onto ``df`` itself (the input is mutated)
      and that same object is returned. Reads the columns air_inflow,
      air_end_temp, motor_current, motor_vibe, motor_rpm and out_pressure.
  """
  # Reference constants shared by the formulas below.
  # NOTE(review): presumably ambient temperature, atmospheric pressure and air
  # density -- confirm the intended units against the data source.
  ref_temp = 25
  ref_pressure = 101.325
  density = 1.293
  # Looks like pi * r^2 with pi ~ 3.14 and r = 0.05 -- TODO confirm.
  duct_area = 3.14 * 0.05 * 0.05

  inflow = df['air_inflow']
  end_temp = df['air_end_temp']
  temp_rise = end_temp - ref_temp
  df["heat_rate"] = inflow * temp_rise

  current = df["motor_current"]
  vibration = df["motor_vibe"]
  df["power_consumption"] = current * vibration

  rpm = df["motor_rpm"]
  df["power_output"] = current * rpm
  df["compressor_efficiency"] = vibration / rpm

  df["compressor_temp_change"] = temp_rise
  df["compressor_heat_reject"] = density * temp_rise * inflow
  df["air_mass_flow"] = density * inflow

  velocity = inflow / duct_area
  df['air_velocity'] = velocity
  df['air_pressure'] = ref_pressure + 0.5 * density * (velocity**2)
  df["air_enthalpy"] = 700 * inflow * end_temp

  out_pressure = df["out_pressure"]
  df["compression_ratio"] = out_pressure / ref_pressure
  df["temp_pressure_ration"] = (ref_temp / end_temp) * (ref_pressure / out_pressure)
  return df



def assign_label(x, bins, label):
  """
      Map a value to the name of the half-open bin [lower, upper) containing it.

      The result is "<label>_<bin index>"; values that fall in no bin
      (below the first edge, at or above the last edge, or NaN) get "<label>_none".
  """
  for position, (lower, upper) in enumerate(zip(bins, bins[1:])):
    if lower <= x < upper:
      return label + "_" + str(position)

  return label + "_none"


def to_numeric(df):
  """
      Convert every value that parses as a number to a numeric type.

      Conversion is attempted value by value; anything that cannot be parsed
      is left exactly as it was. ``df`` is modified in place and returned.
  """

  def _coerce(value):
    # Explicit try/except replaces pd.to_numeric(..., errors='ignore'),
    # which pandas has deprecated; the per-value outcome is the same.
    try:
      return pd.to_numeric(value)
    except (ValueError, TypeError):
      return value

  for col in list(df.columns):
      df[col] = df[col].apply(_coerce)
  return df

def get_columns(df):
  """
      Return the column labels of df as a plain Python list
  """
  return [*df.columns]


def csv_download(df, filename):
  """
      Write df to "<filename>.csv" (without the index) and trigger a
      browser download of that file through google.colab.files
  """
  csv_path = f'{filename}.csv'
  df.to_csv(csv_path, index=False)
  files.download(csv_path)


def create_quantiles(how_many):
  """
      Build how_many + 1 evenly spaced quantile levels from 0 to 1.

      Each level is computed directly as i / how_many (rounded to 5 decimals)
      instead of accumulating a rounded increment, so the last level is always
      exactly 1.0. Accumulation drifted to 0.99999 or 1.00002 for counts such
      as 3 or 7, and levels above 1 are not valid quantiles.

      Raises ValueError if how_many is smaller than 1.
  """
  if how_many < 1:
    raise ValueError("how_many must be a positive integer")
  return [0] + [round(step / how_many, 5) for step in range(1, how_many + 1)]

#### **Load Data**

In [360]:
# Read both splits from the project folder on Drive.
train_path = os.path.join(root, "train_data.csv")
test_path = os.path.join(root, "test_data.csv")
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# Feature columns are everything except the "type" identifier.
columns = [name for name in get_columns(train) if name != "type"]

#### **Get Quantiles**

In [361]:
# Quantile grids at four resolutions (10, 20, 40 and 50 bins).
quantiles_1, quantiles_2, quantiles_3, quantiles_4 = (
    create_quantiles(n_bins) for n_bins in (10, 20, 40, 50)
)

In [362]:
def get_binning_range(df, columns, quantiles, idx):
  """
      Learn quantile bin edges from the selected rows and bin those rows.

      df        : frame with a "type" column plus the columns to bin
      columns   : names of the columns to bin
      quantiles : increasing quantile levels in [0, 1] used as bin edges
      idx       : iterable of "type" values; matching rows are stacked in
                  the order the types are listed

      Returns (ranges, df1): ranges maps each column to its list of edges,
      df1 is a copy of the selected rows with every column in `columns`
      replaced by its bin label (see assign_label). `df` is not modified.
  """

  def _widen_upper_edge(edge):
    # Bins are half-open [lower, upper), so the last edge must sit above the
    # column maximum or the maximum itself is labelled "_none".
    if edge > 0:
      return edge * 1.01  # same 1% head-room as before for positive maxima
    if edge < 0:
      return edge * 0.99  # multiplying a negative by 1.01 would lower the edge
    if edge == 0:
      return np.nextafter(0.0, 1.0)  # 0 * 1.01 == 0 would exclude the maximum
    return edge  # NaN (e.g. empty selection): leave untouched

  # One concat over all requested types. Growing a frame from an empty
  # pd.DataFrame() relies on deprecated dtype handling and copies repeatedly.
  subsets = [df[df['type'] == type_id] for type_id in idx]
  df1 = pd.concat(subsets, axis=0) if subsets else df.iloc[0:0].copy()

  ranges = {}
  for col in columns:
    bins = [df1[col].quantile(level) for level in quantiles]
    bins[-1] = _widen_upper_edge(bins[-1])
    df1[col] = df1[col].apply(assign_label, args=(bins, col))
    ranges[col] = bins

  return ranges, df1

In [363]:
def fit_binning_range(df, columns, ranges, idx=None):
  """
      Bin df with previously learned edges.

      df      : frame with a "type" column plus the columns to bin
      columns : names of the columns to bin
      ranges  : mapping column -> list of bin edges (as returned by
                get_binning_range)
      idx     : iterable of "type" values to keep, stacked in the listed
                order; None (the default) keeps every row of df

      Returns (features, types): the binned `columns` and a one-column
      frame holding "type" for the same rows. `df` is not modified.
  """
  if idx is None:
    # The default used to crash on `for ... in None`; treat it as "all rows".
    df1 = df.copy()
  else:
    # Single concat instead of growing a frame from an empty pd.DataFrame().
    subsets = [df[df["type"] == type_id] for type_id in idx]
    df1 = pd.concat(subsets, axis=0) if subsets else df.iloc[0:0].copy()

  for col in columns:
    df1[col] = df1[col].apply(assign_label, args=(ranges[col], col))

  return df1[columns], df1[["type"]]

In [364]:
def find_occurence_frequency(df1, df2, columns, occurence):
  """
      Given a row in df1 (test), find how many times it existed in df2 (train)
      Look at the features in columns
      Give higher probability to the less frequent rows

      df1       : binned test rows; its columns are paired with `columns`
                  by position
      df2       : binned train rows; must contain every paired column
      columns   : names of the df2 columns to compare against
      occurence : a row seen more than this many times scores 1 / count,
                  otherwise it scores 5 (the anomaly marker)

      Returns a list with one score per row of df1.
  """
  from collections import Counter

  if len(df1) == 0:
    return []

  # Positional pairing: the i-th value of a df1 row is compared with columns[i].
  keys = [columns[pos] for pos in range(df1.shape[1])]

  # Count every distinct train combination once, then look each test row up.
  # This replaces copying and re-filtering the whole train frame per test row.
  seen = Counter(map(tuple, df2[keys].values.tolist()))

  # A missing value never equals anything under ==, so such rows count as unseen.
  has_missing = df1.isna().any(axis=1).tolist()
  rows = map(tuple, df1.values.tolist())
  count = [0 if gap else seen[row] for row, gap in zip(rows, has_missing)]

  amplitude = [1/x if x > occurence else 5 for x in count]
  return amplitude

#### **Feature Engineering**

In [365]:
# Add the engineered features to both splits.
train, test = feature_engineering(train), feature_engineering(test)

# Refresh the feature list so it includes the new columns (still without "type").
columns = [name for name in get_columns(train) if name != "type"]

In [384]:
# Learn the bin edges from the train rows of these types
# (quantiles_1 is the 10-bin grid).
idx = [0, 4, 5, 6, 7]
ranges, df_train = get_binning_range(train, columns, quantiles_1, idx)

# Apply the learned edges to the test rows of this type.
# NOTE: idx is reassigned here; the plot titles below read this second value.
idx = [4]
df_test, df_test_type = fit_binning_range(test, columns, ranges, idx)

# A bin combination must appear more than this many times in train
# to not be treated as an anomaly.
occurence = 0

# Amplitude = how unlikely a test row is given the train rows:
# 1 / count when the combination was seen often enough, otherwise 5.
amplitude = find_occurence_frequency(df_test, df_train, columns, occurence)

In [385]:
# Amplitude per test row, coloured by equipment type.
fig = px.line(
    x=df_test.index.tolist(),
    y=amplitude,
    color=df_test_type["type"],
)
fig.update_layout(title=f"type {idx}")

##### **Candidate anomalies**

In [386]:
# Index labels of the test rows that received the anomaly score (5).
anomaly = [
    row_id
    for row_id, score in zip(df_test.index.tolist(), amplitude)
    if score == 5
]

In [387]:
# One row per test sample: its type and a 0/1 anomaly flag.
candidates = pd.DataFrame()
candidates["type"] = df_test_type["type"]
# The column was misspelled "labe", which left "label" as NaN for every
# non-anomalous row and added a stray column.
candidates["label"] = 0
candidates.loc[anomaly, "label"] = 1

In [388]:
# Anomaly flag per test row.
fig = px.line(
    x=df_test.index.tolist(),
    y=candidates["label"],
)
fig.update_layout(title=f"type {idx}")
fig.show()