<a href="https://colab.research.google.com/github/jjuhyeok/Anomaly_Detection/blob/David/associationlearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
from collections import Counter

import numpy as np
import pandas as pd

#### **Mount Drive**

In [11]:
from google.colab import drive
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')
# Folder that the data-loading cell reads train_data.csv / test_data.csv from.
root = '/content/drive/My Drive/smartfactory'
# Disabled: file paths are built with os.path.join(root, ...) instead of
# changing the working directory.
# os.chdir(root)

Mounted at /content/drive


#### **Helper Functions**

In [12]:
def feature_engineering(df, reference_temp=25, air_density=1.293,
                        reference_pressure=101.325, pipe_radius=0.05,
                        enthalpy_coeff=700):
  """
      Add twelve derived compressor features to df IN PLACE and return df.

      Required input columns: air_inflow, air_end_temp, out_pressure,
      motor_current, motor_rpm, motor_vibe.

      The constants that used to be written inline are now keyword
      parameters; their defaults reproduce the previous results exactly.

      Parameters
      ----------
      df : pandas.DataFrame
          Frame holding the required columns. It is mutated.
      reference_temp : number
          Value subtracted from air_end_temp (presumably the inlet/ambient
          temperature -- TODO confirm the unit with the data owner).
      air_density : number
          Factor applied to air_inflow to obtain air_mass_flow
          (presumably the density of air -- TODO confirm).
      reference_pressure : number
          Pressure that out_pressure is compared against
          (presumably atmospheric pressure -- TODO confirm the unit).
      pipe_radius : number
          Radius used for the circular cross-section in air_velocity.
      enthalpy_coeff : number
          Multiplier used for air_enthalpy.

      Returns
      -------
      pandas.DataFrame
          The same object that was passed in, with the new columns appended.

      Notes
      -----
      A zero in motor_rpm, air_end_temp or out_pressure yields inf/NaN in
      the ratio features (pandas division semantics); nothing is raised.
  """
  df["heat_rate"] = df['air_inflow'] * (df['air_end_temp'] - reference_temp)
  df["power_consumption"] = df["motor_current"] * df["motor_vibe"]
  df["power_output"] = df["motor_current"] * df["motor_rpm"]
  df["compressor_efficiency"] = df["motor_vibe"] / df["motor_rpm"]
  df["compressor_temp_change"] = df["air_end_temp"] - reference_temp
  df["compressor_heat_reject"] = air_density * df["compressor_temp_change"] * df["air_inflow"]
  df["air_mass_flow"] = air_density * df["air_inflow"]
  # 3.14 (not math.pi) is kept on purpose so existing feature values do not shift.
  df['air_velocity'] = df["air_inflow"] / (3.14 * pipe_radius * pipe_radius)
  df['air_pressure'] = reference_pressure + 0.5 * air_density * (df["air_velocity"]**2)
  df["air_enthalpy"] = enthalpy_coeff * df["air_inflow"] * df["air_end_temp"]
  df["compression_ratio"] = df["out_pressure"] / reference_pressure
  # The column name keeps its historical spelling ("ration") because
  # downstream code may look it up by that exact key.
  df["temp_pressure_ration"] = (reference_temp / df["air_end_temp"]) * (reference_pressure / df["out_pressure"])
  return df



def assign_label(x, bins, label):
  """
      Map a value to the name of the bin it falls into.

      bins is a sequence of edges; consecutive edges form half-open
      intervals [lower, upper). The first interval containing x wins and
      the result is "<label>_<interval position>".
      Values outside every interval (NaN included) get "<label>_none".
  """
  edge_pairs = zip(bins, bins[1:])
  for position, (lower, upper) in enumerate(edge_pairs):
    if lower <= x < upper:
      suffix = str(position)
      return label + "_" + suffix

  return label + "_none"


def to_numeric(df):
  """
      Convert every value that parses as a number to a numeric type.

      Works value by value: entries that cannot be parsed are left
      exactly as they were, so a column may end up holding a mix of
      numbers and the original objects. df is modified in place and
      returned.

      The conversion no longer relies on pd.to_numeric(errors='ignore'),
      which pandas has deprecated; the same "keep the value on failure"
      behaviour is implemented with an explicit try/except.
  """

  def _coerce(value):
    # Keep the original value when pandas cannot interpret it as a number.
    try:
      return pd.to_numeric(value)
    except (ValueError, TypeError):
      return value

  for col in list(df.columns):
      df[col] = df[col].apply(_coerce)
  return df

def get_columns(df):
  """
      Return the column labels of df as a plain Python list
  """
  return [name for name in df.columns]

#### **Load Data**

In [13]:
# Read both splits from the mounted folder.
train = pd.read_csv(os.path.join(root, "train_data.csv"))
test = pd.read_csv(os.path.join(root, "test_data.csv"))
# Feature columns = every training column except the "type" identifier.
columns = [name for name in get_columns(train) if name != "type"]

#### **Get Quantiles**

In [14]:
# 21 evenly spaced quantile levels from 0 to 1 in steps of 0.05 (i.e. 20 bins).
# An earlier, commented-out variant of this cell used 0.1 steps instead.
quantiles = [step / 20 for step in range(21)]

In [15]:
def get_binning_range(df, columns, quantiles, idx):
  """
      Get the binning ranges/edges and label the selected rows with them.

      Parameters
      ----------
      df : pandas.DataFrame
          Source data; must contain a "type" column and every name in
          columns. It is not modified.
      columns : list
          Columns to bin.
      quantiles : list of float
          Quantile levels (each within [0, 1]) used as bin edges.
      idx : iterable
          "type" values whose rows are used. Rows are stacked in the
          order the values appear in idx.

      Returns
      -------
      ranges : dict
          column name -> list of bin edges (one per quantile level).
      df1 : pandas.DataFrame
          The selected rows, with each column in columns replaced by the
          bin label produced by assign_label.

      Raises
      ------
      ValueError
          If idx is empty (there is nothing to derive quantiles from).

      Notes
      -----
      assign_label uses half-open bins [low, high), so the top edge has
      to sit strictly above the column maximum or the maximum itself
      would be labelled "_none". Positive maxima are scaled by 1.01 as
      before. Scaling a negative maximum by 1.01 would move the edge
      *below* the maximum, and scaling 0 does nothing, so those two cases
      are widened towards +inf explicitly.
  """
  # Select once and concatenate once instead of growing a frame in a loop.
  parts = [df[df['type'] == type_id] for type_id in idx]
  if not parts:
    raise ValueError("idx must contain at least one type value")
  df1 = pd.concat(parts, axis=0)

  ranges = {}
  for col in columns:
    # Edges are computed before the column is overwritten with labels.
    bins = [df1[col].quantile(level) for level in quantiles]

    top = bins[-1]
    if top < 0:
      bins[-1] = top * 0.99                      # 1% closer to zero => larger
    elif top == 0:
      bins[-1] = float(np.nextafter(0.0, 1.0))   # smallest value above zero
    else:
      bins[-1] = top * 1.01                      # positive (NaN stays NaN)

    df1[col] = df1[col].apply(lambda x: assign_label(x, bins, col))
    ranges[col] = bins

  return ranges, df1

In [16]:
def fit_binning_range(df, columns, ranges, idx=None):
  """
      Use the given ranges to bin df.

      Parameters
      ----------
      df : pandas.DataFrame
          Data to bin; must contain a "type" column and every name in
          columns. It is not modified.
      columns : list
          Columns to replace by bin labels.
      ranges : dict
          column name -> list of bin edges, as returned by
          get_binning_range.
      idx : iterable or None
          "type" values to keep, stacked in the order given. None (the
          default) keeps every row; previously the default raised a
          TypeError because None was iterated directly. An empty
          iterable yields empty results.

      Returns
      -------
      (features, types) : tuple of pandas.DataFrame
          features holds the labelled columns; types is the
          single-column frame ["type"] for the same rows.
  """
  if idx is None:
    df1 = df.copy()
  else:
    # Select once and concatenate once instead of growing a frame in a loop.
    parts = [df[df["type"] == type_id] for type_id in idx]
    df1 = pd.concat(parts, axis=0) if parts else df.iloc[0:0].copy()

  for col in columns:
    bins = ranges[col]
    df1[col] = df1[col].apply(lambda x: assign_label(x, bins, col))

  return df1[columns], df1[["type"]]

In [17]:
def find_occurence_frequency(df1, df2, columns):
  """
      Given a row in df1 (test), find how many times it existed in df2 (train)
      Look at the features in columns
      Give higher probability to the less frequent rows

      Parameters
      ----------
      df1 : pandas.DataFrame
          Rows to score. Its columns are matched to columns BY POSITION
          (first df1 column against columns[0], and so on).
      df2 : pandas.DataFrame
          Reference rows; must contain the names in columns. Not modified.
      columns : list
          Column names of df2 that make up a row's identity.

      Returns
      -------
      list of float
          1 / (count + 0.01) per row of df1, where count is the number of
          df2 rows equal to it on every compared column. A row that never
          occurs scores 100; rows with a missing value never match.

      Raises
      ------
      ValueError
          If df1 has more columns than there are names in columns.

      Notes
      -----
      The reference rows are counted once up front and each test row is a
      single dictionary lookup, instead of copying and re-filtering df2
      for every test row.
  """
  n_features = df1.shape[1]
  names = list(columns)
  if n_features > len(names):
    raise ValueError(
        "df1 has %d columns but only %d names were given in columns"
        % (n_features, len(names)))
  names = names[:n_features]

  if not names:
    # Nothing to compare on: every reference row counts as a match.
    counts = [len(df2)] * len(df1)
  else:
    seen = Counter(zip(*(df2[name].tolist() for name in names)))
    rows = zip(*(df1.iloc[:, pos].tolist() for pos in range(n_features)))
    counts = [
        # A missing value equals nothing, so such a row has no matches.
        0 if any(pd.isna(value) for value in row) else seen[row]
        for row in rows
    ]

  return [1/(count+0.01) for count in counts]

In [27]:
# Derive the bin edges from the training rows of types 0-7 and label those rows.
idx = [0, 1, 2, 3, 4, 5, 6, 7]  # Get range on
ranges, df_train = get_binning_range(train, columns, quantiles, idx)

# Re-use the training edges to bin only the selected test types.
# NOTE: `idx` is rebound here and read again by the plotting cell.
idx = [0, 5]                    # Fit range on
df_test, df_test_type = fit_binning_range(test, columns, ranges, idx)
# Score each binned test row by how rarely its bin combination occurs in df_train.
prob = find_occurence_frequency(df_test, df_train, columns)

In [28]:
import plotly.express as px
import plotly.graph_objects as go

# x: index labels of the binned test rows, y: their rarity score,
# one coloured line per value of "type".
fig = px.line(x=df_test.index.tolist(), y=prob, color=df_test_type["type"])
# The title shows the test types currently held in `idx`.
fig.update_layout(title="type " + str(idx))

##### **Create Submission File**

In [None]:
# Submission skeleton: the "type" of every test row plus a label that starts at 0.
trial = pd.DataFrame({"type": test["type"], "label": 0})

##### **Find and label anomalies**

In [37]:
# Rows whose bin combination was seen at most `occurence` times are anomalies.
occurence = 0
# Same count -> score transform that find_occurence_frequency applies.
threshold = 1 / (occurence + 0.01)
anomaly = [
  row_label
  for row_label, score in zip(df_test.index.tolist(), prob)
  if score >= threshold
]

In [38]:
# `anomaly` holds index labels taken from df_test; the matching rows of
# trial are flagged with label 1, all other rows keep their existing label.
trial.loc[anomaly, "label"] = 1

##### **Download**

In [None]:
from google.colab import files
# Write the submission without the index column, then pass the file to
# Colab's download helper.
trial.to_csv('trial.csv', index=False)
files.download('trial.csv')