<a href="https://colab.research.google.com/github/innovateDotAI/outlier_system/blob/main/OneSVM_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

For details on the SVM algorithm, please refer to [Support Vector Machines for Machine Learning](https://machinelearningmastery.com/support-vector-machines-for-machine-learning/)

Additional references used:
[One-Class Classification Algorithms for Imbalanced Datasets](https://machinelearningmastery.com/one-class-classification-algorithms/)

# 1. Load the Library

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
# Print NumPy arrays with 3 decimal places and without scientific notation for small values.
np.set_printoptions(precision=3, suppress=True)


# 2. Load the data

In [2]:
# Download the `ai4i2020.csv` dataset from the UCI repository and load it with pandas.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00601/ai4i2020.csv'

# Short column names that replace the header row shipped in the CSV file.
col_name = [
    'UID', 'PID', 'Type',
    'airTemp', 'processTemp', 'rotationalSpeed', 'torque', 'toolwear',
    'machineFailure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF',
]

# header=0 discards the file's own header line in favour of `col_name`.
# '?' is parsed as NaN, and anything after a tab character on a line is
# treated as a comment and ignored.
raw_dataset = pd.read_csv(
    url,
    names=col_name,
    header=0,
    na_values='?',
    comment='\t',
    sep=',',
    skipinitialspace=True,
)

In [3]:
# Preview the first rows to check that the renamed columns line up with the data.
raw_dataset.head()

Unnamed: 0,UID,PID,Type,airTemp,processTemp,rotationalSpeed,torque,toolwear,machineFailure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0


In [4]:
# Drop the columns that will not be part of the feature vector.
# `drop` returns a new frame, so `raw_dataset` itself is left untouched.
columns_to_drop = ['UID', 'PID', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF']
rawData = raw_dataset.drop(columns=columns_to_drop)
rawData.head()

Unnamed: 0,Type,airTemp,processTemp,rotationalSpeed,torque,toolwear,machineFailure
0,M,298.1,308.6,1551,42.8,0,0
1,L,298.2,308.7,1408,46.3,3,0
2,L,298.1,308.5,1498,49.4,5,0
3,L,298.2,308.6,1433,39.5,7,0
4,L,298.2,308.7,1408,40.0,9,0


# 3. Data Preprocessing
## 3.1 Missing value investigation

In [5]:
# Count the missing values in each column.
rawData.isna().sum()

Type               0
airTemp            0
processTemp        0
rotationalSpeed    0
torque             0
toolwear           0
machineFailure     0
dtype: int64

No missing values were found.

In [6]:
# Relabel the target to the convention used by OneClassSVM.predict:
#   inliers  (no failure, originally 0) ->  1
#   outliers (failure,    originally 1) -> -1
# The labels are always derived from the untouched `raw_dataset` column, so
# re-running this cell gives the same result. Rewriting `rawData` in place
# would turn every label into -1 on a second run, because the inlier value 1
# assigned by the first run matches the "== 1" failure test.
# A dict passed to `replace` maps both values in one pass; any other value
# would be left unchanged.
rawData["machineFailure"] = raw_dataset["machineFailure"].replace({1: -1, 0: 1})

In [7]:
# Check that the relabelled target now only contains 1 (inlier) and -1 (outlier).
rawData["machineFailure"].unique()

array([ 1, -1])

In [8]:
# Show the relabelled frame (pandas truncates the display to the first and last rows).
rawData

Unnamed: 0,Type,airTemp,processTemp,rotationalSpeed,torque,toolwear,machineFailure
0,M,298.1,308.6,1551,42.8,0,1
1,L,298.2,308.7,1408,46.3,3,1
2,L,298.1,308.5,1498,49.4,5,1
3,L,298.2,308.6,1433,39.5,7,1
4,L,298.2,308.7,1408,40.0,9,1
...,...,...,...,...,...,...,...
9995,M,298.8,308.4,1604,29.5,14,1
9996,H,298.9,308.4,1632,31.8,17,1
9997,M,299.0,308.6,1645,33.4,22,1
9998,H,299.0,308.7,1408,48.5,25,1


## 3.2 One Hot encoding

In [9]:
# One-hot encode the categorical `Type` column into the indicator columns
# Type_is_H / Type_is_L / Type_is_M.
# The dtype is pinned to uint8 because the default differs between pandas
# releases (uint8 before 2.0, bool from 2.0 on). A fixed numeric dtype keeps
# the `.values` arrays handed to the scaler numeric on any version.
raw_dataset_oneHot = pd.get_dummies(
    rawData, columns=["Type"], prefix=["Type_is"], dtype="uint8"
)
raw_dataset_oneHot

Unnamed: 0,airTemp,processTemp,rotationalSpeed,torque,toolwear,machineFailure,Type_is_H,Type_is_L,Type_is_M
0,298.1,308.6,1551,42.8,0,1,0,0,1
1,298.2,308.7,1408,46.3,3,1,0,1,0
2,298.1,308.5,1498,49.4,5,1,0,1,0
3,298.2,308.6,1433,39.5,7,1,0,1,0
4,298.2,308.7,1408,40.0,9,1,0,1,0
...,...,...,...,...,...,...,...,...,...
9995,298.8,308.4,1604,29.5,14,1,0,0,1
9996,298.9,308.4,1632,31.8,17,1,1,0,0
9997,299.0,308.6,1645,33.4,22,1,0,0,1
9998,299.0,308.7,1408,48.5,25,1,1,0,0


In [10]:
# Confirm the encoding step left the 1 / -1 target labels unchanged.
raw_dataset_oneHot["machineFailure"].unique()

array([ 1, -1])

# 4. Split the data based on class

In [11]:
# Number of rows per class (1 = normal, -1 = machine failure).
class_counts = raw_dataset_oneHot.groupby('machineFailure', as_index=True).agg(
    {"machineFailure": "count"}
)
class_counts

Unnamed: 0_level_0,machineFailure
machineFailure,Unnamed: 1_level_1
-1,339
1,9661


The normal class is `machineFailure = 0` in the raw data (relabelled to `1` above), and the outlier class is `machineFailure = 1` in the raw data (relabelled to `-1`).

In [12]:
# Partition the encoded frame by class label:
#   *_MFFalse -> normal rows  (label  1)
#   *_MFTrue  -> failure rows (label -1)
failure_label = raw_dataset_oneHot['machineFailure']
raw_dataset_oneHot_MFFalse = raw_dataset_oneHot[failure_label == 1]
raw_dataset_oneHot_MFTrue = raw_dataset_oneHot[failure_label == -1]

In [13]:
# Show the normal-class rows (pandas truncates the display to the first and last rows).
raw_dataset_oneHot_MFFalse

Unnamed: 0,airTemp,processTemp,rotationalSpeed,torque,toolwear,machineFailure,Type_is_H,Type_is_L,Type_is_M
0,298.1,308.6,1551,42.8,0,1,0,0,1
1,298.2,308.7,1408,46.3,3,1,0,1,0
2,298.1,308.5,1498,49.4,5,1,0,1,0
3,298.2,308.6,1433,39.5,7,1,0,1,0
4,298.2,308.7,1408,40.0,9,1,0,1,0
...,...,...,...,...,...,...,...,...,...
9995,298.8,308.4,1604,29.5,14,1,0,0,1
9996,298.9,308.4,1632,31.8,17,1,1,0,0
9997,299.0,308.6,1645,33.4,22,1,0,0,1
9998,299.0,308.7,1408,48.5,25,1,1,0,0


In [14]:
# Summary statistics of the normal-class rows, transposed so each feature is one row.
normal_class_summary = raw_dataset_oneHot_MFFalse.describe()
normal_class_summary.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
airTemp,9661.0,299.973999,1.990748,295.3,298.3,300.0,301.5,304.5
processTemp,9661.0,309.99557,1.486846,305.7,308.8,310.0,311.1,313.8
rotationalSpeed,9661.0,1540.260014,167.394734,1168.0,1429.0,1507.0,1615.0,2695.0
torque,9661.0,39.629655,9.47208,12.6,33.1,39.9,46.3,70.0
toolwear,9661.0,106.693717,62.94579,0.0,52.0,107.0,160.0,246.0
machineFailure,9661.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
Type_is_H,9661.0,0.101646,0.302198,0.0,0.0,0.0,0.0,1.0
Type_is_L,9661.0,0.596729,0.49058,0.0,0.0,1.0,1.0,1.0
Type_is_M,9661.0,0.301625,0.458987,0.0,0.0,0.0,1.0,1.0


# 6. Create Training and Test Set

In [15]:
# Split the normal-class rows into training and test sets.
# 80 % of the rows are sampled for training; the fixed seed makes the draw repeatable.
train_dataset = raw_dataset_oneHot_MFFalse.sample(frac=0.8, random_state=0)
# Every row whose index was not sampled into the training set goes to the test set.
test_dataset = raw_dataset_oneHot_MFFalse.drop(index=train_dataset.index)

In [16]:
# (rows, columns) of the training split; the label column is still included here.
train_dataset.shape

(7729, 9)

In [17]:
# (rows, columns) of the held-out normal-class test split.
test_dataset.shape

(1932, 9)

In [18]:
# Column dtypes and non-null counts of the training split.
train_dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7729 entries, 1896 to 9964
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   airTemp          7729 non-null   float64
 1   processTemp      7729 non-null   float64
 2   rotationalSpeed  7729 non-null   int64  
 3   torque           7729 non-null   float64
 4   toolwear         7729 non-null   int64  
 5   machineFailure   7729 non-null   int64  
 6   Type_is_H        7729 non-null   uint8  
 7   Type_is_L        7729 non-null   uint8  
 8   Type_is_M        7729 non-null   uint8  
dtypes: float64(3), int64(3), uint8(3)
memory usage: 445.3 KB


In [21]:
# Separate the model inputs from the target column.
# `drop` returns a new frame without the label, so the original splits are not modified.
train_features = train_dataset.drop(columns='machineFailure')
test_features = test_dataset.drop(columns='machineFailure')

# The labels are kept as independent Series, one per split.
train_labels = train_dataset['machineFailure'].copy()
test_labels = test_dataset['machineFailure'].copy()

In [22]:
# Label array of the training split; it was sampled from the normal class only, so every entry is 1.
train_labels.values

array([1, 1, 1, ..., 1, 1, 1])

# 5. Normalization
## Normal training data (original `machineFailure == 0`), standardized to zero mean and unit variance

In [23]:
# Fit the scaler on the normal-class training features only; the same fitted
# scaler is reused for the other sets so they are transformed with the
# training statistics.
# `StandardScaler` comes from the notebook's import cell.
scaler = StandardScaler(with_mean=True, with_std=True).fit(train_features.values)
X_train_scaled = scaler.transform(train_features.values)
# Sanity check: each column should now have mean ~0 and standard deviation ~1.
print(X_train_scaled.mean(axis=0))
print(X_train_scaled.std(axis=0))
X_train_scaled

[-0. -0.  0.  0.  0. -0. -0.  0.]
[1. 1. 1. 1. 1. 1. 1. 1.]


array([[-1.04 , -1.681,  0.213, ..., -0.34 ,  0.827, -0.659],
       [ 1.318,  1.548,  0.165, ..., -0.34 , -1.209,  1.518],
       [-0.889, -1.479, -0.985, ..., -0.34 ,  0.827, -0.659],
       ...,
       [-0.087, -0.537, -0.6  , ..., -0.34 ,  0.827, -0.659],
       [ 0.365,  0.673, -1.22 , ..., -0.34 ,  0.827, -0.659],
       [-0.839, -1.412,  1.37 , ..., -0.34 ,  0.827, -0.659]])

In [24]:
# The label column has been removed, so only the 8 feature columns remain.
X_train_scaled.shape

(7729, 8)

## Normal test data (original `machineFailure == 0`), scaled with the training-set statistics

In [25]:
# Transform the held-out normal rows with the scaler fitted on the training data.
X_test_scaled = scaler.transform(test_features.to_numpy())
# Column-wise mean and standard deviation of the transformed test set.
for column_stat in (X_test_scaled.mean(axis=0), X_test_scaled.std(axis=0)):
    print(column_stat)
X_test_scaled

[ 0.003 -0.011  0.051 -0.047 -0.032 -0.037  0.032 -0.009]
[0.993 1.    1.038 1.012 0.98  0.95  0.993 0.996]


array([[-0.94 , -0.941,  0.075, ..., -0.34 , -1.209,  1.518],
       [-0.588, -0.47 , -0.533, ..., -0.34 ,  0.827, -0.659],
       [-0.488, -0.336,  1.942, ..., -0.34 ,  0.827, -0.659],
       ...,
       [-0.789, -1.277,  2.122, ...,  2.937, -1.209, -0.659],
       [-0.739, -1.21 , -0.569, ..., -0.34 ,  0.827, -0.659],
       [-0.639, -1.075, -0.642, ...,  2.937, -1.209, -0.659]])

## Anomaly data (original `machineFailure == 1`), scaled with the training-set statistics

In [26]:
# Show the failure-class rows (label -1); pandas truncates the display to the first and last rows.
raw_dataset_oneHot_MFTrue

Unnamed: 0,airTemp,processTemp,rotationalSpeed,torque,toolwear,machineFailure,Type_is_H,Type_is_L,Type_is_M
50,298.9,309.1,2861,4.6,143,-1,0,1,0
69,298.9,309.0,1410,65.7,191,-1,0,1,0
77,298.8,308.9,1455,41.3,208,-1,0,1,0
160,298.4,308.2,1282,60.7,216,-1,0,1,0
161,298.3,308.1,1412,52.3,218,-1,0,1,0
...,...,...,...,...,...,...,...,...,...
9758,298.6,309.8,2271,16.2,218,-1,0,1,0
9764,298.5,309.5,1294,66.7,12,-1,0,1,0
9822,298.5,309.4,1360,60.9,187,-1,0,1,0
9830,298.3,309.3,1337,56.1,206,-1,0,1,0


In [27]:
# Separate the failure-class inputs from their target column.
# `drop` returns a new frame, so the class split itself is not modified.
ana_features = raw_dataset_oneHot_MFTrue.drop(columns='machineFailure')
ana_labels = raw_dataset_oneHot_MFTrue['machineFailure'].copy()

In [28]:
# (rows, feature columns) of the failure-class set after the label was removed.
ana_features.shape

(339, 8)

In [29]:
# Transform the failure rows with the scaler that was fitted on the normal
# training data; the scaler is not re-fitted here.
X_anomaly = scaler.transform(ana_features.values)
# Column-wise mean and standard deviation of the transformed failure rows.
print(X_anomaly.mean(axis=0))
print(X_anomaly.std(axis=0))
X_anomaly

[ 0.458  0.196 -0.253  1.106  0.58  -0.137  0.203 -0.126]
[1.038 0.916 2.315 1.731 1.15  0.79  0.939 0.936]


array([[-0.538, -0.605,  7.963, ..., -0.34 ,  0.827, -0.659],
       [-0.538, -0.672, -0.774, ..., -0.34 ,  0.827, -0.659],
       [-0.588, -0.739, -0.503, ..., -0.34 ,  0.827, -0.659],
       ...,
       [-0.739, -0.403, -1.075, ..., -0.34 ,  0.827, -0.659],
       [-0.839, -0.47 , -1.214, ..., -0.34 ,  0.827, -0.659],
       [-0.689, -1.21 , -1.069, ..., -0.34 ,  0.827, -0.659]])

# 6. Normal-Class Recognition: Model Creation and Prediction
## 6.1 Create OneClassSVM model

In [30]:
# Train the one-class model on the scaled normal-class training rows only.
# `OneClassSVM` comes from the notebook's import cell.
# nu=0.1 is an upper bound on the fraction of training errors and a lower
# bound on the fraction of support vectors.
# gamma='scale' derives the kernel coefficient from the number of features
# and the variance of the training data.
model = OneClassSVM(gamma='scale', nu=0.1)
model.fit(X_train_scaled)
# Score the held-out normal rows: predict() returns +1 for inliers and -1 for outliers.
yhat_test = model.predict(X_test_scaled)

## 6.2 Classification report on the normal test data

In [31]:
# Evaluate the model on the held-out NORMAL rows.
# Every true label in this set is 1, so the informative number is the recall
# of class 1 (the share of normal rows the model accepts). Precision carries
# no information here because the set contains no true outliers.
# `classification_report` comes from the notebook's import cell.
class_labels = [-1, 1]
target_names = [str(label) for label in class_labels]
y_test_labled = np.array(test_labels)
y_predict_test = yhat_test.astype('int64')
# `labels` is passed explicitly so the report always has one row per class.
# Without it, a model that predicts only one class leaves a single label for
# two target names and classification_report raises a ValueError.
# zero_division=0 reports 0.0 for the class that has no true samples instead
# of emitting an undefined-metric warning.
print(classification_report(
    y_test_labled,
    y_predict_test,
    labels=class_labels,
    target_names=target_names,
    zero_division=0,
))

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           1       1.00      0.90      0.95      1932

    accuracy                           0.90      1932
   macro avg       0.50      0.45      0.47      1932
weighted avg       1.00      0.90      0.95      1932



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## 6.3 Classification report on the outlier data

In [32]:
# Evaluate the model on the FAILURE rows.
# Every true label in this set is -1, so the informative number is the recall
# of class -1 (the share of failures the model flags as outliers). Precision
# carries no information here because the set contains no true inliers.
# `classification_report` comes from the notebook's import cell.
yhat_ana = model.predict(X_anomaly)
class_labels = [-1, 1]
target_names = [str(label) for label in class_labels]
y_ana_labled = np.array(ana_labels)
y_predict_ana = yhat_ana.astype('int64')
# `labels` is passed explicitly so the report always has one row per class.
# Without it, a model that predicts only one class leaves a single label for
# two target names and classification_report raises a ValueError.
# zero_division=0 reports 0.0 for the class that has no true samples instead
# of emitting an undefined-metric warning.
print(classification_report(
    y_ana_labled,
    y_predict_ana,
    labels=class_labels,
    target_names=target_names,
    zero_division=0,
))

              precision    recall  f1-score   support

          -1       1.00      0.59      0.74       339
           1       0.00      0.00      0.00         0

    accuracy                           0.59       339
   macro avg       0.50      0.29      0.37       339
weighted avg       1.00      0.59      0.74       339



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
