In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,accuracy_score

In [2]:
#Loading the dataset from the csv file using pandas
data =pd.read_csv('glass-0-1-2-3_vs_4-5-6.csv', header = None)

In [3]:
#exploring the dataset
print(data.columns)

Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')


In [4]:
data.columns=['v1','v2','v3','v4','v5','v6','v7','v8','v9','Class']

In [5]:
print(data.describe())
print(data.shape)

               v1            v2           v3          v4            v5  \
count  214.000000    214.000000   214.000000  214.000000    214.000000   
mean     1.518366    980.688157    52.989301    1.444895  12613.121321   
std      0.003037   3538.105952   422.981011    0.499361  27494.343383   
min      1.511150     10.730000     0.000000    0.290000     69.810000   
25%      1.516526     12.942788     2.113668    1.188800     72.394400   
50%      1.517688     13.346775     3.479750    1.358930     72.957200   
75%      1.519163     13.986838     3.609960    1.628570     73.388400   
max      1.533930  15126.000000  3592.000000    3.500000  73702.000000   

               v6            v7          v8          v9       Class  
count  214.000000    214.000000  214.000000  214.000000  214.000000  
mean    17.891039    313.356772    0.177563   15.279695    0.238318  
std    102.675204   1672.968946    0.497925   76.941003    0.427053  
min      0.000000      5.430000    0.000000    0.0000

In [6]:
columns = data.columns.tolist()

columns = [c for c in columns if c not in ["Class"]]
target = "Class"
X = data[columns]
Y = data[target]
Fraud = data[data["Class"]==1]
valid = data[data["Class"]==0]
outlier_fraction=len(Fraud)/float(len(valid))

In [None]:
clf = "IsolationForest":IsolationForest(max_samples=len(X), contamination = outlier_fraction,random_state=1)

In [23]:
classifiers = {
    "IsolationForest":IsolationForest(max_samples=len(X),
                                     contamination = outlier_fraction,random_state=1),
    "Local Outlier Factor":LocalOutlierFactor(
    n_neighbors=20,
    contamination = outlier_fraction),
    "One Class SVM":OneClassSVM(gamma='auto',nu=0.5)
}

In [24]:
n_outliers=len(Fraud)

for i,(model_name,model) in enumerate(classifiers.items()):
    
    if model_name=="Local Outlier Factor":
        y_pred = model.fit_predict(X)
        scores_pred = model.negative_outlier_factor_
    else:
        model.fit(X)
        scores_pred=model.decision_function(X)
        y_pred=model.predict(X)
#0 for valid and 1 for Fraud    
    y_pred[y_pred==1]=0
    y_pred[y_pred==-1]=1
    
    n_errors=(y_pred!=Y).sum()
    
    print("{}: {}".format(model_name,n_errors))
    print(accuracy_score(Y,y_pred))
    print(classification_report(Y,y_pred))

IsolationForest: 56
0.7383177570093458
              precision    recall  f1-score   support

           0       0.86      0.78      0.82       163
           1       0.46      0.61      0.53        51

    accuracy                           0.74       214
   macro avg       0.66      0.69      0.67       214
weighted avg       0.77      0.74      0.75       214

Local Outlier Factor: 72
0.6635514018691588
              precision    recall  f1-score   support

           0       0.81      0.73      0.77       163
           1       0.34      0.45      0.39        51

    accuracy                           0.66       214
   macro avg       0.58      0.59      0.58       214
weighted avg       0.70      0.66      0.68       214

One Class SVM: 88
0.5887850467289719
              precision    recall  f1-score   support

           0       0.85      0.56      0.67       163
           1       0.33      0.69      0.44        51

    accuracy                           0.59       214
   macro

In [25]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2)

rf = RandomForestClassifier(n_estimators=100,max_depth=10,random_state=1)
rf.fit(x_train,y_train)
y_predict = rf.predict(x_test)


print(accuracy_score(y_test,y_predict))
print(classification_report(y_test,y_predict))

0.9534883720930233
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        31
           1       0.92      0.92      0.92        12

    accuracy                           0.95        43
   macro avg       0.94      0.94      0.94        43
weighted avg       0.95      0.95      0.95        43



In [28]:
from numpy import mean,std
from sklearn.svm import OneClassSVM
from sklearn.model_selection import cross_val_score

clf = OneClassSVM(gamma='auto',nu=0.3)
scores = cross_val_score(clf,x_test,y_test, scoring = 'accuracy', cv=5)
print('Accuracy: %.4f (%.4f)' %(mean(scores), std(scores)))

Accuracy: 0.0444 (0.0889)
