In [None]:
# notebook for doing sklearn oneclasssvm   also some thyroid class preprocessing to see if can boost SVM performance on it

In [1]:
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:

import pandas as pd
import numpy as np
from sklearn.metrics import auc,roc_curve, precision_recall_curve, average_precision_score, roc_auc_score
from sklearn import preprocessing
import matplotlib.pyplot as plt




def dataLoading(path):
    # loading data
    df = pd.read_csv(path) 
    
    labels = df['class']
    
    x_df = df.drop(['class'], axis=1)
    
    x = x_df.values
    print("Data shape: (%d, %d)" % x.shape)
    
    return x, labels;


def aucPerformance(mse, labels):
    roc_auc = roc_auc_score(labels, mse)
    ap = average_precision_score(labels, mse)
    print("AUC-ROC: %.4f, AUC-PR: %.4f" % (roc_auc, ap))
    return roc_auc, ap;




In [3]:
import numpy as np
import argparse
import numpy as np
import matplotlib.pyplot as plt
import sys
from scipy.sparse import vstack, csc_matrix
from sklearn.model_selection import train_test_split
from sklearn import svm
import time

def oneClassSVM(filename, nu, runs ):
    #gamma=.1
    #clf=svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma='auto', max_iter=-1)
    clf=svm.OneClassSVM(kernel="rbf")
    runs = 10
    rauc = np.zeros(runs)
    ap = np.zeros(runs)
    x, labels = dataLoading('gdrive/MyDrive/CSCI5523GroupAnomalyProject/data/' + filename + ".csv")
    outlier_indices = np.where(labels == 1)[0]
    outliers = x[outlier_indices]
    n_outliers_org = outliers.shape[0]
    train_time = 0
    test_time = 0
    for i in np.arange(runs):
        #random splitting each time with random_state=None
        x_train, x_test, y_train, y_test = train_test_split(x, labels, test_size=0.2, random_state=None,
                                                                stratify=labels)
        y_train = np.array(y_train)
        y_test = np.array(y_test)

        outlier_indices = np.where(y_train == 1)[0]
        inlier_indices = np.where(y_train == 0)[0]
        n_outliers = len(outlier_indices)
        print("Original training size: %d, No. outliers: %d" % (x_train.shape[0], n_outliers))
        start_time = time.time()
        train_time += time.time() - start_time

        clf.fit(x_train)

        scores=clf.predict(x_test)
        test_time += time.time() - start_time
        rauc[i], ap[i] = aucPerformance(scores, y_test)

    mean_auc = np.mean(rauc)
    std_auc = np.std(rauc)
    mean_aucpr = np.mean(ap)
    std_aucpr = np.std(ap)
    train_time = train_time / runs
    test_time = test_time / runs
    print("average AUC-ROC: %.4f, average AUC-PR: %.4f" % (mean_auc, mean_aucpr))
    print("average runtime: %.4f seconds" % (train_time + test_time))

In [4]:
np.random.seed(42)

filename='annthyroid_21feat_normalised'  #average AUC-ROC: 0.5105, average AUC-PR: 0.0760
#filename='thyroid_feat_fully_binarized'    #average AUC-ROC: 0.5177, average AUC-PR: 0.0770
#filename='bank-additional-full_normalised'
#filename='celeba_baldvsnonbald_normalised'
#filename='census-income-full-mixed-binarized'
#filename='creditcardfraud_normalised'
#filename='KDD2014_donors_10feat_nomissing_normalised'
#filename='UNSW_NB15_traintest_backdoor'
#filename='creditcardfraud_normalised_mn_scaled_2.0'
#filename='thyroid_reduced_5'
outliers_fraction=.0001
runs=10
nu=outliers_fraction
MAX_INT = np.iinfo(np.int32).max
oneClassSVM(filename, nu, runs )

Data shape: (7200, 21)
Original training size: 5760, No. outliers: 427
AUC-ROC: 0.5197, AUC-PR: 0.0772
Original training size: 5760, No. outliers: 427
AUC-ROC: 0.4906, AUC-PR: 0.0731
Original training size: 5760, No. outliers: 427
AUC-ROC: 0.5104, AUC-PR: 0.0758
Original training size: 5760, No. outliers: 427
AUC-ROC: 0.5262, AUC-PR: 0.0783
Original training size: 5760, No. outliers: 427
AUC-ROC: 0.4919, AUC-PR: 0.0732
Original training size: 5760, No. outliers: 427
AUC-ROC: 0.5201, AUC-PR: 0.0773
Original training size: 5760, No. outliers: 427
AUC-ROC: 0.5304, AUC-PR: 0.0790
Original training size: 5760, No. outliers: 427
AUC-ROC: 0.4991, AUC-PR: 0.0742
Original training size: 5760, No. outliers: 427
AUC-ROC: 0.4850, AUC-PR: 0.0724
Original training size: 5760, No. outliers: 427
AUC-ROC: 0.5321, AUC-PR: 0.0792
average AUC-ROC: 0.5105, average AUC-PR: 0.0760
average runtime: 1.9748 seconds


In [None]:
#feature reducing thyroid
df= pd.read_csv('gdrive/MyDrive/data/annthyroid_21feat_normalised.csv')
thyroid_np=df.to_numpy()
thyroid_reduced=thyroid_np[:, 16:22]
thyroid=pd.DataFrame(thyroid_reduced)
thyroid.columns=["TSH", "T3", "TT4", "T4U", "FTI", "class"]
#thyroid.describe()
thyroid.to_csv('gdrive/MyDrive/data/thyroid_reduced_5.csv', index=False)

In [None]:
#feature reducing thyroid
df= pd.read_csv('gdrive/MyDrive/data/annthyroid_21feat_normalised.csv')
df.head()

Unnamed: 0,Dim_0,Dim_1=0,Dim_2=0,Dim_3=0,Dim_4=0,Dim_5=0,Dim_6=0,Dim_7=0,Dim_8=0,Dim_9=0,Dim_10=0,Dim_11=0,Dim_12=0,Dim_13=0,Dim_14=0,Dim_15=0,Dim_16,Dim_17,Dim_18,Dim_19,Dim_20,class
0,0.75,1,0,1,1,1,1,1,0,1,1,1,1,1,1,1,0.001132,0.08078,0.197324,0.300926,0.225,0
1,0.239583,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.000472,0.164345,0.235786,0.537037,0.165625,0
2,0.479167,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.003585,0.130919,0.167224,0.527778,0.11875,0
3,0.65625,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.001698,0.091922,0.125418,0.337963,0.129688,0
4,0.229167,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0.000472,0.142061,0.229097,0.337963,0.235938,0


In [None]:

df= pd.read_csv('gdrive/MyDrive/data/annthyroid_21feat_normalised.csv')
for i in range(1,16):
    df=pd.concat([df,pd.get_dummies(df['Dim_'+str(i)+'=0'], prefix='Dim_'+str(i))],axis=1)
    df.drop(['Dim_'+str(i)+'=0'],axis=1, inplace=True)
    
    
    
df.to_csv('gdrive/MyDrive/data/thyroid_feat_fully_binarized.csv', index=False)
                
df.head()


Unnamed: 0,Dim_0,Dim_16,Dim_17,Dim_18,Dim_19,Dim_20,class,Dim_1_0,Dim_1_1,Dim_2_0,Dim_2_1,Dim_3_0,Dim_3_1,Dim_4_0,Dim_4_1,Dim_5_0,Dim_5_1,Dim_6_0,Dim_6_1,Dim_7_0,Dim_7_1,Dim_8_0,Dim_8_1,Dim_9_0,Dim_9_1,Dim_10_0,Dim_10_1,Dim_11_0,Dim_11_1,Dim_12_0,Dim_12_1,Dim_13_0,Dim_13_1,Dim_14_0,Dim_14_1,Dim_15_0,Dim_15_1
0,0.75,0.001132,0.08078,0.197324,0.300926,0.225,0,0,1,1,0,0,1,0,1,0,1,0,1,0,1,1,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1
1,0.239583,0.000472,0.164345,0.235786,0.537037,0.165625,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1
2,0.479167,0.003585,0.130919,0.167224,0.527778,0.11875,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1
3,0.65625,0.001698,0.091922,0.125418,0.337963,0.129688,0,1,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1
4,0.229167,0.000472,0.142061,0.229097,0.337963,0.235938,0,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1
