In [1]:
import pandas as pd
import numpy as np
import random

#split dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

#AUCROC,AUCPR,precision,recall,f1-score
from sklearn.metrics import roc_curve,auc,average_precision_score,precision_score,recall_score,f1_score

#gridsearch/randomsearch
from itertools import product
from tqdm import tqdm

#visualize results
import matplotlib.pyplot as plt
import time

#one-class svm
from sklearn.svm import OneClassSVM

In [2]:
#remove randomness
def set_seed(seed):
    """Seed both global random number generators used in this notebook.

    Parameters
    ----------
    seed : int
        Value passed to Python's ``random.seed`` and NumPy's ``np.random.seed``.
    """
    random.seed(seed)
    np.random.seed(seed)

In [3]:
def data_prepare(contam_ratio,seed,data_path = None,normal_only = False):
    """Load the credit-card data, split it, scale it and contaminate the training labels.

    Steps performed:
      1. Seed the global RNGs through ``set_seed(seed)``.
      2. Read the CSV and remap ``Class``: 1 becomes -1 and 0 becomes 1.
      3. Drop the ``Time`` and ``Class`` columns to build the feature matrix.
      4. Split into train/validation/test (50%/20%/30%) without shuffling,
         so the original row order is preserved.
      5. Fit a ``MinMaxScaler`` on the training features only and apply it
         to all three splits.
      6. Relabel a random ``contam_ratio`` fraction of the training rows
         labelled -1 as 1 (label contamination).

    Parameters
    ----------
    contam_ratio : float
        Fraction of the -1 labelled training rows whose label is flipped to 1.
        The number of flipped rows is ``int(contam_ratio * n_anomalies)``.
    seed : int
        Seed forwarded to ``set_seed``; it determines which rows are flipped.
    data_path : str, optional
        Location of the CSV file. When omitted, the original hard-coded
        location of ``creditcard.csv`` is used.
    normal_only : bool, optional
        When True, only the training rows still labelled 1 after contamination
        are returned in ``X_train`` / ``y_train``. The default (False) returns
        every training row, which is the behaviour of the original function.

    Returns
    -------
    tuple
        ``(X_train, y_train, X_val, y_val, X_test, y_test)``; the ``X_*``
        entries are the scaled feature arrays and the ``y_*`` entries are the
        label arrays with values in {1, -1}.
    """
    set_seed(seed)

    if data_path is None:
        data_path = "D:/Jiang/Research_Anomaly Detection/Important_Credit Card Fraud Detection (CCFD)/creditcardfraud/creditcard.csv"
    data = pd.read_csv(data_path)

    # Change the 0/1 labels to 1/-1. The order matters: 1 -> -1 must happen
    # before 0 -> 1, otherwise every row would end up as -1.
    data.loc[data['Class']==1,'Class'] = -1
    data.loc[data['Class']==0,'Class'] = 1

    X = data.drop(columns = ['Time','Class'])
    y = data["Class"].values

    # Split into training, validation and testing data (50%, 20%, 30%):
    # 30% is held out first, then 2/7 of the remaining 70% (= 20% overall).
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,shuffle = False)
    X_train,X_val,y_train,y_val = train_test_split(X_train,y_train,test_size = 2/7,shuffle = False)

    # Number of -1 labelled training rows before contamination.
    known_pos_entire = int(np.sum(y_train == -1))

    # Min-max scaling; the scaler is fitted on the training split only.
    scaler = MinMaxScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_val = scaler.transform(X_val)
    X_test = scaler.transform(X_test)

    # Pick the training rows whose -1 label gets flipped to 1.
    index_contam = np.flatnonzero(y_train == -1)
    index_contam = np.random.choice(index_contam,int(contam_ratio*len(index_contam)),replace = False)
    y_train[index_contam] = 1

    # Number of -1 labelled training rows after contamination.
    known_pos_sub = int(np.sum(y_train == -1))

    print(f'The left true(known) positive samples in the training set:{known_pos_sub}/{known_pos_entire}\n')

    if normal_only:
        # Keep only the rows labelled as normal (which may be contaminated).
        keep = y_train == 1
        X_train = X_train[keep]
        y_train = y_train[keep]

    return X_train,y_train,X_val,y_val,X_test,y_test

One-Class SVM

In [4]:
# Seeds the experiment is repeated with (one result column per seed)
seed_pool = [1]

# Fractions of the highest-scoring test samples flagged as anomalies for the F1 evaluation
anomaly_ratio_pool = [
    0.001,
    0.002,
    0.003,
]

# Fractions of the training anomalies whose label is flipped to normal
contam_ratio_pool = [
    1.0,
    0.98,
    0.8,
    0.5,
    0.0,
]

# Maximum number of hyper-parameter candidates tried by the random search
search_size = 10

In [5]:
# Candidate kernels for the one-class SVM
hyper_kernel = [
    'poly',
    'rbf',
    'sigmoid',
]

# Full grid of candidates; every entry is a tuple with one value per hyper-parameter
hyper_list_entire = [*product(hyper_kernel)]

def random_search(hyper_list_entire, search_size, seed):
    """Draw a reproducible random subset of hyper-parameter candidates.

    Parameters
    ----------
    hyper_list_entire : list
        All hyper-parameter candidates.
    search_size : int
        Maximum number of candidates to return.
    seed : int
        Seed forwarded to ``set_seed`` before sampling, so the same subset is
        drawn for the same seed.

    Returns
    -------
    list
        ``search_size`` candidates sampled without replacement when
        ``search_size`` is smaller than the number of candidates; otherwise a
        list holding every candidate in its original order.
    """
    # Nothing to sample when the budget covers the whole grid.
    if search_size >= len(hyper_list_entire):
        return list(hyper_list_entire)

    set_seed(seed)
    index = np.random.choice(np.arange(len(hyper_list_entire)), search_size, replace=False)

    # Return the sampled subset (the original returned the full list here,
    # which made the sampling above dead code).
    return [hyper_list_entire[i] for i in index]

In [6]:
# Experiment driver: for every contamination ratio and seed, select the kernel with
# the best validation AUCPR, evaluate that model on the test set (AUCPR plus F1 at
# each anomaly ratio) and write one result CSV per contamination ratio.
for contam_ratio in tqdm(contam_ratio_pool):
    # Rows: AUCPR followed by the F1-score at each anomaly ratio; columns: seeds.
    df_result = pd.DataFrame(data = None,index = ['AUCPR'] + anomaly_ratio_pool,columns = seed_pool)
    for seed in tqdm(seed_pool):
        # data_prepare seeds the RNGs itself, so its result depends only on
        # (contam_ratio, seed). Prepare the data once here instead of re-reading
        # and re-scaling the CSV for every candidate and again for testing.
        # NOTE(review): the (contaminated) training labels are discarded and the
        # model is fitted on the training features alone, so as long as
        # data_prepare returns every training row, contam_ratio cannot change the
        # fitted model. The recorded output is identical for all ratios; confirm
        # whether training on the normal-labelled rows only was intended.
        X_train,_,X_val,y_val,X_test,y_test = data_prepare(contam_ratio,seed)

        #############################################selecting the best hyper-parameters in validation set#############################################
        hyper_list = random_search(hyper_list_entire,search_size,seed)
        if len(hyper_list) == 0:
            raise ValueError('No hyper-parameter candidates to evaluate')

        metric_value_list = list()
        best_metric = None
        best_hyper_params = None
        best_model = None
        for hyper_params in hyper_list:
            print(f'Finding Optimal Hyper-parameters......Current Candidates: {hyper_params}')
            kernel = hyper_params[0]
            #model initialization
            model_ocsvm = OneClassSVM(kernel = kernel)
            #fitting
            model_ocsvm.fit(X_train)
            #evaluation: negate the decision function so that larger means more anomalous
            score = -model_ocsvm.decision_function(X_val)
            metric_value = average_precision_score(y_true = y_val,y_score = score,pos_label = -1)
            metric_value_list.append(metric_value)

            # ':.4' prints four significant digits, the same text the original
            # nested '{.4}' spec produced by accident.
            print(f'The metric value corresponded to the hyper-parameters is :{metric_value:.4}')
            print('******************************')
            print('\n')

            # Strict '>' keeps the first candidate on ties, matching
            # list.index(max(...)) in the original selection.
            if best_metric is None or metric_value > best_metric:
                best_metric = metric_value
                best_hyper_params = hyper_params
                best_model = model_ocsvm

        print(f'The best hyper-parameters are: {best_hyper_params}')
        print('\n')
        ###################################################################testing#########################################################################
        print('Testing Phrase......')
        kernel = best_hyper_params[0]

        # The winning model was fitted on exactly this training data, so it is
        # reused instead of being refitted from scratch.
        model_ocsvm = best_model
        #evaluation
        score = -model_ocsvm.decision_function(X_test)

        #store the result
        #AUCPR
        df_result.loc['AUCPR',seed] = average_precision_score(y_true = y_test,y_score = score,pos_label = -1)
        #F1
        # Scores in descending order; sorted once and shared by every anomaly ratio.
        score_desc = np.sort(score)[::-1]
        for anomaly_ratio in anomaly_ratio_pool:
            # Threshold at the score ranked int(anomaly_ratio * n); samples scoring
            # at or above it (ties included) are predicted as anomalies (-1).
            threshold = score_desc[int(anomaly_ratio*len(score))]

            y_pred = np.ones(len(score))
            y_pred[score >= threshold] = -1

            precision = precision_score(y_pred = y_pred, y_true = y_test, pos_label= -1)
            recall = recall_score(y_pred = y_pred, y_true = y_test, pos_label= -1)
            f1 = f1_score(y_pred = y_pred, y_true = y_test, pos_label= -1)

            print('\n')
            print(f'Precision: {round(precision*100,2)}')
            print(f'Recall: {round(recall*100,2)}')
            print(f'F1-score: {round(f1*100,2)}')
            print('\n')

            df_result.loc[anomaly_ratio,seed] = f1

    #mean & sd over the seeds, then everything expressed in percent with 2 decimals
    df_result['mean'] = np.mean(df_result.loc[:,seed_pool],axis = 1)
    df_result['std'] = np.std(df_result.loc[:,seed_pool],axis = 1)
    df_result = round(df_result.astype('float64')*100,2)

    filepath = 'D:/Jiang/Research_Anomaly Detection/Important_Credit Card Fraud Detection (CCFD)/Hyprid Semi-supervised/result/' +\
                'CCFD_OCSVM_' + str(contam_ratio) + '.csv'
    # NOTE(review): index = False drops the row labels ('AUCPR' and the anomaly
    # ratios) from the file; rows can only be identified by their order.
    df_result.to_csv(filepath,index = False)

  0%|                                                                                            | 0/5 [00:00<?, ?it/s]
  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Finding Optimal Hyper-parameters......Current Candidates: ('poly',)
The left true(known) positive samples in the training set:0/269





The metric value corresponded to the hyper-parameters is :0.7093
******************************


Finding Optimal Hyper-parameters......Current Candidates: ('rbf',)
The left true(known) positive samples in the training set:0/269





The metric value corresponded to the hyper-parameters is :0.5234
******************************


Finding Optimal Hyper-parameters......Current Candidates: ('sigmoid',)
The left true(known) positive samples in the training set:0/269





The metric value corresponded to the hyper-parameters is :0.7093
******************************


The best hyper-parameters are: ('sigmoid',)


Testing Phrase......
The left true(known) positive samples in the training set:0/269







Precision: 52.87
Recall: 42.59
F1-score: 47.18




Precision: 35.67
Recall: 56.48
F1-score: 43.73




Precision: 25.29
Recall: 60.19
F1-score: 35.62





 20%|███████████████▌                                                              | 1/5 [2:21:03<9:24:12, 8463.21s/it]
  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Finding Optimal Hyper-parameters......Current Candidates: ('poly',)
The left true(known) positive samples in the training set:6/269





The metric value corresponded to the hyper-parameters is :0.7093
******************************


Finding Optimal Hyper-parameters......Current Candidates: ('rbf',)
The left true(known) positive samples in the training set:6/269





The metric value corresponded to the hyper-parameters is :0.5234
******************************


Finding Optimal Hyper-parameters......Current Candidates: ('sigmoid',)
The left true(known) positive samples in the training set:6/269





The metric value corresponded to the hyper-parameters is :0.7093
******************************


The best hyper-parameters are: ('sigmoid',)


Testing Phrase......
The left true(known) positive samples in the training set:6/269







Precision: 52.87
Recall: 42.59
F1-score: 47.18




Precision: 35.67
Recall: 56.48
F1-score: 43.73




Precision: 25.29
Recall: 60.19
F1-score: 35.62





 40%|███████████████████████████████▏                                              | 2/5 [4:28:13<6:50:39, 8213.20s/it]
  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Finding Optimal Hyper-parameters......Current Candidates: ('poly',)
The left true(known) positive samples in the training set:54/269





The metric value corresponded to the hyper-parameters is :0.7093
******************************


Finding Optimal Hyper-parameters......Current Candidates: ('rbf',)
The left true(known) positive samples in the training set:54/269





The metric value corresponded to the hyper-parameters is :0.5234
******************************


Finding Optimal Hyper-parameters......Current Candidates: ('sigmoid',)
The left true(known) positive samples in the training set:54/269





The metric value corresponded to the hyper-parameters is :0.7093
******************************


The best hyper-parameters are: ('sigmoid',)


Testing Phrase......
The left true(known) positive samples in the training set:54/269







Precision: 52.87
Recall: 42.59
F1-score: 47.18




Precision: 35.67
Recall: 56.48
F1-score: 43.73




Precision: 25.29
Recall: 60.19
F1-score: 35.62





 60%|██████████████████████████████████████████████▊                               | 3/5 [6:05:51<4:10:13, 7506.64s/it]
  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Finding Optimal Hyper-parameters......Current Candidates: ('poly',)
The left true(known) positive samples in the training set:135/269





The metric value corresponded to the hyper-parameters is :0.7093
******************************


Finding Optimal Hyper-parameters......Current Candidates: ('rbf',)
The left true(known) positive samples in the training set:135/269





The metric value corresponded to the hyper-parameters is :0.5234
******************************


Finding Optimal Hyper-parameters......Current Candidates: ('sigmoid',)
The left true(known) positive samples in the training set:135/269





The metric value corresponded to the hyper-parameters is :0.7093
******************************


The best hyper-parameters are: ('sigmoid',)


Testing Phrase......
The left true(known) positive samples in the training set:135/269







Precision: 52.87
Recall: 42.59
F1-score: 47.18




Precision: 35.67
Recall: 56.48
F1-score: 43.73




Precision: 25.29
Recall: 60.19
F1-score: 35.62





 80%|██████████████████████████████████████████████████████████████▍               | 4/5 [7:39:06<1:55:33, 6933.40s/it]
  0%|                                                                                            | 0/1 [00:00<?, ?it/s]

Finding Optimal Hyper-parameters......Current Candidates: ('poly',)
The left true(known) positive samples in the training set:269/269





The metric value corresponded to the hyper-parameters is :0.7093
******************************


Finding Optimal Hyper-parameters......Current Candidates: ('rbf',)
The left true(known) positive samples in the training set:269/269





The metric value corresponded to the hyper-parameters is :0.5234
******************************


Finding Optimal Hyper-parameters......Current Candidates: ('sigmoid',)
The left true(known) positive samples in the training set:269/269





The metric value corresponded to the hyper-parameters is :0.7093
******************************


The best hyper-parameters are: ('sigmoid',)


Testing Phrase......
The left true(known) positive samples in the training set:269/269







Precision: 52.87
Recall: 42.59
F1-score: 47.18




Precision: 35.67
Recall: 56.48
F1-score: 43.73




Precision: 25.29
Recall: 60.19
F1-score: 35.62





100%|████████████████████████████████████████████████████████████████████████████████| 5/5 [9:12:23<00:00, 6532.26s/it]
