In [26]:
'''importing the required libraries'''
import pandas as pd
import copy
import numpy as np
from numpy import argmax
from matplotlib import pyplot

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
import tensorflow as tf

from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

## Prepare original dataset

In [27]:
# from collections import Counter
# counter = Counter(y_test3.index)
# more_than_one = [id for id, count in counter.items() if count >= 2]
# print(len(more_than_one))

In [28]:
'''Importing the dataset and checking its properties '''
ori_data =  pd.read_csv("data/creditcard.csv")
print(ori_data.Class.value_counts())

0    284315
1       492
Name: Class, dtype: int64


In [29]:
'''removing the unwanted features'''
ori_data.drop('Time', axis=1, inplace=True)

In [30]:
ori_data.drop_duplicates(inplace=True)

In [31]:
ori_data.head()

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [32]:
label_cols = [ i for i in ori_data.columns if 'Class' in i ]
data_cols = [ i for i in ori_data.columns if i not in label_cols ]

print('Dataset columns: {}'.format(", ".join(data_cols)))

Dataset columns: V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15, V16, V17, V18, V19, V20, V21, V22, V23, V24, V25, V26, V27, V28, Amount


In [33]:
x_temp = ori_data.groupby(data_cols).size().reset_index(name='count')
x_temp['count'].value_counts()

1    275663
Name: count, dtype: int64

## Prepare synthetic data

In [34]:
synth_data = [
            ('GaussianCopula140000', 'data/1GaussianCopula140000.csv'),
                ('Synthpop140000', 'data/mysyn.csv'),
          ('CTGAN140000', 'data/1CTGAN140000.csv'),
          ('CopulaGAN140000', 'data/1CopulaGAN140000.csv'),
          ('TVAE140000', 'data/1TVAE140000.csv'),
           ]


## Models

In [35]:
#scaling
def standard_scaler(X_train, X_test):
    sc = StandardScaler()
    x_train_scaled = sc.fit_transform(X_train)
    x_test_scaled = sc.transform(X_test)
    return x_train_scaled, x_test_scaled

In [36]:
def evaluate(y_prob, y_label):
    y_pred = np.where(y_prob>0.5, 1,0)
    acc = accuracy_score(y_label,y_pred)*100
    print('Accuracy:',round(acc,2))
    
    ''' Generating the Confusion matrix and Classification report'''
    print('Confusion matrix', '\n', confusion_matrix(y_label, y_pred), '\n')
    print('Classification report', '\n', classification_report(y_label, y_pred), '\n')
    
    f1s = f1_score(y_label, y_pred, average='macro')
    f1s = round(f1s, 2)
    
    print(f"F1_Score: {f1s}")
    
    # calculate pr-curve
    precision, recall, thresholds = precision_recall_curve(y_label, y_prob)
    
#     # # plot the roc curve for the model
#     no_skill = len(y_label[y_label==1]) / len(y_label)
#     pyplot.plot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill')
#     pyplot.plot(recall, precision, marker='.', label='PR Curve')
#     # axis labels
#     pyplot.xlabel('Recall')
#     pyplot.ylabel('Precision')
#     pyplot.legend()
#     # show the plot
#     pyplot.show()
    
    # convert to f score
    fscore = (2 * precision * recall) / (precision + recall)
    # locate the index of the largest f score
    ix = argmax(fscore)
    print('Best Threshold=%f, F-Score=%.3f' % (thresholds[ix], fscore[ix]))

    pr_auc = auc(recall, precision)
    print('PR AUC= %.3f'% pr_auc)

    y_pred = np.where(y_prob>thresholds[ix], 1,0)
    print('Confusion matrix', '\n',confusion_matrix(y_label, y_pred))
    print('Classification report', '\n', classification_report(y_label, y_pred), '\n')
    return f1s

### Fully Connected no bias - class weights

In [37]:
WEIGHTS_PATH = 'initialFLC/initial_weights'
EPOCH = 10
BATCH_SIZE = 16
LOOP = 10

In [38]:
'''Fully Connected'''
''' Initializing the model '''

fcl = Sequential()

''' Adding the input layer and the first hidden layer '''

fcl.add(Dense(input_dim=29, units = 6, kernel_initializer='random_uniform', activation = 'relu'))

''' Adding the second hidden layer '''

fcl.add(Dense(units = 6, kernel_initializer='random_uniform', activation = 'relu'))

''' Adding the output layer '''

fcl.add(Dense(units = 1, kernel_initializer='random_uniform', activation = 'sigmoid'
               #,bias_initializer=output_bias
               ))
fcl.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
fcl.save_weights(WEIGHTS_PATH)

In [39]:
def train(batch_size, epoch, X, y):
    #load initial weights
    fcl.load_weights(WEIGHTS_PATH)
    
    # train
    fcl.fit(X, y, 
            #class_weight=class_weight,
            batch_size = batch_size, epochs = epoch
            #, verbose=0
            )

    #model.save_weights('model/model_weights{}'.format(name))

### Random Forest Classifier

In [40]:
RFC_PARAMS = dict(
    n_estimators=300, max_depth=8, max_leaf_nodes=2**6, class_weight="balanced_subsample"
)

## Train

In [41]:
#input: dataset
#return: fcl and rfc f1 score 
def train_evaluate(X_train, y_train, X_test, y_test):
    '''Fully Connected Layer'''
    print("Fully connected layer \n")
    train(BATCH_SIZE, EPOCH, X_train, y_train)
    
    fcl_y_prob = fcl.predict(X_test)
    fcl_f1 = evaluate(fcl_y_prob, y_test)
    
    print("Random Forest Classifier \n")
    rfc = RandomForestClassifier(**RFC_PARAMS)
    rfc.fit(X_train, y_train)
    
    rfc_y_prob = rfc.predict_proba(X_test)[:,1]
    rfc_f1 = evaluate(rfc_y_prob, y_test)
    
    return fcl_f1, rfc_f1

## 1.Train Real - Test Real
    Train 80% của 20k nhãn 0, 80% nhãn 1 của 492
    Test 20% còn lại.

In [None]:
zero_label = 20000
sampled_data = pd.concat([ori_data[ori_data.Class == 1], ori_data[ori_data.Class==0].sample(zero_label)], axis=0)

sss = StratifiedShuffleSplit(test_size=0.2, random_state=42)
train_idx, test_idx = next(sss.split(sampled_data, sampled_data["Class"]))
X_train1, X_test1 = sampled_data.iloc[train_idx][data_cols], sampled_data.iloc[test_idx][data_cols],
y_train1, y_test1 =sampled_data.iloc[train_idx]["Class"], sampled_data.iloc[test_idx]["Class"]

In [None]:
y_test1.value_counts()

In [None]:
X_train1, X_test1 = standard_scaler(X_train1, X_test1)

In [None]:
rfc_f1_list = []
fcl_f1_list = []
for i in range (0,LOOP):
    fcl_f1, rfc_f1 = train_evaluate(X_train1, y_train1, X_test1, y_test1)
    fcl_f1_list.append(fcl_f1)
    rfc_f1_list.append(rfc_f1)
    

In [None]:
print(f'> F1 Score FCL: {np.mean(fcl_f1_list)} (Độ lệch +- {np.std(fcl_f1_list)})')
print(f'> F1 Score RFC: {np.mean(rfc_f1_list)} (Độ lệch +- {np.std(rfc_f1_list)})')

## 2.Train Synthetic - Test Real
    Train 112k
    Test 9840 - 492

In [42]:
train_size0 = 112000
test_size1 = 492

#get test set with 492 real 1 and 492*20 0
sample_data2 = pd.concat([ori_data[ori_data.Class == 1], ori_data[ori_data.Class==0].sample(test_size1*20)], axis=0)
#sample_data2 = pd.concat([ori_data[ori_data.Class == 1], ori_data[ori_data.Class==0].sample(160000)], axis=0)
X_test2 = sample_data2[data_cols]
y_test2 = sample_data2["Class"]
y_test2 = y_test2.to_numpy()

#get the rest of the dataset (drop test)
sample_data_left2 = ori_data.drop(list(sample_data2.index))

In [43]:
sample_data_left2

Unnamed: 0,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
284802,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,4.356170,...,0.213454,0.111864,1.014480,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,-0.732789,-0.055080,2.035030,-0.738589,0.868229,1.058415,0.024330,0.294869,0.584800,-0.975926,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,1.919565,-0.301254,-3.249640,-0.557828,2.630515,3.031260,-0.296827,0.708417,0.432454,-0.484782,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,-0.240440,0.530483,0.702510,0.689799,-0.377961,0.623708,-0.686180,0.679145,0.392087,-0.399126,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.00,0


In [44]:
train_size1 = copy.deepcopy(train_size0)
for name, data_path in synth_data:
    print("------------------------------------------", name)
    rfc_f1_list = []
    fcl_f1_list = []
    
    #add synthetic data into train dataset
    gen_data = pd.read_csv(data_path)[data_cols]
    gen_data["Class"] = np.ones(gen_data.shape[0], dtype = int)
    data2 = pd.concat([gen_data.sample(train_size1), 
                       sample_data_left2.sample(train_size0)]).sample(train_size1+train_size0)    
    data2 = data2.reset_index(drop = True)
    
    check_dup = data2.append(sample_data2)
    print(check_dup.duplicated().any())
    
    X_train2 = data2[data_cols]
    y_train2 = data2["Class"]
    
    train_data2, test_data2 = standard_scaler(X_train2, X_test2)
    
    y_train2 = y_train2.to_numpy()
    
    for i in range(0,10):
        fcl_f1, rfc_f1 = train_evaluate(train_data2, y_train2, test_data2, y_test2)
        fcl_f1_list.append(fcl_f1)
        rfc_f1_list.append(rfc_f1)
    print(f'> F1 Score FCL: {np.mean(fcl_f1_list)} (Độ lệch +- {np.std(fcl_f1_list)})')
    print(f'> F1 Score RFC: {np.mean(rfc_f1_list)} (Độ lệch +- {np.std(rfc_f1_list)})')



------------------------------------------ Synthpop140000
False
Fully connected layer 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 98.76
Confusion matrix 
 [[9776   64]
 [  64  409]] 

Classification report 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      9840
           1       0.86      0.86      0.86       473

    accuracy                           0.99     10313
   macro avg       0.93      0.93      0.93     10313
weighted avg       0.99      0.99      0.99     10313
 

F1_Score: 0.93
Best Threshold=0.732988, F-Score=0.898
PR AUC= 0.919
Confusion matrix 
 [[9818   22]
 [  71  402]]
Classification report 
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      9840
           1       0.95      0.85      0.90       473

    accuracy                           0.99     10313
   macro avg       0.97     

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 98.57
Confusion matrix 
 [[9743   97]
 [  50  423]] 

Classification report 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      9840
           1       0.81      0.89      0.85       473

    accuracy                           0.99     10313
   macro avg       0.90      0.94      0.92     10313
weighted avg       0.99      0.99      0.99     10313
 

F1_Score: 0.92
Best Threshold=0.772006, F-Score=0.885
PR AUC= 0.921
Confusion matrix 
 [[9800   40]
 [  67  406]]
Classification report 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      9840
           1       0.91      0.86      0.88       473

    accuracy                           0.99     10313
   macro avg       0.95      0.93      0.94     10313
weighted avg       0.99      0.99      0.99     10313
 

Rando

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 98.58
Confusion matrix 
 [[9767   73]
 [  73  400]] 

Classification report 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      9840
           1       0.85      0.85      0.85       473

    accuracy                           0.99     10313
   macro avg       0.92      0.92      0.92     10313
weighted avg       0.99      0.99      0.99     10313
 

F1_Score: 0.92
Best Threshold=0.953364, F-Score=0.880
PR AUC= 0.900
Confusion matrix 
 [[9829   11]
 [  94  379]]
Classification report 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      9840
           1       0.97      0.80      0.88       473

    accuracy                           0.99     10313
   macro avg       0.98      0.90      0.94     10313
weighted avg       0.99      0.99      0.99     10313
 

Rando

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 98.59
Confusion matrix 
 [[9761   79]
 [  66  407]] 

Classification report 
               precision    recall  f1-score   support

           0       0.99      0.99      0.99      9840
           1       0.84      0.86      0.85       473

    accuracy                           0.99     10313
   macro avg       0.92      0.93      0.92     10313
weighted avg       0.99      0.99      0.99     10313
 

F1_Score: 0.92
Best Threshold=0.915198, F-Score=0.885
PR AUC= 0.909
Confusion matrix 
 [[9824   16]
 [  86  387]]
Classification report 
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      9840
           1       0.96      0.82      0.88       473

    accuracy                           0.99     10313
   macro avg       0.98      0.91      0.94     10313
weighted avg       0.99      0.99      0.99     10313
 

Rando

## 3.Train Synthetic - Test Synthetic

In [45]:
zero_ori = ori_data[ori_data.Class==0]

In [46]:
train_size0 = 112000
train_size1 = 112000
test_size1 = 492
test_size0 = 492*20

for name, data_path in synth_data:
    print("------------------------------------------", name)
    rfc_f1_list = []
    fcl_f1_list = []
    
    #add synthetic data into ori zero
    gen_data = pd.read_csv(data_path)[data_cols]
    gen_data["Class"] = np.ones(gen_data.shape[0], dtype = int)
    
    print(gen_data)
    
    zero_data = zero_ori.sample(train_size0 + test_size0)
    one_data = gen_data.sample(train_size1+test_size1)
        
    train_data3 = pd.concat([zero_data.iloc[:train_size0], one_data.iloc[:train_size1]], axis = 0)
    train_data3 = train_data3.sample(frac=1).reset_index(drop=True)
    test_data3 = pd.concat([zero_data.iloc[train_size0:train_size0+test_size0], one_data.iloc[train_size1:train_size1+test_size1]], axis = 0)
    test_data3= test_data3.sample(frac=1).reset_index(drop=True)
    
    X_train3, y_train3 = train_data3[data_cols], train_data3["Class"]
    X_test3, y_test3 = test_data3[data_cols], test_data3["Class"]
    X_train3, X_test3 = standard_scaler(X_train3, X_test3)
        
    for i in range(0,LOOP):
        print('----------', LOOP)
        fcl_f1, rfc_f1 = train_evaluate(X_train3, y_train3, X_test3, y_test3)
        fcl_f1_list.append(fcl_f1)
        rfc_f1_list.append(rfc_f1)
        
    print("------------------------------------------", name)    
    print(f'> F1 Score FCL: {np.mean(fcl_f1_list)} (Độ lệch +- {np.std(fcl_f1_list)})')
    print(f'> F1 Score RFC: {np.mean(rfc_f1_list)} (Độ lệch +- {np.std(rfc_f1_list)})')

------------------------------------------ Synthpop140000
               V1         V2         V3        V4         V5        V6  \
0       -1.396204   2.618584  -2.987193  2.178538  -0.281214 -0.501474   
1       -6.159607   1.468713  -3.114372  5.153525  -2.470388 -2.735569   
2      -28.524268  15.365804 -28.923756  6.370895 -22.105532 -4.572498   
3       -3.291125   0.099249  -4.759158  3.728439  -0.161437 -1.904959   
4        0.232512   2.994499  -5.052968  5.835566   1.591168 -1.811287   
...           ...        ...        ...       ...        ...       ...   
139995 -21.885434   9.843153 -24.098872  6.155789 -16.905611 -4.902997   
139996  -5.685013   5.776516  -9.775528  6.689951  -4.409844 -0.133493   
139997  -5.622469   0.577610  -6.934388  2.400031   2.497367 -1.260375   
139998  -2.326922   0.520539  -1.994122  0.605761  -1.185444 -0.131207   
139999  -2.866364   4.045601  -5.957706  5.340242  -2.080938 -2.420168   

               V7         V8        V9       V10  ...

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 99.3
Confusion matrix 
 [[9774   66]
 [   6  486]] 

Classification report 
               precision    recall  f1-score   support

           0       1.00      0.99      1.00      9840
           1       0.88      0.99      0.93       492

    accuracy                           0.99     10332
   macro avg       0.94      0.99      0.96     10332
weighted avg       0.99      0.99      0.99     10332
 

F1_Score: 0.96
Best Threshold=0.927184, F-Score=0.974
PR AUC= 0.989
Confusion matrix 
 [[9833    7]
 [  19  473]]
Classification report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      9840
           1       0.99      0.96      0.97       492

    accuracy                           1.00     10332
   macro avg       0.99      0.98      0.99     10332
weighted avg       1.00      1.00      1.00     10332
 

Random

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 98.97
Confusion matrix 
 [[9741   99]
 [   7  485]] 

Classification report 
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      9840
           1       0.83      0.99      0.90       492

    accuracy                           0.99     10332
   macro avg       0.91      0.99      0.95     10332
weighted avg       0.99      0.99      0.99     10332
 

F1_Score: 0.95
Best Threshold=0.946007, F-Score=0.975
PR AUC= 0.988
Confusion matrix 
 [[9834    6]
 [  19  473]]
Classification report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      9840
           1       0.99      0.96      0.97       492

    accuracy                           1.00     10332
   macro avg       0.99      0.98      0.99     10332
weighted avg       1.00      1.00      1.00     10332
 

Rando

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 99.05
Confusion matrix 
 [[9751   89]
 [   9  483]] 

Classification report 
               precision    recall  f1-score   support

           0       1.00      0.99      0.99      9840
           1       0.84      0.98      0.91       492

    accuracy                           0.99     10332
   macro avg       0.92      0.99      0.95     10332
weighted avg       0.99      0.99      0.99     10332
 

F1_Score: 0.95
Best Threshold=0.940711, F-Score=0.970
PR AUC= 0.987
Confusion matrix 
 [[9828   12]
 [  18  474]]
Classification report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00      9840
           1       0.98      0.96      0.97       492

    accuracy                           1.00     10332
   macro avg       0.99      0.98      0.98     10332
weighted avg       1.00      1.00      1.00     10332
 

Rando

## 4. Dataset = ori + gen
    Train 112k 0 112k 1 
    Test 492*2 0 492 1

In [47]:
train_size0 = 112000
train_size1 = 112000
test_size1 = 1000
test_size0 = test_size1*20

for name, data_path in synth_data:
    print("------------------------------------------", name)
    rfc_f1_list = []
    fcl_f1_list = []
    
    #combine 2 dataset
    gen_data = pd.read_csv(data_path)[data_cols]
    gen_data["Class"] = np.ones(gen_data.shape[0], dtype = int)
    data4 = pd.concat([gen_data, ori_data], axis=0) 
    
    #split train test
    zero_data = data4[data4.Class == 0].sample(train_size0 + test_size0)
    one_data = data4[data4.Class == 1].sample(train_size1+test_size1)
        
    train_data4 = pd.concat([zero_data.iloc[:train_size0], one_data.iloc[:train_size1]], axis = 0)
    train_data4 = train_data4.sample(frac=1).reset_index(drop=True)
    test_data4 = pd.concat([zero_data.iloc[train_size0:train_size0+test_size0], one_data.iloc[train_size1:train_size1+test_size1]], axis = 0)
    test_data4 = test_data4.sample(frac=1).reset_index(drop=True)
    
    X_train4, y_train4 = train_data4[data_cols], train_data4["Class"]
    X_test4, y_test4 = test_data4[data_cols], test_data4["Class"]
    X_train4, X_test4 = standard_scaler(X_train4, X_test4)
        
    for i in range(0,LOOP):
        print("Train inter", i)
        fcl_f1, rfc_f1 = train_evaluate(X_train4, y_train4, X_test4, y_test4)
        fcl_f1_list.append(fcl_f1)
        rfc_f1_list.append(rfc_f1)
        
    print(f'> F1 Score FCL: {np.mean(fcl_f1_list)} (Độ lệch +- {np.std(fcl_f1_list)})')
    print(f'> F1 Score RFC: {np.mean(rfc_f1_list)} (Độ lệch +- {np.std(rfc_f1_list)})')

------------------------------------------ Synthpop140000
Train inter 10
Fully connected layer 

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 99.57
Confusion matrix 
 [[19917    83]
 [    8   992]] 

Classification report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     20000
           1       0.92      0.99      0.96      1000

    accuracy                           1.00     21000
   macro avg       0.96      0.99      0.98     21000
weighted avg       1.00      1.00      1.00     21000
 

F1_Score: 0.98
Best Threshold=0.916062, F-Score=0.981
PR AUC= 0.993
Confusion matrix 
 [[19985    15]
 [   23   977]]
Classification report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     20000
           1       0.98      0.98      0.98      1000

    accuracy                           1.00     21000
   macro av

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 98.86
Confusion matrix 
 [[19778   222]
 [   17   983]] 

Classification report 
               precision    recall  f1-score   support

           0       1.00      0.99      0.99     20000
           1       0.82      0.98      0.89      1000

    accuracy                           0.99     21000
   macro avg       0.91      0.99      0.94     21000
weighted avg       0.99      0.99      0.99     21000
 

F1_Score: 0.94
Best Threshold=0.948167, F-Score=0.972
PR AUC= 0.985
Confusion matrix 
 [[19983    17]
 [   40   960]]
Classification report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     20000
           1       0.98      0.96      0.97      1000

    accuracy                           1.00     21000
   macro avg       0.99      0.98      0.98     21000
weighted avg       1.00      1.00      1.00     21000


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 99.27
Confusion matrix 
 [[19860   140]
 [   13   987]] 

Classification report 
               precision    recall  f1-score   support

           0       1.00      0.99      1.00     20000
           1       0.88      0.99      0.93      1000

    accuracy                           0.99     21000
   macro avg       0.94      0.99      0.96     21000
weighted avg       0.99      0.99      0.99     21000
 

F1_Score: 0.96
Best Threshold=0.942904, F-Score=0.976
PR AUC= 0.989
Confusion matrix 
 [[19989    11]
 [   37   963]]
Classification report 
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     20000
           1       0.99      0.96      0.98      1000

    accuracy                           1.00     21000
   macro avg       0.99      0.98      0.99     21000
weighted avg       1.00      1.00      1.00     21000


KeyboardInterrupt: 

In [49]:
print(f'> F1 Score FCL: {np.mean(fcl_f1_list)} (Độ lệch +- {np.std(fcl_f1_list)})')
print(f'> F1 Score RFC: {np.mean(rfc_f1_list)} (Độ lệch +- {np.std(rfc_f1_list)})')

> F1 Score FCL: 0.9614285714285715 (Độ lệch +- 0.012453996981544792)
> F1 Score RFC: 0.96 (Độ lệch +- 0.0)
