In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings(action='ignore')

from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [3]:
SEED = 42
def seed_everything(seed):
    random.seed(seed)
    np.random.seed(seed)

seed_everything(SEED) # Seed 고정

# 데이터 불러오기

In [4]:
os.chdir('/content/drive/Othercomputers/내 MacBook Air/MLDL/project/open')

In [5]:
train = pd.read_csv('./train.csv')
val = pd.read_csv('./val.csv')
test = pd.read_csv('./test.csv')
sample_submission = pd.read_csv('./sample_submission.csv')

In [6]:
train_total = train.iloc[:, 1:]
val_total = val.iloc[:, 1:]
test_total = test.iloc[:, 1:]

In [7]:
val_total_x = val_total.drop(['Class'], axis=1)
val_total_y = val_total.loc[:, 'Class']

In [8]:
from sklearn.preprocessing import StandardScaler  # 표준화 
std = StandardScaler()

train_total_std = std.fit_transform(train_total) 
val_total_x_std = std.fit_transform(val_total_x) 
test_total_std = std.fit_transform(test_total) 

In [9]:
selection = [1, 2, 3, 4, 5, 7, 8, 9, 10, 11, 12, 14, 16, 17, 18, 21, 27, 30]
idx = selection-np.ones_like(selection)
def feature_selection(data):
  select_data = data[:,idx]
  return select_data

In [10]:
select_train = feature_selection(train_total_std)
select_val_x = feature_selection(val_total_x_std)
select_test = feature_selection(test_total_std)

## EllipticEnvelope

#### 라벨링
* +1 이면 boundary 안에 들어온 값으로 정상 데이터
* -1 이면 outlier

정상데이터: 0 , 이상치: 1로 라벨링 수정

In [11]:
def get_pred_label(model_pred):
  model_pred = np.where(model_pred==1, 0, model_pred)
  model_pred = np.where(model_pred==-1, 1, model_pred)
  return model_pred

### EE Model Fit

In [24]:
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
from sklearn.metrics import classification_report
EE_model = EllipticEnvelope(support_fraction = 0.999, contamination = 0.00112, random_state = 42)

EE_model.fit(select_train)

### Predict

In [25]:
val_EE_pred = EE_model.predict(select_val_x)
val_EE_pred = get_pred_label(val_EE_pred)
val_EE_score = f1_score(val_total_y, val_EE_pred, average='macro')
print(f'Validation F1 Score : [{val_EE_score}]')
print(classification_report(val_total_y, val_EE_pred))

Validation F1 Score : [0.8927516353661109]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.85      0.73      0.79        30

    accuracy                           1.00     28462
   macro avg       0.92      0.87      0.89     28462
weighted avg       1.00      1.00      1.00     28462



## AutoEncoder

In [None]:
select_train.shape

(113842, 18)

In [None]:
import tensorflow as tf
from tensorflow.keras import Model
# Sequential

from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras import layers
# Conv2D, Conv2DTranspose, Dense, Flatten, Dropout, BatchNormalization, Reshape, LeakyReLU

### Model

In [None]:
input_dim = select_train.shape[1]
output_dim = input_dim
encoding_dim = 128

input_layer = layers.Input(shape=(input_dim, ))

encoder1 = layers.Dense(encoding_dim/2, name='encoder1')(input_layer)
encoder1 = layers.BatchNormalization()(encoder1)
encoder1 = layers.LeakyReLU()(encoder1)

encoder2 = layers.Dense(encoding_dim, name='encoder2')(encoder1)
encoder2 = layers.BatchNormalization()(encoder2)
encoder2 = layers.LeakyReLU()(encoder2)

decoder1 = layers.Dense(encoding_dim/2, name='decoder1')(encoder2)
decoder1 = layers.BatchNormalization()(decoder1)
decoder1 = layers.LeakyReLU()(decoder1)

decoder2 = layers.Dense(output_dim, name='decoder2')(decoder1)

autoencoder = Model(input_layer, decoder2)
autoencoder.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=0.01))

In [None]:
print("====== 모델 전체 구조 ======")
print(autoencoder.summary())

Model: "model_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_6 (InputLayer)        [(None, 18)]              0         
                                                                 
 encoder1 (Dense)            (None, 64)                1216      
                                                                 
 batch_normalization_19 (Bat  (None, 64)               256       
 chNormalization)                                                
                                                                 
 leaky_re_lu_19 (LeakyReLU)  (None, 64)                0         
                                                                 
 encoder2 (Dense)            (None, 128)               8320      
                                                                 
 batch_normalization_20 (Bat  (None, 128)              512       
 chNormalization)                                          

### Fit

In [None]:
EPOCHS = 400
BATCH = 2**14
reduce_lr = ReduceLROnPlateau(monitor='loss', mode='max', factor=0.5, patience=10, min_lr=1e-8, verbose=True)
# es = EarlyStopping(monitor='loss', patience=5, verbose=1)

In [None]:
hist = autoencoder.fit(select_train, select_train, 
                 epochs=EPOCHS,
                 batch_size=BATCH,
                 callbacks=[reduce_lr],
                 shuffle=True)

Epoch 1/400
Epoch 2/400
Epoch 3/400
Epoch 4/400
Epoch 5/400
Epoch 6/400
Epoch 7/400
Epoch 8/400
Epoch 9/400
Epoch 10/400
Epoch 11/400
1/7 [===>..........................] - ETA: 0s - loss: 0.0155
Epoch 11: ReduceLROnPlateau reducing learning rate to 0.004999999888241291.
Epoch 12/400
Epoch 13/400
Epoch 14/400
Epoch 15/400
Epoch 16/400
Epoch 17/400
Epoch 18/400
Epoch 19/400
Epoch 20/400
Epoch 21/400
Epoch 21: ReduceLROnPlateau reducing learning rate to 0.0024999999441206455.
Epoch 22/400
Epoch 23/400
Epoch 24/400
Epoch 25/400
Epoch 26/400
Epoch 27/400
Epoch 28/400
Epoch 29/400
Epoch 30/400
Epoch 31/400
1/7 [===>..........................] - ETA: 0s - loss: 0.0092
Epoch 31: ReduceLROnPlateau reducing learning rate to 0.0012499999720603228.
Epoch 32/400
Epoch 33/400
Epoch 34/400
Epoch 35/400
Epoch 36/400
Epoch 37/400
Epoch 38/400
Epoch 39/400
Epoch 40/400
Epoch 41/400
1/7 [===>..........................] - ETA: 0s - loss: 0.0082
Epoch 41: ReduceLROnPlateau reducing learning rate to 0.0006

### Predict 예측

In [None]:
val_y_pred = autoencoder.predict(select_val_x)



#### 예측값 코사인유사도 계산

In [None]:
from numpy import dot
from numpy.linalg import norm

def cos_sim(A, B):
  return dot(A, B)/(norm(A)*norm(B))

In [None]:
def cosine_similarity(true, pred):
  diff = []
  for i in range(len(true)):
    diff.append(cos_sim(true[i], pred[i]))
  diff = pd.Series(diff)
  return np.array(diff)

#### 예측 라벨링

In [None]:
val_AE_pred = np.where(cosine_similarity(select_val_x, val_y_pred)>0.9883, 0, 1)

In [None]:
pd.DataFrame(val_total_y[val_total_y != val_AE_pred]).value_counts()

Class
0        191
1         28
dtype: int64

In [None]:
val_AE_score = f1_score(val_total_y, val_AE_pred, average='macro')
print(f'Validation F1 Score : [{val_AE_score}]')
print(classification_report(val_total_y, val_AE_pred))

Validation F1 Score : [0.5070374269939165]
              precision    recall  f1-score   support

           0       1.00      0.99      1.00     28432
           1       0.01      0.07      0.02        30

    accuracy                           0.99     28462
   macro avg       0.50      0.53      0.51     28462
weighted avg       1.00      0.99      1.00     28462



In [None]:
# (0,0)=>0  | (1,0)=>1  | (0,1)=>0  | (1,1)=>1
from collections import Counter

def mode (x) :
    cnt = Counter(x)
    mode = cnt.most_common(1)
    return mode[0][0]

In [None]:
def get_ensemble_pred(test):
  # pred EE
  print('EE_model')
  test_EE_pred = EE_model.predict(test)
  test_EE_pred = get_pred_label(test_EE_pred)

  # AE
  print('AE_model')
  test_AE_pred = autoencoder.predict(test)
  test_AE_pred = np.where(cosine_similarity(test, test_AE_pred)>0.974, 0, 1)

  preds = pd.DataFrame(zip(test_EE_pred, test_AE_pred))
  preds.columns = ['pred_EE', 'pred_AE']
  
  return preds, preds.apply(mode,axis = 1)

In [None]:
test_pred = get_ensemble_pred(select_test)[1]

EE_model
AE_model


In [None]:
test_pred.value_counts()

0    142236
1       267
dtype: int64

In [None]:
val_pred = get_ensemble_pred(select_val_x)[1]

EE_model
AE_model


In [None]:
pd.DataFrame(val_total_y[val_total_y != val_pred]).value_counts()

Class
1        10
0         8
dtype: int64

In [None]:
val_score = f1_score(val_total_y, val_pred, average='macro')
print(f'Validation F1 Score : [{val_score}]')
print(classification_report(val_total_y, val_pred))

Validation F1 Score : [0.8446693194042376]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     28432
           1       0.71      0.67      0.69        30

    accuracy                           1.00     28462
   macro avg       0.86      0.83      0.84     28462
weighted avg       1.00      1.00      1.00     28462

