In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import LocalOutlierFactor

from sklearn.preprocessing import Normalizer, MinMaxScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import f1_score


# Directory holding the competition CSV files.
data_path = './open/'

# Read the four tables in a fixed order: train, test, validation, submission template.
train, test, valid, submission = (
    pd.read_csv(data_path + file_name)
    for file_name in ('train.csv', 'test.csv', 'val.csv', 'sample_submission.csv')
)



KeyboardInterrupt: 

In [None]:
#### Isolation Forest ####
# TODO: ensembling, grid search

# Fixed seed so the 10% subsample and the fitted forest are reproducible across runs.
SEED = 42

train_droped = train.drop('ID', axis=1)
sample = train_droped.sample(frac=0.1, random_state=SEED)

valid_droped = valid.drop(['ID', 'Class'], axis=1)

# Look the class counts up by label instead of unpacking value_counts() positionally,
# which depends on the frequency ordering and breaks if a class is missing.
class_counts = valid['Class'].value_counts()
val_normal = class_counts.get(0, 0)
val_fraud = class_counts.get(1, 0)

# `contamination` is the proportion of outliers in the data set, i.e. fraud / total
# (not fraud / normal).
val_contamination = val_fraud / (val_normal + val_fraud)

N_ITER = 300
# Train on the unlabeled subsample; every tree sees the whole subsample.
iso_classifiers = IsolationForest(
    n_estimators=N_ITER,
    max_samples=len(sample),
    contamination=val_contamination,
    random_state=SEED,
).fit(sample)

# IsolationForest.predict returns -1 for outliers and 1 for inliers;
# recode to the competition convention: 1 = fraud, 0 = normal.
y_prediction = np.where(iso_classifiers.predict(sample) == -1, 1, 0)




In [None]:
# Validation: score the fitted forest against the labelled validation set.
valid_label = valid['Class']

# predict() yields 1 for inliers and -1 for outliers; recode to 0 = normal, 1 = fraud.
valid_predict = np.where(iso_classifiers.predict(valid_droped) == -1, 1, 0)

val_score = f1_score(valid_label, valid_predict, average='macro')
print(f'val f1 : {val_score}')
print(classification_report(valid_label, valid_predict))


In [None]:
# Test: label the unlabeled test rows with the fitted forest.
test_droped = test.drop(['ID'], axis=1)

# predict() yields 1 for inliers and -1 for outliers; recode to 0 = normal, 1 = fraud.
iso_test_predict = np.where(iso_classifiers.predict(test_droped) == -1, 1, 0)

# NOTE: this cell only computes the predictions; no submission file is written here.



In [None]:
##### Autoencoder ####

import torch
import torchvision
from torchvision import transforms
import torch.nn.functional as F

# from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error # Regression 문제의 평가를 위해 MSE(Mean Squared Error)를 불러온다.
import numpy as np

import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset 
import numpy as np
import cv2
import matplotlib.pyplot as plt
from transformers import get_cosine_schedule_with_warmup
# Reload the raw tables so the autoencoder section starts from untouched frames.
data_path = './open/'
train, test, valid, submission = map(
    pd.read_csv,
    (data_path + file_name
     for file_name in ('train.csv', 'test.csv', 'val.csv', 'sample_submission.csv')),
)

# Use the Apple-silicon GPU backend when present, otherwise run on the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# Training hyper-parameters.
BATCH_SIZE = 32
Epochs = 200

In [None]:
# Row-wise normalisation followed by per-feature min-max scaling.
pipeline = Pipeline([('normalizer', Normalizer()),
                     ('scaler', MinMaxScaler())])

valid_droped = valid.drop(['ID', 'Class'], axis=1)
train_droped = train.drop('ID', axis=1)

# Fit on the training data ONLY. Calling fit() a second time on the validation frame
# replaces the training statistics (each fit starts from scratch), which leaks
# validation information and scales the training data with the wrong min/max.
pipeline.fit(train_droped)

# Apply the training-set statistics to both splits.
train_transformed = pipeline.transform(train_droped)
valid_transformed = pipeline.transform(valid_droped)

In [None]:

class TensorData(Dataset):
    """Map-style dataset that pairs each feature row with its target row.

    Both inputs are converted with ``torch.FloatTensor`` when the dataset is built.
    """

    def __init__(self, x_data, y_data):
        """Store features and targets as float tensors.

        Args:
            x_data: Feature rows; anything ``torch.FloatTensor`` accepts.
            y_data: Target rows; the size of its first dimension is the dataset length.
        """
        features = torch.FloatTensor(x_data)
        targets = torch.FloatTensor(y_data)
        self.x_data = features
        self.y_data = targets
        # Number of samples, taken from the targets' leading dimension.
        self.len = targets.size(0)

    def __len__(self):
        """Return the number of samples."""
        return self.len

    def __getitem__(self, index):
        """Return the ``(features, target)`` pair stored at ``index``."""
        features = self.x_data[index]
        target = self.y_data[index]
        return features, target

In [None]:
# Autoencoder targets are the inputs themselves, so each dataset gets the same array twice.
trainsets = TensorData(x_data=train_transformed, y_data=train_transformed)
validset = TensorData(x_data=valid_transformed, y_data=valid_transformed)

# Training batches are shuffled; validation keeps row order.
trainloader = DataLoader(dataset=trainsets, batch_size=BATCH_SIZE, shuffle=True)
validloader = DataLoader(dataset=validset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
#MODEL
    
class Autoencoder(nn.Module):
    """Fully connected autoencoder with a 2-unit bottleneck.

    Layer widths: input_dim -> 16 -> 8 -> 4 -> 2 -> 4 -> 8 -> 16 -> input_dim.
    Every linear layer, including the bottleneck and the output layer, is
    followed by a ReLU.
    """

    def __init__(self, input_dim):
        """Build the encoder and decoder stacks.

        Args:
            input_dim: Number of features in each input row.
        """
        super().__init__()
        self.input_dim = input_dim

        encoder_widths = [input_dim, 16, 8, 4, 2]
        decoder_widths = encoder_widths[::-1]

        # The encoder is created before the decoder, layer by layer in order.
        self.encoder = self._stack(encoder_widths)
        self.decoder = self._stack(decoder_widths)

    @staticmethod
    def _stack(widths):
        """Return a Sequential of Linear+ReLU pairs joining consecutive widths."""
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.append(nn.Linear(fan_in, fan_out, bias=True))
            layers.append(nn.ReLU())
        return nn.Sequential(*layers)

    def forward(self, x):
        """Encode ``x`` to the bottleneck and decode it back to ``input_dim`` features."""
        return self.decoder(self.encoder(x))

In [None]:
# Feature count of the transformed training matrix sets the autoencoder width.
input_dim = train_transformed.shape[-1]
model = Autoencoder(input_dim).to(device)

# Reconstruction loss and optimiser.
criterion = torch.nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)

# Cosine schedule counted in batches: three epochs of warm-up, then decay over all epochs.
scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=3 * len(trainloader),
    num_training_steps=Epochs * len(trainloader),
)



In [None]:
# Bare expression so the notebook displays the current value of `input_dim`.
input_dim

In [None]:
# Ground-truth validation labels, read from the `Class` column.
fraud_labels = valid['Class']
# Score threshold starts at +inf.
f1_min_score = float('inf')

def calF1score(pres,labels,THRESHOLD):
    """Flag rows whose reconstruction error is an outlier by modified z-score.

    Despite its name, this function does not compute an F1 score; it returns
    the 0/1 outlier flags that callers then pass to a metric.

    The per-row mean squared error between ``pres`` and ``labels`` is compared
    with the median error, scaled by the median absolute deviation (MAD):
    ``z = 0.6745 * |mse - median| / MAD``. The test is two-sided, so rows with
    unusually low error are flagged as well as rows with unusually high error.

    Args:
        pres: Sequence of reconstructed rows (2-D once stacked).
        labels: Sequence of reference rows with the same shape as ``pres``.
        THRESHOLD: Rows with a modified z-score strictly above this value are flagged.

    Returns:
        1-D integer array with 1 for flagged rows and 0 otherwise. Empty input
        gives an empty array.
    """
    preds_np = np.array(pres)
    trues_np = np.array(labels)

    # Nothing to score: return an empty flag vector instead of failing on axis=1.
    if preds_np.size == 0:
        return np.zeros(0, dtype=int)

    mse = np.mean(np.power(preds_np - trues_np, 2), axis=1)

    m = np.median(mse)
    ad = np.abs(mse - m)
    mad = np.median(ad)

    if mad == 0:
        # At least half of the errors equal the median, so dividing by the MAD
        # would give NaN/inf and RuntimeWarnings. Use the limiting result directly:
        # any row that deviates from the median at all is an outlier.
        return np.where(ad > 0, 1, 0)

    z_scores = 0.6745 * ad / mad
    outliers = np.where(z_scores > THRESHOLD, 1, 0)
    return outliers

# Highest validation macro-F1 seen so far. F1 is a higher-is-better metric, so the
# tracker starts at -inf and a checkpoint is written only when the score improves.
best_val_f1 = -np.inf

for epoch in range(Epochs):
    # ---- training pass ----
    model.train()
    train_loss = 0
    for inputs, labels in trainloader:
        inputs = inputs.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)

        loss = criterion(outputs, labels)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        # The scheduler is stepped once per batch.
        scheduler.step()
    if epoch % 5 == 0:
        print(f'{epoch+1} train loss : {train_loss/len(trainloader):.4f}')

    # ---- validation pass ----
    model.eval()
    preds_list = []
    label_list = []
    with torch.no_grad():
        for inputs, labels in validloader:
            inputs = inputs.to(device)
            outputs = model(inputs)

            # Collect reconstructions and their reference rows for error scoring.
            preds_list.extend(outputs.cpu().numpy())
            label_list.extend(labels.cpu().numpy())

    # Turn reconstruction errors into 0/1 fraud flags and score THESE flags.
    # (Scoring `valid_predict` here would evaluate a different model's output.)
    fraud_pres = calF1score(preds_list, label_list, 3)
    val_score = f1_score(fraud_labels, fraud_pres, average='macro')

    print(f'valid f1 : {val_score:.4f} , iter : {epoch}')
    if val_score > best_val_f1:
        print("save model..")
        torch.save(model.state_dict(), './model_state_dict.pth')
        best_val_f1 = val_score




In [None]:
# Run the test features through the fitted preprocessing pipeline.
test_droped = test.drop('ID', axis=1)
test_transformed = pipeline.transform(test_droped)

# Inputs double as targets; row order is kept so predictions line up with the test rows.
submission_data = TensorData(x_data=test_transformed, y_data=test_transformed)
submissionloader = DataLoader(dataset=submission_data, batch_size=BATCH_SIZE, shuffle=False)


# X_valid_transformed = pipeline.transform(valid_x)

In [None]:
from sklearn.metrics import mean_squared_error # Regression 문제의 평가를 위해 MSE(Mean Squared Error)를 불러온다.
import numpy as np

# Reconstruct every test row with the current in-memory model weights.
model.eval()
epoch_valid_loss = 0
preds_list, label_list = [], []

with torch.no_grad():
    for inputs, values in submissionloader:
        inputs = inputs.to(device)
        outputs = model(inputs)

        # Keep the reconstructions and the rows they should reproduce.
        preds_list.extend(outputs.cpu().numpy())
        label_list.extend(values.cpu().numpy())

# 0/1 flags from the reconstruction errors, using a threshold of 3.
auto_pred = calF1score(preds_list, label_list, 3)

       
    
    

In [None]:
# Bare expression so the notebook displays the current value of `auto_pred`.
auto_pred

In [None]:
# Persist model and optimiser state together in a single checkpoint file.
path = './'
torch.save(
    obj=dict(model=model.state_dict(), optimizer=optimizer.state_dict()),
    f=path + 'autoencoder.tar',
)

In [None]:
# Restore model and optimiser state from the checkpoint file.
model_load_path = './autoencoder.tar'
checkpoint = torch.load(f=model_load_path)

# Model first, optimiser last, so the cell's final expression evaluates to None.
model.load_state_dict(state_dict=checkpoint['model'])
optimizer.load_state_dict(state_dict=checkpoint['optimizer'])

In [None]:
# Build the submission frame from the autoencoder's 0/1 flags held in `auto_pred`.
# (`outliers` exists only as a local variable inside calF1score, so referencing it
# at notebook level raises NameError on a fresh kernel.)
auto_pred = pd.DataFrame(auto_pred, columns=['Class'])

# Predictions must line up one-to-one with the submission template's rows.
assert len(auto_pred) == len(submission), "prediction count does not match submission rows"

sumission_auto = pd.concat([submission['ID'], auto_pred['Class']], axis=1)

In [None]:
# Write the submission file first, then end the cell with the preview: a notebook
# only renders the value of a cell's last expression, so a `.head()` placed before
# `to_csv` is silently discarded.
sumission_auto.to_csv('auto_submission.csv', index=False)
sumission_auto.head()

# sumission_auto.info()