In [126]:
import random
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm

from sklearn.metrics import f1_score
from sklearn.preprocessing import MinMaxScaler, PowerTransformer, QuantileTransformer, StandardScaler
from sklearn.metrics import accuracy_score, recall_score, \
                                precision_score, f1_score


# Use the GPU when PyTorch can see one; otherwise everything runs on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Run configuration shared by the cells below.
EPOCHS = 13   # number of passes Trainer.fit makes over the training loader
LR = 1e-2     # learning rate handed to the Adam optimizer
BS = 16384    # batch size used by every DataLoader
SEED = 42     # value passed to seed_everything

def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random``, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), and configures cuDNN for deterministic
    behaviour.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Hash randomisation is read at interpreter start-up, so this only
    # influences Python processes launched after this call.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one (safe without CUDA).
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The cuDNN auto-tuner may pick different kernels from run to run, which
    # defeats the deterministic flag above, so it has to stay disabled.
    torch.backends.cudnn.benchmark = False

# Fix the random seeds before any data or model code runs.
seed_everything(SEED)

# Training split without the row identifier.
train_df = pd.read_csv('../input/dasdasdad/train.csv')
train_data = train_df.drop(columns=['ID'])

# Validation split: keep the labels aside, split the labelled rows by class
# for the z-test, then strip the label from the feature frame.
valid_df = pd.read_csv('../input/dasdasdad/val.csv')
valid_class = valid_df['Class']
valid_data = valid_df.drop(columns=['ID'])
valid_normal = valid_data.loc[valid_data['Class'] == 0]
valid_fraud = valid_data.loc[valid_data['Class'] == 1]
valid_data = valid_data.drop(columns=['Class'])

# All columns of the labelled frame except the last one.
# NOTE(review): this assumes 'Class' is the last column of val.csv — confirm.
cols = valid_normal.columns[:-1]
# Not referenced by any other visible cell; presumably the fraud ratio of the
# validation set — verify before relying on it.
fracc = 0.0010540369615627855

# Test split without the row identifier.
test_df = pd.read_csv('../input/dasdasdad/test.csv')
test_data = test_df.drop(columns=['ID'])

In [127]:
def ztest(feature):
    """Return a z-style statistic comparing fraud rows with normal rows.

    The statistic is the difference between the fraud mean and the normal
    mean of ``feature``, divided by the standard error computed from the
    fraud rows (their standard deviation over ``sqrt(sample_size)``).

    Reads the module-level ``valid_normal``, ``valid_fraud`` and
    ``sample_size``.

    Args:
        feature: Column name present in both validation frames.

    Returns:
        The statistic as a float.
    """
    normal_mean = valid_normal[feature].mean()
    fraud_values = valid_fraud[feature]
    standard_error = fraud_values.std() / np.sqrt(sample_size)
    return (fraud_values.mean() - normal_mean) / standard_error

# Number of fraud rows; ztest reads this as a module-level name.
sample_size = len(valid_fraud)
# Presumably the two-sided 1% critical value of the standard normal — confirm.
critical_value = 2.58

significant_features = []
none_sinificant_features = []

# Sort every candidate column into one of two buckets by |z|:
#   |z| >= critical_value        -> significant
#   1.5 < |z| < critical_value   -> borderline (kept for experimentation)
# Columns with |z| <= 1.5 are discarded.
for feature in cols:
    magnitude = abs(ztest(feature))
    if magnitude >= critical_value:
        significant_features.append(feature)
    elif 1.5 < magnitude < critical_value:
        none_sinificant_features.append(feature)

In [128]:
# none_sinificant_features

In [129]:
from sklearn.decomposition import PCA

def append_pca_feature(n, pca_features, significant_features, datasets):
    """Replace a group of columns by their principal components.

    A PCA with ``n`` components is fitted on the ``pca_features`` columns of
    the first dataset only, then applied to every dataset. Each result is the
    dataset's ``significant_features`` columns followed by the PCA columns
    ``pca_0`` ... ``pca_{n-1}``.

    Args:
        n: Number of principal components to keep.
        pca_features: Column names fed into the PCA.
        significant_features: Column names copied through unchanged.
        datasets: Sequence of at least three DataFrames, in the order
            train, validation, test. The first one is used for fitting.

    Returns:
        Tuple ``(train, valid, test)`` of DataFrames built from the first
        three entries of ``datasets``.
    """
    # Restrict every split to the PCA input columns.
    pca_inputs = [dataset[pca_features] for dataset in datasets]

    # Fit on the first split only so the other splits are projected with
    # components derived from it.
    pca_model = PCA(n_components=n)
    pca_model.fit(pca_inputs[0])

    pca_columns = [f'pca_{i}' for i in range(n)]

    significant_datasets = []
    for original, pca_input in zip(datasets, pca_inputs):
        # Reuse the source index: pd.concat(axis=1) aligns on index labels, so
        # a fresh RangeIndex would misalign rows (and introduce NaNs) whenever
        # the input frame is not indexed 0..len-1.
        components = pd.DataFrame(
            pca_model.transform(pca_input),
            columns=pca_columns,
            index=original.index,
        )
        merged = pd.concat([original[significant_features], components], axis=1)
        significant_datasets.append(merged)

    train_significant_data = significant_datasets[0]
    valid_significant_data = significant_datasets[1]
    test_significant_data = significant_datasets[2]

    return train_significant_data, valid_significant_data, test_significant_data


# Columns that are projected onto three principal components.
# Alternative tried during experimentation: pca_features = none_sinificant_features
pca_features = [
    'V8', 'V13', 'V23', 'V25', 'V15',
    'V19', 'V20', 'V21', 'V22', 'V24',
    'V27', 'V26', 'V29', 'V28', 'V30',
]
datasets = [train_data, valid_data, test_data]
train_significant_data, valid_significant_data, test_significant_data = append_pca_feature(
    3, pca_features, significant_features, datasets
)

In [130]:
# Column labels for the scaled frames: the z-test selections followed by three
# labels for the PCA columns. These labels are applied positionally when the
# scaled arrays are wrapped back into DataFrames.
# The membership filter makes the cell idempotent: running it more than once
# does not append the PCA labels a second time.
pca_labels = ['pca_1', 'pca_2', 'pca_3']
significant_features = [name for name in significant_features if name not in pca_labels] + pca_labels

In [131]:
# Two-stage scaling: a quantile transform to a normal distribution followed by
# a power transform. Both are fitted on the training split only and then
# re-applied unchanged to the validation and test splits.
# (The name `min_scaler` is historical; the object is a QuantileTransformer.)
min_scaler = QuantileTransformer(output_distribution='normal', random_state=42)
power_scaler = PowerTransformer()

train_significant_data = pd.DataFrame(
    power_scaler.fit_transform(min_scaler.fit_transform(train_significant_data)),
    columns=significant_features,
)

valid_significant_data = pd.DataFrame(
    power_scaler.transform(min_scaler.transform(valid_significant_data)),
    columns=significant_features,
)
# Re-attach the labels so the validation dataset can return (features, label).
valid_significant_data['Class'] = valid_class

test_significant_data = pd.DataFrame(
    power_scaler.transform(min_scaler.transform(test_significant_data)),
    columns=significant_features,
)

In [132]:
# Number of model input columns; the AutoEncoder in this notebook is built for 18.
train_significant_data.shape[1]

In [133]:
class MyDataset(Dataset):
    """Row-wise dataset over a pandas DataFrame.

    Args:
        df: Source DataFrame. In eval mode it must contain a ``'Class'``
            column, which is split off as the labels.
        eval_mode: When truthy, items are ``(features, label)`` pairs;
            otherwise items are the feature tensor alone.
    """

    def __init__(self, df, eval_mode):
        self.eval_mode = eval_mode
        if self.eval_mode:
            # Keep labels and features as separate NumPy arrays.
            self.labels = df['Class'].values
            self.df = df.drop(columns=['Class']).values
        else:
            self.df = df.values

    def __getitem__(self, index):
        """Return the row at ``index`` as a float32 tensor (plus its label in eval mode)."""
        # Use a local instead of writing the row onto the instance: lookups
        # should not leave per-item state behind on the dataset.
        features = torch.Tensor(self.df[index])
        if self.eval_mode:
            return features, self.labels[index]
        return features

    def __len__(self):
        """Return the number of rows."""
        return len(self.df)

In [134]:
# Training batches: features only, reshuffled on every pass.
train_dataset = MyDataset(train_significant_data, eval_mode=False)
train_loader = DataLoader(
    train_dataset,
    batch_size=BS,
    shuffle=True,
    num_workers=2,
    pin_memory=True,
)

# Validation batches: (features, label) pairs in their original order.
val_dataset = MyDataset(valid_significant_data, eval_mode=True)
val_loader = DataLoader(
    val_dataset,
    batch_size=BS,
    shuffle=False,
    num_workers=2,
    pin_memory=True,
)

In [135]:
class Swish(nn.Module):
    """Swish activation: ``x * sigmoid(x)``, applied element-wise."""

    def __init__(self):
        super(Swish, self).__init__()
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Gate the input by its own sigmoid and return the product."""
        gate = self.sigmoid(x)
        return x * gate

In [136]:
class AutoEncoder(nn.Module):
    """Fully connected auto-encoder that reconstructs its input.

    The encoder maps ``in_features -> 64 -> 128`` and the decoder maps
    ``128 -> 64 -> in_features``. Each 64-wide layer is followed by batch
    normalisation and a Swish activation.

    Args:
        in_features: Width of the input (and of the reconstruction).
            Defaults to 18, the width this class used before it became a
            parameter.
    """

    def __init__(self, in_features=18):
        super(AutoEncoder, self).__init__()
        self.Encoder = nn.Sequential(
            nn.Linear(in_features, 64),
            nn.BatchNorm1d(64),
            Swish(),
            nn.Linear(64, 128),
            Swish(),
        )
        self.Decoder = nn.Sequential(
            nn.Linear(128, 64),
            nn.BatchNorm1d(64),
            Swish(),
            nn.Linear(64, in_features),
        )

    def forward(self, x):
        """Encode then decode ``x``; the output has the same shape as the input."""
        x = self.Encoder(x)
        x = self.Decoder(x)
        return x

In [137]:
class Trainer():
    """Train an auto-encoder and score it as an anomaly detector.

    Training minimises the L1 reconstruction error on ``train_loader``.
    Validation flags a row as class 1 when the cosine similarity between the
    row and its reconstruction is below a threshold, and reports the macro F1
    against the labels from ``val_loader``.

    Args:
        model: Network to train; may be wrapped in ``nn.DataParallel``.
        optimizer: Optimizer over the model's parameters.
        train_loader: Iterable yielding feature batches.
        val_loader: Iterable yielding ``(features, labels)`` batches.
        scheduler: Optional scheduler stepped with the validation score
            after every epoch, or ``None``.
        device: Device the model and batches are moved to.
    """

    def __init__(self, model, optimizer, train_loader, val_loader, scheduler, device):
        self.model = model
        self.optimizer = optimizer
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.scheduler = scheduler
        self.device = device
        # Loss function: mean absolute reconstruction error.
        self.criterion = nn.L1Loss().to(self.device)

    def fit(self, ):
        """Train for ``EPOCHS`` epochs and checkpoint the best-scoring weights.

        After each epoch the model is validated with threshold 0.95; whenever
        the score beats the best so far, the weights are written to
        ``./best_model.pth``.
        """
        self.model.to(self.device)
        best_score = 0
        for epoch in range(EPOCHS):
            self.model.train()
            train_loss = []
            for x in self.train_loader:
                x = x.float().to(self.device)
                self.optimizer.zero_grad()

                _x = self.model(x)
                # (prediction, target) order; L1 is symmetric so the value
                # and gradients match the reverse order.
                loss = self.criterion(_x, x)

                loss.backward()
                self.optimizer.step()

                train_loss.append(loss.item())

            score = self.validation(self.model, 0.95)
            print(f'Epoch : [{epoch}] Train loss : [{np.mean(train_loss)}] Val Score : [{score}])')

            if self.scheduler is not None:
                self.scheduler.step(score)

            if best_score < score:
                best_score = score
                # Save the trainer's own model, not a module-level `model`,
                # and unwrap DataParallel only when it is actually present so
                # the checkpoint keys carry no 'module.' prefix.
                if isinstance(self.model, nn.DataParallel):
                    model_to_save = self.model.module
                else:
                    model_to_save = self.model
                torch.save(model_to_save.state_dict(), './best_model.pth', _use_new_zipfile_serialization=False)

    def validation(self, eval_model, thr):
        """Return the macro F1 of ``eval_model`` on the validation loader.

        Args:
            eval_model: Model to evaluate; it is switched to eval mode.
            thr: Cosine-similarity threshold; rows scoring below it are
                predicted as class 1, all others as class 0.

        Returns:
            Macro-averaged F1 score over all validation rows.
        """
        cos = nn.CosineSimilarity(dim=1, eps=1e-6)
        eval_model.eval()
        pred = []
        true = []
        with torch.no_grad():
            for x, y in self.val_loader:
                x = x.float().to(self.device)

                # Run the model that was passed in (and put in eval mode),
                # not unconditionally self.model.
                _x = eval_model(x)
                diff = cos(x, _x).cpu().tolist()
                batch_pred = np.where(np.array(diff) < thr, 1, 0).tolist()

                pred += batch_pred
                true += y.tolist()

        return f1_score(true, pred, average='macro')

In [138]:
# Wrap the network so batches can be split across the available GPUs.
model = nn.DataParallel(AutoEncoder())
model.eval()  # Trainer.fit switches back to train mode at the start of each epoch

optimizer = optim.Adam(model.parameters(), lr=LR)
# Halve the learning rate when the validation score (higher is better) stops improving.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='max',
    factor=0.5,
    patience=10,
    threshold_mode='abs',
    min_lr=1e-8,
    verbose=True,
)

trainer = Trainer(model, optimizer, train_loader, val_loader, scheduler, device)
trainer.fit()
# Logged from an earlier run:
# Epoch : [8] Train loss : [0.07701918057032994] Val Score : [0.9165787375726882])

In [139]:
# Trainer.fit writes the weights of the best-scoring epoch to ./best_model.pth.
# Restore them before predicting, so the submission comes from the
# best-validated model rather than from whatever the last epoch produced.
# The checkpoint holds the unwrapped module's state dict, hence `model.module`.
best_checkpoint = './best_model.pth'
if os.path.exists(best_checkpoint):
    model.module.load_state_dict(torch.load(best_checkpoint, map_location=device))
model.eval()

# Test rows carry no labels, so the dataset yields feature tensors only.
test_dataset = MyDataset(test_significant_data, False)
test_loader = DataLoader(test_dataset, batch_size=BS, shuffle=False, num_workers=2)

In [140]:
def prediction(model, thr, test_loader, device):
    """Predict a 0/1 class for every row served by ``test_loader``.

    Each batch is reconstructed by ``model``; a row is flagged as 1 when the
    cosine similarity between the row and its reconstruction is below
    ``thr``, and as 0 otherwise.

    Args:
        model: Network used for reconstruction; moved to ``device`` and put
            in eval mode.
        thr: Cosine-similarity threshold.
        test_loader: Iterable yielding feature batches.
        device: Device used for inference.

    Returns:
        Flat list of ints (0 or 1), one per row, in loader order.
    """
    model.to(device)
    model.eval()
    similarity = nn.CosineSimilarity(dim=1, eps=1e-8)
    flags = []
    with torch.no_grad():
        for batch in test_loader:
            batch = batch.float().to(device)
            reconstructed = model(batch)
            # Compare as Python floats, exactly like the per-row scores are
            # thresholded elsewhere in the notebook.
            scores = np.array(similarity(batch, reconstructed).cpu().tolist())
            flags.extend(np.where(scores < thr, 1, 0).tolist())
    return flags

In [141]:
# Score the test rows with the same 0.95 similarity threshold used during validation.
preds = prediction(model=model, thr=0.95, test_loader=test_loader, device=device)

In [142]:
# Fill the sample submission's 'Class' column with the predictions and write it out.
submit = pd.read_csv('../input/dasdasdad/sample_submission.csv')
submit = submit.assign(Class=preds)
submit.to_csv('./submit21.csv', index=False)