In [23]:
# Base

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
import re
import sys

# Preprocessing

from impyute.imputation.cs import fast_knn

from sklearn.metrics import accuracy_score, classification_report, mean_squared_error as mse
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch
import torch.nn.functional as F
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import random as rd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV as GSCV
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader

import tensorflow as tf

from lightgbm import LGBMClassifier as lgb
from sklearn.model_selection import StratifiedKFold
import xgboost.sklearn as xgb

import optuna

In [7]:
class config:
    seed = 42
    device = "cuda:0"    
        
    lr = 1e-3
    epochs = 25
    batch_size = 32
    num_workers = 4
    train_5_folds = True

In [9]:
def seed_everything(seed: int = 42):
    rd.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic=True
    torch.backends.cudnn.benchmark=True
    
seed_everything(config.seed)

In [10]:
def balance_logloss(y_true, y_pred):
    y_pred = np.clip(y_pred, 1e-15, 1-1e-15)
    y_pred /= np.sum(y_pred, axis=1)[:, None]
    nc = np.bincount(y_true)
    
    logloss = (-1/nc[0]*(np.sum(np.where(y_true==0,1,0) * np.log(y_pred[:,0]))) - 1/nc[1]*(np.sum(np.where(y_true!=0,1,0) * np.log(y_pred[:,1])))) / 2
    
    return logloss

# def balance_loglossv2(y_true, y_pred):
#     from sklearn.metrics import log_loss
    
#     target_mean = y_true.mean()
#     w0 = 1/(1-target_mean)
#     w1 = 1/target_mean
#     sample_weight = [w0 if y == 0 else w1 for y in y_true]
#     loss = log_loss(y_true, y_pred, sample_weight=sample_weight)
    
#     return loss

def b_logloss_keras(y_true, y_pred):
    #(2, ) -> (1, ) inverse_one_hot encoding
    #y_true = y_true[:, 0] * (1 - y_true[:, 1]) + y_true[:, 1] * (1 - y_true[:, 0])
    y_true = y_true[:, 1] * (1 - y_true[:, 0])
    y_true = tf.cast(y_true, tf.int64)
    score = tf.py_function(func=balance_logloss, inp=[y_true, y_pred], Tout=tf.float32)
    return score

In [11]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')
greeks = pd.read_csv('./greeks.csv')
submission = pd.read_csv('./sample_submission.csv')

In [47]:
train.info()
print('/n__________________________________/n')
test.info()
print('/n__________________________________/n')
greeks.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 617 entries, 0 to 616
Data columns (total 58 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Id      617 non-null    object 
 1   AB      617 non-null    float64
 2   AF      617 non-null    float64
 3   AH      617 non-null    float64
 4   AM      617 non-null    float64
 5   AR      617 non-null    float64
 6   AX      617 non-null    float64
 7   AY      617 non-null    float64
 8   AZ      617 non-null    float64
 9   BC      617 non-null    float64
 10  BD      617 non-null    float64
 11  BN      617 non-null    float64
 12  BP      617 non-null    float64
 13  BQ      557 non-null    float64
 14  BR      617 non-null    float64
 15  BZ      617 non-null    float64
 16  CB      615 non-null    float64
 17  CC      614 non-null    float64
 18  CD      617 non-null    float64
 19  CF      617 non-null    float64
 20  CH      617 non-null    float64
 21  CL      617 non-null    float64
 22  CR

In [45]:
# train으로 test셋 만들기
train.Class.value_counts() # 0 : 509, 1 : 108

0    509
1    108
Name: Class, dtype: int64

In [46]:
# 60개(class : 0, class : 1) 테스트 셋 만들기
## 30개 class : 0
mask = train.Class == 0
test =  train[mask][:30]
dropIndex1 = train[mask][:30].index

In [49]:
## 30개 class : 1
mask = train.Class == 1
test = test.append(train[mask][:30]) 
dropIndex2 = train[mask][:30].index
test.Class.value_counts() # 0 : 30, 1 : 30
print(test.shape)
test = test.drop(columns = ['Class'], axis = 1)
print(test.shape)

(90, 58)
(90, 57)


In [50]:
# drop
print(train.shape)
train = train.drop(index=dropIndex1)
train = train.drop(index=dropIndex2)
print(train.shape)

(617, 58)
(557, 58)


In [53]:
test.to_csv('./test(make).csv',index = False)
train.to_csv('./train(make).csv',index = False)

In [54]:
train_make = pd.read_csv('./train(make).csv')

In [13]:
train_modifying = train.drop(columns=['Id'])

In [16]:
label_encoder = LabelEncoder()
train_modifying['EJ'] = label_encoder.fit_transform(train_modifying['EJ'])
train_modifying['EJ']

0      1
1      0
2      1
3      1
4      1
      ..
612    0
613    1
614    0
615    1
616    0
Name: EJ, Length: 617, dtype: int64

In [34]:
missing_data = train_modifying.isnull().sum()
missing_data

AB        0
AF        0
AH        0
AM        0
AR        0
AX        0
AY        0
AZ        0
BC        0
BD        0
BN        0
BP        0
BQ       60
BR        0
BZ        0
CB        2
CC        3
CD        0
CF        0
CH        0
CL        0
CR        0
CS        0
CU        0
CW        0
DA        0
DE        0
DF        0
DH        0
DI        0
DL        0
DN        0
DU        1
DV        0
DY        0
EB        0
EE        0
EG        0
EH        0
EJ        0
EL       60
EP        0
EU        0
FC        1
FD        0
FE        0
FI        0
FL        1
FR        0
FS        2
GB        0
GE        0
GF        0
GH        0
GI        0
GL        1
Class     0
dtype: int64

In [35]:
correlation_threshold = 0.7

In [44]:
correlated_cols = {}
for column in train_modifying.columns:
    if train_modifying[column].isnull().sum() > 0:
        correlated_cols[column] = train_modifying.corr().abs()[column].sort_values(ascending=False)
        correlated_cols[column] = correlated_cols[column][correlated_cols[column].index != column]
        correlated_cols[column] = correlated_cols[column][correlated_cols[column] > correlation_threshold]
        
for column, correlated_columns in correlated_cols.items():
    for correlated_column, correlated_value in correlated_columns.items():
        missing_indices = train_modifying[train_modifying[column].isnull()].index
        for index in missing_indices:
            if not pd.isnull(train_modifying[correlated_column][index]):
                train_modifying[column][index] = train_modifying[correlated_column][index]

                train_modifying.loc[index, column] = non_null_cols.mode().iloc[0]

train_modifying.to_csv('./train_modifying.csv', index=False)
                
        
        