# Library Import

In [1]:
!pip install catboost

Collecting catboost
[?25l  Downloading https://files.pythonhosted.org/packages/5a/41/24e14322b9986cf72a8763e0a0a69cc256cf963cf9502c8f0044a62c1ae8/catboost-0.26-cp37-none-manylinux1_x86_64.whl (69.2MB)
[K     |████████████████████████████████| 69.2MB 48kB/s 
Installing collected packages: catboost
Successfully installed catboost-0.26


In [3]:
# *------------ 기본 라이브러리------------*
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# *------------ sklearn ------------*
from sklearn.metrics import f1_score, log_loss, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler, StandardScaler
from sklearn.preprocessing import Normalizer, LabelEncoder


# *------------tf & keras ------------*
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.models import Model, load_model
from tensorflow.keras import callbacks
from tensorflow.keras import backend as K
from tensorflow.keras import utils
from tensorflow.keras.metrics import Metric

# *------------ Catboost ------------*
import catboost as cat
from catboost import CatBoostClassifier

# *------------기본 설정 & 파일 읽기 ------------*
pd.options.display.min_rows=100

train = pd.read_csv("/content/drive/MyDrive/관세청/train.csv", index_col='신고번호')
test = pd.read_csv("/content/drive/MyDrive/관세청/test.csv", index_col='신고번호')
ss = pd.read_csv("/content/drive/MyDrive/관세청/외계인.csv")

seed=1617
def seed_everything(seed):
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.random.set_seed(seed)

seed_everything(seed)

# Embedding 생성

In [4]:
# 신고일자 정리 및 수치 데이터 정제
def preprocessing(df):
    
    #날짜 특징 추출
    df['date_time'] = pd.to_datetime(df['신고일자'])
    df['day']= df.date_time.dt.day
    df['weekday'] = df.date_time.dt.weekday
    df['weekend'] = df['weekday'].isin([5,6]).astype(int)

    # 수치데이터
    df['신고중량(KG)'] = np.log1p(df['신고중량(KG)'])
    df['과세가격원화금액'] = np.log1p(df['과세가격원화금액'])
    
    df.drop(['신고일자', 'date_time', 'weekday'], axis=1, inplace=True)

    return df

train = preprocessing(train)
test = preprocessing(test)

In [5]:
cat_features = [x for x in train.columns if x not in ["우범여부", "핵심적발",'신고중량(KG)','과세가격원화금액']]
num_features = ['신고중량(KG)','과세가격원화금액']

test.loc[:,'우범여부']=-1
test.loc[:,'핵심적발']=-1

data=pd.concat([train,test])


# 각 피처별 tokenize
for feat in cat_features:
    lbl_enc = LabelEncoder()
    data[feat] = lbl_enc.fit_transform(data[feat].fillna("-1").astype(str).values)

# 데이터 분리
train = data[data.우범여부 !=-1]
test = data[data.우범여부 ==-1]

In [6]:
# callback 정의
################# binary -> crime #############################

ES_bin = tf.keras.callbacks.EarlyStopping(monitor='val_state_full_binary_f1',
                                     min_delta=1e-02, patience=5,
                                     verbose=0,
                                     mode='max',
                                     baseline=None, restore_best_weights=True)

LRPlateau_bin = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_state_full_binary_f1', 
                                               factor=0.5, patience=3, verbose=0, min_lr=1e-6,mode='max')

dir_name = '/content/drive/MyDrive/관세청'
model_bin_name = "Embed_special_crime_classifier"

checkpoint_bin_path = os.path.join(dir_name, model_bin_name+'weights.h5')
CP_bin = tf.keras.callbacks.ModelCheckpoint(checkpoint_bin_path, monitor='val_state_full_binary_f1', verbose=False, save_best_only=True, save_weights_only=True)



################# multi -> core #############################

ES_mul = tf.keras.callbacks.EarlyStopping(monitor='val_state_full_multiclass_f1',
                                     min_delta=1e-05, patience=5,
                                     verbose=0,
                                     mode='max',
                                     baseline=None, restore_best_weights=True)

LRPlateau_mul = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_state_full_multiclass_f1', 
                                               factor=0.5, patience=3, verbose=0, min_lr=1e-6, mode='max')

dir_name = '/content/drive/MyDrive/관세청'
model_mul_name = "Embed_special_core_classifier"

checkpoint_mul_path = os.path.join(dir_name, model_mul_name+'weights.h5')
CP_mul = tf.keras.callbacks.ModelCheckpoint(checkpoint_mul_path, monitor='val_state_full_multiclass_f1', verbose=False, save_best_only=True, save_weights_only=True)

In [7]:
# Ref : https://towardsdatascience.com/f-beta-score-in-keras-part-ii-15f91f07c9a4
# custom Total F1 정의

class StatefullBinaryFBeta(Metric):
  def __init__(self, name='state_full_binary_f1', beta=1, threshold=0.5, epsilon=1e-7, **kwargs): # f1 ==> beta:1
    # initializing an object of the super class
    super(StatefullBinaryFBeta, self).__init__(name=name, **kwargs)

    # initializing state variables
    self.tp = self.add_weight(name='tp', initializer='zeros') # initializing true positives 
    self.actual_positive = self.add_weight(name='fp', initializer='zeros') # initializing actual positives
    self.predicted_positive = self.add_weight(name='fn', initializer='zeros') # initializing predicted positives

    # initializing other atrributes that wouldn't be changed for every object of this class
    self.beta_squared = beta**2 
    self.threshold = threshold
    self.epsilon = epsilon

  def update_state(self, ytrue, ypred, sample_weight=None):
    # casting ytrue and ypred as float dtype
    ytrue = tf.cast(ytrue, tf.float32)
    ypred = tf.cast(ypred, tf.float32)

    # setting values of ypred greater than the set threshold to 1 while those lesser to 0
    ypred = tf.cast(tf.greater_equal(ypred, tf.constant(self.threshold)), tf.float32)
        
    self.tp.assign_add(tf.reduce_sum(ytrue*ypred)) # updating true positives atrribute
    self.predicted_positive.assign_add(tf.reduce_sum(ypred)) # updating predicted positive atrribute
    self.actual_positive.assign_add(tf.reduce_sum(ytrue)) # updating actual positive atrribute

  def result(self):
    self.precision = self.tp/(self.predicted_positive+self.epsilon) # calculates precision
    self.recall = self.tp/(self.actual_positive+self.epsilon) # calculates recall

    # calculating fbeta
    self.fb = (1+self.beta_squared)*self.precision*self.recall / (self.beta_squared*self.precision + self.recall + self.epsilon)
    
    return self.fb

  def reset_states(self):
    self.tp.assign(0) # resets true positives to zero
    self.predicted_positive.assign(0) # resets predicted positives to zero
    self.actual_positive.assign(0) # resets actual positives to zero


class StatefullMultiClassFBeta(Metric):
    
    # we create (initialize) the state variables here.
    def __init__(self, name='state_full_multiclass_f1', beta=1, n_class=3, average='macro', epsilon=1e-7, **kwargs): # f1 ==> beta:1
        # initializing an object of the super class
        super(StatefullMultiClassFBeta, self).__init__(name=name, **kwargs)

        # initializing state variables
        self.tp = self.add_weight(name='tp', shape=(n_class,), initializer='zeros')     # initializing true positives
        self.actual_positives = self.add_weight(name='ap', shape=(n_class,), initializer='zeros') # initializing actual positives
        self.predicted_positives = self.add_weight(name='pp', shape=(n_class,), initializer='zeros') # initializing predicted positives

        # initializing other atrributes that wouldn't be changed for every object of this class
        self.beta_squared = beta**2
        self.n_class = n_class
        self.average = average
        self.epsilon = epsilon
    
    # this method is called at the end of each batch and is used to change (update) the state variables.
    def update_state(self, ytrue, ypred, sample_weight=None):
        # casting ytrue and ypred as float dtype
        ytrue = tf.cast(ytrue, tf.float32)
        ypred = tf.cast(ypred, tf.float32)

        # finding the maximum probability in ypred
        max_prob = tf.reduce_max(ypred, axis=-1, keepdims=True)

        # making ypred one hot encoded such that the class with the maximum probability as encoded as 1 while others as 0
        ypred = tf.cast(tf.equal(ypred, max_prob), tf.float32)
        
        self.tp.assign_add(tf.reduce_sum(ytrue*ypred, axis=0)) # updating true positives atrribute
        self.predicted_positives.assign_add(tf.reduce_sum(ypred, axis=0)) # updating predicted positives atrribute
        self.actual_positives.assign_add(tf.reduce_sum(ytrue, axis=0)) # updating actual positives atrribute
    
    # this is called at the end of each batch after states variables are updated. It is used to compute and return the metric for each batch.
    def result(self):
        self.precision = self.tp/(self.predicted_positives+self.epsilon) # calculates precision
        self.recall = self.tp/(self.actual_positives+self.epsilon) # calculates recall

        # calculating fbeta score
        self.fb = (1+self.beta_squared)*self.precision*self.recall / (self.beta_squared*self.precision + self.recall + self.epsilon)

        if self.average == 'weighted':
            return tf.reduce_sum(self.fb*self.actual_positives / tf.reduce_sum(self.actual_positives))
    
        elif self.average == 'raw':
            return self.fb
        
        return tf.reduce_mean(self.fb)
        
    # this is called at the end of each epoch. It is used to clear (reinitialize) the state variables.
    def reset_states(self):
        self.tp.assign(tf.zeros(self.n_class)) # resets true positives to zero
        self.predicted_positives.assign(tf.zeros(self.n_class)) # resets predicted positives to zero
        self.actual_positives.assign(tf.zeros(self.n_class)) # resets actual positives to zero

## 모델 세팅

In [9]:
def create_model(data, cat_features, target, num_features):    
    inputs = []
    outputs = []

    name_scope = [f'feature_{i}' for i in range(len(cat_features))]

    for i,c in enumerate(cat_features):
        num_unique_values = int(data[c].nunique())
        
        # embed 차원 결정
        if c in ['신고인부호','반입보세구역부호','HS10단위부호']:
            embed_dim = 64
        elif c in ['수입자부호','해외거래처부호']:
            embed_dim = 128
        else:
            embed_dim = int(min(np.ceil((num_unique_values)/2), 50))

        inp = layers.Input(shape=(1,))
        out = layers.Embedding(num_unique_values + 1, embed_dim, name=name_scope[i])(inp)
        out = layers.SpatialDropout1D(0.3)(out)
        out = layers.Reshape(target_shape=(embed_dim, ))(out)
        inputs.append(inp)
        outputs.append(out)

    num_inp = layers.Input(shape=(len(num_features),))
    inputs.append(num_inp)
    outputs.append(num_inp)
    
    x = layers.Concatenate()(outputs)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(512, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    
    x = layers.Dense(256, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    x = layers.BatchNormalization()(x)
    
    if target=='핵심적발':
        y = layers.Dense(3,
                         activation='softmax',
                         name="Core_Crime_or_not")(x)
    
    if target=='우범여부':
        y = layers.Dense(2,
                         activation='sigmoid',
                         name="Crime_or_not")(x)

    model = Model(inputs=inputs, outputs=y)
    return model

## core 모델

In [10]:
oof_mul = np.zeros((train.shape[0],3))
target= '핵심적발'
train_y=train['핵심적발']
train_x=train.drop(["우범여부","핵심적발"], axis=1)

models_core = []

N_FOLDS = 5
SEED = seed
EPOCH = 100

skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train[target])):
    print(f"\n ====== TRAINING FOLD {fold} =======\n")

    K.clear_session()

    X_train_1, X_valid_1, y_train_1, y_valid_1 = train_x.iloc[train_idx], train_x.iloc[valid_idx], train_y.iloc[train_idx], train_y.iloc[valid_idx]


    X_train = [X_train_1.loc[:, cat_features].values[:, k] for k in range(X_train_1.loc[:, cat_features].values.shape[1])]+[X_train_1.loc[:,num_features].values]
    X_valid = [X_valid_1.loc[:, cat_features].values[:, k] for k in range(X_valid_1.loc[:, cat_features].values.shape[1])]+[X_valid_1.loc[:,num_features].values]

    y_train = utils.to_categorical(y_train_1)
    y_valid = utils.to_categorical(y_valid_1)

    #================= Embedding MODEL training =================
    
    print("\n-----Embedding model Training-----\n")

    model = create_model(data, cat_features, target, num_features)

    # Metrics Weights 도 있나?
    model.compile(loss='categorical_crossentropy',
                       metrics={'Core_Crime_or_not':StatefullMultiClassFBeta()},
                       optimizer = tf.keras.optimizers.Adam()
                       )
    
    model.fit(X_train,y_train,
               batch_size = 256, 
               epochs = EPOCH,
               validation_data=(X_valid, y_valid),
               callbacks=[ES_mul, LRPlateau_mul,CP_mul],
              class_weight={0:1.0, 1:3.0, 2: 3.5},
               verbose = False)
    #============== Embedding Model prediction ============== 

    pred_mul = model.predict(X_valid) 
    oof_mul[valid_idx] = pred_mul 
    
    multiclass_score = f1_score(y_true=np.argmax(y_valid, axis=1), y_pred=np.argmax(pred_mul, axis=1), average='macro')

    print(f"핵심적발 score : {multiclass_score}")
    models_core.append(model)

    
total_score = f1_score(y_true=train_y, y_pred=np.argmax(oof_mul, axis=1), average='macro')

print(f"\n=== FINAL 핵심적발 SCORE CONVOLUTION MODEL : {total_score}===\n") 




-----Embedding model Training-----

Instructions for updating:
The `validate_indices` argument has no effect. Indices are always validated on CPU and never validated on GPU.
핵심적발 score : 0.4326497236923195



-----Embedding model Training-----

핵심적발 score : 0.4461808140919848



-----Embedding model Training-----

핵심적발 score : 0.4429310921625931



-----Embedding model Training-----

핵심적발 score : 0.4439329232058166



-----Embedding model Training-----

핵심적발 score : 0.4493461652902206

=== FINAL 핵심적발 SCORE CONVOLUTION MODEL : 0.4440543760508921===



## crime 모델

In [11]:
oof_bin = np.zeros((train.shape[0],2))
target= '우범여부'
train_y=train['우범여부']
train_x=train.drop(["우범여부","핵심적발"], axis=1)

models_crime = []

N_FOLDS = 5
SEED = seed
EPOCH = 100


skf = StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

for fold, (train_idx, valid_idx) in enumerate(skf.split(train, train[target])):
    print(f"\n ====== TRAINING FOLD {fold} =======\n")

    K.clear_session()

    X_train_1, X_valid_1, y_train_1, y_valid_1 = train_x.iloc[train_idx], train_x.iloc[valid_idx], train_y.iloc[train_idx], train_y.iloc[valid_idx]


    X_train = [X_train_1.loc[:, cat_features].values[:, k] for k in range(X_train_1.loc[:, cat_features].values.shape[1])]+[X_train_1.loc[:,num_features].values]
    X_valid = [X_valid_1.loc[:, cat_features].values[:, k] for k in range(X_valid_1.loc[:, cat_features].values.shape[1])]+[X_valid_1.loc[:,num_features].values]

    #X_train = [X_train_1.loc[:, cat_features].values[:, k] for k in range(X_train_1.loc[:, cat_features].values.shape[1])]+[X_train_1.loc[:,num_features]]
    #X_valid = [X_valid_1.loc[:, cat_features].values[:, k] for k in range(X_valid_1.loc[:, cat_features].values.shape[1])]+[X_valid_1.loc[:,num_features]]

    y_train = utils.to_categorical(y_train_1)
    y_valid = utils.to_categorical(y_valid_1)

    #================= Embedding MODEL training =========
    
    print("\n-----Embedding model Training----\n")

    model = create_model(data, cat_features, target, num_features)

    # Metrics Weights 도 있나?
    model.compile(loss='binary_crossentropy',
                       metrics={'Crime_or_not':StatefullBinaryFBeta()},
                       optimizer = tf.keras.optimizers.Adam()
                       )
    
    model.fit(X_train,y_train,
               batch_size = 256, 
               epochs = EPOCH,
               validation_data=(X_valid, y_valid),
               callbacks=[ES_bin, LRPlateau_bin,CP_bin],
              class_weight={0:1.0, 1:3.0},
               verbose = False)
    #============== Embedding Model prediction ==========
 
    pred_bin = model.predict(X_valid) 
    oof_bin[valid_idx] = pred_bin 
    
    binary_score = f1_score(y_true=np.argmax(y_valid, axis=1), y_pred=np.argmax(pred_bin, axis=1), average='binary')

    print(f"우범여부 score : {binary_score}")
    models_crime.append(model)
    
total_score = f1_score(y_true=train_y, y_pred=np.argmax(oof_bin, axis=1), average='binary')

print(f"\n=== FINAL 우범여부 SCORE CONVOLUTION MODEL : {total_score}===\n") 




-----Embedding model Training----

우범여부 score : 0.5083483754512637



-----Embedding model Training----

우범여부 score : 0.5309769268532155



-----Embedding model Training----

우범여부 score : 0.5201755324842128



-----Embedding model Training----

우범여부 score : 0.4750200374031526



-----Embedding model Training----

우범여부 score : 0.532127659574468

=== FINAL 우범여부 SCORE CONVOLUTION MODEL : 0.5153054463536375===



# Embedding 추출

In [40]:
tr_cat = [train.loc[:, cat_features].values[:, k] for k in range(train.loc[:, cat_features].values.shape[1])]
tr_cat=tf.convert_to_tensor(tr_cat, dtype=tf.float32)

ts_cat = [test.loc[:, cat_features].values[:, k] for k in range(test.loc[:, cat_features].values.shape[1])]
ts_cat=tf.convert_to_tensor(ts_cat, dtype=tf.float32)

잠깐 다시

In [14]:
# *--------------- core ---------------*
# train
cat_core_train = pd.DataFrame(index=train.index)
cctr = pd.DataFrame(index=train.index)

for idx, col in enumerate(cat_features):
    for fold in range(5):
        emb_fea = models_core[fold].layers[19:38][idx](tr_cat[idx])
        emb_np_fea = emb_fea.numpy()
        for i in range(emb_np_fea.shape[1]):
            if col+f'_{i}' not in cctr.columns:
                cctr[col+f'_{i}'] = np.zeros((cctr.shape[0],1))
            cctr[col+f'_{i}'] += emb_np_fea[:,i]
    cat_core_train[col+f'_{i}']= cctr[col+f'_{i}'].div(5)
        
    

# test
cat_core_test = pd.DataFrame(index=test.index)
ccts = pd.DataFrame(index=test.index)

for idx, col in enumerate(cat_features):
    for fold in range(5):
        emb_fea = models_core[fold].layers[19:38][idx](ts_cat[idx])
        emb_np_fea = emb_fea.numpy()
        for i in range(emb_np_fea.shape[1]):
            if col+f'_{i}' not in ccts.columns:
                ccts[col+f'_{i}'] = np.zeros((ccts.shape[0],1))
            ccts[col+f'_{i}'] += emb_np_fea[:,i]
    cat_core_test[col+f'_{i}']= ccts[col+f'_{i}'].div(5)

display(cat_core_train.head(3))
display(cat_core_test.head(3))

Unnamed: 0_level_0,통관지세관부호_19,신고인부호_63,수입자부호_127,해외거래처부호_127,특송업체부호_44,수입통관계획코드_3,수입신고구분코드_1,수입거래구분코드_12,수입종류코드_4,징수형태코드_5,운송수단유형코드_2,반입보세구역부호_63,HS10단위부호_63,적출국가코드_42,원산지국가코드_49,관세율구분코드_17,관세율_43,day_15,weekend_0
신고번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
37453,0.024279,0.009357,-0.019772,0.015992,-0.009543,-0.005616,0.029179,0.006506,0.009568,0.005482,-0.003049,0.012566,-0.009749,0.026963,-0.001596,0.037951,0.0067,0.018018,0.007028
150339,0.002185,0.006163,0.001367,-0.002863,-0.009543,-0.005616,0.006096,0.006506,0.009568,0.005482,0.00633,0.016711,-0.009703,0.026963,-0.001596,0.037951,0.0067,0.018018,0.007028
55710,0.012201,-0.007529,0.00582,-0.011776,0.014703,0.008404,0.006096,0.012988,0.009568,0.005482,-0.003049,-0.001441,-0.009871,0.026963,-0.001596,-0.003311,-0.027226,0.018018,0.007028


Unnamed: 0_level_0,통관지세관부호_19,신고인부호_63,수입자부호_127,해외거래처부호_127,특송업체부호_44,수입통관계획코드_3,수입신고구분코드_1,수입거래구분코드_12,수입종류코드_4,징수형태코드_5,운송수단유형코드_2,반입보세구역부호_63,HS10단위부호_63,적출국가코드_42,원산지국가코드_49,관세율구분코드_17,관세율_43,day_15,weekend_0
신고번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
982834,0.002185,-0.010284,0.015123,-0.000158,-0.009543,0.008404,0.006096,0.006506,0.009568,0.005482,-0.003049,0.016608,0.009853,-0.003628,0.000969,0.037951,0.01998,-0.015667,0.007028
828961,0.024279,-0.006692,0.010248,-0.002863,-0.009543,-0.001819,0.006096,0.012988,0.009568,0.005482,-0.003049,0.029165,0.012146,-0.003628,0.000969,0.037951,-0.001895,-0.015667,0.007028
522066,0.024279,-0.006432,-0.007961,-0.002863,-0.009543,-0.005616,0.006096,0.002137,0.009568,0.005482,-0.003049,0.016711,-0.000477,-0.003628,0.000969,0.037951,0.0067,-0.015667,0.007028


In [15]:
# *--------------- crime ---------------*
# train
cat_crime_train = pd.DataFrame(index=train.index)
cctr = pd.DataFrame(index=train.index)

for idx, col in enumerate(cat_features):
    for fold in range(5):
        emb_fea = models_crime[fold].layers[19:38][idx](tr_cat[idx])
        emb_np_fea = emb_fea.numpy()
        for i in range(emb_np_fea.shape[1]):
            if col+f'_{i}' not in cctr.columns:
                cctr[col+f'_{i}'] = np.zeros((cctr.shape[0],1))
            cctr[col+f'_{i}'] += emb_np_fea[:,i]
    cat_crime_train[col+f'_{i}']= cctr[col+f'_{i}'].div(5)

# test
cat_crime_test = pd.DataFrame(index=test.index)
ccts = pd.DataFrame(index=test.index)

for idx, col in enumerate(cat_features):
    for fold in range(5):
        emb_fea = models_crime[fold].layers[19:38][idx](ts_cat[idx])
        emb_np_fea = emb_fea.numpy()
        for i in range(emb_np_fea.shape[1]):
            if col+f'_{i}' not in ccts.columns:
                ccts[col+f'_{i}'] = np.zeros((ccts.shape[0],1))            
            ccts[col+f'_{i}'] += emb_np_fea[:,i]
    cat_crime_test[col+f'_{i}']= ccts[col+f'_{i}'].div(5)


display(cat_crime_train.head(3))
display(cat_crime_test.head(3))

Unnamed: 0_level_0,통관지세관부호_19,신고인부호_63,수입자부호_127,해외거래처부호_127,특송업체부호_44,수입통관계획코드_3,수입신고구분코드_1,수입거래구분코드_12,수입종류코드_4,징수형태코드_5,운송수단유형코드_2,반입보세구역부호_63,HS10단위부호_63,적출국가코드_42,원산지국가코드_49,관세율구분코드_17,관세율_43,day_15,weekend_0
신고번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
37453,0.039771,-0.016093,0.002802,0.022405,0.041235,0.002652,-0.019349,0.019648,0.00259,-0.030095,0.007766,-0.005343,-0.000367,-0.029909,-0.005648,-0.014754,0.004784,0.019643,-0.013589
150339,0.01581,0.023979,-0.039386,-0.004036,0.041235,0.002652,0.000481,0.019648,0.00259,-0.030095,0.006265,-0.028444,0.004161,-0.029909,-0.005648,-0.014754,0.004784,0.019643,-0.013589
55710,0.011627,-0.000616,0.005872,-0.008071,-0.0263,-0.025567,0.000481,-0.024059,0.00259,-0.030095,0.007766,0.0099,0.005374,-0.029909,-0.005648,-0.000714,-0.001931,0.019643,-0.013589


Unnamed: 0_level_0,통관지세관부호_19,신고인부호_63,수입자부호_127,해외거래처부호_127,특송업체부호_44,수입통관계획코드_3,수입신고구분코드_1,수입거래구분코드_12,수입종류코드_4,징수형태코드_5,운송수단유형코드_2,반입보세구역부호_63,HS10단위부호_63,적출국가코드_42,원산지국가코드_49,관세율구분코드_17,관세율_43,day_15,weekend_0
신고번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
982834,0.01581,-0.026803,-0.001076,0.006481,0.041235,-0.025567,0.000481,0.019648,0.00259,-0.030095,0.007766,0.000797,0.004184,0.021299,-0.005293,-0.014754,0.001252,0.011279,-0.013589
828961,0.039771,-0.004878,0.027642,-0.004036,0.041235,0.001713,0.000481,-0.024059,0.00259,-0.030095,0.007766,-0.004896,-0.004956,0.021299,-0.005293,-0.014754,-0.000536,0.011279,-0.013589
522066,0.039771,-0.00815,-0.013161,-0.004036,0.041235,0.002652,0.000481,0.024596,0.00259,-0.030095,0.007766,-0.028444,-0.001882,0.021299,-0.005293,-0.014754,0.004784,0.011279,-0.013589


여기 다시 시작

In [41]:
# *--------------- core ---------------*
# train
cat_core_train = pd.DataFrame(index=train.index)

for idx, col in enumerate(cat_features):
    emb_fea = models_core[4].layers[19:38][idx](tr_cat[idx])
    emb_np_fea = emb_fea.numpy()
    for i in range(emb_np_fea.shape[1]):
        cat_core_train[col+f'_{i}']= emb_np_fea[:,i]
    

# test
cat_core_test = pd.DataFrame(index=test.index)

for idx, col in enumerate(cat_features):
    emb_fea = models_core[4].layers[19:38][idx](ts_cat[idx])
    emb_np_fea = emb_fea.numpy()
    for i in range(emb_np_fea.shape[1]):
        cat_core_test[col+f'_{i}']= emb_np_fea[:,i]

display(cat_core_train.head(3))
display(cat_core_test.head(3))

Unnamed: 0_level_0,통관지세관부호_0,통관지세관부호_1,통관지세관부호_2,통관지세관부호_3,통관지세관부호_4,통관지세관부호_5,통관지세관부호_6,통관지세관부호_7,통관지세관부호_8,통관지세관부호_9,통관지세관부호_10,통관지세관부호_11,통관지세관부호_12,통관지세관부호_13,통관지세관부호_14,통관지세관부호_15,통관지세관부호_16,통관지세관부호_17,통관지세관부호_18,통관지세관부호_19,신고인부호_0,신고인부호_1,신고인부호_2,신고인부호_3,신고인부호_4,신고인부호_5,신고인부호_6,신고인부호_7,신고인부호_8,신고인부호_9,신고인부호_10,신고인부호_11,신고인부호_12,신고인부호_13,신고인부호_14,신고인부호_15,신고인부호_16,신고인부호_17,신고인부호_18,신고인부호_19,...,관세율_21,관세율_22,관세율_23,관세율_24,관세율_25,관세율_26,관세율_27,관세율_28,관세율_29,관세율_30,관세율_31,관세율_32,관세율_33,관세율_34,관세율_35,관세율_36,관세율_37,관세율_38,관세율_39,관세율_40,관세율_41,관세율_42,관세율_43,day_0,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,weekend_0
신고번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
37453,0.064275,-0.029446,0.011354,0.018429,0.044977,-0.035961,-0.048029,0.012356,-0.046105,-0.001873,0.019479,-0.014267,-0.02036,-0.022555,-0.054937,0.026455,0.00923,0.015258,-0.008245,0.013653,-0.054015,-0.037218,-0.021553,0.045821,0.013931,-0.043049,0.066927,-0.007234,0.006433,-0.010754,0.014997,-0.028691,0.02163,-0.022095,-0.007157,-0.013257,0.03512,-0.006817,-0.000522,-0.027899,...,-0.002301,-0.010459,0.012276,0.019648,-0.014109,-0.031346,0.025217,0.015196,-0.027359,-0.021836,0.035133,0.023912,0.03207,-0.002111,-0.02626,0.047841,0.024706,0.03247,-0.019446,0.046758,-0.005221,0.040196,0.035182,-0.037542,0.04264,-0.023423,0.039137,-0.035639,0.033582,0.056397,0.04733,0.023566,-0.005371,-0.007364,-0.001807,0.003777,0.053291,0.012038,0.049544,-0.009774
150339,-0.000811,0.006605,0.022379,-0.012366,0.037841,0.052876,0.033043,-0.019723,0.016984,-0.02278,0.029995,-0.010349,0.040193,-0.010478,-0.02654,0.030811,-0.004635,0.036932,-0.019962,0.000798,-0.027165,-0.012734,0.019167,-0.045841,0.013997,0.037788,0.055325,0.030724,0.018678,0.042377,0.026212,-0.008906,-0.040204,-0.031311,-0.00638,0.007945,-0.018674,0.03333,-0.046059,0.005469,...,-0.002301,-0.010459,0.012276,0.019648,-0.014109,-0.031346,0.025217,0.015196,-0.027359,-0.021836,0.035133,0.023912,0.03207,-0.002111,-0.02626,0.047841,0.024706,0.03247,-0.019446,0.046758,-0.005221,0.040196,0.035182,-0.037542,0.04264,-0.023423,0.039137,-0.035639,0.033582,0.056397,0.04733,0.023566,-0.005371,-0.007364,-0.001807,0.003777,0.053291,0.012038,0.049544,-0.009774
55710,-0.034512,-0.055852,0.000505,0.030783,-0.015779,-0.005553,-0.050149,0.015983,-0.04709,-0.027854,0.043107,-0.022691,-0.011528,-0.050992,-0.044013,0.03247,-0.047548,0.059217,-0.062715,0.029923,-0.025963,0.017941,0.053298,-0.022314,-0.009944,-0.031259,-0.033316,-0.022394,-0.047757,-0.018046,0.083585,-0.005007,0.035942,0.04868,-0.031267,-0.029512,0.015407,0.037808,-0.034249,-0.024082,...,0.042296,0.009842,0.034333,-0.046807,0.040609,-0.024184,0.004573,-0.04975,0.012609,-0.035466,-0.036719,-0.008284,-0.017431,-0.060992,0.044244,0.003827,-0.053117,0.014619,-0.052018,-0.046874,0.057911,0.023953,-0.023398,-0.037542,0.04264,-0.023423,0.039137,-0.035639,0.033582,0.056397,0.04733,0.023566,-0.005371,-0.007364,-0.001807,0.003777,0.053291,0.012038,0.049544,-0.009774


Unnamed: 0_level_0,통관지세관부호_0,통관지세관부호_1,통관지세관부호_2,통관지세관부호_3,통관지세관부호_4,통관지세관부호_5,통관지세관부호_6,통관지세관부호_7,통관지세관부호_8,통관지세관부호_9,통관지세관부호_10,통관지세관부호_11,통관지세관부호_12,통관지세관부호_13,통관지세관부호_14,통관지세관부호_15,통관지세관부호_16,통관지세관부호_17,통관지세관부호_18,통관지세관부호_19,신고인부호_0,신고인부호_1,신고인부호_2,신고인부호_3,신고인부호_4,신고인부호_5,신고인부호_6,신고인부호_7,신고인부호_8,신고인부호_9,신고인부호_10,신고인부호_11,신고인부호_12,신고인부호_13,신고인부호_14,신고인부호_15,신고인부호_16,신고인부호_17,신고인부호_18,신고인부호_19,...,관세율_21,관세율_22,관세율_23,관세율_24,관세율_25,관세율_26,관세율_27,관세율_28,관세율_29,관세율_30,관세율_31,관세율_32,관세율_33,관세율_34,관세율_35,관세율_36,관세율_37,관세율_38,관세율_39,관세율_40,관세율_41,관세율_42,관세율_43,day_0,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,weekend_0
신고번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
982834,-0.000811,0.006605,0.022379,-0.012366,0.037841,0.052876,0.033043,-0.019723,0.016984,-0.02278,0.029995,-0.010349,0.040193,-0.010478,-0.02654,0.030811,-0.004635,0.036932,-0.019962,0.000798,-0.060399,0.036754,-0.03187,0.011467,0.013529,0.004548,0.027652,0.016514,0.042055,-0.034709,-0.053672,0.00186,-0.007071,0.023532,0.002487,0.039827,0.0151,0.029454,-0.00015,-0.035745,...,-0.049165,-0.029215,-0.008308,0.010405,0.051298,0.050056,-0.059542,-0.079117,-0.038421,-0.020425,0.066151,-0.030459,-0.026817,-0.045282,0.019489,0.034607,0.001995,0.054781,0.029057,-0.000236,0.014841,0.006123,0.004365,-0.006914,0.061576,0.060793,0.051665,-0.049655,0.029811,-0.015067,0.053259,-0.056393,-0.049588,-0.028061,-0.021486,0.012628,0.002097,-0.006081,-0.064833,-0.009774
828961,0.064275,-0.029446,0.011354,0.018429,0.044977,-0.035961,-0.048029,0.012356,-0.046105,-0.001873,0.019479,-0.014267,-0.02036,-0.022555,-0.054937,0.026455,0.00923,0.015258,-0.008245,0.013653,0.038897,-0.060467,-0.011078,0.00807,0.007363,-0.035199,0.020286,-0.021779,-0.023382,-0.039932,0.034427,0.021153,0.071502,0.023672,0.032827,0.014711,-0.018583,0.019072,-0.004797,0.038863,...,0.022852,0.019306,-0.028398,-0.032796,-0.019097,0.007882,-0.010705,0.001392,0.016739,-0.016057,0.009552,0.019463,-0.022741,0.010477,-0.010038,0.023987,0.010821,0.020258,-0.03015,-0.022367,-0.00083,-0.024735,-0.000639,-0.006914,0.061576,0.060793,0.051665,-0.049655,0.029811,-0.015067,0.053259,-0.056393,-0.049588,-0.028061,-0.021486,0.012628,0.002097,-0.006081,-0.064833,-0.009774
522066,0.064275,-0.029446,0.011354,0.018429,0.044977,-0.035961,-0.048029,0.012356,-0.046105,-0.001873,0.019479,-0.014267,-0.02036,-0.022555,-0.054937,0.026455,0.00923,0.015258,-0.008245,0.013653,0.013893,-0.005255,-0.000263,0.029171,-0.00824,-0.001973,0.059886,-0.036244,0.041912,-0.036121,-0.016641,-0.01692,-0.012139,0.042635,0.024303,-0.016911,0.021975,0.006657,-0.010404,-0.052432,...,-0.002301,-0.010459,0.012276,0.019648,-0.014109,-0.031346,0.025217,0.015196,-0.027359,-0.021836,0.035133,0.023912,0.03207,-0.002111,-0.02626,0.047841,0.024706,0.03247,-0.019446,0.046758,-0.005221,0.040196,0.035182,-0.006914,0.061576,0.060793,0.051665,-0.049655,0.029811,-0.015067,0.053259,-0.056393,-0.049588,-0.028061,-0.021486,0.012628,0.002097,-0.006081,-0.064833,-0.009774


In [43]:
# *--------------- crime ---------------*
# train
cat_crime_train = pd.DataFrame(index=train.index)

for idx, col in enumerate(cat_features):
    emb_fea = models_crime[4].layers[19:38][idx](tr_cat[idx])
    emb_np_fea = emb_fea.numpy()
    for i in range(emb_np_fea.shape[1]):
        cat_crime_train[col+f'_{i}']= emb_np_fea[:,i]

# test
cat_crime_test = pd.DataFrame(index=test.index)

for idx, col in enumerate(cat_features):
    emb_fea = models_crime[4].layers[19:38][idx](ts_cat[idx])
    emb_np_fea = emb_fea.numpy()
    for i in range(emb_np_fea.shape[1]):
        cat_crime_test[col+f'_{i}']= emb_np_fea[:,i]


display(cat_crime_train.head(3))
display(cat_crime_test.head(3))

Unnamed: 0_level_0,통관지세관부호_0,통관지세관부호_1,통관지세관부호_2,통관지세관부호_3,통관지세관부호_4,통관지세관부호_5,통관지세관부호_6,통관지세관부호_7,통관지세관부호_8,통관지세관부호_9,통관지세관부호_10,통관지세관부호_11,통관지세관부호_12,통관지세관부호_13,통관지세관부호_14,통관지세관부호_15,통관지세관부호_16,통관지세관부호_17,통관지세관부호_18,통관지세관부호_19,신고인부호_0,신고인부호_1,신고인부호_2,신고인부호_3,신고인부호_4,신고인부호_5,신고인부호_6,신고인부호_7,신고인부호_8,신고인부호_9,신고인부호_10,신고인부호_11,신고인부호_12,신고인부호_13,신고인부호_14,신고인부호_15,신고인부호_16,신고인부호_17,신고인부호_18,신고인부호_19,...,관세율_21,관세율_22,관세율_23,관세율_24,관세율_25,관세율_26,관세율_27,관세율_28,관세율_29,관세율_30,관세율_31,관세율_32,관세율_33,관세율_34,관세율_35,관세율_36,관세율_37,관세율_38,관세율_39,관세율_40,관세율_41,관세율_42,관세율_43,day_0,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,weekend_0
신고번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
37453,0.018121,0.044032,0.037644,0.024416,-0.011122,0.042222,-0.066015,0.041293,-0.031958,-0.039154,-0.02425,-0.045447,-0.021189,-0.009322,-0.042616,0.014053,-0.043027,0.005336,0.031526,0.05528,0.024858,-0.014047,0.035779,-0.018057,-0.019106,0.027097,0.038422,0.029888,-0.075304,-0.011693,0.053882,-0.041976,-0.029658,0.015022,0.047647,0.026695,0.013649,-0.016586,0.022635,-0.034284,...,-0.007769,-0.006705,0.014137,0.003178,-0.048988,0.010556,-0.047466,0.027861,0.02889,0.002035,0.02917,-0.016657,-0.020387,0.007694,0.004169,-0.008247,-0.006931,0.001764,0.04231,-0.005421,-0.002822,0.018943,-0.011157,-0.026133,-0.002669,-0.03529,0.030449,-0.022885,-0.007078,0.031616,-0.040033,0.019869,-0.012442,-0.002093,-0.018178,-0.029972,0.038159,-0.0416,0.04022,0.023327
150339,-0.04211,0.013625,0.013675,-0.0308,0.046402,0.03282,-0.009104,-0.00361,0.056347,-0.049839,-0.010478,0.042602,-0.04169,0.032653,-0.012903,-0.007768,-0.004157,0.06408,-0.055446,0.025814,0.000832,-0.034185,0.027859,0.017278,-0.001149,-0.030662,-0.015281,-0.037803,-0.003758,0.022341,0.038941,0.043093,0.055558,-0.020783,-0.006578,-0.040546,0.016633,0.0161,0.03375,-0.014636,...,-0.007769,-0.006705,0.014137,0.003178,-0.048988,0.010556,-0.047466,0.027861,0.02889,0.002035,0.02917,-0.016657,-0.020387,0.007694,0.004169,-0.008247,-0.006931,0.001764,0.04231,-0.005421,-0.002822,0.018943,-0.011157,-0.026133,-0.002669,-0.03529,0.030449,-0.022885,-0.007078,0.031616,-0.040033,0.019869,-0.012442,-0.002093,-0.018178,-0.029972,0.038159,-0.0416,0.04022,0.023327
55710,0.008354,0.042612,-0.075511,0.062349,-0.003222,0.04517,-0.011197,-0.019656,0.046828,-0.035607,-0.102185,-0.017689,-0.067398,-0.046916,-0.047099,0.058365,-0.085595,-0.015552,0.04702,0.019375,-0.043488,0.042741,0.031486,-0.042923,-0.011304,0.031941,0.006728,0.052161,-0.018685,-0.026011,0.034446,-0.03361,0.030065,-0.049935,-0.053718,0.031772,-0.068455,0.029539,-0.055849,0.017993,...,0.007442,0.025986,0.004155,0.027204,-0.021961,-0.003833,-0.024267,-0.030486,-0.055402,0.045315,-0.04583,-0.051701,0.000122,-0.001364,0.029655,-0.041862,0.014546,0.004839,-0.038878,-0.007707,0.027599,-0.006663,-0.005837,-0.026133,-0.002669,-0.03529,0.030449,-0.022885,-0.007078,0.031616,-0.040033,0.019869,-0.012442,-0.002093,-0.018178,-0.029972,0.038159,-0.0416,0.04022,0.023327


Unnamed: 0_level_0,통관지세관부호_0,통관지세관부호_1,통관지세관부호_2,통관지세관부호_3,통관지세관부호_4,통관지세관부호_5,통관지세관부호_6,통관지세관부호_7,통관지세관부호_8,통관지세관부호_9,통관지세관부호_10,통관지세관부호_11,통관지세관부호_12,통관지세관부호_13,통관지세관부호_14,통관지세관부호_15,통관지세관부호_16,통관지세관부호_17,통관지세관부호_18,통관지세관부호_19,신고인부호_0,신고인부호_1,신고인부호_2,신고인부호_3,신고인부호_4,신고인부호_5,신고인부호_6,신고인부호_7,신고인부호_8,신고인부호_9,신고인부호_10,신고인부호_11,신고인부호_12,신고인부호_13,신고인부호_14,신고인부호_15,신고인부호_16,신고인부호_17,신고인부호_18,신고인부호_19,...,관세율_21,관세율_22,관세율_23,관세율_24,관세율_25,관세율_26,관세율_27,관세율_28,관세율_29,관세율_30,관세율_31,관세율_32,관세율_33,관세율_34,관세율_35,관세율_36,관세율_37,관세율_38,관세율_39,관세율_40,관세율_41,관세율_42,관세율_43,day_0,day_1,day_2,day_3,day_4,day_5,day_6,day_7,day_8,day_9,day_10,day_11,day_12,day_13,day_14,day_15,weekend_0
신고번호,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
982834,-0.04211,0.013625,0.013675,-0.0308,0.046402,0.03282,-0.009104,-0.00361,0.056347,-0.049839,-0.010478,0.042602,-0.04169,0.032653,-0.012903,-0.007768,-0.004157,0.06408,-0.055446,0.025814,0.009543,-0.078394,-0.059451,0.016502,0.018955,0.012109,0.046664,0.041366,-0.011186,0.066168,0.031288,0.014067,-0.002964,0.01138,0.048481,0.022176,-0.039073,0.028543,-0.05213,0.028468,...,-0.029521,0.049924,0.04149,0.047475,-0.032311,0.081233,-0.026337,-0.020287,-0.060234,0.049493,0.044189,-0.043656,-0.041216,-0.000679,-0.017433,0.012311,-0.041897,0.055727,-0.027564,-0.029521,-0.019588,-0.051609,0.034571,0.00124,-0.019877,0.003208,-0.069878,0.025644,0.03857,-0.031915,0.005517,0.00306,-0.015315,-0.010015,0.052841,0.002427,-0.03027,-0.019413,-0.030976,0.023327
828961,0.018121,0.044032,0.037644,0.024416,-0.011122,0.042222,-0.066015,0.041293,-0.031958,-0.039154,-0.02425,-0.045447,-0.021189,-0.009322,-0.042616,0.014053,-0.043027,0.005336,0.031526,0.05528,-0.037673,-0.055785,0.039998,0.041843,-0.011257,-0.034963,0.003428,-0.001797,-0.03486,-0.056145,0.025576,-0.016958,0.032329,-0.012644,-0.028697,-0.000319,0.020404,-0.047959,-0.015849,0.026098,...,0.023294,-0.037613,0.008294,0.020557,0.054755,0.009647,-0.016978,-0.030463,-0.038906,-0.038005,0.001871,0.019633,-0.005108,0.012515,-0.02013,0.045909,-0.001728,-0.017197,-0.002981,-0.004234,-0.001474,0.027593,0.037962,0.00124,-0.019877,0.003208,-0.069878,0.025644,0.03857,-0.031915,0.005517,0.00306,-0.015315,-0.010015,0.052841,0.002427,-0.03027,-0.019413,-0.030976,0.023327
522066,0.018121,0.044032,0.037644,0.024416,-0.011122,0.042222,-0.066015,0.041293,-0.031958,-0.039154,-0.02425,-0.045447,-0.021189,-0.009322,-0.042616,0.014053,-0.043027,0.005336,0.031526,0.05528,-0.017653,0.008775,0.066512,-0.051595,0.049834,-0.037143,-0.030067,0.02757,-0.009675,0.015527,0.033402,-0.01174,-0.04325,-0.083775,-0.044043,0.027572,-0.041218,-0.054315,0.021966,0.014982,...,-0.007769,-0.006705,0.014137,0.003178,-0.048988,0.010556,-0.047466,0.027861,0.02889,0.002035,0.02917,-0.016657,-0.020387,0.007694,0.004169,-0.008247,-0.006931,0.001764,0.04231,-0.005421,-0.002822,0.018943,-0.011157,0.00124,-0.019877,0.003208,-0.069878,0.025644,0.03857,-0.031915,0.005517,0.00306,-0.015315,-0.010015,0.052841,0.002427,-0.03027,-0.019413,-0.030976,0.023327


In [44]:
cat_core_train.to_csv("/content/drive/MyDrive/관세청/embed_cat_core_train_2.csv", encoding='utf-8', index=True)
cat_core_test.to_csv("/content/drive/MyDrive/관세청/embed_cat_core_test_2.csv", encoding='utf-8', index=True)

cat_crime_train.to_csv("/content/drive/MyDrive/관세청/embed_cat_crime_train_2.csv", encoding='utf-8', index=True)
cat_crime_test.to_csv("/content/drive/MyDrive/관세청/embed_cat_crime_test_2.csv", encoding='utf-8', index=True)

# Catboost

In [17]:
train_y_crime = train['우범여부']
train_y_core = train['핵심적발']

train_y_crime.shape, train_y_core.shape

((89619,), (89619,))

In [35]:
cat_crime_train = pd.read_csv("/content/drive/MyDrive/관세청/embed_cat_crime_train.csv")
cat_crime_test = pd.read_csv("/content/drive/MyDrive/관세청/embed_cat_crime_test.csv")

cat_core_train = pd.read_csv("/content/drive/MyDrive/관세청/embed_cat_core_train.csv")
cat_core_test = pd.read_csv("/content/drive/MyDrive/관세청/embed_cat_core_test.csv")

In [45]:
cat_crime_train['신고중량(KG)']= train['신고중량(KG)']
cat_crime_test['신고중량(KG)']= test['신고중량(KG)']

cat_crime_train['과세가격원화금액']= train['과세가격원화금액']
cat_crime_test['과세가격원화금액']= test['과세가격원화금액']

cat_core_train['신고중량(KG)']= train['신고중량(KG)']
cat_core_test['신고중량(KG)']= test['신고중량(KG)']

cat_core_train['과세가격원화금액']= train['과세가격원화금액']
cat_core_test['과세가격원화금액']= test['과세가격원화금액']

In [46]:
cat_core_params = {
    'bootstrap_type': 'Poisson',
    'custom_metric':'F1',
    'random_seed': seed,
    'task_type': 'GPU',
    'learning_rate': 1e-1,
    'n_estimators': 2000,
    'auto_class_weights':"Balanced"
}

cat_crime_params = {
    'bootstrap_type': 'Poisson',
    'custom_metric':'F1',
    'random_seed': seed,
    'task_type': 'GPU',
    'learning_rate': 1e-1,
    'n_estimators': 2000,
    'auto_class_weights':"Balanced"    
}

core

In [20]:
n_fold = 5

cat_pred = np.zeros((cat_core_train.shape[0], 1))
pred_core_test = pd.DataFrame()
feat_core_importance = pd.DataFrame({'fea_name':cat_core_train.columns.to_list()})

train_x = cat_core_train
test_x= cat_core_test
target_y = train_y_core

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)




for fold, (train_idx, valid_idx) in enumerate(skfold.split(train_x, target_y)):
    print(f'\n----------------- Fold {fold} -----------------\n')
    X_train, X_valid, y_train, y_valid = train_x.iloc[train_idx], train_x.iloc[valid_idx], target_y.iloc[train_idx], target_y.iloc[valid_idx]

    model_cat = CatBoostClassifier(**cat_core_params)

    model_cat.fit(X_train, y_train, 
        #cat_features=cat_feats, 
        eval_set=(X_valid, y_valid), 
        early_stopping_rounds = 50,

        verbose= 100
    )
  
    cat_pred[valid_idx] = model_cat.predict(X_valid)
    #pred_core_test[f'{fold}_pred']=model_cat.predict(cat_core_test).reshape(-1,)
    feat_core_importance[f'importance_{fold}'] = model_cat.get_feature_importance()
    
    print('\nCV f1 Score:', f1_score(y_valid,  cat_pred[valid_idx], average='macro'))
    print('\nCV precision Score:', precision_score(y_valid, cat_pred[valid_idx], average='macro'))
    print('CV recall Score:', recall_score(y_valid, cat_pred[valid_idx], average='macro'))

print('\n\nf1 Score:', f1_score(target_y, cat_pred, average='macro'))
print('\nprecision Score:', precision_score(target_y, cat_pred, average='macro'))
print('recall Score:', recall_score(target_y, cat_pred, average='macro'))


----------------- Fold 0 -----------------

0:	learn: 1.0778348	test: 1.0776663	best: 1.0776663 (0)	total: 22.6ms	remaining: 45.2s
100:	learn: 0.9195796	test: 0.9558806	best: 0.9556670 (91)	total: 1.44s	remaining: 27s
200:	learn: 0.8873408	test: 0.9546067	best: 0.9540728 (180)	total: 2.67s	remaining: 23.9s
bestTest = 0.9540728187
bestIteration = 180
Shrink model to first 181 iterations.

CV f1 Score: 0.44918274787774876

CV precision Score: 0.4509730149189464
CV recall Score: 0.5036852593013844

----------------- Fold 1 -----------------

0:	learn: 1.0772399	test: 1.0784745	best: 1.0784745 (0)	total: 12.6ms	remaining: 25.1s
100:	learn: 0.9175765	test: 0.9578853	best: 0.9578853 (100)	total: 1.12s	remaining: 21s
bestTest = 0.957805917
bestIteration = 113
Shrink model to first 114 iterations.

CV f1 Score: 0.44620381730976755

CV precision Score: 0.4484841477694094
CV recall Score: 0.4997967771408665

----------------- Fold 2 -----------------

0:	learn: 1.0780366	test: 1.0783365	best: 1

crime

In [21]:
n_fold = 5

cat_pred = np.zeros((cat_crime_train.shape[0], 1))
pred_crime_test = pd.DataFrame()
feat_crime_importance = pd.DataFrame({'fea_name':cat_crime_train.columns.to_list()})

train_x = cat_crime_train
test_x= cat_crime_test
target_y = train_y_crime

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)




for fold, (train_idx, valid_idx) in enumerate(skfold.split(train_x, target_y)):
    print(f'\n----------------- Fold {fold} -----------------\n')
    X_train, X_valid, y_train, y_valid = train_x.iloc[train_idx], train_x.iloc[valid_idx], target_y.iloc[train_idx], target_y.iloc[valid_idx]

    model_cat = CatBoostClassifier(**cat_crime_params)

    model_cat.fit(X_train, y_train, 
        #cat_features=cat_feats, 
        eval_set=(X_valid, y_valid), 
        early_stopping_rounds = 50,

        verbose= 100
    )
  
    cat_pred[valid_idx] = model_cat.predict(X_valid).reshape(-1,1)
    #pred_crime_test[f'{fold}_pred']=model_cat.predict(cat_crime_test).reshape(-1,)
    
    feat_crime_importance[f'importance_{fold}'] = model_cat.get_feature_importance()

    print('\nCV f1 Score:', f1_score(y_valid,  cat_pred[valid_idx], average='binary'))
    print('\nCV precision Score:', precision_score(y_valid, cat_pred[valid_idx], average='binary'))
    print('CV recall Score:', recall_score(y_valid, cat_pred[valid_idx], average='binary'))

print('\n\nf1 Score:', f1_score(target_y, cat_pred, average='binary'))
print('\nprecision Score:', precision_score(target_y, cat_pred, average='binary'))
print('recall Score:', recall_score(target_y, cat_pred, average='binary'))



----------------- Fold 0 -----------------

0:	learn: 0.6668838	test: 0.6671397	best: 0.6671397 (0)	total: 9.84ms	remaining: 19.7s
100:	learn: 0.5201650	test: 0.5367427	best: 0.5367427 (100)	total: 896ms	remaining: 16.8s
200:	learn: 0.5046396	test: 0.5341767	best: 0.5341599 (193)	total: 1.79s	remaining: 16s
300:	learn: 0.4922046	test: 0.5333664	best: 0.5331743 (259)	total: 2.7s	remaining: 15.3s
bestTest = 0.5331743182
bestIteration = 259
Shrink model to first 260 iterations.

CV f1 Score: 0.5472493942385354

CV precision Score: 0.42423820787533045
CV recall Score: 0.7707280080889788

----------------- Fold 1 -----------------

0:	learn: 0.6668698	test: 0.6675375	best: 0.6675375 (0)	total: 9.03ms	remaining: 18.1s
100:	learn: 0.5196384	test: 0.5374085	best: 0.5374085 (100)	total: 871ms	remaining: 16.4s
200:	learn: 0.5045493	test: 0.5359109	best: 0.5357747 (169)	total: 1.76s	remaining: 15.8s
bestTest = 0.5357747291
bestIteration = 169
Shrink model to first 170 iterations.

CV f1 Score: 0

# 피처중요도

core

In [None]:
feat_core_importance.sort_values("importance_1", ascending=False)

In [22]:
#feat_crime_importance.loc[feat_crime_importance.apply(lambda row: np.count_nonzero(row[1:])), axis=1)]

feat_core_drop_list = feat_core_importance.loc[feat_core_importance.apply(lambda row: row[1:].sum()==0, axis=1)].fea_name.to_list()

if '과세가격원화금액' not in feat_core_drop_list:
    feat_core_drop_list.append('과세가격원화금액')
    
print('과세가격원화금액' in feat_core_drop_list)
print(feat_core_drop_list)
print(len(feat_core_drop_list))

True
['과세가격원화금액']
1


crime

In [25]:
feat_crime_importance.sort_values("importance_1", ascending=False)

Unnamed: 0,fea_name,importance_0,importance_1,importance_2,importance_3,importance_4
3,해외거래처부호_127,19.535566,21.478648,20.887793,17.916253,20.451294
19,신고중량(KG),17.273907,18.644268,18.04645,16.903875,18.123535
4,특송업체부호_44,13.663461,16.001799,14.353923,13.596549,14.870355
7,수입거래구분코드_12,4.867349,5.409218,5.199052,4.927528,5.072357
9,징수형태코드_5,4.466812,5.124188,5.050657,4.397276,4.732566
0,통관지세관부호_19,4.115221,4.222089,4.664513,4.797944,4.52736
5,수입통관계획코드_3,3.832906,4.021779,3.529952,3.785962,3.810101
11,반입보세구역부호_63,6.499261,4.014725,4.71569,5.566476,5.639652
8,수입종류코드_4,2.302282,2.986295,2.851516,2.534026,2.590457
1,신고인부호_63,3.771944,2.926055,3.544174,3.944224,3.129049


In [26]:
feat_crime_importance.sort_values("importance_1",)

Unnamed: 0,fea_name,importance_0,importance_1,importance_2,importance_3,importance_4
15,관세율구분코드_17,0.556427,0.417085,0.51272,1.190068,0.605468
13,적출국가코드_42,1.208173,0.705536,1.002064,1.301321,0.850927
16,관세율_43,1.348933,0.80244,1.495763,2.243302,0.903237
18,weekend_0,0.701237,0.852169,0.873704,0.608951,0.691705
14,원산지국가코드_49,1.831852,0.857915,1.041814,1.448561,1.060213
17,day_15,1.816144,1.256552,1.324704,1.692778,1.255277
12,HS10단위부호_63,2.549951,1.625987,1.855763,3.092634,2.142688
6,수입신고구분코드_1,1.880391,2.094371,2.018442,1.874722,2.014558
2,수입자부호_127,2.963295,2.139003,2.511332,2.835911,2.6077
10,운송수단유형코드_2,1.993056,2.209347,2.032769,2.124874,2.289231


In [24]:
feat_crime_importance.loc[feat_crime_importance.apply(lambda row: row[1:].sum()==0, axis=1)]

Unnamed: 0,fea_name,importance_0,importance_1,importance_2,importance_3,importance_4


In [23]:
feat_crime_drop_list = feat_crime_importance.loc[feat_crime_importance.apply(lambda row: row[1:].sum()==0, axis=1)].fea_name.to_list()

if '과세가격원화금액' not in feat_crime_drop_list:
    feat_crime_drop_list.append('과세가격원화금액')

print('과세가격원화금액' in feat_crime_drop_list)
print(feat_crime_drop_list)
print(len(feat_crime_drop_list))

True
['과세가격원화금액']
1


# bad 피처 제거 후 catboost 재실행

In [27]:
cat_core_train.drop(feat_core_drop_list, axis=1, inplace=True)
cat_core_test.drop(feat_core_drop_list, axis=1, inplace=True)

cat_crime_train.drop(feat_crime_drop_list, axis=1, inplace=True)
cat_crime_test.drop(feat_crime_drop_list, axis=1, inplace=True)

core

In [28]:
n_fold = 5

cat_pred = np.zeros((cat_core_train.shape[0], 1))
pred_core_test = pd.DataFrame()
feat_core_importance = pd.DataFrame({'fea_name':cat_core_train.columns.to_list()})

train_x = cat_core_train
target_y = train_y_core

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)




for fold, (train_idx, valid_idx) in enumerate(skfold.split(train_x, target_y)):
    print(f'\n----------------- Fold {fold} -----------------\n')
    X_train, X_valid, y_train, y_valid = train_x.iloc[train_idx], train_x.iloc[valid_idx], target_y.iloc[train_idx], target_y.iloc[valid_idx]

    model_cat = CatBoostClassifier(**cat_core_params)

    model_cat.fit(X_train, y_train, 
        #cat_features=cat_feats, 
        eval_set=(X_valid, y_valid), 
        early_stopping_rounds = 50,

        verbose= 100
    )
  
    cat_pred[valid_idx] = model_cat.predict(X_valid)
    pred_core_test[f'{fold}_pred']=model_cat.predict(cat_core_test).reshape(-1,)
    feat_core_importance[f'importance_{fold}'] = model_cat.get_feature_importance()
    
    print('\nCV f1 Score:', f1_score(y_valid,  cat_pred[valid_idx], average='macro'))
    print('\nCV precision Score:', precision_score(y_valid, cat_pred[valid_idx], average='macro'))
    print('CV recall Score:', recall_score(y_valid, cat_pred[valid_idx], average='macro'))

print('\n\nf1 Score:', f1_score(target_y, cat_pred, average='macro'))
print('\nprecision Score:', precision_score(target_y, cat_pred, average='macro'))
print('recall Score:', recall_score(target_y, cat_pred, average='macro'))


----------------- Fold 0 -----------------

0:	learn: 1.0778351	test: 1.0776663	best: 1.0776663 (0)	total: 17.7ms	remaining: 35.4s
100:	learn: 0.9198997	test: 0.9546410	best: 0.9545356 (91)	total: 1.41s	remaining: 26.4s
200:	learn: 0.8880878	test: 0.9539131	best: 0.9530677 (161)	total: 2.66s	remaining: 23.8s
bestTest = 0.9530677496
bestIteration = 161
Shrink model to first 162 iterations.

CV f1 Score: 0.45041501078430696

CV precision Score: 0.45219121844181737
CV recall Score: 0.5057396353130962

----------------- Fold 1 -----------------

0:	learn: 1.0772401	test: 1.0784746	best: 1.0784746 (0)	total: 14.1ms	remaining: 28.1s
100:	learn: 0.9184089	test: 0.9575585	best: 0.9575585 (100)	total: 1.14s	remaining: 21.5s
bestTest = 0.9565721937
bestIteration = 140
Shrink model to first 141 iterations.

CV f1 Score: 0.44707990284687166

CV precision Score: 0.4489342929467088
CV recall Score: 0.5010540857957663

----------------- Fold 2 -----------------

0:	learn: 1.0780364	test: 1.0783365	b

crime

In [29]:
n_fold = 5

cat_pred = np.zeros((cat_crime_train.shape[0], 1))
pred_crime_test = pd.DataFrame()
feat_crime_importance = pd.DataFrame({'fea_name':cat_crime_train.columns.to_list()})

train_x = cat_crime_train
target_y = train_y_crime

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)




for fold, (train_idx, valid_idx) in enumerate(skfold.split(train_x, target_y)):
    print(f'\n----------------- Fold {fold} -----------------\n')
    X_train, X_valid, y_train, y_valid = train_x.iloc[train_idx], train_x.iloc[valid_idx], target_y.iloc[train_idx], target_y.iloc[valid_idx]

    model_cat = CatBoostClassifier(**cat_crime_params)

    model_cat.fit(X_train, y_train, 
        #cat_features=cat_feats, 
        eval_set=(X_valid, y_valid), 
        early_stopping_rounds = 50,

        verbose= 100
    )
  
    cat_pred[valid_idx] = model_cat.predict(X_valid).reshape(-1,1)
    pred_crime_test[f'{fold}_pred']=model_cat.predict(cat_crime_test).reshape(-1,)
    
    feat_crime_importance[f'importance_{fold}'] = model_cat.get_feature_importance()

    print('\nCV f1 Score:', f1_score(y_valid,  cat_pred[valid_idx], average='binary'))
    print('\nCV precision Score:', precision_score(y_valid, cat_pred[valid_idx], average='binary'))
    print('CV recall Score:', recall_score(y_valid, cat_pred[valid_idx], average='binary'))

print('\n\nf1 Score:', f1_score(target_y, cat_pred, average='binary'))
print('\nprecision Score:', precision_score(target_y, cat_pred, average='binary'))
print('recall Score:', recall_score(target_y, cat_pred, average='binary'))



----------------- Fold 0 -----------------

0:	learn: 0.6668839	test: 0.6671399	best: 0.6671399 (0)	total: 9.27ms	remaining: 18.5s
100:	learn: 0.5203354	test: 0.5366359	best: 0.5365319 (95)	total: 861ms	remaining: 16.2s
200:	learn: 0.5055614	test: 0.5346990	best: 0.5346990 (200)	total: 1.7s	remaining: 15.2s
300:	learn: 0.4937688	test: 0.5341775	best: 0.5338523 (274)	total: 2.54s	remaining: 14.3s
bestTest = 0.5338522955
bestIteration = 274
Shrink model to first 275 iterations.

CV f1 Score: 0.54416897878815

CV precision Score: 0.4212276569211584
CV recall Score: 0.7684529828109201

----------------- Fold 1 -----------------

0:	learn: 0.6668698	test: 0.6675375	best: 0.6675375 (0)	total: 9.4ms	remaining: 18.8s
100:	learn: 0.5201080	test: 0.5370282	best: 0.5370282 (100)	total: 854ms	remaining: 16.1s
200:	learn: 0.5053450	test: 0.5354655	best: 0.5354600 (175)	total: 1.67s	remaining: 14.9s
bestTest = 0.5352200295
bestIteration = 249
Shrink model to first 250 iterations.

CV f1 Score: 0.54

#제출

In [54]:
ss.head(3)

Unnamed: 0,신고번호,우범여부,핵심적발
0,982834,1,0.0
1,828961,0,0.0
2,522066,0,0.0


In [55]:
pred_core_test

Unnamed: 0,0_pred,1_pred,2_pred,3_pred,4_pred
0,1,0,0,0,0
1,0,0,0,0,0
2,0,0,0,0,0
3,0,0,0,0,0
4,0,0,0,0,0
5,0,0,0,0,0
6,0,0,0,0,0
7,0,0,0,0,0
8,0,0,0,0,0
9,0,0,0,0,0


In [56]:
ss['핵심적발']= pred_core_test.T.mode().rename(index={0:"pred_mode"}).T['pred_mode']
ss['우범여부']= pred_crime_test.T.mode().rename(index={0:"pred_mode"}).T['pred_mode']

In [64]:
# 안 합친 애
display(ss.우범여부.value_counts())
display(ss.핵심적발.value_counts())

0    6348
1    3925
Name: 우범여부, dtype: int64

0.0    6418
2.0    2015
1.0    1840
Name: 핵심적발, dtype: int64

In [71]:
3925-2015-1840

70

In [69]:
ss.query("우범여부==1 and 핵심적발==0")

Unnamed: 0,신고번호,우범여부,핵심적발
0,982834,1,0.0
22,787545,1,0.0
38,583102,1,0.0
47,584614,1,0.0
71,827632,1,0.0
73,604879,1,0.0
82,624008,1,0.0
102,838366,1,0.0
134,948568,1,0.0
144,954784,1,0.0


In [70]:
710+640

1350

In [68]:
ss.query("우범여부==0 and 핵심적발!=0")

Unnamed: 0,신고번호,우범여부,핵심적발
17,527587,0,2.0
20,963859,0,1.0
27,908960,0,1.0
34,570682,0,2.0
62,927429,0,1.0
76,555581,0,2.0
78,839653,0,2.0
87,675459,0,1.0
89,718528,0,1.0
92,540955,0,2.0


In [67]:
ss.head(100)

Unnamed: 0,신고번호,우범여부,핵심적발
0,982834,1,0.0
1,828961,0,0.0
2,522066,0,0.0
3,999547,0,0.0
4,919320,0,0.0
5,792304,0,0.0
6,778082,0,0.0
7,624568,0,0.0
8,649243,0,0.0
9,834543,0,0.0


In [35]:
# embedding 다 합친 거 /외계인 1
display(ss.우범여부.value_counts())
display(ss.핵심적발.value_counts())

0    5954
1    4319
Name: 우범여부, dtype: int64

0.0    5768
2.0    2365
1.0    2140
Name: 핵심적발, dtype: int64

In [38]:
display(df.우범여부.value_counts())
display(df.핵심적발.value_counts())

0    5874
1    4399
Name: 우범여부, dtype: int64

0.0    5857
2.0    2476
1.0    1940
Name: 핵심적발, dtype: int64

In [58]:
ss.to_csv("외계인.csv", index=False)

In [65]:
(ss == df).sum()

신고번호    10273
우범여부    10273
핵심적발    10273
dtype: int64

In [75]:
ss.query("우범여부==1 and 핵심적발==0").shape, ss.query("우범여부==0 and 핵심적발!=0").shape

((710, 3), (640, 3))

In [74]:
df.query("우범여부==1 and 핵심적발==0").shape, df.query("우범여부==0 and 핵심적발!=0").shape

((157, 3), (174, 3))

In [72]:
# encoding 확인
df = pd.read_csv("/content/제출.csv", encoding='utf-8')
df.head(3)

Unnamed: 0,신고번호,우범여부,핵심적발
0,982834,1,1.0
1,828961,0,0.0
2,522066,0,0.0


# 따로 확인

In [None]:
models_core[0].summary()

In [None]:
models_crime[0].summary()

# 다시 하나로 진행해보기

core

In [47]:
n_fold = 5

cat_pred = np.zeros((cat_core_train.shape[0], 1))
pred_core_test = pd.DataFrame()
feat_core_importance = pd.DataFrame({'fea_name':cat_core_train.columns.to_list()})

train_x = cat_core_train
test_x= cat_core_test
target_y = train_y_core

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)




for fold, (train_idx, valid_idx) in enumerate(skfold.split(train_x, target_y)):
    print(f'\n----------------- Fold {fold} -----------------\n')
    X_train, X_valid, y_train, y_valid = train_x.iloc[train_idx], train_x.iloc[valid_idx], target_y.iloc[train_idx], target_y.iloc[valid_idx]

    model_cat = CatBoostClassifier(**cat_core_params)

    model_cat.fit(X_train, y_train, 
        #cat_features=cat_feats, 
        eval_set=(X_valid, y_valid), 
        early_stopping_rounds = 50,

        verbose= 100
    )
  
    cat_pred[valid_idx] = model_cat.predict(X_valid)
    #pred_core_test[f'{fold}_pred']=model_cat.predict(cat_core_test).reshape(-1,)
    feat_core_importance[f'importance_{fold}'] = model_cat.get_feature_importance()
    
    print('\nCV f1 Score:', f1_score(y_valid,  cat_pred[valid_idx], average='macro'))
    print('\nCV precision Score:', precision_score(y_valid, cat_pred[valid_idx], average='macro'))
    print('CV recall Score:', recall_score(y_valid, cat_pred[valid_idx], average='macro'))

print('\n\nf1 Score:', f1_score(target_y, cat_pred, average='macro'))
print('\nprecision Score:', precision_score(target_y, cat_pred, average='macro'))
print('recall Score:', recall_score(target_y, cat_pred, average='macro'))


----------------- Fold 0 -----------------

0:	learn: 1.0753269	test: 1.0746984	best: 1.0746984 (0)	total: 57ms	remaining: 1m 53s
100:	learn: 0.7922023	test: 0.7862990	best: 0.7862990 (100)	total: 2.99s	remaining: 56.2s
200:	learn: 0.7189646	test: 0.7395841	best: 0.7395841 (200)	total: 5.53s	remaining: 49.5s
300:	learn: 0.6680647	test: 0.7209516	best: 0.7209516 (300)	total: 7.89s	remaining: 44.5s
400:	learn: 0.6256907	test: 0.7102141	best: 0.7102141 (400)	total: 10.2s	remaining: 40.6s
500:	learn: 0.5894431	test: 0.7024050	best: 0.7024050 (500)	total: 12.5s	remaining: 37.3s
600:	learn: 0.5568662	test: 0.6987732	best: 0.6987732 (600)	total: 14.7s	remaining: 34.3s
700:	learn: 0.5266658	test: 0.6961367	best: 0.6961367 (700)	total: 17s	remaining: 31.6s
800:	learn: 0.4997530	test: 0.6936563	best: 0.6936563 (800)	total: 19.3s	remaining: 28.9s
900:	learn: 0.4746283	test: 0.6926868	best: 0.6925847 (881)	total: 21.6s	remaining: 26.3s
1000:	learn: 0.4508907	test: 0.6914727	best: 0.6913489 (997)	

crime

In [48]:
n_fold = 5

cat_pred = np.zeros((cat_crime_train.shape[0], 1))
pred_crime_test = pd.DataFrame()
feat_crime_importance = pd.DataFrame({'fea_name':cat_crime_train.columns.to_list()})

train_x = cat_crime_train
test_x= cat_crime_test
target_y = train_y_crime

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)




for fold, (train_idx, valid_idx) in enumerate(skfold.split(train_x, target_y)):
    print(f'\n----------------- Fold {fold} -----------------\n')
    X_train, X_valid, y_train, y_valid = train_x.iloc[train_idx], train_x.iloc[valid_idx], target_y.iloc[train_idx], target_y.iloc[valid_idx]

    model_cat = CatBoostClassifier(**cat_crime_params)

    model_cat.fit(X_train, y_train, 
        #cat_features=cat_feats, 
        eval_set=(X_valid, y_valid), 
        early_stopping_rounds = 50,

        verbose= 100
    )
  
    cat_pred[valid_idx] = model_cat.predict(X_valid).reshape(-1,1)
    #pred_crime_test[f'{fold}_pred']=model_cat.predict(cat_crime_test).reshape(-1,)
    
    feat_crime_importance[f'importance_{fold}'] = model_cat.get_feature_importance()

    print('\nCV f1 Score:', f1_score(y_valid,  cat_pred[valid_idx], average='binary'))
    print('\nCV precision Score:', precision_score(y_valid, cat_pred[valid_idx], average='binary'))
    print('CV recall Score:', recall_score(y_valid, cat_pred[valid_idx], average='binary'))

print('\n\nf1 Score:', f1_score(target_y, cat_pred, average='binary'))
print('\nprecision Score:', precision_score(target_y, cat_pred, average='binary'))
print('recall Score:', recall_score(target_y, cat_pred, average='binary'))



----------------- Fold 0 -----------------

0:	learn: 0.6657596	test: 0.6662148	best: 0.6662148 (0)	total: 28.4ms	remaining: 56.8s
100:	learn: 0.4915571	test: 0.5061930	best: 0.5061930 (100)	total: 1.67s	remaining: 31.5s
200:	learn: 0.4639818	test: 0.4962383	best: 0.4962383 (200)	total: 3.17s	remaining: 28.4s
300:	learn: 0.4425266	test: 0.4928430	best: 0.4926888 (296)	total: 4.64s	remaining: 26.2s
400:	learn: 0.4235740	test: 0.4915704	best: 0.4915043 (392)	total: 6.12s	remaining: 24.4s
500:	learn: 0.4055062	test: 0.4909730	best: 0.4908952 (468)	total: 7.63s	remaining: 22.8s
bestTest = 0.4908952391
bestIteration = 468
Shrink model to first 469 iterations.

CV f1 Score: 0.5876914186651256

CV precision Score: 0.474716041699082
CV recall Score: 0.7712335692618807

----------------- Fold 1 -----------------

0:	learn: 0.6655175	test: 0.6661270	best: 0.6661270 (0)	total: 27.7ms	remaining: 55.5s
100:	learn: 0.4906500	test: 0.5064862	best: 0.5064862 (100)	total: 1.62s	remaining: 30.5s
200:	l

In [49]:
#feat_crime_importance.loc[feat_crime_importance.apply(lambda row: np.count_nonzero(row[1:])), axis=1)]

feat_core_drop_list = feat_core_importance.loc[feat_core_importance.apply(lambda row: row[1:].sum()==0, axis=1)].fea_name.to_list()

if '과세가격원화금액' not in feat_core_drop_list:
    feat_core_drop_list.append('과세가격원화금액')
    
print('과세가격원화금액' in feat_core_drop_list)
print(feat_core_drop_list)
print(len(feat_core_drop_list))

True
['과세가격원화금액']
1


In [50]:
feat_crime_drop_list = feat_crime_importance.loc[feat_crime_importance.apply(lambda row: row[1:].sum()==0, axis=1)].fea_name.to_list()

if '과세가격원화금액' not in feat_crime_drop_list:
    feat_crime_drop_list.append('과세가격원화금액')

print('과세가격원화금액' in feat_crime_drop_list)
print(feat_crime_drop_list)
print(len(feat_crime_drop_list))

True
['특송업체부호_5', '특송업체부호_34', '특송업체부호_43', '수입거래구분코드_1', '수입거래구분코드_9', '적출국가코드_4', '적출국가코드_22', '적출국가코드_24', '적출국가코드_34', '원산지국가코드_13', '원산지국가코드_14', '원산지국가코드_30', '원산지국가코드_37', '원산지국가코드_42', '원산지국가코드_43', '관세율구분코드_2', '관세율구분코드_3', '관세율구분코드_8', '관세율구분코드_15', '관세율_7', '관세율_10', '관세율_12', '관세율_16', '관세율_29', '관세율_36', '과세가격원화금액']
26


# 다시의 bad 피처 제거 후 catboost 재실행

In [51]:
cat_core_train.drop(feat_core_drop_list, axis=1, inplace=True)
cat_core_test.drop(feat_core_drop_list, axis=1, inplace=True)

cat_crime_train.drop(feat_crime_drop_list, axis=1, inplace=True)
cat_crime_test.drop(feat_crime_drop_list, axis=1, inplace=True)

core

In [52]:
n_fold = 5

cat_pred = np.zeros((cat_core_train.shape[0], 1))
pred_core_test = pd.DataFrame()
feat_core_importance = pd.DataFrame({'fea_name':cat_core_train.columns.to_list()})

train_x = cat_core_train
target_y = train_y_core

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)




for fold, (train_idx, valid_idx) in enumerate(skfold.split(train_x, target_y)):
    print(f'\n----------------- Fold {fold} -----------------\n')
    X_train, X_valid, y_train, y_valid = train_x.iloc[train_idx], train_x.iloc[valid_idx], target_y.iloc[train_idx], target_y.iloc[valid_idx]

    model_cat = CatBoostClassifier(**cat_core_params)

    model_cat.fit(X_train, y_train, 
        #cat_features=cat_feats, 
        eval_set=(X_valid, y_valid), 
        early_stopping_rounds = 50,

        verbose= 100
    )
  
    cat_pred[valid_idx] = model_cat.predict(X_valid)
    pred_core_test[f'{fold}_pred']=model_cat.predict(cat_core_test).reshape(-1,)
    feat_core_importance[f'importance_{fold}'] = model_cat.get_feature_importance()
    
    print('\nCV f1 Score:', f1_score(y_valid,  cat_pred[valid_idx], average='macro'))
    print('\nCV precision Score:', precision_score(y_valid, cat_pred[valid_idx], average='macro'))
    print('CV recall Score:', recall_score(y_valid, cat_pred[valid_idx], average='macro'))

print('\n\nf1 Score:', f1_score(target_y, cat_pred, average='macro'))
print('\nprecision Score:', precision_score(target_y, cat_pred, average='macro'))
print('recall Score:', recall_score(target_y, cat_pred, average='macro'))


----------------- Fold 0 -----------------

0:	learn: 1.0753269	test: 1.0746985	best: 1.0746985 (0)	total: 50.4ms	remaining: 1m 40s
100:	learn: 0.7922027	test: 0.7862992	best: 0.7862992 (100)	total: 2.58s	remaining: 48.6s
200:	learn: 0.7189646	test: 0.7395840	best: 0.7395840 (200)	total: 4.94s	remaining: 44.2s
300:	learn: 0.6680647	test: 0.7209515	best: 0.7209515 (300)	total: 7.25s	remaining: 41s
400:	learn: 0.6264729	test: 0.7097309	best: 0.7097309 (400)	total: 9.56s	remaining: 38.1s
500:	learn: 0.5895047	test: 0.7031032	best: 0.7031032 (500)	total: 11.9s	remaining: 35.5s
600:	learn: 0.5574110	test: 0.6999031	best: 0.6997512 (598)	total: 14.1s	remaining: 32.8s
700:	learn: 0.5275040	test: 0.6973175	best: 0.6973175 (700)	total: 16.4s	remaining: 30.3s
800:	learn: 0.4997615	test: 0.6959929	best: 0.6959289 (797)	total: 18.6s	remaining: 27.9s
900:	learn: 0.4749043	test: 0.6944491	best: 0.6943709 (891)	total: 20.9s	remaining: 25.5s
bestTest = 0.6938789895
bestIteration = 933
Shrink model to

crime

In [53]:
n_fold = 5

cat_pred = np.zeros((cat_crime_train.shape[0], 1))
pred_crime_test = pd.DataFrame()
feat_crime_importance = pd.DataFrame({'fea_name':cat_crime_train.columns.to_list()})

train_x = cat_crime_train
target_y = train_y_crime

skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)




for fold, (train_idx, valid_idx) in enumerate(skfold.split(train_x, target_y)):
    print(f'\n----------------- Fold {fold} -----------------\n')
    X_train, X_valid, y_train, y_valid = train_x.iloc[train_idx], train_x.iloc[valid_idx], target_y.iloc[train_idx], target_y.iloc[valid_idx]

    model_cat = CatBoostClassifier(**cat_crime_params)

    model_cat.fit(X_train, y_train, 
        #cat_features=cat_feats, 
        eval_set=(X_valid, y_valid), 
        early_stopping_rounds = 50,

        verbose= 100
    )
  
    cat_pred[valid_idx] = model_cat.predict(X_valid).reshape(-1,1)
    pred_crime_test[f'{fold}_pred']=model_cat.predict(cat_crime_test).reshape(-1,)
    
    feat_crime_importance[f'importance_{fold}'] = model_cat.get_feature_importance()

    print('\nCV f1 Score:', f1_score(y_valid,  cat_pred[valid_idx], average='binary'))
    print('\nCV precision Score:', precision_score(y_valid, cat_pred[valid_idx], average='binary'))
    print('CV recall Score:', recall_score(y_valid, cat_pred[valid_idx], average='binary'))

print('\n\nf1 Score:', f1_score(target_y, cat_pred, average='binary'))
print('\nprecision Score:', precision_score(target_y, cat_pred, average='binary'))
print('recall Score:', recall_score(target_y, cat_pred, average='binary'))



----------------- Fold 0 -----------------

0:	learn: 0.6658307	test: 0.6662913	best: 0.6662913 (0)	total: 27.5ms	remaining: 55s
100:	learn: 0.4910671	test: 0.5060383	best: 0.5060383 (100)	total: 1.6s	remaining: 30s
200:	learn: 0.4639909	test: 0.4955645	best: 0.4955645 (200)	total: 3.04s	remaining: 27.2s
300:	learn: 0.4423540	test: 0.4923138	best: 0.4921882 (294)	total: 4.49s	remaining: 25.4s
400:	learn: 0.4232469	test: 0.4911252	best: 0.4910316 (387)	total: 5.96s	remaining: 23.8s
500:	learn: 0.4057486	test: 0.4905795	best: 0.4903808 (466)	total: 7.46s	remaining: 22.3s
bestTest = 0.4903807777
bestIteration = 466
Shrink model to first 467 iterations.

CV f1 Score: 0.5871896435127042

CV precision Score: 0.4752150117279124
CV recall Score: 0.7682002022244692

----------------- Fold 1 -----------------

0:	learn: 0.6656016	test: 0.6663036	best: 0.6663036 (0)	total: 28.2ms	remaining: 56.4s
100:	learn: 0.4904195	test: 0.5066310	best: 0.5066310 (100)	total: 1.66s	remaining: 31.2s
200:	learn

# 정리

NN seed=1927
cat seed=1617 