In [1]:
#-*- coding: utf-8 -*-
import gc
import random
import os
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
# import seaborn as sns

# from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

# from sklearn.preprocessing import StandardScaler
# from sklearn.impute import SimpleImputer
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier 
# from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
# import xgboost as xgb
# import catboost
# import lightgbm as lgb
from sklearn.ensemble import ExtraTreesClassifier
# from sklearn.neural_network import MLPClassifier
# from sklearn.svm import SVC

### model hyperparameter
* Decision Tree : default
* LGBM : default
* Catboost : task_type = 'GPU', verbose = None, logging_level = 'Silent'

### 이 파일은 시각화 및 grid search, best stacking model search 코드 등은 제외하고 최종예측에 관여하는 코드만 포함합니다.

In [2]:
SEED = 37
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(SEED) 

## Data load

In [3]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [4]:
train_x = train_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP', 'Y_Class', 'Y_Quality'])
train_y = train_df['Y_Class']

test_x = test_df.drop(columns=['PRODUCT_ID', 'TIMESTAMP'])

## Pre-processing

In [6]:
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)

In [7]:
# qualitative to quantitative
qual_col = ['LINE', 'PRODUCT_CODE']

for i in qual_col:
    le = LabelEncoder()
    le = le.fit(train_x[i])
    train_x[i] = le.transform(train_x[i])
    
    for label in np.unique(test_x[i]): 
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test_x[i] = le.transform(test_x[i]) 
print('Done.')

Done.


In [8]:
print(train_x) 

     LINE  PRODUCT_CODE   X_1   X_2  X_3   X_4   X_5  X_6   X_7   X_8  ...  \
0       2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
1       3             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
2       2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
3       3             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
4       2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
..    ...           ...   ...   ...  ...   ...   ...  ...   ...   ...  ...   
593     5             2   2.0  95.0  0.0  45.0  10.0  0.0  50.0  10.0  ...   
594     2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
595     2             0   0.0   0.0  0.0   0.0   0.0  0.0   0.0   0.0  ...   
596     4             1  40.0  94.0  0.0  45.0  11.0  0.0  45.0  10.0  ...   
597     5             1  21.0  87.0  0.0  45.0  10.0  0.0  61.0  10.0  ...   

     X_2866  X_2867  X_2868  X_2869  X_2870  X_2871  X_2872  X_

In [9]:
# for permutation importance check 
X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size = 0.15, random_state = 37)
print (f'X_train:{X_train.shape} y_train: {y_train.shape}')
print (f'X_test:{X_test.shape} y_test: {y_test.shape}')

X_train:(508, 2877) y_train: (508,)
X_test:(90, 2877) y_test: (90,)


In [10]:
lgb_params = {
    'metric': 'cross_entropy',
    'n_estimators': 10000,
    'objective': 'softmax',
    'learning_rate': 0.02,
    'min_child_samples': 150,
    'reg_alpha': 3e-5,
    'reg_lambda': 9e-2,
    'num_leaves': 20,
    'max_depth': 16,
    'colsample_bytree': 0.8,
    'subsample': 0.8,
    'subsample_freq': 2,
    'max_bin': 240,
    'device': 'gpu'
}

rf_params = {
    'max_depth': 7,
    'min_samples_leaf': 10,
    'random_state': 37
}

In [11]:
y_train

311    1
371    1
18     1
183    1
215    2
      ..
451    1
22     0
437    1
251    1
363    0
Name: Y_Class, Length: 508, dtype: int64

In [12]:
cl1 = KNeighborsClassifier(n_neighbors = 9)
cl2 = RandomForestClassifier(**rf_params)
cl3 = GaussianNB()
cl4 = DecisionTreeClassifier() # selected
cl5 = CatBoostClassifier(task_type = 'GPU', verbose = None, logging_level = 'Silent') # selected
cl6 = LGBMClassifier()  # selected
cl7 = ExtraTreesClassifier(bootstrap=False, criterion='entropy', max_features=0.55, min_samples_leaf=8, min_samples_split=4, n_estimators=100) # Optimized using TPOT
# cl8 = XGBClassifier(eval_metric='mlogloss', objective ='multi:softmax',use_label_encoder=False)

In [13]:
classifiers = {
    "RandomForest": cl2,
    "DecisionTree": cl4,
    "CatBoost": cl5,
    "LGBM": cl6,
    "ExtraTrees": cl7,
    # "XGboost":cl8
}

In [14]:
mlr = LogisticRegression() # stacking meta model

In [15]:
models_scores_results, models_names = list(), list() 

In [16]:
RANDOM_SEED = 37
PROBAS = True
FOLDS = 5
N_ESTIMATORS = 1000

TARGET = 'Y_Class'

In [17]:
taken_classifiers = ["RandomForest", "DecisionTree", "CatBoost", "LGBM", "ExtraTrees"]

In [18]:
import itertools
from mlxtend.classifier import StackingCVClassifier

In [19]:
import eli5
from eli5.sklearn import PermutationImportance

In [20]:
model=CatBoostClassifier(silent=True, random_state=37).fit(X_train, y_train)

In [21]:
perm = PermutationImportance(model, random_state=37).fit(X_test, y_test)


In [22]:
# feature selection
minimum_importance = -0.001
mask = perm.feature_importances_ > minimum_importance
features = X_train.columns[mask]
X_train = X_train[features]
X_test = X_test[features]

In [23]:
best_cls_experiment = [ 'LGBM',"DecisionTree",'CatBoost'] 

In [24]:
print(f'The best models configuration: {best_cls_experiment}')

classifier_exp = []
for label in best_cls_experiment:
        classifier = classifiers[label]
        classifier_exp.append(classifier)

The best models configuration: ['LGBM', 'DecisionTree', 'CatBoost']


In [25]:
classifier_exp

[LGBMClassifier(),
 DecisionTreeClassifier(),
 <catboost.core.CatBoostClassifier at 0x244dddc5550>]

## Train

In [26]:
scl = StackingCVClassifier(classifiers= classifier_exp,
                            meta_classifier = mlr, # use meta-classifier
                            use_probas = PROBAS,   # use_probas = True/False
                            random_state = 37)

scores =cross_val_score(scl, X_train, y_train, cv = FOLDS, scoring='accuracy')
models_scores_results.append(scores)
models_names.append('scl')
print("Meta model (slc) - accuracy: %0.5f " % (scores.mean()))
scl.fit(X_train, y_train)

top_meta_model = scl
base_acc = scores.mean()

Meta model (slc) - accuracy: 0.78348 


### meta model 결과값이 0.77566일경우 Y_class의 결과값이 265 38 7이 나옵니다

In [27]:
#top_meta_model = hyper_meta_model
classifiers["scl"] = top_meta_model

## Predict

In [28]:

test_x = test_x[features]
 

In [29]:
test_preds = classifiers['scl'].predict(test_x)[:,]

In [30]:
submission = pd.read_csv('./sample_submission.csv')

In [31]:
submission['Y_Class'] = test_preds

In [32]:
submission.to_csv('./decisionlgbmcatBoost_remove.csv', index=False)

In [33]:
submission['Y_Class'].value_counts()

1    264
0     40
2      6
Name: Y_Class, dtype: int64